Index: head/sys/amd64/amd64/exception.S
===================================================================
--- head/sys/amd64/amd64/exception.S	(revision 258540)
+++ head/sys/amd64/amd64/exception.S	(revision 258541)
@@ -1,906 +1,905 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * Copyright (c) 2007 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by A. Joseph Koshy under
  * sponsorship from the FreeBSD Foundation and Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_atpic.h"
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 
 #include <machine/asmacros.h>
 #include <machine/psl.h>
 #include <machine/trap.h>
 #include <machine/specialreg.h>
 
 #include "assym.s"
 
 #ifdef KDTRACE_HOOKS	/* two zero-filled 8-byte words in .bss used by alltraps */
 	.bss
 	.globl	dtrace_invop_jump_addr
 	.align	8
 	.type	dtrace_invop_jump_addr,@object
 	.size	dtrace_invop_jump_addr,8
 dtrace_invop_jump_addr:		/* hook address; alltraps skips the hook while 0 */
 	.zero	8
 	.globl	dtrace_invop_calltrap_addr
 	.align	8
 	.type	dtrace_invop_calltrap_addr,@object
 	.size	dtrace_invop_calltrap_addr,8
 dtrace_invop_calltrap_addr:	/* alltraps stores $calltrap here before the hook */
 	.zero	8
 #endif
 	.text
 #ifdef HWPMC_HOOKS
 	ENTRY(start_exceptions)	/* paired with ENTRY(end_exceptions) at the end of this file */
 #endif
 
 /*****************************************************************************/
 /* Trap handling                                                             */
 /*****************************************************************************/
 /*
  * Trap and fault vector routines.
  *
  * All traps are 'interrupt gates', SDT_SYSIGT.  An interrupt gate pushes
  * state on the stack but also disables interrupts.  This is important for
  * us for the use of the swapgs instruction.  We cannot be interrupted
  * until the GS.base value is correct.  For most traps, we automatically
  * then enable interrupts if the interrupted context had them enabled.
  * This is equivalent to the i386 port's use of SDT_SYS386TGT.
  *
  * The cpu will push a certain amount of state onto the kernel stack for
  * the current process.  See amd64/include/frame.h.
  * This includes the current RFLAGS (status register, which includes
  * the interrupt disable state prior to the trap), the code segment register,
  * and the return instruction pointer are pushed by the cpu.  The cpu
  * will also push an 'error' code for certain traps.  We push a dummy
  * error code for those traps where the cpu doesn't in order to maintain
  * a consistent frame.  We also push a contrived 'trap number'.
  *
  * The CPU does not push the general registers, so we must do that, and we
  * must restore them prior to calling 'iret'.  The CPU adjusts %cs and %ss
  * but does not mess with %ds, %es, %gs or %fs.  We swap the %gs base
  * for the kernel mode operation shortly, without changes to the selector
  * loaded.  Since superuser long mode works with any selectors loaded into
  * segment registers other than %cs, which makes them mostly unused in long
  * mode, and kernel does not reference %fs, leave them alone.  The segment
  * registers are reloaded on return to the usermode.
  */
 
 MCOUNT_LABEL(user)
 MCOUNT_LABEL(btrap)
 
 /* Traps that keep interrupts disabled: zero tf_addr/tf_err, go to alltraps_noen. */
 #define	TRAP_NOEN(a)	\
 	subq $TF_RIP,%rsp; \
 	movl $(a),TF_TRAPNO(%rsp) ; \
 	movq $0,TF_ADDR(%rsp) ; \
 	movq $0,TF_ERR(%rsp) ; \
 	jmp alltraps_noen
 IDTVEC(dbg)
 	TRAP_NOEN(T_TRCTRAP)
 IDTVEC(bpt)
 	TRAP_NOEN(T_BPTFLT)
 #ifdef KDTRACE_HOOKS
 IDTVEC(dtrace_ret)
 	TRAP_NOEN(T_DTRACE_RET)
 #endif
 
 /* Regular traps; the cpu supplies no tf_err, so zero is stored before alltraps. */
 #define	TRAP(a)	 \
 	subq $TF_RIP,%rsp; \
 	movl $(a),TF_TRAPNO(%rsp) ; \
 	movq $0,TF_ADDR(%rsp) ; \
 	movq $0,TF_ERR(%rsp) ; \
 	jmp alltraps
 IDTVEC(div)
 	TRAP(T_DIVIDE)
 IDTVEC(ofl)
 	TRAP(T_OFLOW)
 IDTVEC(bnd)
 	TRAP(T_BOUND)
 IDTVEC(ill)
 	TRAP(T_PRIVINFLT)
 IDTVEC(dna)
 	TRAP(T_DNA)
 IDTVEC(fpusegm)
 	TRAP(T_FPOPFLT)
 IDTVEC(mchk)
 	TRAP(T_MCHK)
 IDTVEC(rsvd)
 	TRAP(T_RESERVED)
 IDTVEC(fpu)
 	TRAP(T_ARITHTRAP)
 IDTVEC(xmm)
 	TRAP(T_XMMFLT)
 
 /* The cpu already pushed tf_err for these traps; only tf_addr is zeroed here. */
 #define	TRAP_ERR(a)	\
 	subq $TF_ERR,%rsp; \
 	movl $(a),TF_TRAPNO(%rsp) ; \
 	movq $0,TF_ADDR(%rsp) ; \
 	jmp alltraps
 IDTVEC(tss)
 	TRAP_ERR(T_TSSFLT)
 IDTVEC(missing)
 	TRAP_ERR(T_SEGNPFLT)
 IDTVEC(stk)
 	TRAP_ERR(T_STKFLT)
 IDTVEC(align)
 	TRAP_ERR(T_ALIGNFLT)
 
 	/*
 	 * alltraps entry point.  Use swapgs if this is the first time in the
 	 * kernel from userland.  Reenable interrupts if they were enabled
 	 * before the trap.  This approximates SDT_SYS386TGT on the i386 port.
 	 */
 	SUPERALIGN_TEXT
 	.globl	alltraps
 	.type	alltraps,@function
 alltraps:
 	movq	%rdi,TF_RDI(%rsp)	/* save %rdi; it is used as scratch below */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	alltraps_testi		/* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* clear PCB_FULL_IRET in curpcb */
 	movw	%fs,TF_FS(%rsp)		/* selectors are recorded on user entry only */
 	movw	%gs,TF_GS(%rsp)
 	movw	%es,TF_ES(%rsp)
 	movw	%ds,TF_DS(%rsp)
 alltraps_testi:
 	testl	$PSL_I,TF_RFLAGS(%rsp)	/* were interrupts enabled before the trap? */
 	jz	alltraps_pushregs_no_rdi
 	sti
 alltraps_pushregs_no_rdi:		/* shared tail; alltraps_noen, page and prot jump here */
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
 	movq	%r11,TF_R11(%rsp)
 	movq	%r12,TF_R12(%rsp)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	FAKE_MCOUNT(TF_RIP(%rsp))
 #ifdef KDTRACE_HOOKS
 	/*
 	 * DTrace Function Boundary Trace (fbt) probes are triggered
 	 * by int3 (0xcc) which causes the #BP (T_BPTFLT) breakpoint
 	 * interrupt. For all other trap types, just handle them in
 	 * the usual way.
 	 */
 	cmpl	$T_BPTFLT,TF_TRAPNO(%rsp)
 	jne	calltrap
 
 	/* Check if there is no DTrace hook registered. */
 	cmpq	$0,dtrace_invop_jump_addr
 	je	calltrap
 
 	/*
 	 * Set our jump address for the jump back in the event that
 	 * the breakpoint wasn't caused by DTrace at all.
 	 */
 	movq	$calltrap,dtrace_invop_calltrap_addr(%rip)
 
 	/* Jump to the code hooked in by DTrace. */
 	movq	dtrace_invop_jump_addr,%rax	/* NOTE(review): the jmpq below does not use %rax; presumably the hook reads it - confirm */
 	jmpq	*dtrace_invop_jump_addr
 #endif
 	.globl	calltrap
 	.type	calltrap,@function
 calltrap:			/* also the jump target of doreti_iret_fault */
 	movq	%rsp,%rdi		/* %rdi = trapframe base for trap() */
 	call	trap
 	MEXITCOUNT
 	jmp	doreti			/* Handle any pending ASTs */
 
 	/*
 	 * alltraps_noen entry point.  Unlike alltraps above, we want to
 	 * leave the interrupts disabled.  This corresponds to
 	 * SDT_SYS386IGT on the i386 port.
 	 */
 	SUPERALIGN_TEXT
 	.globl	alltraps_noen
 	.type	alltraps_noen,@function
 alltraps_noen:
 	movq	%rdi,TF_RDI(%rsp)	/* save %rdi; it is used as scratch below */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	1f	/* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* clear PCB_FULL_IRET in curpcb */
 1:	movw	%fs,TF_FS(%rsp)		/* selectors are recorded for kernel and user entry */
 	movw	%gs,TF_GS(%rsp)
 	movw	%es,TF_ES(%rsp)
 	movw	%ds,TF_DS(%rsp)
 	jmp	alltraps_pushregs_no_rdi	/* joins alltraps after its PSL_I test and sti */
 
 IDTVEC(dblfault)		/* builds a full trapframe, calls dblfault_handler, then halts */
 	subq	$TF_ERR,%rsp		/* make room below tf_err; TF_* offsets valid from here */
 	movl	$T_DOUBLEFLT,TF_TRAPNO(%rsp)
 	movq	$0,TF_ADDR(%rsp)
 	movq	$0,TF_ERR(%rsp)		/* tf_err is forced to zero */
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
 	movq	%r11,TF_R11(%rsp)
 	movq	%r12,TF_R12(%rsp)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
 	movw	%fs,TF_FS(%rsp)
 	movw	%gs,TF_GS(%rsp)
 	movw	%es,TF_ES(%rsp)
 	movw	%ds,TF_DS(%rsp)
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	1f			/* already running with kernel GS.base */
 	swapgs
 1:
 	movq	%rsp,%rdi		/* %rdi = trapframe base for dblfault_handler */
 	call	dblfault_handler
 2:				/* if the handler returns, halt in a loop */
 	hlt
 	jmp	2b
 
 IDTVEC(page)			/* T_PAGEFLT entry: records %cr2 in tf_addr, then joins alltraps */
 	subq	$TF_ERR,%rsp		/* tf_err is already on the stack */
 	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	1f			/* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* clear PCB_FULL_IRET in curpcb */
 1:	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
 	movq	%rdi,TF_ADDR(%rsp)	/* enabling interrupts. */
 	movw	%fs,TF_FS(%rsp)
 	movw	%gs,TF_GS(%rsp)
 	movw	%es,TF_ES(%rsp)
 	movw	%ds,TF_DS(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)	/* were interrupts enabled before the fault? */
 	jz	alltraps_pushregs_no_rdi
 	sti
 	jmp	alltraps_pushregs_no_rdi	/* save the remaining registers, then trap() */
 
 	/*
 	 * We have to special-case this one.  If we get a trap in doreti() at
 	 * the iretq stage, we'll reenter with the wrong gs state.  We'll have
 	 * to do a special swapgs in this case even coming from the kernel.
 	 * XXX linux has a trap handler for their equivalent of load_gs().
 	 */
 IDTVEC(prot)			/* T_PROTFLT entry; always sets PCB_FULL_IRET in curpcb */
 	subq	$TF_ERR,%rsp		/* tf_err is already on the stack */
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	movq	$0,TF_ADDR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
 	leaq	doreti_iret(%rip),%rdi
 	cmpq	%rdi,TF_RIP(%rsp)	/* did the fault hit the iretq at doreti_iret? */
 	je	1f			/* kernel but with user gsbase!! */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	2f			/* already running with kernel GS.base */
 1:	swapgs
 2:	movq	PCPU(CURPCB),%rdi
 	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* always full iret from GPF */
 	movw	%fs,TF_FS(%rsp)
 	movw	%gs,TF_GS(%rsp)
 	movw	%es,TF_ES(%rsp)
 	movw	%ds,TF_DS(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)	/* were interrupts enabled before the fault? */
 	jz	alltraps_pushregs_no_rdi
 	sti
 	jmp	alltraps_pushregs_no_rdi	/* save the remaining registers, then trap() */
 
 /*
  * Fast syscall entry point.  We enter here with just our new %cs/%ss set,
  * and the new privilege level.  We are still running on the old user stack
  * pointer.  We have to juggle a few things around to find our stack etc.
  * swapgs gives us access to our PCPU space only.
  *
  * We do not support invoking this from a custom %cs or %ss (e.g. using
  * entries from an LDT).
  */
 IDTVEC(fast_syscall)
 	swapgs				/* gives access to the PCPU space */
 	movq	%rsp,PCPU(SCRATCH_RSP)	/* stash the user stack pointer */
 	movq	PCPU(RSP0),%rsp		/* switch to the stack at PCPU(RSP0) */
 	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
 	subq	$TF_SIZE,%rsp
 	/* defer TF_RSP till we have a spare register */
 	movq	%r11,TF_RFLAGS(%rsp)
 	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
 	movq	PCPU(SCRATCH_RSP),%r11	/* %r11 already saved */
 	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
 	movw	%fs,TF_FS(%rsp)
 	movw	%gs,TF_GS(%rsp)
 	movw	%es,TF_ES(%rsp)
 	movw	%ds,TF_DS(%rsp)
 	movq	PCPU(CURPCB),%r11
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r11)	/* clear PCB_FULL_IRET in curpcb */
 	sti
 	movq	$KUDSEL,TF_SS(%rsp)
 	movq	$KUCSEL,TF_CS(%rsp)
 	movq	$2,TF_ERR(%rsp)		/* NOTE(review): presumably the syscall insn length - confirm */
 	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
 	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
 	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
 	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
 	movq	%r8,TF_R8(%rsp)		/* arg 5 */
 	movq	%r9,TF_R9(%rsp)		/* arg 6 */
 	movq	%rax,TF_RAX(%rsp)	/* syscall number */
 	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
 	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
 	movq	%r12,TF_R12(%rsp)	/* C preserved */
 	movq	%r13,TF_R13(%rsp)	/* C preserved */
 	movq	%r14,TF_R14(%rsp)	/* C preserved */
 	movq	%r15,TF_R15(%rsp)	/* C preserved */
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	PCPU(CURTHREAD),%rdi	/* %rdi = curthread */
 	movq	%rsp,TD_FRAME(%rdi)	/* record this frame at TD_FRAME of curthread */
 	movl	TF_RFLAGS(%rsp),%esi
 	andl	$PSL_T,%esi		/* %esi = PSL_T bit of the saved rflags */
 	call	amd64_syscall
 1:	movq	PCPU(CURPCB),%rax
 	/* Disable interrupts before testing PCB_FULL_IRET. */
 	cli
 	testl	$PCB_FULL_IRET,PCB_FLAGS(%rax)
 	jnz	3f			/* full iret requested: leave through doreti */
 	/* Check for and handle AST's on return to userland. */
 	movq	PCPU(CURTHREAD),%rax
 	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
 	jne	2f
 	/* Restore preserved registers. */
 	MEXITCOUNT
 	movq	TF_RDI(%rsp),%rdi	/* bonus; preserve arg 1 */
 	movq	TF_RSI(%rsp),%rsi	/* bonus: preserve arg 2 */
 	movq	TF_RDX(%rsp),%rdx	/* return value 2 */
 	movq	TF_RAX(%rsp),%rax	/* return value 1 */
 	movq	TF_RFLAGS(%rsp),%r11	/* original %rflags */
 	movq	TF_RIP(%rsp),%rcx	/* original %rip */
 	movq	TF_RSP(%rsp),%rsp	/* user stack pointer */
 	swapgs
 	sysretq
 
 2:	/* AST scheduled. */
 	sti
 	movq	%rsp,%rdi		/* %rdi = trapframe base for ast() */
 	call	ast
 	jmp	1b			/* re-test the flags with interrupts disabled */
 
 3:	/* Requested full context restore, use doreti for that. */
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Here for CYA insurance, in case a "syscall" instruction gets
  * issued from 32 bit compatibility mode. MSR_CSTAR has to point
  * to *something* if EFER_SCE is enabled.
  */
 IDTVEC(fast_syscall32)
 	sysret				/* return at once; no state is saved or examined */
 
 /*
  * NMI handling is special.
  *
  * First, NMIs do not respect the state of the processor's RFLAGS.IF
  * bit.  The NMI handler may be entered at any time, including when
  * the processor is in a critical section with RFLAGS.IF == 0.
  * The processor's GS.base value could be invalid on entry to the
  * handler.
  *
  * Second, the processor treats NMIs specially, blocking further NMIs
  * until an 'iretq' instruction is executed.  We thus need to execute
  * the NMI handler with interrupts disabled, to prevent a nested interrupt
  * from executing an 'iretq' instruction and inadvertently taking the
  * processor out of NMI mode.
  *
  * Third, the NMI handler runs on its own stack (tss_ist2). The canonical
  * GS.base value for the processor is stored just above the bottom of its
  * NMI stack.  For NMIs taken from kernel mode, the current value in
  * the processor's GS.base is saved at entry to C-preserved register %r12,
  * the canonical value for GS.base is then loaded into the processor, and
  * the saved value is restored at exit time.  For NMIs taken from user mode,
  * the cheaper 'SWAPGS' instructions are used for swapping GS.base.
  */
 
 IDTVEC(nmi)
 	subq	$TF_RIP,%rsp		/* no cpu-pushed tf_err; reserve the whole frame */
 	movl	$(T_NMI),TF_TRAPNO(%rsp)
 	movq	$0,TF_ADDR(%rsp)
 	movq	$0,TF_ERR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
 	movq	%r11,TF_R11(%rsp)
 	movq	%r12,TF_R12(%rsp)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
 	movw	%fs,TF_FS(%rsp)
 	movw	%gs,TF_GS(%rsp)
 	movw	%es,TF_ES(%rsp)
 	movw	%ds,TF_DS(%rsp)
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	xorl	%ebx,%ebx		/* %ebx = 0: NMI taken from kernel mode */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jnz	nmi_fromuserspace
 	/*
 	 * We've interrupted the kernel.  Preserve GS.base in %r12.
 	 */
 	movl	$MSR_GSBASE,%ecx
 	rdmsr				/* %edx:%eax = current MSR_GSBASE */
 	movq	%rax,%r12
 	shlq	$32,%rdx
 	orq	%rdx,%r12		/* %r12 = high half from %edx | low half from %eax */
 	/* Retrieve and load the canonical value for GS.base. */
 	movq	TF_SIZE(%rsp),%rdx	/* quadword stored right after the trapframe */
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr				/* %ecx still holds MSR_GSBASE */
 	jmp	nmi_calltrap
 nmi_fromuserspace:
 	incl	%ebx			/* %ebx = 1: NMI taken from user mode */
 	swapgs
 /* Note: this label is also used by ddb and gdb: */
 nmi_calltrap:
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp,%rdi		/* %rdi = trapframe base for trap() */
 	call	trap
 	MEXITCOUNT
 #ifdef HWPMC_HOOKS
 	/*
 	 * Capture a userspace callchain if needed.
 	 *
 	 * - Check if the current trap was from user mode.
 	 * - Check if the current thread is valid.
 	 * - Check if the thread requires a user call chain to be
 	 *   captured.
 	 *
 	 * We are still in NMI mode at this point.
 	 */
 	testl	%ebx,%ebx
 	jz	nocallchain	/* not from userspace */
 	movq	PCPU(CURTHREAD),%rax
 	orq	%rax,%rax	/* curthread present? */
 	jz	nocallchain
 	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
 	jz	nocallchain
 	/*
 	 * A user callchain is to be captured, so:
 	 * - Move execution to the regular kernel stack, to allow for
 	 *   nested NMI interrupts.
 	 * - Take the processor out of "NMI" mode by faking an "iret".
 	 * - Enable interrupts, so that copyin() can work.
 	 */
 	movq	%rsp,%rsi	/* source stack pointer */
 	movq	$TF_SIZE,%rcx
 	movq	PCPU(RSP0),%rdx
 	subq	%rcx,%rdx	/* %rdx = PCPU(RSP0) - TF_SIZE */
 	movq	%rdx,%rdi	/* destination stack pointer */
 
 	shrq	$3,%rcx		/* trap frame size in long words */
 	cld
 	rep
 	movsq			/* copy trapframe */
 
 	movl	%ss,%eax
 	pushq	%rax		/* tf_ss */
 	pushq	%rdx		/* tf_rsp (on kernel stack) */
 	pushfq			/* tf_rflags */
 	movl	%cs,%eax
 	pushq	%rax		/* tf_cs */
 	pushq	$outofnmi	/* tf_rip */
 	iretq			/* continues at outofnmi with %rsp = the copied frame */
 outofnmi:
 	/*
 	 * At this point the processor has exited NMI mode and is running
 	 * with interrupts turned off on the normal kernel stack.
 	 *
 	 * If a pending NMI gets recognized at or after this point, it
 	 * will cause a kernel callchain to be traced.
 	 *
 	 * We turn interrupts back on, and call the user callchain capture hook.
 	 */
 	movq	pmc_hook,%rax
 	orq	%rax,%rax		/* is a hook installed? */
 	jz	nocallchain
 	movq	PCPU(CURTHREAD),%rdi		/* thread */
 	movq	$PMC_FN_USER_CALLCHAIN,%rsi	/* command */
 	movq	%rsp,%rdx			/* frame */
 	sti
 	call	*%rax			/* call the function stored in pmc_hook */
 	cli
 nocallchain:
 #endif
 	testl	%ebx,%ebx
 	jnz	doreti_exit		/* from user mode: return through doreti_exit */
 nmi_kernelexit:
 	/*
 	 * Put back the preserved MSR_GSBASE value.
 	 */
 	movl	$MSR_GSBASE,%ecx
 	movq	%r12,%rdx
 	movl	%edx,%eax		/* low half of %r12 */
 	shrq	$32,%rdx		/* high half of %r12 */
 	wrmsr
 nmi_restoreregs:
 	movq	TF_RDI(%rsp),%rdi
 	movq	TF_RSI(%rsp),%rsi
 	movq	TF_RDX(%rsp),%rdx
 	movq	TF_RCX(%rsp),%rcx
 	movq	TF_R8(%rsp),%r8
 	movq	TF_R9(%rsp),%r9
 	movq	TF_RAX(%rsp),%rax
 	movq	TF_RBX(%rsp),%rbx
 	movq	TF_RBP(%rsp),%rbp
 	movq	TF_R10(%rsp),%r10
 	movq	TF_R11(%rsp),%r11
 	movq	TF_R12(%rsp),%r12
 	movq	TF_R13(%rsp),%r13
 	movq	TF_R14(%rsp),%r14
 	movq	TF_R15(%rsp),%r15
 	addq	$TF_RIP,%rsp		/* drop everything below tf_rip */
 	jmp	doreti_iret		/* finish with the shared iretq */
 
 ENTRY(fork_trampoline)		/* calls fork_exit(%r12, %rbx, %rsp), then leaves via doreti */
 	movq	%r12,%rdi		/* function */
 	movq	%rbx,%rsi		/* arg1 */
 	movq	%rsp,%rdx		/* trapframe pointer */
 	call	fork_exit
 	MEXITCOUNT
 	jmp	doreti			/* Handle any ASTs */
 
 /*
  * To efficiently implement classification of trap and interrupt handlers
  * for profiling, there must be only trap handlers between the labels btrap
  * and bintr, and only interrupt handlers between the labels bintr and
  * eintr.  This is implemented (partly) by including files that contain
  * some of the handlers.  Before including the files, set up a normal asm
  * environment so that the included files don't need to know that they are
  * included.
  */
 
 #ifdef COMPAT_FREEBSD32
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 
 #include <amd64/ia32/ia32_exception.S>
 #endif
 
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 MCOUNT_LABEL(bintr)
 
 #include <amd64/amd64/apic_vector.S>
 
 #ifdef DEV_ATPIC
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 
 #include <amd64/amd64/atpic_vector.S>
 #endif
 
 	.text
 MCOUNT_LABEL(eintr)
 
 /*
  * void doreti(struct trapframe)
  *
  * Handle return from interrupts, traps and syscalls.
  */
 	.text
 	SUPERALIGN_TEXT
 	.type	doreti,@function
 doreti:				/* expects %rsp to point at the trapframe */
 	FAKE_MCOUNT($bintr)		/* init "from" bintr -> doreti */
 	/*
 	 * Check if ASTs can be handled now.
 	 */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* are we returning to user mode? */
 	jz	doreti_exit		/* can't handle ASTs now if not */
 
 doreti_ast:
 	/*
 	 * Check for ASTs atomically with returning.  Disabling CPU
 	 * interrupts provides sufficient locking even in the SMP case,
 	 * since we will be informed of any new ASTs by an IPI.
 	 */
 	cli
 	movq	PCPU(CURTHREAD),%rax
 	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
 	je	doreti_exit		/* neither flag set: exit with interrupts still off */
 	sti
 	movq	%rsp,%rdi	/* pass a pointer to the trapframe */
 	call	ast
 	jmp	doreti_ast		/* test the flags again after ast() */
 
 	/*
 	 * doreti_exit:	pop registers, iret.
 	 *
 	 *	The segment register pop is a special case, since it may
 	 *	fault if (for example) a sigreturn specifies bad segment
 	 *	registers.  The fault is handled in trap.c.
 	 */
 doreti_exit:
 	MEXITCOUNT
 	movq	PCPU(CURPCB),%r8	/* %r8 = curpcb until ld_regs reloads %r8 */
 
 	/*
 	 * Do not reload segment registers for kernel.
 	 * Since we do not reload segments registers with sane
 	 * values on kernel entry, descriptors referenced by
 	 * segments registers might be not valid.  This is fatal
 	 * for user mode, but is not a problem for the kernel.
 	 */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	ld_regs			/* returning to kernel mode */
 	testl	$PCB_FULL_IRET,PCB_FLAGS(%r8)
 	jz	ld_regs			/* PCB_FULL_IRET clear: skip the segment reloads */
 	testl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	je	set_segs		/* TF_HASSEGS clear: store default selectors first */
 
 do_segs:
 	/* Restore %fs and fsbase */
 	movw	TF_FS(%rsp),%ax
 	.globl	ld_fs
 ld_fs:
 	movw	%ax,%fs
 	cmpw	$KUF32SEL,%ax
 	jne	1f			/* MSR_FSBASE is written only for KUF32SEL */
 	movl	$MSR_FSBASE,%ecx
 	movl	PCB_FSBASE(%r8),%eax
 	movl	PCB_FSBASE+4(%r8),%edx
 	.globl	ld_fsbase
 ld_fsbase:
 	wrmsr
 1:
 	/* Restore %gs and gsbase */
 	movw	TF_GS(%rsp),%si
 	pushfq				/* flags are restored by the popfq below */
 	cli
 	movl	$MSR_GSBASE,%ecx
 	rdmsr				/* keep MSR_GSBASE in %edx:%eax ... */
 	.globl	ld_gs
 ld_gs:
 	movw	%si,%gs
 	wrmsr				/* ... and write it back after the %gs load */
 	popfq
 	cmpw	$KUG32SEL,%si
 	jne	1f			/* MSR_KGSBASE is written only for KUG32SEL */
 	movl	$MSR_KGSBASE,%ecx
 	movl	PCB_GSBASE(%r8),%eax
 	movl	PCB_GSBASE+4(%r8),%edx
 	.globl	ld_gsbase
 ld_gsbase:
 	wrmsr
 1:
 	.globl	ld_es
 ld_es:
 	movw	TF_ES(%rsp),%es
 	.globl	ld_ds
 ld_ds:
 	movw	TF_DS(%rsp),%ds
 ld_regs:
 	movq	TF_RDI(%rsp),%rdi
 	movq	TF_RSI(%rsp),%rsi
 	movq	TF_RDX(%rsp),%rdx
 	movq	TF_RCX(%rsp),%rcx
 	movq	TF_R8(%rsp),%r8
 	movq	TF_R9(%rsp),%r9
 	movq	TF_RAX(%rsp),%rax
 	movq	TF_RBX(%rsp),%rbx
 	movq	TF_RBP(%rsp),%rbp
 	movq	TF_R10(%rsp),%r10
 	movq	TF_R11(%rsp),%r11
 	movq	TF_R12(%rsp),%r12
 	movq	TF_R13(%rsp),%r13
 	movq	TF_R14(%rsp),%r14
 	movq	TF_R15(%rsp),%r15
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	1f			/* keep running with kernel GS.base */
 	cli
 	swapgs
 1:
 	addq	$TF_RIP,%rsp		/* skip over tf_err, tf_trapno */
 	.globl	doreti_iret
 doreti_iret:
 	iretq
 
 set_segs:			/* fill the frame with default selectors, then do_segs */
 	movw	$KUDSEL,%ax
 	movw	%ax,TF_DS(%rsp)
 	movw	%ax,TF_ES(%rsp)
 	movw	$KUF32SEL,TF_FS(%rsp)
 	movw	$KUG32SEL,TF_GS(%rsp)
 	jmp	do_segs
 
 	/*
 	 * doreti_iret_fault.  Alternative return code for
 	 * the case where we get a fault in the doreti_exit code
 	 * above.  trap() (amd64/amd64/trap.c) catches this specific
 	 * case, sends the process a signal and continues in the
 	 * corresponding place in the code below.
 	 */
 	ALIGN_TEXT
 	.globl	doreti_iret_fault
 doreti_iret_fault:		/* rebuilds a T_PROTFLT frame from live registers, then calltrap */
 	subq	$TF_RIP,%rsp		/* space including tf_err, tf_trapno */
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	1f
 	sti
 1:
 	movw	%fs,TF_FS(%rsp)
 	movw	%gs,TF_GS(%rsp)
 	movw	%es,TF_ES(%rsp)
 	movw	%ds,TF_DS(%rsp)
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
 	movq	%r11,TF_R11(%rsp)
 	movq	%r12,TF_R12(%rsp)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	movq	$0,TF_ERR(%rsp)	/* XXX should be the error code */
 	movq	$0,TF_ADDR(%rsp)
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	jmp	calltrap		/* hand the rebuilt frame to trap() */
 
 	ALIGN_TEXT
 	.globl	ds_load_fault
 ds_load_fault:			/* NOTE(review): presumably resumed here by trap() when ld_ds faults - confirm */
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)	/* enable interrupts if the frame had them on */
 	jz	1f
 	sti
 1:
 	movq	%rsp,%rdi		/* %rdi = trapframe base for trap() */
 	call	trap
 	movw	$KUDSEL,TF_DS(%rsp)	/* replace the saved %ds selector with KUDSEL */
 	jmp	doreti			/* and attempt the return again */
 
 	ALIGN_TEXT
 	.globl	es_load_fault
 es_load_fault:			/* NOTE(review): presumably resumed here by trap() when ld_es faults - confirm */
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)	/* enable interrupts if the frame had them on */
 	jz	1f
 	sti
 1:
 	movq	%rsp,%rdi		/* %rdi = trapframe base for trap() */
 	call	trap
 	movw	$KUDSEL,TF_ES(%rsp)	/* replace the saved %es selector with KUDSEL */
 	jmp	doreti			/* and attempt the return again */
 
 	ALIGN_TEXT
 	.globl	fs_load_fault
 fs_load_fault:			/* NOTE(review): presumably resumed here by trap() when ld_fs faults - confirm */
 	testl	$PSL_I,TF_RFLAGS(%rsp)	/* enable interrupts if the frame had them on */
 	jz	1f
 	sti
 1:
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	movq	%rsp,%rdi		/* %rdi = trapframe base for trap() */
 	call	trap
 	movw	$KUF32SEL,TF_FS(%rsp)	/* replace the saved %fs selector with KUF32SEL */
 	jmp	doreti			/* and attempt the return again */
 
 	ALIGN_TEXT
 	.globl	gs_load_fault
 gs_load_fault:			/* NOTE(review): presumably resumed here by trap() when ld_gs faults - confirm */
 	popfq				/* presumably balances the pushfq done before ld_gs */
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)	/* enable interrupts if the frame had them on */
 	jz	1f
 	sti
 1:
 	movq	%rsp,%rdi		/* %rdi = trapframe base for trap() */
 	call	trap
 	movw	$KUG32SEL,TF_GS(%rsp)	/* replace the saved %gs selector with KUG32SEL */
 	jmp	doreti			/* and attempt the return again */
 
 	ALIGN_TEXT
 	.globl	fsbase_load_fault
 fsbase_load_fault:		/* NOTE(review): presumably resumed here by trap() when the wrmsr at ld_fsbase faults - confirm */
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)	/* enable interrupts if the frame had them on */
 	jz	1f
 	sti
 1:
 	movq	%rsp,%rdi		/* %rdi = trapframe base for trap() */
 	call	trap
 	movq	PCPU(CURTHREAD),%r8
 	movq	TD_PCB(%r8),%r8		/* %r8 = pcb of curthread */
 	movq	$0,PCB_FSBASE(%r8)	/* zero the fsbase saved in the pcb */
 	jmp	doreti			/* and attempt the return again */
 
 	ALIGN_TEXT
 	.globl	gsbase_load_fault
 gsbase_load_fault:		/* NOTE(review): presumably resumed here by trap() when the wrmsr at ld_gsbase faults - confirm */
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)	/* enable interrupts if the frame had them on */
 	jz	1f
 	sti
 1:
 	movq	%rsp,%rdi		/* %rdi = trapframe base for trap() */
 	call	trap
 	movq	PCPU(CURTHREAD),%r8
 	movq	TD_PCB(%r8),%r8		/* %r8 = pcb of curthread */
 	movq	$0,PCB_GSBASE(%r8)	/* zero the gsbase saved in the pcb */
 	jmp	doreti			/* and attempt the return again */
 
 #ifdef HWPMC_HOOKS
 	ENTRY(end_exceptions)	/* paired with ENTRY(start_exceptions) near the top of this file */
 #endif
Index: head/sys/amd64/amd64/machdep.c
===================================================================
--- head/sys/amd64/amd64/machdep.c	(revision 258540)
+++ head/sys/amd64/amd64/machdep.c	(revision 258541)
@@ -1,2595 +1,2594 @@
 /*-
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_atalk.h"
 #include "opt_atpic.h"
 #include "opt_compat.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_ipx.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_perfmon.h"
 #include "opt_platform.h"
 #include "opt_sched.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <net/netisr.h>
 
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/mp_watchdog.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/proc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #ifdef PERFMON
 #include <machine/perfmon.h>
 #endif
 #include <machine/tss.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #ifdef FDT
 #include <x86/fdt.h>
 #endif
 
 #ifdef DEV_ATPIC
 #include <x86/isa/icu.h>
 #else
 #include <machine/apicvar.h>
 #endif
 
 #include <isa/isareg.h>
 #include <isa/rtc.h>
 
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
 
 extern void printcpuinfo(void);	/* XXX header file */
 extern void identify_cpu(void);
 extern void panicifcpuunsupported(void);
 
 /* True when the privilege level extracted from selector cs by ISPL() is SEL_UPL. */
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 /* True when ef and oef differ only in bits covered by PSL_USERCHANGE. */
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 
 static void cpu_startup(void *);
 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpusave, size_t xfpusave_len);
 static int  set_fpcontext(struct thread *td, const mcontext_t *mcp,
     char *xfpustate, size_t xfpustate_len);
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /*
  * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its value is
  * the physical address at which the kernel is loaded.
  */
 extern char kernphys[];
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 struct msgbuf *msgbufp;
 
 /*
  * Intel ICH registers.  ICH_SMI_EN is the I/O port at offset 0x30 from
  * ICH_PMBASE; the expansion is parenthesized so the macro stays a single
  * operand when used inside a larger expression.
  */
 #define ICH_PMBASE	0x400
 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
 
 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
 
 int cold = 1;
 
 long Maxmem = 0;
 long realmem = 0;
 
 /*
  * The number of PHYSMAP entries must be one less than the number of
  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  * physical address that is accessible by ISA DMA is split into two
  * PHYSSEG entries.
  */
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 
 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
 
 /* must be 2 less so 0 0 can signal end of chunks */
 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
 
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 struct region_descriptor r_gdt, r_idt;
 
 struct pcpu __pcpu[MAXCPU];
 
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
 struct mtx dt_lock;	/* lock for GDT and LDT */
 
 /*
  * Machine-dependent startup hook, registered through SYSINIT at
  * SI_SUB_CPU/SI_ORDER_FIRST.  Applies the MacBook SMI quirk, starts the
  * RTC, prints CPU and memory information, initializes the kernel submaps
  * and buffer subsystem, and finally programs CR0 via cpu_setregs().
  * The dummy argument is unused.
  */
 static void
 cpu_startup(dummy)
 	void *dummy;
 {
 	uintmax_t memsize;
 	char *sysenv;
 
 	/*
 	 * On MacBooks, we need to disallow the legacy USB circuit to
 	 * generate an SMI# because this can cause several problems,
 	 * namely: incorrect CPU frequency detection and failure to
 	 * start the APs.
 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
 	 * Enable register) of the Intel ICH LPC Interface Bridge.
 	 */
 	sysenv = getenv("smbios.system.product");
 	if (sysenv != NULL) {
 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
 			if (bootverbose)
 				printf("Disabling LEGACY_USB_EN bit on "
 				    "Intel ICH.\n");
 			/* Clear bit 3 (mask 0x8) of the SMI_EN port. */
 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
 		}
 		freeenv(sysenv);
 	}
 
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 	panicifcpuunsupported();
 #ifdef PERFMON
 	perfmon_init();
 #endif
 
 	/*
 	 * Display physical memory if SMBIOS reports reasonable amount.
 	 * The environment value is parsed as decimal and shifted left by
 	 * 10 (presumably it is expressed in KB -- confirm).
 	 */
 	memsize = 0;
 	sysenv = getenv("smbios.memory.enabled");
 	if (sysenv != NULL) {
 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
 		freeenv(sysenv);
 	}
 	/* Fall back to Maxmem when the SMBIOS figure is below the free page total. */
 	if (memsize < ptoa((uintmax_t)cnt.v_free_count))
 		memsize = ptoa((uintmax_t)Maxmem);
 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
 	realmem = atop(memsize);
 
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		/* phys_avail[] holds (start, end) pairs; a zero end terminates the list. */
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)cnt.v_free_count),
 	    ptoa((uintmax_t)cnt.v_free_count) / 1048576);
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	cpu_setregs();
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by call
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  *
  * catcher is the handler address, ksi describes the signal and mask is
  * the signal mask saved into the frame.  Must be entered with the process
  * lock and ps_mtx held (both are asserted); they are dropped around the
  * copyout to user memory and re-acquired before returning.  If the
  * copyout fails the thread is terminated through sigexit(td, SIGILL).
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct pcb *pcb;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	char *xfpusave;
 	size_t xfpusave_len;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	pcb = td->td_pcb;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 	/*
 	 * Reserve a kernel-stack scratch buffer for the extended FPU state
 	 * that does not fit in struct savefpu, when xsave is in use.
 	 */
 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 		xfpusave = __builtin_alloca(xfpusave_len);
 	} else {
 		xfpusave_len = 0;
 		xfpusave = NULL;
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	/* Copies the whole trapframe; relies on mcontext matching its layout from mc_rdi on. */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
 	fpstate_drop(td);
 	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
 	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
 	bzero(sf.sf_uc.uc_mcontext.mc_spare,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Start at the top of the alternate signal stack. */
 		sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		/* Stay 128 bytes below the user %rsp (presumably the ABI red zone -- confirm). */
 		sp = (char *)regs->tf_rsp - 128;
 	if (xfpusave != NULL) {
 		sp -= xfpusave_len;
 		/* Align the extended FPU save area down to 64 bytes. */
 		sp = (char *)((unsigned long)sp & ~0x3Ful);
 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
 	}
 	sp -= sizeof(struct sigframe);
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	regs->tf_rdi = sig;			/* arg 1 in %rdi */
 	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Drop both locks for the duration of the copyout below. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    (xfpusave != NULL && copyout(xfpusave,
 	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
 	    != 0)) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the signal trampoline on the new frame. */
 	regs->tf_rsp = (long)sfp;
 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 	/* Restore the lock state the caller entered with. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * Returns EJUSTRETURN on success.  Returns the copyin() error when the
  * context or the extended FPU state cannot be read, EINVAL when mc_flags,
  * rflags, %cs or the extended FPU state length are rejected, and the
  * set_fpcontext() result when that call fails.  A rejected %cs also
  * posts SIGBUS to the thread.
  *
  * MPSAFE
  */
 int
 sys_sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
 	struct pcb *pcb;
 	struct proc *p;
 	struct trapframe *regs;
 	ucontext_t *ucp;
 	char *xfpustate;
 	size_t xfpustate_len;
 	long rflags;
 	int cs, error, ret;
 	ksiginfo_t ksi;
 
 	pcb = td->td_pcb;
 	p = td->td_proc;
 
 	/* Work on a kernel copy of the user context from here on. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0) {
 		uprintf("pid %d (%s): sigreturn copyin failed\n",
 		    p->p_pid, td->td_name);
 		return (error);
 	}
 	ucp = &uc;
 	/* Reject any flag bit outside _MC_FLAG_MASK. */
 	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
 		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
 		    td->td_name, ucp->uc_mcontext.mc_flags);
 		return (EINVAL);
 	}
 	regs = td->td_frame;
 	rflags = ucp->uc_mcontext.mc_rflags;
 	/*
 	 * Don't allow users to change privileged or reserved flags.
 	 */
 	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
 		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
 		    td->td_name, rflags);
 		return (EINVAL);
 	}
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 	cs = ucp->uc_mcontext.mc_cs;
 	if (!CS_SECURE(cs)) {
 		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
 		    td->td_name, cs);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 
 	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
 		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
 		/* Bound the user-supplied length before it sizes the alloca below. */
 		if (xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu)) {
 			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
 			    p->p_pid, td->td_name, xfpustate_len);
 			return (EINVAL);
 		}
 		xfpustate = __builtin_alloca(xfpustate_len);
 		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
 		    xfpustate, xfpustate_len);
 		if (error != 0) {
 			uprintf(
 	"pid %d (%s): sigreturn copying xfpustate failed\n",
 			    p->p_pid, td->td_name);
 			return (error);
 		}
 	} else {
 		xfpustate = NULL;
 		xfpustate_len = 0;
 	}
 	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
 	if (ret != 0) {
 		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
 		    p->p_pid, td->td_name, ret);
 		return (ret);
 	}
 	/* All checks passed: overwrite the trapframe with the saved registers. */
 	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
 	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
 	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
 
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatibility entry point: hands the argument block to
  * sys_sigreturn() unchanged and returns its result.
  */
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 	struct sigreturn_args *args;
 
 	args = (struct sigreturn_args *)uap;
 	return (sys_sigreturn(td, args));
 }
 #endif
 
 
 /*
  * Machine dependent boot() routine.
  *
  * Currently a no-op: the howto argument is ignored and nothing is done.
  * Possibly some stuff might be grafted back here from boot().
  */
 void
 cpu_boot(int howto)
 {
 }
 
 /*
  * Flush the D-cache for non-DMA I/O so that the I-cache can
  * be made coherent later.
  *
  * No-op in this file: ptr and len are ignored.
  */
 void
 cpu_flush_dcache(void *ptr, size_t len)
 {
 	/* Not applicable */
 }
 
 /*
  * Get current clock frequency for the given cpu id.
  *
  * Stores the estimate in *rate and returns 0.  Returns EINVAL when cpu_id
  * has no pcpu entry or rate is NULL, and EOPNOTSUPP when the TSC is
  * invariant but tsc_perf_stat is not set.  The estimate is taken from TSC
  * ticks counted across DELAY(1000) with interrupts disabled.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	uint64_t tsc1, tsc2;
 	uint64_t acnt, mcnt, perf;
 	register_t reg;
 
 	if (pcpu_find(cpu_id) == NULL || rate == NULL)
 		return (EINVAL);
 
 	/*
 	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
 	 * DELAY(9) based logic fails.
 	 */
 	if (tsc_is_invariant && !tsc_perf_stat)
 		return (EOPNOTSUPP);
 
 #ifdef SMP
 	if (smp_cpus > 1) {
 		/* Schedule ourselves on the indicated cpu. */
 		thread_lock(curthread);
 		sched_bind(curthread, cpu_id);
 		thread_unlock(curthread);
 	}
 #endif
 
 	/* Calibrate by measuring a short delay. */
 	reg = intr_disable();
 	if (tsc_is_invariant) {
 		/* Zero both counters so the reads below cover only the delay. */
 		wrmsr(MSR_MPERF, 0);
 		wrmsr(MSR_APERF, 0);
 		tsc1 = rdtsc();
 		DELAY(1000);
 		mcnt = rdmsr(MSR_MPERF);
 		acnt = rdmsr(MSR_APERF);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		/* NOTE(review): divides by mcnt with no zero check -- confirm MPERF always advances. */
 		perf = 1000 * acnt / mcnt;
 		*rate = (tsc2 - tsc1) * perf;
 	} else {
 		tsc1 = rdtsc();
 		DELAY(1000);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		/* Scale the tick count by 1000 (presumably a 1 ms delay to Hz -- confirm DELAY units). */
 		*rate = (tsc2 - tsc1) * 1000;
 	}
 
 #ifdef SMP
 	if (smp_cpus > 1) {
 		thread_lock(curthread);
 		sched_unbind(curthread);
 		thread_unlock(curthread);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Park the CPU for good: issue halt() over and over.  Never returns.
  */
 void
 cpu_halt(void)
 {
 
 	while (1)
 		halt();
 }
 
 void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
 static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
 static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
 TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
     0, "Use MONITOR/MWAIT for short idle");
 
 #define	STATE_RUNNING	0x0
 #define	STATE_MWAIT	0x1
 #define	STATE_SLEEPING	0x2
 
 /*
  * Idle method that defers to cpu_idle_hook when one is installed and
  * otherwise halts.  Marks this CPU's monitor buffer STATE_SLEEPING while
  * idle and STATE_RUNNING on the way out.  sbt is passed through to the
  * hook untouched.
  */
 static void
 cpu_idle_acpi(sbintime_t sbt)
 {
 	int *state;
 
 	state = (int *)PCPU_PTR(monitorbuf);
 	*state = STATE_SLEEPING;
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable())
 		enable_intr();
 	else if (cpu_idle_hook)
 		/* Called with interrupts still disabled. */
 		cpu_idle_hook(sbt);
 	else
 		__asm __volatile("sti; hlt");
 	*state = STATE_RUNNING;
 }
 
 /*
  * Idle method that halts the CPU with "sti; hlt" unless a thread is
  * already runnable.  Marks this CPU's monitor buffer STATE_SLEEPING while
  * idle and STATE_RUNNING on the way out.  sbt is unused.
  */
 static void
 cpu_idle_hlt(sbintime_t sbt)
 {
 	int *state;
 
 	state = (int *)PCPU_PTR(monitorbuf);
 	*state = STATE_SLEEPING;
 
 	/*
 	 * Since we may be in a critical section from cpu_idle(), if
 	 * an interrupt fires during that critical section we may have
 	 * a pending preemption.  If the CPU halts, then that thread
 	 * may not execute until a later interrupt awakens the CPU.
 	 * To handle this race, check for a runnable thread after
 	 * disabling interrupts and immediately return if one is
 	 * found.  Also, we must absolutely guarantee that hlt is
 	 * the next instruction after sti.  This ensures that any
 	 * interrupt that fires after the call to disable_intr() will
 	 * immediately awaken the CPU from hlt.  Finally, please note
 	 * that on x86 this works fine because interrupts are enabled
 	 * only after the instruction following sti has executed, while
 	 * IF is set to 1 immediately, allowing the hlt instruction to
 	 * acknowledge the interrupt.
 	 */
 	disable_intr();
 	if (sched_runnable())
 		enable_intr();
 	else
 		__asm __volatile("sti; hlt");
 	*state = STATE_RUNNING;
 }
 
 /*
  * MWAIT cpu power states.  Lower 4 bits are sub-states.
  */
 #define	MWAIT_C0	0xf0
 #define	MWAIT_C1	0x00
 #define	MWAIT_C2	0x10
 #define	MWAIT_C3	0x20
 #define	MWAIT_C4	0x30
 
 /*
  * Idle method built on MONITOR/MWAIT.  Marks this CPU's monitor buffer
  * STATE_MWAIT, arms the monitor on that buffer and waits with the MWAIT_C1
  * hint unless the buffer was changed in the meantime.  The buffer is set
  * back to STATE_RUNNING before returning.  sbt is unused.
  */
 static void
 cpu_idle_mwait(sbintime_t sbt)
 {
 	int *state;
 
 	state = (int *)PCPU_PTR(monitorbuf);
 	*state = STATE_MWAIT;
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable()) {
 		enable_intr();
 		*state = STATE_RUNNING;
 		return;
 	}
 	cpu_monitor(state, 0, 0);
 	/* Only wait if nobody rewrote the state since it was set above. */
 	if (*state == STATE_MWAIT)
 		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
 	else
 		enable_intr();
 	*state = STATE_RUNNING;
 }
 
 /*
  * Idle method that busy-waits: polls sched_runnable() up to 1000 times,
  * pausing with cpu_spinwait() between polls, and returns as soon as a
  * thread is runnable.  The monitor buffer stays STATE_RUNNING throughout.
  * sbt is unused.
  */
 static void
 cpu_idle_spin(sbintime_t sbt)
 {
 	int *monitor;
 	int spins;
 
 	monitor = (int *)PCPU_PTR(monitorbuf);
 	*monitor = STATE_RUNNING;
 
 	/*
 	 * Polling sched_runnable() here is racy, but because we loop,
 	 * missing one update costs very little, and it is far better
 	 * than not checking at all.
 	 */
 	spins = 0;
 	while (spins < 1000 && !sched_runnable()) {
 		cpu_spinwait();
 		spins++;
 	}
 }
 
 /*
  * C1E renders the local APIC timer dead, so we disable it by
  * reading the Interrupt Pending Message register and clearing
  * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
  * 
  * Reference:
  *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
  *   #32559 revision 3.00+
  */
 #define	MSR_AMDK8_IPM		0xc0010055
 #define	AMDK8_SMIONCMPHALT	(1ULL << 27)
 #define	AMDK8_C1EONCMPHALT	(1ULL << 28)
 #define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
 
 /*
  * Set cpu_ident_amdc1e when the CPU may support C1E: vendor is AMD, the
  * family field of cpu_id is 0xf, and the bits under mask 0x0fff0000 are
  * at least 0x00040000 (mostly the latest dual-core or later k8 parts).
  */
 static void
 cpu_probe_amdc1e(void)
 {
 
 	if (cpu_vendor_id != CPU_VENDOR_AMD)
 		return;
 	if ((cpu_id & 0x00000f00) != 0x00000f00)
 		return;
 	if ((cpu_id & 0x0fff0000) < 0x00040000)
 		return;
 	cpu_ident_amdc1e = 1;
 }
 
 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
 
 /*
  * Idle the current CPU.  When busy is non-zero and MONITOR/MWAIT is both
  * available and enabled by idle_mwait, a short mwait idle is used and the
  * rest is skipped.  Otherwise the method selected in cpu_idle_fn is
  * called; for busy == 0 that call is bracketed by a critical section and
  * by cpu_idleclock()/cpu_activeclock().
  */
 void
 cpu_idle(int busy)
 {
 	uint64_t msr;
 	sbintime_t sbt = -1;
 
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
 	    busy, curcpu);
 #ifdef MP_WATCHDOG
 	ap_watchdog(PCPU_GET(cpuid));
 #endif
 	/* If we are busy - try to use fast methods. */
 	if (busy) {
 		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
 			/*
 			 * NOTE(review): busy (an int) is passed where an
 			 * sbintime_t is expected; cpu_idle_mwait() ignores
 			 * the argument, so this is harmless today.
 			 */
 			cpu_idle_mwait(busy);
 			goto out;
 		}
 	}
 
 	/* If we have time - switch timers into idle mode. */
 	if (!busy) {
 		critical_enter();
 		sbt = cpu_idleclock();
 	}
 
 	/* Apply AMD APIC timer C1E workaround. */
 	if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
 		msr = rdmsr(MSR_AMDK8_IPM);
 		/* Rewrite the MSR only when one of the two bits is set. */
 		if (msr & AMDK8_CMPHALT)
 			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
 	}
 
 	/* Call main idle method. */
 	cpu_idle_fn(sbt);
 
 	/* Switch timers back into active mode. */
 	if (!busy) {
 		cpu_activeclock();
 		critical_exit();
 	}
 out:
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
 	    busy, curcpu);
 }
 
 /*
  * Inspect the idle state recorded in the monitor buffer of the given cpu.
  * Returns 0 when that CPU is in STATE_SLEEPING and 1 otherwise.  When it
  * is in STATE_MWAIT the buffer is rewritten to STATE_RUNNING; presumably
  * that store is what wakes the CPU out of mwait, since cpu_idle_mwait()
  * arms the monitor on the same buffer -- confirm.
  */
 int
 cpu_idle_wakeup(int cpu)
 {
 	struct pcpu *pcpu;
 	int *state;
 
 	/* NOTE(review): the pcpu_find() result is used without a NULL check. */
 	pcpu = pcpu_find(cpu);
 	state = (int *)pcpu->pc_monitorbuf;
 	/*
 	 * This doesn't need to be atomic since missing the race will
 	 * simply result in unnecessary IPIs.
 	 */
 	if (*state == STATE_SLEEPING)
 		return (0);
 	if (*state == STATE_MWAIT)
 		*state = STATE_RUNNING;
 	return (1);
 }
 
 /*
  * Table of selectable idle methods, ordered by speed/power consumption.
  * The list ends at the entry whose id_name is NULL.  The sysctl handlers
  * in this file match id_name against user input and copy id_fn into
  * cpu_idle_fn.
  */
 struct {
 	void	*id_fn;		/* idle routine, stored as an untyped pointer */
 	char	*id_name;	/* name shown and accepted through sysctl */
 } idle_tbl[] = {
 	{ cpu_idle_spin, "spin" },
 	{ cpu_idle_mwait, "mwait" },
 	{ cpu_idle_hlt, "hlt" },
 	{ cpu_idle_acpi, "acpi" },
 	{ NULL, NULL }
 };
 
 /*
  * Sysctl handler reporting the usable idle methods as a ", "-separated
  * string.  Entries whose name contains "mwait" are left out when the CPU
  * lacks CPUID2_MON, and "acpi" is left out when no cpu_idle_hook is set.
  * Returns the result of sysctl_handle_string().
  */
 static int
 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
 {
 	char *list, *cursor;
 	const char *sep;
 	int idx, rv;
 
 	list = malloc(256, M_TEMP, M_WAITOK);
 	cursor = list;
 	for (idx = 0; idle_tbl[idx].id_name != NULL; idx++) {
 		/* Leave out methods this machine cannot use. */
 		if ((cpu_feature2 & CPUID2_MON) == 0 &&
 		    strstr(idle_tbl[idx].id_name, "mwait") != NULL)
 			continue;
 		if (cpu_idle_hook == NULL &&
 		    strcmp(idle_tbl[idx].id_name, "acpi") == 0)
 			continue;
 		/* Every entry after the first is preceded by a separator. */
 		sep = (cursor == list) ? "" : ", ";
 		cursor += sprintf(cursor, "%s%s", sep, idle_tbl[idx].id_name);
 	}
 	rv = sysctl_handle_string(oidp, list, 0, req);
 	free(list, M_TEMP);
 	return (rv);
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
     0, 0, idle_sysctl_available, "A", "list of available idle functions");
 
 /*
  * Sysctl handler for the currently selected idle method.  On read it
  * reports the idle_tbl name matching cpu_idle_fn, or "unknown".  On write
  * it switches cpu_idle_fn to the named method, provided that method is
  * usable on this machine.  Returns 0 on success, EINVAL when the name is
  * not an available method, or the sysctl_handle_string() error.
  */
 static int
 idle_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	char name[16];
 	char *cur;
 	int idx, rv;
 
 	/* Find the name of the method in use. */
 	cur = "unknown";
 	idx = 0;
 	while (idle_tbl[idx].id_name != NULL) {
 		if (idle_tbl[idx].id_fn == cpu_idle_fn) {
 			cur = idle_tbl[idx].id_name;
 			break;
 		}
 		idx++;
 	}
 	strncpy(name, cur, sizeof(name));
 	rv = sysctl_handle_string(oidp, name, sizeof(name), req);
 	if (rv != 0 || req->newptr == NULL)
 		return (rv);
 
 	/* A new value was supplied: look it up among the usable methods. */
 	for (idx = 0; idle_tbl[idx].id_name != NULL; idx++) {
 		if (strcmp(idle_tbl[idx].id_name, name) != 0)
 			continue;
 		if ((cpu_feature2 & CPUID2_MON) == 0 &&
 		    strstr(idle_tbl[idx].id_name, "mwait") != NULL)
 			continue;
 		if (cpu_idle_hook == NULL &&
 		    strcmp(idle_tbl[idx].id_name, "acpi") == 0)
 			continue;
 		cpu_idle_fn = idle_tbl[idx].id_fn;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
     idle_sysctl, "A", "currently selected idle function");
 
 /*
  * Reset registers to default values on exec.
  *
  * td is the thread being reset, imgp supplies the entry address and
  * stack is the initial user stack address.  Frees the process LDT if one
  * exists, clears the fs/gs bases, rebuilds the trapframe from scratch,
  * clears any debug registers in use and drops the FPU state.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *regs = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 
 	/*
 	 * dt_lock is only unlocked here on the no-LDT path; presumably
 	 * user_ldt_free() releases it itself -- confirm.
 	 */
 	mtx_lock(&dt_lock);
 	if (td->td_proc->p_md.md_ldt != NULL)
 		user_ldt_free(td);
 	else
 		mtx_unlock(&dt_lock);
 	
 	pcb->pcb_fsbase = 0;
 	pcb->pcb_gsbase = 0;
 	clear_pcb_flags(pcb, PCB_32BIT);
 	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_rip = imgp->entry_addr;
 	/* Round stack - 8 down to 16 bytes, then add 8 back. */
 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
 	regs->tf_rdi = stack;		/* argv */
 	/*
 	 * NOTE(review): tf_rflags was zeroed by the bzero() above, so the
 	 * PSL_T mask below always yields 0 -- confirm whether preserving
 	 * the trace flag was intended.
 	 */
 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
 	regs->tf_ss = _udatasel;
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 	td->td_retval[1] = 0;
 
 	/*
 	 * Reset the hardware debug registers if they were in use.
 	 * They won't have any meaning for the newly exec'd process.
 	 */
 	if (pcb->pcb_flags & PCB_DBREGS) {
 		pcb->pcb_dr0 = 0;
 		pcb->pcb_dr1 = 0;
 		pcb->pcb_dr2 = 0;
 		pcb->pcb_dr3 = 0;
 		pcb->pcb_dr6 = 0;
 		pcb->pcb_dr7 = 0;
 		if (pcb == curpcb) {
 			/*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
 			 * the next process we switch to.
 			 */
 			reset_dbregs();
 		}
 		clear_pcb_flags(pcb, PCB_DBREGS);
 	}
 
 	/*
 	 * Drop the FP state if we hold it, so that the process gets a
 	 * clean FP state if it uses the FPU again.
 	 */
 	fpstate_drop(td);
 }
 
 /*
  * Turn on the CR0 bits the kernel relies on, keeping every bit that is
  * already set.
  */
 void
 cpu_setregs(void)
 {
 
 	/*
 	 * npx_probe() also sets CR0_MP, CR0_NE and CR0_TS for the BSP;
 	 * the reasons for setting them are explained in the comments
 	 * there.
 	 */
 	load_cr0(rcr0() | CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM);
 }
 
 /*
  * Initialize amd64 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 
 struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
 static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
 
 static char nmi0_stack[PAGE_SIZE] __aligned(16);
 CTASSERT(sizeof(struct nmi_pcpu) == 16);
 
 struct amd64tss common_tss[MAXCPU];
 
 /*
  * Software prototypes -- in more palatable form.
  *
  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
  * slots as corresponding segments for i386 kernel.
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GNULL2_SEL	1 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUFS32_SEL	2 32 bit %fs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUGS32_SEL	3 32 bit %gs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GCODE_SEL	4 Code Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GDATA_SEL	5 Data Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = sizeof(struct amd64tss) + IOPAGES * PAGE_SIZE - 1,
 	.ssd_type = SDT_SYSTSS,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* Actually, the TSS is a system descriptor which is double size */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	11 LDT Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	12 LDT Descriptor, double size */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 };
 
 void
 setidt(idx, func, typ, dpl, ist)
 	int idx;
 	inthand_t *func;
 	int typ;
 	int dpl;
 	int ist;
 {
 	struct gate_descriptor *ip;
 
 	ip = idt + idx;
 	ip->gd_looffset = (uintptr_t)func;
 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
 	ip->gd_ist = ist;
 	ip->gd_xx = 0;
 	ip->gd_type = typ;
 	ip->gd_dpl = dpl;
 	ip->gd_p = 1;
 	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
 }
 
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(dblfault),
 #ifdef KDTRACE_HOOKS
 	IDTVEC(dtrace_ret),
 #endif
 #ifdef XENHVM
 	IDTVEC(xen_intr_upcall),
 #endif
 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
 
 #ifdef DDB
 /*
  * DDB "show idt": for every IDT slot whose handler is not the default
  * 'rsvd' entry point, print the slot number and the handler's symbol.
  * Stops early when the pager is quit.
  */
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *gate;
 	uintptr_t handler;
 	int vec;
 
 	for (vec = 0, gate = idt; vec < NIDT && !db_pager_quit;
 	    vec++, gate++) {
 		/* Reassemble the handler address from the two gate fields. */
 		handler = ((long)gate->gd_hioffset << 16 | gate->gd_looffset);
 		if (handler == (uintptr_t)&IDTVEC(rsvd))
 			continue;
 		db_printf("%3d\t", vec);
 		db_printsym(handler, DB_STGY_PROC);
 		db_printf("\n");
 	}
 }
 
 /*
  * Show privileged registers.
  *
  * DDB "show sysregs": prints IDTR, GDTR, LDTR, TR, CR0/CR2/CR3/CR4 and
  * the EFER, FEATURE_CONTROL, DEBUGCTL, PAT and GSBASE MSRs.
  */
 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 {
 	/* Memory image written by sidt/sgdt: 16-bit limit, then 64-bit base. */
 	struct {
 		uint16_t limit;
 		uint64_t base;
 	} __packed idtr, gdtr;
 	uint16_t ldt, tr;
 
 	__asm __volatile("sidt %0" : "=m" (idtr));
 	db_printf("idtr\t0x%016lx/%04x\n",
 	    (u_long)idtr.base, (u_int)idtr.limit);
 	__asm __volatile("sgdt %0" : "=m" (gdtr));
 	db_printf("gdtr\t0x%016lx/%04x\n",
 	    (u_long)gdtr.base, (u_int)gdtr.limit);
 	__asm __volatile("sldt %0" : "=r" (ldt));
 	db_printf("ldtr\t0x%04x\n", ldt);
 	__asm __volatile("str %0" : "=r" (tr));
 	db_printf("tr\t0x%04x\n", tr);
 	db_printf("cr0\t0x%016lx\n", rcr0());
 	db_printf("cr2\t0x%016lx\n", rcr2());
 	db_printf("cr3\t0x%016lx\n", rcr3());
 	db_printf("cr4\t0x%016lx\n", rcr4());
 	db_printf("EFER\t%016lx\n", rdmsr(MSR_EFER));
 	db_printf("FEATURES_CTL\t%016lx\n", rdmsr(MSR_IA32_FEATURE_CONTROL));
 	db_printf("DEBUG_CTL\t%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
 	db_printf("PAT\t%016lx\n", rdmsr(MSR_PAT));
 	db_printf("GSBASE\t%016lx\n", rdmsr(MSR_GSBASE));
 }
 #endif
 
 /*
  * Convert a hardware user segment descriptor (*sd) into the software
  * representation (*ssd).  The base and limit, which are split into
  * low and high parts in the hardware format, are reassembled; the
  * remaining attribute fields are copied one for one.
  *
  * NOTE(review): if the sd_hibase bit-field promotes to int, values
  * >= 0x80 are shifted into the sign bit before being widened to
  * ssd_base -- confirm against the declaration of
  * struct user_segment_descriptor.
  */
 void
 sdtossd(struct user_segment_descriptor *sd,
     struct soft_segment_descriptor *ssd)
 {
 
 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 	ssd->ssd_type  = sd->sd_type;
 	ssd->ssd_dpl   = sd->sd_dpl;
 	ssd->ssd_p     = sd->sd_p;
 	ssd->ssd_long  = sd->sd_long;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran  = sd->sd_gran;
 }
 
 /*
  * Convert a software segment descriptor (*ssd) into a hardware user
  * segment descriptor (*sd).  The base is split into its low 24 bits
  * and the next 8 bits, the limit into its low 16 bits and the next 4
  * bits; the remaining attribute fields are copied one for one.
  */
 void
 ssdtosd(struct soft_segment_descriptor *ssd,
     struct user_segment_descriptor *sd)
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_long  = ssd->ssd_long;
 	sd->sd_def32 = ssd->ssd_def32;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 /*
  * Convert a software segment descriptor (*ssd) into a hardware system
  * segment descriptor (*sd).  Unlike ssdtosd(), the high part of the
  * base keeps 40 bits (base bits 24 and up), and the ssd_long and
  * ssd_def32 fields are not copied.
  */
 void
 ssdtosyssd(struct soft_segment_descriptor *ssd,
     struct system_segment_descriptor *sd)
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
 #include <isa/isavar.h>
 #include <isa/isareg.h>
 /*
  * Return a bitmap of the interrupt requests currently pending.  The
  * value read from IO_ICU1 forms bits 0-7 and the value read from
  * IO_ICU2 forms bits 8-15.
  *
  * This is 8259-specific and is only suitable for use at probe time.
  * It exists only to pacify sio; it is NOT FATAL if it doesn't work.
  * It shouldn't be here: if needed at all, there should probably be an
  * APIC-centric implementation in the apic driver code.
  */
 intrmask_t
 isa_irq_pending(void)
 {
 	u_char low_irr, high_irr;
 	intrmask_t pending;
 
 	low_irr = inb(IO_ICU1);
 	high_irr = inb(IO_ICU2);
 	pending = (high_irr << 8) | low_irr;
 	return (pending);
 }
 #endif
 
 /*
  * Size in kilobytes of the physical memory segment that starts at
  * address 0; computed by getmemsize() from the physmap array.
  */
 u_int basemem;
 
 /*
  * Add the region [base, base + length) to the sorted physmap array.
  *
  * physmap holds (start, end) pairs; *physmap_idxp is the index of the
  * start element of the last pair in use.  A region that abuts an
  * existing pair is merged into it; otherwise a new pair is inserted at
  * the sorted position and *physmap_idxp is advanced by 2.
  *
  * Returns 1 if the caller may continue adding regions (this includes
  * zero-length regions and overlapping regions, which are ignored), or
  * 0 if the array has no room for another pair.  On a 0 return
  * *physmap_idxp is left unchanged so that it still names a valid pair.
  */
 static int
 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
     int *physmap_idxp)
 {
 	int i, insert_idx, physmap_idx;
 
 	physmap_idx = *physmap_idxp;
 
 	if (length == 0)
 		return (1);
 
 	/*
 	 * Find insertion point while checking for overlap.  Start off by
 	 * assuming the new entry will be added to the end.
 	 */
 	insert_idx = physmap_idx + 2;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (base < physmap[i + 1]) {
 			if (base + length <= physmap[i]) {
 				insert_idx = i;
 				break;
 			}
 			if (boothowto & RB_VERBOSE)
 				printf(
 		    "Overlapping memory regions, ignoring second region\n");
 			return (1);
 		}
 	}
 
 	/* See if we can prepend to the next entry. */
 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
 		physmap[insert_idx] = base;
 		return (1);
 	}
 
 	/* See if we can append to the previous entry. */
 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
 		physmap[insert_idx - 1] += length;
 		return (1);
 	}
 
 	/*
 	 * A new pair is required.  Check for room before publishing the
 	 * new index: the caller keeps using *physmap_idxp to index physmap
 	 * after a failure, so it must never be advanced to PHYSMAP_SIZE.
 	 */
 	if (physmap_idx + 2 >= PHYSMAP_SIZE) {
 		printf(
 		"Too many segments in the physical address map, giving up\n");
 		return (0);
 	}
 	physmap_idx += 2;
 	*physmap_idxp = physmap_idx;
 
 	/*
 	 * Move the last 'N' entries down to make room for the new
 	 * entry if needed.
 	 */
 	for (i = physmap_idx; i > insert_idx; i -= 2) {
 		physmap[i] = physmap[i - 2];
 		physmap[i + 1] = physmap[i - 1];
 	}
 
 	/* Insert the new entry. */
 	physmap[insert_idx] = base;
 	physmap[insert_idx + 1] = base + length;
 	return (1);
 }
 
 /*
  * Feed every SMAP_TYPE_MEMORY entry of the BIOS memory map (INT 15:E820)
  * at smapbase into the physmap array via add_physmap_entry().  Stops at
  * the first entry that add_physmap_entry() rejects with a 0 return.
  */
 static void
 add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap,
     int *physmap_idx)
 {
 	struct bios_smap *entry, *limit;
 	u_int32_t nbytes;
 
 	/*
 	 * subr_module.c says:
 	 * "Consumer may safely assume that size value precedes data."
 	 * ie: an int32_t holding the map size in bytes sits immediately
 	 * before the first entry.
 	 */
 	nbytes = ((u_int32_t *)smapbase)[-1];
 	limit = (struct bios_smap *)((uintptr_t)smapbase + nbytes);
 
 	for (entry = smapbase; entry < limit; entry++) {
 		if (boothowto & RB_VERBOSE)
 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
 			    entry->type, entry->base, entry->length);
 
 		/* Only usable-memory entries go into physmap. */
 		if (entry->type == SMAP_TYPE_MEMORY &&
 		    !add_physmap_entry(entry->base, entry->length, physmap,
 		    physmap_idx))
 			break;
 	}
 }
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  * The dump_avail array is built in the same pass; it also covers the
  * pages occupied by the kernel and by the dcons buffer, which are kept
  * out of phys_avail.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  * kmdp is the preloaded kernel's metadata, searched for the BIOS SMAP.
  * first is the first physical address past the memory already in use by
  * the kernel; it is handed to pmap_bootstrap(), which may advance it.
  *
  * On return Maxmem, physmem, phys_avail, dump_avail and msgbufp are set.
  * Panics if the loader supplied no SMAP or the SMAP has no segment
  * starting at address 0.
  *
  * XXX first should be vm_paddr_t.
  */
 static void
 getmemsize(caddr_t kmdp, u_int64_t first)
 {
 	int i, physmap_idx, pa_indx, da_indx;
 	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
 	u_long physmem_start, physmem_tunable, memtest;
 	pt_entry_t *pte;
 	struct bios_smap *smapbase;
 	quad_t dcons_addr, dcons_size;
 
 	bzero(physmap, sizeof(physmap));
 	basemem = 0;
 	physmap_idx = 0;
 
 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase == NULL)
 		panic("No BIOS smap info from loader!");
 
 	add_smap_entries(smapbase, physmap, &physmap_idx);
 
 	/*
 	 * Find the 'base memory' segment for SMP
 	 */
 	basemem = 0;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (physmap[i] == 0x00000000) {
 			basemem = physmap[i + 1] / 1024;
 			break;
 		}
 	}
 	if (basemem == 0)
 		panic("BIOS smap did not include a basemem segment!");
 
 #ifdef SMP
 	/* make hole for AP bootstrap code */
 	physmap[1] = mp_bootaddress(physmap[1] / 1024);
 #endif
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 #ifdef MAXMEM
 	/* NOTE(review): the / 4 suggests MAXMEM is in KB with 4KB pages. */
 	Maxmem = MAXMEM / 4;
 #endif
 
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	/*
 	 * The boot memory test is disabled by default, as it takes a
 	 * significant amount of time on large-memory systems, and is
 	 * unfriendly to virtual machines as it unnecessarily touches all
 	 * pages.
 	 *
 	 * A general name is used as the code may be extended to support
 	 * additional tests beyond the current "page present" test.
 	 */
 	memtest = 0;
 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 
 	/*
 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
 	 * in the system.
 	 */
 	if (Maxmem > atop(physmap[physmap_idx + 1]))
 		Maxmem = atop(physmap[physmap_idx + 1]);
 
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(&first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 *
 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
 	 * By default, mask off the first 16 pages unless we appear to be
 	 * running in a VM.
 	 */
 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
 	/*
 	 * Clamp the start of the first segment: never below the second
 	 * page, and always leaving at least one page in the segment.
 	 */
 	if (physmem_start < PAGE_SIZE)
 		physmap[0] = PAGE_SIZE;
 	else if (physmem_start >= physmap[1])
 		physmap[0] = round_page(physmap[1] - PAGE_SIZE);
 	else
 		physmap[0] = round_page(physmem_start);
 	/*
 	 * Seed phys_avail with an empty first chunk starting at physmap[0]
 	 * so that the "continuation of the previous chunk" test in the loop
 	 * below works for the very first page.  dump_avail[1] is seeded the
 	 * same way; dump_avail[0] is not written here.
 	 */
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	/* PTE rewritten per page by the memory test below, cleared after. */
 	pte = CMAP1;
 
 	/*
 	 * Get dcons buffer address
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 */
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		/* Never look past Maxmem, even if the segment extends further. */
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR1;
 
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 */
 			if (pa >= (vm_paddr_t)kernphys && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 			if (memtest == 0)
 				goto skip_memtest;
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
 			invltlb();
 
 			/* Save the first word so the test is non-destructive. */
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 skip_memtest:
 			/*
 			 * Adjust array of valid/good pages.  A bad page is
 			 * recorded in neither phys_avail nor dump_avail.
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we are at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			/* Same chunk-extension logic, for dump_avail. */
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa; /* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			/* phys_avail is full: stop scanning this segment. */
 			if (full)
 				break;
 		}
 	}
 	/* Tear down the temporary test mapping. */
 	*pte = 0;
 	invltlb();
 
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(msgbufsize);
 
 	/* Map the message buffer. */
 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
 }
 
 /*
  * Early machine-dependent initialization of the boot processor.
  *
  * modulep is the address of the loader's preload metadata and physfree
  * the first free physical address; both are relocated by KERNBASE
  * before use.  In order, this routine:
  *   - carves thread0's kernel stack and the dynamic per-CPU area out of
  *     the memory at physfree;
  *   - fetches boot flags and the kernel environment from the metadata;
  *   - builds and loads the GDT, sets the FS/GS base MSRs and fills in
  *     the per-CPU data of CPU 0;
  *   - builds and loads the IDT, then brings up the i8254 and console;
  *   - sets up the IST stacks and TSS, and the fast system call MSRs;
  *   - sizes memory (getmemsize()), initializes the message buffer and
  *     the FPU, and finishes thread0's pcb.
  *
  * Returns the address of thread0's pcb, which locore uses as the
  * location of the kernel stack.
  */
 u_int64_t
 hammer_time(u_int64_t modulep, u_int64_t physfree)
 {
 	caddr_t kmdp;
 	int gsel_tss, x;
 	struct pcpu *pc;
 	struct nmi_pcpu *np;
 	struct xstate_hdr *xhdr;
 	u_int64_t msr;
 	char *env;
 	size_t kstack0_sz;
 
 	/* thread0's kernel stack is the first thing placed at physfree. */
 	thread0.td_kstack = physfree + KERNBASE;
 	thread0.td_kstack_pages = KSTACK_PAGES;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	bzero((void *)thread0.td_kstack, kstack0_sz);
 	physfree += kstack0_sz;
 
 	/*
 	 * This may be done better later if it gets more high level
 	 * components in it. If so just link td->td_proc here.
 	 */
 	proc_linkup0(&proc0, &thread0);
 
 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 	preload_bootstrap_relocate(KERNBASE);
 	/* The kernel's metadata may be filed under either type name. */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE;
 #ifdef DDB
 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
 #endif
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	/*
 	 * make gdt memory segments
 	 *
 	 * The TSS and LDT slots are double-size system descriptors and are
 	 * skipped here; the TSS descriptor is built separately below.
 	 */
 	for (x = 0; x < NGDT; x++) {
 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
 			ssdtosd(&gdt_segs[x], &gdt[x]);
 	}
 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base =  (long) gdt;
 	lgdt(&r_gdt);
 	pc = &__pcpu[0];
 
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
 
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	/* The dynamic per-CPU area is the next thing placed at physfree. */
 	dpcpu_init((void *)(physfree + KERNBASE), 0);
 	physfree += DPCPU_SIZE;
 	PCPU_SET(prvspace, pc);
 	PCPU_SET(curthread, &thread0);
 	PCPU_SET(tssp, &common_tss[0]);
 	PCPU_SET(commontssp, &common_tss[0]);
 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 *	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
 
 	/*
 	 * exceptions
 	 *
 	 * Every vector first gets the default 'rsvd' handler; the known
 	 * exceptions are then overridden.  NMI and double fault run on
 	 * their own IST stacks (2 and 1), which are set up further below.
 	 */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
  	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
 	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
 	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
 #ifdef KDTRACE_HOOKS
 	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
 #endif
 #ifdef XENHVM
 	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0);
 #endif
 
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (long) idt;
 	lidt(&r_idt);
 
 	/*
 	 * Initialize the i8254 before the console so that console
 	 * initialization can use DELAY().
 	 */
 	i8254_init();
 
 	/*
 	 * Initialize the console before we print anything out.
 	 */
 	cninit();
 
 #ifdef DEV_ISA
 #ifdef DEV_ATPIC
 	elcr_probe();
 	atpic_startup();
 #else
 	/* Reset and mask the atpics and leave them shut down. */
 	atpic_reset();
 
 	/*
 	 * Point the ICU spurious interrupt vectors at the APIC spurious
 	 * interrupt handler.
 	 */
 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 #endif
 #else
 #error "have you forgotten the isa device?";
 #endif
 
 	kdb_init();
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter(KDB_WHY_BOOTFLAGS,
 		    "Boot flags requested debugger");
 #endif
 
 	identify_cpu();		/* Final stage of CPU initialization */
 	initializecpu();	/* Initialize CPU registers */
 	initializecpucache();
 
 	/* doublefault stack space, runs on ist1 */
 	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
 
 	/*
 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
 	 * above the start of the ist2 stack.
 	 */
 	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
 	np->np_pcpu = (register_t) pc;
 	common_tss[0].tss_ist2 = (long) np;
 
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	common_tss[0].tss_iobase = sizeof(struct amd64tss) +
 	    IOPAGES * PAGE_SIZE;
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	ltr(gsel_tss);
 
 	/*
 	 * Set up the fast syscall stuff: enable SYSCALL in EFER, point
 	 * LSTAR/CSTAR at the syscall entry points, load the selector
 	 * bases into STAR and set the rflags bits masked on entry.
 	 */
 	msr = rdmsr(MSR_EFER) | EFER_SCE;
 	wrmsr(MSR_EFER, msr);
 	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 	wrmsr(MSR_STAR, msr);
 	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
 
 	getmemsize(kmdp, physfree);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 	msgbufinit(msgbufp, msgbufsize);
 	fpuinit();
 
 	/*
 	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
 	 * area size.  Zero out the extended state header in fpu save
 	 * area.
 	 */
 	thread0.td_pcb = get_pcb_td(&thread0);
 	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
 	if (use_xsave) {
 		/* The xstate header directly follows the legacy save area. */
 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
 		    1);
 		xhdr->xstate_bv = xsave_mask;
 	}
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb;
 	/* Ensure the stack is aligned to 16 bytes */
 	common_tss[0].tss_rsp0 &= ~0xFul;
 	PCPU_SET(rsp0, common_tss[0].tss_rsp0);
 	PCPU_SET(curpcb, thread0.td_pcb);
 
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
 
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_pcb->pcb_cr3 = KPML4phys; /* PCID 0 is reserved for kernel */
 	thread0.td_frame = &proc0_tf;
 
         env = getenv("kernelname");
 	if (env != NULL)
 		strlcpy(kernelname, env, sizeof(kernelname));
 
 	cpu_probe_amdc1e();
 
 #ifdef FDT
 	x86_init_fdt();
 #endif
 
 	/* Location of kernel stack for locore */
 	return ((u_int64_t)thread0.td_pcb);
 }
 
 /*
  * Machine-dependent part of per-CPU data initialization.  Only the ACPI
  * id is touched here: it is set to 0xffffffff (presumably meaning "not
  * yet known" -- confirm against the code that assigns real ids).  The
  * cpuid and size arguments are unused.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 
 	pcpu->pc_acpi_id = 0xffffffff;
 }
 
 /*
  * Enter a spinlock section for the current thread.  On the outermost
  * entry interrupts are disabled and the previous interrupt state is
  * remembered in the thread; nested entries only bump the nesting
  * count.  Every entry also enters a critical section.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *self;
 	register_t saved;
 
 	self = curthread;
 	if (self->td_md.md_spinlock_count != 0) {
 		/* Nested: interrupts are already off. */
 		self->td_md.md_spinlock_count++;
 	} else {
 		saved = intr_disable();
 		self->td_md.md_spinlock_count = 1;
 		self->td_md.md_saved_flags = saved;
 	}
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section entered with spinlock_enter().  The critical
  * section is exited first; when the nesting count drops to zero the
  * interrupt state saved by the outermost spinlock_enter() is restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *self;
 	register_t saved;
 
 	self = curthread;
 	critical_exit();
 	/* Read the saved state before giving up the last nesting level. */
 	saved = self->td_md.md_saved_flags;
 	if (--self->td_md.md_spinlock_count == 0)
 		intr_restore(saved);
 }
 
 /*
  * Build a PCB from a trapframe.  kdb_trap() calls this so that a
  * backtrace can start from the function that caused entry into the
  * debugger: the context lives in the trapframe, but the trace is based
  * on the PCB.  Only the registers needed for a backtrace are copied;
  * the PCB does not have to be perfect.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	/* Where execution was, and its stack. */
 	pcb->pcb_rip = tf->tf_rip;
 	pcb->pcb_rsp = tf->tf_rsp;
 	pcb->pcb_rbp = tf->tf_rbp;
 
 	/* Remaining registers kept in the PCB. */
 	pcb->pcb_rbx = tf->tf_rbx;
 	pcb->pcb_r12 = tf->tf_r12;
 	pcb->pcb_r13 = tf->tf_r13;
 	pcb->pcb_r14 = tf->tf_r14;
 	pcb->pcb_r15 = tf->tf_r15;
 }
 
 int
 ptrace_set_pc(struct thread *td, unsigned long addr)
 {
 	td->td_frame->tf_rip = addr;
 	return (0);
 }
 
 int
 ptrace_single_step(struct thread *td)
 {
 	td->td_frame->tf_rflags |= PSL_T;
 	return (0);
 }
 
 int
 ptrace_clear_single_step(struct thread *td)
 {
 	td->td_frame->tf_rflags &= ~PSL_T;
 	return (0);
 }
 
 /*
  * Copy the general-purpose registers saved in td's trapframe into
  * *regs.  Returns whatever fill_frame_regs() returns.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 
 	return (fill_frame_regs(td->td_frame, regs));
 }
 
 /*
  * Copy the registers held in trapframe *tp into *regs.  The data
  * segment registers are copied only when the frame is flagged
  * TF_HASSEGS; otherwise they are reported as 0.  Always returns 0.
  */
 int
 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 {
 
 	/* Segment registers are only meaningful if the frame saved them. */
 	if ((tp->tf_flags & TF_HASSEGS) == 0) {
 		regs->r_ds = 0;
 		regs->r_es = 0;
 		regs->r_fs = 0;
 		regs->r_gs = 0;
 	} else {
 		regs->r_ds = tp->tf_ds;
 		regs->r_es = tp->tf_es;
 		regs->r_fs = tp->tf_fs;
 		regs->r_gs = tp->tf_gs;
 	}
 
 	/* General-purpose registers. */
 	regs->r_rax = tp->tf_rax;
 	regs->r_rbx = tp->tf_rbx;
 	regs->r_rcx = tp->tf_rcx;
 	regs->r_rdx = tp->tf_rdx;
 	regs->r_rsi = tp->tf_rsi;
 	regs->r_rdi = tp->tf_rdi;
 	regs->r_rbp = tp->tf_rbp;
 	regs->r_r8  = tp->tf_r8;
 	regs->r_r9  = tp->tf_r9;
 	regs->r_r10 = tp->tf_r10;
 	regs->r_r11 = tp->tf_r11;
 	regs->r_r12 = tp->tf_r12;
 	regs->r_r13 = tp->tf_r13;
 	regs->r_r14 = tp->tf_r14;
 	regs->r_r15 = tp->tf_r15;
 
 	/* Hardware-pushed part of the frame. */
 	regs->r_rip = tp->tf_rip;
 	regs->r_cs = tp->tf_cs;
 	regs->r_rflags = tp->tf_rflags;
 	regs->r_rsp = tp->tf_rsp;
 	regs->r_ss = tp->tf_ss;
 	return (0);
 }
 
 /*
  * Load td's trapframe from *regs.  Only the low 32 bits of the supplied
  * rflags are used.  Returns EINVAL, leaving the frame untouched, if the
  * new rflags or %cs fail the EFL_SECURE/CS_SECURE checks; 0 otherwise.
  * The data segment registers are currently never loaded (see the
  * disabled block at the end).
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame;
 	register_t new_rflags;
 
 	frame = td->td_frame;
 	new_rflags = regs->r_rflags & 0xffffffff;
 	if (!EFL_SECURE(new_rflags, frame->tf_rflags))
 		return (EINVAL);
 	if (!CS_SECURE(regs->r_cs))
 		return (EINVAL);
 
 	/* General-purpose registers. */
 	frame->tf_rax = regs->r_rax;
 	frame->tf_rbx = regs->r_rbx;
 	frame->tf_rcx = regs->r_rcx;
 	frame->tf_rdx = regs->r_rdx;
 	frame->tf_rsi = regs->r_rsi;
 	frame->tf_rdi = regs->r_rdi;
 	frame->tf_rbp = regs->r_rbp;
 	frame->tf_r8  = regs->r_r8;
 	frame->tf_r9  = regs->r_r9;
 	frame->tf_r10 = regs->r_r10;
 	frame->tf_r11 = regs->r_r11;
 	frame->tf_r12 = regs->r_r12;
 	frame->tf_r13 = regs->r_r13;
 	frame->tf_r14 = regs->r_r14;
 	frame->tf_r15 = regs->r_r15;
 
 	/* Hardware-pushed part of the frame. */
 	frame->tf_rip = regs->r_rip;
 	frame->tf_cs = regs->r_cs;
 	frame->tf_rflags = new_rflags;
 	frame->tf_rsp = regs->r_rsp;
 	frame->tf_ss = regs->r_ss;
 
 	if (0) {	/* XXXKIB */
 		frame->tf_ds = regs->r_ds;
 		frame->tf_es = regs->r_es;
 		frame->tf_fs = regs->r_fs;
 		frame->tf_gs = regs->r_gs;
 		frame->tf_flags = TF_HASSEGS;
 		set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	}
 	return (0);
 }
 
 /* XXX check all this stuff! */
 /*
  * Externalize: convert the FPU save area *sv_xmm into the *fpregs
  * layout.  *fpregs is zeroed first; then the environment, the eight
  * 10-byte x87 registers and the sixteen 16-byte SSE registers are
  * copied.
  */
 static void
 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 {
 	struct envxmm *dst_env, *src_env;
 	int reg;
 
 	/* pcb -> fpregs */
 	bzero(fpregs, sizeof(*fpregs));
 	src_env = &sv_xmm->sv_env;
 	dst_env = (struct envxmm *)&fpregs->fpr_env;
 
 	/* FPU control/status */
 	dst_env->en_cw = src_env->en_cw;
 	dst_env->en_sw = src_env->en_sw;
 	dst_env->en_tw = src_env->en_tw;
 	dst_env->en_opcode = src_env->en_opcode;
 	dst_env->en_rip = src_env->en_rip;
 	dst_env->en_rdp = src_env->en_rdp;
 	dst_env->en_mxcsr = src_env->en_mxcsr;
 	dst_env->en_mxcsr_mask = src_env->en_mxcsr_mask;
 
 	/* FPU registers */
 	for (reg = 0; reg < 8; reg++) {
 		bcopy(sv_xmm->sv_fp[reg].fp_acc.fp_bytes,
 		    fpregs->fpr_acc[reg], 10);
 	}
 
 	/* SSE registers */
 	for (reg = 0; reg < 16; reg++) {
 		bcopy(sv_xmm->sv_xmm[reg].xmm_bytes,
 		    fpregs->fpr_xacc[reg], 16);
 	}
 }
 
 /*
  * Internalize: load the FPU save area *sv_xmm from *fpregs.  The
  * environment is copied field by field, with the supplied mxcsr mask
  * ANDed with cpu_mxcsr_mask; then the eight 10-byte x87 registers and
  * the sixteen 16-byte SSE registers are copied.
  */
 static void
 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 {
 	struct envxmm *dst_env, *src_env;
 	int reg;
 
 	/* fpregs -> pcb */
 	src_env = (struct envxmm *)&fpregs->fpr_env;
 	dst_env = &sv_xmm->sv_env;
 
 	/* FPU control/status */
 	dst_env->en_cw = src_env->en_cw;
 	dst_env->en_sw = src_env->en_sw;
 	dst_env->en_tw = src_env->en_tw;
 	dst_env->en_opcode = src_env->en_opcode;
 	dst_env->en_rip = src_env->en_rip;
 	dst_env->en_rdp = src_env->en_rdp;
 	dst_env->en_mxcsr = src_env->en_mxcsr;
 	dst_env->en_mxcsr_mask = src_env->en_mxcsr_mask & cpu_mxcsr_mask;
 
 	/* FPU registers */
 	for (reg = 0; reg < 8; reg++) {
 		bcopy(fpregs->fpr_acc[reg],
 		    sv_xmm->sv_fp[reg].fp_acc.fp_bytes, 10);
 	}
 
 	/* SSE registers */
 	for (reg = 0; reg < 16; reg++) {
 		bcopy(fpregs->fpr_xacc[reg],
 		    sv_xmm->sv_xmm[reg].xmm_bytes, 16);
 	}
 }
 
 /*
  * Externalize td's user FPU state into *fpregs.
  *
  * td must be the current thread, a suspended thread, or belong to a
  * process that should stop (asserted).  fpugetregs() is called before
  * the user save area is converted, so the order of the two calls below
  * matters.  Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 	    P_SHOULDSTOP(td->td_proc),
 	    ("not suspended thread %p", td));
 	fpugetregs(td);
 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
 	return (0);
 }
 
 /*
  * Internalize *fpregs into td's user FPU save area, then call
  * fpuuserinited() on the thread.  Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
 	fpuuserinited(td);
 	return (0);
 }
 
 /*
  * Get machine context.
  *
  * Fill *mcp from td's trapframe, pcb and FPU state.  When flags has
  * GET_MC_CLEAR_RET set, rax and rdx are reported as 0 and the carry
  * flag is cleared in the reported rflags.  No extended FPU state is
  * returned: mc_xfpustate and mc_xfpustate_len are set to 0.  Always
  * returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct trapframe *frame;
 	struct pcb *pcb;
 
 	frame = td->td_frame;
 	pcb = td->td_pcb;
 
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(frame->tf_rsp);
 	PROC_UNLOCK(curthread->td_proc);
 
 	/* General-purpose registers other than rax/rdx. */
 	mcp->mc_rbx = frame->tf_rbx;
 	mcp->mc_rcx = frame->tf_rcx;
 	mcp->mc_rsi = frame->tf_rsi;
 	mcp->mc_rdi = frame->tf_rdi;
 	mcp->mc_rbp = frame->tf_rbp;
 	mcp->mc_r8  = frame->tf_r8;
 	mcp->mc_r9  = frame->tf_r9;
 	mcp->mc_r10 = frame->tf_r10;
 	mcp->mc_r11 = frame->tf_r11;
 	mcp->mc_r12 = frame->tf_r12;
 	mcp->mc_r13 = frame->tf_r13;
 	mcp->mc_r14 = frame->tf_r14;
 	mcp->mc_r15 = frame->tf_r15;
 
 	/* rax/rdx/carry are reported or cleared, as requested. */
 	mcp->mc_rflags = frame->tf_rflags;
 	if ((flags & GET_MC_CLEAR_RET) == 0) {
 		mcp->mc_rax = frame->tf_rax;
 		mcp->mc_rdx = frame->tf_rdx;
 	} else {
 		mcp->mc_rax = 0;
 		mcp->mc_rdx = 0;
 		mcp->mc_rflags &= ~PSL_C;
 	}
 
 	/* Instruction/stack pointers and segment registers. */
 	mcp->mc_rip = frame->tf_rip;
 	mcp->mc_cs = frame->tf_cs;
 	mcp->mc_rsp = frame->tf_rsp;
 	mcp->mc_ss = frame->tf_ss;
 	mcp->mc_ds = frame->tf_ds;
 	mcp->mc_es = frame->tf_es;
 	mcp->mc_fs = frame->tf_fs;
 	mcp->mc_gs = frame->tf_gs;
 
 	mcp->mc_flags = frame->tf_flags;
 	mcp->mc_len = sizeof(*mcp);
 
 	/* FPU state; no buffer is supplied for the extended part. */
 	get_fpcontext(td, mcp, NULL, 0);
 
 	mcp->mc_fsbase = pcb->pcb_fsbase;
 	mcp->mc_gsbase = pcb->pcb_gsbase;
 	mcp->mc_xfpustate = 0;
 	mcp->mc_xfpustate_len = 0;
 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  */
 int
 set_mcontext(struct thread *td, const mcontext_t *mcp)
 {
 	struct pcb *pcb;
 	struct trapframe *tp;
 	char *xfpustate;
 	long rflags;
 	int ret;
 
 	pcb = td->td_pcb;
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp) ||
 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
 		return (EINVAL);
 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
 	    (tp->tf_rflags & ~PSL_USERCHANGE);
 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu))
 			return (EINVAL);
 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
 		    mcp->mc_xfpustate_len);
 		if (ret != 0)
 			return (ret);
 	} else
 		xfpustate = NULL;
 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
 	if (ret != 0)
 		return (ret);
 	tp->tf_r15 = mcp->mc_r15;
 	tp->tf_r14 = mcp->mc_r14;
 	tp->tf_r13 = mcp->mc_r13;
 	tp->tf_r12 = mcp->mc_r12;
 	tp->tf_r11 = mcp->mc_r11;
 	tp->tf_r10 = mcp->mc_r10;
 	tp->tf_r9  = mcp->mc_r9;
 	tp->tf_r8  = mcp->mc_r8;
 	tp->tf_rdi = mcp->mc_rdi;
 	tp->tf_rsi = mcp->mc_rsi;
 	tp->tf_rbp = mcp->mc_rbp;
 	tp->tf_rbx = mcp->mc_rbx;
 	tp->tf_rdx = mcp->mc_rdx;
 	tp->tf_rcx = mcp->mc_rcx;
 	tp->tf_rax = mcp->mc_rax;
 	tp->tf_rip = mcp->mc_rip;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = mcp->mc_rsp;
 	tp->tf_ss = mcp->mc_ss;
 	tp->tf_flags = mcp->mc_flags;
 	if (tp->tf_flags & TF_HASSEGS) {
 		tp->tf_ds = mcp->mc_ds;
 		tp->tf_es = mcp->mc_es;
 		tp->tf_fs = mcp->mc_fs;
 		tp->tf_gs = mcp->mc_gs;
 	}
 	if (mcp->mc_flags & _MC_HASBASES) {
 		pcb->pcb_fsbase = mcp->mc_fsbase;
 		pcb->pcb_gsbase = mcp->mc_gsbase;
 	}
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 /*
  * Store td's FPU state into the machine context *mcp and, optionally,
  * its extended (XSAVE) state into a caller-supplied buffer.
  *
  *	td		thread whose state is fetched
  *	mcp		context receiving mc_ownedfp, mc_fpstate and
  *			mc_fpformat
  *	xfpusave	buffer for the extended state; not touched when
  *			xfpusave_len is 0 or XSAVE is not in use
  *	xfpusave_len	size of xfpusave in bytes
  *
  * When extended state is stored, _MC_HASFPXSTATE is set in mc_flags and
  * mc_xfpustate_len is set to the number of bytes copied.  If the buffer
  * is larger than the extended state, the unused tail is zeroed.
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
     size_t xfpusave_len)
 {
 	size_t max_len, len;
 
 	mcp->mc_ownedfp = fpugetregs(td);
 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 	    sizeof(mcp->mc_fpstate));
 	mcp->mc_fpformat = fpuformat();
 	if (!use_xsave || xfpusave_len == 0)
 		return;
 	/* The extended state is what follows the legacy save area. */
 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 	len = xfpusave_len;
 	if (len > max_len) {
 		/*
 		 * Clear the part of the buffer that will not be filled.
 		 * This must use the caller's length: clamping len first
 		 * would make the size zero and leave the tail untouched.
 		 */
 		bzero(xfpusave + max_len, xfpusave_len - max_len);
 		len = max_len;
 	}
 	mcp->mc_flags |= _MC_HASFPXSTATE;
 	mcp->mc_xfpustate_len = len;
 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 }
 
 /*
  * Load td's FPU state from the machine context *mcp, together with the
  * optional extended state in xfpustate/xfpustate_len.
  *
  * Returns 0 without doing anything when the context says there is no
  * FPU (_MC_FPFMT_NODEV), EINVAL for any other format than
  * _MC_FPFMT_XMM or for an unknown mc_ownedfp value, and otherwise 0 or
  * the error from fpusetregs().
  *
  * NOTE(review): the mxcsr in mcp->mc_fpstate is masked in place through
  * a cast that drops the const qualifier of mcp.
  */
 static int
 set_fpcontext(struct thread *td, const mcontext_t *mcp, char *xfpustate,
     size_t xfpustate_len)
 {
 	struct savefpu *fpstate;
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 
 	if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		return (0);
 	}
 
 	if (mcp->mc_ownedfp != _MC_FPOWNED_FPU &&
 	    mcp->mc_ownedfp != _MC_FPOWNED_PCB)
 		return (EINVAL);
 
 	fpstate = (struct savefpu *)&mcp->mc_fpstate;
 	fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
 	return (fpusetregs(td, fpstate, xfpustate, xfpustate_len));
 }
 
 /*
  * Discard td's user FPU state.  td's pcb must hold user (not kernel)
  * FPU state; this is asserted.
  *
  * Inside a critical section, the FPU is dropped if td currently owns
  * it, and the "FPU initialized" flags are cleared so that the state is
  * treated as uninitialized afterwards.
  *
  * NOTE(review): the flags are cleared on curthread's pcb, not td's;
  * the two differ if this is ever called with td != curthread --
  * confirm against the callers.
  */
 void
 fpstate_drop(struct thread *td)
 {
 
 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == td)
 		fpudrop();
 	/*
 	 * XXX force a full drop of the fpu.  The above only drops it if we
 	 * owned it.
 	 *
 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
 	 * have too many layers.
 	 */
 	clear_pcb_flags(curthread->td_pcb,
 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
 	critical_exit();
 }
 
 /*
  * Report the debug registers in *dbregs.  With td == NULL the values
  * are read from the CPU; otherwise they come from the copies saved in
  * td's pcb.  Only dr0-dr3, dr6 and dr7 carry values; every other slot
  * of dbregs->dr[] (4, 5 and 8-15) is set to 0.  Always returns 0.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int slot;
 
 	if (td != NULL) {
 		pcb = td->td_pcb;
 		dbregs->dr[0] = pcb->pcb_dr0;
 		dbregs->dr[1] = pcb->pcb_dr1;
 		dbregs->dr[2] = pcb->pcb_dr2;
 		dbregs->dr[3] = pcb->pcb_dr3;
 		dbregs->dr[6] = pcb->pcb_dr6;
 		dbregs->dr[7] = pcb->pcb_dr7;
 	} else {
 		dbregs->dr[0] = rdr0();
 		dbregs->dr[1] = rdr1();
 		dbregs->dr[2] = rdr2();
 		dbregs->dr[3] = rdr3();
 		dbregs->dr[6] = rdr6();
 		dbregs->dr[7] = rdr7();
 	}
 
 	/* Slots with no register behind them read as zero. */
 	dbregs->dr[4] = 0;
 	dbregs->dr[5] = 0;
 	for (slot = 8; slot <= 15; slot++)
 		dbregs->dr[slot] = 0;
 	return (0);
 }
 
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 	} else {
 		/*
 		 * Don't let an illegal value for dr7 get set.  Specifically,
 		 * check for undefined settings.  Setting these bit patterns
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP or a general protection fault right here.
 		 * Upper bits of dr6 and dr7 must not be set
 		 */
 		for (i = 0; i < 4; i++) {
 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 			if (td->td_frame->tf_cs == _ucode32sel &&
 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 				return (EINVAL);
 		}
 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 			return (EINVAL);
 
 		pcb = td->td_pcb;
 
 		/*
 		 * Don't let a process set a breakpoint that is not within the
 		 * process's address space.  If a process could do this, it
 		 * could halt the system by setting a breakpoint in the kernel
 		 * (if ddb was enabled).  Thus, we need to check to make sure
 		 * that no breakpoints are being enabled for addresses outside
 		 * process's address space.
 		 *
 		 * XXX - what about when the watched area of the user's
 		 * address space is written into from within the kernel
 		 * ... wouldn't that still cause a breakpoint to be generated
 		 * from within kernel mode?
 		 */
 
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 
 		pcb->pcb_dr0 = dbregs->dr[0];
 		pcb->pcb_dr1 = dbregs->dr[1];
 		pcb->pcb_dr2 = dbregs->dr[2];
 		pcb->pcb_dr3 = dbregs->dr[3];
 		pcb->pcb_dr6 = dbregs->dr[6];
 		pcb->pcb_dr7 = dbregs->dr[7];
 
 		set_pcb_flags(pcb, PCB_DBREGS);
 	}
 
 	return (0);
 }
 
 /*
  * Zero debug registers dr0-dr3, dr6 and dr7 using the load_drN() helpers.
  * dr7 is written first, before the address registers are cleared.
  */
 void
 reset_dbregs(void)
 {
 
 	load_dr7(0);	/* Turn off the control bits first */
 	load_dr0(0);
 	load_dr1(0);
 	load_dr2(0);
 	load_dr3(0);
 	load_dr6(0);
 }
 
 /*
  * Return > 0 if a hardware breakpoint has been hit, and the
  * breakpoint was in user space.  Return 0, otherwise.
  */
 int
 user_dbreg_trap(void)
 {
         u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
         int nbp;            /* number of breakpoints that triggered */
         caddr_t addr[4];    /* breakpoint addresses */
         int i;
         
         dr7 = rdr7();
         if ((dr7 & 0x000000ff) == 0) {
                 /*
                  * all GE and LE bits in the dr7 register are zero,
                  * thus the trap couldn't have been caused by the
                  * hardware debug registers
                  */
                 return 0;
         }
 
         nbp = 0;
         dr6 = rdr6();
         bp = dr6 & 0x0000000f;
 
         if (!bp) {
                 /*
                  * None of the breakpoint bits are set meaning this
                  * trap was not caused by any of the debug registers
                  */
                 return 0;
         }
 
         /*
          * at least one of the breakpoints were hit, check to see
          * which ones and if any of them are user space addresses
          */
 
         if (bp & 0x01) {
                 addr[nbp++] = (caddr_t)rdr0();
         }
         if (bp & 0x02) {
                 addr[nbp++] = (caddr_t)rdr1();
         }
         if (bp & 0x04) {
                 addr[nbp++] = (caddr_t)rdr2();
         }
         if (bp & 0x08) {
                 addr[nbp++] = (caddr_t)rdr3();
         }
 
         for (i = 0; i < nbp; i++) {
                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
                         /*
                          * addr[i] is in user space
                          */
                         return nbp;
                 }
         }
 
         /*
          * None of the breakpoints are in user space.
          */
         return 0;
 }
 
 #ifdef KDB
 
 /*
  * Provide inb() and outb() as functions.  They are normally only available as
  * inline functions, thus cannot be called from the debugger.
  */
 
 /* silence compiler warnings */
 u_char inb_(u_short);
 void outb_(u_short, u_char);
 
 /* Out-of-line wrapper around inb(): returns the byte read from 'port'. */
 u_char
 inb_(u_short port)
 {
 	return inb(port);
 }
 
 /* Out-of-line wrapper around outb(): writes 'data' to 'port'. */
 void
 outb_(u_short port, u_char data)
 {
 	outb(port, data);
 }
 
 #endif /* KDB */
Index: head/sys/amd64/amd64/trap.c
===================================================================
--- head/sys/amd64/amd64/trap.c	(revision 258540)
+++ head/sys/amd64/amd64/trap.c	(revision 258541)
@@ -1,1009 +1,1008 @@
 /*-
  * Copyright (C) 1994, David Greenman
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the University of Utah, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * AMD64 Trap and System call handling
  */
 
 #include "opt_clock.h"
 #include "opt_cpu.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_isa.h"
 #include "opt_kdb.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/ptrace.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DEFINE( , , page_fault, all);
 PMC_SOFT_DEFINE( , , page_fault, read);
 PMC_SOFT_DEFINE( , , page_fault, write);
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <machine/tss.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 /*
  * This is a hook which is initialised by the dtrace module
  * to handle traps which might occur during DTrace probe
  * execution.
  */
 dtrace_trap_func_t	dtrace_trap_func;
 
 /*
  * Hook called, when non-NULL, at the start of dblfault_handler()
  * before the double fault is reported.
  */
 dtrace_doubletrap_func_t	dtrace_doubletrap_func;
 
 /*
  * This is a hook which is initialised by the systrace module
  * when it is loaded. This keeps the DTrace syscall provider
  * implementation opaque.
  */
 systrace_probe_func_t	systrace_probe_func;
 
 /*
  * These hooks are necessary for the pid and usdt providers.
  * trap() calls them for T_BPTFLT and T_DTRACE_RET respectively.
  */
 dtrace_pid_probe_ptr_t		dtrace_pid_probe_ptr;
 dtrace_return_probe_ptr_t	dtrace_return_probe_ptr;
 #endif
 
 extern void trap(struct trapframe *frame);
 extern void syscall(struct trapframe *frame);
 void dblfault_handler(struct trapframe *frame);
 
 static int trap_pfault(struct trapframe *, int);
 static void trap_fatal(struct trapframe *, vm_offset_t);
 
 /*
  * Human-readable names for trap numbers, indexed by trap type.
  * MAX_TRAP_MSG is the highest valid index and must be kept in sync with
  * the table; trap_fatal() uses it to bound its lookups.
  */
 #define MAX_TRAP_MSG		32
 static char *trap_msg[] = {
 	"",					/*  0 unused */
 	"privileged instruction fault",		/*  1 T_PRIVINFLT */
 	"",					/*  2 unused */
 	"breakpoint instruction fault",		/*  3 T_BPTFLT */
 	"",					/*  4 unused */
 	"",					/*  5 unused */
 	"arithmetic trap",			/*  6 T_ARITHTRAP */
 	"",					/*  7 unused */
 	"",					/*  8 unused */
 	"general protection fault",		/*  9 T_PROTFLT */
 	"trace trap",				/* 10 T_TRCTRAP */
 	"",					/* 11 unused */
 	"page fault",				/* 12 T_PAGEFLT */
 	"",					/* 13 unused */
 	"alignment fault",			/* 14 T_ALIGNFLT */
 	"",					/* 15 unused */
 	"",					/* 16 unused */
 	"",					/* 17 unused */
 	"integer divide fault",			/* 18 T_DIVIDE */
 	"non-maskable interrupt trap",		/* 19 T_NMI */
 	"overflow trap",			/* 20 T_OFLOW */
 	"FPU bounds check fault",		/* 21 T_BOUND */
 	"FPU device not available",		/* 22 T_DNA */
 	"double fault",				/* 23 T_DOUBLEFLT */
 	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
 	"invalid TSS fault",			/* 25 T_TSSFLT */
 	"segment not present fault",		/* 26 T_SEGNPFLT */
 	"stack fault",				/* 27 T_STKFLT */
 	"machine check trap",			/* 28 T_MCHK */
 	"SIMD floating-point exception",	/* 29 T_XMMFLT */
 	"reserved (unknown) fault",		/* 30 T_RESERVED */
 	"",					/* 31 unused (reserved) */
 	"DTrace pid return trap",		/* 32 T_DTRACE_RET */
 };
 
 #ifdef KDB
 /* Non-zero: trap() enters KDB on an NMI that isa_nmi() does not claim. */
 static int kdb_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW,
 	&kdb_on_nmi, 0, "Go to KDB on NMI");
 TUNABLE_INT("machdep.kdb_on_nmi", &kdb_on_nmi);
 #endif
 /* Non-zero: trap() panics when isa_nmi() returns non-zero for an NMI. */
 static int panic_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
 	&panic_on_nmi, 0, "Panic on NMI");
 TUNABLE_INT("machdep.panic_on_nmi", &panic_on_nmi);
 /*
  * Signal selection for user page faults that trap_pfault() does not
  * report as SIGSEGV, as used in trap():
  *   0     - autodetect from the process ABI and p_osrel,
  *   1     - always SIGBUS / BUS_PAGE_FAULT (compat mode),
  *   other - always SIGSEGV / SEGV_ACCERR.
  */
 static int prot_fault_translation;
 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW,
     &prot_fault_translation, 0,
     "Select signal to deliver on protection fault");
 /* Non-zero: trap() uprintf()s details of each signal it is about to post. */
 static int uprintf_signal;
 SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
     &uprintf_signal, 0,
     "Print debugging information on trap signal to ctty");
 
 /*
  * Exception, fault, and trap interface to the FreeBSD kernel.
  * This common code is called from assembly language IDT gate entry
  * routines that prepare a suitable stack frame, and restore this
  * frame after the exception has been processed.
  */
 
 void
 trap(struct trapframe *frame)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	/* i: signal number to post; ucode: its si_code; code: tf_err. */
 	int i = 0, ucode = 0, code;
 	u_int type;
 	register_t addr = 0;
 	ksiginfo_t ksi;
 
 	PCPU_INC(cnt.v_trap);
 	type = frame->tf_trapno;
 
 #ifdef SMP
 	/* Handler for NMI IPIs used for stopping CPUs. */
 	if (type == T_NMI) {
 	         if (ipi_nmi_handler() == 0)
 	                   goto out;
 	}
 #endif /* SMP */
 
 #ifdef KDB
 	/* A trap taken while the debugger is active re-enters it. */
 	if (kdb_active) {
 		kdb_reenter();
 		goto out;
 	}
 #endif
 
 	if (type == T_RESERVED) {
 		trap_fatal(frame, 0);
 		goto out;
 	}
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * CPU PMCs interrupt using an NMI.  If the PMC module is
 	 * active, pass the 'rip' value to the PMC module's interrupt
 	 * handler.  A return value of '1' from the handler means that
 	 * the NMI was handled by it and we can return immediately.
 	 */
 	if (type == T_NMI && pmc_intr &&
 	    (*pmc_intr)(PCPU_GET(cpuid), frame))
 		goto out;
 #endif
 
 	if (type == T_MCHK) {
 		mca_intr();
 		goto out;
 	}
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * A trap can occur while DTrace executes a probe. Before
 	 * executing the probe, DTrace blocks re-scheduling and sets
 	 * a flag in its per-cpu flags to indicate that it doesn't
 	 * want to fault. On returning from the probe, the no-fault
 	 * flag is cleared and finally re-scheduling is enabled.
 	 *
 	 * If the DTrace kernel module has registered a trap handler,
 	 * call it and if it returns non-zero, assume that it has
 	 * handled the trap and modified the trap frame so that this
 	 * function can return normally.
 	 */
 	if (type == T_DTRACE_RET || type == T_BPTFLT) {
 		struct reg regs;
 
 		fill_frame_regs(frame, &regs);
 		/* A zero return from a probe hook means it consumed the trap. */
 		if (type == T_BPTFLT &&
 		    dtrace_pid_probe_ptr != NULL &&
 		    dtrace_pid_probe_ptr(&regs) == 0)
 			goto out;
 		else if (type == T_DTRACE_RET &&
 		    dtrace_return_probe_ptr != NULL &&
 		    dtrace_return_probe_ptr(&regs) == 0)
 			goto out;
 	}
 	if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
 		goto out;
 #endif
 
 	if ((frame->tf_rflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled
 		 * interrupts and then trapped.  Enabling interrupts
 		 * now is wrong, but it is better than running with
 		 * interrupts disabled until they are accidentally
 		 * enabled later.
 		 */
 		if (ISPL(frame->tf_cs) == SEL_UPL)
 			uprintf(
 			    "pid %ld (%s): trap %d with interrupts disabled\n",
 			    (long)curproc->p_pid, curthread->td_name, type);
 		else if (type != T_NMI && type != T_BPTFLT &&
 		    type != T_TRCTRAP) {
 			/*
 			 * XXX not quite right, since this may be for a
 			 * multiple fault in user mode.
 			 */
 			printf("kernel trap %d with interrupts disabled\n",
 			    type);
 
 			/*
 			 * We shouldn't enable interrupts while holding a
 			 * spin lock.
 			 */
 			if (td->td_md.md_spinlock_count == 0)
 				enable_intr();
 		}
 	}
 
 	code = frame->tf_err;
 
         if (ISPL(frame->tf_cs) == SEL_UPL) {
 		/* user trap */
 
 		td->td_pticks = 0;
 		td->td_frame = frame;
 		/* Default fault address is the trapping %rip. */
 		addr = frame->tf_rip;
 		if (td->td_ucred != p->p_ucred) 
 			cred_update_thread(td);
 
 		switch (type) {
 		case T_PRIVINFLT:	/* privileged instruction fault */
 			i = SIGILL;
 			ucode = ILL_PRVOPC;
 			break;
 
 		case T_BPTFLT:		/* bpt instruction fault */
 		case T_TRCTRAP:		/* trace trap */
 			enable_intr();
 			frame->tf_rflags &= ~PSL_T;
 			i = SIGTRAP;
 			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 			ucode = fputrap_x87();
 			/* -1 means there is no signal to deliver. */
 			if (ucode == -1)
 				goto userout;
 			i = SIGFPE;
 			break;
 
 		case T_PROTFLT:		/* general protection fault */
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 		case T_STKFLT:		/* stack fault */
 		case T_SEGNPFLT:	/* segment not present fault */
 			i = SIGBUS;
 			ucode = BUS_ADRERR;
 			break;
 		case T_TSSFLT:		/* invalid TSS fault */
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 		/*
 		 * Note that 'default' sits in the middle of the switch; the
 		 * case labels that follow it are still matched normally.
 		 */
 		case T_DOUBLEFLT:	/* double fault */
 		default:
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 
 		case T_PAGEFLT:		/* page fault */
 			addr = frame->tf_addr;
 			i = trap_pfault(frame, TRUE);
 			/* -1: trap_fatal() was called; 0: fault was resolved. */
 			if (i == -1)
 				goto userout;
 			if (i == 0)
 				goto user;
 
 			if (i == SIGSEGV)
 				ucode = SEGV_MAPERR;
 			else {
 				if (prot_fault_translation == 0) {
 					/*
 					 * Autodetect.
 					 * This check also covers the images
 					 * without the ABI-tag ELF note.
 					 */
 					if (SV_CURPROC_ABI() == SV_ABI_FREEBSD
 					    && p->p_osrel >= P_OSREL_SIGSEGV) {
 						i = SIGSEGV;
 						ucode = SEGV_ACCERR;
 					} else {
 						i = SIGBUS;
 						ucode = BUS_PAGE_FAULT;
 					}
 				} else if (prot_fault_translation == 1) {
 					/*
 					 * Always compat mode.
 					 */
 					i = SIGBUS;
 					ucode = BUS_PAGE_FAULT;
 				} else {
 					/*
 					 * Always SIGSEGV mode.
 					 */
 					i = SIGSEGV;
 					ucode = SEGV_ACCERR;
 				}
 			}
 			break;
 
 		case T_DIVIDE:		/* integer divide fault */
 			ucode = FPE_INTDIV;
 			i = SIGFPE;
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 			/* machine/parity/power fail/"kitchen sink" faults */
 			if (isa_nmi(code) == 0) {
 #ifdef KDB
 				/*
 				 * NMI can be hooked up to a pushbutton
 				 * for debugging.
 				 */
 				if (kdb_on_nmi) {
 					printf ("NMI ... going to debugger\n");
 					kdb_trap(type, 0, frame);
 				}
 #endif /* KDB */
 				goto userout;
 			} else if (panic_on_nmi)
 				panic("NMI indicates hardware failure");
 			break;
 #endif /* DEV_ISA */
 
 		case T_OFLOW:		/* integer overflow fault */
 			ucode = FPE_INTOVF;
 			i = SIGFPE;
 			break;
 
 		case T_BOUND:		/* bounds check fault */
 			ucode = FPE_FLTSUB;
 			i = SIGFPE;
 			break;
 
 		case T_DNA:
 			/* transparent fault (due to context switch "late") */
 			KASSERT(PCB_USER_FPU(td->td_pcb),
 			    ("kernel FPU ctx has leaked"));
 			fpudna();
 			goto userout;
 
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			ucode = ILL_COPROC;
 			i = SIGILL;
 			break;
 
 		case T_XMMFLT:		/* SIMD floating-point exception */
 			ucode = fputrap_sse();
 			if (ucode == -1)
 				goto userout;
 			i = SIGFPE;
 			break;
 		}
 	} else {
 		/* kernel trap */
 
 		KASSERT(cold || td->td_ucred != NULL,
 		    ("kernel trap doesn't have ucred"));
 		switch (type) {
 		case T_PAGEFLT:			/* page fault */
 			(void) trap_pfault(frame, FALSE);
 			goto out;
 
 		case T_DNA:
 			KASSERT(!PCB_USER_FPU(td->td_pcb),
 			    ("Unregistered use of FPU in kernel"));
 			fpudna();
 			goto out;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 		case T_XMMFLT:		/* SIMD floating-point exception */
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			/*
 			 * XXXKIB for now disable any FPU traps in kernel
 			 * handler registration seems to be overkill
 			 */
 			trap_fatal(frame, 0);
 			goto out;
 
 		case T_STKFLT:		/* stack fault */
 			break;
 
 		case T_PROTFLT:		/* general protection fault */
 		case T_SEGNPFLT:	/* segment not present fault */
 			if (td->td_intr_nesting_level != 0)
 				break;
 
 			/*
 			 * Invalid segment selectors and out of bounds
 			 * %rip's and %rsp's can be set up in user mode.
 			 * This causes a fault in kernel mode when the
 			 * kernel tries to return to user mode.  We want
 			 * to get this fault so that we can fix the
 			 * problem here and not have to check all the
 			 * selectors and pointers when the user changes
 			 * them.
 			 */
 			/*
 			 * Each known faulting instruction address is
 			 * redirected to its matching recovery label.
 			 */
 			if (frame->tf_rip == (long)doreti_iret) {
 				frame->tf_rip = (long)doreti_iret_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_ds) {
 				frame->tf_rip = (long)ds_load_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_es) {
 				frame->tf_rip = (long)es_load_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_fs) {
 				frame->tf_rip = (long)fs_load_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_gs) {
 				frame->tf_rip = (long)gs_load_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_gsbase) {
 				frame->tf_rip = (long)gsbase_load_fault;
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_fsbase) {
 				frame->tf_rip = (long)fsbase_load_fault;
 				goto out;
 			}
 			/* Otherwise resume at the registered onfault handler. */
 			if (curpcb->pcb_onfault != NULL) {
 				frame->tf_rip = (long)curpcb->pcb_onfault;
 				goto out;
 			}
 			break;
 
 		case T_TSSFLT:
 			/*
 			 * PSL_NT can be set in user mode and isn't cleared
 			 * automatically when the kernel is entered.  This
 			 * causes a TSS fault when the kernel attempts to
 			 * `iret' because the TSS link is uninitialized.  We
 			 * want to get this fault so that we can fix the
 			 * problem here and not every time the kernel is
 			 * entered.
 			 */
 			if (frame->tf_rflags & PSL_NT) {
 				frame->tf_rflags &= ~PSL_NT;
 				goto out;
 			}
 			break;
 
 		case T_TRCTRAP:	 /* trace trap */
 			/*
 			 * Ignore debug register trace traps due to
 			 * accesses in the user's address space, which
 			 * can happen under several conditions such as
 			 * if a user sets a watchpoint on a buffer and
 			 * then passes that buffer to a system call.
 			 * We still want to get TRCTRAPS for addresses
 			 * in kernel space because that is useful when
 			 * debugging the kernel.
 			 */
 			if (user_dbreg_trap()) {
 				/*
 				 * Reset breakpoint bits because the
 				 * processor doesn't
 				 */
 				/* XXX check upper bits here */
 				load_dr6(rdr6() & 0xfffffff0);
 				goto out;
 			}
 			/*
 			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
 			 */
 		case T_BPTFLT:
 			/*
 			 * If KDB is enabled, let it handle the debugger trap.
 			 * Otherwise, debugger traps "can't happen".
 			 */
 #ifdef KDB
 			if (kdb_trap(type, 0, frame))
 				goto out;
 #endif
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 			/* machine/parity/power fail/"kitchen sink" faults */
 			if (isa_nmi(code) == 0) {
 #ifdef KDB
 				/*
 				 * NMI can be hooked up to a pushbutton
 				 * for debugging.
 				 */
 				if (kdb_on_nmi) {
 					printf ("NMI ... going to debugger\n");
 					kdb_trap(type, 0, frame);
 				}
 #endif /* KDB */
 				goto out;
 			} else if (panic_on_nmi == 0)
 				goto out;
 			/* FALLTHROUGH */
 #endif /* DEV_ISA */
 		}
 
 		/* Any kernel trap that was not handled above is fatal. */
 		trap_fatal(frame, 0);
 		goto out;
 	}
 
 	/* Translate fault for emulators (e.g. Linux) */
 	if (*p->p_sysent->sv_transtrap)
 		i = (*p->p_sysent->sv_transtrap)(i, type);
 
 	/* Post the signal chosen above to the trapping thread. */
 	ksiginfo_init_trap(&ksi);
 	ksi.ksi_signo = i;
 	ksi.ksi_code = ucode;
 	ksi.ksi_trapno = type;
 	ksi.ksi_addr = (void *)addr;
 	if (uprintf_signal) {
 		/* Also dump the eight bytes at the trapping %rip. */
 		uprintf("pid %d comm %s: signal %d err %lx code %d type %d "
 		    "addr 0x%lx rsp 0x%lx rip 0x%lx "
 		    "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
 		    p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr,
 		    frame->tf_rsp, frame->tf_rip,
 		    fubyte((void *)(frame->tf_rip + 0)),
 		    fubyte((void *)(frame->tf_rip + 1)),
 		    fubyte((void *)(frame->tf_rip + 2)),
 		    fubyte((void *)(frame->tf_rip + 3)),
 		    fubyte((void *)(frame->tf_rip + 4)),
 		    fubyte((void *)(frame->tf_rip + 5)),
 		    fubyte((void *)(frame->tf_rip + 6)),
 		    fubyte((void *)(frame->tf_rip + 7)));
 	}
 	KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled"));
 	trapsignal(td, &ksi);
 
 /* user: user-mode exit that runs userret(); userout/out return directly. */
 user:
 	userret(td, frame);
 	KASSERT(PCB_USER_FPU(td->td_pcb),
 	    ("Return from trap with kernel FPU ctx leaked"));
 userout:
 out:
 	return;
 }
 
 /*
  * Handle a page fault described by 'frame'.  'usermode' is non-zero when
  * the fault was taken from user mode.
  *
  * Returns:
  *   0        - the fault was resolved by vm_fault(), was treated as
  *              spurious inside a "no faulting" section, or (kernel mode)
  *              execution was redirected to curpcb->pcb_onfault;
  *   -1       - trap_fatal() was called for the fault;
  *   SIGBUS   - user-mode fault, vm_fault() gave KERN_PROTECTION_FAILURE;
  *   SIGSEGV  - any other unresolved user-mode fault.
  */
 static int
 trap_pfault(struct trapframe *frame, int usermode)
 {
 	vm_offset_t va;
 	struct vmspace *vm;
 	vm_map_t map;
 	int rv = 0;
 	vm_prot_t ftype;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	vm_offset_t eva = frame->tf_addr;
 
 	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
 		/*
 		 * Due to both processor errata and lazy TLB invalidation when
 		 * access restrictions are removed from virtual pages, memory
 		 * accesses that are allowed by the physical mapping layer may
 		 * nonetheless cause one spurious page fault per virtual page.
 		 * When the thread is executing a "no faulting" section that
 		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
 		 * every page fault is treated as a spurious page fault,
 		 * unless it accesses the same virtual address as the most
 		 * recent page fault within the same "no faulting" section.
 		 */
 		if (td->td_md.md_spurflt_addr != eva ||
 		    (td->td_pflags & TDP_RESETSPUR) != 0) {
 			/*
 			 * Do nothing to the TLB.  A stale TLB entry is
 			 * flushed automatically by a page fault.
 			 */
 			td->td_md.md_spurflt_addr = eva;
 			td->td_pflags &= ~TDP_RESETSPUR;
 			return (0);
 		}
 	} else {
 		/*
 		 * If we get a page fault while in a critical section, then
 		 * it is most likely a fatal kernel page fault.  The kernel
 		 * is already going to panic trying to get a sleep lock to
 		 * do the VM lookup, so just consider it a fatal trap so the
 		 * kernel can print out a useful trap message and even get
 		 * to the debugger.
 		 *
 		 * If we get a page fault while holding a non-sleepable
 		 * lock, then it is most likely a fatal kernel page fault.
 		 * If WITNESS is enabled, then it's going to whine about
 		 * bogus LORs with various VM locks, so just skip to the
 		 * fatal trap handling directly.
 		 */
 		if (td->td_critnest != 0 ||
 		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
 		    "Kernel page fault") != 0) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
 	}
 	va = trunc_page(eva);
 	if (va >= VM_MIN_KERNEL_ADDRESS) {
 		/*
 		 * Don't allow user-mode faults in kernel address space.
 		 */
 		if (usermode)
 			goto nogo;
 
 		map = kernel_map;
 	} else {
 		/*
 		 * This is a fault on non-kernel virtual memory.  If either
 		 * p or p->p_vmspace is NULL, then the fault is fatal.
 		 */
 		if (p == NULL || (vm = p->p_vmspace) == NULL)
 			goto nogo;
 
 		map = &vm->vm_map;
 
 		/*
 		 * When accessing a usermode address, kernel must be
 		 * ready to accept the page fault, and provide a
 		 * handling routine.  Since accessing the address
 		 * without the handler is a bug, do not try to handle
 		 * it normally, and panic immediately.
 		 */
 		if (!usermode && (td->td_intr_nesting_level != 0 ||
 		    curpcb->pcb_onfault == NULL)) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
 	}
 
 	/*
 	 * If the trap was caused by errant bits in the PTE then panic.
 	 */
 	if (frame->tf_err & PGEX_RSV) {
 		trap_fatal(frame, eva);
 		return (-1);
 	}
 
 	/*
 	 * PGEX_I is defined only if the execute disable bit capability is
 	 * supported and enabled.
 	 */
 	if (frame->tf_err & PGEX_W)
 		ftype = VM_PROT_WRITE;
 	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
 		ftype = VM_PROT_EXECUTE;
 	else
 		ftype = VM_PROT_READ;
 
 	if (map != kernel_map) {
 		/*
 		 * Keep swapout from messing with us during this
 		 *	critical time.
 		 */
 		PROC_LOCK(p);
 		++p->p_lock;
 		PROC_UNLOCK(p);
 
 		/* Fault in the user page: */
 		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 
 		PROC_LOCK(p);
 		--p->p_lock;
 		PROC_UNLOCK(p);
 	} else {
 		/*
 		 * Don't have to worry about process locking or stacks in the
 		 * kernel.
 		 */
 		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 	}
 	if (rv == KERN_SUCCESS) {
 #ifdef HWPMC_HOOKS
 		/* Count resolved read and write faults for soft PMCs. */
 		if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
 			PMC_SOFT_CALL_TF( , , page_fault, all, frame);
 			if (ftype == VM_PROT_READ)
 				PMC_SOFT_CALL_TF( , , page_fault, read,
 				    frame);
 			else
 				PMC_SOFT_CALL_TF( , , page_fault, write,
 				    frame);
 		}
 #endif
 		return (0);
 	}
 nogo:
 	if (!usermode) {
 		/*
 		 * Unresolved kernel-mode fault: resume at the registered
 		 * onfault handler if there is one and we are not nested in
 		 * an interrupt; otherwise the fault is fatal.
 		 */
 		if (td->td_intr_nesting_level == 0 &&
 		    curpcb->pcb_onfault != NULL) {
 			frame->tf_rip = (long)curpcb->pcb_onfault;
 			return (0);
 		}
 		trap_fatal(frame, eva);
 		return (-1);
 	}
 	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 }
 
 /*
  * Report an unrecoverable trap and panic.
  *
  * Prints the trap type, the faulting address and decoded error code
  * (page faults only; 'eva' is the fault virtual address), the
  * instruction/stack/frame pointers, the code segment descriptor, selected
  * rflags bits and the current process.  With KDB, when debugger_on_panic
  * or kdb_active is set, the debugger is entered first and the function
  * returns if kdb_trap() reports the trap as handled; otherwise it ends in
  * panic().
  */
 static void
 trap_fatal(struct trapframe *frame, vm_offset_t eva)
 {
 	int code, ss;
 	u_int type;
 	long esp;
 	struct soft_segment_descriptor softseg;
 	char *msg;
 
 	code = frame->tf_err;
 	type = frame->tf_trapno;
 	/* Decode this CPU's GDT entry for the trapping code segment. */
 	sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)],
 	    &softseg);
 
 	if (type <= MAX_TRAP_MSG)
 		msg = trap_msg[type];
 	else
 		msg = "UNKNOWN";
 	printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
 	    ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	if (type == T_PAGEFLT) {
 		printf("fault virtual address	= 0x%lx\n", eva);
 		printf("fault code		= %s %s %s%s, %s\n",
 			code & PGEX_U ? "user" : "supervisor",
 			code & PGEX_W ? "write" : "read",
 			code & PGEX_I ? "instruction" : "data",
 			code & PGEX_RSV ? " rsv" : "",
 			code & PGEX_P ? "protection violation" : "page not present");
 	}
 	printf("instruction pointer	= 0x%lx:0x%lx\n",
 	       frame->tf_cs & 0xffff, frame->tf_rip);
 	if (ISPL(frame->tf_cs) == SEL_UPL) {
 		ss = frame->tf_ss & 0xffff;
 		esp = frame->tf_rsp;
 	} else {
 		/*
 		 * For a kernel-mode trap, report the kernel data selector
 		 * and the address of the frame's tf_rsp slot instead of
 		 * the saved tf_ss/tf_rsp values.
 		 */
 		ss = GSEL(GDATA_SEL, SEL_KPL);
 		esp = (long)&frame->tf_rsp;
 	}
 	printf("stack pointer	        = 0x%x:0x%lx\n", ss, esp);
 	printf("frame pointer	        = 0x%x:0x%lx\n", ss, frame->tf_rbp);
 	printf("code segment		= base 0x%lx, limit 0x%lx, type 0x%x\n",
 	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
 	printf("			= DPL %d, pres %d, long %d, def32 %d, gran %d\n",
 	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32,
 	       softseg.ssd_gran);
 	printf("processor eflags	= ");
 	if (frame->tf_rflags & PSL_T)
 		printf("trace trap, ");
 	if (frame->tf_rflags & PSL_I)
 		printf("interrupt enabled, ");
 	if (frame->tf_rflags & PSL_NT)
 		printf("nested task, ");
 	if (frame->tf_rflags & PSL_RF)
 		printf("resume, ");
 	printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
 	printf("current process		= ");
 	if (curproc) {
 		printf("%lu (%s)\n",
 		    (u_long)curproc->p_pid, curthread->td_name ?
 		    curthread->td_name : "");
 	} else {
 		printf("Idle\n");
 	}
 
 #ifdef KDB
 	/* Give the debugger a chance; return if it handled the trap. */
 	if (debugger_on_panic || kdb_active)
 		if (kdb_trap(type, 0, frame))
 			return;
 #endif
 	printf("trap number		= %d\n", type);
 	if (type <= MAX_TRAP_MSG)
 		panic("%s", trap_msg[type]);
 	else
 		panic("unknown/reserved trap");
 }
 
 /*
  * Double fault handler. Called when a fault occurs while writing
  * a frame for a trap/exception onto the stack. This usually occurs
  * when the stack overflows (such is the case with infinite recursion,
  * for example).
  */
 void
 dblfault_handler(struct trapframe *frame)
 {
 #ifdef KDTRACE_HOOKS
 	/* Call the DTrace double-trap hook first, if one is registered. */
 	if (dtrace_doubletrap_func != NULL)
 		(*dtrace_doubletrap_func)();
 #endif
 	/* Print the saved %rip, %rsp and %rbp from the frame, then panic. */
 	printf("\nFatal double fault\n");
 	printf("rip = 0x%lx\n", frame->tf_rip);
 	printf("rsp = 0x%lx\n", frame->tf_rsp);
 	printf("rbp = 0x%lx\n", frame->tf_rbp);
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	panic("double fault");
 }
 
 int
 cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
 {
 	struct proc *p;
 	struct trapframe *frame;
 	register_t *argp;
 	caddr_t params;
 	int reg, regcnt, error;
 
 	p = td->td_proc;
 	frame = td->td_frame;
 	reg = 0;
 	regcnt = 6;
 
 	params = (caddr_t)frame->tf_rsp + sizeof(register_t);
 	sa->code = frame->tf_rax;
 
 	if (sa->code == SYS_syscall || sa->code == SYS___syscall) {
 		sa->code = frame->tf_rdi;
 		reg++;
 		regcnt--;
 	}
  	if (p->p_sysent->sv_mask)
  		sa->code &= p->p_sysent->sv_mask;
 
  	if (sa->code >= p->p_sysent->sv_size)
  		sa->callp = &p->p_sysent->sv_table[0];
   	else
  		sa->callp = &p->p_sysent->sv_table[sa->code];
 
 	sa->narg = sa->callp->sy_narg;
 	KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]),
 	    ("Too many syscall arguments!"));
 	error = 0;
 	argp = &frame->tf_rdi;
 	argp += reg;
 	bcopy(argp, sa->args, sizeof(sa->args[0]) * regcnt);
 	if (sa->narg > regcnt) {
 		KASSERT(params != NULL, ("copyin args with no params!"));
 		error = copyin(params, &sa->args[regcnt],
 	    	    (sa->narg - regcnt) * sizeof(sa->args[0]));
 	}
 
 	if (error == 0) {
 		td->td_retval[0] = 0;
 		td->td_retval[1] = frame->tf_rdx;
 	}
 
 	return (error);
 }
 
 #include "../../kern/subr_syscall.c"
 
 /*
  * System call handler for native binaries.  The trap frame is already
  * set up by the assembler trampoline and a pointer to it is saved in
  * td_frame.
  *
  * Runs syscallenter()/syscallret() around the call.  When 'traced' is
  * non-zero, PSL_T is cleared in the frame and a SIGTRAP/TRAP_TRACE
  * signal is posted to the thread.
  */
 void
 amd64_syscall(struct thread *td, int traced)
 {
 	struct syscall_args sa;
 	int error;
 	ksiginfo_t ksi;
 
 #ifdef DIAGNOSTIC
 	if (ISPL(td->td_frame->tf_cs) != SEL_UPL) {
 		panic("syscall");
 		/* NOT REACHED */
 	}
 #endif
 	error = syscallenter(td, &sa);
 
 	/*
 	 * Traced syscall.
 	 */
 	if (__predict_false(traced)) {
 		td->td_frame->tf_rflags &= ~PSL_T;
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGTRAP;
 		ksi.ksi_code = TRAP_TRACE;
 		ksi.ksi_addr = (void *)td->td_frame->tf_rip;
 		trapsignal(td, &ksi);
 	}
 
 	/* The call must leave the PCB in user-FPU mode with pcb_save intact. */
 	KASSERT(PCB_USER_FPU(td->td_pcb),
 	    ("System call %s returning with kernel FPU ctx leaked",
 	     syscallname(td->td_proc, sa.code)));
 	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
 	    ("System call %s returning with mangled pcb_save",
 	     syscallname(td->td_proc, sa.code)));
 
 	syscallret(td, error, &sa);
 
 	/*
 	 * If the user-supplied value of %rip is not a canonical
 	 * address, then some CPUs will trigger a ring 0 #GP during
 	 * the sysret instruction.  However, the fault handler would
 	 * execute in ring 0 with the user's %gs and %rsp which would
 	 * not be safe.  Instead, use the full return path which
 	 * catches the problem safely.
 	 */
 	if (td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS)
 		set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 }
Index: head/sys/amd64/linux32/linux32_dummy.c
===================================================================
--- head/sys/amd64/linux32/linux32_dummy.c	(revision 258540)
+++ head/sys/amd64/linux32/linux32_dummy.c	(revision 258541)
@@ -1,176 +1,175 @@
 /*-
  * Copyright (c) 1994-1995 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer 
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 
 #include <amd64/linux32/linux.h>
 #include <amd64/linux32/linux32_proto.h>
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_util.h>
 
 /* DTrace init */
 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
 
 /*
  * One DUMMY() invocation per Linux system call named below.  The DUMMY()
  * macro is not defined in this file; NOTE(review): it presumably comes
  * from compat/linux/linux_util.h and emits a placeholder handler for the
  * named call -- confirm against that header.  The interleaved comments
  * give the Linux release each group of calls is associated with.
  */
 DUMMY(stime);
 DUMMY(olduname);
 DUMMY(syslog);
 DUMMY(uname);
 DUMMY(vhangup);
 DUMMY(swapoff);
 DUMMY(adjtimex);
 DUMMY(create_module);
 DUMMY(init_module);
 DUMMY(delete_module);
 DUMMY(get_kernel_syms);
 DUMMY(quotactl);
 DUMMY(bdflush);
 DUMMY(sysfs);
 DUMMY(query_module);
 DUMMY(nfsservctl);
 DUMMY(rt_sigqueueinfo);
 DUMMY(sendfile);
 DUMMY(setfsuid);
 DUMMY(setfsgid);
 DUMMY(pivot_root);
 DUMMY(mincore);
 DUMMY(ptrace);
 DUMMY(lookup_dcookie);
 DUMMY(epoll_create);
 DUMMY(epoll_ctl);
 DUMMY(epoll_wait);
 DUMMY(remap_file_pages);
 DUMMY(timer_create);
 DUMMY(timer_settime);
 DUMMY(timer_gettime);
 DUMMY(timer_getoverrun);
 DUMMY(timer_delete);
 DUMMY(fstatfs64);
 DUMMY(mbind);
 DUMMY(get_mempolicy);
 DUMMY(set_mempolicy);
 DUMMY(mq_open);
 DUMMY(mq_unlink);
 DUMMY(mq_timedsend);
 DUMMY(mq_timedreceive);
 DUMMY(mq_notify);
 DUMMY(mq_getsetattr);
 DUMMY(kexec_load);
 DUMMY(waitid);
 /* linux 2.6.11: */
 DUMMY(add_key);
 DUMMY(request_key);
 DUMMY(keyctl);
 /* linux 2.6.13: */
 DUMMY(ioprio_set);
 DUMMY(ioprio_get);
 DUMMY(inotify_init);
 DUMMY(inotify_add_watch);
 DUMMY(inotify_rm_watch);
 /* linux 2.6.16: */
 DUMMY(migrate_pages);
 DUMMY(pselect6);
 DUMMY(ppoll);
 DUMMY(unshare);
 /* linux 2.6.17: */
 DUMMY(splice);
 DUMMY(sync_file_range);
 DUMMY(tee);
 DUMMY(vmsplice);
 /* linux 2.6.18: */
 DUMMY(move_pages);
 /* linux 2.6.19: */
 DUMMY(getcpu);
 DUMMY(epoll_pwait);
 /* linux 2.6.22: */
 DUMMY(utimensat);
 DUMMY(signalfd);
 DUMMY(timerfd_create);
 DUMMY(eventfd);
 /* linux 2.6.23: */
 DUMMY(fallocate);
 /* linux 2.6.25: */
 DUMMY(timerfd_settime);
 DUMMY(timerfd_gettime);
 /* linux 2.6.27: */
 DUMMY(signalfd4);
 DUMMY(eventfd2);
 DUMMY(epoll_create1);
 DUMMY(dup3);
 DUMMY(inotify_init1);
 /* linux 2.6.30: */
 DUMMY(preadv);
 DUMMY(pwritev);
 /* linux 2.6.31: */
 DUMMY(rt_tsigqueueinfo);
 DUMMY(perf_event_open);
 /* linux 2.6.33: */
 DUMMY(recvmmsg);
 DUMMY(fanotify_init);
 DUMMY(fanotify_mark);
 /* linux 2.6.36: */
 DUMMY(prlimit64);
 /* later: */
 DUMMY(name_to_handle_at);
 DUMMY(open_by_handle_at);
 DUMMY(clock_adjtime);
 DUMMY(syncfs);
 DUMMY(sendmmsg);
 DUMMY(setns);
 DUMMY(process_vm_readv);
 DUMMY(process_vm_writev);
 
 /*
  * DUMMY_XATTR(s) defines the function linux_<s>xattr(td, arg), taking a
  * struct thread pointer and a struct linux_<s>xattr_args pointer.  The
  * generated function ignores both arguments and always returns ENOATTR.
  * It is instantiated below for the set/get/list/remove calls and their
  * "l" and "f" prefixed variants.
  */
 #define DUMMY_XATTR(s)						\
 int								\
 linux_ ## s ## xattr(						\
     struct thread *td, struct linux_ ## s ## xattr_args *arg)	\
 {								\
 								\
 	return (ENOATTR);					\
 }
 DUMMY_XATTR(set);
 DUMMY_XATTR(lset);
 DUMMY_XATTR(fset);
 DUMMY_XATTR(get);
 DUMMY_XATTR(lget);
 DUMMY_XATTR(fget);
 DUMMY_XATTR(list);
 DUMMY_XATTR(llist);
 DUMMY_XATTR(flist);
 DUMMY_XATTR(remove);
 DUMMY_XATTR(lremove);
 DUMMY_XATTR(fremove);
Index: head/sys/cam/ctl/ctl_backend_block.c
===================================================================
--- head/sys/cam/ctl/ctl_backend_block.c	(revision 258540)
+++ head/sys/cam/ctl/ctl_backend_block.c	(revision 258541)
@@ -1,2347 +1,2345 @@
 /*-
  * Copyright (c) 2003 Silicon Graphics International Corp.
  * Copyright (c) 2009-2011 Spectra Logic Corporation
  * Copyright (c) 2012 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Edward Tomasz Napierala
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    substantially similar to the "NO WARRANTY" disclaimer below
  *    ("Disclaimer") and any redistribution must be conditioned upon
  *    including a substantially similar Disclaimer requirement for further
  *    binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  *
  * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
  */
 /*
  * CAM Target Layer driver backend for block devices.
  *
  * Author: Ken Merry <ken@FreeBSD.org>
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <opt_kdtrace.h>
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/kthread.h>
 #include <sys/bio.h>
 #include <sys/fcntl.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 #include <sys/malloc.h>
 #include <sys/conf.h>
 #include <sys/ioccom.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/endian.h>
 #include <sys/uio.h>
 #include <sys/buf.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/mount.h>
 #include <sys/disk.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/proc.h>
 #include <sys/pcpu.h>
 #include <sys/module.h>
 #include <sys/sdt.h>
 #include <sys/devicestat.h>
 #include <sys/sysctl.h>
 
 #include <geom/geom.h>
 
 #include <cam/cam.h>
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_frontend_internal.h>
 #include <cam/ctl/ctl_ioctl.h>
 #include <cam/ctl/ctl_scsi_all.h>
 #include <cam/ctl/ctl_error.h>
 
 /*
  * The idea here is that we'll allocate enough S/G space to hold a 16MB
  * I/O.  If we get an I/O larger than that, we'll reject it.
  *
  * CTLBLK_MAX_SEGS is the number of MAXPHYS-sized segments needed for such
  * an I/O, plus one.  The whole expansion is parenthesized so that the
  * macro keeps its value when it appears inside a larger expression
  * (e.g. "2 * CTLBLK_MAX_SEGS").
  */
 #define	CTLBLK_MAX_IO_SIZE	(16 * 1024 * 1024)
 #define	CTLBLK_MAX_SEGS		((CTLBLK_MAX_IO_SIZE / MAXPHYS) + 1)
 
 /*
  * Debug tracing.  With CTLBLK_DEBUG defined, DPRINTF() prints its message
  * prefixed with "cbb(<function>:<line>): "; otherwise it expands to an
  * empty statement.
  */
 #ifdef CTLBLK_DEBUG
 #define DPRINTF(fmt, args...) \
     printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
 #else
 #define DPRINTF(fmt, args...) do {} while(0)
 #endif
 
 SDT_PROVIDER_DEFINE(cbb);
 
 /*
  * State flags kept in ctl_be_block_lun.flags.  Each value is a distinct
  * bit, so several may be set at once.
  * NOTE(review): the code that sets and tests these flags is outside this
  * part of the file; per-flag meaning should be confirmed there.
  */
 typedef enum {
 	CTL_BE_BLOCK_LUN_UNCONFIGURED	= 0x01,
 	CTL_BE_BLOCK_LUN_CONFIG_ERR	= 0x02,
 	CTL_BE_BLOCK_LUN_WAITING	= 0x04,
 	CTL_BE_BLOCK_LUN_MULTI_THREAD	= 0x08
 } ctl_be_block_lun_flags;
 
 /*
  * Kind of backing store behind a LUN, kept in ctl_be_block_lun.dev_type.
  * NOTE(review): presumably DEV/FILE select which member of
  * union ctl_be_block_bedata is valid -- confirm in the open path.
  */
 typedef enum {
 	CTL_BE_BLOCK_NONE,
 	CTL_BE_BLOCK_DEV,
 	CTL_BE_BLOCK_FILE
 } ctl_be_block_type;
 
 /*
  * Backend data for a LUN backed by a character device.
  */
 struct ctl_be_block_devdata {
 	/* Device that bios are addressed to (stored in bio_dev). */
 	struct cdev *cdev;
 	/* Device switch; its d_strategy routine is called to issue bios. */
 	struct cdevsw *csw;
 	/* NOTE(review): not used in the visible code; presumably a device
 	 * reference token -- confirm in the open/close path. */
 	int dev_ref;
 };
 
 /*
  * Backend data for a LUN backed by a file.
  */
 struct ctl_be_block_filedata {
 	/* Credentials passed to VOP_READ()/VOP_WRITE() on the backing vnode. */
 	struct ucred *cred;
 };
 
 /*
  * Per-LUN backend data; the device and file variants share storage.
  */
 union ctl_be_block_bedata {
 	struct ctl_be_block_devdata dev;
 	struct ctl_be_block_filedata file;
 };
 
 struct ctl_be_block_io;
 struct ctl_be_block_lun;
 
 /*
  * Signature shared by the per-backend I/O routines stored in
  * ctl_be_block_lun.dispatch and ctl_be_block_lun.lun_flush: each takes the
  * LUN and the beio describing the I/O to perform, and returns nothing.
  */
 typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
 			       struct ctl_be_block_io *beio);
 
 /*
  * Backend LUN structure.  There is a 1:1 mapping between a block device
  * and a backend block LUN, and between a backend block LUN and a CTL LUN.
  */
 struct ctl_be_block_lun {
 	struct ctl_block_disk *disk;
 	char lunname[32];
 	char *dev_path;
 	ctl_be_block_type dev_type;
 	/* Backing vnode; locked around VOP_READ/VOP_WRITE/VOP_FSYNC. */
 	struct vnode *vn;
 	union ctl_be_block_bedata backend;
 	/* Backend-specific I/O routines; lun_flush is the one invoked for
 	 * SYNCHRONIZE CACHE commands. */
 	cbb_dispatch_t dispatch;
 	cbb_dispatch_t lun_flush;
 	/* Taken around datamove_queue insertion and around the per-beio
 	 * bio accounting (num_bios_sent/num_bios_done/send_complete). */
 	struct mtx lock;
 	/* UMA zone the S/G segment buffers are returned to on free. */
 	uma_zone_t lun_zone;
 	uint64_t size_blocks;
 	uint64_t size_bytes;
 	/* Block size in bytes; used to convert block counts and offsets. */
 	uint32_t blocksize;
 	int blocksize_shift;
 	struct ctl_be_block_softc *softc;
 	/* devstat handle; every I/O is bracketed by
 	 * devstat_start_transaction()/devstat_end_transaction() on it. */
 	struct devstat *disk_stats;
 	ctl_be_block_lun_flags flags;
 	STAILQ_ENTRY(ctl_be_block_lun) links;
 	struct ctl_be_lun ctl_be_lun;
 	/* Task queue and task enqueued when work is added for this LUN. */
 	struct taskqueue *io_taskqueue;
 	struct task io_task;
 	int num_threads;
 	STAILQ_HEAD(, ctl_io_hdr) input_queue;
 	STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
 	/* I/Os whose data move has completed and that still need backend
 	 * I/O; filled by ctl_be_block_move_done(). */
 	STAILQ_HEAD(, ctl_io_hdr) datamove_queue;
 };
 
 /*
  * Overall softc structure for the block backend module.
  */
 struct ctl_be_block_softc {
 	/* Free list of beio structures; see ctl_alloc_beio()/ctl_free_beio(). */
 	STAILQ_HEAD(, ctl_be_block_io)   beio_free_queue;
 	/* Held for every access to beio_free_queue in the visible code. */
 	struct mtx			 lock;
 	int				 prealloc_beio;
 	int				 num_disks;
 	STAILQ_HEAD(, ctl_block_disk)	 disk_list;
 	int				 num_luns;
 	STAILQ_HEAD(, ctl_be_block_lun)	 lun_list;
 };
 
 static struct ctl_be_block_softc backend_block_softc;
 
 /*
  * Per-I/O information.
  */
 struct ctl_be_block_io {
 	/* CTL I/O this backend I/O is servicing. */
 	union ctl_io			*io;
 	/* Data buffers; the first num_segs entries are in use. */
 	struct ctl_sg_entry		sg_segs[CTLBLK_MAX_SEGS];
 	/* iovec view of sg_segs, filled in for file-backed I/O. */
 	struct iovec			xiovecs[CTLBLK_MAX_SEGS];
 	/* BIO_READ, BIO_WRITE or BIO_FLUSH. */
 	int				bio_cmd;
 	int				bio_flags;
 	int				num_segs;
 	/*
 	 * Device-backed I/O bookkeeping: bios issued, bios completed, a
 	 * flag set once the last bio has been issued, and the number of
 	 * bios that completed with an error.
 	 */
 	int				num_bios_sent;
 	int				num_bios_done;
 	int				send_complete;
 	int				num_errors;
 	/* devstat start time and transaction classification. */
 	struct bintime			ds_t0;
 	devstat_tag_type		ds_tag_type;
 	devstat_trans_flags		ds_trans_type;
 	/* Length and starting byte offset of the I/O. */
 	uint64_t			io_len;
 	uint64_t			io_offset;
 	struct ctl_be_block_softc	*softc;
 	struct ctl_be_block_lun		*lun;
 	/* Linkage on the softc free list. */
 	STAILQ_ENTRY(ctl_be_block_io)	links;
 };
 
 /*
  * Number of threads per backing file (default 14).  Settable through the
  * kern.cam.ctl.block.num_threads loader tunable and exported read/write
  * as the sysctl of the same name.
  */
 static int cbb_num_threads = 14;
 TUNABLE_INT("kern.cam.ctl.block.num_threads", &cbb_num_threads);
 SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0,
 	    "CAM Target Layer Block Backend");
 SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RW,
            &cbb_num_threads, 0, "Number of threads per backing file");
 
 static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
 static void ctl_free_beio(struct ctl_be_block_io *beio);
 static int ctl_grow_beio(struct ctl_be_block_softc *softc, int count);
 #if 0
 static void ctl_shrink_beio(struct ctl_be_block_softc *softc);
 #endif
 static void ctl_complete_beio(struct ctl_be_block_io *beio);
 static int ctl_be_block_move_done(union ctl_io *io);
 static void ctl_be_block_biodone(struct bio *bio);
 static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
 				    struct ctl_be_block_io *beio);
 static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
 				       struct ctl_be_block_io *beio);
 static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
 				   struct ctl_be_block_io *beio);
 static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
 				      struct ctl_be_block_io *beio);
 static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
 				    union ctl_io *io);
 static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
 				  union ctl_io *io);
 static void ctl_be_block_worker(void *context, int pending);
 static int ctl_be_block_submit(union ctl_io *io);
 static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
 				   int flag, struct thread *td);
 static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
 				  struct ctl_lun_req *req);
 static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
 				 struct ctl_lun_req *req);
 static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
 static int ctl_be_block_open(struct ctl_be_block_softc *softc,
 			     struct ctl_be_block_lun *be_lun,
 			     struct ctl_lun_req *req);
 static int ctl_be_block_create(struct ctl_be_block_softc *softc,
 			       struct ctl_lun_req *req);
 static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
 			   struct ctl_lun_req *req);
 static int ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
 				  struct ctl_lun_req *req);
 static int ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
 				 struct ctl_lun_req *req);
 static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
 			   struct ctl_lun_req *req);
 static void ctl_be_block_lun_shutdown(void *be_lun);
 static void ctl_be_block_lun_config_status(void *be_lun,
 					   ctl_lun_config_status status);
 static int ctl_be_block_config_write(union ctl_io *io);
 static int ctl_be_block_config_read(union ctl_io *io);
 static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb);
 int ctl_be_block_init(void);
 
 /*
  * Backend driver descriptor for the "block" backend: names the backend and
  * supplies the entry points (init, data submit, data-move completion,
  * config read/write, ioctl and LUN info) implemented in this file.
  */
 static struct ctl_backend_driver ctl_be_block_driver = 
 {
 	.name = "block",
 	.flags = CTL_BE_FLAG_HAS_CONFIG,
 	.init = ctl_be_block_init,
 	.data_submit = ctl_be_block_submit,
 	.data_move_done = ctl_be_block_move_done,
 	.config_read = ctl_be_block_config_read,
 	.config_write = ctl_be_block_config_write,
 	.ioctl = ctl_be_block_ioctl,
 	.lun_info = ctl_be_block_lun_info
 };
 
 MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend");
 CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);
 
 /*
  * Take a beio off the softc free list, growing the list ten entries at a
  * time whenever it is found empty.
  *
  * The returned structure is zeroed except for its softc back pointer.
  * NULL is returned only if ctl_grow_beio() reports that it added nothing,
  * which should not happen because it uses a blocking allocation.
  */
 static struct ctl_be_block_io *
 ctl_alloc_beio(struct ctl_be_block_softc *softc)
 {
 	struct ctl_be_block_io *entry;
 
 	for (;;) {
 		/* Pop the head of the free list, if there is one. */
 		mtx_lock(&softc->lock);
 		entry = STAILQ_FIRST(&softc->beio_free_queue);
 		if (entry != NULL) {
 			STAILQ_REMOVE(&softc->beio_free_queue, entry,
 			    ctl_be_block_io, links);
 		}
 		mtx_unlock(&softc->lock);
 
 		if (entry != NULL)
 			break;
 
 		/*
 		 * The list was empty.  Refill it and try again; the lock is
 		 * not held while allocating, so another thread may consume
 		 * the new entries before we get back to the list.
 		 */
 		if (ctl_grow_beio(softc, /*count*/ 10) == 0)
 			return (NULL);
 	}
 
 	bzero(entry, sizeof(*entry));
 	entry->softc = softc;
 	return (entry);
 }
 
 /*
  * Release the data buffers attached to a beio and put the beio back on the
  * softc free list.
  *
  * Every in-use S/G entry is handed to uma_zfree() on the LUN's zone and
  * its pointer cleared.  Entries whose pointer was already NULL are counted
  * and reported once on the console.
  */
 static void
 ctl_free_beio(struct ctl_be_block_io *beio)
 {
 	struct ctl_be_block_softc *sc;
 	int already_null;
 	int seg;
 
 	sc = beio->softc;
 	already_null = 0;
 
 	seg = 0;
 	while (seg < beio->num_segs) {
 		if (beio->sg_segs[seg].addr == NULL)
 			already_null++;
 
 		uma_zfree(beio->lun->lun_zone, beio->sg_segs[seg].addr);
 		beio->sg_segs[seg].addr = NULL;
 		seg++;
 	}
 
 	if (already_null != 0) {
 		printf("%s: %d duplicate frees out of %d segments\n", __func__,
 		    already_null, beio->num_segs);
 	}
 
 	mtx_lock(&sc->lock);
 	STAILQ_INSERT_TAIL(&sc->beio_free_queue, beio, links);
 	mtx_unlock(&sc->lock);
 }
 
 /*
  * Allocate "count" zeroed beio structures (blocking allocation) and append
  * each one to the softc free list.  Returns the number of structures
  * added.
  */
 static int
 ctl_grow_beio(struct ctl_be_block_softc *softc, int count)
 {
 	struct ctl_be_block_io *fresh;
 	int added;
 
 	added = 0;
 	while (added < count) {
 		fresh = malloc(sizeof(*fresh), M_CTLBLK, M_WAITOK | M_ZERO);
 		fresh->softc = softc;
 
 		/* The lock covers only the list insertion, not the malloc. */
 		mtx_lock(&softc->lock);
 		STAILQ_INSERT_TAIL(&softc->beio_free_queue, fresh, links);
 		mtx_unlock(&softc->lock);
 
 		added++;
 	}
 
 	return (added);
 }
 
 #if 0
 /*
  * Currently compiled out.  Empties the softc free list, freeing every beio
  * on it, with the softc lock held for the whole walk.
  */
 static void
 ctl_shrink_beio(struct ctl_be_block_softc *softc)
 {
 	struct ctl_be_block_io *beio, *beio_tmp;
 
 	mtx_lock(&softc->lock);
 	STAILQ_FOREACH_SAFE(beio, &softc->beio_free_queue, links, beio_tmp) {
 		STAILQ_REMOVE(&softc->beio_free_queue, beio,
 			      ctl_be_block_io, links);
 		free(beio, M_CTLBLK);
 	}
 	mtx_unlock(&softc->lock);
 }
 #endif
 
 /*
  * Finish a backend I/O: close the devstat transaction, release the beio
  * and complete the CTL I/O it was servicing.
  *
  * The byte count reported to devstat is the I/O length when the CTL status
  * is CTL_SUCCESS and zero otherwise.
  */
 static void
 ctl_complete_beio(struct ctl_be_block_io *beio)
 {
 	union ctl_io *io = beio->io;
 	int bytes_done;
 
 	bytes_done = ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS) ?
 	    beio->io_len : 0;
 
 	devstat_end_transaction(beio->lun->disk_stats, /*bytes*/ bytes_done,
 	    beio->ds_tag_type, beio->ds_trans_type, /*now*/ NULL,
 	    /*then*/ &beio->ds_t0);
 
 	/* io was saved above; the beio must not be touched once freed. */
 	ctl_free_beio(beio);
 	ctl_done(io);
 }
 
 /*
  * Data-move completion callback (installed as be_move_done on the I/O and
  * as data_move_done in the backend driver descriptor).
  *
  * The beio is recovered from the I/O's CTL_PRIV_BACKEND private slot.
  * Reads, aborted I/Os and I/Os that already carry a status are completed
  * here.  A write whose data arrived without error is queued on the LUN's
  * datamove_queue and the LUN task is scheduled to perform the backend
  * I/O.  Always returns 0.
  */
 static int
 ctl_be_block_move_done(union ctl_io *io)
 {
 	struct ctl_be_block_io *beio;
 	struct ctl_be_block_lun *be_lun;
 #ifdef CTL_TIME_IO
 	struct bintime cur_bt;
 #endif  
 
 	beio = (struct ctl_be_block_io *)
 		io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr;
 
 	/* Saved now: beio may be released by ctl_complete_beio() below. */
 	be_lun = beio->lun;
 
 	DPRINTF("entered\n");
 
 #ifdef CTL_TIME_IO
 	/* Add the time spent in this data move to the I/O's DMA total. */
 	getbintime(&cur_bt);
 	bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
 	bintime_add(&io->io_hdr.dma_bt, &cur_bt);
 	io->io_hdr.num_dmas++;
 #endif  
 
 	/*
 	 * We set status at this point for read commands, and write
 	 * commands with errors.
 	 */
 	if ((beio->bio_cmd == BIO_READ)
 	 && (io->io_hdr.port_status == 0)
 	 && ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0)
 	 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE))
 		ctl_set_success(&io->scsiio);
 	else if ((io->io_hdr.port_status != 0)
 	      && ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0)
 	      && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
 		/*
 		 * For hardware error sense keys, the sense key
 		 * specific value is defined to be a retry count,
 		 * but we use it to pass back an internal FETD
 		 * error code.  XXX KDM  Hopefully the FETD is only
 		 * using 16 bits for an error code, since that's
 		 * all the space we have in the sks field.
 		 */
 		ctl_set_internal_failure(&io->scsiio,
 					 /*sks_valid*/ 1,
 					 /*retry_count*/
 					 io->io_hdr.port_status);
 	}
 
 	/*
 	 * If this is a read, or a write with errors, it is done.
 	 */
 	if ((beio->bio_cmd == BIO_READ)
 	 || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
 	 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
 		ctl_complete_beio(beio);
 		return (0);
 	}
 
 	/*
 	 * At this point, we have a write and the DMA completed
 	 * successfully.  We now have to queue it to the task queue to
 	 * execute the backend I/O.  That is because we do blocking
 	 * memory allocations, and in the file backing case, blocking I/O.
 	 * This move done routine is generally called in the SIM's
 	 * interrupt context, and therefore we cannot block.
 	 */
 	mtx_lock(&be_lun->lock);
 	/*
 	 * XXX KDM make sure that links is okay to use at this point.
 	 * Otherwise, we either need to add another field to ctl_io_hdr,
 	 * or deal with resource allocation here.
 	 */
 	STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
 	mtx_unlock(&be_lun->lock);
 
 	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
 
 	return (0);
 }
 
 /*
  * bio completion callback for device-backed I/O (installed as bio_done).
  *
  * The owning beio is carried in bio_caller1.  Under the LUN lock the bio's
  * outcome is folded into the beio counters and the bio is destroyed.  Only
  * the completion that finds send_complete set and all issued bios done
  * goes on to finish the beio: any error fails the whole I/O, a successful
  * write or flush is completed, and a successful read starts the data move
  * to the initiator.
  */
 static void
 ctl_be_block_biodone(struct bio *bio)
 {
 	struct ctl_be_block_io *beio;
 	struct ctl_be_block_lun *be_lun;
 	union ctl_io *io;
 
 	beio = bio->bio_caller1;
 	be_lun = beio->lun;
 	io = beio->io;
 
 	DPRINTF("entered\n");
 
 	mtx_lock(&be_lun->lock);
 	if (bio->bio_error != 0)
 		beio->num_errors++;
 
 	beio->num_bios_done++;
 
 	/*
 	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
 	 * during the free might cause it to complain.
 	 */
 	g_destroy_bio(bio);
 
 	/*
 	 * If the send complete bit isn't set, or we aren't the last I/O to
 	 * complete, then we're done.
 	 */
 	if ((beio->send_complete == 0)
 	 || (beio->num_bios_done < beio->num_bios_sent)) {
 		mtx_unlock(&be_lun->lock);
 		return;
 	}
 
 	/*
 	 * At this point, we've verified that we are the last I/O to
 	 * complete, so it's safe to drop the lock.
 	 */
 	mtx_unlock(&be_lun->lock);
 
 	/*
 	 * If there are any errors from the backing device, we fail the
 	 * entire I/O with a medium error.
 	 */
 	if (beio->num_errors > 0) {
 		if (beio->bio_cmd == BIO_FLUSH) {
 			/* XXX KDM is there a better error here? */
 			ctl_set_internal_failure(&io->scsiio,
 						 /*sks_valid*/ 1,
 						 /*retry_count*/ 0xbad2);
 		} else
 			ctl_set_medium_error(&io->scsiio);
 		ctl_complete_beio(beio);
 		return;
 	}
 
 	/*
 	 * If this is a write or a flush, we're all done.
 	 * If this is a read, we can now send the data to the user.
 	 */
 	if ((beio->bio_cmd == BIO_WRITE)
 	 || (beio->bio_cmd == BIO_FLUSH)) {
 		ctl_set_success(&io->scsiio);
 		ctl_complete_beio(beio);
 	} else {
 		/* Describe the S/G list to CTL and start the data move. */
 		io->scsiio.be_move_done = ctl_be_block_move_done;
 		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
 		io->scsiio.kern_data_len = beio->io_len;
 		io->scsiio.kern_total_len = beio->io_len;
 		io->scsiio.kern_rel_offset = 0;
 		io->scsiio.kern_data_resid = 0;
 		io->scsiio.kern_sg_entries = beio->num_segs;
 		io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
 #ifdef CTL_TIME_IO
         	getbintime(&io->io_hdr.dma_start_bt);
 #endif  
 		ctl_datamove(io);
 	}
 }
 
 /*
  * Flush a file-backed LUN: VOP_FSYNC() the backing vnode with MNT_WAIT and
  * complete the beio.
  *
  * The vnode is locked shared when the mount allows shared writes and
  * exclusive otherwise.  A failed fsync is reported to the initiator as an
  * internal failure carrying the code 0xbad1.
  */
 static void
 ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
 			struct ctl_be_block_io *beio)
 {
 	struct mount *mp;
 	union ctl_io *io;
 	int lkflags, rc;
 
 	DPRINTF("entered\n");
 
 	io = beio->io;
 
 	(void)vn_start_write(be_lun->vn, &mp, V_WAIT);
 
 	/* Exclusive unless the filesystem supports shared writes. */
 	lkflags = LK_EXCLUSIVE;
 	if (MNT_SHARED_WRITES(mp) ||
 	    ((mp == NULL) && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
 		lkflags = LK_SHARED;
 
 	vn_lock(be_lun->vn, lkflags | LK_RETRY);
 
 	/* The devstat transaction is timed from just before the fsync. */
 	binuptime(&beio->ds_t0);
 	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
 
 	rc = VOP_FSYNC(be_lun->vn, MNT_WAIT, curthread);
 	VOP_UNLOCK(be_lun->vn, 0);
 
 	vn_finished_write(mp);
 
 	if (rc != 0) {
 		/* XXX KDM is there a better error here? */
 		ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1,
 		    /*retry_count*/ 0xbad1);
 	} else {
 		ctl_set_success(&io->scsiio);
 	}
 
 	ctl_complete_beio(beio);
 }
 
 /*
  * SDT probes of the "cbb" provider fired by ctl_be_block_dispatch_file()
  * on entry (file_start) and after a successful file read or write
  * (file_done).
  */
 SDT_PROBE_DEFINE1(cbb, kernel, read, file_start, file_start, "uint64_t");
 SDT_PROBE_DEFINE1(cbb, kernel, write, file_start, file_start, "uint64_t");
 SDT_PROBE_DEFINE1(cbb, kernel, read, file_done, file_done,"uint64_t");
 SDT_PROBE_DEFINE1(cbb, kernel, write, file_done, file_done, "uint64_t");
 
 static void
 ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
 			   struct ctl_be_block_io *beio)
 {
 	struct ctl_be_block_filedata *file_data;
 	union ctl_io *io;
 	struct uio xuio;
 	struct iovec *xiovec;
 	int flags;
 	int error, i;
 
 	DPRINTF("entered\n");
 
 	file_data = &be_lun->backend.file;
 	io = beio->io;
 	flags = beio->bio_flags;
 
 	if (beio->bio_cmd == BIO_READ) {
 		SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
 	} else {
 		SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
 	}
 
 	bzero(&xuio, sizeof(xuio));
 	if (beio->bio_cmd == BIO_READ)
 		xuio.uio_rw = UIO_READ;
 	else
 		xuio.uio_rw = UIO_WRITE;
 
 	xuio.uio_offset = beio->io_offset;
 	xuio.uio_resid = beio->io_len;
 	xuio.uio_segflg = UIO_SYSSPACE;
 	xuio.uio_iov = beio->xiovecs;
 	xuio.uio_iovcnt = beio->num_segs;
 	xuio.uio_td = curthread;
 
 	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
 		xiovec->iov_base = beio->sg_segs[i].addr;
 		xiovec->iov_len = beio->sg_segs[i].len;
 	}
 
 	if (beio->bio_cmd == BIO_READ) {
 		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
 
 		binuptime(&beio->ds_t0);
 		devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
 
 		/*
 		 * UFS pays attention to IO_DIRECT for reads.  If the
 		 * DIRECTIO option is configured into the kernel, it calls
 		 * ffs_rawread().  But that only works for single-segment
 		 * uios with user space addresses.  In our case, with a
 		 * kernel uio, it still reads into the buffer cache, but it
 		 * will just try to release the buffer from the cache later
 		 * on in ffs_read().
 		 *
 		 * ZFS does not pay attention to IO_DIRECT for reads.
 		 *
 		 * UFS does not pay attention to IO_SYNC for reads.
 		 *
 		 * ZFS pays attention to IO_SYNC (which translates into the
 		 * Solaris define FRSYNC for zfs_read()) for reads.  It
 		 * attempts to sync the file before reading.
 		 *
 		 * So, to attempt to provide some barrier semantics in the
 		 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
 		 */
 		error = VOP_READ(be_lun->vn, &xuio, (flags & BIO_ORDERED) ?
 				 (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
 
 		VOP_UNLOCK(be_lun->vn, 0);
 	} else {
 		struct mount *mountpoint;
 		int lock_flags;
 
 		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);
 
 		if (MNT_SHARED_WRITES(mountpoint)
 		 || ((mountpoint == NULL)
 		  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
 			lock_flags = LK_SHARED;
 		else
 			lock_flags = LK_EXCLUSIVE;
 
 		vn_lock(be_lun->vn, lock_flags | LK_RETRY);
 
 		binuptime(&beio->ds_t0);
 		devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
 
 		/*
 		 * UFS pays attention to IO_DIRECT for writes.  The write
 		 * is done asynchronously.  (Normally the write would just
 		 * get put into cache.
 		 *
 		 * UFS pays attention to IO_SYNC for writes.  It will
 		 * attempt to write the buffer out synchronously if that
 		 * flag is set.
 		 *
 		 * ZFS does not pay attention to IO_DIRECT for writes.
 		 *
 		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
 		 * for writes.  It will flush the transaction from the
 		 * cache before returning.
 		 *
 		 * So if we've got the BIO_ORDERED flag set, we want
 		 * IO_SYNC in either the UFS or ZFS case.
 		 */
 		error = VOP_WRITE(be_lun->vn, &xuio, (flags & BIO_ORDERED) ?
 				  IO_SYNC : 0, file_data->cred);
 		VOP_UNLOCK(be_lun->vn, 0);
 
 		vn_finished_write(mountpoint);
         }
 
 	/*
 	 * If we got an error, set the sense data to "MEDIUM ERROR" and
 	 * return the I/O to the user.
 	 */
 	if (error != 0) {
 		char path_str[32];
 
 		ctl_scsi_path_string(io, path_str, sizeof(path_str));
 		/*
 		 * XXX KDM ZFS returns ENOSPC when the underlying
 		 * filesystem fills up.  What kind of SCSI error should we
 		 * return for that?
 		 */
 		printf("%s%s command returned errno %d\n", path_str,
 		       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE", error);
 		ctl_set_medium_error(&io->scsiio);
 		ctl_complete_beio(beio);
 		return;
 	}
 
 	/*
 	 * If this is a write, we're all done.
 	 * If this is a read, we can now send the data to the user.
 	 */
 	if (beio->bio_cmd == BIO_WRITE) {
 		ctl_set_success(&io->scsiio);
 		SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);
 		ctl_complete_beio(beio);
 	} else {
 		SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
 		io->scsiio.be_move_done = ctl_be_block_move_done;
 		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
 		io->scsiio.kern_data_len = beio->io_len;
 		io->scsiio.kern_total_len = beio->io_len;
 		io->scsiio.kern_rel_offset = 0;
 		io->scsiio.kern_data_resid = 0;
 		io->scsiio.kern_sg_entries = beio->num_segs;
 		io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
 #ifdef CTL_TIME_IO
         	getbintime(&io->io_hdr.dma_start_bt);
 #endif  
 		ctl_datamove(io);
 	}
 }
 
 static void
 ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
 		       struct ctl_be_block_io *beio)
 {
 	struct bio *bio;
 	union ctl_io *io;
 	struct ctl_be_block_devdata *dev_data;
 
 	dev_data = &be_lun->backend.dev;
 	io = beio->io;
 
 	DPRINTF("entered\n");
 
 	/* This can't fail, it's a blocking allocation. */
 	bio = g_alloc_bio();
 
 	bio->bio_cmd	    = BIO_FLUSH;
 	bio->bio_flags	   |= BIO_ORDERED;
 	bio->bio_dev	    = dev_data->cdev;
 	bio->bio_offset	    = 0;
 	bio->bio_data	    = 0;
 	bio->bio_done	    = ctl_be_block_biodone;
 	bio->bio_caller1    = beio;
 	bio->bio_pblkno	    = 0;
 
 	/*
 	 * We don't need to acquire the LUN lock here, because we are only
 	 * sending one bio, and so there is no other context to synchronize
 	 * with.
 	 */
 	beio->num_bios_sent = 1;
 	beio->send_complete = 1;
 
 	binuptime(&beio->ds_t0);
 	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
 
 	(*dev_data->csw->d_strategy)(bio);
 }
 
 /*
  * Issue the I/O described by beio to the backing device as one or more
  * bios.
  *
  * Each S/G segment is cut into pieces no larger than the device's
  * si_iosize_max (DFLTPHYS when the driver reports less than PAGE_SIZE)
  * and passed to the device's d_strategy routine.  Completions are
  * collected in ctl_be_block_biodone(), which finishes the beio once
  * send_complete is set and every issued bio has completed.
  *
  * Once the last bio has been issued the beio may be completed and
  * recycled at any moment, so it must not be dereferenced after that
  * point; the segment count is therefore read once into a local before the
  * loop starts.
  */
 static void
 ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
 			  struct ctl_be_block_io *beio)
 {
 	int i, num_segs;
 	struct bio *bio;
 	struct ctl_be_block_devdata *dev_data;
 	off_t cur_offset;
 	int max_iosize;
 
 	DPRINTF("entered\n");
 
 	dev_data = &be_lun->backend.dev;
 
 	/*
 	 * We have to limit our I/O size to the maximum supported by the
 	 * backend device.  Hopefully it is MAXPHYS.  If the driver doesn't
 	 * set it properly, use DFLTPHYS.
 	 */
 	max_iosize = dev_data->cdev->si_iosize_max;
 	if (max_iosize < PAGE_SIZE)
 		max_iosize = DFLTPHYS;
 
 	cur_offset = beio->io_offset;
 
 	/*
 	 * XXX KDM need to accurately reflect the number of I/Os outstanding
 	 * to a device.
 	 */
 	binuptime(&beio->ds_t0);
 	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
 
 	/*
 	 * Snapshot the segment count: the loop condition is evaluated once
 	 * more after the final bio has been issued, when beio may already
 	 * have been released by the completion path.
 	 */
 	num_segs = beio->num_segs;
 
 	for (i = 0; i < num_segs; i++) {
 		size_t cur_size;
 		uint8_t *cur_ptr;
 
 		cur_size = beio->sg_segs[i].len;
 		cur_ptr = beio->sg_segs[i].addr;
 
 		while (cur_size > 0) {
 			/* This can't fail, it's a blocking allocation. */
 			bio = g_alloc_bio();
 
 			KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));
 
 			bio->bio_cmd = beio->bio_cmd;
 			bio->bio_flags |= beio->bio_flags;
 			bio->bio_dev = dev_data->cdev;
 			bio->bio_caller1 = beio;
 			bio->bio_length = min(cur_size, max_iosize);
 			bio->bio_offset = cur_offset;
 			bio->bio_data = cur_ptr;
 			bio->bio_done = ctl_be_block_biodone;
 			bio->bio_pblkno = cur_offset / be_lun->blocksize;
 
 			cur_offset += bio->bio_length;
 			cur_ptr += bio->bio_length;
 			cur_size -= bio->bio_length;
 
 			/*
 			 * Make sure we set the complete bit just before we
 			 * issue the last bio so we don't wind up with a
 			 * race.
 			 *
 			 * Use the LUN mutex here instead of a combination
 			 * of atomic variables for simplicity.
 			 *
 			 * XXX KDM we could have a per-IO lock, but that
 			 * would cause additional per-IO setup and teardown
 			 * overhead.  Hopefully there won't be too much
 			 * contention on the LUN lock.
 			 */
 			mtx_lock(&be_lun->lock);
 
 			beio->num_bios_sent++;
 
 			if ((i == num_segs - 1)
 			 && (cur_size == 0))
 				beio->send_complete = 1;
 
 			mtx_unlock(&be_lun->lock);
 
 			(*dev_data->csw->d_strategy)(bio);
 		}
 	}
 }
 
 /*
  * Dispatch a configuration-write command to the backend.
  *
  * A beio is allocated and tied to the I/O through the CTL_PRIV_BACKEND
  * private slot.  SYNCHRONIZE CACHE (10 and 16 byte forms) is turned into a
  * flush through the LUN's lun_flush routine; any other opcode panics.
  */
 static void
 ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
 			 union ctl_io *io)
 {
 	struct ctl_be_block_softc *sc;
 	struct ctl_be_block_io *req;
 
 	DPRINTF("entered\n");
 
 	sc = be_lun->softc;
 	req = ctl_alloc_beio(sc);
 	if (req == NULL) {
 		/*
 		 * Not expected: ctl_alloc_beio() grows the free list with
 		 * a blocking (M_WAITOK) allocation, which should not fail.
 		 */
 		ctl_set_busy(&io->scsiio);
 		ctl_done(io);
 		return;
 	}
 
 	req->io = io;
 	req->softc = sc;
 	req->lun = be_lun;
 	io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr = req;
 
 	if (io->scsiio.cdb[0] == SYNCHRONIZE_CACHE ||
 	    io->scsiio.cdb[0] == SYNCHRONIZE_CACHE_16) {
 		/* A flush carries no data and is accounted as ordered. */
 		req->bio_cmd = BIO_FLUSH;
 		req->ds_trans_type = DEVSTAT_NO_DATA;
 		req->ds_tag_type = DEVSTAT_TAG_ORDERED;
 		req->io_len = 0;
 		be_lun->lun_flush(be_lun, req);
 	} else {
 		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
 	}
 }
 
 /*
  * Static tracing probes for the "cbb" provider.  ctl_be_block_dispatch()
  * fires the "start" probe on entry and the "alloc_done" probe once its
  * S/G segments have been allocated, using the read or write variant
  * according to the direction of the I/O.  Each probe is declared with a
  * single uint64_t argument.
  */
 SDT_PROBE_DEFINE1(cbb, kernel, read, start, start, "uint64_t");
 SDT_PROBE_DEFINE1(cbb, kernel, write, start, start, "uint64_t");
 SDT_PROBE_DEFINE1(cbb, kernel, read, alloc_done, alloc_done, "uint64_t");
 SDT_PROBE_DEFINE1(cbb, kernel, write, alloc_done, alloc_done, "uint64_t");
 
 static void
 ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
 			   union ctl_io *io)
 {
 	struct ctl_be_block_io *beio;
 	struct ctl_be_block_softc *softc;
 	struct ctl_lba_len lbalen;
 	uint64_t len_left, io_size_bytes;
 	int i;
 
 	softc = be_lun->softc;
 
 	DPRINTF("entered\n");
 
 	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) {
 		SDT_PROBE(cbb, kernel, read, start, 0, 0, 0, 0, 0);
 	} else {
 		SDT_PROBE(cbb, kernel, write, start, 0, 0, 0, 0, 0);
 	}
 
 	memcpy(&lbalen, io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN].bytes,
 	       sizeof(lbalen));
 
 	io_size_bytes = lbalen.len * be_lun->blocksize;
 
 	/*
 	 * XXX KDM this is temporary, until we implement chaining of beio
 	 * structures and multiple datamove calls to move all the data in
 	 * or out.
 	 */
 	if (io_size_bytes > CTLBLK_MAX_IO_SIZE) {
 		printf("%s: IO length %ju > max io size %u\n", __func__,
 		       io_size_bytes, CTLBLK_MAX_IO_SIZE);
 		ctl_set_invalid_field(&io->scsiio,
 				      /*sks_valid*/ 0,
 				      /*command*/ 1,
 				      /*field*/ 0,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done(io);
 		return;
 	}
 
 	beio = ctl_alloc_beio(softc);
 	if (beio == NULL) {
 		/*
 		 * This should not happen.  ctl_alloc_beio() will call
 		 * ctl_grow_beio() with a blocking malloc as needed.
 		 * A malloc with M_WAITOK should not fail.
 		 */
 		ctl_set_busy(&io->scsiio);
 		ctl_done(io);
 		return;
 	}
 
 	beio->io = io;
 	beio->softc = softc;
 	beio->lun = be_lun;
 	io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr = beio;
 
 	/*
 	 * If the I/O came down with an ordered or head of queue tag, set
 	 * the BIO_ORDERED attribute.  For head of queue tags, that's
 	 * pretty much the best we can do.
 	 *
 	 * XXX KDM we don't have a great way to easily know about the FUA
 	 * bit right now (it is decoded in ctl_read_write(), but we don't
 	 * pass that knowledge to the backend), and in any case we would
 	 * need to determine how to handle it.  
 	 */
 	if ((io->scsiio.tag_type == CTL_TAG_ORDERED)
 	 || (io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE))
 		beio->bio_flags = BIO_ORDERED;
 
 	switch (io->scsiio.tag_type) {
 	case CTL_TAG_ORDERED:
 		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
 		break;
 	case CTL_TAG_HEAD_OF_QUEUE:
 		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
 		break;
 	case CTL_TAG_UNTAGGED:
 	case CTL_TAG_SIMPLE:
 	case CTL_TAG_ACA:
 	default:
 		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
 		break;
 	}
 
 	/*
 	 * This path handles read and write only.  The config write path
 	 * handles flush operations.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) {
 		beio->bio_cmd = BIO_READ;
 		beio->ds_trans_type = DEVSTAT_READ;
 	} else {
 		beio->bio_cmd = BIO_WRITE;
 		beio->ds_trans_type = DEVSTAT_WRITE;
 	}
 
 	beio->io_len = lbalen.len * be_lun->blocksize;
 	beio->io_offset = lbalen.lba * be_lun->blocksize;
 
 	DPRINTF("%s at LBA %jx len %u\n",
 	       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
 	       (uintmax_t)lbalen.lba, lbalen.len);
 
 	for (i = 0, len_left = io_size_bytes; i < CTLBLK_MAX_SEGS &&
 	     len_left > 0; i++) {
 
 		/*
 		 * Setup the S/G entry for this chunk.
 		 */
 		beio->sg_segs[i].len = min(MAXPHYS, len_left);
 		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);
 
 		DPRINTF("segment %d addr %p len %zd\n", i,
 			beio->sg_segs[i].addr, beio->sg_segs[i].len);
 
 		beio->num_segs++;
 		len_left -= beio->sg_segs[i].len;
 	}
 
 	/*
 	 * For the read case, we need to read the data into our buffers and
 	 * then we can send it back to the user.  For the write case, we
 	 * need to get the data from the user first.
 	 */
 	if (beio->bio_cmd == BIO_READ) {
 		SDT_PROBE(cbb, kernel, read, alloc_done, 0, 0, 0, 0, 0);
 		be_lun->dispatch(be_lun, beio);
 	} else {
 		SDT_PROBE(cbb, kernel, write, alloc_done, 0, 0, 0, 0, 0);
 		io->scsiio.be_move_done = ctl_be_block_move_done;
 		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
 		io->scsiio.kern_data_len = beio->io_len;
 		io->scsiio.kern_total_len = beio->io_len;
 		io->scsiio.kern_rel_offset = 0;
 		io->scsiio.kern_data_resid = 0;
 		io->scsiio.kern_sg_entries = beio->num_segs;
 		io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
 #ifdef CTL_TIME_IO
         	getbintime(&io->io_hdr.dma_start_bt);
 #endif  
 		ctl_datamove(io);
 	}
 }
 
 /*
  * Taskqueue handler for a LUN.  Drains the LUN's three work queues until
  * all of them are empty, always preferring (in this order) the datamove
  * queue, the config write queue and the input queue.
  *
  * The LUN lock protects the queues only; it is released around every
  * dispatch call and re-taken before the queues are examined again.
  */
 static void
 ctl_be_block_worker(void *context, int pending)
 {
 	struct ctl_be_block_lun *lun;
 	union ctl_io *next;
 
 	lun = (struct ctl_be_block_lun *)context;
 
 	DPRINTF("entered\n");
 
 	mtx_lock(&lun->lock);
 	for (;;) {
 		next = (union ctl_io *)STAILQ_FIRST(&lun->datamove_queue);
 		if (next != NULL) {
 			struct ctl_be_block_io *moved;
 
 			DPRINTF("datamove queue\n");
 
 			STAILQ_REMOVE(&lun->datamove_queue, &next->io_hdr,
 				      ctl_io_hdr, links);
 			mtx_unlock(&lun->lock);
 
 			/* The backend descriptor was stashed on the I/O. */
 			moved = (struct ctl_be_block_io *)
 			    next->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr;
 			lun->dispatch(lun, moved);
 		} else if ((next = (union ctl_io *)
 		    STAILQ_FIRST(&lun->config_write_queue)) != NULL) {
 			DPRINTF("config write queue\n");
 
 			STAILQ_REMOVE(&lun->config_write_queue, &next->io_hdr,
 				      ctl_io_hdr, links);
 			mtx_unlock(&lun->lock);
 
 			ctl_be_block_cw_dispatch(lun, next);
 		} else if ((next = (union ctl_io *)
 		    STAILQ_FIRST(&lun->input_queue)) != NULL) {
 			DPRINTF("input queue\n");
 
 			STAILQ_REMOVE(&lun->input_queue, &next->io_hdr,
 				      ctl_io_hdr, links);
 			/*
 			 * The lock has to be dropped here because the
 			 * dispatch routine and its children may sleep.
 			 */
 			mtx_unlock(&lun->lock);
 
 			ctl_be_block_dispatch(lun, next);
 		} else {
 			/*
 			 * Nothing left on any queue: leave the loop (lock
 			 * still held) and let the task queue go to sleep.
 			 */
 			break;
 		}
 
 		/* Re-take the lock before looking at the queues again. */
 		mtx_lock(&lun->lock);
 	}
 	mtx_unlock(&lun->lock);
 }
 
 /*
  * Entry point from CTL into this backend for I/O.  Nothing is executed
  * in the caller's context: the I/O is appended to the LUN's input queue
  * and the LUN's task is enqueued so a worker thread picks it up.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_be_block_submit(union ctl_io *io)
 {
 	struct ctl_be_lun *cbe_lun;
 	struct ctl_be_block_lun *lun;
 
 	DPRINTF("entered\n");
 
 	/* Walk from the I/O to the generic LUN and on to our own LUN. */
 	cbe_lun = (struct ctl_be_lun *)
 	    io->io_hdr.ctl_private[CTL_PRIV_BACKEND_LUN].ptr;
 	lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;
 
 	/*
 	 * Only SCSI I/O is expected here.
 	 */
 	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
 		"%#x) encountered", io->io_hdr.io_type));
 
 	/*
 	 * XXX KDM make sure that links is okay to use at this point.
 	 * Otherwise, we either need to add another field to ctl_io_hdr,
 	 * or deal with resource allocation here.
 	 */
 	mtx_lock(&lun->lock);
 	STAILQ_INSERT_TAIL(&lun->input_queue, &io->io_hdr, links);
 	mtx_unlock(&lun->lock);
 
 	taskqueue_enqueue(lun->io_taskqueue, &lun->io_task);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * ioctl handler for the block backend.  Only CTL_LUN_REQ is understood;
  * it is routed to the create, remove or modify handler according to the
  * request type.  An unknown request type is reported through the
  * request's status and error string while the ioctl itself returns 0.
  * Any other ioctl command yields ENOTTY.
  */
 static int
 ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
 			int flag, struct thread *td)
 {
 	struct ctl_be_block_softc *sc = &backend_block_softc;
 	struct ctl_lun_req *lun_req;
 
 	if (cmd != CTL_LUN_REQ)
 		return (ENOTTY);
 
 	lun_req = (struct ctl_lun_req *)addr;
 
 	switch (lun_req->reqtype) {
 	case CTL_LUNREQ_CREATE:
 		return (ctl_be_block_create(sc, lun_req));
 	case CTL_LUNREQ_RM:
 		return (ctl_be_block_rm(sc, lun_req));
 	case CTL_LUNREQ_MODIFY:
 		return (ctl_be_block_modify(sc, lun_req));
 	default:
 		break;
 	}
 
 	/* Unknown request type: flag it in the request, not in errno. */
 	lun_req->status = CTL_LUN_ERROR;
 	snprintf(lun_req->error_str, sizeof(lun_req->error_str),
 		 "%s: invalid LUN request type %d", __func__,
 		 lun_req->reqtype);
 
 	return (0);
 }
 
 static int
 ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
 {
 	struct ctl_be_block_filedata *file_data;
 	struct ctl_lun_create_params *params;
 	struct vattr		      vattr;
 	int			      error;
 
 	error = 0;
 	file_data = &be_lun->backend.file;
 	params = &req->reqdata.create;
 
 	be_lun->dev_type = CTL_BE_BLOCK_FILE;
 	be_lun->dispatch = ctl_be_block_dispatch_file;
 	be_lun->lun_flush = ctl_be_block_flush_file;
 
 	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
 	if (error != 0) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "error calling VOP_GETATTR() for file %s",
 			 be_lun->dev_path);
 		return (error);
 	}
 
 	/*
 	 * Verify that we have the ability to upgrade to exclusive
 	 * access on this file so we can trap errors at open instead
 	 * of reporting them during first access.
 	 */
 	if (VOP_ISLOCKED(be_lun->vn) != LK_EXCLUSIVE) {
 		vn_lock(be_lun->vn, LK_UPGRADE | LK_RETRY);
 		if (be_lun->vn->v_iflag & VI_DOOMED) {
 			error = EBADF;
 			snprintf(req->error_str, sizeof(req->error_str),
 				 "error locking file %s", be_lun->dev_path);
 			return (error);
 		}
 	}
 
 
 	file_data->cred = crhold(curthread->td_ucred);
 	if (params->lun_size_bytes != 0)
 		be_lun->size_bytes = params->lun_size_bytes;
 	else
 		be_lun->size_bytes = vattr.va_size;
 	/*
 	 * We set the multi thread flag for file operations because all
 	 * filesystems (in theory) are capable of allowing multiple readers
 	 * of a file at once.  So we want to get the maximum possible
 	 * concurrency.
 	 */
 	be_lun->flags |= CTL_BE_BLOCK_LUN_MULTI_THREAD;
 
 	/*
 	 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
 	 * With ZFS, it is 131072 bytes.  Block sizes that large don't work
 	 * with disklabel and UFS on FreeBSD at least.  Large block sizes
 	 * may not work with other OSes as well.  So just export a sector
 	 * size of 512 bytes, which should work with any OS or
 	 * application.  Since our backing is a file, any block size will
 	 * work fine for the backing store.
 	 */
 #if 0
 	be_lun->blocksize= vattr.va_blocksize;
 #endif
 	if (params->blocksize_bytes != 0)
 		be_lun->blocksize = params->blocksize_bytes;
 	else
 		be_lun->blocksize = 512;
 
 	/*
 	 * Sanity check.  The media size has to be at least one
 	 * sector long.
 	 */
 	if (be_lun->size_bytes < be_lun->blocksize) {
 		error = EINVAL;
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "file %s size %ju < block size %u", be_lun->dev_path,
 			 (uintmax_t)be_lun->size_bytes, be_lun->blocksize);
 	}
 	return (error);
 }
 
 static int
 ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
 {
 	struct ctl_lun_create_params *params;
 	struct vattr		      vattr;
 	struct cdev		     *dev;
 	struct cdevsw		     *devsw;
 	int			      error;
 
 	params = &req->reqdata.create;
 
 	be_lun->dev_type = CTL_BE_BLOCK_DEV;
 	be_lun->dispatch = ctl_be_block_dispatch_dev;
 	be_lun->lun_flush = ctl_be_block_flush_dev;
 	be_lun->backend.dev.cdev = be_lun->vn->v_rdev;
 	be_lun->backend.dev.csw = dev_refthread(be_lun->backend.dev.cdev,
 					     &be_lun->backend.dev.dev_ref);
 	if (be_lun->backend.dev.csw == NULL)
 		panic("Unable to retrieve device switch");
 
 	error = VOP_GETATTR(be_lun->vn, &vattr, NOCRED);
 	if (error) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: error getting vnode attributes for device %s",
 			 __func__, be_lun->dev_path);
 		return (error);
 	}
 
 	dev = be_lun->vn->v_rdev;
 	devsw = dev->si_devsw;
 	if (!devsw->d_ioctl) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: no d_ioctl for device %s!", __func__,
 			 be_lun->dev_path);
 		return (ENODEV);
 	}
 
 	error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
 			       (caddr_t)&be_lun->blocksize, FREAD,
 			       curthread);
 	if (error) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: error %d returned for DIOCGSECTORSIZE ioctl "
 			 "on %s!", __func__, error, be_lun->dev_path);
 		return (error);
 	}
 
 	/*
 	 * If the user has asked for a blocksize that is greater than the
 	 * backing device's blocksize, we can do it only if the blocksize
 	 * the user is asking for is an even multiple of the underlying 
 	 * device's blocksize.
 	 */
 	if ((params->blocksize_bytes != 0)
 	 && (params->blocksize_bytes > be_lun->blocksize)) {
 		uint32_t bs_multiple, tmp_blocksize;
 
 		bs_multiple = params->blocksize_bytes / be_lun->blocksize;
 
 		tmp_blocksize = bs_multiple * be_lun->blocksize;
 
 		if (tmp_blocksize == params->blocksize_bytes) {
 			be_lun->blocksize = params->blocksize_bytes;
 		} else {
 			snprintf(req->error_str, sizeof(req->error_str),
 				 "%s: requested blocksize %u is not an even "
 				 "multiple of backing device blocksize %u",
 				 __func__, params->blocksize_bytes,
 				 be_lun->blocksize);
 			return (EINVAL);
 			
 		}
 	} else if ((params->blocksize_bytes != 0)
 		&& (params->blocksize_bytes != be_lun->blocksize)) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: requested blocksize %u < backing device "
 			 "blocksize %u", __func__, params->blocksize_bytes,
 			 be_lun->blocksize);
 		return (EINVAL);
 	}
 
 	error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
 			       (caddr_t)&be_lun->size_bytes, FREAD,
 			       curthread);
 	if (error) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: error %d returned for DIOCGMEDIASIZE "
 			 " ioctl on %s!", __func__, error,
 			 be_lun->dev_path);
 		return (error);
 	}
 
 	if (params->lun_size_bytes != 0) {
 		if (params->lun_size_bytes > be_lun->size_bytes) {
 			snprintf(req->error_str, sizeof(req->error_str),
 				 "%s: requested LUN size %ju > backing device "
 				 "size %ju", __func__,
 				 (uintmax_t)params->lun_size_bytes,
 				 (uintmax_t)be_lun->size_bytes);
 			return (EINVAL);
 		}
 
 		be_lun->size_bytes = params->lun_size_bytes;
 	}
 
 	return (0);
 }
 
 /*
  * Release the backing store of a LUN, if one is open.
  *
  * For a device backend the thread reference on the device switch is
  * dropped before the vnode is closed; for a file backend the held
  * credentials are released after the vnode is closed.  A backend type
  * of CTL_BE_BLOCK_NONE is accepted as well: ctl_be_block_open() calls
  * this routine with the vnode already set but before either open helper
  * has chosen a backend type when the path names something that is
  * neither a disk nor a regular file.  In that case only the vnode needs
  * closing; panicking there would let a bad path in a create request
  * take down the system.
  *
  * Calling this with no vnode open is a no-op.  Always returns 0.
  */
 static int
 ctl_be_block_close(struct ctl_be_block_lun *be_lun)
 {
 	DROP_GIANT();
 	if (be_lun->vn) {
 		int flags = FREAD | FWRITE;
 
 		/* Backend-specific teardown that must precede vn_close(). */
 		switch (be_lun->dev_type) {
 		case CTL_BE_BLOCK_DEV:
 			if (be_lun->backend.dev.csw) {
 				dev_relthread(be_lun->backend.dev.cdev,
 					      be_lun->backend.dev.dev_ref);
 				be_lun->backend.dev.csw  = NULL;
 				be_lun->backend.dev.cdev = NULL;
 			}
 			break;
 		case CTL_BE_BLOCK_FILE:
 		case CTL_BE_BLOCK_NONE:
 			/* Nothing to release before the vnode is closed. */
 			break;
 		default:
 			panic("Unexpected backend type.");
 			break;
 		}
 
 		(void)vn_close(be_lun->vn, flags, NOCRED, curthread);
 		be_lun->vn = NULL;
 
 		/* Backend-specific teardown that follows vn_close(). */
 		switch (be_lun->dev_type) {
 		case CTL_BE_BLOCK_DEV:
 		case CTL_BE_BLOCK_NONE:
 			break;
 		case CTL_BE_BLOCK_FILE:
 			if (be_lun->backend.file.cred != NULL) {
 				crfree(be_lun->backend.file.cred);
 				be_lun->backend.file.cred = NULL;
 			}
 			break;
 		default:
 			panic("Unexpected backend type.");
 			break;
 		}
 	}
 	PICKUP_GIANT();
 
 	return (0);
 }
 
 static int
 ctl_be_block_open(struct ctl_be_block_softc *softc,
 		       struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
 {
 	struct nameidata nd;
 	int		 flags;
 	int		 error;
 
 	/*
 	 * XXX KDM allow a read-only option?
 	 */
 	flags = FREAD | FWRITE;
 	error = 0;
 
 	if (rootvnode == NULL) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: Root filesystem is not mounted", __func__);
 		return (1);
 	}
 
 	if (!curthread->td_proc->p_fd->fd_cdir) {
 		curthread->td_proc->p_fd->fd_cdir = rootvnode;
 		VREF(rootvnode);
 	}
 	if (!curthread->td_proc->p_fd->fd_rdir) {
 		curthread->td_proc->p_fd->fd_rdir = rootvnode;
 		VREF(rootvnode);
 	}
 	if (!curthread->td_proc->p_fd->fd_jdir) {
 		curthread->td_proc->p_fd->fd_jdir = rootvnode;
 		VREF(rootvnode);
 	}
 
  again:
 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
 	error = vn_open(&nd, &flags, 0, NULL);
 	if (error) {
 		/*
 		 * This is the only reasonable guess we can make as far as
 		 * path if the user doesn't give us a fully qualified path.
 		 * If they want to specify a file, they need to specify the
 		 * full path.
 		 */
 		if (be_lun->dev_path[0] != '/') {
 			char *dev_path = "/dev/";
 			char *dev_name;
 
 			/* Try adding device path at beginning of name */
 			dev_name = malloc(strlen(be_lun->dev_path)
 					+ strlen(dev_path) + 1,
 					  M_CTLBLK, M_WAITOK);
 			if (dev_name) {
 				sprintf(dev_name, "%s%s", dev_path,
 					be_lun->dev_path);
 				free(be_lun->dev_path, M_CTLBLK);
 				be_lun->dev_path = dev_name;
 				goto again;
 			}
 		}
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: error opening %s", __func__, be_lun->dev_path);
 		return (error);
 	}
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 		
 	be_lun->vn = nd.ni_vp;
 
 	/* We only support disks and files. */
 	if (vn_isdisk(be_lun->vn, &error)) {
 		error = ctl_be_block_open_dev(be_lun, req);
 	} else if (be_lun->vn->v_type == VREG) {
 		error = ctl_be_block_open_file(be_lun, req);
 	} else {
 		error = EINVAL;
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s is not a disk or file", be_lun->dev_path);
 	}
 	VOP_UNLOCK(be_lun->vn, 0);
 
 	if (error != 0) {
 		ctl_be_block_close(be_lun);
 		return (error);
 	}
 
 	be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
 	be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;
 
 	return (0);
 }
 
 static int
 ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
 {
 	struct ctl_be_block_lun *be_lun;
 	struct ctl_lun_create_params *params;
 	struct ctl_be_arg *file_arg;
 	char tmpstr[32];
 	int retval, num_threads;
 	int i;
 
 	params = &req->reqdata.create;
 	retval = 0;
 
 	num_threads = cbb_num_threads;
 
 	file_arg = NULL;
 
 	be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
 
 	be_lun->softc = softc;
 	STAILQ_INIT(&be_lun->input_queue);
 	STAILQ_INIT(&be_lun->config_write_queue);
 	STAILQ_INIT(&be_lun->datamove_queue);
 	STAILQ_INIT(&be_lun->ctl_be_lun.options);
 	sprintf(be_lun->lunname, "cblk%d", softc->num_luns);
 	mtx_init(&be_lun->lock, be_lun->lunname, NULL, MTX_DEF);
 
 	be_lun->lun_zone = uma_zcreate(be_lun->lunname, MAXPHYS, 
 	    NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
 
 	if (be_lun->lun_zone == NULL) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: error allocating UMA zone", __func__);
 		goto bailout_error;
 	}
 
 	if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
 		be_lun->ctl_be_lun.lun_type = params->device_type;
 	else
 		be_lun->ctl_be_lun.lun_type = T_DIRECT;
 
 	if (be_lun->ctl_be_lun.lun_type == T_DIRECT) {
 		for (i = 0; i < req->num_be_args; i++) {
 			if (strcmp(req->kern_be_args[i].kname, "file") == 0) {
 				file_arg = &req->kern_be_args[i];
 				break;
 			}
 		}
 
 		if (file_arg == NULL) {
 			snprintf(req->error_str, sizeof(req->error_str),
 				 "%s: no file argument specified", __func__);
 			goto bailout_error;
 		}
 
 		be_lun->dev_path = malloc(file_arg->vallen, M_CTLBLK,
 					  M_WAITOK | M_ZERO);
 
 		strlcpy(be_lun->dev_path, (char *)file_arg->kvalue,
 			file_arg->vallen);
 
 		retval = ctl_be_block_open(softc, be_lun, req);
 		if (retval != 0) {
 			retval = 0;
 			goto bailout_error;
 		}
 
 		/*
 		 * Tell the user the size of the file/device.
 		 */
 		params->lun_size_bytes = be_lun->size_bytes;
 
 		/*
 		 * The maximum LBA is the size - 1.
 		 */
 		be_lun->ctl_be_lun.maxlba = be_lun->size_blocks - 1;
 	} else {
 		/*
 		 * For processor devices, we don't have any size.
 		 */
 		be_lun->blocksize = 0;
 		be_lun->size_blocks = 0;
 		be_lun->size_bytes = 0;
 		be_lun->ctl_be_lun.maxlba = 0;
 		params->lun_size_bytes = 0;
 
 		/*
 		 * Default to just 1 thread for processor devices.
 		 */
 		num_threads = 1;
 	}
 
 	/*
 	 * XXX This searching loop might be refactored to be combined with
 	 * the loop above,
 	 */
 	for (i = 0; i < req->num_be_args; i++) {
 		if (strcmp(req->kern_be_args[i].kname, "num_threads") == 0) {
 			struct ctl_be_arg *thread_arg;
 			char num_thread_str[16];
 			int tmp_num_threads;
 
 
 			thread_arg = &req->kern_be_args[i];
 
 			strlcpy(num_thread_str, (char *)thread_arg->kvalue,
 				min(thread_arg->vallen,
 				sizeof(num_thread_str)));
 
 			tmp_num_threads = strtol(num_thread_str, NULL, 0);
 
 			/*
 			 * We don't let the user specify less than one
 			 * thread, but hope he's clueful enough not to
 			 * specify 1000 threads.
 			 */
 			if (tmp_num_threads < 1) {
 				snprintf(req->error_str, sizeof(req->error_str),
 					 "%s: invalid number of threads %s",
 				         __func__, num_thread_str);
 				goto bailout_error;
 			}
 
 			num_threads = tmp_num_threads;
 		} else if (strcmp(req->kern_be_args[i].kname, "file") != 0 &&
 		    strcmp(req->kern_be_args[i].kname, "dev") != 0) {
 			struct ctl_be_lun_option *opt;
 
 			opt = malloc(sizeof(*opt), M_CTLBLK, M_WAITOK);
 			opt->name = malloc(strlen(req->kern_be_args[i].kname) + 1, M_CTLBLK, M_WAITOK);
 			strcpy(opt->name, req->kern_be_args[i].kname);
 			opt->value = malloc(strlen(req->kern_be_args[i].kvalue) + 1, M_CTLBLK, M_WAITOK);
 			strcpy(opt->value, req->kern_be_args[i].kvalue);
 			STAILQ_INSERT_TAIL(&be_lun->ctl_be_lun.options, opt, links);
 		}
 	}
 
 	be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED;
 	be_lun->ctl_be_lun.flags = CTL_LUN_FLAG_PRIMARY;
 	be_lun->ctl_be_lun.be_lun = be_lun;
 	be_lun->ctl_be_lun.blocksize = be_lun->blocksize;
 	/* Tell the user the blocksize we ended up using */
 	params->blocksize_bytes = be_lun->blocksize;
 	if (params->flags & CTL_LUN_FLAG_ID_REQ) {
 		be_lun->ctl_be_lun.req_lun_id = params->req_lun_id;
 		be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_ID_REQ;
 	} else
 		be_lun->ctl_be_lun.req_lun_id = 0;
 
 	be_lun->ctl_be_lun.lun_shutdown = ctl_be_block_lun_shutdown;
 	be_lun->ctl_be_lun.lun_config_status =
 		ctl_be_block_lun_config_status;
 	be_lun->ctl_be_lun.be = &ctl_be_block_driver;
 
 	if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
 		snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%4d",
 			 softc->num_luns);
 		strncpy((char *)be_lun->ctl_be_lun.serial_num, tmpstr,
 			ctl_min(sizeof(be_lun->ctl_be_lun.serial_num),
 			sizeof(tmpstr)));
 
 		/* Tell the user what we used for a serial number */
 		strncpy((char *)params->serial_num, tmpstr,
 			ctl_min(sizeof(params->serial_num), sizeof(tmpstr)));
 	} else { 
 		strncpy((char *)be_lun->ctl_be_lun.serial_num,
 			params->serial_num,
 			ctl_min(sizeof(be_lun->ctl_be_lun.serial_num),
 			sizeof(params->serial_num)));
 	}
 	if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
 		snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%4d", softc->num_luns);
 		strncpy((char *)be_lun->ctl_be_lun.device_id, tmpstr,
 			ctl_min(sizeof(be_lun->ctl_be_lun.device_id),
 			sizeof(tmpstr)));
 
 		/* Tell the user what we used for a device ID */
 		strncpy((char *)params->device_id, tmpstr,
 			ctl_min(sizeof(params->device_id), sizeof(tmpstr)));
 	} else {
 		strncpy((char *)be_lun->ctl_be_lun.device_id,
 			params->device_id,
 			ctl_min(sizeof(be_lun->ctl_be_lun.device_id),
 				sizeof(params->device_id)));
 	}
 
 	TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);
 
 	be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK,
 	    taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
 
 	if (be_lun->io_taskqueue == NULL) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: Unable to create taskqueue", __func__);
 		goto bailout_error;
 	}
 
 	/*
 	 * Note that we start the same number of threads by default for
 	 * both the file case and the block device case.  For the file
 	 * case, we need multiple threads to allow concurrency, because the
 	 * vnode interface is designed to be a blocking interface.  For the
 	 * block device case, ZFS zvols at least will block the caller's
 	 * context in many instances, and so we need multiple threads to
 	 * overcome that problem.  Other block devices don't need as many
 	 * threads, but they shouldn't cause too many problems.
 	 *
 	 * If the user wants to just have a single thread for a block
 	 * device, he can specify that when the LUN is created, or change
 	 * the tunable/sysctl to alter the default number of threads.
 	 */
 	retval = taskqueue_start_threads(&be_lun->io_taskqueue,
 					 /*num threads*/num_threads,
 					 /*priority*/PWAIT,
 					 /*thread name*/
 					 "%s taskq", be_lun->lunname);
 
 	if (retval != 0)
 		goto bailout_error;
 
 	be_lun->num_threads = num_threads;
 
 	mtx_lock(&softc->lock);
 	softc->num_luns++;
 	STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links);
 
 	mtx_unlock(&softc->lock);
 
 	retval = ctl_add_lun(&be_lun->ctl_be_lun);
 	if (retval != 0) {
 		mtx_lock(&softc->lock);
 		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
 			      links);
 		softc->num_luns--;
 		mtx_unlock(&softc->lock);
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: ctl_add_lun() returned error %d, see dmesg for "
 			"details", __func__, retval);
 		retval = 0;
 		goto bailout_error;
 	}
 
 	mtx_lock(&softc->lock);
 
 	/*
 	 * Tell the config_status routine that we're waiting so it won't
 	 * clean up the LUN in the event of an error.
 	 */
 	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
 
 	while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
 		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
 		if (retval == EINTR)
 			break;
 	}
 	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
 
 	if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: LUN configuration error, see dmesg for details",
 			 __func__);
 		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
 			      links);
 		softc->num_luns--;
 		mtx_unlock(&softc->lock);
 		goto bailout_error;
 	} else {
 		params->req_lun_id = be_lun->ctl_be_lun.lun_id;
 	}
 
 	mtx_unlock(&softc->lock);
 
 	be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id,
 					       be_lun->blocksize,
 					       DEVSTAT_ALL_SUPPORTED,
 					       be_lun->ctl_be_lun.lun_type
 					       | DEVSTAT_TYPE_IF_OTHER,
 					       DEVSTAT_PRIORITY_OTHER);
 
 
 	req->status = CTL_LUN_OK;
 
 	return (retval);
 
 bailout_error:
 	req->status = CTL_LUN_ERROR;
 
 	ctl_be_block_close(be_lun);
 
 	free(be_lun->dev_path, M_CTLBLK);
 	free(be_lun, M_CTLBLK);
 
 	return (retval);
 }
 
 static int
 ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
 {
 	struct ctl_lun_rm_params *params;
 	struct ctl_be_block_lun *be_lun;
 	int retval;
 
 	params = &req->reqdata.rm;
 
 	mtx_lock(&softc->lock);
 
 	be_lun = NULL;
 
 	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
 		if (be_lun->ctl_be_lun.lun_id == params->lun_id)
 			break;
 	}
 	mtx_unlock(&softc->lock);
 
 	if (be_lun == NULL) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: LUN %u is not managed by the block backend",
 			 __func__, params->lun_id);
 		goto bailout_error;
 	}
 
 	retval = ctl_disable_lun(&be_lun->ctl_be_lun);
 
 	if (retval != 0) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: error %d returned from ctl_disable_lun() for "
 			 "LUN %d", __func__, retval, params->lun_id);
 		goto bailout_error;
 
 	}
 
 	retval = ctl_invalidate_lun(&be_lun->ctl_be_lun);
 	if (retval != 0) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: error %d returned from ctl_invalidate_lun() for "
 			 "LUN %d", __func__, retval, params->lun_id);
 		goto bailout_error;
 	}
 
 	mtx_lock(&softc->lock);
 
 	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
 
 	while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
                 retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
                 if (retval == EINTR)
                         break;
         }
 
 	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
 
 	if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: interrupted waiting for LUN to be freed", 
 			 __func__);
 		mtx_unlock(&softc->lock);
 		goto bailout_error;
 	}
 
 	STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links);
 
 	softc->num_luns--;
 	mtx_unlock(&softc->lock);
 
 	taskqueue_drain(be_lun->io_taskqueue, &be_lun->io_task);
 
 	taskqueue_free(be_lun->io_taskqueue);
 
 	ctl_be_block_close(be_lun);
 
 	if (be_lun->disk_stats != NULL)
 		devstat_remove_entry(be_lun->disk_stats);
 
 	uma_zdestroy(be_lun->lun_zone);
 
 	free(be_lun->dev_path, M_CTLBLK);
 
 	free(be_lun, M_CTLBLK);
 
 	req->status = CTL_LUN_OK;
 
 	return (0);
 
 bailout_error:
 
 	req->status = CTL_LUN_ERROR;
 
 	return (0);
 }
 
 static int
 ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
 			 struct ctl_lun_req *req)
 {
 	struct vattr vattr;
 	int error;
 	struct ctl_lun_modify_params *params;
 
 	params = &req->reqdata.modify;
 
 	if (params->lun_size_bytes != 0) {
 		be_lun->size_bytes = params->lun_size_bytes;
 	} else  {
 		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
 		if (error != 0) {
 			snprintf(req->error_str, sizeof(req->error_str),
 				 "error calling VOP_GETATTR() for file %s",
 				 be_lun->dev_path);
 			return (error);
 		}
 
 		be_lun->size_bytes = vattr.va_size;
 	}
 
 	return (0);
 }
 
 static int
 ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
 			struct ctl_lun_req *req)
 {
 	struct cdev *dev;
 	struct cdevsw *devsw;
 	int error;
 	struct ctl_lun_modify_params *params;
 	uint64_t size_bytes;
 
 	params = &req->reqdata.modify;
 
 	dev = be_lun->vn->v_rdev;
 	devsw = dev->si_devsw;
 	if (!devsw->d_ioctl) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: no d_ioctl for device %s!", __func__,
 			 be_lun->dev_path);
 		return (ENODEV);
 	}
 
 	error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
 			       (caddr_t)&size_bytes, FREAD,
 			       curthread);
 	if (error) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: error %d returned for DIOCGMEDIASIZE ioctl "
 			 "on %s!", __func__, error, be_lun->dev_path);
 		return (error);
 	}
 
 	if (params->lun_size_bytes != 0) {
 		if (params->lun_size_bytes > size_bytes) {
 			snprintf(req->error_str, sizeof(req->error_str),
 				 "%s: requested LUN size %ju > backing device "
 				 "size %ju", __func__,
 				 (uintmax_t)params->lun_size_bytes,
 				 (uintmax_t)size_bytes);
 			return (EINVAL);
 		}
 
 		be_lun->size_bytes = params->lun_size_bytes;
 	} else {
 		be_lun->size_bytes = size_bytes;
 	}
 
 	return (0);
 }
 
 /*
  * Handle a "modify LUN" request for the block backend.
  *
  * Looks up the LUN whose id matches req->reqdata.modify.lun_id, lets the
  * file- or device-specific helper recompute be_lun->size_bytes, then updates
  * the derived block count and maximum LBA and tells CTL that the capacity
  * changed.  On success the size actually used is written back into
  * params->lun_size_bytes.
  *
  * Always returns 0; the outcome is reported through req->status
  * (CTL_LUN_OK or CTL_LUN_ERROR).
  */
 static int
 ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
 {
 	struct ctl_lun_modify_params *params;
 	struct ctl_be_block_lun *be_lun;
 	int error;
 
 	params = &req->reqdata.modify;
 
 	/*
 	 * Find the LUN by id.  STAILQ_FOREACH() leaves be_lun NULL when the
 	 * list is exhausted without a match, so no separate initialization
 	 * is needed.
 	 */
 	mtx_lock(&softc->lock);
 	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
 		if (be_lun->ctl_be_lun.lun_id == params->lun_id)
 			break;
 	}
 	mtx_unlock(&softc->lock);
 
 	if (be_lun == NULL) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s: LUN %u is not managed by the block backend",
 			 __func__, params->lun_id);
 		goto bailout_error;
 	}
 
 	/* A non-zero requested size must hold at least one block. */
 	if (params->lun_size_bytes != 0) {
 		if (params->lun_size_bytes < be_lun->blocksize) {
 			/*
 			 * %ju takes a uintmax_t; cast explicitly so the
 			 * argument matches the conversion on every
 			 * architecture.
 			 */
 			snprintf(req->error_str, sizeof(req->error_str),
 				"%s: LUN size %ju < blocksize %u", __func__,
 				(uintmax_t)params->lun_size_bytes,
 				be_lun->blocksize);
 			goto bailout_error;
 		}
 	}
 
 	/* The size helpers run with the backing vnode share-locked. */
 	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
 
 	if (be_lun->vn->v_type == VREG)
 		error = ctl_be_block_modify_file(be_lun, req);
 	else
 		error = ctl_be_block_modify_dev(be_lun, req);
 
 	VOP_UNLOCK(be_lun->vn, 0);
 
 	if (error != 0)
 		goto bailout_error;
 
 	be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;
 
 	/*
 	 * The maximum LBA is the size - 1.
 	 *
 	 * XXX: Note that this field is being updated without locking,
 	 * 	which might cause problems on 32-bit architectures.
 	 */
 	be_lun->ctl_be_lun.maxlba = be_lun->size_blocks - 1;
 	ctl_lun_capacity_changed(&be_lun->ctl_be_lun);
 
 	/* Tell the user the exact size we ended up using */
 	params->lun_size_bytes = be_lun->size_bytes;
 
 	req->status = CTL_LUN_OK;
 
 	return (0);
 
 bailout_error:
 	req->status = CTL_LUN_ERROR;
 
 	return (0);
 }
 
 /*
  * LUN shutdown callback: mark the LUN unconfigured and, if a thread has
  * flagged itself as waiting on it, wake that thread up.  The flag update is
  * done under the softc lock.
  */
 static void
 ctl_be_block_lun_shutdown(void *be_lun)
 {
 	struct ctl_be_block_lun *blk_lun = (struct ctl_be_block_lun *)be_lun;
 	struct ctl_be_block_softc *sc = blk_lun->softc;
 
 	mtx_lock(&sc->lock);
 	blk_lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
 	if ((blk_lun->flags & CTL_BE_BLOCK_LUN_WAITING) != 0)
 		wakeup(blk_lun);
 	mtx_unlock(&sc->lock);
 }
 
 /*
  * LUN configuration status callback.
  *
  * On any status other than CTL_LUN_CONFIG_OK the LUN is flagged with
  * CTL_BE_BLOCK_LUN_CONFIG_ERR and waiters are woken unconditionally.  On
  * success the "unconfigured" flag is cleared, a waiter (if any) is woken,
  * and the LUN is enabled; if enabling fails the LUN is invalidated.
  */
 static void
 ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status)
 {
 	struct ctl_be_block_lun *blk_lun = (struct ctl_be_block_lun *)be_lun;
 	struct ctl_be_block_softc *sc = blk_lun->softc;
 
 	/* Failure: record the error and wake whoever is waiting. */
 	if (status != CTL_LUN_CONFIG_OK) {
 		mtx_lock(&sc->lock);
 		blk_lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
 		blk_lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR;
 		wakeup(blk_lun);
 		mtx_unlock(&sc->lock);
 		return;
 	}
 
 	mtx_lock(&sc->lock);
 	blk_lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
 	if ((blk_lun->flags & CTL_BE_BLOCK_LUN_WAITING) != 0)
 		wakeup(blk_lun);
 	mtx_unlock(&sc->lock);
 
 	/*
 	 * The LUN was added successfully, so try to enable it.
 	 */
 	if (ctl_enable_lun(&blk_lun->ctl_be_lun) == 0)
 		return;
 
 	printf("%s: ctl_enable_lun() failed!\n", __func__);
 	if (ctl_invalidate_lun(&blk_lun->ctl_be_lun) != 0)
 		printf("%s: ctl_invalidate_lun() failed!\n", __func__);
 }
 
 
 /*
  * Handle a configuration-write style SCSI command for a block backend LUN.
  *
  * SYNCHRONIZE CACHE (10/16) is queued on the LUN's config_write_queue and
  * the LUN's I/O task is enqueued; the function returns without completing
  * the I/O.  START STOP UNIT is handled inline and completed via
  * ctl_config_write_done().  Any other opcode is completed with an
  * invalid-opcode status.
  */
 static int
 ctl_be_block_config_write(union ctl_io *io)
 {
 	struct ctl_be_block_lun *be_lun;
 	struct ctl_be_lun *ctl_be_lun;
 	int retval;
 
 	retval = 0;
 
 	DPRINTF("entered\n");
 
 	/* Recover the backend LUN from the private pointer stored in the I/O. */
 	ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
 		CTL_PRIV_BACKEND_LUN].ptr;
 	be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;
 
 	switch (io->scsiio.cdb[0]) {
 	case SYNCHRONIZE_CACHE:
 	case SYNCHRONIZE_CACHE_16:
 		/*
 		 * The upper level CTL code will filter out any CDBs with
 		 * the immediate bit set and return the proper error.
 		 *
 		 * We don't really need to worry about what LBA range the
 		 * user asked to be synced out.  When they issue a sync
 		 * cache command, we'll sync out the whole thing.
 		 */
 		mtx_lock(&be_lun->lock);
 		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
 				   links);
 		mtx_unlock(&be_lun->lock);
 		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
 		break;
 	case START_STOP_UNIT: {
 		struct scsi_start_stop_unit *cdb;
 
 		cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;
 
 		/* The START bit selects between starting and stopping the LUN. */
 		if (cdb->how & SSS_START)
 			retval = ctl_start_lun(ctl_be_lun);
 		else {
 			retval = ctl_stop_lun(ctl_be_lun);
 			/*
 			 * XXX KDM Copan-specific offline behavior.
 			 * Figure out a reasonable way to port this?
 			 */
 #ifdef NEEDTOPORT
 			if ((retval == 0)
 			 && (cdb->byte2 & SSS_ONOFFLINE))
 				retval = ctl_lun_offline(ctl_be_lun);
 #endif
 		}
 
 		/*
 		 * In general, the above routines should not fail.  They
 		 * just set state for the LUN.  So we've got something
 		 * pretty wrong here if we can't start or stop the LUN.
 		 */
 		if (retval != 0) {
 			ctl_set_internal_failure(&io->scsiio,
 						 /*sks_valid*/ 1,
 						 /*retry_count*/ 0xf051);
 			retval = CTL_RETVAL_COMPLETE;
 		} else {
 			ctl_set_success(&io->scsiio);
 		}
 		ctl_config_write_done(io);
 		break;
 	}
 	default:
 		/* Unsupported opcode: complete the I/O with an error status. */
 		ctl_set_invalid_opcode(&io->scsiio);
 		ctl_config_write_done(io);
 		retval = CTL_RETVAL_COMPLETE;
 		break;
 	}
 
 	return (retval);
 
 }
 
 /*
  * Configuration-read entry point for the block backend.  The io argument is
  * not examined; the function does nothing and always returns 0.
  */
 static int
 ctl_be_block_config_read(union ctl_io *io)
 {
 	return (0);
 }
 
 /*
  * Append backend-specific LUN information to sb as XML fragments:
  * a <num_threads> element and, when the LUN has a device path, a <file>
  * element holding the escaped path.
  *
  * Returns 0 on success, or the first non-zero value returned by one of the
  * sbuf printing calls.
  */
 static int
 ctl_be_block_lun_info(void *be_lun, struct sbuf *sb)
 {
 	struct ctl_be_block_lun *blk_lun = (struct ctl_be_block_lun *)be_lun;
 	int rc;
 
 	rc = sbuf_printf(sb, "<num_threads>");
 	if (rc != 0)
 		return (rc);
 
 	rc = sbuf_printf(sb, "%d", blk_lun->num_threads);
 	if (rc != 0)
 		return (rc);
 
 	rc = sbuf_printf(sb, "</num_threads>");
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * Processor devices have no path variable, so there is nothing
 	 * more to report for them.
 	 */
 	if (blk_lun->dev_path == NULL)
 		return (rc);
 
 	rc = sbuf_printf(sb, "<file>");
 	if (rc != 0)
 		return (rc);
 
 	rc = ctl_sbuf_printf_esc(sb, blk_lun->dev_path);
 	if (rc != 0)
 		return (rc);
 
 	return (sbuf_printf(sb, "</file>\n"));
 }
 
 /*
  * Initialize the global block backend softc: its mutex, its three queues,
  * and an initial pool of beio structures.  Always returns 0.
  */
 int
 ctl_be_block_init(void)
 {
 	struct ctl_be_block_softc *sc = &backend_block_softc;
 
 	mtx_init(&sc->lock, "ctlblk", NULL, MTX_DEF);
 
 	STAILQ_INIT(&sc->beio_free_queue);
 	STAILQ_INIT(&sc->disk_list);
 	STAILQ_INIT(&sc->lun_list);
 
 	ctl_grow_beio(sc, 200);
 
 	return (0);
 }
Index: head/sys/cddl/dev/dtrace/dtrace_test.c
===================================================================
--- head/sys/cddl/dev/dtrace/dtrace_test.c	(revision 258540)
+++ head/sys/cddl/dev/dtrace/dtrace_test.c	(revision 258541)
@@ -1,110 +1,108 @@
 /*-
  * Copyright 2008 John Birrell <jb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
-#include "opt_kdtrace.h"
-
 #include <sys/cdefs.h>
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 SDT_PROVIDER_DEFINE(test);
 
 SDT_PROBE_DEFINE7(test, , , sdttest, sdttest, "int", "int", "int", "int", "int",
     "int", "int");
 
 /*
  * These are variables that the DTrace test suite references in the
  * Solaris kernel. We define them here so that the tests function 
  * unaltered.
  */
 int	kmem_flags;
 
 typedef struct vnode vnode_t;
 vnode_t dummy;
 vnode_t *rootvp = &dummy;
 
 /*
  * Test SDT probes with more than 5 arguments. On amd64, such probes require
  * special handling since only the first 5 arguments will be passed to
  * dtrace_probe() in registers; the rest must be fetched off the stack.
  */
 static int
 dtrace_test_sdttest(SYSCTL_HANDLER_ARGS)
 {
 	int val, error;
 
 	val = 0;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error || req->newptr == NULL)
 		return (error);
 	else if (val == 0)
 		return (0);
 
 	SDT_PROBE7(test, , , sdttest, 1, 2, 3, 4, 5, 6, 7);
 
 	return (error);
 }
 
 static SYSCTL_NODE(_debug, OID_AUTO, dtracetest, CTLFLAG_RD, 0, "");
 
 SYSCTL_PROC(_debug_dtracetest, OID_AUTO, sdttest, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, dtrace_test_sdttest, "I", "Trigger the SDT test probe");
 
 /*
  * Module event handler.  Load, unload and shutdown need no work and
  * succeed; every other event type is rejected with EOPNOTSUPP.
  */
 static int
 dtrace_test_modevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 	case MOD_UNLOAD:
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 DEV_MODULE(dtrace_test, dtrace_test_modevent, NULL);
 MODULE_VERSION(dtrace_test, 1);
 MODULE_DEPEND(dtrace_test, dtraceall, 1, 1, 1);
Index: head/sys/cddl/dev/lockstat/lockstat.c
===================================================================
--- head/sys/cddl/dev/lockstat/lockstat.c	(revision 258540)
+++ head/sys/cddl/dev/lockstat/lockstat.c	(revision 258541)
@@ -1,328 +1,326 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  *
  * Portions Copyright (c) 2008-2009 Stacey Son <sson@FreeBSD.org> 
  *
  * $FreeBSD$
  *
  */
 
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#include "opt_kdtrace.h"
-
 #include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/linker.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 
 #include <sys/dtrace.h>
 #include <sys/lockstat.h>
 
 #if defined(__i386__) || defined(__amd64__) || \
 	defined(__mips__) || defined(__powerpc__)
 #define LOCKSTAT_AFRAMES 1
 #else
 #error "architecture not supported"
 #endif
 
 static d_open_t lockstat_open;
 static void     lockstat_provide(void *, dtrace_probedesc_t *);
 static void     lockstat_destroy(void *, dtrace_id_t, void *);
 static void     lockstat_enable(void *, dtrace_id_t, void *);
 static void     lockstat_disable(void *, dtrace_id_t, void *);
 static void     lockstat_load(void *);
 static int     	lockstat_unload(void);
 
 
 /*
  * Static description of one lockstat probe; lockstat_probes[] holds one
  * entry per probe.
  */
 typedef struct lockstat_probe {
 	char		*lsp_func;	/* function name given to DTrace */
 	char		*lsp_name;	/* probe name given to DTrace */
 	int		lsp_probe;	/* index into lockstat_probemap[] */
 	dtrace_id_t	lsp_id;		/* id from dtrace_probe_create() */
 #ifdef __FreeBSD__
 	/* Per-probe value passed where other OSes pass LOCKSTAT_AFRAMES. */
 	int		lsp_frame;
 #endif
 } lockstat_probe_t;
 
 #ifdef __FreeBSD__
 /*
  * Table of every lockstat probe, grouped by lock class.  Each entry gives
  * the function name, probe name, lockstat_probemap[] index, initial DTrace
  * id (DTRACE_IDNONE) and frame count.  The table is terminated by an entry
  * whose lsp_func is NULL, which is what lockstat_provide() stops on.
  */
 lockstat_probe_t lockstat_probes[] =
 {
   /* Spin Locks */
   { LS_MTX_SPIN_LOCK,	LSS_ACQUIRE,	LS_MTX_SPIN_LOCK_ACQUIRE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_MTX_SPIN_LOCK, 	LSS_SPIN,	LS_MTX_SPIN_LOCK_SPIN,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_MTX_SPIN_UNLOCK,	LSS_RELEASE,	LS_MTX_SPIN_UNLOCK_RELEASE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   /* Adaptive Locks */
   { LS_MTX_LOCK,	LSA_ACQUIRE,	LS_MTX_LOCK_ACQUIRE,
 	  DTRACE_IDNONE, (LOCKSTAT_AFRAMES + 1) },
   { LS_MTX_LOCK,	LSA_BLOCK,	LS_MTX_LOCK_BLOCK,
 	  DTRACE_IDNONE, (LOCKSTAT_AFRAMES + 1) },
   { LS_MTX_LOCK,	LSA_SPIN,	LS_MTX_LOCK_SPIN,
 	  DTRACE_IDNONE, (LOCKSTAT_AFRAMES + 1) },
   { LS_MTX_UNLOCK,	LSA_RELEASE,	LS_MTX_UNLOCK_RELEASE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_MTX_TRYLOCK,	LSA_ACQUIRE,	LS_MTX_TRYLOCK_ACQUIRE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   /* Reader/Writer Locks */
   { LS_RW_RLOCK,	LSR_ACQUIRE,	LS_RW_RLOCK_ACQUIRE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_RW_RLOCK,	LSR_BLOCK,	LS_RW_RLOCK_BLOCK,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_RW_RLOCK,	LSR_SPIN,	LS_RW_RLOCK_SPIN,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_RW_RUNLOCK,	LSR_RELEASE,	LS_RW_RUNLOCK_RELEASE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_RW_WLOCK,	LSR_ACQUIRE,	LS_RW_WLOCK_ACQUIRE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_RW_WLOCK,	LSR_BLOCK,	LS_RW_WLOCK_BLOCK,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_RW_WLOCK,	LSR_SPIN,	LS_RW_WLOCK_SPIN,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_RW_WUNLOCK,	LSR_RELEASE,	LS_RW_WUNLOCK_RELEASE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_RW_TRYUPGRADE,	LSR_UPGRADE,   	LS_RW_TRYUPGRADE_UPGRADE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_RW_DOWNGRADE,	LSR_DOWNGRADE, 	LS_RW_DOWNGRADE_DOWNGRADE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   /* Shared/Exclusive Locks */
   { LS_SX_SLOCK,	LSX_ACQUIRE,	LS_SX_SLOCK_ACQUIRE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_SX_SLOCK,	LSX_BLOCK,	LS_SX_SLOCK_BLOCK,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_SX_SLOCK,	LSX_SPIN,	LS_SX_SLOCK_SPIN,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_SX_SUNLOCK,	LSX_RELEASE,	LS_SX_SUNLOCK_RELEASE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_SX_XLOCK,	LSX_ACQUIRE,	LS_SX_XLOCK_ACQUIRE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_SX_XLOCK,	LSX_BLOCK,	LS_SX_XLOCK_BLOCK,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_SX_XLOCK,	LSX_SPIN,	LS_SX_XLOCK_SPIN,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_SX_XUNLOCK,	LSX_RELEASE,	LS_SX_XUNLOCK_RELEASE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_SX_TRYUPGRADE,	LSX_UPGRADE,	LS_SX_TRYUPGRADE_UPGRADE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { LS_SX_DOWNGRADE,	LSX_DOWNGRADE,	LS_SX_DOWNGRADE_DOWNGRADE,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   /* Thread Locks */
   { LS_THREAD_LOCK,	LST_SPIN,	LS_THREAD_LOCK_SPIN,
 	  DTRACE_IDNONE, LOCKSTAT_AFRAMES },
   { NULL }
 };
 #else
 #error "OS not supported"
 #endif
 
 
 /*
  * Character device switch for the lockstat device node created in
  * lockstat_load().  Only an open routine is supplied.
  */
 static struct cdevsw lockstat_cdevsw = {
 	.d_version	= D_VERSION,
 	.d_open		= lockstat_open,
 	.d_name		= "lockstat",
 };
 
 static struct cdev		*lockstat_cdev; 
 static dtrace_provider_id_t 	lockstat_id;
 
 /*
  * Provider enable callback (installed in lockstat_pops).  Records the DTrace
  * probe id in lockstat_probemap[] at the probe's index and points
  * lockstat_probe_func at dtrace_probe.  parg is the lockstat_probe_t that
  * was registered with the probe.
  */
 /*ARGSUSED*/
 static void
 lockstat_enable(void *arg, dtrace_id_t id, void *parg)
 {
 	lockstat_probe_t *probe = parg;
 
 	/* The probe must not already be enabled. */
 	ASSERT(!lockstat_probemap[probe->lsp_probe]);
 
 	lockstat_probemap[probe->lsp_probe] = id;
 	/* The DOODAD sections are only compiled when DOODAD is defined. */
 #ifdef DOODAD
 	membar_producer();
 #endif
 
 	lockstat_probe_func = dtrace_probe;
 #ifdef DOODAD
 	membar_producer();
 
 	lockstat_hot_patch();
 	membar_producer();
 #endif
 }
 
 /*
  * Provider disable callback (installed in lockstat_pops).  Clears the
  * probe's slot in lockstat_probemap[].
  *
  * NOTE(review): the scan for remaining enabled probes at the end has no
  * effect as written; the function returns without further action whether or
  * not another probe is still enabled.
  */
 /*ARGSUSED*/
 static void
 lockstat_disable(void *arg, dtrace_id_t id, void *parg)
 {
 	lockstat_probe_t *probe = parg;
 	int i;
 
 	/* The probe must currently be enabled. */
 	ASSERT(lockstat_probemap[probe->lsp_probe]);
 
 	lockstat_probemap[probe->lsp_probe] = 0;
 #ifdef DOODAD
 	lockstat_hot_patch();
 	membar_producer();
 #endif
 
 	/*
 	 * See if we have any probes left enabled.
 	 */
 	for (i = 0; i < LS_NPROBES; i++) {
 		if (lockstat_probemap[i]) {
 			/*
 			 * This probe is still enabled.  We don't need to deal
 			 * with waiting for all threads to be out of the
 			 * lockstat critical sections; just return.
 			 */
 			return;
 		}
 	}
 
 }
 
 /*
  * Open routine for the lockstat device.  All arguments are unused and the
  * open always succeeds.
  */
 /*ARGSUSED*/
 static int
 lockstat_open(struct cdev *dev __unused, int oflags __unused, 
 	      int devtype __unused, struct thread *td __unused)
 {
 	return (0);
 }
 
 /*
  * Provider "provide" callback (installed in lockstat_pops).  Walks
  * lockstat_probes[] up to its NULL-terminated end and creates a DTrace probe
  * in module "kernel" for each entry that DTrace does not already know
  * about, saving the resulting id in the entry.
  */
 /*ARGSUSED*/
 static void
 lockstat_provide(void *arg, dtrace_probedesc_t *desc)
 {
 	int i = 0;
 
 	for (i = 0; lockstat_probes[i].lsp_func != NULL; i++) {
 		lockstat_probe_t *probe = &lockstat_probes[i];
 
 		/* Skip probes that have already been created. */
 		if (dtrace_probe_lookup(lockstat_id, "kernel",
 		    probe->lsp_func, probe->lsp_name) != 0)
 			continue;
 
 		ASSERT(!probe->lsp_id);
 		/* FreeBSD passes the per-probe lsp_frame, not a fixed value. */
 #ifdef __FreeBSD__
 		probe->lsp_id = dtrace_probe_create(lockstat_id,
 		    "kernel", probe->lsp_func, probe->lsp_name,
 		    probe->lsp_frame, probe);
 #else
 		probe->lsp_id = dtrace_probe_create(lockstat_id,
 		    "kernel", probe->lsp_func, probe->lsp_name,
 		    LOCKSTAT_AFRAMES, probe);
 #endif
 	}
 }
 
 /*
  * Provider destroy callback: forget the DTrace id stored in the probe's
  * table entry.  The probe is expected to be disabled at this point.
  */
 /*ARGSUSED*/
 static void
 lockstat_destroy(void *arg, dtrace_id_t id, void *parg)
 {
 	lockstat_probe_t *lsp;
 
 	lsp = parg;
 	ASSERT(!lockstat_probemap[lsp->lsp_probe]);
 
 	lsp->lsp_id = 0;
 }
 
 /*
  * Provider attribute table passed to dtrace_register() for the lockstat
  * provider.  Each row is a { stability, stability, class } triple.
  */
 static dtrace_pattr_t lockstat_attr = {
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 };
 
 /*
  * Provider operations vector passed to dtrace_register() for the lockstat
  * provider.  The initializer is positional; only the provide, enable,
  * disable and destroy callbacks are supplied, the remaining slots are NULL.
  */
 static dtrace_pops_t lockstat_pops = {
 	lockstat_provide,
 	NULL,
 	lockstat_enable,
 	lockstat_disable,
 	NULL,
 	NULL,
 	NULL,
 	NULL,
 	NULL,
 	lockstat_destroy
 };
 
 /*
  * SYSINIT hook: create the lockstat device node and register the lockstat
  * provider with DTrace.  A registration failure is not reported to the
  * caller.
  */
 static void
 lockstat_load(void *dummy)
 {
 	/* Create the /dev/dtrace/lockstat entry. */
 	lockstat_cdev = make_dev(&lockstat_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
 	    "dtrace/lockstat");
 
 	/* Nothing follows registration, so its result is not acted upon. */
 	(void)dtrace_register("lockstat", &lockstat_attr, DTRACE_PRIV_USER,
 	    NULL, &lockstat_pops, NULL, &lockstat_id);
 }
 
 /*
  * SYSUNINIT hook: unregister the lockstat provider and, only if that
  * succeeds, destroy the device node.  Returns the result of
  * dtrace_unregister().
  */
 static int
 lockstat_unload()
 {
 	int rc;
 
 	rc = dtrace_unregister(lockstat_id);
 	if (rc == 0)
 		destroy_dev(lockstat_cdev);
 
 	return (rc);
 }
 
 /*
  * Module event handler.  Load, unload and shutdown need no work here and
  * succeed; any other event type yields EOPNOTSUPP.
  */
 /* ARGSUSED */
 static int
 lockstat_modevent(module_t mod __unused, int type, void *data __unused)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 	case MOD_UNLOAD:
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 SYSINIT(lockstat_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, lockstat_load, NULL);
 SYSUNINIT(lockstat_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, lockstat_unload, NULL);
 
 DEV_MODULE(lockstat, lockstat_modevent, NULL);
 MODULE_VERSION(lockstat, 1);
 MODULE_DEPEND(lockstat, dtrace, 1, 1, 1);
 MODULE_DEPEND(lockstat, opensolaris, 1, 1, 1);
Index: head/sys/cddl/dev/sdt/sdt.c
===================================================================
--- head/sys/cddl/dev/sdt/sdt.c	(revision 258540)
+++ head/sys/cddl/dev/sdt/sdt.c	(revision 258541)
@@ -1,387 +1,385 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  *
  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
  *
  * $FreeBSD$
  *
  */
 
 /*
  * This file contains a reimplementation of the statically-defined tracing (SDT)
  * framework for DTrace. Probes and SDT providers are defined using the macros
  * in sys/sdt.h, which append all the needed structures to linker sets. When
  * this module is loaded, it iterates over all of the loaded modules and
  * registers probes and providers with the DTrace framework based on the
  * contents of these linker sets.
  *
  * A list of SDT providers is maintained here since a provider may span multiple
  * modules. When a kernel module is unloaded, a provider defined in that module
  * is unregistered only if no other modules refer to it. The DTrace framework is
  * responsible for destroying individual probes when a kernel module is
  * unloaded; in particular, probes may not span multiple kernel modules.
  */
 
-#include "opt_kdtrace.h"
-
 #include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/conf.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/linker_set.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/sdt.h>
 
 #include <sys/dtrace.h>
 #include <sys/dtrace_bsd.h>
 
 /* DTrace methods. */
 static void	sdt_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *);
 static void	sdt_provide_probes(void *, dtrace_probedesc_t *);
 static void	sdt_destroy(void *, dtrace_id_t, void *);
 static void	sdt_enable(void *, dtrace_id_t, void *);
 static void	sdt_disable(void *, dtrace_id_t, void *);
 
 static void	sdt_load(void);
 static int	sdt_unload(void);
 static void	sdt_create_provider(struct sdt_provider *);
 static void	sdt_create_probe(struct sdt_probe *);
 static void	sdt_kld_load(void *, struct linker_file *);
 static void	sdt_kld_unload_try(void *, struct linker_file *, int *);
 
 static MALLOC_DEFINE(M_SDT, "SDT", "DTrace SDT providers");
 
 /*
  * Provider attribute table passed to dtrace_register() for every SDT
  * provider.  Each row is a { stability, stability, class } triple.
  */
 static dtrace_pattr_t sdt_attr = {
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
 };
 
 /*
  * Provider operations vector passed to dtrace_register() for every SDT
  * provider.  The initializer is positional; slots without an SDT
  * implementation are NULL.
  */
 static dtrace_pops_t sdt_pops = {
 	sdt_provide_probes,
 	NULL,
 	sdt_enable,
 	sdt_disable,
 	NULL,
 	NULL,
 	sdt_getargdesc,
 	NULL,
 	NULL,
 	sdt_destroy,
 };
 
 static TAILQ_HEAD(, sdt_provider) sdt_prov_list;
 
 eventhandler_tag	sdt_kld_load_tag;
 eventhandler_tag	sdt_kld_unload_try_tag;
 
 static void
 sdt_create_provider(struct sdt_provider *prov)
 {
 	struct sdt_provider *curr, *newprov;
 
 	TAILQ_FOREACH(curr, &sdt_prov_list, prov_entry)
 		if (strcmp(prov->name, curr->name) == 0) {
 			/* The provider has already been defined. */
 			curr->sdt_refs++;
 			return;
 		}
 
 	/*
 	 * Make a copy of prov so that we don't lose fields if its module is
 	 * unloaded but the provider isn't destroyed. This could happen with
 	 * a provider that spans multiple modules.
 	 */
 	newprov = malloc(sizeof(*newprov), M_SDT, M_WAITOK | M_ZERO);
 	newprov->name = strdup(prov->name, M_SDT);
 	prov->sdt_refs = newprov->sdt_refs = 1;
 
 	TAILQ_INSERT_TAIL(&sdt_prov_list, newprov, prov_entry);
 
 	(void)dtrace_register(newprov->name, &sdt_attr, DTRACE_PRIV_USER, NULL,
 	    &sdt_pops, NULL, (dtrace_provider_id_t *)&newprov->id);
 	prov->id = newprov->id;
 }
 
 /*
  * Create the DTrace probe for one SDT probe definition, unless a probe with
  * the same provider, module, function and name already exists.
  *
  * probe->sdtp_lf must already be set when the probe has no module name,
  * since the linker file's name is then used as the module name.
  */
 static void
 sdt_create_probe(struct sdt_probe *probe)
 {
 	struct sdt_provider *prov;
 	char mod[DTRACE_MODNAMELEN];
 	char func[DTRACE_FUNCNAMELEN];
 	char name[DTRACE_NAMELEN];
 	size_t len;
 
 	/* Find the registered copy of this probe's provider by name. */
 	TAILQ_FOREACH(prov, &sdt_prov_list, prov_entry)
 		if (strcmp(prov->name, probe->prov->name) == 0)
 			break;
 
 	KASSERT(prov != NULL, ("probe defined without a provider"));
 
 	/* If no module name was specified, use the module filename. */
 	if (*probe->mod == 0) {
 		len = strlcpy(mod, probe->sdtp_lf->filename, sizeof(mod));
 		/* Strip a trailing ".ko" from the filename. */
 		if (len > 3 && strcmp(mod + len - 3, ".ko") == 0)
 			mod[len - 3] = '\0';
 	} else
 		strlcpy(mod, probe->mod, sizeof(mod));
 
 	/*
 	 * Unfortunately this is necessary because the Solaris DTrace
 	 * code mixes consts and non-consts with casts to override
 	 * the incompatibilities. On FreeBSD, we use strict warnings
 	 * in the C compiler, so we have to respect const vs non-const.
 	 */
 	strlcpy(func, probe->func, sizeof(func));
 	strlcpy(name, probe->name, sizeof(name));
 
 	/* Nothing to do if this probe has already been created. */
 	if (dtrace_probe_lookup(prov->id, mod, func, name) != DTRACE_IDNONE)
 		return;
 
 	(void)dtrace_probe_create(prov->id, mod, func, name, 1, probe);
 }
 
 /*
  * Probes are created through the SDT module load/unload hook, so this function
  * has nothing to do. It only exists because the DTrace provider framework
  * requires one of provide_probes and provide_module to be defined.
  */
 static void
 sdt_provide_probes(void *arg, dtrace_probedesc_t *desc)
 {
 	/* Intentionally empty. */
 }
 
 /*
  * Provider enable callback: remember the DTrace id in the probe and bump
  * the enabled-probe count of the linker file that owns it.
  */
 static void
 sdt_enable(void *arg __unused, dtrace_id_t id, void *parg)
 {
 	struct sdt_probe *sp;
 
 	sp = parg;
 	sp->id = id;
 	sp->sdtp_lf->nenabled++;
 }
 
 /*
  * Provider disable callback: clear the probe's DTrace id and drop the
  * enabled-probe count of the linker file that owns it.
  */
 static void
 sdt_disable(void *arg __unused, dtrace_id_t id, void *parg)
 {
 	struct sdt_probe *sp;
 
 	sp = parg;
 	KASSERT(sp->sdtp_lf->nenabled > 0, ("no probes enabled"));
 
 	sp->id = 0;
 	sp->sdtp_lf->nenabled--;
 }
 
 /*
  * Provider getargdesc callback: describe argument desc->dtargd_ndx of the
  * probe passed in parg.
  *
  * If the index is not below the probe's argument count, dtargd_ndx is set
  * to DTRACE_ARGNONE.  Otherwise the matching entry of the probe's argtype
  * list supplies the native type string and, when present, the translated
  * type string.
  */
 static void
 sdt_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc)
 {
 	struct sdt_argtype *argtype;
 	struct sdt_probe *probe = parg;
 
 	if (desc->dtargd_ndx >= probe->n_args) {
 		desc->dtargd_ndx = DTRACE_ARGNONE;
 		return;
 	}
 
 	TAILQ_FOREACH(argtype, &probe->argtype_list, argtype_entry) {
 		if (desc->dtargd_ndx == argtype->ndx) {
 			desc->dtargd_mapping = desc->dtargd_ndx;
 			/* No type recorded: report empty type strings. */
 			if (argtype->type == NULL) {
 				desc->dtargd_native[0] = '\0';
 				desc->dtargd_xlate[0] = '\0';
 				continue;
 			}
 			strlcpy(desc->dtargd_native, argtype->type,
 			    sizeof(desc->dtargd_native));
 			/* dtargd_xlate is left untouched when xtype is NULL. */
 			if (argtype->xtype != NULL)
 				strlcpy(desc->dtargd_xlate, argtype->xtype,
 				    sizeof(desc->dtargd_xlate));
 		}
 	}
 }
 
 /*
  * Provider destroy callback.  Intentionally empty: no per-probe state is
  * released here.
  */
 static void
 sdt_destroy(void *arg, dtrace_id_t id, void *parg)
 {
 }
 
 /*
  * Called from the kernel linker when a module is loaded, before
  * dtrace_module_loaded() is called. This is done so that it's possible to
  * register new providers when modules are loaded. The DTrace framework
  * explicitly disallows calling into the framework from the provide_module
  * provider method, so we cannot do this there.
  */
 static void
 sdt_kld_load(void *arg __unused, struct linker_file *lf)
 {
 	struct sdt_provider **prov, **begin, **end;
 	struct sdt_probe **probe, **p_begin, **p_end;
 	struct sdt_argtype **argtype, **a_begin, **a_end;
 
 	/*
 	 * Providers first, since probes are attached to them by name.
 	 * NOTE(review): returning here when the file has no provider set
 	 * also skips any probes and argtypes the file defines.
 	 */
 	if (linker_file_lookup_set(lf, "sdt_providers_set", &begin, &end, NULL))
 		return;
 	for (prov = begin; prov < end; prov++)
 		sdt_create_provider(*prov);
 
 	/* Record the owning file in each probe, then create it. */
 	if (linker_file_lookup_set(lf, "sdt_probes_set", &p_begin, &p_end,
 	    NULL))
 		return;
 	for (probe = p_begin; probe < p_end; probe++) {
 		(*probe)->sdtp_lf = lf;
 		sdt_create_probe(*probe);
 		TAILQ_INIT(&(*probe)->argtype_list);
 	}
 
 	/* Attach each argument type to its probe and count it. */
 	if (linker_file_lookup_set(lf, "sdt_argtypes_set", &a_begin, &a_end,
 	    NULL))
 		return;
 	for (argtype = a_begin; argtype < a_end; argtype++) {
 		(*argtype)->probe->n_args++;
 		TAILQ_INSERT_TAIL(&(*argtype)->probe->argtype_list, *argtype,
 		    argtype_entry);
 	}
 }
 
 /*
  * kld_unload_try event handler.  For each provider declared in the linker
  * file, drop one reference on the matching entry of sdt_prov_list; when the
  * last reference goes away the provider is unregistered from DTrace and
  * freed.  If unregistering fails, *error is set to 1 and processing stops.
  */
 static void
 sdt_kld_unload_try(void *arg __unused, struct linker_file *lf, int *error)
 {
 	struct sdt_provider *prov, **curr, **begin, **end, *tmp;
 
 	if (*error != 0)
 		/* We already have an error, so don't do anything. */
 		return;
 	else if (linker_file_lookup_set(lf, "sdt_providers_set", &begin, &end,
 	    NULL))
 		/* No DTrace providers are declared in this file. */
 		return;
 
 	/*
 	 * Go through all the providers declared in this linker file and
 	 * unregister any that aren't declared in another loaded file.
 	 */
 	for (curr = begin; curr < end; curr++) {
 		TAILQ_FOREACH_SAFE(prov, &sdt_prov_list, prov_entry, tmp) {
 			if (strcmp(prov->name, (*curr)->name) != 0)
 				continue;
 
 			/* Last reference: tear the provider down. */
 			if (prov->sdt_refs == 1) {
 				if (dtrace_unregister(prov->id) != 0) {
 					*error = 1;
 					return;
 				}
 				TAILQ_REMOVE(&sdt_prov_list, prov, prov_entry);
 				free(prov->name, M_SDT);
 				free(prov, M_SDT);
 			} else
 				prov->sdt_refs--;
 			/* Stop after the first entry with a matching name. */
 			break;
 		}
 	}
 }
 
 /*
  * linker_file_foreach() callback used by sdt_load(): run the normal
  * load-time processing on a linker file.  Always returns 0.
  */
 static int
 sdt_linker_file_cb(linker_file_t lf, void *arg __unused)
 {
 
 	sdt_kld_load(NULL, lf);
 
 	return (0);
 }
 
 /*
  * Module load: initialize the provider list, point sdt_probe_func at
  * dtrace_probe, register the kld load/unload event handlers, and process
  * every linker file visited by linker_file_foreach().
  */
 static void
 sdt_load()
 {
 
 	TAILQ_INIT(&sdt_prov_list);
 
 	sdt_probe_func = dtrace_probe;
 
 	sdt_kld_load_tag = EVENTHANDLER_REGISTER(kld_load, sdt_kld_load, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	sdt_kld_unload_try_tag = EVENTHANDLER_REGISTER(kld_unload_try,
 	    sdt_kld_unload_try, NULL, EVENTHANDLER_PRI_ANY);
 
 	/* Pick up probes from the kernel and already-loaded linker files. */
 	linker_file_foreach(sdt_linker_file_cb, NULL);
 }
 
 /*
  * Module unload: deregister the kld event handlers, point sdt_probe_func
  * back at the stub, then unregister and free every provider on
  * sdt_prov_list.  Returns 0 on success or the first non-zero value returned
  * by dtrace_unregister().
  *
  * NOTE(review): when dtrace_unregister() fails the function returns with
  * the event handlers already deregistered and sdt_probe_func already reset,
  * and with the remaining providers still on the list.
  */
 static int
 sdt_unload()
 {
 	struct sdt_provider *prov, *tmp;
 	int ret;
 
 	EVENTHANDLER_DEREGISTER(kld_load, sdt_kld_load_tag);
 	EVENTHANDLER_DEREGISTER(kld_unload_try, sdt_kld_unload_try_tag);
 
 	sdt_probe_func = sdt_probe_stub;
 
 	/* The _SAFE variant is needed because entries are freed in the loop. */
 	TAILQ_FOREACH_SAFE(prov, &sdt_prov_list, prov_entry, tmp) {
 		ret = dtrace_unregister(prov->id);
 		if (ret != 0)
 			return (ret);
 		TAILQ_REMOVE(&sdt_prov_list, prov, prov_entry);
 		free(prov->name, M_SDT);
 		free(prov, M_SDT);
 	}
 
 	return (0);
 }
 
 /*
  * Module event handler: run sdt_load() on load and sdt_unload() on unload,
  * accept shutdown without doing anything, and reject every other event
  * type with EOPNOTSUPP.
  */
 static int
 sdt_modevent(module_t mod __unused, int type, void *data __unused)
 {
 	int rc = 0;
 
 	if (type == MOD_LOAD)
 		sdt_load();
 	else if (type == MOD_UNLOAD)
 		rc = sdt_unload();
 	else if (type != MOD_SHUTDOWN)
 		rc = EOPNOTSUPP;
 
 	return (rc);
 }
 
 DEV_MODULE(sdt, sdt_modevent, NULL);
 MODULE_VERSION(sdt, 1);
 MODULE_DEPEND(sdt, dtrace, 1, 1, 1);
 MODULE_DEPEND(sdt, opensolaris, 1, 1, 1);
Index: head/sys/compat/linux/linux_emul.c
===================================================================
--- head/sys/compat/linux/linux_emul.c	(revision 258540)
+++ head/sys/compat/linux/linux_emul.c	(revision 258541)
@@ -1,474 +1,473 @@
 /*-
  * Copyright (c) 2006 Roman Divacky
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sdt.h>
 #include <sys/sx.h>
 #include <sys/proc.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/unistd.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_futex.h>
 #include <compat/linux/linux_misc.h>
 
 /**
  * Special DTrace provider for the linuxulator.
  *
  * This file defines the provider for the entire linuxulator; all
  * modules (= files of the linuxulator) use it.
  *
  * The provider name depends on the emulated bitsize, see
  * ../../<ARCH>/linux{,32}/linux.h, e.g.:
  *      native bitsize          = linuxulator
  *      amd64, 32bit emulation  = linuxulator32
  */
 LIN_SDT_PROVIDER_DEFINE(LINUX_DTRACE);
 
 /**
  * Special DTrace module "locks"; it covers some linuxulator-internal
  * locks.  Each probe takes the lock pointer as its single argument.
  */
 LIN_SDT_PROBE_DEFINE1(locks, emul_lock, locked, "struct mtx *");
 LIN_SDT_PROBE_DEFINE1(locks, emul_lock, unlock, "struct mtx *");
 LIN_SDT_PROBE_DEFINE1(locks, emul_shared_rlock, locked, "struct sx *");
 LIN_SDT_PROBE_DEFINE1(locks, emul_shared_rlock, unlock, "struct sx *");
 LIN_SDT_PROBE_DEFINE1(locks, emul_shared_wlock, locked, "struct sx *");
 LIN_SDT_PROBE_DEFINE1(locks, emul_shared_wlock, unlock, "struct sx *");
 
 /**
  * DTrace probes in this module.
  *
  * The number in each LIN_SDT_PROBE_DEFINE<n> must match the
  * LIN_SDT_PROBE<n> used where the probe fires, and the strings name the
  * argument types in order.  em_find:return and linux_schedtail:entry are
  * fired with one argument each, so they are defined with one argument
  * type here.
  */
 LIN_SDT_PROBE_DEFINE2(emul, em_find, entry, "struct proc *", "int");
 LIN_SDT_PROBE_DEFINE1(emul, em_find, return, "struct linux_emuldata *");
 LIN_SDT_PROBE_DEFINE3(emul, proc_init, entry, "struct thread *", "pid_t",
     "int");
 LIN_SDT_PROBE_DEFINE0(emul, proc_init, create_thread);
 LIN_SDT_PROBE_DEFINE0(emul, proc_init, fork);
 LIN_SDT_PROBE_DEFINE0(emul, proc_init, exec);
 LIN_SDT_PROBE_DEFINE0(emul, proc_init, return);
 LIN_SDT_PROBE_DEFINE1(emul, proc_exit, entry, "struct proc *");
 LIN_SDT_PROBE_DEFINE0(emul, proc_exit, futex_failed);
 LIN_SDT_PROBE_DEFINE3(emul, proc_exit, reparent, "pid_t", "pid_t",
     "struct proc *");
 LIN_SDT_PROBE_DEFINE1(emul, proc_exit, child_clear_tid_error, "int");
 LIN_SDT_PROBE_DEFINE0(emul, proc_exit, return);
 LIN_SDT_PROBE_DEFINE2(emul, proc_exec, entry, "struct proc *",
     "struct image_params *");
 LIN_SDT_PROBE_DEFINE0(emul, proc_exec, return);
 LIN_SDT_PROBE_DEFINE1(emul, linux_schedtail, entry, "struct proc *");
 LIN_SDT_PROBE_DEFINE1(emul, linux_schedtail, copyout_error, "int");
 LIN_SDT_PROBE_DEFINE0(emul, linux_schedtail, return);
 LIN_SDT_PROBE_DEFINE1(emul, linux_set_tid_address, entry, "int *");
 LIN_SDT_PROBE_DEFINE0(emul, linux_set_tid_address, return);
 LIN_SDT_PROBE_DEFINE2(emul, linux_kill_threads, entry, "struct thread *",
     "int");
 LIN_SDT_PROBE_DEFINE1(emul, linux_kill_threads, kill, "pid_t");
 LIN_SDT_PROBE_DEFINE0(emul, linux_kill_threads, return);
 
 /*
  * Linuxulator-wide locks.  In this file emul_shared_lock is held around
  * updates to the shared structure's reference count and thread list, and
  * emul_lock is taken by em_find(..., EMUL_DOLOCK) and released by the
  * caller once it is done with the returned emuldata.
  */
 struct sx	emul_shared_lock;
 struct mtx	emul_lock;
 
 /*
  * Look up the emuldata attached to process p.
  *
  * With locked == EMUL_DOLOCK, emul_lock is acquired first and stays held
  * on return when an entry is found; it is dropped again when the process
  * has no emuldata.  With any other value no locking is done here.
  * Returns the emuldata pointer, or NULL if none is attached.
  */
 struct linux_emuldata *
 em_find(struct proc *p, int locked)
 {
 	struct linux_emuldata *em;
 	int dolock;
 
 	LIN_SDT_PROBE2(emul, em_find, entry, p, locked);
 
 	dolock = (locked == EMUL_DOLOCK);
 	if (dolock)
 		EMUL_LOCK(&emul_lock);
 
 	em = p->p_emuldata;
 
 	/* Nothing found: do not leave the caller holding the lock. */
 	if (dolock && em == NULL)
 		EMUL_UNLOCK(&emul_lock);
 
 	LIN_SDT_PROBE1(emul, em_find, return, em);
 	return (em);
 }
 
 /*
  * Set up the Linux emulation data (emuldata) for a process.
  *
  * td:    calling thread.
  * child: pid of the process that gets a new emuldata, or 0 to reuse the
  *        emuldata already attached to td's process (the path the code
  *        labels "exec").
  * flags: clone flags; only LINUX_CLONE_THREAD is examined here.  When it
  *        is set the new entry joins the caller's shared structure,
  *        otherwise a fresh shared structure is allocated.
  *
  * Always returns 0.
  */
 int
 linux_proc_init(struct thread *td, pid_t child, int flags)
 {
 	struct linux_emuldata *em, *p_em;
 	struct proc *p;
 
 	LIN_SDT_PROBE3(emul, proc_init, entry, td, child, flags);
 
 	if (child != 0) {
 		/* fork or create a thread */
 		em = malloc(sizeof *em, M_LINUX, M_WAITOK | M_ZERO);
 		em->pid = child;
 		em->pdeath_signal = 0;
 		em->flags = 0;
 		em->robust_futexes = NULL;
 		if (flags & LINUX_CLONE_THREAD) {
 			/* handled later in the code */
 			LIN_SDT_PROBE0(emul, proc_init, create_thread);
 		} else {
 			struct linux_emuldata_shared *s;
 
 			LIN_SDT_PROBE0(emul, proc_init, fork);
 
 			s = malloc(sizeof *s, M_LINUX, M_WAITOK | M_ZERO);
 			s->refs = 1;
 			s->group_pid = child;
 
 			LIST_INIT(&s->threads);
 			em->shared = s;
 		}
 	} else {
 		/* exec */
 		LIN_SDT_PROBE0(emul, proc_init, exec);
 
 		/* lookup the old one; emul_lock stays held until the end */
 		em = em_find(td->td_proc, EMUL_DOLOCK);
 		KASSERT(em != NULL, ("proc_init: emuldata not found in exec case.\n"));
 	}
 
 	em->child_clear_tid = NULL;
 	em->child_set_tid = NULL;
 
 	/*
 	 * allocate the shared struct only in clone()/fork cases in the case
 	 * of clone() td = calling proc and child = pid of the newly created
 	 * proc
 	 */
 	if (child != 0) {
 		if (flags & LINUX_CLONE_THREAD) {
 			/* lookup the parent */
 			/*
 			 * we don't have to lock the p_em because
 			 * it's waiting for us in linux_clone so
 			 * there is no chance of it changing the
 			 * p_em->shared address
 			 */
 			p_em = em_find(td->td_proc, EMUL_DONTLOCK);
 			KASSERT(p_em != NULL, ("proc_init: parent emuldata not found for CLONE_THREAD\n"));
 			em->shared = p_em->shared;
 			EMUL_SHARED_WLOCK(&emul_shared_lock);
 			em->shared->refs++;
 			EMUL_SHARED_WUNLOCK(&emul_shared_lock);
 		} else {
 			/*
 			 * handled earlier to avoid malloc(M_WAITOK) with
 			 * rwlock held
 			 */
 		}
 
 		EMUL_SHARED_WLOCK(&emul_shared_lock);
 		LIST_INSERT_HEAD(&em->shared->threads, em, threads);
 		EMUL_SHARED_WUNLOCK(&emul_shared_lock);
 
 		/* pfind() hands back the process locked; attach and unlock. */
 		p = pfind(child);
 		KASSERT(p != NULL, ("process not found in proc_init\n"));
 		p->p_emuldata = em;
 		PROC_UNLOCK(p);
 	} else
 		EMUL_UNLOCK(&emul_lock);
 
 	LIN_SDT_PROBE0(emul, proc_init, return);
 	return (0);
 }
 
 /*
  * Exit hook for Linux processes; returns immediately for any process
  * whose sysentvec is not elf_linux_sysvec.
  *
  * In order: releases the process's futexes, reparents a non-leader
  * "thread" process to initproc, unlinks the emuldata from its shared
  * structure (freeing the shared structure on the last reference),
  * propagates a recorded group exit status, clears and futex-wakes the
  * child_clear_tid address if one was registered, frees the emuldata, and
  * finally delivers pdeath_signal to the Linux children that asked for it.
  *
  * NOTE(review): when the copyout() to child_clear_tid fails the function
  * returns before the pdeath_signal loop runs - confirm this is intended.
  */
 void
 linux_proc_exit(void *arg __unused, struct proc *p)
 {
 	struct linux_emuldata *em;
 	int error, shared_flags, shared_xstat;
 	struct thread *td = FIRST_THREAD_IN_PROC(p);
 	int *child_clear_tid;
 	struct proc *q, *nq;
 
 	if (__predict_true(p->p_sysent != &elf_linux_sysvec))
 		return;
 
 	LIN_SDT_PROBE1(emul, proc_exit, entry, p);
 
 	release_futexes(p);
 
 	/* find the emuldata */
 	em = em_find(p, EMUL_DOLOCK);
 
 	KASSERT(em != NULL, ("proc_exit: emuldata not found.\n"));
 
 	/* reparent all procs that are not a thread leader to initproc */
 	if (em->shared->group_pid != p->p_pid) {
 		LIN_SDT_PROBE3(emul, proc_exit, reparent,
 		    em->shared->group_pid, p->p_pid, p);
 
 		/* copy the pointer out while emul_lock is still held */
 		child_clear_tid = em->child_clear_tid;
 		EMUL_UNLOCK(&emul_lock);
 		sx_xlock(&proctree_lock);
 		wakeup(initproc);
 		PROC_LOCK(p);
 		proc_reparent(p, initproc);
 		p->p_sigparent = SIGCHLD;
 		PROC_UNLOCK(p);
 		sx_xunlock(&proctree_lock);
 	} else {
 		child_clear_tid = em->child_clear_tid;
 		EMUL_UNLOCK(&emul_lock);	
 	}
 
 	/* snapshot the shared state and drop our reference to it */
 	EMUL_SHARED_WLOCK(&emul_shared_lock);
 	shared_flags = em->shared->flags;
 	shared_xstat = em->shared->xstat;
 	LIST_REMOVE(em, threads);
 
 	em->shared->refs--;
 	if (em->shared->refs == 0) {
 		EMUL_SHARED_WUNLOCK(&emul_shared_lock);
 		free(em->shared, M_LINUX);
 	} else	
 		EMUL_SHARED_WUNLOCK(&emul_shared_lock);
 
 	if ((shared_flags & EMUL_SHARED_HASXSTAT) != 0)
 		p->p_xstat = shared_xstat;
 
 	if (child_clear_tid != NULL) {
 		struct linux_sys_futex_args cup;
 		int null = 0;
 
 		/* write 0 to the registered user address ... */
 		error = copyout(&null, child_clear_tid, sizeof(null));
 		if (error) {
 			LIN_SDT_PROBE1(emul, proc_exit,
 			    child_clear_tid_error, error);
 
 			free(em, M_LINUX);
 
 			LIN_SDT_PROBE0(emul, proc_exit, return);
 			return;
 		}
 
 		/* ... then wake every waiter on that address */
 		cup.uaddr = child_clear_tid;
 		cup.op = LINUX_FUTEX_WAKE;
 		cup.val = 0x7fffffff;	/* Awake everyone */
 		cup.timeout = NULL;
 		cup.uaddr2 = NULL;
 		cup.val3 = 0;
 		error = linux_sys_futex(FIRST_THREAD_IN_PROC(p), &cup);
 		/*
 		 * this cannot happen at the moment and if this happens it
 		 * probably means there is a user space bug
 		 */
 		if (error) {
 			LIN_SDT_PROBE0(emul, proc_exit, futex_failed);
 			printf(LMSG("futex stuff in proc_exit failed.\n"));
 		}
 	}
 
 	/* clean the stuff up */
 	free(em, M_LINUX);
 
 	/* this is a little weird but rewritten from exit1() */
 	sx_xlock(&proctree_lock);
 	q = LIST_FIRST(&p->p_children);
 	for (; q != NULL; q = nq) {
 		nq = LIST_NEXT(q, p_sibling);
 		/* skip children that are exiting or are not Linux processes */
 		if (q->p_flag & P_WEXIT)
 			continue;
 		if (__predict_false(q->p_sysent != &elf_linux_sysvec))
 			continue;
 		em = em_find(q, EMUL_DOLOCK);
 		KASSERT(em != NULL, ("linux_reparent: emuldata not found: %i\n", q->p_pid));
 		PROC_LOCK(q);
 		/* P_WEXIT is re-checked now that the process lock is held */
 		if ((q->p_flag & P_WEXIT) == 0 && em->pdeath_signal != 0) {
 			kern_psignal(q, em->pdeath_signal);
 		}
 		PROC_UNLOCK(q);
 		EMUL_UNLOCK(&emul_lock);
 	}
 	sx_xunlock(&proctree_lock);
 
 	LIN_SDT_PROBE0(emul, proc_exit, return);
 }
 
 /*
  * Exec hook.  Handles the transitions between FreeBSD and Linux images:
  *
  *  - FreeBSD binary execing a Linux binary: create a Linux emuldata entry
  *    carrying the pid of the currently running process.
  *  - Process whose current ABI is Linux: kill the other members of its
  *    thread group with SIGKILL, whatever the new image is.
  *  - Linux binary execing a non-Linux binary: detach and free the
  *    emuldata, dropping the reference on the shared structure.
  *
  * The entry/return probes fire only when the new image is a Linux one.
  */
 void 
 linux_proc_exec(void *arg __unused, struct proc *p, struct image_params *imgp)
 {
 	if (__predict_false(imgp->sysent == &elf_linux_sysvec)) {
 		LIN_SDT_PROBE2(emul, proc_exec, entry, p, imgp);
 	}
 	if (__predict_false(imgp->sysent == &elf_linux_sysvec
 	    && p->p_sysent != &elf_linux_sysvec))
 		linux_proc_init(FIRST_THREAD_IN_PROC(p), p->p_pid, 0);
 	if (__predict_false((p->p_sysent->sv_flags & SV_ABI_MASK) ==
 	    SV_ABI_LINUX))
 		/* Kill threads regardless of imgp->sysent value */
 		linux_kill_threads(FIRST_THREAD_IN_PROC(p), SIGKILL);
 	if (__predict_false(imgp->sysent != &elf_linux_sysvec
 	    && p->p_sysent == &elf_linux_sysvec)) {
 		struct linux_emuldata *em;
 
 		/*
 		 * XXX: There is a race because here we assign p->p_emuldata
 		 * NULL but the process is still counted as a Linux one for a
 		 * short time, so some other process might reference it, try
 		 * to access its p->p_emuldata and panic on a NULL reference.
 		 */
 		em = em_find(p, EMUL_DONTLOCK);
 
 		KASSERT(em != NULL, ("proc_exec: emuldata not found.\n"));
 
 		EMUL_SHARED_WLOCK(&emul_shared_lock);
 		LIST_REMOVE(em, threads);
 
 		PROC_LOCK(p);
 		p->p_emuldata = NULL;
 		PROC_UNLOCK(p);
 
 		/* last reference frees the shared structure */
 		em->shared->refs--;
 		if (em->shared->refs == 0) {
 			EMUL_SHARED_WUNLOCK(&emul_shared_lock);
 			free(em->shared, M_LINUX);
 		} else
 			EMUL_SHARED_WUNLOCK(&emul_shared_lock);
 
 		free(em, M_LINUX);
 	}
 
 	if (__predict_false(imgp->sysent == &elf_linux_sysvec)) {
 		LIN_SDT_PROBE0(emul, proc_exec, return);
 	}
 }
 
 void
 linux_schedtail(struct thread *td)
 {
 	struct linux_emuldata *em;
 	struct proc *p;
 	int error = 0;
 	int *child_set_tid;
 
 	p = td->td_proc;
 
 	LIN_SDT_PROBE1(emul, linux_schedtail, entry, p);
 
 	/* find the emuldata */
 	em = em_find(p, EMUL_DOLOCK);
 
 	KASSERT(em != NULL, ("linux_schedtail: emuldata not found.\n"));
 	child_set_tid = em->child_set_tid;
 	EMUL_UNLOCK(&emul_lock);
 
 	if (child_set_tid != NULL) {
 		error = copyout(&p->p_pid, (int *)child_set_tid,
 		    sizeof(p->p_pid));
 
 		if (error != 0) {
 			LIN_SDT_PROBE1(emul, linux_schedtail, copyout_error,
 			    error);
 		}
 	}
 
 	LIN_SDT_PROBE0(emul, linux_schedtail, return);
 
 	return;
 }
 
 int
 linux_set_tid_address(struct thread *td, struct linux_set_tid_address_args *args)
 {
 	struct linux_emuldata *em;
 
 	LIN_SDT_PROBE1(emul, linux_set_tid_address, entry, args->tidptr);
 
 	/* find the emuldata */
 	em = em_find(td->td_proc, EMUL_DOLOCK);
 
 	KASSERT(em != NULL, ("set_tid_address: emuldata not found.\n"));
 
 	em->child_clear_tid = args->tidptr;
 	td->td_retval[0] = td->td_proc->p_pid;
 
 	EMUL_UNLOCK(&emul_lock);
 
 	LIN_SDT_PROBE0(emul, linux_set_tid_address, return);
 	return 0;
 }
 
 /*
  * Send signal sig to every other process in the thread group of td's
  * process.  The group's thread list is walked under the shared read lock;
  * the entry for td's own process is skipped, and so are processes that
  * are already exiting (P_WEXIT).
  *
  * A list entry whose process can no longer be found by pid is skipped
  * instead of being dereferenced.
  */
 void
 linux_kill_threads(struct thread *td, int sig)
 {
 	struct linux_emuldata *em, *td_em, *tmp_em;
 	struct proc *sp;
 
 	LIN_SDT_PROBE2(emul, linux_kill_threads, entry, td, sig);
 
 	td_em = em_find(td->td_proc, EMUL_DONTLOCK);
 
 	KASSERT(td_em != NULL, ("linux_kill_threads: emuldata not found.\n"));
 
 	EMUL_SHARED_RLOCK(&emul_shared_lock);
 	LIST_FOREACH_SAFE(em, &td_em->shared->threads, threads, tmp_em) {
 		if (em->pid == td_em->pid)
 			continue;
 
 		/* pfind() returns the process locked, or NULL if not found. */
 		sp = pfind(em->pid);
 		if (sp == NULL)
 			continue;
 		if ((sp->p_flag & P_WEXIT) == 0)
 			kern_psignal(sp, sig);
 		PROC_UNLOCK(sp);
 
 		LIN_SDT_PROBE1(emul, linux_kill_threads, kill, em->pid);
 	}
 	EMUL_SHARED_RUNLOCK(&emul_shared_lock);
 
 	LIN_SDT_PROBE0(emul, linux_kill_threads, return);
 }
Index: head/sys/compat/linux/linux_fork.c
===================================================================
--- head/sys/compat/linux/linux_fork.c	(revision 258540)
+++ head/sys/compat/linux/linux_fork.c	(revision 258541)
@@ -1,300 +1,299 @@
 /*-
  * Copyright (c) 2004 Tim J. Robbins
  * Copyright (c) 2002 Doug Rabson
  * Copyright (c) 2000 Marcel Moolenaar
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/imgact.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/sx.h>
 #include <sys/unistd.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_signal.h>
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_misc.h>
 
 /* DTrace init: declare the provider used by the probes below. */
 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
 
 /* Linuxulator-global DTrace probes (emul_lock lock/unlock) */
 LIN_SDT_PROBE_DECLARE(locks, emul_lock, locked);
 LIN_SDT_PROBE_DECLARE(locks, emul_lock, unlock);
 
 
 int
 linux_fork(struct thread *td, struct linux_fork_args *args)
 {
 	int error;
 	struct proc *p2;
 	struct thread *td2;
 
 #ifdef DEBUG
 	if (ldebug(fork))
 		printf(ARGS(fork, ""));
 #endif
 
 	if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2, NULL, 0))
 	    != 0)
 		return (error);
 
 	td->td_retval[0] = p2->p_pid;
 	td->td_retval[1] = 0;
 
 	error = linux_proc_init(td, td->td_retval[0], 0);
 	if (error)
 		return (error);
 
 	td2 = FIRST_THREAD_IN_PROC(p2);
 
 	/*
 	 * Make this runnable after we are finished with it.
 	 */
 	thread_lock(td2);
 	TD_SET_CAN_RUN(td2);
 	sched_add(td2, SRQ_BORING);
 	thread_unlock(td2);
 
 	return (0);
 }
 
 int
 linux_vfork(struct thread *td, struct linux_vfork_args *args)
 {
 	int error;
 	struct proc *p2;
 	struct thread *td2;
 
 #ifdef DEBUG
 	if (ldebug(vfork))
 		printf(ARGS(vfork, ""));
 #endif
 
 	/* Exclude RFPPWAIT */
 	if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2,
 	    NULL, 0)) != 0)
 		return (error);
 
    	td->td_retval[0] = p2->p_pid;
 
 	error = linux_proc_init(td, td->td_retval[0], 0);
 	if (error)
 		return (error);
 
 	PROC_LOCK(p2);
 	p2->p_flag |= P_PPWAIT;
 	PROC_UNLOCK(p2);
 
 	td2 = FIRST_THREAD_IN_PROC(p2);
 
 	/*
 	 * Make this runnable after we are finished with it.
 	 */
 	thread_lock(td2);
 	TD_SET_CAN_RUN(td2);
 	sched_add(td2, SRQ_BORING);
 	thread_unlock(td2);
 
 	/* wait for the children to exit, ie. emulate vfork */
 	PROC_LOCK(p2);
 	while (p2->p_flag & P_PPWAIT)
 		cv_wait(&p2->p_pwait, &p2->p_mtx);
 	PROC_UNLOCK(p2);
 
 	return (0);
 }
 
 /*
  * Linux clone(): translate the Linux clone flags into fork1() flags,
  * create the child stopped, set up its emuldata (child_set_tid /
  * child_clear_tid), exit signal, stack and TLS, and then make it
  * runnable.
  *
  * The low byte of args->flags is the exit signal; an invalid non-zero
  * value yields EINVAL, as does LINUX_CLONE_PARENT_SETTID with a NULL
  * parent_tidptr.  A fork1() failure is returned as is.  On success the
  * child's pid is stored in td_retval[0] and 0 is returned.
  *
  * NOTE(review): the results of linux_proc_init() and of the
  * parent_tidptr copyout() are assigned to 'error' but never returned.
  */
 int
 linux_clone(struct thread *td, struct linux_clone_args *args)
 {
 	int error, ff = RFPROC | RFSTOPPED;
 	struct proc *p2;
 	struct thread *td2;
 	int exit_signal;
 	struct linux_emuldata *em;
 
 #ifdef DEBUG
 	if (ldebug(clone)) {
 		printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, "
 		    "child tid: %p"), (unsigned)args->flags,
 		    args->stack, args->parent_tidptr, args->child_tidptr);
 	}
 #endif
 
 	/* low byte of the flags word carries the child's exit signal */
 	exit_signal = args->flags & 0x000000ff;
 	if (LINUX_SIG_VALID(exit_signal)) {
 		if (exit_signal <= LINUX_SIGTBLSZ)
 			exit_signal =
 			    linux_to_bsd_signal[_SIG_IDX(exit_signal)];
 	} else if (exit_signal != 0)
 		return (EINVAL);
 
 	if (args->flags & LINUX_CLONE_VM)
 		ff |= RFMEM;
 	if (args->flags & LINUX_CLONE_SIGHAND)
 		ff |= RFSIGSHARE;
 	/*
 	 * XXX: In Linux, sharing of fs info (chroot/cwd/umask)
 	 * and open files is independent.  In FreeBSD, it's in one
 	 * structure but in reality it does not cause any problems
 	 * because both of these flags are usually set together.
 	 */
 	if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS)))
 		ff |= RFFDG;
 
 	/*
 	 * Attempt to detect when linux_clone(2) is used for creating
 	 * kernel threads. Unfortunately despite the existence of the
 	 * CLONE_THREAD flag, version of linuxthreads package used in
 	 * most popular distros as of beginning of 2005 doesn't make
 	 * any use of it. Therefore, this detection relies on
 	 * empirical observation that linuxthreads sets certain
 	 * combination of flags, so that we can make more or less
 	 * precise detection and notify the FreeBSD kernel that several
 	 * processes are in fact part of the same threading group, so
 	 * that special treatment is necessary for signal delivery
 	 * between those processes and fd locking.
 	 */
 	if ((args->flags & 0xffffff00) == LINUX_THREADING_FLAGS)
 		ff |= RFTHREAD;
 
 	if (args->flags & LINUX_CLONE_PARENT_SETTID)
 		if (args->parent_tidptr == NULL)
 			return (EINVAL);
 
 	error = fork1(td, ff, 0, &p2, NULL, 0);
 	if (error)
 		return (error);
 
 	/* give the child the caller's parent instead of the caller */
 	if (args->flags & (LINUX_CLONE_PARENT | LINUX_CLONE_THREAD)) {
 	   	sx_xlock(&proctree_lock);
 		PROC_LOCK(p2);
 		proc_reparent(p2, td->td_proc->p_pptr);
 		PROC_UNLOCK(p2);
 		sx_xunlock(&proctree_lock);
 	}
 
 	/* create the emuldata */
 	error = linux_proc_init(td, p2->p_pid, args->flags);
 	/* reference it - no need to check this */
 	em = em_find(p2, EMUL_DOLOCK);
 	KASSERT(em != NULL, ("clone: emuldata not found."));
 	/* and adjust it */
 
 	if (args->flags & LINUX_CLONE_THREAD) {
 #ifdef notyet
 	   	PROC_LOCK(p2);
 	   	p2->p_pgrp = td->td_proc->p_pgrp;
 	   	PROC_UNLOCK(p2);
 #endif
 		exit_signal = 0;
 	}
 
 	if (args->flags & LINUX_CLONE_CHILD_SETTID)
 		em->child_set_tid = args->child_tidptr;
 	else
 	   	em->child_set_tid = NULL;
 
 	if (args->flags & LINUX_CLONE_CHILD_CLEARTID)
 		em->child_clear_tid = args->child_tidptr;
 	else
 	   	em->child_clear_tid = NULL;
 
 	/* releases the emul_lock taken by em_find(..., EMUL_DOLOCK) above */
 	EMUL_UNLOCK(&emul_lock);
 
 	if (args->flags & LINUX_CLONE_PARENT_SETTID) {
 		error = copyout(&p2->p_pid, args->parent_tidptr,
 		    sizeof(p2->p_pid));
 		if (error)
 			printf(LMSG("copyout failed!"));
 	}
 
 	PROC_LOCK(p2);
 	p2->p_sigparent = exit_signal;
 	PROC_UNLOCK(p2);
 	td2 = FIRST_THREAD_IN_PROC(p2);
 	/*
 	 * In a case of stack = NULL, we are supposed to COW calling process
 	 * stack. This is what normal fork() does, so we just keep tf_rsp arg
 	 * intact.
 	 */
 	if (args->stack)
 		linux_set_upcall_kse(td2, PTROUT(args->stack));
 
 	if (args->flags & LINUX_CLONE_SETTLS)
 		linux_set_cloned_tls(td2, args->tls);
 
 #ifdef DEBUG
 	if (ldebug(clone))
 		printf(LMSG("clone: successful rfork to %d, "
 		    "stack %p sig = %d"), (int)p2->p_pid, args->stack,
 		    exit_signal);
 #endif
 	if (args->flags & LINUX_CLONE_VFORK) {
 	   	PROC_LOCK(p2);
 	   	p2->p_flag |= P_PPWAIT;
 	   	PROC_UNLOCK(p2);
 	}
 
 	/*
 	 * Make this runnable after we are finished with it.
 	 */
 	thread_lock(td2);
 	TD_SET_CAN_RUN(td2);
 	sched_add(td2, SRQ_BORING);
 	thread_unlock(td2);
 
 	td->td_retval[0] = p2->p_pid;
 	td->td_retval[1] = 0;
 
 	if (args->flags & LINUX_CLONE_VFORK) {
 		/* wait for the children to exit, ie. emulate vfork */
 		PROC_LOCK(p2);
 		while (p2->p_flag & P_PPWAIT)
 			cv_wait(&p2->p_pwait, &p2->p_mtx);
 		PROC_UNLOCK(p2);
 	}
 
 	return (0);
 }
Index: head/sys/compat/linux/linux_futex.c
===================================================================
--- head/sys/compat/linux/linux_futex.c	(revision 258540)
+++ head/sys/compat/linux/linux_futex.c	(revision 258541)
@@ -1,1228 +1,1227 @@
 /*	$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */
 
 /*-
  * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Emmanuel Dreyfus
  * 4. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #if 0
 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $");
 #endif
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/sx.h>
 #include <sys/umtx.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_futex.h>
 #include <compat/linux/linux_util.h>
 
 /* DTrace init */
 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
 
 /* Linuxulator-global DTrace probes */
 LIN_SDT_PROBE_DECLARE(locks, emul_lock, locked);
 LIN_SDT_PROBE_DECLARE(locks, emul_lock, unlock);
 
 /**
  * Futex part for the special DTrace module "locks".
  */
 LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, locked, "struct mtx *");
 LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, unlock, "struct mtx *");
 
 /**
  * Per futex probes.
  */
 LIN_SDT_PROBE_DEFINE1(futex, futex, create, "struct sx *");
 LIN_SDT_PROBE_DEFINE1(futex, futex, destroy, "struct sx *");
 
 /**
  * DTrace probes in this module.
  */
 LIN_SDT_PROBE_DEFINE2(futex, futex_put, entry, "struct futex *",
     "struct waiting_proc *");
 LIN_SDT_PROBE_DEFINE3(futex, futex_put, destroy, "uint32_t *", "uint32_t",
     "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_put, unlock, "uint32_t *", "uint32_t",
     "int");
 LIN_SDT_PROBE_DEFINE0(futex, futex_put, return);
 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, entry, "uint32_t *", "struct futex **",
     "uint32_t");
 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, umtx_key_get_error, "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, shared, "uint32_t *", "uint32_t",
     "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, null, "uint32_t *");
 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, new, "uint32_t *", "uint32_t", "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_get, entry, "uint32_t *",
     "struct waiting_proc **", "struct futex **");
 LIN_SDT_PROBE_DEFINE0(futex, futex_get, error);
 LIN_SDT_PROBE_DEFINE1(futex, futex_get, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, entry, "struct futex *",
     "struct waiting_proc **", "int");
 LIN_SDT_PROBE_DEFINE5(futex, futex_sleep, requeue_error, "int", "uint32_t *",
     "struct waiting_proc *", "uint32_t *", "uint32_t");
 LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, sleep_error, "int", "uint32_t *",
     "struct waiting_proc *");
 LIN_SDT_PROBE_DEFINE1(futex, futex_sleep, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_wake, entry, "struct futex *", "int",
     "uint32_t");
 LIN_SDT_PROBE_DEFINE3(futex, futex_wake, iterate, "uint32_t",
     "struct waiting_proc *", "uint32_t");
 LIN_SDT_PROBE_DEFINE1(futex, futex_wake, wakeup, "struct waiting_proc *");
 LIN_SDT_PROBE_DEFINE1(futex, futex_wake, return, "int");
 LIN_SDT_PROBE_DEFINE4(futex, futex_requeue, entry, "struct futex *", "int",
     "struct futex *", "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, wakeup, "struct waiting_proc *");
 LIN_SDT_PROBE_DEFINE3(futex, futex_requeue, requeue, "uint32_t *",
     "struct waiting_proc *", "uint32_t");
 LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, return, "int");
 LIN_SDT_PROBE_DEFINE4(futex, futex_wait, entry, "struct futex *",
     "struct waiting_proc **", "struct l_timespec *", "uint32_t");
 LIN_SDT_PROBE_DEFINE1(futex, futex_wait, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_wait, itimerfix_error, "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_wait, sleep_error, "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_wait, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_atomic_op, entry, "struct thread *",
     "int", "uint32_t");
 LIN_SDT_PROBE_DEFINE4(futex, futex_atomic_op, decoded_op, "int", "int", "int",
     "int");
 LIN_SDT_PROBE_DEFINE0(futex, futex_atomic_op, missing_access_check);
 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_op, "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_cmp, "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, return, "int");
 LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, entry, "struct thread *",
     "struct linux_sys_futex_args *");
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_clockswitch);
 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, invalid_cmp_requeue_use);
 LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wait, "uint32_t *",
     "uint32_t", "uint32_t");
 LIN_SDT_PROBE_DEFINE4(futex, linux_sys_futex, debug_wait_value_neq,
     "uint32_t *", "uint32_t", "int", "uint32_t");
 LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wake, "uint32_t *",
     "uint32_t", "uint32_t");
 LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_cmp_requeue, "uint32_t *",
     "uint32_t", "uint32_t", "uint32_t *", "struct l_timespec *");
 LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq,
     "uint32_t", "int");
 LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_wake_op, "uint32_t *",
     "int", "uint32_t", "uint32_t *", "uint32_t");
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unhandled_efault);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_lock_pi);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_unlock_pi);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_trylock_pi);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, deprecated_requeue);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi);
 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, unknown_operation, "int");
 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, return, "int");
 LIN_SDT_PROBE_DEFINE2(futex, linux_set_robust_list, entry, "struct thread *",
     "struct linux_set_robust_list_args *");
 LIN_SDT_PROBE_DEFINE0(futex, linux_set_robust_list, size_error);
 LIN_SDT_PROBE_DEFINE1(futex, linux_set_robust_list, return, "int");
 LIN_SDT_PROBE_DEFINE2(futex, linux_get_robust_list, entry, "struct thread *",
     "struct linux_get_robust_list_args *");
 LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, copyout_error, "int");
 LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, handle_futex_death, entry, "struct proc *",
     "uint32_t *", "int");
 LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, fetch_robust_entry, entry,
     "struct linux_robust_list **", "struct linux_robust_list **", "int *");
 LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, return, "int");
 LIN_SDT_PROBE_DEFINE1(futex, release_futexes, entry, "struct proc *");
 LIN_SDT_PROBE_DEFINE1(futex, release_futexes, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE0(futex, release_futexes, return);
 
 static MALLOC_DEFINE(M_FUTEX, "futex", "Linux futexes");
 static MALLOC_DEFINE(M_FUTEX_WP, "futex wp", "Linux futexes wp");
 
 struct futex;
 
 /*
  * One sleeping waiter on a futex.  The waiting thread sleeps on the
  * address of this structure (see futex_sleep()) and the structure is
  * linked on the f_waiting_proc queue of a futex.
  */
 struct waiting_proc {
 	uint32_t	wp_flags;	/* FUTEX_WP_* state bits */
 	struct futex	*wp_futex;	/* futex this waiter is queued on */
 	TAILQ_ENTRY(waiting_proc) wp_list;	/* link in f_waiting_proc */
 };
 
 /*
  * Kernel-side state for one futex word.  Instances are linked on the
  * global futex_list and are looked up by comparing f_key.
  */
 struct futex {
 	struct sx	f_lck;		/* per-futex lock, see FUTEX_LOCK() */
 	uint32_t	*f_uaddr;	/* user-supplied value, for debug */
 	struct umtx_key	f_key;		/* key obtained from umtx_key_get() */
 	uint32_t	f_refcount;	/* reference count */
 	uint32_t	f_bitset;	/* bitset stored by futex_wait() */
 	LIST_ENTRY(futex) f_list;	/* link in futex_list */
 	TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc; /* waiters */
 };
 
 struct futex_list futex_list;
 
 /*
  * Helpers for the per-futex sx lock (f_lck).  The lock is always taken
  * exclusively.  FUTEX_INIT() and FUTEX_DESTROY() additionally fire the
  * futex "create" and "destroy" DTrace probes with the lock address.
  */
 #define FUTEX_LOCK(f)		sx_xlock(&(f)->f_lck)
 #define FUTEX_UNLOCK(f)		sx_xunlock(&(f)->f_lck)
 #define FUTEX_INIT(f)		do { \
 				    sx_init_flags(&(f)->f_lck, "ftlk", \
 					SX_DUPOK); \
 				    LIN_SDT_PROBE1(futex, futex, create, \
 					&(f)->f_lck); \
 				} while (0)
 #define FUTEX_DESTROY(f)	do { \
 				    LIN_SDT_PROBE1(futex, futex, destroy, \
 					&(f)->f_lck); \
 				    sx_destroy(&(f)->f_lck); \
 				} while (0)
 #define FUTEX_ASSERT_LOCKED(f)	sx_assert(&(f)->f_lck, SA_XLOCKED)
 
 /*
  * Global mutex for futex_list.  The LOCK macro fires the "locked" probe
  * after the mutex has been acquired and the UNLOCK macro fires the
  * "unlock" probe before it is released, so both probes run with the
  * mutex held.
  */
 struct mtx futex_mtx;			/* protects the futex list */
 #define FUTEXES_LOCK		do { \
 				    mtx_lock(&futex_mtx); \
 				    LIN_SDT_PROBE1(locks, futex_mtx, \
 					locked, &futex_mtx); \
 				} while (0)
 #define FUTEXES_UNLOCK		do { \
 				    LIN_SDT_PROBE1(locks, futex_mtx, \
 					unlock, &futex_mtx); \
 				    mtx_unlock(&futex_mtx); \
 				} while (0)
 
 /* flags for futex_get() */
 #define FUTEX_CREATE_WP		0x1	/* create waiting_proc */
 #define FUTEX_DONTCREATE	0x2	/* don't create futex if not exists */
 #define FUTEX_DONTEXISTS	0x4	/* return EINVAL if futex exists */
 #define	FUTEX_SHARED		0x8	/* shared futex */
 
 /* wp_flags */
 #define FUTEX_WP_REQUEUED	0x1	/* wp requeued - wp moved from wp_list
 					 * of futex where thread sleep to wp_list
 					 * of another futex.
 					 */
 #define FUTEX_WP_REMOVED	0x2	/* wp is woken up and removed from futex
 					 * wp_list to prevent double wakeup.
 					 */
 
 /* support.s */
 int futex_xchgl(int oparg, uint32_t *uaddr, int *oldval);
 int futex_addl(int oparg, uint32_t *uaddr, int *oldval);
 int futex_orl(int oparg, uint32_t *uaddr, int *oldval);
 int futex_andl(int oparg, uint32_t *uaddr, int *oldval);
 int futex_xorl(int oparg, uint32_t *uaddr, int *oldval);
 
 /*
  * Drop one reference on the exclusively locked futex f and unlock it.
  *
  * If wp is not NULL it is unlinked from f's waiter queue (unless
  * FUTEX_WP_REMOVED is set, in which case it was already unlinked) and
  * freed.  When the reference count reaches zero the futex is removed
  * from futex_list, its umtx key is released and the structure is
  * destroyed and freed.
  *
  * The futex lock is released on every path, so the caller must not touch
  * f or wp after this call.
  */
 static void
 futex_put(struct futex *f, struct waiting_proc *wp)
 {
 	LIN_SDT_PROBE2(futex, futex_put, entry, f, wp);
 
 	FUTEX_ASSERT_LOCKED(f);
 	if (wp != NULL) {
 		/* futex_wake()/futex_requeue() unlink woken waiters themselves. */
 		if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0)
 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
 		free(wp, M_FUTEX_WP);
 	}
 
 	/* The reference count is only changed with futex_mtx held. */
 	FUTEXES_LOCK;
 	if (--f->f_refcount == 0) {
 		/* Last reference: take it off the list, then tear it down. */
 		LIST_REMOVE(f, f_list);
 		FUTEXES_UNLOCK;
 		FUTEX_UNLOCK(f);
 
 		LIN_SDT_PROBE3(futex, futex_put, destroy, f->f_uaddr,
 		    f->f_refcount, f->f_key.shared);
 		LINUX_CTR3(sys_futex, "futex_put destroy uaddr %p ref %d "
 		    "shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared);
 		umtx_key_release(&f->f_key);
 		FUTEX_DESTROY(f);
 		free(f, M_FUTEX);
 
 		LIN_SDT_PROBE0(futex, futex_put, return);
 		return;
 	}
 
 	/* Other references remain: just report and unlock. */
 	LIN_SDT_PROBE3(futex, futex_put, unlock, f->f_uaddr, f->f_refcount,
 	    f->f_key.shared);
 	LINUX_CTR3(sys_futex, "futex_put uaddr %p ref %d shared %d",
 	    f->f_uaddr, f->f_refcount, f->f_key.shared);
 	FUTEXES_UNLOCK;
 	FUTEX_UNLOCK(f);
 
 	LIN_SDT_PROBE0(futex, futex_put, return);
 }
 
 /*
  * Look up, or create, the futex for the user address uaddr.
  *
  * On success 0 is returned and *newf points to the futex, exclusively
  * locked and with a reference held for the caller.  Flags:
  *   FUTEX_SHARED     - build the key with AUTO_SHARE instead of
  *                      THREAD_SHARE.
  *   FUTEX_DONTCREATE - if no futex matches, return 0 with *newf == NULL.
  *   FUTEX_DONTEXISTS - if a futex already matches, return EINVAL.
  * An error from umtx_key_get() is returned unchanged.
  */
 static int
 futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags)
 {
 	struct futex *f, *tmpf;
 	struct umtx_key key;
 	int error;
 
 	LIN_SDT_PROBE3(futex, futex_get0, entry, uaddr, newf, flags);
 
 	*newf = tmpf = NULL;
 
 	error = umtx_key_get(uaddr, TYPE_FUTEX, (flags & FUTEX_SHARED) ?
 	    AUTO_SHARE : THREAD_SHARE, &key);
 	if (error) {
 		LIN_SDT_PROBE1(futex, futex_get0, umtx_key_get_error, error);
 		LIN_SDT_PROBE1(futex, futex_get0, return, error);
 		return (error);
 	}
 retry:
 	FUTEXES_LOCK;
 	LIST_FOREACH(f, &futex_list, f_list) {
 		if (umtx_key_match(&f->f_key, &key)) {
 			/*
 			 * A matching futex exists.  If this is the second
 			 * pass, the futex preallocated below is not needed.
 			 */
 			if (tmpf != NULL) {
 				FUTEX_UNLOCK(tmpf);
 				FUTEX_DESTROY(tmpf);
 				free(tmpf, M_FUTEX);
 			}
 			if (flags & FUTEX_DONTEXISTS) {
 				FUTEXES_UNLOCK;
 				umtx_key_release(&key);
 
 				LIN_SDT_PROBE1(futex, futex_get0, return,
 				    EINVAL);
 				return (EINVAL);
 			}
 
 			/*
 			 * Increment refcount of the found futex to
 			 * prevent it from deallocation before FUTEX_LOCK()
 			 */
 			++f->f_refcount;
 			FUTEXES_UNLOCK;
 			/* The found futex carries its own key; drop ours. */
 			umtx_key_release(&key);
 
 			FUTEX_LOCK(f);
 			*newf = f;
 			LIN_SDT_PROBE3(futex, futex_get0, shared, uaddr,
 			    f->f_refcount, f->f_key.shared);
 			LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d",
 			    uaddr, f->f_refcount, f->f_key.shared);
 
 			LIN_SDT_PROBE1(futex, futex_get0, return, 0);
 			return (0);
 		}
 	}
 
 	/* No match: the caller asked not to create one, so report "none". */
 	if (flags & FUTEX_DONTCREATE) {
 		FUTEXES_UNLOCK;
 		umtx_key_release(&key);
 		LIN_SDT_PROBE1(futex, futex_get0, null, uaddr);
 		LINUX_CTR1(sys_futex, "futex_get uaddr %p null", uaddr);
 
 		LIN_SDT_PROBE1(futex, futex_get0, return, 0);
 		return (0);
 	}
 
 	if (tmpf == NULL) {
 		/*
 		 * First pass: futex_mtx is dropped around the M_WAITOK
 		 * allocation, so the list is searched again afterwards in
 		 * case a matching futex was inserted in the meantime.
 		 */
 		FUTEXES_UNLOCK;
 		tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO);
 		tmpf->f_uaddr = uaddr;
 		tmpf->f_key = key;
 		tmpf->f_refcount = 1;
 		tmpf->f_bitset = FUTEX_BITSET_MATCH_ANY;
 		FUTEX_INIT(tmpf);
 		TAILQ_INIT(&tmpf->f_waiting_proc);
 
 		/*
 		 * Lock the new futex before an insert into the futex_list
 		 * to prevent futex usage by other.
 		 */
 		FUTEX_LOCK(tmpf);
 		goto retry;
 	}
 
 	/* Second pass and still no match: publish the preallocated futex. */
 	LIST_INSERT_HEAD(&futex_list, tmpf, f_list);
 	FUTEXES_UNLOCK;
 
 	LIN_SDT_PROBE3(futex, futex_get0, new, uaddr, tmpf->f_refcount,
 	    tmpf->f_key.shared);
 	LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d new",
 	    uaddr, tmpf->f_refcount, tmpf->f_key.shared);
 	*newf = tmpf;
 
 	LIN_SDT_PROBE1(futex, futex_get0, return, 0);
 	return (0);
 }
 
 /*
  * Obtain the futex for uaddr through futex_get0() and, when
  * FUTEX_CREATE_WP is set in flags, also allocate a waiting_proc and queue
  * it at the head of that futex's waiter list.
  *
  * On failure the freshly allocated waiting_proc (if any) is freed and the
  * futex_get0() error is returned.  wp is only accessed when
  * FUTEX_CREATE_WP is requested.
  */
 static int
 futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f,
     uint32_t flags)
 {
 	const int want_wp = (flags & FUTEX_CREATE_WP) != 0;
 	int rc;
 
 	LIN_SDT_PROBE3(futex, futex_get, entry, uaddr, wp, f);
 
 	/* Allocate the waiter up front, before any futex lock is taken. */
 	if (want_wp) {
 		*wp = malloc(sizeof(struct waiting_proc), M_FUTEX_WP, M_WAITOK);
 		(*wp)->wp_flags = 0;
 	}
 
 	rc = futex_get0(uaddr, f, flags);
 	if (rc == 0) {
 		if (want_wp) {
 			TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, *wp, wp_list);
 			(*wp)->wp_futex = *f;
 		}
 	} else {
 		LIN_SDT_PROBE0(futex, futex_get, error);
 
 		if (want_wp)
 			free(*wp, M_FUTEX_WP);
 	}
 
 	LIN_SDT_PROBE1(futex, futex_get, return, rc);
 	return (rc);
 }
 
 /*
  * Put the calling thread to sleep on wp, with f exclusively locked.
  *
  * The sleep is done with sx_sleep() on f's lock using PCATCH and the
  * given timeout (in ticks).  If the waiter was marked FUTEX_WP_REQUEUED
  * while asleep, the reference on the original futex is dropped and the
  * futex recorded in wp->wp_futex is locked instead.  On every path the
  * function finishes with futex_put(), which frees wp and unlocks the
  * futex, so neither f nor wp may be used by the caller afterwards.
  *
  * Returns the value returned by sx_sleep().
  */
 static int
 futex_sleep(struct futex *f, struct waiting_proc *wp, int timeout)
 {
 	int error;
 
 	FUTEX_ASSERT_LOCKED(f);
 	LIN_SDT_PROBE3(futex, futex_sleep, entry, f, wp, timeout);
 	LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %d ref %d",
 	    f->f_uaddr, wp, timeout, f->f_refcount);
 	error = sx_sleep(wp, &f->f_lck, PCATCH, "futex", timeout);
 	if (wp->wp_flags & FUTEX_WP_REQUEUED) {
 		/* futex_requeue() moved wp to another futex while we slept. */
 		KASSERT(f != wp->wp_futex, ("futex != wp_futex"));
 
 		if (error) {
 			LIN_SDT_PROBE5(futex, futex_sleep, requeue_error, error,
 			    f->f_uaddr, wp, wp->wp_futex->f_uaddr,
 			    wp->wp_futex->f_refcount);
 		}
 
 		LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p wp"
 		    " %p requeued uaddr %p ref %d",
 		    error, f->f_uaddr, wp, wp->wp_futex->f_uaddr,
 		    wp->wp_futex->f_refcount);
 		/*
 		 * Release the old futex, then switch to the new one; the
 		 * reference on it was taken by futex_requeue().
 		 */
 		futex_put(f, NULL);
 		f = wp->wp_futex;
 		FUTEX_LOCK(f);
 	} else {
 		if (error) {
 			LIN_SDT_PROBE3(futex, futex_sleep, sleep_error, error,
 			    f->f_uaddr, wp);
 		}
 		LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p",
 		    error, f->f_uaddr, wp);
 	}
 
 	/* Frees wp and drops the reference/lock on whichever futex f is now. */
 	futex_put(f, wp);
 
 	LIN_SDT_PROBE1(futex, futex_sleep, return, error);
 	return (error);
 }
 
 /*
  * Wake waiters queued on the exclusively locked futex f.
  *
  * Waiters are visited in queue order; one is woken only if the f_bitset
  * of the futex it is queued on shares a bit with bitset.  Each woken
  * waiter is marked FUTEX_WP_REMOVED and unlinked here.  The walk stops
  * once the number woken equals n.
  *
  * Returns the number of waiters woken.  Note that a zero bitset makes the
  * function return EINVAL through the same return value, so callers that
  * treat the result as a count cannot tell that case from a real count.
  */
 static int
 futex_wake(struct futex *f, int n, uint32_t bitset)
 {
 	struct waiting_proc *waiter, *next;
 	int woken;
 
 	LIN_SDT_PROBE3(futex, futex_wake, entry, f, n, bitset);
 
 	if (bitset == 0) {
 		LIN_SDT_PROBE1(futex, futex_wake, return, EINVAL);
 		return (EINVAL);
 	}
 
 	woken = 0;
 	FUTEX_ASSERT_LOCKED(f);
 	TAILQ_FOREACH_SAFE(waiter, &f->f_waiting_proc, wp_list, next) {
 		LIN_SDT_PROBE3(futex, futex_wake, iterate, f->f_uaddr, waiter,
 		    f->f_refcount);
 		LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d",
 		    f->f_uaddr, waiter, f->f_refcount);
 
 		/* Only waiters whose bitset intersects ours are woken. */
 		if ((waiter->wp_futex->f_bitset & bitset) != 0) {
 			waiter->wp_flags |= FUTEX_WP_REMOVED;
 			TAILQ_REMOVE(&f->f_waiting_proc, waiter, wp_list);
 			LIN_SDT_PROBE1(futex, futex_wake, wakeup, waiter);
 			wakeup_one(waiter);
 			woken++;
 			if (woken == n)
 				break;
 		}
 	}
 
 	LIN_SDT_PROBE1(futex, futex_wake, return, woken);
 	return (woken);
 }
 
 /*
  * Wake the first n waiters of f and move later waiters over to f2.
  *
  * Both futexes must be exclusively locked.  The first n waiters are
  * marked FUTEX_WP_REMOVED, unlinked and woken.  Each later waiter is
  * marked FUTEX_WP_REQUEUED, moved to the head of f2's queue and given a
  * reference on f2; the walk stops once the number moved reaches n2.
  *
  * Returns the number of waiters processed (woken plus requeued).
  *
  * NOTE(review): the n2 limit is tested after a waiter has been moved, so
  * one waiter is requeued even when n2 <= 0 -- confirm this is intended.
  */
 static int
 futex_requeue(struct futex *f, int n, struct futex *f2, int n2)
 {
 	struct waiting_proc *wp, *wpt;
 	int count = 0;
 
 	LIN_SDT_PROBE4(futex, futex_requeue, entry, f, n, f2, n2);
 
 	FUTEX_ASSERT_LOCKED(f);
 	FUTEX_ASSERT_LOCKED(f2);
 
 	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
 		if (++count <= n) {
 			/* Still within the wake quota: wake this waiter. */
 			LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p",
 			    f->f_uaddr, wp);
 			wp->wp_flags |= FUTEX_WP_REMOVED;
 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
 			LIN_SDT_PROBE1(futex, futex_requeue, wakeup, wp);
 			wakeup_one(wp);
 		} else {
 			LIN_SDT_PROBE3(futex, futex_requeue, requeue,
 			    f->f_uaddr, wp, f2->f_uaddr);
 			LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p",
 			    f->f_uaddr, wp, f2->f_uaddr);
 			wp->wp_flags |= FUTEX_WP_REQUEUED;
 			/* Move wp to wp_list of f2 futex */
 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
 			TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list);
 
 			/*
 			 * Thread which sleeps on wp after waking should
 			 * acquire f2 lock, so increment refcount of f2 to
 			 * prevent it from premature deallocation.
 			 */
 			wp->wp_futex = f2;
 			FUTEXES_LOCK;
 			++f2->f_refcount;
 			FUTEXES_UNLOCK;
 			/* count - n is the number of waiters moved so far. */
 			if (count - n >= n2)
 				break;
 		}
 	}
 
 	LIN_SDT_PROBE1(futex, futex_requeue, return, count);
 	return (count);
 }
 
 /*
  * Block the calling thread on the exclusively locked futex f.
  *
  * f     - futex to wait on; locked and referenced by the caller.
  * wp    - the caller's waiting_proc, already queued on f.
  * ts    - optional user-space pointer to a relative timeout; NULL means
  *         wait without a timeout.
  * bitset - wake-up mask stored in f->f_bitset; must not be zero.
  *
  * This function always consumes f and wp: on the normal path
  * futex_sleep() releases them, and on the early error paths they are
  * released here with futex_put().  The caller must not use or release
  * either of them after the call.
  *
  * Returns 0 on wake-up, EINVAL for a zero bitset, the copyin() or
  * itimerfix() error for a bad timeout, ETIMEDOUT when the sleep timed out
  * (EWOULDBLOCK from the sleep is translated), or another sleep error.
  */
 static int
 futex_wait(struct futex *f, struct waiting_proc *wp, struct l_timespec *ts,
     uint32_t bitset)
 {
 	struct l_timespec timeout;
 	struct timeval tv;
 	int timeout_hz;
 	int error;
 
 	LIN_SDT_PROBE4(futex, futex_wait, entry, f, wp, ts, bitset);
 
 	if (bitset == 0) {
 		error = EINVAL;
 		goto fail;
 	}
 
 	f->f_bitset = bitset;
 
 	if (ts != NULL) {
 		error = copyin(ts, &timeout, sizeof(timeout));
 		if (error) {
 			LIN_SDT_PROBE1(futex, futex_wait, copyin_error, error);
 			goto fail;
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &timeout);
 		error = itimerfix(&tv);
 		if (error) {
 			LIN_SDT_PROBE1(futex, futex_wait, itimerfix_error,
 			    error);
 			goto fail;
 		}
 		timeout_hz = tvtohz(&tv);
 	} else
 		timeout_hz = 0;
 
 	/* futex_sleep() releases f and wp itself before returning. */
 	error = futex_sleep(f, wp, timeout_hz);
 	if (error) {
 		LIN_SDT_PROBE1(futex, futex_wait, sleep_error, error);
 	}
 	if (error == EWOULDBLOCK)
 		error = ETIMEDOUT;
 
 	LIN_SDT_PROBE1(futex, futex_wait, return, error);
 	return (error);
 
 fail:
 	/*
 	 * We never went to sleep, so nothing has released the futex yet.
 	 * Unqueue and free wp, drop the reference and unlock f here;
 	 * otherwise the futex would stay locked after the system call.
 	 */
 	futex_put(f, wp);
 
 	LIN_SDT_PROBE1(futex, futex_wait, return, error);
 	return (error);
 }
 
 /*
  * Decode and execute the operation encoded in a FUTEX_WAKE_OP argument.
  *
  * encoded_op is split into four fields: op (bits 28-30), cmp (bits
  * 24-27), oparg (bits 12-23, sign-extended) and cmparg (bits 0-11,
  * sign-extended).  If the FUTEX_OP_OPARG_SHIFT flag is set, oparg is
  * replaced by 1 << oparg.  The selected futex_*l() helper is applied to
  * the user word at uaddr and hands back the previous value, which is
  * then compared with cmparg using cmp.
  *
  * Returns the comparison result (0 or 1).  If the helper returns a
  * non-zero value that value is returned unchanged, and -ENOSYS is
  * returned for an unknown op or cmp; the caller treats negative values
  * as errors.
  */
 static int
 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
 	/*
 	 * encoded_op comes straight from user space.  Do the left shifts
 	 * on an unsigned copy so that no bits are shifted into or past the
 	 * sign bit of a signed int; the following signed right shift then
 	 * sign-extends the 12-bit field.
 	 */
 	int oparg = (int)((uint32_t)encoded_op << 8) >> 20;
 	int cmparg = (int)((uint32_t)encoded_op << 20) >> 20;
 	int oldval = 0, ret;
 
 	LIN_SDT_PROBE3(futex, futex_atomic_op, entry, td, encoded_op, uaddr);
 
 	/*
 	 * oparg may be negative or larger than 31 here; mask the shift
 	 * count to the width of the type and shift an unsigned one so the
 	 * result is always defined.
 	 */
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = (int)(1U << (oparg & 31));
 
 	LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg,
 	    cmparg);
 	
 	/* XXX: Linux verifies access here and returns EFAULT */
 	LIN_SDT_PROBE0(futex, futex_atomic_op, missing_access_check);
 
 	switch (op) {
 	case FUTEX_OP_SET:
 		ret = futex_xchgl(oparg, uaddr, &oldval);
 		break;
 	case FUTEX_OP_ADD:
 		ret = futex_addl(oparg, uaddr, &oldval);
 		break;
 	case FUTEX_OP_OR:
 		ret = futex_orl(oparg, uaddr, &oldval);
 		break;
 	case FUTEX_OP_ANDN:
 		/* AND-NOT is implemented as AND with the complement. */
 		ret = futex_andl(~oparg, uaddr, &oldval);
 		break;
 	case FUTEX_OP_XOR:
 		ret = futex_xorl(oparg, uaddr, &oldval);
 		break;
 	default:
 		LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op);
 		ret = -ENOSYS;
 		break;
 	}
 
 	if (ret) {
 		LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret);
 		return (ret);
 	}
 
 	/* Compare the value the word held before the operation. */
 	switch (cmp) {
 	case FUTEX_OP_CMP_EQ:
 		ret = (oldval == cmparg);
 		break;
 	case FUTEX_OP_CMP_NE:
 		ret = (oldval != cmparg);
 		break;
 	case FUTEX_OP_CMP_LT:
 		ret = (oldval < cmparg);
 		break;
 	case FUTEX_OP_CMP_GE:
 		ret = (oldval >= cmparg);
 		break;
 	case FUTEX_OP_CMP_LE:
 		ret = (oldval <= cmparg);
 		break;
 	case FUTEX_OP_CMP_GT:
 		ret = (oldval > cmparg);
 		break;
 	default:
 		LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp);
 		ret = -ENOSYS;
 		break;
 	}
 
 	LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret);
 	return (ret);
 }
 
 /*
  * Linux futex(2) system call entry point.
  *
  * LINUX_FUTEX_PRIVATE_FLAG and LINUX_FUTEX_CLOCK_REALTIME are stripped
  * from args->op and the remaining operation is dispatched.  WAIT and WAKE
  * (plus their _BITSET forms), CMP_REQUEUE and WAKE_OP are implemented;
  * the PI operations return ENOSYS and the deprecated REQUEUE returns
  * EINVAL.  Wake and requeue counts are reported in td->td_retval[0].
  *
  * Returns 0 or an errno value.
  */
 int
 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
 {
 	int clockrt, nrwake, op_ret, ret, val;
 	struct linux_emuldata *em;
 	struct waiting_proc *wp;
 	struct futex *f, *f2;
 	int error;
 	uint32_t flags;
 
 	LIN_SDT_PROBE2(futex, linux_sys_futex, entry, td, args);
 
 	/* Without the private flag the futex key is built as shared. */
 	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
 		flags = 0;
 		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
 	} else
 		flags = FUTEX_SHARED;
 
 	/*
 	 * Currently support for switching between CLOCK_MONOTONIC and
 	 * CLOCK_REALTIME is not present. However Linux forbids the use of
 	 * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and
 	 * FUTEX_WAIT_REQUEUE_PI.
 	 */
 	clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
 	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;
 	if (clockrt && args->op != LINUX_FUTEX_WAIT_BITSET &&
 		args->op != LINUX_FUTEX_WAIT_REQUEUE_PI) {
 		LIN_SDT_PROBE0(futex, linux_sys_futex,
 		    unimplemented_clockswitch);
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 	}
 
 	error = 0;
 	f = f2 = NULL;
 
 	switch (args->op) {
 	case LINUX_FUTEX_WAIT:
 		args->val3 = FUTEX_BITSET_MATCH_ANY;
 		/* FALLTHROUGH */
 
 	case LINUX_FUTEX_WAIT_BITSET:
 		LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wait, args->uaddr,
 		    args->val, args->val3);
 		LINUX_CTR3(sys_futex, "WAIT uaddr %p val %d val3 %d",
 		    args->uaddr, args->val, args->val3);
 
 		error = futex_get(args->uaddr, &wp, &f,
 		    flags | FUTEX_CREATE_WP);
 		if (error) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 
 		/* Re-read the user word with the futex locked. */
 		error = copyin(args->uaddr, &val, sizeof(val));
 		if (error) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
 			    error);
 			LINUX_CTR1(sys_futex, "WAIT copyin failed %d",
 			    error);
 			futex_put(f, wp);
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 		if (val != args->val) {
 			LIN_SDT_PROBE4(futex, linux_sys_futex,
 			    debug_wait_value_neq, args->uaddr, args->val, val,
 			    args->val3);
 			LINUX_CTR4(sys_futex,
 			    "WAIT uaddr %p val %d != uval %d val3 %d",
 			    args->uaddr, args->val, val, args->val3);
 			futex_put(f, wp);
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return,
 			    EWOULDBLOCK);
 			return (EWOULDBLOCK);
 		}
 
 		/* futex_wait() releases f and wp on every path. */
 		error = futex_wait(f, wp, args->timeout, args->val3);
 		break;
 
 	case LINUX_FUTEX_WAKE:
 		args->val3 = FUTEX_BITSET_MATCH_ANY;
 		/* FALLTHROUGH */
 
 	case LINUX_FUTEX_WAKE_BITSET:
 		LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wake, args->uaddr,
 		    args->val, args->val3);
 		LINUX_CTR3(sys_futex, "WAKE uaddr %p val % d val3 %d",
 		    args->uaddr, args->val, args->val3);
 
 		/*
 		 * An empty bitset is an error.  Reject it here: futex_wake()
 		 * would hand EINVAL back through its count return value and
 		 * it would end up in td_retval[0] as a bogus wake count.
 		 */
 		if (args->val3 == 0) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
 			return (EINVAL);
 		}
 
 		error = futex_get(args->uaddr, NULL, &f,
 		    flags | FUTEX_DONTCREATE);
 		if (error) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 
 		/* No futex for this address means nobody is waiting. */
 		if (f == NULL) {
 			td->td_retval[0] = 0;
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 		td->td_retval[0] = futex_wake(f, args->val, args->val3);
 		futex_put(f, NULL);
 		break;
 
 	case LINUX_FUTEX_CMP_REQUEUE:
 		LIN_SDT_PROBE5(futex, linux_sys_futex, debug_cmp_requeue,
 		    args->uaddr, args->val, args->val3, args->uaddr2,
 		    args->timeout);
 		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
 		    "val %d val3 %d uaddr2 %p val2 %d",
 		    args->uaddr, args->val, args->val3, args->uaddr2,
 		    (int)(unsigned long)args->timeout);
 
 		/*
 		 * Linux allows this, we would not, it is an incorrect
 		 * usage of declared ABI, so return EINVAL.
 		 */
 		if (args->uaddr == args->uaddr2) {
 			LIN_SDT_PROBE0(futex, linux_sys_futex,
 			    invalid_cmp_requeue_use);
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
 			return (EINVAL);
 		}
 
 		error = futex_get(args->uaddr, NULL, &f, flags);
 		if (error) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 
 		/*
 		 * To avoid deadlocks return EINVAL if second futex
 		 * exists at this time.
 		 *
 		 * Glibc fall back to FUTEX_WAKE in case of any error
 		 * returned by FUTEX_CMP_REQUEUE.
 		 */
 		error = futex_get(args->uaddr2, NULL, &f2,
 		    flags | FUTEX_DONTEXISTS);
 		if (error) {
 			futex_put(f, NULL);
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 		error = copyin(args->uaddr, &val, sizeof(val));
 		if (error) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
 			    error);
 			LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d",
 			    error);
 			futex_put(f2, NULL);
 			futex_put(f, NULL);
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 		/* The expected value for CMP_REQUEUE is passed in val3. */
 		if (val != args->val3) {
 			LIN_SDT_PROBE2(futex, linux_sys_futex,
 			    debug_cmp_requeue_value_neq, args->val3, val);
 			LINUX_CTR2(sys_futex, "CMP_REQUEUE val %d != uval %d",
 			    args->val3, val);
 			futex_put(f2, NULL);
 			futex_put(f, NULL);
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EAGAIN);
 			return (EAGAIN);
 		}
 
 		/* For this op the timeout argument carries the requeue limit. */
 		nrwake = (int)(unsigned long)args->timeout;
 		td->td_retval[0] = futex_requeue(f, args->val, f2, nrwake);
 		futex_put(f2, NULL);
 		futex_put(f, NULL);
 		break;
 
 	case LINUX_FUTEX_WAKE_OP:
 		LIN_SDT_PROBE5(futex, linux_sys_futex, debug_wake_op,
 		    args->uaddr, args->op, args->val, args->uaddr2, args->val3);
 		LINUX_CTR5(sys_futex, "WAKE_OP "
 		    "uaddr %p op %d val %x uaddr2 %p val3 %x",
 		    args->uaddr, args->op, args->val,
 		    args->uaddr2, args->val3);
 
 		error = futex_get(args->uaddr, NULL, &f, flags);
 		if (error) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 
 		/* With identical addresses f2 stays NULL and f serves both. */
 		if (args->uaddr != args->uaddr2)
 			error = futex_get(args->uaddr2, NULL, &f2, flags);
 		if (error) {
 			futex_put(f, NULL);
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 
 		/*
 		 * This function returns positive number as results and
 		 * negative as errors
 		 */
 		op_ret = futex_atomic_op(td, args->val3, args->uaddr2);
 
 		if (op_ret < 0) {
 			/* XXX: We don't handle the EFAULT yet. */
 			if (op_ret == -EFAULT) {
 				LIN_SDT_PROBE0(futex, linux_sys_futex,
 				    unhandled_efault);
 			}
 			if (f2 != NULL)
 				futex_put(f2, NULL);
 			futex_put(f, NULL);
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return,
 			    -op_ret);
 			return (-op_ret);
 		}
 
 		/*
 		 * val3 holds the encoded operation for WAKE_OP, not a wake
 		 * bitset, so wake waiters regardless of their bitset.
 		 */
 		ret = futex_wake(f, args->val, FUTEX_BITSET_MATCH_ANY);
 
 		if (op_ret > 0) {
 			/* Comparison was true: also wake on the second word. */
 			nrwake = (int)(unsigned long)args->timeout;
 
 			if (f2 != NULL)
 				ret += futex_wake(f2, nrwake,
 				    FUTEX_BITSET_MATCH_ANY);
 			else
 				ret += futex_wake(f, nrwake,
 				    FUTEX_BITSET_MATCH_ANY);
 		}
 		if (f2 != NULL)
 			futex_put(f2, NULL);
 		futex_put(f, NULL);
 		td->td_retval[0] = ret;
 		break;
 
 	case LINUX_FUTEX_LOCK_PI:
 		/* not yet implemented */
 		linux_msg(td,
 			  "linux_sys_futex: "
 			  "op LINUX_FUTEX_LOCK_PI not implemented\n");
 		LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_lock_pi);
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 
 	case LINUX_FUTEX_UNLOCK_PI:
 		/* not yet implemented */
 		linux_msg(td,
 			  "linux_sys_futex: "
 			  "op LINUX_FUTEX_UNLOCK_PI not implemented\n");
 		LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_unlock_pi);
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 
 	case LINUX_FUTEX_TRYLOCK_PI:
 		/* not yet implemented */
 		linux_msg(td,
 			  "linux_sys_futex: "
 			  "op LINUX_FUTEX_TRYLOCK_PI not implemented\n");
 		LIN_SDT_PROBE0(futex, linux_sys_futex,
 		    unimplemented_trylock_pi);
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 
 	case LINUX_FUTEX_REQUEUE:
 
 		/*
 		 * Glibc does not use this operation since version 2.3.3,
 		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
 		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
 		 * FUTEX_REQUEUE returned EINVAL.
 		 */
 		em = em_find(td->td_proc, EMUL_DONTLOCK);
 		/* Print the warning only once per emuldata. */
 		if ((em->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
 			linux_msg(td,
 				  "linux_sys_futex: "
 				  "unsupported futex_requeue op\n");
 			em->flags |= LINUX_XDEPR_REQUEUEOP;
 			LIN_SDT_PROBE0(futex, linux_sys_futex,
 			    deprecated_requeue);
 		}
 
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
 		return (EINVAL);
 
 	case LINUX_FUTEX_WAIT_REQUEUE_PI:
 		/* not yet implemented */
 		linux_msg(td,
 			  "linux_sys_futex: "
 			  "op FUTEX_WAIT_REQUEUE_PI not implemented\n");
 		LIN_SDT_PROBE0(futex, linux_sys_futex,
 		    unimplemented_wait_requeue_pi);
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 
 	case LINUX_FUTEX_CMP_REQUEUE_PI:
 		/* not yet implemented */
 		linux_msg(td,
 			    "linux_sys_futex: "
 			    "op LINUX_FUTEX_CMP_REQUEUE_PI not implemented\n");
 		LIN_SDT_PROBE0(futex, linux_sys_futex,
 		    unimplemented_cmp_requeue_pi);
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 
 	default:
 		linux_msg(td,
 			  "linux_sys_futex: unknown op %d\n", args->op);
 		LIN_SDT_PROBE1(futex, linux_sys_futex, unknown_operation,
 		    args->op);
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 	}
 
 	LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 	return (error);
 }
 
 /*
  * Linux set_robust_list(2): remember the user-space address of the
  * calling process's robust futex list head.
  *
  * The length supplied by the caller must equal
  * sizeof(struct linux_robust_list_head), otherwise EINVAL is returned.
  * The address is stored in the process's emuldata; em_find() is called
  * with EMUL_DOLOCK and emul_lock is released once the field is written.
  *
  * Returns 0 on success or EINVAL on a size mismatch.
  */
 int
 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
 {
 	struct linux_emuldata *emd;
 	int rc;
 
 	LIN_SDT_PROBE2(futex, linux_set_robust_list, entry, td, args);
 
 	if (args->len == sizeof(struct linux_robust_list_head)) {
 		emd = em_find(td->td_proc, EMUL_DOLOCK);
 		emd->robust_futexes = args->head;
 		EMUL_UNLOCK(&emul_lock);
 		rc = 0;
 	} else {
 		LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error);
 		rc = EINVAL;
 	}
 
 	LIN_SDT_PROBE1(futex, linux_set_robust_list, return, rc);
 	return (rc);
 }
 
 int
 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
 {
 	struct linux_emuldata *em;
 	struct linux_robust_list_head *head;
 	l_size_t len = sizeof(struct linux_robust_list_head);
 	int error = 0;
 
 	LIN_SDT_PROBE2(futex, linux_get_robust_list, entry, td, args);
 
 	if (!args->pid) {
 		em = em_find(td->td_proc, EMUL_DONTLOCK);
 		head = em->robust_futexes;
 	} else {
 		struct proc *p;
 
 		p = pfind(args->pid);
 		if (p == NULL) {
 			LIN_SDT_PROBE1(futex, linux_get_robust_list, return,
 			    ESRCH);
 			return (ESRCH);
 		}
 
 		em = em_find(p, EMUL_DONTLOCK);
 		/* XXX: ptrace? */
 		if (priv_check(td, PRIV_CRED_SETUID) ||
 		    priv_check(td, PRIV_CRED_SETEUID) ||
 		    p_candebug(td, p)) {
 			PROC_UNLOCK(p);
 
 			LIN_SDT_PROBE1(futex, linux_get_robust_list, return,
 			    EPERM);
 			return (EPERM);
 		}
 		head = em->robust_futexes;
 
 		PROC_UNLOCK(p);
 	}
 
 	error = copyout(&len, args->len, sizeof(l_size_t));
 	if (error) {
 		LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error,
 		    error);
 		LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EFAULT);
 		return (EFAULT);
 	}
 
 	error = copyout(head, args->head, sizeof(struct linux_robust_list_head));
 	if (error) {
 		LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error,
 		    error);
 	}
 
 	LIN_SDT_PROBE1(futex, linux_get_robust_list, return, error);
 	return (error);
 }
 
 static int
 handle_futex_death(struct proc *p, uint32_t *uaddr, int pi)
 {
 	uint32_t uval, nval, mval;
 	struct futex *f;
 	int error;
 
 	LIN_SDT_PROBE3(futex, handle_futex_death, entry, p, uaddr, pi);
 
 retry:
 	error = copyin(uaddr, &uval, 4);
 	if (error) {
 		LIN_SDT_PROBE1(futex, handle_futex_death, copyin_error, error);
 		LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT);
 		return (EFAULT);
 	}
 	if ((uval & FUTEX_TID_MASK) == p->p_pid) {
 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
 		nval = casuword32(uaddr, uval, mval);
 
 		if (nval == -1) {
 			LIN_SDT_PROBE1(futex, handle_futex_death, return,
 			    EFAULT);
 			return (EFAULT);
 		}
 
 		if (nval != uval)
 			goto retry;
 
 		if (!pi && (uval & FUTEX_WAITERS)) {
 			error = futex_get(uaddr, NULL, &f,
 			    FUTEX_DONTCREATE | FUTEX_SHARED);
 			if (error) {
 				LIN_SDT_PROBE1(futex, handle_futex_death,
 				    return, error);
 				return (error);
 			}
 			if (f != NULL) {
 				futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY);
 				futex_put(f, NULL);
 			}
 		}
 	}
 
 	LIN_SDT_PROBE1(futex, handle_futex_death, return, 0);
 	return (0);
 }
 
 static int
 fetch_robust_entry(struct linux_robust_list **entry,
     struct linux_robust_list **head, int *pi)
 {
 	l_ulong uentry;
 	int error;
 
 	LIN_SDT_PROBE3(futex, fetch_robust_entry, entry, entry, head, pi);
 
 	error = copyin((const void *)head, &uentry, sizeof(l_ulong));
 	if (error) {
 		LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, error);
 		LIN_SDT_PROBE1(futex, fetch_robust_entry, return, EFAULT);
 		return (EFAULT);
 	}
 
 	*entry = (void *)(uentry & ~1UL);
 	*pi = uentry & 1;
 
 	LIN_SDT_PROBE1(futex, fetch_robust_entry, return, 0);
 	return (0);
 }
 
 /*
  * This walks the list of robust futexes releasing them.
  *
  * The list head registered for process p is read from user space and every
  * entry, plus the pending entry if any, is handed to handle_futex_death().
  * The walk stops silently on the first user-memory access failure and
  * visits at most 2048 entries.
  */
 void
 release_futexes(struct proc *p)
 {
 	struct linux_robust_list_head *head = NULL;
 	struct linux_robust_list *entry, *next_entry, *pending;
 	unsigned int limit = 2048;
 	/* int, matching fetch_robust_entry()'s int * and handle_futex_death(). */
 	int pi, next_pi, pip;
 	struct linux_emuldata *em;
 	l_long futex_offset;
 	int rc, error;
 
 	LIN_SDT_PROBE1(futex, release_futexes, entry, p);
 
 	em = em_find(p, EMUL_DONTLOCK);
 	head = em->robust_futexes;
 
 	/* Nothing was registered through linux_set_robust_list(). */
 	if (head == NULL) {
 		LIN_SDT_PROBE0(futex, release_futexes, return);
 		return;
 	}
 
 	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) {
 		LIN_SDT_PROBE0(futex, release_futexes, return);
 		return;
 	}
 
 	error = copyin(&head->futex_offset, &futex_offset,
 	    sizeof(futex_offset));
 	if (error) {
 		LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error);
 		LIN_SDT_PROBE0(futex, release_futexes, return);
 		return;
 	}
 
 	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) {
 		LIN_SDT_PROBE0(futex, release_futexes, return);
 		return;
 	}
 
 	while (entry != &head->list) {
 		/*
 		 * Fetch the next link before handling the current entry;
 		 * a fetch failure is only acted upon after the current
 		 * entry has been processed.
 		 */
 		rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi);
 
 		/* The pending entry is handled once, after the loop. */
 		if (entry != pending)
 			if (handle_futex_death(p,
 			    (uint32_t *)((caddr_t)entry + futex_offset), pi)) {
 				LIN_SDT_PROBE0(futex, release_futexes, return);
 				return;
 			}
 		if (rc) {
 			LIN_SDT_PROBE0(futex, release_futexes, return);
 			return;
 		}
 
 		entry = next_entry;
 		pi = next_pi;
 
 		/* Bound the walk in case the user-space list is circular. */
 		if (!--limit)
 			break;
 
 		sched_relinquish(curthread);
 	}
 
 	if (pending)
 		handle_futex_death(p, (uint32_t *)((caddr_t)pending + futex_offset), pip);
 
 	LIN_SDT_PROBE0(futex, release_futexes, return);
 }
Index: head/sys/compat/linux/linux_mib.c
===================================================================
--- head/sys/compat/linux/linux_mib.c	(revision 258540)
+++ head/sys/compat/linux/linux_mib.c	(revision 258541)
@@ -1,858 +1,857 @@
 /*-
  * Copyright (c) 1999 Marcel Moolenaar
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer 
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #else
 #include <machine/../linux/linux.h>
 #endif
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_mib.h>
 #include <compat/linux/linux_misc.h>
 
 /* DTrace init */
 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
 
 /**
  * DTrace probes in this module.  Note that the linux_get_prison probes are
  * the ones fired by linux_find_prison().
  */
 LIN_SDT_PROBE_DEFINE0(mib, linux_sysctl_osname, entry);
 LIN_SDT_PROBE_DEFINE1(mib, linux_sysctl_osname, sysctl_string_error, "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_sysctl_osname, return, "int");
 
 LIN_SDT_PROBE_DEFINE0(mib, linux_sysctl_osrelease, entry);
 LIN_SDT_PROBE_DEFINE1(mib, linux_sysctl_osrelease, sysctl_string_error, "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_sysctl_osrelease, return, "int");
 LIN_SDT_PROBE_DEFINE0(mib, linux_sysctl_oss_version, entry);
 LIN_SDT_PROBE_DEFINE1(mib, linux_sysctl_oss_version, sysctl_string_error,
     "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_sysctl_oss_version, return, "int");
 LIN_SDT_PROBE_DEFINE2(mib, linux_map_osrel, entry, "char *", "int *");
 LIN_SDT_PROBE_DEFINE1(mib, linux_map_osrel, return, "int");
 LIN_SDT_PROBE_DEFINE2(mib, linux_get_prison, entry, "struct prison *",
     "struct prison **");
 LIN_SDT_PROBE_DEFINE1(mib, linux_get_prison, return, "struct linux_prison *");
 LIN_SDT_PROBE_DEFINE2(mib, linux_alloc_prison, entry, "struct prison *",
     "struct linux_prison **");
 LIN_SDT_PROBE_DEFINE1(mib, linux_alloc_prison, return, "int");
 LIN_SDT_PROBE_DEFINE2(mib, linux_prison_create, entry, "void *", "void *");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_create, vfs_copyopt_error, "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_create, return, "int");
 LIN_SDT_PROBE_DEFINE2(mib, linux_prison_check, entry, "void *", "void *");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_check, vfs_copyopt_error, "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_check, vfs_getopt_error, "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_check, return, "int");
 LIN_SDT_PROBE_DEFINE2(mib, linux_prison_set, entry, "void *", "void *");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_set, vfs_copyopt_error, "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_set, vfs_getopt_error, "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_set, return, "int");
 LIN_SDT_PROBE_DEFINE2(mib, linux_prison_get, entry, "void *", "void *");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_get, vfs_setopt_error, "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_get, vfs_setopts_error, "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_get, return, "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_prison_destructor, entry, "void *");
 LIN_SDT_PROBE_DEFINE0(mib, linux_prison_destructor, return);
 LIN_SDT_PROBE_DEFINE0(mib, linux_osd_jail_register, entry);
 LIN_SDT_PROBE_DEFINE0(mib, linux_osd_jail_register, return);
 LIN_SDT_PROBE_DEFINE0(mib, linux_osd_jail_deregister, entry);
 LIN_SDT_PROBE_DEFINE0(mib, linux_osd_jail_deregister, return);
 LIN_SDT_PROBE_DEFINE2(mib, linux_get_osname, entry, "struct thread *",
     "char *");
 LIN_SDT_PROBE_DEFINE0(mib, linux_get_osname, return);
 LIN_SDT_PROBE_DEFINE2(mib, linux_set_osname, entry, "struct thread *",
     "char *");
 LIN_SDT_PROBE_DEFINE1(mib, linux_set_osname, return, "int");
 LIN_SDT_PROBE_DEFINE2(mib, linux_get_osrelease, entry, "struct thread *",
     "char *");
 LIN_SDT_PROBE_DEFINE0(mib, linux_get_osrelease, return);
 LIN_SDT_PROBE_DEFINE1(mib, linux_kernver, entry, "struct thread *");
 LIN_SDT_PROBE_DEFINE1(mib, linux_kernver, return, "int");
 LIN_SDT_PROBE_DEFINE2(mib, linux_set_osrelease, entry, "struct thread *",
     "char *");
 LIN_SDT_PROBE_DEFINE1(mib, linux_set_osrelease, return, "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_get_oss_version, entry, "struct thread *");
 LIN_SDT_PROBE_DEFINE1(mib, linux_get_oss_version, return, "int");
 
 LIN_SDT_PROBE_DEFINE2(mib, linux_set_oss_version, entry, "struct thread *",
     "int");
 LIN_SDT_PROBE_DEFINE1(mib, linux_set_oss_version, return, "int");
 
 /*
  * Linux emulation parameters kept per prison (see linux_find_prison()).
  */
 struct linux_prison {
 	char	pr_osname[LINUX_MAX_UTSNAME];	/* copied out by linux_get_osname() */
 	char	pr_osrelease[LINUX_MAX_UTSNAME]; /* copied out by linux_get_osrelease() */
 	int	pr_oss_version;			/* returned by linux_get_oss_version() */
 	int	pr_osrel;	/* pr_osrelease as mapped by linux_map_osrel() */
 };
 
 /*
  * Linux info used for prison0 (see linux_find_prison()).  pr_osrel is the
  * linux_map_osrel() encoding of "2.6.16": 2 * 1000000 + 6 * 1000 + 16.
  */
 static struct linux_prison lprison0 = {
 	.pr_osname =		"Linux",
 	.pr_osrelease =		"2.6.16",
 	.pr_oss_version =	0x030600,
 	.pr_osrel =		2006016
 };
 
 /*
  * OSD slot obtained from osd_jail_register() in linux_osd_jail_register().
  * A value of zero is treated as "registration failed" by the code below.
  */
 static unsigned linux_osd_jail_slot;
 
 /* Parent node of the osname/osrelease/oss_version sysctls defined below. */
 static SYSCTL_NODE(_compat, OID_AUTO, linux, CTLFLAG_RW, 0,
 	    "Linux mode");
 
 /* Setters used by the sysctl handlers; defined further down in this file. */
 static int	linux_set_osname(struct thread *td, char *osname);
 static int	linux_set_osrelease(struct thread *td, char *osrelease);
 static int	linux_set_oss_version(struct thread *td, int oss_version);
 
 /*
  * Handler for the compat.linux.osname sysctl.
  *
  * Reports the OS name that applies to the requesting thread and, when a new
  * value was supplied (req->newptr != NULL), stores it via
  * linux_set_osname().  Returns 0 or the error from sysctl_handle_string() /
  * linux_set_osname().
  */
 static int
 linux_sysctl_osname(SYSCTL_HANDLER_ARGS)
 {
 	char osname[LINUX_MAX_UTSNAME];
 	int error;
 
 	LIN_SDT_PROBE0(mib, linux_sysctl_osname, entry);
 
 	linux_get_osname(req->td, osname);
 	error = sysctl_handle_string(oidp, osname, LINUX_MAX_UTSNAME, req);
 	if (error != 0) {
 		LIN_SDT_PROBE1(mib, linux_sysctl_osname, sysctl_string_error,
 		    error);
 		LIN_SDT_PROBE1(mib, linux_sysctl_osname, return, error);
 		return (error);
 	}
 	if (req->newptr == NULL) {
 		/* Plain read: not an error, so the error probe stays quiet. */
 		LIN_SDT_PROBE1(mib, linux_sysctl_osname, return, 0);
 		return (0);
 	}
 	error = linux_set_osname(req->td, osname);
 
 	LIN_SDT_PROBE1(mib, linux_sysctl_osname, return, error);
 	return (error);
 }
 
 SYSCTL_PROC(_compat_linux, OID_AUTO, osname,
 	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
 	    0, 0, linux_sysctl_osname, "A",
 	    "Linux kernel OS name");
 
 /*
  * Handler for the compat.linux.osrelease sysctl.
  *
  * Reports the OS release that applies to the requesting thread and, when a
  * new value was supplied (req->newptr != NULL), stores it via
  * linux_set_osrelease().  Returns 0 or the error from
  * sysctl_handle_string() / linux_set_osrelease().
  */
 static int
 linux_sysctl_osrelease(SYSCTL_HANDLER_ARGS)
 {
 	char osrelease[LINUX_MAX_UTSNAME];
 	int error;
 
 	LIN_SDT_PROBE0(mib, linux_sysctl_osrelease, entry);
 
 	linux_get_osrelease(req->td, osrelease);
 	error = sysctl_handle_string(oidp, osrelease, LINUX_MAX_UTSNAME, req);
 	if (error != 0) {
 		LIN_SDT_PROBE1(mib, linux_sysctl_osrelease, sysctl_string_error,
 		    error);
 		LIN_SDT_PROBE1(mib, linux_sysctl_osrelease, return, error);
 		return (error);
 	}
 	if (req->newptr == NULL) {
 		/* Plain read: not an error, so the error probe stays quiet. */
 		LIN_SDT_PROBE1(mib, linux_sysctl_osrelease, return, 0);
 		return (0);
 	}
 	error = linux_set_osrelease(req->td, osrelease);
 
 	LIN_SDT_PROBE1(mib, linux_sysctl_osrelease, return, error);
 	return (error);
 }
 
 SYSCTL_PROC(_compat_linux, OID_AUTO, osrelease,
 	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
 	    0, 0, linux_sysctl_osrelease, "A",
 	    "Linux kernel OS release");
 
 /*
  * Handler for the compat.linux.oss_version sysctl.
  *
  * Reports the OSS version that applies to the requesting thread and, when a
  * new value was supplied (req->newptr != NULL), stores it via
  * linux_set_oss_version().  Returns 0 or the error from sysctl_handle_int()
  * / linux_set_oss_version().
  */
 static int
 linux_sysctl_oss_version(SYSCTL_HANDLER_ARGS)
 {
 	int oss_version;
 	int error;
 
 	LIN_SDT_PROBE0(mib, linux_sysctl_oss_version, entry);
 
 	oss_version = linux_get_oss_version(req->td);
 	error = sysctl_handle_int(oidp, &oss_version, 0, req);
 	if (error != 0) {
 		/* The probe keeps its historical "string" name. */
 		LIN_SDT_PROBE1(mib, linux_sysctl_oss_version,
 		    sysctl_string_error, error);
 		LIN_SDT_PROBE1(mib, linux_sysctl_oss_version, return, error);
 		return (error);
 	}
 	if (req->newptr == NULL) {
 		/* Plain read: not an error, so the error probe stays quiet. */
 		LIN_SDT_PROBE1(mib, linux_sysctl_oss_version, return, 0);
 		return (0);
 	}
 	error = linux_set_oss_version(req->td, oss_version);
 
 	LIN_SDT_PROBE1(mib, linux_sysctl_oss_version, return, error);
 	return (error);
 }
 
 SYSCTL_PROC(_compat_linux, OID_AUTO, oss_version,
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
 	    0, 0, linux_sysctl_oss_version, "I",
 	    "Linux OSS version");
 
 /*
  * Convert a dotted "A.B.C" release string into the integer
  * A * 1000000 + B * 1000 + C and store it in *osrel.
  *
  * Exactly three decimal fields separated by '.' are accepted, with nothing
  * after the third one, and the resulting value must be at least 1000000.
  * Returns 0 on success and EINVAL otherwise; *osrel is only written on
  * success.
  */
 static int
 linux_map_osrel(char *osrelease, int *osrel)
 {
 	char *field, *stop, *end;
 	int major, minor, patch, packed;
 
 	LIN_SDT_PROBE2(mib, linux_map_osrel, entry, osrelease, osrel);
 
 	end = osrelease + strlen(osrelease);
 
 	/* First field: must be followed by '.' and at least one more char. */
 	field = osrelease;
 	major = strtol(field, &stop, 10);
 	if (field == stop || stop + 1 >= end || *stop != '.')
 		goto bad;
 
 	/* Second field: same requirements as the first. */
 	field = stop + 1;
 	minor = strtol(field, &stop, 10);
 	if (field == stop || stop + 1 >= end || *stop != '.')
 		goto bad;
 
 	/* Third field: must run to the end of the string. */
 	field = stop + 1;
 	patch = strtol(field, &stop, 10);
 	if (field == stop || stop != end)
 		goto bad;
 
 	packed = major * 1000000 + minor * 1000 + patch;
 	if (packed < 1000000)
 		goto bad;
 
 	*osrel = packed;
 
 	LIN_SDT_PROBE1(mib, linux_map_osrel, return, 0);
 	return (0);
 
  bad:
 	LIN_SDT_PROBE1(mib, linux_map_osrel, return, EINVAL);
 	return (EINVAL);
 }
 
 /*
  * Starting at spr, walk up the prison hierarchy until a prison carrying
  * Linux info is found (prison0 always does, through lprison0).
  *
  * Returns the Linux info and stores the owning prison in *prp; that
  * prison's mutex is held on return and must be released by the caller.
  */
 static struct linux_prison *
 linux_find_prison(struct prison *spr, struct prison **prp)
 {
 	struct prison *cur;
 	struct linux_prison *info;
 
 	LIN_SDT_PROBE2(mib, linux_get_prison, entry, spr, prp);
 
 	/* In case osd_register failed, only prison0 can be consulted. */
 	cur = (linux_osd_jail_slot == 0) ? &prison0 : spr;
 
 	for (;;) {
 		mtx_lock(&cur->pr_mtx);
 		if (cur == &prison0)
 			info = &lprison0;
 		else
 			info = osd_jail_get(cur, linux_osd_jail_slot);
 		if (info != NULL)
 			break;
 		mtx_unlock(&cur->pr_mtx);
 		cur = cur->pr_parent;
 	}
 	*prp = cur;
 
 	LIN_SDT_PROBE1(mib, linux_get_prison, return, info);
 	return (info);
 }
 
 /*
  * Ensure a prison has its own Linux info.  If lprp is non-null, point it to
  * the Linux info and lock the prison.
  *
  * Returns 0 on success or the error from osd_jail_set().  When lprp is
  * non-null, pr's mutex is held on return on every path that locked it
  * (including the osd_jail_set() failure path, where *lprp is set to NULL),
  * so the caller is responsible for unlocking it.
  */
 static int
 linux_alloc_prison(struct prison *pr, struct linux_prison **lprp)
 {
 	struct prison *ppr;
 	struct linux_prison *lpr, *nlpr;
 	int error;
 
 	LIN_SDT_PROBE2(mib, linux_alloc_prison, entry, pr, lprp);
 
 	/* If this prison already has Linux info, return that. */
 	error = 0;
 	/* linux_find_prison() returns with ppr's mutex held. */
 	lpr = linux_find_prison(pr, &ppr);
 	if (ppr == pr)
 		goto done;
 	/*
 	 * Allocate a new info record.  Then check again, in case something
 	 * changed during the allocation.
 	 */
 	/* The ancestor's mutex is not held across the M_WAITOK allocation. */
 	mtx_unlock(&ppr->pr_mtx);
 	nlpr = malloc(sizeof(struct linux_prison), M_PRISON, M_WAITOK);
 	lpr = linux_find_prison(pr, &ppr);
 	if (ppr == pr) {
 		/* pr gained its own record meanwhile; ours is not needed. */
 		free(nlpr, M_PRISON);
 		goto done;
 	}
 	/* Inherit the initial values from the ancestor. */
 	/* From here both pr and the ancestor ppr are locked. */
 	mtx_lock(&pr->pr_mtx);
 	error = osd_jail_set(pr, linux_osd_jail_slot, nlpr);
 	if (error == 0) {
 		bcopy(lpr, nlpr, sizeof(*lpr));
 		lpr = nlpr;
 	} else {
 		free(nlpr, M_PRISON);
 		lpr = NULL;
 	}
 	/* Only the ancestor is released; pr stays locked until done. */
 	mtx_unlock(&ppr->pr_mtx);
  done:
 	if (lprp != NULL)
 		*lprp = lpr;
 	else
 		mtx_unlock(&pr->pr_mtx);
 
 	LIN_SDT_PROBE1(mib, linux_alloc_prison, return, error);
 	return (error);
 }
 
 /*
  * Jail OSD methods for Linux prison data.
  */
 static int
 linux_prison_create(void *obj, void *data)
 {
 	struct prison *pr = obj;
 	struct vfsoptlist *opts = data;
 	int jsys, error;
 
 	LIN_SDT_PROBE2(mib, linux_prison_create, entry, obj, data);
 
 	error = vfs_copyopt(opts, "linux", &jsys, sizeof(jsys));
 	if (error != 0) {
 		LIN_SDT_PROBE1(mib, linux_prison_create, vfs_copyopt_error,
 		    error);
 	} else if (jsys == JAIL_SYS_INHERIT) {
 		LIN_SDT_PROBE1(mib, linux_prison_create, return, 0);
 		return (0);
 	}
 	/*
 	 * Inherit a prison's initial values from its parent
 	 * (different from JAIL_SYS_INHERIT which also inherits changes).
 	 */
 	error = linux_alloc_prison(pr, NULL);
 
 	LIN_SDT_PROBE1(mib, linux_prison_create, return, error);
 	return (error);
 }
 
 /*
  * PR_METHOD_CHECK: validate the Linux jail parameters found in the option
  * list without applying them.
  *
  * A missing option (ENOENT) is accepted.  Returns 0 when everything that is
  * present is valid, EINVAL for a bad "linux" value or an unterminated/empty
  * string, ENAMETOOLONG for an over-long string, the linux_map_osrel() error
  * for a malformed release, or the option-lookup error otherwise.
  */
 static int
 linux_prison_check(void *obj __unused, void *data)
 {
 	struct vfsoptlist *opts = data;
 	char *osname, *osrelease;
 	int error, jsys, len, osrel, oss_version;
 
 	LIN_SDT_PROBE2(mib, linux_prison_check, entry, obj, data);
 
 	/* "linux": only JAIL_SYS_NEW and JAIL_SYS_INHERIT are acceptable. */
 	error = vfs_copyopt(opts, "linux", &jsys, sizeof(jsys));
 	if (error != 0) {
 		LIN_SDT_PROBE1(mib, linux_prison_check, vfs_copyopt_error,
 		    error);
 		if (error != ENOENT)
 			goto out;
 	} else if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* "linux.osname": a NUL-terminated string that fits the buffer. */
 	error = vfs_getopt(opts, "linux.osname", (void **)&osname, &len);
 	if (error != 0) {
 		LIN_SDT_PROBE1(mib, linux_prison_check, vfs_getopt_error,
 		    error);
 		if (error != ENOENT)
 			goto out;
 	} else {
 		if (len == 0 || osname[len - 1] != '\0') {
 			error = EINVAL;
 			goto out;
 		}
 		if (len > LINUX_MAX_UTSNAME) {
 			vfs_opterror(opts, "linux.osname too long");
 			error = ENAMETOOLONG;
 			goto out;
 		}
 	}
 
 	/* "linux.osrelease": as above, and it must parse as a release. */
 	error = vfs_getopt(opts, "linux.osrelease", (void **)&osrelease, &len);
 	if (error != 0) {
 		LIN_SDT_PROBE1(mib, linux_prison_check, vfs_getopt_error,
 		    error);
 		if (error != ENOENT)
 			goto out;
 	} else {
 		if (len == 0 || osrelease[len - 1] != '\0') {
 			error = EINVAL;
 			goto out;
 		}
 		if (len > LINUX_MAX_UTSNAME) {
 			vfs_opterror(opts, "linux.osrelease too long");
 			error = ENAMETOOLONG;
 			goto out;
 		}
 		error = linux_map_osrel(osrelease, &osrel);
 		if (error != 0) {
 			vfs_opterror(opts, "linux.osrelease format error");
 			goto out;
 		}
 	}
 
 	/* "linux.oss_version": only has to be retrievable when present. */
 	error = vfs_copyopt(opts, "linux.oss_version", &oss_version,
 	    sizeof(oss_version));
 	if (error != 0) {
 		LIN_SDT_PROBE1(mib, linux_prison_check, vfs_copyopt_error,
 		    error);
 	}
 	if (error == ENOENT)
 		error = 0;
 
  out:
 	LIN_SDT_PROBE1(mib, linux_prison_check, return, error);
 	return (error);
 }
 
 /*
  * PR_METHOD_SET: apply the Linux jail parameters found in the option list
  * to prison obj.
  *
  * Any of "linux.osname", "linux.osrelease" or "linux.oss_version" whose
  * lookup does not return ENOENT forces jsys to JAIL_SYS_NEW.  Returns 0, or
  * the error from linux_alloc_prison() / linux_map_osrel().
  */
 static int
 linux_prison_set(void *obj, void *data)
 {
 	struct linux_prison *lpr;
 	struct prison *pr = obj;
 	struct vfsoptlist *opts = data;
 	char *osname, *osrelease;
 	int error, gotversion, jsys, len, oss_version;
 
 	LIN_SDT_PROBE2(mib, linux_prison_set, entry, obj, data);
 
 	/* Set the parameters, which should be correct. */
 	error = vfs_copyopt(opts, "linux", &jsys, sizeof(jsys));
 	if (error != 0)
 		LIN_SDT_PROBE1(mib, linux_prison_set, vfs_copyopt_error, error);
 	/* -1 matches no case of the switch below, i.e. "leave unchanged". */
 	if (error == ENOENT)
 		jsys = -1;
 	error = vfs_getopt(opts, "linux.osname", (void **)&osname, &len);
 	if (error != 0)
 		LIN_SDT_PROBE1(mib, linux_prison_set, vfs_getopt_error, error);
 	if (error == ENOENT)
 		osname = NULL;
 	else
 		jsys = JAIL_SYS_NEW;
 	error = vfs_getopt(opts, "linux.osrelease", (void **)&osrelease, &len);
 	if (error != 0)
 		LIN_SDT_PROBE1(mib, linux_prison_set, vfs_getopt_error, error);
 	if (error == ENOENT)
 		osrelease = NULL;
 	else
 		jsys = JAIL_SYS_NEW;
 	error = vfs_copyopt(opts, "linux.oss_version", &oss_version,
 	    sizeof(oss_version));
 	if (error != 0)
 		LIN_SDT_PROBE1(mib, linux_prison_set, vfs_copyopt_error, error);
 	if (error == ENOENT)
 		gotversion = 0;
 	else {
 		gotversion = 1;
 		jsys = JAIL_SYS_NEW;
 	}
 	switch (jsys) {
 	case JAIL_SYS_INHERIT:
 		/* "linux=inherit": inherit the parent's Linux info. */
 		mtx_lock(&pr->pr_mtx);
 		osd_jail_del(pr, linux_osd_jail_slot);
 		mtx_unlock(&pr->pr_mtx);
 		break;
 	case JAIL_SYS_NEW:
 		/*
 		 * "linux=new" or "linux.*":
 		 * the prison gets its own Linux info.
 		 */
 		/* With a non-NULL lprp, pr comes back locked (even on error). */
 		error = linux_alloc_prison(pr, &lpr);
 		if (error) {
 			mtx_unlock(&pr->pr_mtx);
 			LIN_SDT_PROBE1(mib, linux_prison_set, return, error);
 			return (error);
 		}
 		if (osrelease) {
 			/* Update the numeric form first; keep the string on failure. */
 			error = linux_map_osrel(osrelease, &lpr->pr_osrel);
 			if (error) {
 				mtx_unlock(&pr->pr_mtx);
 				LIN_SDT_PROBE1(mib, linux_prison_set, return,
 				    error);
 				return (error);
 			}
 			strlcpy(lpr->pr_osrelease, osrelease,
 			    LINUX_MAX_UTSNAME);
 		}
 		if (osname)
 			strlcpy(lpr->pr_osname, osname, LINUX_MAX_UTSNAME);
 		if (gotversion)
 			lpr->pr_oss_version = oss_version;
 		mtx_unlock(&pr->pr_mtx);
 	}
 
 	LIN_SDT_PROBE1(mib, linux_prison_set, return, 0);
 	return (0);
 }
 
 /*
  * Declarations of the "linux", "linux.osname", "linux.osrelease" and
  * "linux.oss_version" jail parameters, i.e. the option names looked up by
  * the linux_prison_check/set/get methods in this file.
  */
 SYSCTL_JAIL_PARAM_SYS_NODE(linux, CTLFLAG_RW, "Jail Linux parameters");
 SYSCTL_JAIL_PARAM_STRING(_linux, osname, CTLFLAG_RW, LINUX_MAX_UTSNAME,
     "Jail Linux kernel OS name");
 SYSCTL_JAIL_PARAM_STRING(_linux, osrelease, CTLFLAG_RW, LINUX_MAX_UTSNAME,
     "Jail Linux kernel OS release");
 SYSCTL_JAIL_PARAM(_linux, oss_version, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Jail Linux OSS version");
 
 /*
  * PR_METHOD_GET: report the Linux jail parameters of prison obj through
  * the option list.
  *
  * "linux" is reported as JAIL_SYS_NEW when the prison owns its Linux info
  * and as JAIL_SYS_INHERIT otherwise.  An owning prison reports its own
  * strings and version; an inheriting one reports empty strings and zero.
  * Options the caller did not ask for (ENOENT) are skipped.  Returns 0 or
  * the first other error from vfs_setopt()/vfs_setopts().
  */
 static int
 linux_prison_get(void *obj, void *data)
 {
 	struct linux_prison *lpr;
 	struct prison *ppr;
 	struct prison *pr = obj;
 	struct vfsoptlist *opts = data;
 	int error, i;
 
 	static int version0;
 
 	LIN_SDT_PROBE2(mib, linux_prison_get, entry, obj, data);
 
 	/* See if this prison is the one with the Linux info. */
 	lpr = linux_find_prison(pr, &ppr);
 	i = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
 	error = vfs_setopt(opts, "linux", &i, sizeof(i));
 	if (error != 0) {
 		LIN_SDT_PROBE1(mib, linux_prison_get, vfs_setopt_error, error);
 		if (error != ENOENT)
 			goto done;
 	}
 	/*
 	 * Compare against JAIL_SYS_NEW explicitly: JAIL_SYS_INHERIT is
 	 * non-zero as well, so a plain truth test on i would never reach
 	 * the "inheriting" branch below.
 	 */
 	if (i == JAIL_SYS_NEW) {
 		error = vfs_setopts(opts, "linux.osname", lpr->pr_osname);
 		if (error != 0) {
 			LIN_SDT_PROBE1(mib, linux_prison_get, vfs_setopts_error,
 			    error);
 			if (error != ENOENT)
 				goto done;
 		}
 		error = vfs_setopts(opts, "linux.osrelease", lpr->pr_osrelease);
 		if (error != 0) {
 			LIN_SDT_PROBE1(mib, linux_prison_get, vfs_setopts_error,
 			    error);
 			if (error != ENOENT)
 				goto done;
 		}
 		error = vfs_setopt(opts, "linux.oss_version",
 		    &lpr->pr_oss_version, sizeof(lpr->pr_oss_version));
 		if (error != 0) {
 			LIN_SDT_PROBE1(mib, linux_prison_get, vfs_setopt_error,
 			    error);
 			if (error != ENOENT)
 				goto done;
 		}
 	} else {
 		/*
 		 * If this prison is inheriting its Linux info, report
 		 * empty/zero parameters.
 		 */
 		error = vfs_setopts(opts, "linux.osname", "");
 		if (error != 0) {
 			LIN_SDT_PROBE1(mib, linux_prison_get, vfs_setopts_error,
 			    error);
 			if (error != ENOENT)
 				goto done;
 		}
 		error = vfs_setopts(opts, "linux.osrelease", "");
 		if (error != 0) {
 			LIN_SDT_PROBE1(mib, linux_prison_get, vfs_setopts_error,
 			    error);
 			if (error != ENOENT)
 				goto done;
 		}
 		error = vfs_setopt(opts, "linux.oss_version", &version0,
 		    sizeof(version0));
 		if (error != 0) {
 			LIN_SDT_PROBE1(mib, linux_prison_get, vfs_setopt_error,
 			    error);
 			if (error != ENOENT)
 				goto done;
 		}
 	}
 	error = 0;
 
  done:
 	/* linux_find_prison() returned with ppr locked. */
 	mtx_unlock(&ppr->pr_mtx);
 
 	LIN_SDT_PROBE1(mib, linux_prison_get, return, error);
 	return (error);
 }
 
 /*
  * Destructor passed to osd_jail_register(): releases a Linux info record
  * that was allocated from M_PRISON by linux_alloc_prison().
  */
 static void
 linux_prison_destructor(void *data)
 {
 
 	LIN_SDT_PROBE1(mib, linux_prison_destructor, entry, data);
 	free(data, M_PRISON);
 	LIN_SDT_PROBE0(mib, linux_prison_destructor, return);
 }
 
 void
 linux_osd_jail_register(void)
 {
 	struct prison *pr;
 	osd_method_t methods[PR_MAXMETHOD] = {
 	    [PR_METHOD_CREATE] =	linux_prison_create,
 	    [PR_METHOD_GET] =		linux_prison_get,
 	    [PR_METHOD_SET] =		linux_prison_set,
 	    [PR_METHOD_CHECK] =		linux_prison_check
 	};
 
 	LIN_SDT_PROBE0(mib, linux_osd_jail_register, entry);
 
 	linux_osd_jail_slot =
 	    osd_jail_register(linux_prison_destructor, methods);
 	if (linux_osd_jail_slot > 0) {
 		/* Copy the system linux info to any current prisons. */
 		sx_xlock(&allprison_lock);
 		TAILQ_FOREACH(pr, &allprison, pr_list)
 			(void)linux_alloc_prison(pr, NULL);
 		sx_xunlock(&allprison_lock);
 	}
 
 	LIN_SDT_PROBE0(mib, linux_osd_jail_register, return);
 }
 
 /*
  * Undo linux_osd_jail_register(): give the OSD slot back if one was
  * obtained.
  */
 void
 linux_osd_jail_deregister(void)
 {
 
 	/* Fire this function's own probes, not the register ones. */
 	LIN_SDT_PROBE0(mib, linux_osd_jail_deregister, entry);
 
 	if (linux_osd_jail_slot)
 		osd_jail_deregister(linux_osd_jail_slot);
 
 	LIN_SDT_PROBE0(mib, linux_osd_jail_deregister, return);
 }
 
 /*
  * Copy the Linux OS name that applies to td's prison into dst, which must
  * have room for LINUX_MAX_UTSNAME bytes.
  */
 void
 linux_get_osname(struct thread *td, char *dst)
 {
 	struct linux_prison *info;
 	struct prison *owner;
 
 	LIN_SDT_PROBE2(mib, linux_get_osname, entry, td, dst);
 
 	/* The lookup hands back the owning prison with its mutex held. */
 	info = linux_find_prison(td->td_ucred->cr_prison, &owner);
 	bcopy(info->pr_osname, dst, LINUX_MAX_UTSNAME);
 	mtx_unlock(&owner->pr_mtx);
 
 	LIN_SDT_PROBE0(mib, linux_get_osname, return);
 }
 
 /*
  * Store osname (truncated to LINUX_MAX_UTSNAME by strlcpy) in the Linux
  * info found for td's prison.  Always returns 0.
  */
 static int
 linux_set_osname(struct thread *td, char *osname)
 {
 	struct linux_prison *info;
 	struct prison *owner;
 
 	LIN_SDT_PROBE2(mib, linux_set_osname, entry, td, osname);
 
 	/* The lookup hands back the owning prison with its mutex held. */
 	info = linux_find_prison(td->td_ucred->cr_prison, &owner);
 	strlcpy(info->pr_osname, osname, LINUX_MAX_UTSNAME);
 	mtx_unlock(&owner->pr_mtx);
 
 	LIN_SDT_PROBE1(mib, linux_set_osname, return, 0);
 	return (0);
 }
 
 /*
  * Copy the Linux OS release string that applies to td's prison into dst,
  * which must have room for LINUX_MAX_UTSNAME bytes.
  */
 void
 linux_get_osrelease(struct thread *td, char *dst)
 {
 	struct linux_prison *info;
 	struct prison *owner;
 
 	LIN_SDT_PROBE2(mib, linux_get_osrelease, entry, td, dst);
 
 	/* The lookup hands back the owning prison with its mutex held. */
 	info = linux_find_prison(td->td_ucred->cr_prison, &owner);
 	bcopy(info->pr_osrelease, dst, LINUX_MAX_UTSNAME);
 	mtx_unlock(&owner->pr_mtx);
 
 	LIN_SDT_PROBE0(mib, linux_get_osrelease, return);
 }
 
 /*
  * Return the numeric Linux release (pr_osrel) that applies to td's prison.
  */
 int
 linux_kernver(struct thread *td)
 {
 	struct linux_prison *info;
 	struct prison *owner;
 	int kernver;
 
 	LIN_SDT_PROBE1(mib, linux_kernver, entry, td);
 
 	/* Read the value while the owning prison's mutex is held. */
 	info = linux_find_prison(td->td_ucred->cr_prison, &owner);
 	kernver = info->pr_osrel;
 	mtx_unlock(&owner->pr_mtx);
 
 	LIN_SDT_PROBE1(mib, linux_kernver, return, kernver);
 	return (kernver);
 }
 
 static int
 linux_set_osrelease(struct thread *td, char *osrelease)
 {
 	struct prison *pr;
 	struct linux_prison *lpr;
 	int error;
 
 	LIN_SDT_PROBE2(mib, linux_set_osrelease, entry, td, osrelease);
 
 	lpr = linux_find_prison(td->td_ucred->cr_prison, &pr);
 	error = linux_map_osrel(osrelease, &lpr->pr_osrel);
 	if (error == 0)
 		strlcpy(lpr->pr_osrelease, osrelease, LINUX_MAX_UTSNAME);
 	mtx_unlock(&pr->pr_mtx);
 
 	LIN_SDT_PROBE1(mib, linux_set_osrelease, return, error);
 	return (error);
 }
 
 /*
  * Return the OSS version (pr_oss_version) that applies to td's prison.
  */
 int
 linux_get_oss_version(struct thread *td)
 {
 	struct linux_prison *info;
 	struct prison *owner;
 	int oss;
 
 	LIN_SDT_PROBE1(mib, linux_get_oss_version, entry, td);
 
 	/* Read the value while the owning prison's mutex is held. */
 	info = linux_find_prison(td->td_ucred->cr_prison, &owner);
 	oss = info->pr_oss_version;
 	mtx_unlock(&owner->pr_mtx);
 
 	LIN_SDT_PROBE1(mib, linux_get_oss_version, return, oss);
 	return (oss);
 }
 
 /*
  * Store oss_version in the Linux info found for td's prison.
  * Always returns 0.
  */
 static int
 linux_set_oss_version(struct thread *td, int oss_version)
 {
 	struct linux_prison *info;
 	struct prison *owner;
 
 	LIN_SDT_PROBE2(mib, linux_set_oss_version, entry, td, oss_version);
 
 	/* The lookup hands back the owning prison with its mutex held. */
 	info = linux_find_prison(td->td_ucred->cr_prison, &owner);
 	info->pr_oss_version = oss_version;
 	mtx_unlock(&owner->pr_mtx);
 
 	LIN_SDT_PROBE1(mib, linux_set_oss_version, return, 0);
 	return (0);
 }
 
 #if defined(DEBUG) || defined(KTR)
 /* XXX: can be removed when every ldebug(...) and KTR stuff are removed. */
 
 /*
  * Per-syscall debug bitmap, updated with setbit()/clrbit() in linux_debug().
  * NOTE(review): howmany(x, sizeof(u_char)) looks like it sizes the array at
  * one byte per syscall although only one bit per syscall is used; confirm
  * against the extern declaration before changing.
  */
 u_char linux_debug_map[howmany(LINUX_SYS_MAXSYSCALL, sizeof(u_char))];
 
 /*
  * Update linux_debug_map.
  *
  * With global set, every byte of the map becomes 0 (toggle != 0) or 0xff
  * (toggle == 0).  Otherwise only the bit for syscall is cleared
  * (toggle != 0) or set (toggle == 0); a syscall number outside
  * [0, LINUX_SYS_MAXSYSCALL) yields EINVAL.  Returns 0 on success.
  */
 static int
 linux_debug(int syscall, int toggle, int global)
 {
 
 	if (global) {
 		memset(linux_debug_map, toggle ? 0 : 0xff,
 		    sizeof(linux_debug_map));
 		return (0);
 	}
 
 	if (syscall < 0 || syscall >= LINUX_SYS_MAXSYSCALL)
 		return (EINVAL);
 
 	if (toggle == 0)
 		setbit(linux_debug_map, syscall);
 	else
 		clrbit(linux_debug_map, syscall);
 	return (0);
 }
 
 /*
  * Usage: sysctl linux.debug=<syscall_nr>.<0/1>
  *
  *    E.g.: sysctl linux.debug=21.0
  *
  * As a special case, syscall "all" will apply to all syscalls globally.
  */
 #define LINUX_MAX_DEBUGSTR	16
 static int
 linux_sysctl_debug(SYSCTL_HANDLER_ARGS)
 {
 	char value[LINUX_MAX_DEBUGSTR], *p;
 	int error, sysc, toggle;
 	int global = 0;
 
 	value[0] = '\0';
 	error = sysctl_handle_string(oidp, value, LINUX_MAX_DEBUGSTR, req);
 	if (error || req->newptr == NULL)
 		return (error);
 	for (p = value; *p != '\0' && *p != '.'; p++);
 	if (*p == '\0')
 		return (EINVAL);
 	*p++ = '\0';
 	sysc = strtol(value, NULL, 0);
 	toggle = strtol(p, NULL, 0);
 	if (strcmp(value, "all") == 0)
 		global = 1;
 	error = linux_debug(sysc, toggle, global);
 	return (error);
 }
 
 SYSCTL_PROC(_compat_linux, OID_AUTO, debug,
             CTLTYPE_STRING | CTLFLAG_RW,
             0, 0, linux_sysctl_debug, "A",
             "Linux debugging control");
 
 #endif /* DEBUG || KTR */
Index: head/sys/compat/linux/linux_misc.c
===================================================================
--- head/sys/compat/linux/linux_misc.c	(revision 258540)
+++ head/sys/compat/linux/linux_misc.c	(revision 258541)
@@ -1,1928 +1,1927 @@
 /*-
  * Copyright (c) 2002 Doug Rabson
  * Copyright (c) 1994-1995 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/blist.h>
 #include <sys/fcntl.h>
 #if defined(__i386__)
 #include <sys/imgact_aout.h>
 #endif
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/reboot.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 #include <sys/cpuset.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/swap_pager.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_file.h>
 #include <compat/linux/linux_mib.h>
 #include <compat/linux/linux_signal.h>
 #include <compat/linux/linux_util.h>
 #include <compat/linux/linux_sysproto.h>
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_misc.h>
 
 /* DTrace init: declare the provider named by LINUX_DTRACE. */
 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
 
 /*
  * Linuxulator-global DTrace probes: "locked"/"unlock" probes for the emul
  * lock and for the shared emul lock in its read and write flavours.
  */
 LIN_SDT_PROBE_DECLARE(locks, emul_lock, locked);
 LIN_SDT_PROBE_DECLARE(locks, emul_lock, unlock);
 LIN_SDT_PROBE_DECLARE(locks, emul_shared_rlock, locked);
 LIN_SDT_PROBE_DECLARE(locks, emul_shared_rlock, unlock);
 LIN_SDT_PROBE_DECLARE(locks, emul_shared_wlock, locked);
 LIN_SDT_PROBE_DECLARE(locks, emul_shared_wlock, unlock);
 
 int stclohz;				/* Statistics clock frequency */
 
 /*
  * Translation table from a Linux resource number (the array index, bounded
  * by LINUX_RLIM_NLIMITS) to the native RLIMIT_* constant.  The *rlimit
  * handlers in this file index it with args->resource.
  */
 static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
 	RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
 	RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
 	RLIMIT_MEMLOCK, RLIMIT_AS 
 };
 
 /*
  * Result structure copied out to userland by linux_sysinfo().  The "pads"
  * member and the trailing "_f" array are filler only; "_f" is sized from
  * the widths of l_long and l_int.
  */
 struct l_sysinfo {
 	l_long		uptime;		/* Seconds since boot */
 	l_ulong		loads[3];	/* 1, 5, and 15 minute load averages */
 #define LINUX_SYSINFO_LOADS_SCALE 65536
 	l_ulong		totalram;	/* Total usable main memory size */
 	l_ulong		freeram;	/* Available memory size */
 	l_ulong		sharedram;	/* Amount of shared memory */
 	l_ulong		bufferram;	/* Memory used by buffers */
 	l_ulong		totalswap;	/* Total swap space size */
 	l_ulong		freeswap;	/* swap space still available */
 	l_ushort	procs;		/* Number of current processes */
 	l_ushort	pads;		/* Filler, not a reported value */
 	l_ulong		totalbig;	/* Reported as 0 by linux_sysinfo() */
 	l_ulong		freebig;	/* Reported as 0 by linux_sysinfo() */
 	l_uint		mem_unit;	/* Reported as 1 by linux_sysinfo() */
 	char		_f[20-2*sizeof(l_long)-sizeof(l_int)];	/* padding */
 };
 /*
  * Linux sysinfo(2): report uptime, load averages and memory, swap and
  * process counts to userland.
  *
  * The result structure is zeroed before it is filled in.  Members that are
  * never assigned below ("pads" and the trailing "_f" filler) as well as any
  * padding between members would otherwise carry stale kernel stack contents
  * out to the caller through copyout().
  *
  * Returns 0 on success or the error reported by copyout().
  */
 int
 linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
 {
 	struct l_sysinfo sysinfo;
 	vm_object_t object;
 	int i, j;
 	struct timespec ts;
 
 	/* Never leak uninitialized stack bytes to userland. */
 	bzero(&sysinfo, sizeof(sysinfo));
 
 	/* Uptime in whole seconds, with any partial second rounded up. */
 	getnanouptime(&ts);
 	if (ts.tv_nsec != 0)
 		ts.tv_sec++;
 	sysinfo.uptime = ts.tv_sec;
 
 	/* Use the information from the mib to get our load averages */
 	for (i = 0; i < 3; i++)
 		sysinfo.loads[i] = averunnable.ldavg[i] *
 		    LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
 
 	sysinfo.totalram = physmem * PAGE_SIZE;
 	sysinfo.freeram = sysinfo.totalram - cnt.v_wire_count * PAGE_SIZE;
 
 	/*
 	 * Count the resident pages of every VM object that has more than
 	 * one shadow as "shared"; the total is converted to bytes below.
 	 */
 	sysinfo.sharedram = 0;
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_FOREACH(object, &vm_object_list, object_list)
 		if (object->shadow_count > 1)
 			sysinfo.sharedram += object->resident_page_count;
 	mtx_unlock(&vm_object_list_mtx);
 
 	sysinfo.sharedram *= PAGE_SIZE;
 	sysinfo.bufferram = 0;
 
 	/* i and j are filled in by swap_pager_status(), in pages. */
 	swap_pager_status(&i, &j);
 	sysinfo.totalswap = i * PAGE_SIZE;
 	sysinfo.freeswap = (i - j) * PAGE_SIZE;
 
 	sysinfo.procs = nprocs;
 
 	/* The following are only present in newer Linux kernels. */
 	sysinfo.totalbig = 0;
 	sysinfo.freebig = 0;
 	sysinfo.mem_unit = 1;
 
 	return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
 }
 
 int
 linux_alarm(struct thread *td, struct linux_alarm_args *args)
 {
 	struct itimerval it, old_it;
 	u_int secs;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(alarm))
 		printf(ARGS(alarm, "%u"), args->secs);
 #endif
 	
 	secs = args->secs;
 
 	if (secs > INT_MAX)
 		secs = INT_MAX;
 
 	it.it_value.tv_sec = (long) secs;
 	it.it_value.tv_usec = 0;
 	it.it_interval.tv_sec = 0;
 	it.it_interval.tv_usec = 0;
 	error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
 	if (error)
 		return (error);
 	if (timevalisset(&old_it.it_value)) {
 		if (old_it.it_value.tv_usec != 0)
 			old_it.it_value.tv_sec++;
 		td->td_retval[0] = old_it.it_value.tv_sec;
 	}
 	return (0);
 }
 
 int
 linux_brk(struct thread *td, struct linux_brk_args *args)
 {
 	struct vmspace *vm = td->td_proc->p_vmspace;
 	vm_offset_t new, old;
 	struct obreak_args /* {
 		char * nsize;
 	} */ tmp;
 
 #ifdef DEBUG
 	if (ldebug(brk))
 		printf(ARGS(brk, "%p"), (void *)(uintptr_t)args->dsend);
 #endif
 	old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize);
 	new = (vm_offset_t)args->dsend;
 	tmp.nsize = (char *)new;
 	if (((caddr_t)new > vm->vm_daddr) && !sys_obreak(td, &tmp))
 		td->td_retval[0] = (long)new;
 	else
 		td->td_retval[0] = (long)old;
 
 	return (0);
 }
 
 #if defined(__i386__)
 /* XXX: what about amd64/linux32? */
 
 /*
  * Linux uselib(2): map an a.out shared library into the calling process.
  *
  * The library path is looked up with the leaf vnode locked, the file is
  * checked (not open for writing, executable regular file on a mount
  * without MNT_NOEXEC, non-empty, accessible), its first page is mapped
  * into exec_map to read the a.out header, and text+data are then either
  * read into fresh anonymous memory (file offset not page aligned) or
  * mapped from the vnode (page aligned).  A zero-filled BSS region is
  * added when the header asks for one.
  *
  * Returns 0 on success or an errno value.  All exits go through the
  * "cleanup" label, which unlocks the vnode if it is still locked and
  * releases the temporary header mapping.
  */
 int
 linux_uselib(struct thread *td, struct linux_uselib_args *args)
 {
 	struct nameidata ni;
 	struct vnode *vp;
 	struct exec *a_out;
 	struct vattr attr;
 	vm_offset_t vmaddr;
 	unsigned long file_offset;
 	unsigned long bss_size;
 	char *library;
 	ssize_t aresid;
 	int error, locked, writecount;
 
 	LCONVPATHEXIST(td, args->library, &library);
 
 #ifdef DEBUG
 	if (ldebug(uselib))
 		printf(ARGS(uselib, "%s"), library);
 #endif
 
 	/* State inspected by the cleanup code; start with nothing held. */
 	a_out = NULL;
 	locked = 0;
 	vp = NULL;
 
 	NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, library, td);
 	error = namei(&ni);
 	LFREEPATH(library);
 	if (error)
 		goto cleanup;
 
 	vp = ni.ni_vp;
 	NDFREE(&ni, NDF_ONLY_PNBUF);
 
 	/*
 	 * From here on down, we have a locked vnode that must be unlocked.
 	 * XXX: The code below largely duplicates exec_check_permissions().
 	 */
 	locked = 1;
 
 	/* Writable? */
 	error = VOP_GET_WRITECOUNT(vp, &writecount);
 	if (error != 0)
 		goto cleanup;
 	if (writecount != 0) {
 		error = ETXTBSY;
 		goto cleanup;
 	}
 
 	/* Executable? */
 	error = VOP_GETATTR(vp, &attr, td->td_ucred);
 	if (error)
 		goto cleanup;
 
 	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 	    ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) {
 		/*
 		 * NOTE(review): the original comment said EACCESS is what
 		 * exec(2) returns, yet ENOEXEC is returned here - confirm
 		 * which one is intended.
 		 */
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/* Sensible size? */
 	if (attr.va_size == 0) {
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/* Can we access it? */
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	if (error)
 		goto cleanup;
 
 	/*
 	 * XXX: This should use vn_open() so that it is properly authorized,
 	 * and to reduce code redundancy all over the place here.
 	 * XXX: Not really, it duplicates far more of exec_check_permissions()
 	 * than vn_open().
 	 */
 #ifdef MAC
 	error = mac_vnode_check_open(td->td_ucred, vp, VREAD);
 	if (error)
 		goto cleanup;
 #endif
 	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 	if (error)
 		goto cleanup;
 
 	/* Pull in executable header into exec_map */
 	error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE,
 	    VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
 	if (error)
 		goto cleanup;
 
 	/* Is it a Linux binary ? */
 	if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/*
 	 * While we are here, we should REALLY do some more checks
 	 */
 
 	/* Set file/virtual offset based on a.out variant. */
 	switch ((int)(a_out->a_magic & 0xffff)) {
 	case 0413:			/* ZMAGIC */
 		file_offset = 1024;
 		break;
 	case 0314:			/* QMAGIC */
 		file_offset = 0;
 		break;
 	default:
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	bss_size = round_page(a_out->a_bss);
 
 	/* Check various fields in header for validity/bounds. */
 	if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/* text + data can't exceed file size */
 	if (a_out->a_data + a_out->a_text > attr.va_size) {
 		error = EFAULT;
 		goto cleanup;
 	}
 
 	/*
 	 * text/data/bss must not exceed limits
 	 * XXX - this is not complete. it should check current usage PLUS
 	 * the resources needed by this library.
 	 */
 	PROC_LOCK(td->td_proc);
 	if (a_out->a_text > maxtsiz ||
 	    a_out->a_data + bss_size > lim_cur(td->td_proc, RLIMIT_DATA) ||
 	    racct_set(td->td_proc, RACCT_DATA, a_out->a_data +
 	    bss_size) != 0) {
 		PROC_UNLOCK(td->td_proc);
 		error = ENOMEM;
 		goto cleanup;
 	}
 	PROC_UNLOCK(td->td_proc);
 
 	/*
 	 * Prevent more writers.
 	 * XXX: Note that if any of the VM operations fail below we don't
 	 * clear this flag.
 	 */
 	VOP_SET_TEXT(vp);
 
 	/*
 	 * Lock no longer needed
 	 */
 	locked = 0;
 	VOP_UNLOCK(vp, 0);
 
 	/*
 	 * Check if file_offset page aligned. Currently we cannot handle
 	 * misaligned file offsets, and so we read in the entire image
 	 * (what a waste).
 	 */
 	if (file_offset & PAGE_MASK) {
 #ifdef DEBUG
 		printf("uselib: Non page aligned binary %lu\n", file_offset);
 #endif
 		/* Map text+data read/write/execute */
 
 		/* a_entry is the load address and is page aligned */
 		vmaddr = trunc_page(a_out->a_entry);
 
 		/* get anon user mapping, read+write+execute */
 		error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 		    &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE,
 		    VM_PROT_ALL, VM_PROT_ALL, 0);
 		if (error)
 			goto cleanup;
 
 		/* Fill the new mapping from the file; a short read fails. */
 		error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset,
 		    a_out->a_text + a_out->a_data, UIO_USERSPACE, 0,
 		    td->td_ucred, NOCRED, &aresid, td);
 		if (error != 0)
 			goto cleanup;
 		if (aresid != 0) {
 			error = ENOEXEC;
 			goto cleanup;
 		}
 	} else {
 #ifdef DEBUG
 		printf("uselib: Page aligned binary %lu\n", file_offset);
 #endif
 		/*
 		 * for QMAGIC, a_entry is 20 bytes beyond the load address
 		 * to skip the executable header
 		 */
 		vmaddr = trunc_page(a_out->a_entry);
 
 		/*
 		 * Map it all into the process's space as a single
 		 * copy-on-write "data" segment.
 		 */
 		error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr,
 		    a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
 		    MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
 		if (error)
 			goto cleanup;
 	}
 #ifdef DEBUG
 	printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0],
 	    ((long *)vmaddr)[1]);
 #endif
 	if (bss_size != 0) {
 		/* Calculate BSS start address */
 		vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
 		    a_out->a_data;
 
 		/* allocate some 'anon' space */
 		error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 		    &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL,
 		    VM_PROT_ALL, 0);
 		if (error)
 			goto cleanup;
 	}
 
 cleanup:
 	/* Unlock vnode if needed */
 	if (locked)
 		VOP_UNLOCK(vp, 0);
 
 	/* Release the temporary mapping. */
 	if (a_out)
 		kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE);
 
 	return (error);
 }
 
 #endif	/* __i386__ */
 
 /*
  * Linux select(2) on top of kern_select().
  *
  * When a timeout is supplied it is copied in, normalized if itimerfix()
  * rejects it, and after a successful call the time that was left is
  * written back to the caller's timeout (zero when no descriptor became
  * ready).
  *
  * Returns 0 on success or the error from copyin(), kern_select() or
  * copyout().
  */
 int
 linux_select(struct thread *td, struct linux_select_args *args)
 {
 	l_timeval ltv;
 	struct timeval tv0, tv1, utv, *tvp;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(select))
 		printf(ARGS(select, "%d, %p, %p, %p, %p"), args->nfds,
 		    (void *)args->readfds, (void *)args->writefds,
 		    (void *)args->exceptfds, (void *)args->timeout);
 #endif
 
 	/*
 	 * Store current time for computation of the amount of
 	 * time left.
 	 */
 	if (args->timeout) {
 		if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
 			goto select_out;
 		utv.tv_sec = ltv.tv_sec;
 		utv.tv_usec = ltv.tv_usec;
 #ifdef DEBUG
 		if (ldebug(select))
 			printf(LMSG("incoming timeout (%jd/%ld)"),
 			    (intmax_t)utv.tv_sec, utv.tv_usec);
 #endif
 
 		if (itimerfix(&utv)) {
 			/*
 			 * The timeval was invalid.  Convert it to something
 			 * valid that will act as it does under Linux.
 			 */
 			utv.tv_sec += utv.tv_usec / 1000000;
 			utv.tv_usec %= 1000000;
 			if (utv.tv_usec < 0) {
 				utv.tv_sec -= 1;
 				utv.tv_usec += 1000000;
 			}
 			if (utv.tv_sec < 0)
 				timevalclear(&utv);
 		}
 		/* tv0 is the start time used for the "time left" result. */
 		microtime(&tv0);
 		tvp = &utv;
 	} else
 		tvp = NULL;
 
 	error = kern_select(td, args->nfds, args->readfds, args->writefds,
 	    args->exceptfds, tvp, sizeof(l_int) * 8);
 
 #ifdef DEBUG
 	if (ldebug(select))
 		printf(LMSG("real select returns %d"), error);
 #endif
 	if (error)
 		goto select_out;
 
 	if (args->timeout) {
 		if (td->td_retval[0]) {
 			/*
 			 * Compute how much time was left of the timeout,
 			 * by subtracting the current time and the time
 			 * before we started the call, and subtracting
 			 * that result from the user-supplied value.
 			 */
 			microtime(&tv1);
 			timevalsub(&tv1, &tv0);
 			timevalsub(&utv, &tv1);
 			if (utv.tv_sec < 0)
 				timevalclear(&utv);
 		} else
 			timevalclear(&utv);	/* nothing ready: report zero */
 #ifdef DEBUG
 		if (ldebug(select))
 			printf(LMSG("outgoing timeout (%jd/%ld)"),
 			    (intmax_t)utv.tv_sec, utv.tv_usec);
 #endif
 		ltv.tv_sec = utv.tv_sec;
 		ltv.tv_usec = utv.tv_usec;
 		if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
 			goto select_out;
 	}
 
 select_out:
 #ifdef DEBUG
 	if (ldebug(select))
 		printf(LMSG("select_out -> %d"), error);
 #endif
 	return (error);
 }
 
 int
 linux_mremap(struct thread *td, struct linux_mremap_args *args)
 {
 	struct munmap_args /* {
 		void *addr;
 		size_t len;
 	} */ bsd_args;
 	int error = 0;
 
 #ifdef DEBUG
 	if (ldebug(mremap))
 		printf(ARGS(mremap, "%p, %08lx, %08lx, %08lx"),
 		    (void *)(uintptr_t)args->addr,
 		    (unsigned long)args->old_len,
 		    (unsigned long)args->new_len,
 		    (unsigned long)args->flags);
 #endif
 
 	if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
 		td->td_retval[0] = 0;
 		return (EINVAL);
 	}
 
 	/*
 	 * Check for the page alignment.
 	 * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
 	 */
 	if (args->addr & PAGE_MASK) {
 		td->td_retval[0] = 0;
 		return (EINVAL);
 	}
 
 	args->new_len = round_page(args->new_len);
 	args->old_len = round_page(args->old_len);
 
 	if (args->new_len > args->old_len) {
 		td->td_retval[0] = 0;
 		return (ENOMEM);
 	}
 
 	if (args->new_len < args->old_len) {
 		bsd_args.addr =
 		    (caddr_t)((uintptr_t)args->addr + args->new_len);
 		bsd_args.len = args->old_len - args->new_len;
 		error = sys_munmap(td, &bsd_args);
 	}
 
 	td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
 	return (error);
 }
 
 #define LINUX_MS_ASYNC       0x0001
 #define LINUX_MS_INVALIDATE  0x0002
 #define LINUX_MS_SYNC        0x0004
 
 int
 linux_msync(struct thread *td, struct linux_msync_args *args)
 {
 	struct msync_args bsd_args;
 
 	bsd_args.addr = (caddr_t)(uintptr_t)args->addr;
 	bsd_args.len = (uintptr_t)args->len;
 	bsd_args.flags = args->fl & ~LINUX_MS_SYNC;
 
 	return (sys_msync(td, &bsd_args));
 }
 
 int
 linux_time(struct thread *td, struct linux_time_args *args)
 {
 	struct timeval tv;
 	l_time_t tm;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(time))
 		printf(ARGS(time, "*"));
 #endif
 
 	microtime(&tv);
 	tm = tv.tv_sec;
 	if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
 		return (error);
 	td->td_retval[0] = tm;
 	return (0);
 }
 
 /*
  * Buffer layout copied out by linux_times(): user and system times of the
  * process and of its children, expressed in clock ticks (see CONVTCK).
  */
 struct l_times_argv {
 	l_clock_t	tms_utime;
 	l_clock_t	tms_stime;
 	l_clock_t	tms_cutime;
 	l_clock_t	tms_cstime;
 };
 
 
 /*
  * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value.
  * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK
  * auxiliary vector entry.
  */
 #define	CLK_TCK		100
 
 /*
  * Convert a timeval-like object (anything with tv_sec and tv_usec members)
  * to clock ticks.  CONVOTCK uses the fixed CLK_TCK rate, CONVNTCK the
  * stclohz rate, and CONVTCK picks between them from linux_kernver(td); it
  * therefore needs a variable named "td" in scope at the point of use.
  *
  * The argument is parenthesized so that expressions such as "*p" can be
  * passed safely.  It is still evaluated more than once.
  */
 #define	CONVOTCK(r)							\
 	((r).tv_sec * CLK_TCK + (r).tv_usec / (1000000 / CLK_TCK))
 #define	CONVNTCK(r)							\
 	((r).tv_sec * stclohz + (r).tv_usec / (1000000 / stclohz))
 
 #define	CONVTCK(r)	(linux_kernver(td) >= LINUX_KERNVER_2004000 ?		\
 			    CONVNTCK(r) : CONVOTCK(r))
 
 int
 linux_times(struct thread *td, struct linux_times_args *args)
 {
 	struct timeval tv, utime, stime, cutime, cstime;
 	struct l_times_argv tms;
 	struct proc *p;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(times))
 		printf(ARGS(times, "*"));
 #endif
 
 	if (args->buf != NULL) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 		PROC_SLOCK(p);
 		calcru(p, &utime, &stime);
 		PROC_SUNLOCK(p);
 		calccru(p, &cutime, &cstime);
 		PROC_UNLOCK(p);
 
 		tms.tms_utime = CONVTCK(utime);
 		tms.tms_stime = CONVTCK(stime);
 
 		tms.tms_cutime = CONVTCK(cutime);
 		tms.tms_cstime = CONVTCK(cstime);
 
 		if ((error = copyout(&tms, args->buf, sizeof(tms))))
 			return (error);
 	}
 
 	microuptime(&tv);
 	td->td_retval[0] = (int)CONVTCK(tv);
 	return (0);
 }
 
 /*
  * Linux uname(2): fill a zeroed l_new_utsname with the emulated OS name
  * and release, the credential's host and domain names, the first line of
  * the kernel version string and linux_platform, then copy it out.
  */
 int
 linux_newuname(struct thread *td, struct linux_newuname_args *args)
 {
 	struct l_new_utsname utsname;
 	char osname[LINUX_MAX_UTSNAME];
 	char osrelease[LINUX_MAX_UTSNAME];
 	char *nl;
 
 #ifdef DEBUG
 	if (ldebug(newuname))
 		printf(ARGS(newuname, "*"));
 #endif
 
 	linux_get_osname(td, osname);
 	linux_get_osrelease(td, osrelease);
 
 	bzero(&utsname, sizeof(utsname));
 	strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
 	getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
 	getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
 	strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
 	strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
 
 	/* Cut the version string at its first newline, if it has one. */
 	nl = utsname.version;
 	while (*nl != '\0' && *nl != '\n')
 		++nl;
 	*nl = '\0';
 
 	strlcpy(utsname.machine, linux_platform, LINUX_MAX_UTSNAME);
 
 	return (copyout(&utsname, args->buf, sizeof(utsname)));
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 /*
  * Argument structure read from userland by linux_utime(): access and
  * modification times in whole seconds.
  */
 struct l_utimbuf {
 	l_time_t l_actime;
 	l_time_t l_modtime;
 };
 
 int
 linux_utime(struct thread *td, struct linux_utime_args *args)
 {
 	struct timeval tv[2], *tvp;
 	struct l_utimbuf lut;
 	char *fname;
 	int error;
 
 	LCONVPATHEXIST(td, args->fname, &fname);
 
 #ifdef DEBUG
 	if (ldebug(utime))
 		printf(ARGS(utime, "%s, *"), fname);
 #endif
 
 	if (args->times) {
 		if ((error = copyin(args->times, &lut, sizeof lut))) {
 			LFREEPATH(fname);
 			return (error);
 		}
 		tv[0].tv_sec = lut.l_actime;
 		tv[0].tv_usec = 0;
 		tv[1].tv_sec = lut.l_modtime;
 		tv[1].tv_usec = 0;
 		tvp = tv;
 	} else
 		tvp = NULL;
 
 	error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
 	LFREEPATH(fname);
 	return (error);
 }
 
 int
 linux_utimes(struct thread *td, struct linux_utimes_args *args)
 {
 	l_timeval ltv[2];
 	struct timeval tv[2], *tvp = NULL;
 	char *fname;
 	int error;
 
 	LCONVPATHEXIST(td, args->fname, &fname);
 
 #ifdef DEBUG
 	if (ldebug(utimes))
 		printf(ARGS(utimes, "%s, *"), fname);
 #endif
 
 	if (args->tptr != NULL) {
 		if ((error = copyin(args->tptr, ltv, sizeof ltv))) {
 			LFREEPATH(fname);
 			return (error);
 		}
 		tv[0].tv_sec = ltv[0].tv_sec;
 		tv[0].tv_usec = ltv[0].tv_usec;
 		tv[1].tv_sec = ltv[1].tv_sec;
 		tv[1].tv_usec = ltv[1].tv_usec;
 		tvp = tv;
 	}
 
 	error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
 	LFREEPATH(fname);
 	return (error);
 }
 
 int
 linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
 {
 	l_timeval ltv[2];
 	struct timeval tv[2], *tvp = NULL;
 	char *fname;
 	int error, dfd;
 
 	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 	LCONVPATHEXIST_AT(td, args->filename, &fname, dfd);
 
 #ifdef DEBUG
 	if (ldebug(futimesat))
 		printf(ARGS(futimesat, "%s, *"), fname);
 #endif
 
 	if (args->utimes != NULL) {
 		if ((error = copyin(args->utimes, ltv, sizeof ltv))) {
 			LFREEPATH(fname);
 			return (error);
 		}
 		tv[0].tv_sec = ltv[0].tv_sec;
 		tv[0].tv_usec = ltv[0].tv_usec;
 		tv[1].tv_sec = ltv[1].tv_sec;
 		tv[1].tv_usec = ltv[1].tv_usec;
 		tvp = tv;
 	}
 
 	error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
 	LFREEPATH(fname);
 	return (error);
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 /*
  * Shared back end of the Linux wait calls: run kern_wait() and, when the
  * caller asked for a status, copy it out with the terminating or stopping
  * signal number translated by BSD_TO_LINUX_SIGNAL().
  *
  * Returns the kern_wait() error, or the result of copyout().
  */
 int
 linux_common_wait(struct thread *td, int pid, int *status,
     int options, struct rusage *ru)
 {
 	int error, wstat;
 
 	error = kern_wait(td, pid, &wstat, options, ru);
 	if (error)
 		return (error);
 	if (!status)
 		return (0);
 
 	/* Only the low 16 bits of the status are kept. */
 	wstat &= 0xffff;
 	if (WIFSIGNALED(wstat)) {
 		wstat = (wstat & 0xffffff80) |
 		    BSD_TO_LINUX_SIGNAL(WTERMSIG(wstat));
 	} else if (WIFSTOPPED(wstat)) {
 		wstat = (wstat & 0xffff00ff) |
 		    (BSD_TO_LINUX_SIGNAL(WSTOPSIG(wstat)) << 8);
 	}
 
 	return (copyout(&wstat, status, sizeof(int)));
 }
 
 /*
  * Linux waitpid(2): validate the option bits, translate them and hand the
  * request to linux_common_wait() without resource usage reporting.
  *
  * WCONTINUED passes the validity check but is not forwarded; only WNOHANG,
  * WUNTRACED and __WCLONE (as WLINUXCLONE) reach kern_wait().
  */
 int
 linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
 {
 	int bsd_options;
 
 #ifdef DEBUG
 	if (ldebug(waitpid))
 		printf(ARGS(waitpid, "%d, %p, %d"),
 		    args->pid, (void *)args->status, args->options);
 #endif
 	/*
 	 * this is necessary because the test in kern_wait doesn't work
 	 * because we mess with the options here
 	 */
 	if (args->options & ~(WUNTRACED | WNOHANG | WCONTINUED | __WCLONE))
 		return (EINVAL);
 
 	bsd_options = args->options & (WNOHANG | WUNTRACED);
 	/* WLINUXCLONE should be equal to __WCLONE, but we make sure */
 	bsd_options |= (args->options & __WCLONE) ? WLINUXCLONE : 0;
 
 	return (linux_common_wait(td, args->pid, args->status, bsd_options,
 	    NULL));
 }
 
 
 /*
  * Linux mknod(2): create a filesystem node according to the file type bits
  * of args->mode.
  *
  * FIFOs and sockets go through kern_mkfifo(), character and block devices
  * through kern_mknod(), and regular files (or a mode without type bits)
  * are created by opening with O_CREAT | O_TRUNC and closing again.
  * Directories yield EPERM, any other type EINVAL.
  */
 int
 linux_mknod(struct thread *td, struct linux_mknod_args *args)
 {
 	char *path;
 	int error;
 
 	LCONVPATHCREAT(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(mknod))
 		printf(ARGS(mknod, "%s, %d, %d"), path, args->mode, args->dev);
 #endif
 
 	/* A mode without a file type means "regular file". */
 	if ((args->mode & S_IFMT) == 0)
 		args->mode |= S_IFREG;
 
 	switch (args->mode & S_IFMT) {
 	case S_IFREG:
 		error = kern_open(td, path, UIO_SYSSPACE,
 		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
 		/* Only the creation matters; drop the descriptor again. */
 		if (error == 0)
 			kern_close(td, td->td_retval[0]);
 		break;
 
 	case S_IFCHR:
 	case S_IFBLK:
 		error = kern_mknod(td, path, UIO_SYSSPACE, args->mode,
 		    args->dev);
 		break;
 
 	case S_IFIFO:
 	case S_IFSOCK:
 		error = kern_mkfifo(td, path, UIO_SYSSPACE, args->mode);
 		break;
 
 	case S_IFDIR:
 		error = EPERM;
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	LFREEPATH(path);
 	return (error);
 }
 
 /*
  * Linux mknodat(2): descriptor-relative variant of linux_mknod().
  * LINUX_AT_FDCWD is translated to AT_FDCWD; the file type bits of
  * args->mode select between kern_mkfifoat(), kern_mknodat() and an
  * open-and-close through kern_openat().  Directories yield EPERM, any
  * other type EINVAL.
  */
 int
 linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
 {
 	char *path;
 	int dfd, error;
 
 	if (args->dfd == LINUX_AT_FDCWD)
 		dfd = AT_FDCWD;
 	else
 		dfd = args->dfd;
 	LCONVPATHCREAT_AT(td, args->filename, &path, dfd);
 
 #ifdef DEBUG
 	if (ldebug(mknodat))
 		printf(ARGS(mknodat, "%s, %d, %d"), path, args->mode, args->dev);
 #endif
 
 	/* A mode without a file type means "regular file". */
 	if ((args->mode & S_IFMT) == 0)
 		args->mode |= S_IFREG;
 
 	switch (args->mode & S_IFMT) {
 	case S_IFREG:
 		error = kern_openat(td, dfd, path, UIO_SYSSPACE,
 		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
 		/* Only the creation matters; drop the descriptor again. */
 		if (error == 0)
 			kern_close(td, td->td_retval[0]);
 		break;
 
 	case S_IFCHR:
 	case S_IFBLK:
 		error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode,
 		    args->dev);
 		break;
 
 	case S_IFIFO:
 	case S_IFSOCK:
 		error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode);
 		break;
 
 	case S_IFDIR:
 		error = EPERM;
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	LFREEPATH(path);
 	return (error);
 }
 
 /*
  * UGH! This is just about the dumbest idea I've ever heard!!
  */
 /*
  * Linux personality(2): only personality 0 is accepted, for which 0 is
  * returned through td->td_retval[0].  Any other value yields EINVAL.
  */
 int
 linux_personality(struct thread *td, struct linux_personality_args *args)
 {
 #ifdef DEBUG
 	if (ldebug(personality))
 		printf(ARGS(personality, "%lu"), (unsigned long)args->per);
 #endif
 	if (args->per == 0) {
 		/* Yes Jim, it's still a Linux... */
 		td->td_retval[0] = 0;
 		return (0);
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Interval timer value as exchanged with Linux userland by
  * linux_setitimer() and linux_getitimer(); both members are l_timeval.
  */
 struct l_itimerval {
 	l_timeval it_interval;
 	l_timeval it_value;
 };
 
 /*
  * Copy the four it_interval/it_value tv_sec/tv_usec fields from *lip to
  * *bip.  Despite the name it is used for both directions of the
  * native <-> Linux conversion.
  *
  * The body is wrapped in do { } while (0) so that the macro expands to a
  * single statement and is safe in unbraced if/else bodies; callers supply
  * the trailing semicolon.
  */
 #define	B2L_ITIMERVAL(bip, lip) do {					\
 	(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec;		\
 	(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec;	\
 	(bip)->it_value.tv_sec = (lip)->it_value.tv_sec;		\
 	(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;		\
 } while (0)
 
 int
 linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
 {
 	int error;
 	struct l_itimerval ls;
 	struct itimerval aitv, oitv;
 
 #ifdef DEBUG
 	if (ldebug(setitimer))
 		printf(ARGS(setitimer, "%p, %p"),
 		    (void *)uap->itv, (void *)uap->oitv);
 #endif
 
 	if (uap->itv == NULL) {
 		uap->itv = uap->oitv;
 		return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
 	}
 
 	error = copyin(uap->itv, &ls, sizeof(ls));
 	if (error != 0)
 		return (error);
 	B2L_ITIMERVAL(&aitv, &ls);
 #ifdef DEBUG
 	if (ldebug(setitimer)) {
 		printf("setitimer: value: sec: %jd, usec: %ld\n",
 		    (intmax_t)aitv.it_value.tv_sec, aitv.it_value.tv_usec);
 		printf("setitimer: interval: sec: %jd, usec: %ld\n",
 		    (intmax_t)aitv.it_interval.tv_sec, aitv.it_interval.tv_usec);
 	}
 #endif
 	error = kern_setitimer(td, uap->which, &aitv, &oitv);
 	if (error != 0 || uap->oitv == NULL)
 		return (error);
 	B2L_ITIMERVAL(&ls, &oitv);
 
 	return (copyout(&ls, uap->oitv, sizeof(ls)));
 }
 
 int
 linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
 {
 	int error;
 	struct l_itimerval ls;
 	struct itimerval aitv;
 
 #ifdef DEBUG
 	if (ldebug(getitimer))
 		printf(ARGS(getitimer, "%p"), (void *)uap->itv);
 #endif
 	error = kern_getitimer(td, uap->which, &aitv);
 	if (error != 0)
 		return (error);
 	B2L_ITIMERVAL(&ls, &aitv);
 	return (copyout(&ls, uap->itv, sizeof(ls)));
 }
 
 int
 linux_nice(struct thread *td, struct linux_nice_args *args)
 {
 	struct setpriority_args bsd_args;
 
 	bsd_args.which = PRIO_PROCESS;
 	bsd_args.who = 0;		/* current process */
 	bsd_args.prio = args->inc;
 	return (sys_setpriority(td, &bsd_args));
 }
 
 /*
  * Linux setgroups(2): replace the supplementary group list of the calling
  * process.
  *
  * The Linux list is copied in, a new credential is prepared from the
  * current one under the process lock, and after the PRIV_CRED_SETGROUPS
  * check the groups are stored in cr_groups[1..] so that cr_groups[0]
  * (the egid) stays untouched.
  *
  * Returns EINVAL for a negative or too large gidsetsize, or the error
  * from copyin() or priv_check_cred(); 0 on success.
  */
 int
 linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
 {
 	struct ucred *newcred, *oldcred;
 	l_gid_t *linux_gidset;
 	gid_t *bsd_gidset;
 	int ngrp, error;
 	struct proc *p;
 
 	ngrp = args->gidsetsize;
 	if (ngrp < 0 || ngrp >= ngroups_max + 1)
 		return (EINVAL);
 	linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_TEMP, M_WAITOK);
 	error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
 	if (error)
 		goto out;
 	newcred = crget();
 	p = td->td_proc;
 	PROC_LOCK(p);
 	oldcred = crcopysafe(p, newcred);
 
 	/*
 	 * cr_groups[0] holds egid. Setting the whole set from
 	 * the supplied set will cause egid to be changed too.
 	 * Keep cr_groups[0] unchanged to prevent that.
 	 */
 
 	if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) {
 		PROC_UNLOCK(p);
 		crfree(newcred);
 		goto out;
 	}
 
 	if (ngrp > 0) {
 		/*
 		 * NOTE(review): ngrp + 1 entries are written to
 		 * newcred->cr_groups below, but nothing visible here grows
 		 * that array - confirm crget()/crcopysafe() guarantee room
 		 * for up to ngroups_max + 1 groups.
 		 */
 		newcred->cr_ngroups = ngrp + 1;
 
 		bsd_gidset = newcred->cr_groups;
 		ngrp--;
 		/* Copy back to front, shifted by one slot past the egid. */
 		while (ngrp >= 0) {
 			bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
 			ngrp--;
 		}
 	} else
 		newcred->cr_ngroups = 1;	/* only the egid remains */
 
 	setsugid(p);
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	error = 0;
 out:
 	free(linux_gidset, M_TEMP);
 	return (error);
 }
 
 int
 linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
 {
 	struct ucred *cred;
 	l_gid_t *linux_gidset;
 	gid_t *bsd_gidset;
 	int bsd_gidsetsz, ngrp, error;
 
 	cred = td->td_ucred;
 	bsd_gidset = cred->cr_groups;
 	bsd_gidsetsz = cred->cr_ngroups - 1;
 
 	/*
 	 * cr_groups[0] holds egid. Returning the whole set
 	 * here will cause a duplicate. Exclude cr_groups[0]
 	 * to prevent that.
 	 */
 
 	if ((ngrp = args->gidsetsize) == 0) {
 		td->td_retval[0] = bsd_gidsetsz;
 		return (0);
 	}
 
 	if (ngrp < bsd_gidsetsz)
 		return (EINVAL);
 
 	ngrp = 0;
 	linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
 	    M_TEMP, M_WAITOK);
 	while (ngrp < bsd_gidsetsz) {
 		linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
 		ngrp++;
 	}
 
 	error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
 	free(linux_gidset, M_TEMP);
 	if (error)
 		return (error);
 
 	td->td_retval[0] = ngrp;
 	return (0);
 }
 
 int
 linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
 {
 	struct rlimit bsd_rlim;
 	struct l_rlimit rlim;
 	u_int which;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(setrlimit))
 		printf(ARGS(setrlimit, "%d, %p"),
 		    args->resource, (void *)args->rlim);
 #endif
 
 	if (args->resource >= LINUX_RLIM_NLIMITS)
 		return (EINVAL);
 
 	which = linux_to_bsd_resource[args->resource];
 	if (which == -1)
 		return (EINVAL);
 
 	error = copyin(args->rlim, &rlim, sizeof(rlim));
 	if (error)
 		return (error);
 
 	bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
 	bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
 	return (kern_setrlimit(td, which, &bsd_rlim));
 }
 
 int
 linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
 {
 	struct l_rlimit rlim;
 	struct proc *p = td->td_proc;
 	struct rlimit bsd_rlim;
 	u_int which;
 
 #ifdef DEBUG
 	if (ldebug(old_getrlimit))
 		printf(ARGS(old_getrlimit, "%d, %p"),
 		    args->resource, (void *)args->rlim);
 #endif
 
 	if (args->resource >= LINUX_RLIM_NLIMITS)
 		return (EINVAL);
 
 	which = linux_to_bsd_resource[args->resource];
 	if (which == -1)
 		return (EINVAL);
 
 	PROC_LOCK(p);
 	lim_rlimit(p, which, &bsd_rlim);
 	PROC_UNLOCK(p);
 
 #ifdef COMPAT_LINUX32
 	rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
 	if (rlim.rlim_cur == UINT_MAX)
 		rlim.rlim_cur = INT_MAX;
 	rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
 	if (rlim.rlim_max == UINT_MAX)
 		rlim.rlim_max = INT_MAX;
 #else
 	rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
 	if (rlim.rlim_cur == ULONG_MAX)
 		rlim.rlim_cur = LONG_MAX;
 	rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
 	if (rlim.rlim_max == ULONG_MAX)
 		rlim.rlim_max = LONG_MAX;
 #endif
 	return (copyout(&rlim, args->rlim, sizeof(rlim)));
 }
 
 int
 linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
 {
 	struct l_rlimit rlim;
 	struct proc *p = td->td_proc;
 	struct rlimit bsd_rlim;
 	u_int which;
 
 #ifdef DEBUG
 	if (ldebug(getrlimit))
 		printf(ARGS(getrlimit, "%d, %p"),
 		    args->resource, (void *)args->rlim);
 #endif
 
 	if (args->resource >= LINUX_RLIM_NLIMITS)
 		return (EINVAL);
 
 	which = linux_to_bsd_resource[args->resource];
 	if (which == -1)
 		return (EINVAL);
 
 	PROC_LOCK(p);
 	lim_rlimit(p, which, &bsd_rlim);
 	PROC_UNLOCK(p);
 
 	rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
 	rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
 	return (copyout(&rlim, args->rlim, sizeof(rlim)));
 }
 
 int
 linux_sched_setscheduler(struct thread *td,
     struct linux_sched_setscheduler_args *args)
 {
 	struct sched_setscheduler_args bsd;
 
 #ifdef DEBUG
 	if (ldebug(sched_setscheduler))
 		printf(ARGS(sched_setscheduler, "%d, %d, %p"),
 		    args->pid, args->policy, (const void *)args->param);
 #endif
 
 	switch (args->policy) {
 	case LINUX_SCHED_OTHER:
 		bsd.policy = SCHED_OTHER;
 		break;
 	case LINUX_SCHED_FIFO:
 		bsd.policy = SCHED_FIFO;
 		break;
 	case LINUX_SCHED_RR:
 		bsd.policy = SCHED_RR;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	bsd.pid = args->pid;
 	bsd.param = (struct sched_param *)args->param;
 	return (sys_sched_setscheduler(td, &bsd));
 }
 
 /*
  * Linux sched_getscheduler(2): query the policy through
  * sys_sched_getscheduler() and rewrite the native policy constant left in
  * td->td_retval[0] into its Linux counterpart.
  *
  * Returns the error from sys_sched_getscheduler().  The return value is
  * translated only on success, so a failed call never reinterprets
  * whatever happened to be in td->td_retval[0].
  */
 int
 linux_sched_getscheduler(struct thread *td,
     struct linux_sched_getscheduler_args *args)
 {
 	struct sched_getscheduler_args bsd;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(sched_getscheduler))
 		printf(ARGS(sched_getscheduler, "%d"), args->pid);
 #endif
 
 	bsd.pid = args->pid;
 	error = sys_sched_getscheduler(td, &bsd);
 	if (error != 0)
 		return (error);
 
 	switch (td->td_retval[0]) {
 	case SCHED_OTHER:
 		td->td_retval[0] = LINUX_SCHED_OTHER;
 		break;
 	case SCHED_FIFO:
 		td->td_retval[0] = LINUX_SCHED_FIFO;
 		break;
 	case SCHED_RR:
 		td->td_retval[0] = LINUX_SCHED_RR;
 		break;
 	default:
 		/* Policies without a Linux counterpart pass through as is. */
 		break;
 	}
 
 	return (0);
 }
 
 int
 linux_sched_get_priority_max(struct thread *td,
     struct linux_sched_get_priority_max_args *args)
 {
 	struct sched_get_priority_max_args bsd;
 
 #ifdef DEBUG
 	if (ldebug(sched_get_priority_max))
 		printf(ARGS(sched_get_priority_max, "%d"), args->policy);
 #endif
 
 	switch (args->policy) {
 	case LINUX_SCHED_OTHER:
 		bsd.policy = SCHED_OTHER;
 		break;
 	case LINUX_SCHED_FIFO:
 		bsd.policy = SCHED_FIFO;
 		break;
 	case LINUX_SCHED_RR:
 		bsd.policy = SCHED_RR;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (sys_sched_get_priority_max(td, &bsd));
 }
 
 int
 linux_sched_get_priority_min(struct thread *td,
     struct linux_sched_get_priority_min_args *args)
 {
 	struct sched_get_priority_min_args bsd;
 
 #ifdef DEBUG
 	if (ldebug(sched_get_priority_min))
 		printf(ARGS(sched_get_priority_min, "%d"), args->policy);
 #endif
 
 	switch (args->policy) {
 	case LINUX_SCHED_OTHER:
 		bsd.policy = SCHED_OTHER;
 		break;
 	case LINUX_SCHED_FIFO:
 		bsd.policy = SCHED_FIFO;
 		break;
 	case LINUX_SCHED_RR:
 		bsd.policy = SCHED_RR;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (sys_sched_get_priority_min(td, &bsd));
 }
 
 /* Command and magic values checked by linux_reboot(). */
 #define REBOOT_CAD_ON	0x89abcdef
 #define REBOOT_CAD_OFF	0
 #define REBOOT_HALT	0xcdef0123
 #define REBOOT_RESTART	0x01234567
 #define REBOOT_RESTART2	0xA1B2C3D4
 #define REBOOT_POWEROFF	0x4321FEDC
 #define REBOOT_MAGIC1	0xfee1dead
 #define REBOOT_MAGIC2	0x28121969
 #define REBOOT_MAGIC2A	0x05121996
 #define REBOOT_MAGIC2B	0x16041998
 
 /*
  * Linux reboot(2): validate both magic numbers, then translate the command
  * into a native reboot option and call sys_reboot().
  *
  * The two CAD commands only perform the PRIV_REBOOT privilege check and
  * return its result.  A bad magic number or an unknown command returns
  * EINVAL.
  */
 int
 linux_reboot(struct thread *td, struct linux_reboot_args *args)
 {
 	struct reboot_args bsd_args;
 	int howto;
 
 #ifdef DEBUG
 	if (ldebug(reboot))
 		printf(ARGS(reboot, "0x%x"), args->cmd);
 #endif
 
 	if (args->magic1 != REBOOT_MAGIC1)
 		return (EINVAL);
 	if (args->magic2 != REBOOT_MAGIC2 &&
 	    args->magic2 != REBOOT_MAGIC2A &&
 	    args->magic2 != REBOOT_MAGIC2B)
 		return (EINVAL);
 
 	switch (args->cmd) {
 	case REBOOT_CAD_ON:
 	case REBOOT_CAD_OFF:
 		return (priv_check(td, PRIV_REBOOT));
 	case REBOOT_HALT:
 		howto = RB_HALT;
 		break;
 	case REBOOT_RESTART:
 	case REBOOT_RESTART2:
 		howto = 0;
 		break;
 	case REBOOT_POWEROFF:
 		howto = RB_POWEROFF;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	bsd_args.opt = howto;
 	return (sys_reboot(td, &bsd_args));
 }
 
 
 /*
  * The FreeBSD native getpid(2), getgid(2) and getuid(2) also modify
  * td->td_retval[1] when COMPAT_43 is defined. This clobbers registers that
  * are assumed to be preserved. The following lightweight syscalls fix
  * this. See also linux_getgid16() and linux_getuid16() in linux_uid16.c
  *
  * linux_getpid() - MP SAFE
  * linux_getgid() - MP SAFE
  * linux_getuid() - MP SAFE
  */
 
 /*
  * Linux getpid(2).  When linux_use26() is true for this thread the pid
  * reported is the group_pid stored in the shared emuldata; otherwise it is
  * the pid of the calling process.  Only td_retval[0] is written.
  */
 int
 linux_getpid(struct thread *td, struct linux_getpid_args *args)
 {
 	struct linux_emuldata *em;
 
 #ifdef DEBUG
 	if (ldebug(getpid))
 		printf(ARGS(getpid, ""));
 #endif
 
 	if (!linux_use26(td)) {
 		td->td_retval[0] = td->td_proc->p_pid;
 		return (0);
 	}
 
 	em = em_find(td->td_proc, EMUL_DONTLOCK);
 	KASSERT(em != NULL, ("getpid: emuldata not found.\n"));
 	td->td_retval[0] = em->shared->group_pid;
 	return (0);
 }
 
 int
 linux_gettid(struct thread *td, struct linux_gettid_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(gettid))
 		printf(ARGS(gettid, ""));
 #endif
 
 	td->td_retval[0] = td->td_proc->p_pid;
 	return (0);
 }
 
 
 /*
  * Linux getppid(2).
  *
  * When linux_use26() is false the parent pid of the calling process is
  * returned directly.  Otherwise the parent is looked up through the group
  * leader recorded in the shared emuldata; if that parent is itself a
  * Linux process its group_pid is reported instead of its own pid.
  *
  * Always returns 0.  If the group leader cannot be found, td_retval[0] is
  * left as it was.
  */
 int
 linux_getppid(struct thread *td, struct linux_getppid_args *args)
 {
 	struct linux_emuldata *em;
 	struct proc *p, *pp;
 
 #ifdef DEBUG
 	if (ldebug(getppid))
 		printf(ARGS(getppid, ""));
 #endif
 
 	if (!linux_use26(td)) {
 		PROC_LOCK(td->td_proc);
 		td->td_retval[0] = td->td_proc->p_pptr->p_pid;
 		PROC_UNLOCK(td->td_proc);
 		return (0);
 	}
 
 	em = em_find(td->td_proc, EMUL_DONTLOCK);
 
 	KASSERT(em != NULL, ("getppid: process emuldata not found.\n"));
 
 	/* find the group leader */
 	/*
 	 * NOTE(review): p is unlocked below without being locked in this
 	 * function, so pfind() presumably returns it locked -- confirm.
 	 */
 	p = pfind(em->shared->group_pid);
 
 	if (p == NULL) {
 #ifdef DEBUG
 	   	printf(LMSG("parent process not found.\n"));
 #endif
 		return (0);
 	}
 
 	/* The parent is locked before the group leader is released. */
 	pp = p->p_pptr;		/* switch to parent */
 	PROC_LOCK(pp);
 	PROC_UNLOCK(p);
 
 	/* If the parent is also a Linux process, report its group pid. */
 	if (pp->p_sysent == &elf_linux_sysvec) {
 		em = em_find(pp, EMUL_DONTLOCK);
 		KASSERT(em != NULL, ("getppid: parent emuldata not found.\n"));
 
 		td->td_retval[0] = em->shared->group_pid;
 	} else
 		td->td_retval[0] = pp->p_pid;
 
 	PROC_UNLOCK(pp);
 
 	return (0);
 }
 
 int
 linux_getgid(struct thread *td, struct linux_getgid_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(getgid))
 		printf(ARGS(getgid, ""));
 #endif
 
 	td->td_retval[0] = td->td_ucred->cr_rgid;
 	return (0);
 }
 
 int
 linux_getuid(struct thread *td, struct linux_getuid_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(getuid))
 		printf(ARGS(getuid, ""));
 #endif
 
 	td->td_retval[0] = td->td_ucred->cr_ruid;
 	return (0);
 }
 
 
 int
 linux_getsid(struct thread *td, struct linux_getsid_args *args)
 {
 	struct getsid_args bsd;
 
 #ifdef DEBUG
 	if (ldebug(getsid))
 		printf(ARGS(getsid, "%i"), args->pid);
 #endif
 
 	bsd.pid = args->pid;
 	return (sys_getsid(td, &bsd));
 }
 
 int
 linux_nosys(struct thread *td, struct nosys_args *ignore)
 {
 
 	return (ENOSYS);
 }
 
 /*
  * Linux getpriority(2): call sys_getpriority() and, on success, replace
  * the value in td->td_retval[0] with 20 minus that value.
  *
  * Returns the error from sys_getpriority().  The return slot is adjusted
  * only on success, so a failed call leaves td->td_retval[0] untouched.
  */
 int
 linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
 {
 	struct getpriority_args bsd_args;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(getpriority))
 		printf(ARGS(getpriority, "%i, %i"), args->which, args->who);
 #endif
 
 	bsd_args.which = args->which;
 	bsd_args.who = args->who;
 	error = sys_getpriority(td, &bsd_args);
 	if (error != 0)
 		return (error);
 
 	td->td_retval[0] = 20 - td->td_retval[0];
 	return (0);
 }
 
 /*
  * Linux sethostname(2): write the user-supplied name of args->len bytes to
  * the kern.hostname sysctl (CTL_KERN, KERN_HOSTNAME) via userland_sysctl()
  * and return its result.
  */
 int
 linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
 {
 	int mib[2] = { CTL_KERN, KERN_HOSTNAME };
 
 #ifdef DEBUG
 	if (ldebug(sethostname))
 		printf(ARGS(sethostname, "*, %i"), args->len);
 #endif
 
 	return (userland_sysctl(td, mib, 2, 0, 0, 0, args->hostname,
 	    args->len, 0, 0));
 }
 
 /*
  * Linux setdomainname(2): write the user-supplied name of args->len bytes
  * to the sysctl identified by (CTL_KERN, KERN_NISDOMAINNAME) via
  * userland_sysctl() and return its result.
  */
 int
 linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
 {
 	int mib[2] = { CTL_KERN, KERN_NISDOMAINNAME };
 
 #ifdef DEBUG
 	if (ldebug(setdomainname))
 		printf(ARGS(setdomainname, "*, %i"), args->len);
 #endif
 
 	return (userland_sysctl(td, mib, 2, 0, 0, 0, args->name,
 	    args->len, 0, 0));
 }
 
 /*
  * Linux exit_group(2).
  *
  * If the shared emuldata is referenced more than once, the exit status is
  * recorded in it under the shared write lock and, when linux_use26() is
  * true, linux_kill_threads() is called with SIGKILL.  The calling process
  * then exits through exit1() with the same status.
  */
 int
 linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
 {
 	struct linux_emuldata *em;
 
 #ifdef DEBUG
 	if (ldebug(exit_group))
 		printf(ARGS(exit_group, "%i"), args->error_code);
 #endif
 
 	/* NOTE(review): unlike other em_find() callers, em is not asserted. */
 	em = em_find(td->td_proc, EMUL_DONTLOCK);
 	if (em->shared->refs > 1) {
 		/* Publish the exit status in the shared emuldata. */
 		EMUL_SHARED_WLOCK(&emul_shared_lock);
 		em->shared->flags |= EMUL_SHARED_HASXSTAT;
 		em->shared->xstat = W_EXITCODE(args->error_code, 0);
 		EMUL_SHARED_WUNLOCK(&emul_shared_lock);
 		if (linux_use26(td))
 			linux_kill_threads(td, SIGKILL);
 	}
 
 	/*
 	 * XXX: we should send a signal to the parent if
 	 * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
 	 * as it doesn't occur often.
 	 */
 	exit1(td, W_EXITCODE(args->error_code, 0));
 
 	return (0);
 }
 
 /* The only capability header version accepted by capget/capset below. */
 #define _LINUX_CAPABILITY_VERSION  0x19980330
 
 /* Header copied in from userland by linux_capget() and linux_capset(). */
 struct l_user_cap_header {
 	l_int	version;	/* compared with _LINUX_CAPABILITY_VERSION */
 	l_int	pid;		/* a non-zero value is rejected with EPERM */
 };
 
 /* Capability sets exchanged with userland by capget/capset. */
 struct l_user_cap_data {
 	l_int	effective;
 	l_int	permitted;
 	l_int	inheritable;
 };
 
 int
 linux_capget(struct thread *td, struct linux_capget_args *args)
 {
 	struct l_user_cap_header luch;
 	struct l_user_cap_data lucd;
 	int error;
 
 	if (args->hdrp == NULL)
 		return (EFAULT);
 
 	error = copyin(args->hdrp, &luch, sizeof(luch));
 	if (error != 0)
 		return (error);
 
 	if (luch.version != _LINUX_CAPABILITY_VERSION) {
 		luch.version = _LINUX_CAPABILITY_VERSION;
 		error = copyout(&luch, args->hdrp, sizeof(luch));
 		if (error)
 			return (error);
 		return (EINVAL);
 	}
 
 	if (luch.pid)
 		return (EPERM);
 
 	if (args->datap) {
 		/*
 		 * The current implementation doesn't support setting
 		 * a capability (it's essentially a stub) so indicate
 		 * that no capabilities are currently set or available
 		 * to request.
 		 */
 		bzero (&lucd, sizeof(lucd));
 		error = copyout(&lucd, args->datap, sizeof(lucd));
 	}
 
 	return (error);
 }
 
 int
 linux_capset(struct thread *td, struct linux_capset_args *args)
 {
 	struct l_user_cap_header luch;
 	struct l_user_cap_data lucd;
 	int error;
 
 	if (args->hdrp == NULL || args->datap == NULL)
 		return (EFAULT);
 
 	error = copyin(args->hdrp, &luch, sizeof(luch));
 	if (error != 0)
 		return (error);
 
 	if (luch.version != _LINUX_CAPABILITY_VERSION) {
 		luch.version = _LINUX_CAPABILITY_VERSION;
 		error = copyout(&luch, args->hdrp, sizeof(luch));
 		if (error)
 			return (error);
 		return (EINVAL);
 	}
 
 	if (luch.pid)
 		return (EPERM);
 
 	error = copyin(args->datap, &lucd, sizeof(lucd));
 	if (error != 0)
 		return (error);
 
 	/* We currently don't support setting any capabilities. */
 	if (lucd.effective || lucd.permitted || lucd.inheritable) {
 		linux_msg(td,
 			  "capset effective=0x%x, permitted=0x%x, "
 			  "inheritable=0x%x is not implemented",
 			  (int)lucd.effective, (int)lucd.permitted,
 			  (int)lucd.inheritable);
 		return (EPERM);
 	}
 
 	return (0);
 }
 
 /*
  * Linux prctl(2).  Handled options:
  *
  *   LINUX_PR_SET_PDEATHSIG / LINUX_PR_GET_PDEATHSIG
  *	store or copy out the pdeath_signal kept in the emuldata;
  *   LINUX_PR_GET_KEEPCAPS / LINUX_PR_SET_KEEPCAPS
  *	report 0 / accept and ignore the request;
  *   LINUX_PR_SET_NAME / LINUX_PR_GET_NAME
  *	copy the process name (p_comm) in from or out to userland.
  *
  * Any other option returns EINVAL.  Otherwise returns 0 or the error from
  * the copy routine involved.
  */
 int
 linux_prctl(struct thread *td, struct linux_prctl_args *args)
 {
 	int error = 0, max_size;
 	struct proc *p = td->td_proc;
 	char comm[LINUX_MAX_COMM_LEN];
 	struct linux_emuldata *em;
 	int pdeath_signal;
 
 #ifdef DEBUG
 	if (ldebug(prctl))
 		printf(ARGS(prctl, "%d, %d, %d, %d, %d"), args->option,
 		    args->arg2, args->arg3, args->arg4, args->arg5);
 #endif
 
 	switch (args->option) {
 	case LINUX_PR_SET_PDEATHSIG:
 		if (!LINUX_SIG_VALID(args->arg2))
 			return (EINVAL);
 		/* em_find(EMUL_DOLOCK) is paired with EMUL_UNLOCK() below. */
 		em = em_find(p, EMUL_DOLOCK);
 		KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
 		em->pdeath_signal = args->arg2;
 		EMUL_UNLOCK(&emul_lock);
 		break;
 	case LINUX_PR_GET_PDEATHSIG:
 		/* Snapshot the value first; copyout runs after the unlock. */
 		em = em_find(p, EMUL_DOLOCK);
 		KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
 		pdeath_signal = em->pdeath_signal;
 		EMUL_UNLOCK(&emul_lock);
 		error = copyout(&pdeath_signal,
 		    (void *)(register_t)args->arg2,
 		    sizeof(pdeath_signal));
 		break;
 	case LINUX_PR_GET_KEEPCAPS:
 		/*
 		 * Indicate that we always clear the effective and
 		 * permitted capability sets when the user id becomes
 		 * non-zero (actually the capability sets are simply
 		 * always zero in the current implementation).
 		 */
 		td->td_retval[0] = 0;
 		break;
 	case LINUX_PR_SET_KEEPCAPS:
 		/*
 		 * Ignore requests to keep the effective and permitted
 		 * capability sets when the user id becomes non-zero.
 		 */
 		break;
 	case LINUX_PR_SET_NAME:
 		/*
 		 * To be on the safe side we need to make sure to not
 		 * overflow the size a linux program expects. We already
 		 * do this here in the copyin, so that we don't need to
 		 * check on copyout.
 		 */
 		max_size = MIN(sizeof(comm), sizeof(p->p_comm));
 		error = copyinstr((void *)(register_t)args->arg2, comm,
 		    max_size, NULL);
 
 		/* Linux silently truncates the name if it is too long. */
 		if (error == ENAMETOOLONG) {
 			/*
 			 * XXX: copyinstr() isn't documented to populate the
 			 * array completely, so do a copyin() to be on the
 			 * safe side. This should be changed in case
 			 * copyinstr() is changed to guarantee this.
 			 */
 			error = copyin((void *)(register_t)args->arg2, comm,
 			    max_size - 1);
 			comm[max_size - 1] = '\0';
 		}
 		if (error)
 			return (error);
 
 		/* p_comm is updated with the process lock held. */
 		PROC_LOCK(p);
 		strlcpy(p->p_comm, comm, sizeof(p->p_comm));
 		PROC_UNLOCK(p);
 		break;
 	case LINUX_PR_GET_NAME:
 		/* Copy into a local buffer under the lock, copyout after. */
 		PROC_LOCK(p);
 		strlcpy(comm, p->p_comm, sizeof(comm));
 		PROC_UNLOCK(p);
 		error = copyout(comm, (void *)(register_t)args->arg2,
 		    strlen(comm) + 1);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Get affinity of a process.
  */
 int
 linux_sched_getaffinity(struct thread *td,
     struct linux_sched_getaffinity_args *args)
 {
 	int error;
 	struct cpuset_getaffinity_args cga;
 
 #ifdef DEBUG
 	if (ldebug(sched_getaffinity))
 		printf(ARGS(sched_getaffinity, "%d, %d, *"), args->pid,
 		    args->len);
 #endif
 	if (args->len < sizeof(cpuset_t))
 		return (EINVAL);
 
 	cga.level = CPU_LEVEL_WHICH;
 	cga.which = CPU_WHICH_PID;
 	cga.id = args->pid;
 	cga.cpusetsize = sizeof(cpuset_t);
 	cga.mask = (cpuset_t *) args->user_mask_ptr;
 
 	if ((error = sys_cpuset_getaffinity(td, &cga)) == 0)
 		td->td_retval[0] = sizeof(cpuset_t);
 
 	return (error);
 }
 
 /*
  *  Set affinity of a process.
  */
 int
 linux_sched_setaffinity(struct thread *td,
     struct linux_sched_setaffinity_args *args)
 {
 	struct cpuset_setaffinity_args csa;
 
 #ifdef DEBUG
 	if (ldebug(sched_setaffinity))
 		printf(ARGS(sched_setaffinity, "%d, %d, *"), args->pid,
 		    args->len);
 #endif
 	if (args->len < sizeof(cpuset_t))
 		return (EINVAL);
 
 	csa.level = CPU_LEVEL_WHICH;
 	csa.which = CPU_WHICH_PID;
 	csa.id = args->pid;
 	csa.cpusetsize = sizeof(cpuset_t);
 	csa.mask = (cpuset_t *) args->user_mask_ptr;
 
 	return (sys_cpuset_setaffinity(td, &csa));
 }
Index: head/sys/compat/linux/linux_sysctl.c
===================================================================
--- head/sys/compat/linux/linux_sysctl.c	(revision 258540)
+++ head/sys/compat/linux/linux_sysctl.c	(revision 258541)
@@ -1,194 +1,193 @@
 /*-
  * Copyright (c) 2001 Marcel Moolenaar
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/sbuf.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_misc.h>
 #include <compat/linux/linux_util.h>
 
 #define	LINUX_CTL_KERN		1
 #define	LINUX_CTL_VM		2
 #define	LINUX_CTL_NET		3
 #define	LINUX_CTL_PROC		4
 #define	LINUX_CTL_FS		5
 #define	LINUX_CTL_DEBUG		6
 #define	LINUX_CTL_DEV		7
 #define	LINUX_CTL_BUS		8
 
 /* CTL_KERN names */
 #define	LINUX_KERN_OSTYPE	1
 #define	LINUX_KERN_OSRELEASE	2
 #define	LINUX_KERN_OSREV	3
 #define	LINUX_KERN_VERSION	4
 
 /* DTrace init */
 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
 
 /**
  * DTrace probes in this module.
  */
 LIN_SDT_PROBE_DEFINE2(sysctl, handle_string, entry, "struct l___sysctl_args *",
     "char *");
 LIN_SDT_PROBE_DEFINE1(sysctl, handle_string, copyout_error, "int");
 LIN_SDT_PROBE_DEFINE1(sysctl, handle_string, return, "int");
 LIN_SDT_PROBE_DEFINE2(sysctl, linux_sysctl, entry, "struct l___sysctl_args *",
     "struct thread *");
 LIN_SDT_PROBE_DEFINE1(sysctl, linux_sysctl, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE2(sysctl, linux_sysctl, wrong_length, "int", "int");
 LIN_SDT_PROBE_DEFINE1(sysctl, linux_sysctl, unsupported_sysctl, "char *");
 LIN_SDT_PROBE_DEFINE1(sysctl, linux_sysctl, return, "int");
 
 /*
  * Serve a read-only string sysctl.
  *
  * When la->oldval is set, the string (including its terminating NUL) is
  * copied out to it and, if la->oldlenp is set too, the string length
  * (without the NUL) is copied out there.  A copyout failure is returned
  * immediately.  A request that supplies la->newval fails with ENOTDIR;
  * otherwise 0 is returned.
  */
 static int
 handle_string(struct l___sysctl_args *la, char *value)
 {
 	l_int vlen;
 	int rc;
 
 	LIN_SDT_PROBE2(sysctl, handle_string, entry, la, value);
 
 	if (la->oldval != 0) {
 		vlen = strlen(value);
 		rc = copyout(value, PTRIN(la->oldval), vlen + 1);
 		if (rc == 0 && la->oldlenp != 0)
 			rc = copyout(&vlen, PTRIN(la->oldlenp), sizeof(vlen));
 		if (rc != 0) {
 			LIN_SDT_PROBE1(sysctl, handle_string, copyout_error,
 			    rc);
 			LIN_SDT_PROBE1(sysctl, handle_string, return, rc);
 			return (rc);
 		}
 	}
 
 	/* Writing a new value is not supported. */
 	rc = (la->newval != 0) ? ENOTDIR : 0;
 
 	LIN_SDT_PROBE1(sysctl, handle_string, return, rc);
 	return (rc);
 }
 
 /*
  * Linux _sysctl(2).
  *
  * Copies in the argument block and the MIB name vector.  The only name
  * served is { LINUX_CTL_KERN, LINUX_KERN_VERSION }, which is answered by
  * handle_string() with the kernel version string.  Every other name is
  * logged through linux_msg() as not implemented and fails with ENOTDIR.
  *
  * Returns the copyin error, ENOTDIR for a name length outside
  * 1..LINUX_CTL_MAXNAME or an unsupported name, or the result of
  * handle_string().
  */
 int
 linux_sysctl(struct thread *td, struct linux_sysctl_args *args)
 {
 	struct l___sysctl_args la;
 	struct sbuf *sb;
 	l_int *mib;
 	char *sysctl_string;
 	int error, i;
 
 	/*
 	 * The entry probe is defined with the argument block first and the
 	 * thread second; pass the values in that order.
 	 */
 	LIN_SDT_PROBE2(sysctl, linux_sysctl, entry, args->args, td);
 
 	error = copyin(args->args, &la, sizeof(la));
 	if (error) {
 		LIN_SDT_PROBE1(sysctl, linux_sysctl, copyin_error, error);
 		LIN_SDT_PROBE1(sysctl, linux_sysctl, return, error);
 		return (error);
 	}
 
 	if (la.nlen <= 0 || la.nlen > LINUX_CTL_MAXNAME) {
 		LIN_SDT_PROBE2(sysctl, linux_sysctl, wrong_length, la.nlen,
 		    LINUX_CTL_MAXNAME);
 		LIN_SDT_PROBE1(sysctl, linux_sysctl, return, ENOTDIR);
 		return (ENOTDIR);
 	}
 
 	mib = malloc(la.nlen * sizeof(l_int), M_TEMP, M_WAITOK);
 	error = copyin(PTRIN(la.name), mib, la.nlen * sizeof(l_int));
 	if (error) {
 		LIN_SDT_PROBE1(sysctl, linux_sysctl, copyin_error, error);
 		LIN_SDT_PROBE1(sysctl, linux_sysctl, return, error);
 		free(mib, M_TEMP);
 		return (error);
 	}
 
 	/* mib[1] is only valid when at least two components were given. */
 	if (mib[0] == LINUX_CTL_KERN && la.nlen >= 2 &&
 	    mib[1] == LINUX_KERN_VERSION) {
 		error = handle_string(&la, version);
 		free(mib, M_TEMP);
 		LIN_SDT_PROBE1(sysctl, linux_sysctl, return, error);
 		return (error);
 	}
 
 	/* Unsupported name: log it in the form "sysctl {a,b,...}". */
 	sb = sbuf_new(NULL, NULL, 20 + la.nlen * 5, SBUF_AUTOEXTEND);
 	if (sb == NULL) {
 		linux_msg(td, "sysctl is not implemented");
 		LIN_SDT_PROBE1(sysctl, linux_sysctl, unsupported_sysctl,
 		    "unknown sysctl, ENOMEM during lookup");
 	} else {
 		sbuf_printf(sb, "sysctl ");
 		for (i = 0; i < la.nlen; i++)
 			sbuf_printf(sb, "%c%d", (i) ? ',' : '{', mib[i]);
 		sbuf_printf(sb, "} is not implemented");
 		sbuf_finish(sb);
 		sysctl_string = sbuf_data(sb);
 		linux_msg(td, "%s", sysctl_string);
 		LIN_SDT_PROBE1(sysctl, linux_sysctl, unsupported_sysctl,
 		    sysctl_string);
 		sbuf_delete(sb);
 	}
 
 	free(mib, M_TEMP);
 
 	LIN_SDT_PROBE1(sysctl, linux_sysctl, return, ENOTDIR);
 	return (ENOTDIR);
 }
Index: head/sys/compat/linux/linux_time.c
===================================================================
--- head/sys/compat/linux/linux_time.c	(revision 258540)
+++ head/sys/compat/linux/linux_time.c	(revision 258541)
@@ -1,417 +1,416 @@
 /*	$NetBSD: linux_time.c,v 1.14 2006/05/14 03:40:54 christos Exp $ */
 
 /*-
  * Copyright (c) 2001 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Emmanuel Dreyfus.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #if 0
 __KERNEL_RCSID(0, "$NetBSD: linux_time.c,v 1.14 2006/05/14 03:40:54 christos Exp $");
 #endif
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/ucred.h>
 #include <sys/mount.h>
 #include <sys/sdt.h>
 #include <sys/signal.h>
 #include <sys/stdint.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_misc.h>
 
 /* DTrace init */
 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
 
 /**
  * DTrace probes in this module.
  */
 LIN_SDT_PROBE_DEFINE2(time, native_to_linux_timespec, entry,
     "struct l_timespec *", "struct timespec *");
 LIN_SDT_PROBE_DEFINE0(time, native_to_linux_timespec, return);
 LIN_SDT_PROBE_DEFINE2(time, linux_to_native_timespec, entry,
     "struct timespec *", "struct l_timespec *");
 LIN_SDT_PROBE_DEFINE1(time, linux_to_native_timespec, return, "int");
 LIN_SDT_PROBE_DEFINE2(time, linux_to_native_clockid, entry, "clockid_t *",
     "clockid_t");
 LIN_SDT_PROBE_DEFINE1(time, linux_to_native_clockid, unsupported_clockid,
     "clockid_t");
 LIN_SDT_PROBE_DEFINE1(time, linux_to_native_clockid, unknown_clockid,
     "clockid_t");
 LIN_SDT_PROBE_DEFINE1(time, linux_to_native_clockid, return, "int");
 LIN_SDT_PROBE_DEFINE2(time, linux_clock_gettime, entry, "clockid_t",
     "struct l_timespec *");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_gettime, conversion_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_gettime, gettime_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_gettime, copyout_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_gettime, return, "int");
 LIN_SDT_PROBE_DEFINE2(time, linux_clock_settime, entry, "clockid_t",
     "struct l_timespec *");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_settime, conversion_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_settime, settime_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_settime, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_settime, return, "int");
 LIN_SDT_PROBE_DEFINE2(time, linux_clock_getres, entry, "clockid_t",
     "struct l_timespec *");
 LIN_SDT_PROBE_DEFINE0(time, linux_clock_getres, nullcall);
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_getres, conversion_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_getres, getres_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_getres, copyout_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_getres, return, "int");
 LIN_SDT_PROBE_DEFINE2(time, linux_nanosleep, entry, "const struct l_timespec *",
     "struct l_timespec *");
 LIN_SDT_PROBE_DEFINE1(time, linux_nanosleep, conversion_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_nanosleep, nanosleep_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_nanosleep, copyout_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_nanosleep, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_nanosleep, return, "int");
 LIN_SDT_PROBE_DEFINE4(time, linux_clock_nanosleep, entry, "clockid_t", "int",
     "struct l_timespec *", "struct l_timespec *");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_nanosleep, conversion_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_nanosleep, nanosleep_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_nanosleep, copyout_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_nanosleep, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_nanosleep, unsupported_flags, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_nanosleep, unsupported_clockid, "int");
 LIN_SDT_PROBE_DEFINE1(time, linux_clock_nanosleep, return, "int");
 
 static void native_to_linux_timespec(struct l_timespec *,
 				     struct timespec *);
 static int linux_to_native_timespec(struct timespec *,
 				     struct l_timespec *);
 static int linux_to_native_clockid(clockid_t *, clockid_t);
 
 /*
  * Copy both fields of the native timespec *ntp into the Linux timespec
  * *ltp.  No range checking is done.
  */
 static void
 native_to_linux_timespec(struct l_timespec *ltp, struct timespec *ntp)
 {
 
 	LIN_SDT_PROBE2(time, native_to_linux_timespec, entry, ltp, ntp);
 
 	ltp->tv_nsec = ntp->tv_nsec;
 	ltp->tv_sec = ntp->tv_sec;
 
 	LIN_SDT_PROBE0(time, native_to_linux_timespec, return);
 }
 
 /*
  * Validate the Linux timespec *ltp and copy it into the native timespec
  * *ntp.
  *
  * Returns EINVAL, leaving *ntp untouched, when tv_sec is negative or
  * tv_nsec lies outside 0..999999999; returns 0 otherwise.
  */
 static int
 linux_to_native_timespec(struct timespec *ntp, struct l_timespec *ltp)
 {
 
 	LIN_SDT_PROBE2(time, linux_to_native_timespec, entry, ntp, ltp);
 
 	/* A negative nanosecond count is as invalid as one that is too big. */
 	if (ltp->tv_sec < 0 || ltp->tv_nsec < 0 ||
 	    ltp->tv_nsec > (l_long)999999999L) {
 		LIN_SDT_PROBE1(time, linux_to_native_timespec, return, EINVAL);
 		return (EINVAL);
 	}
 	ntp->tv_sec = ltp->tv_sec;
 	ntp->tv_nsec = ltp->tv_nsec;
 
 	LIN_SDT_PROBE1(time, linux_to_native_timespec, return, 0);
 	return (0);
 }
 
 /*
  * Translate the Linux clock id l into a native clock id stored in *n.
  *
  * Only LINUX_CLOCK_REALTIME and LINUX_CLOCK_MONOTONIC are translated.
  * The CPU-time and _HR ids fire the unsupported_clockid probe, anything
  * else fires unknown_clockid; both cases return EINVAL and leave *n
  * untouched.  Returns 0 on success.
  */
 static int
 linux_to_native_clockid(clockid_t *n, clockid_t l)
 {
 	int error;
 
 	LIN_SDT_PROBE2(time, linux_to_native_clockid, entry, n, l);
 
 	error = 0;
 	switch (l) {
 	case LINUX_CLOCK_REALTIME:
 		*n = CLOCK_REALTIME;
 		break;
 	case LINUX_CLOCK_MONOTONIC:
 		*n = CLOCK_MONOTONIC;
 		break;
 	case LINUX_CLOCK_PROCESS_CPUTIME_ID:
 	case LINUX_CLOCK_THREAD_CPUTIME_ID:
 	case LINUX_CLOCK_REALTIME_HR:
 	case LINUX_CLOCK_MONOTONIC_HR:
 		LIN_SDT_PROBE1(time, linux_to_native_clockid,
 		    unsupported_clockid, l);
 		error = EINVAL;
 		break;
 	default:
 		LIN_SDT_PROBE1(time, linux_to_native_clockid,
 		    unknown_clockid, l);
 		error = EINVAL;
 		break;
 	}
 
 	LIN_SDT_PROBE1(time, linux_to_native_clockid, return, error);
 	return (error);
 }
 
 int
 linux_clock_gettime(struct thread *td, struct linux_clock_gettime_args *args)
 {
 	struct l_timespec lts;
 	int error;
 	clockid_t nwhich = 0;	/* XXX: GCC */
 	struct timespec tp;
 
 	LIN_SDT_PROBE2(time, linux_clock_gettime, entry, args->which, args->tp);
 
 	error = linux_to_native_clockid(&nwhich, args->which);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_clock_gettime, conversion_error,
 		    error);
 		LIN_SDT_PROBE1(time, linux_clock_gettime, return, error);
 		return (error);
 	}
 	error = kern_clock_gettime(td, nwhich, &tp);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_clock_gettime, gettime_error, error);
 		LIN_SDT_PROBE1(time, linux_clock_gettime, return, error);
 		return (error);
 	}
 	native_to_linux_timespec(&lts, &tp);
 
 	error = copyout(&lts, args->tp, sizeof lts);
 	if (error != 0)
 		LIN_SDT_PROBE1(time, linux_clock_gettime, copyout_error, error);
 
 	LIN_SDT_PROBE1(time, linux_clock_gettime, return, error);
 	return (error);
 }
 
 int
 linux_clock_settime(struct thread *td, struct linux_clock_settime_args *args)
 {
 	struct timespec ts;
 	struct l_timespec lts;
 	int error;
 	clockid_t nwhich = 0;	/* XXX: GCC */
 
 	LIN_SDT_PROBE2(time, linux_clock_settime, entry, args->which, args->tp);
 
 	error = linux_to_native_clockid(&nwhich, args->which);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_clock_settime, conversion_error,
 		    error);
 		LIN_SDT_PROBE1(time, linux_clock_settime, return, error);
 		return (error);
 	}
 	error = copyin(args->tp, &lts, sizeof lts);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_clock_settime, copyin_error, error);
 		LIN_SDT_PROBE1(time, linux_clock_settime, return, error);
 		return (error);
 	}
 	error = linux_to_native_timespec(&ts, &lts);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_clock_settime, conversion_error,
 		    error);
 		LIN_SDT_PROBE1(time, linux_clock_settime, return, error);
 		return (error);
 	}
 
 	error = kern_clock_settime(td, nwhich, &ts);
 	if (error != 0)
 		LIN_SDT_PROBE1(time, linux_clock_settime, settime_error, error);
 
 	LIN_SDT_PROBE1(time, linux_clock_settime, return, error);
 	return (error);
 }
 
 int
 linux_clock_getres(struct thread *td, struct linux_clock_getres_args *args)
 {
 	struct timespec ts;
 	struct l_timespec lts;
 	int error;
 	clockid_t nwhich = 0;	/* XXX: GCC */
 
 	LIN_SDT_PROBE2(time, linux_clock_getres, entry, args->which, args->tp);
 
 	if (args->tp == NULL) {
 		LIN_SDT_PROBE0(time, linux_clock_getres, nullcall);
 		LIN_SDT_PROBE1(time, linux_clock_getres, return, 0);
 	  	return (0);
 	}
 
 	error = linux_to_native_clockid(&nwhich, args->which);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_clock_getres, conversion_error,
 		    error);
 		LIN_SDT_PROBE1(time, linux_clock_getres, return, error);
 		return (error);
 	}
 	error = kern_clock_getres(td, nwhich, &ts);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_clock_getres, getres_error, error);
 		LIN_SDT_PROBE1(time, linux_clock_getres, return, error);
 		return (error);
 	}
 	native_to_linux_timespec(&lts, &ts);
 
 	error = copyout(&lts, args->tp, sizeof lts);
 	if (error != 0)
 		LIN_SDT_PROBE1(time, linux_clock_getres, copyout_error, error);
 
 	LIN_SDT_PROBE1(time, linux_clock_getres, return, error);
 	return (error);
 }
 
 int
 linux_nanosleep(struct thread *td, struct linux_nanosleep_args *args)
 {
 	struct timespec *rmtp;
 	struct l_timespec lrqts, lrmts;
 	struct timespec rqts, rmts;
 	int error;
 
 	LIN_SDT_PROBE2(time, linux_nanosleep, entry, args->rqtp, args->rmtp);
 
 	error = copyin(args->rqtp, &lrqts, sizeof lrqts);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_nanosleep, copyin_error, error);
 		LIN_SDT_PROBE1(time, linux_nanosleep, return, error);
 		return (error);
 	}
 
 	if (args->rmtp != NULL)
 	   	rmtp = &rmts;
 	else
 	   	rmtp = NULL;
 
 	error = linux_to_native_timespec(&rqts, &lrqts);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_nanosleep, conversion_error, error);
 		LIN_SDT_PROBE1(time, linux_nanosleep, return, error);
 		return (error);
 	}
 	error = kern_nanosleep(td, &rqts, rmtp);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_nanosleep, nanosleep_error, error);
 		LIN_SDT_PROBE1(time, linux_nanosleep, return, error);
 		return (error);
 	}
 
 	if (args->rmtp != NULL) {
 	   	native_to_linux_timespec(&lrmts, rmtp);
 	   	error = copyout(&lrmts, args->rmtp, sizeof(lrmts));
 		if (error != 0) {
 			LIN_SDT_PROBE1(time, linux_nanosleep, copyout_error,
 			    error);
 			LIN_SDT_PROBE1(time, linux_nanosleep, return, error);
 		   	return (error);
 		}
 	}
 
 	LIN_SDT_PROBE1(time, linux_nanosleep, return, 0);
 	return (0);
 }
 
 int
 linux_clock_nanosleep(struct thread *td, struct linux_clock_nanosleep_args *args)
 {
 	struct timespec *rmtp;
 	struct l_timespec lrqts, lrmts;
 	struct timespec rqts, rmts;
 	int error;
 
 	LIN_SDT_PROBE4(time, linux_clock_nanosleep, entry, args->which,
 	    args->flags, args->rqtp, args->rmtp);
 
 	if (args->flags != 0) {
 		/* XXX deal with TIMER_ABSTIME */
 		LIN_SDT_PROBE1(time, linux_clock_nanosleep, unsupported_flags,
 		    args->flags);
 		LIN_SDT_PROBE1(time, linux_clock_nanosleep, return, EINVAL);
 		return (EINVAL);	/* XXX deal with TIMER_ABSTIME */
 	}
 
 	if (args->which != LINUX_CLOCK_REALTIME) {
 		LIN_SDT_PROBE1(time, linux_clock_nanosleep, unsupported_clockid,
 		    args->which);
 		LIN_SDT_PROBE1(time, linux_clock_nanosleep, return, EINVAL);
 		return (EINVAL);
 	}
 
 	error = copyin(args->rqtp, &lrqts, sizeof lrqts);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_clock_nanosleep, copyin_error,
 		    error);
 		LIN_SDT_PROBE1(time, linux_clock_nanosleep, return, error);
 		return (error);
 	}
 
 	if (args->rmtp != NULL)
 	   	rmtp = &rmts;
 	else
 	   	rmtp = NULL;
 
 	error = linux_to_native_timespec(&rqts, &lrqts);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_clock_nanosleep, conversion_error,
 		    error);
 		LIN_SDT_PROBE1(time, linux_clock_nanosleep, return, error);
 		return (error);
 	}
 	error = kern_nanosleep(td, &rqts, rmtp);
 	if (error != 0) {
 		LIN_SDT_PROBE1(time, linux_clock_nanosleep, nanosleep_error,
 		    error);
 		LIN_SDT_PROBE1(time, linux_clock_nanosleep, return, error);
 		return (error);
 	}
 
 	if (args->rmtp != NULL) {
 	   	native_to_linux_timespec(&lrmts, rmtp);
 	   	error = copyout(&lrmts, args->rmtp, sizeof lrmts );
 		if (error != 0) {
 			LIN_SDT_PROBE1(time, linux_clock_nanosleep,
 			    copyout_error, error);
 			LIN_SDT_PROBE1(time, linux_nanosleep, return, error);
 		   	return (error);
 		}
 	}
 
 	LIN_SDT_PROBE1(time, linux_clock_nanosleep, return, 0);
 	return (0);
 }
Index: head/sys/compat/linux/linux_uid16.c
===================================================================
--- head/sys/compat/linux/linux_uid16.c	(revision 258540)
+++ head/sys/compat/linux/linux_uid16.c	(revision 258541)
@@ -1,441 +1,440 @@
 /*-
  * Copyright (c) 2001  The FreeBSD Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 
 #include <sys/fcntl.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_util.h>
 
 /* DTrace init */
 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
 
 /**
  * DTrace probes in this module.
  *
  * Each definition names the module (uid16), the function the probe
  * belongs to and the probe itself; the quoted strings that follow give
  * the C type of each probe argument, and the digit in the macro name
  * matches the number of those arguments.  Every function below has an
  * entry and a return probe, plus extra probes for individual error or
  * conversion steps.
  */
 LIN_SDT_PROBE_DEFINE3(uid16, linux_chown16, entry, "char *", "l_uid16_t",
     "l_gid16_t");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_chown16, conv_path, "char *");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_chown16, return, "int");
 LIN_SDT_PROBE_DEFINE3(uid16, linux_lchown16, entry, "char *", "l_uid16_t",
     "l_gid16_t");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_lchown16, conv_path, "char *");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_lchown16, return, "int");
 LIN_SDT_PROBE_DEFINE2(uid16, linux_setgroups16, entry, "l_uint", "l_gid16_t *");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_setgroups16, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_setgroups16, priv_check_cred_error, "int");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_setgroups16, return, "int");
 LIN_SDT_PROBE_DEFINE2(uid16, linux_getgroups16, entry, "l_uint", "l_gid16_t *");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_getgroups16, copyout_error, "int");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_getgroups16, return, "int");
 LIN_SDT_PROBE_DEFINE0(uid16, linux_getgid16, entry);
 LIN_SDT_PROBE_DEFINE1(uid16, linux_getgid16, return, "int");
 LIN_SDT_PROBE_DEFINE0(uid16, linux_getuid16, entry);
 LIN_SDT_PROBE_DEFINE1(uid16, linux_getuid16, return, "int");
 LIN_SDT_PROBE_DEFINE0(uid16, linux_getegid16, entry);
 LIN_SDT_PROBE_DEFINE1(uid16, linux_getegid16, return, "int");
 LIN_SDT_PROBE_DEFINE0(uid16, linux_geteuid16, entry);
 LIN_SDT_PROBE_DEFINE1(uid16, linux_geteuid16, return, "int");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_setgid16, entry, "l_gid16_t");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_setgid16, return, "int");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_setuid16, entry, "l_uid16_t");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_setuid16, return, "int");
 LIN_SDT_PROBE_DEFINE2(uid16, linux_setregid16, entry, "l_gid16_t", "l_gid16_t");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_setregid16, return, "int");
 LIN_SDT_PROBE_DEFINE2(uid16, linux_setreuid16, entry, "l_uid16_t", "l_uid16_t");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_setreuid16, return, "int");
 LIN_SDT_PROBE_DEFINE3(uid16, linux_setresgid16, entry, "l_gid16_t", "l_gid16_t",
     "l_gid16_t");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_setresgid16, return, "int");
 LIN_SDT_PROBE_DEFINE3(uid16, linux_setresuid16, entry, "l_uid16_t", "l_uid16_t",
     "l_uid16_t");
 LIN_SDT_PROBE_DEFINE1(uid16, linux_setresuid16, return, "int");
 
 /*
  * NOTE(review): DUMMY() presumably emits placeholder implementations for
  * these syscalls, which have no real implementation in this file --
  * confirm against the macro's definition.
  */
 DUMMY(setfsuid16);
 DUMMY(setfsgid16);
 DUMMY(getresuid16);
 DUMMY(getresgid16);
 
 /*
  * Map the 16-bit "no change" sentinel 0xFFFF to -1 and pass every other
  * value through untouched.  The argument is parenthesized so that an
  * expression argument is compared and returned as a whole; note that it
  * is still evaluated twice, so it must not have side effects.
  */
 #define	CAST_NOCHG(x)	(((x) == 0xFFFF) ? -1 : (x))
 
 int
 linux_chown16(struct thread *td, struct linux_chown16_args *args)
 {
 	char *path;
 	int error;
 
 	LCONVPATHEXIST(td, args->path, &path);
 
 	/*
 	 * The DTrace probes have to be after the LCONVPATHEXIST, as
 	 * LCONVPATHEXIST may return on its own and we do not want to
 	 * have a stray entry without the corresponding return.
 	 */
 	LIN_SDT_PROBE3(uid16, linux_chown16, entry, args->path, args->uid,
 	    args->gid);
 	LIN_SDT_PROBE1(uid16, linux_chown16, conv_path, path);
 
 	error = kern_chown(td, path, UIO_SYSSPACE, CAST_NOCHG(args->uid),
 	    CAST_NOCHG(args->gid));
 	LFREEPATH(path);
 
 	LIN_SDT_PROBE1(uid16, linux_chown16, return, error);
 	return (error);
 }
 
 int
 linux_lchown16(struct thread *td, struct linux_lchown16_args *args)
 {
 	char *path;
 	int error;
 
 	LCONVPATHEXIST(td, args->path, &path);
 
 	/*
 	 * The DTrace probes have to be after the LCONVPATHEXIST, as
 	 * LCONVPATHEXIST may return on its own and we do not want to
 	 * have a stray entry without the corresponding return.
 	 */
 	LIN_SDT_PROBE3(uid16, linux_lchown16, entry, args->path, args->uid,
 	    args->gid);
 	LIN_SDT_PROBE1(uid16, linux_lchown16, conv_path, path);
 
 	error = kern_lchown(td, path, UIO_SYSSPACE, CAST_NOCHG(args->uid),
 	    CAST_NOCHG(args->gid));
 	LFREEPATH(path);
 
 	LIN_SDT_PROBE1(uid16, linux_lchown16, return, error);
 	return (error);
 }
 
 /*
  * Linux setgroups16(2): replace the supplementary group set of the
  * calling process with the 16-bit gids supplied by userland.
  *
  * Returns 0 on success, EINVAL when gidsetsize is negative or larger
  * than ngroups_max, the copyin() error when the set cannot be read, or
  * the priv_check_cred() error when the caller lacks
  * PRIV_CRED_SETGROUPS.
  */
 int
 linux_setgroups16(struct thread *td, struct linux_setgroups16_args *args)
 {
 	struct ucred *newcred, *oldcred;
 	l_gid16_t *linux_gidset;
 	gid_t *bsd_gidset;
 	int ngrp, error;
 	struct proc *p;
 
 	LIN_SDT_PROBE2(uid16, linux_setgroups16, entry, args->gidsetsize,
 	    args->gidset);
 
 	ngrp = args->gidsetsize;
 	if (ngrp < 0 || ngrp >= ngroups_max + 1) {
 		LIN_SDT_PROBE1(uid16, linux_setgroups16, return, EINVAL);
 		return (EINVAL);
 	}
 	linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_TEMP, M_WAITOK);
 	error = copyin(args->gidset, linux_gidset, ngrp * sizeof(l_gid16_t));
 	if (error) {
 		LIN_SDT_PROBE1(uid16, linux_setgroups16, copyin_error, error);
 		LIN_SDT_PROBE1(uid16, linux_setgroups16, return, error);
 		free(linux_gidset, M_TEMP);
 		return (error);
 	}
 	newcred = crget();
 	/*
 	 * Up to ngrp + 1 entries (egid plus the supplied set) are stored
 	 * in newcred->cr_groups below, so make sure the array is large
 	 * enough.  This is done before the process lock is taken.
 	 * NOTE(review): crget()/crcopysafe() are not known from this file
 	 * to size the array for ngrp + 1 entries -- confirm against
 	 * kern_prot.c.
 	 */
 	crextend(newcred, ngrp + 1);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	oldcred = crcopysafe(p, newcred);
 
 	/*
 	 * cr_groups[0] holds egid. Setting the whole set from
 	 * the supplied set will cause egid to be changed too.
 	 * Keep cr_groups[0] unchanged to prevent that.
 	 */
 
 	if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) {
 		PROC_UNLOCK(p);
 		crfree(newcred);
 
 		LIN_SDT_PROBE1(uid16, linux_setgroups16, priv_check_cred_error,
 		    error);
 		goto out;
 	}
 
 	if (ngrp > 0) {
 		newcred->cr_ngroups = ngrp + 1;
 
 		/* Copy the supplied gids into slots 1..ngrp, last first. */
 		bsd_gidset = newcred->cr_groups;
 		ngrp--;
 		while (ngrp >= 0) {
 			bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
 			ngrp--;
 		}
 	}
 	else
 		newcred->cr_ngroups = 1;
 
 	setsugid(td->td_proc);
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	error = 0;
 out:
 	/* Shared exit: the temporary copy of the user's set is always freed. */
 	free(linux_gidset, M_TEMP);
 
 	LIN_SDT_PROBE1(uid16, linux_setgroups16, return, error);
 	return (error);
 }
 
 int
 linux_getgroups16(struct thread *td, struct linux_getgroups16_args *args)
 {
 	struct ucred *cred;
 	l_gid16_t *linux_gidset;
 	gid_t *bsd_gidset;
 	int bsd_gidsetsz, ngrp, error;
 
 	LIN_SDT_PROBE2(uid16, linux_getgroups16, entry, args->gidsetsize,
 	    args->gidset);
 
 	/*
 	 * cr_groups[0] holds egid.  Reporting it as well would produce a
 	 * duplicate, so the set handed back starts at cr_groups[1] and is
 	 * one entry shorter than cr_ngroups.
 	 */
 	cred = td->td_ucred;
 	bsd_gidset = cred->cr_groups;
 	bsd_gidsetsz = cred->cr_ngroups - 1;
 
 	/* A zero-sized request only asks for the number of groups. */
 	ngrp = args->gidsetsize;
 	if (ngrp == 0) {
 		td->td_retval[0] = bsd_gidsetsz;
 
 		LIN_SDT_PROBE1(uid16, linux_getgroups16, return, 0);
 		return (0);
 	}
 
 	/* The caller's buffer must be able to take the whole set. */
 	if (ngrp < bsd_gidsetsz) {
 		LIN_SDT_PROBE1(uid16, linux_getgroups16, return, EINVAL);
 		return (EINVAL);
 	}
 
 	linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
 	    M_TEMP, M_WAITOK);
 	for (ngrp = 0; ngrp < bsd_gidsetsz; ngrp++)
 		linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
 
 	error = copyout(linux_gidset, args->gidset, ngrp * sizeof(l_gid16_t));
 	free(linux_gidset, M_TEMP);
 	if (error) {
 		LIN_SDT_PROBE1(uid16, linux_getgroups16, copyout_error, error);
 		LIN_SDT_PROBE1(uid16, linux_getgroups16, return, error);
 		return (error);
 	}
 
 	/* On success the syscall returns the number of gids copied out. */
 	td->td_retval[0] = ngrp;
 
 	LIN_SDT_PROBE1(uid16, linux_getgroups16, return, 0);
 	return (0);
 }
 
 /*
  * The FreeBSD native getgid(2) and getuid(2) also modify td->td_retval[1]
  * when COMPAT_43 is defined. This clobbers registers that are assumed to
  * be preserved. The following lightweight syscalls fix this. See also
  * linux_getpid(2), linux_getgid(2) and linux_getuid(2) in linux_misc.c
  *
  * linux_getgid16() - MP SAFE
  * linux_getuid16() - MP SAFE
  */
 
 int
 linux_getgid16(struct thread *td, struct linux_getgid16_args *args)
 {
 
 	LIN_SDT_PROBE0(uid16, linux_getgid16, entry);
 
 	td->td_retval[0] = td->td_ucred->cr_rgid;
 
 	LIN_SDT_PROBE1(uid16, linux_getgid16, return, 0);
 	return (0);
 }
 
 int
 linux_getuid16(struct thread *td, struct linux_getuid16_args *args)
 {
 
 	LIN_SDT_PROBE0(uid16, linux_getuid16, entry);
 
 	td->td_retval[0] = td->td_ucred->cr_ruid;
 
 	LIN_SDT_PROBE1(uid16, linux_getuid16, return, 0);
 	return (0);
 }
 
 int
 linux_getegid16(struct thread *td, struct linux_getegid16_args *args)
 {
 	struct getegid_args bsd;
 	int error;
 
 	LIN_SDT_PROBE0(uid16, linux_getegid16, entry);
 
 	error = sys_getegid(td, &bsd);
 
 	LIN_SDT_PROBE1(uid16, linux_getegid16, return, error);
 	return (error);
 }
 
 int
 linux_geteuid16(struct thread *td, struct linux_geteuid16_args *args)
 {
 	struct geteuid_args bsd;
 	int error;
 
 	LIN_SDT_PROBE0(uid16, linux_geteuid16, entry);
 
 	error = sys_geteuid(td, &bsd);
 
 	LIN_SDT_PROBE1(uid16, linux_geteuid16, return, error);
 	return (error);
 }
 
 int
 linux_setgid16(struct thread *td, struct linux_setgid16_args *args)
 {
 	struct setgid_args bsd;
 	int error;
 
 	LIN_SDT_PROBE1(uid16, linux_setgid16, entry, args->gid);
 
 	bsd.gid = args->gid;
 	error = sys_setgid(td, &bsd);
 
 	LIN_SDT_PROBE1(uid16, linux_setgid16, return, error);
 	return (error);
 }
 
 int
 linux_setuid16(struct thread *td, struct linux_setuid16_args *args)
 {
 	struct setuid_args bsd;
 	int error;
 
 	LIN_SDT_PROBE1(uid16, linux_setuid16, entry, args->uid);
 
 	bsd.uid = args->uid;
 	error = sys_setuid(td, &bsd);
 
 	LIN_SDT_PROBE1(uid16, linux_setuid16, return, error);
 	return (error);
 }
 
 int
 linux_setregid16(struct thread *td, struct linux_setregid16_args *args)
 {
 	struct setregid_args bsd;
 	int error;
 
 	LIN_SDT_PROBE2(uid16, linux_setregid16, entry, args->rgid, args->egid);
 
 	bsd.rgid = CAST_NOCHG(args->rgid);
 	bsd.egid = CAST_NOCHG(args->egid);
 	error = sys_setregid(td, &bsd);
 
 	LIN_SDT_PROBE1(uid16, linux_setregid16, return, error);
 	return (error);
 }
 
 int
 linux_setreuid16(struct thread *td, struct linux_setreuid16_args *args)
 {
 	struct setreuid_args bsd;
 	int error;
 
 	LIN_SDT_PROBE2(uid16, linux_setreuid16, entry, args->ruid, args->euid);
 
 	bsd.ruid = CAST_NOCHG(args->ruid);
 	bsd.euid = CAST_NOCHG(args->euid);
 	error = sys_setreuid(td, &bsd);
 
 	LIN_SDT_PROBE1(uid16, linux_setreuid16, return, error);
 	return (error);
 }
 
 int
 linux_setresgid16(struct thread *td, struct linux_setresgid16_args *args)
 {
 	struct setresgid_args bsd;
 	int error;
 
 	LIN_SDT_PROBE3(uid16, linux_setresgid16, entry, args->rgid, args->egid,
 	    args->sgid);
 
 	bsd.rgid = CAST_NOCHG(args->rgid);
 	bsd.egid = CAST_NOCHG(args->egid);
 	bsd.sgid = CAST_NOCHG(args->sgid);
 	error = sys_setresgid(td, &bsd);
 
 	LIN_SDT_PROBE1(uid16, linux_setresgid16, return, error);
 	return (error);
 }
 
 int
 linux_setresuid16(struct thread *td, struct linux_setresuid16_args *args)
 {
 	struct setresuid_args bsd;
 	int error;
 
 	LIN_SDT_PROBE3(uid16, linux_setresuid16, entry, args->ruid, args->euid,
 	    args->suid);
 
 	bsd.ruid = CAST_NOCHG(args->ruid);
 	bsd.euid = CAST_NOCHG(args->euid);
 	bsd.suid = CAST_NOCHG(args->suid);
 	error = sys_setresuid(td, &bsd);
 
 	LIN_SDT_PROBE1(uid16, linux_setresuid16, return, error);
 	return (error);
 }
Index: head/sys/compat/linux/linux_util.c
===================================================================
--- head/sys/compat/linux/linux_util.c	(revision 258540)
+++ head/sys/compat/linux/linux_util.c	(revision 258541)
@@ -1,333 +1,332 @@
 /*-
  * Copyright (c) 1994 Christos Zoulas
  * Copyright (c) 1995 Frank van der Linden
  * Copyright (c) 1995 Scott Bartram
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  *	from: svr4_util.c,v 1.5 1995/01/22 23:44:50 christos Exp
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/fcntl.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/linker_set.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/syscallsubr.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 
 #include <machine/stdarg.h>
 
 #include <compat/linux/linux_util.h>
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #else
 #include <machine/../linux/linux.h>
 #endif
 
 #include <compat/linux/linux_dtrace.h>
 
 /* Alternate root prefix that linux_emul_convpath() passes to kern_alternate_path(). */
 const char      linux_emul_path[] = "/compat/linux";
 
 /* DTrace init */
 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
 
 /**
  * DTrace probes in this module.
  *
  * Each definition names the module (util), the function the probe
  * belongs to and the probe itself; the quoted strings that follow give
  * the C type of each probe argument, and the digit in the macro name
  * matches the number of those arguments.
  */
 LIN_SDT_PROBE_DEFINE5(util, linux_emul_convpath, entry, "const char *",
     "enum uio_seg", "char **", "int", "int");
 LIN_SDT_PROBE_DEFINE1(util, linux_emul_convpath, return, "int");
 LIN_SDT_PROBE_DEFINE1(util, linux_msg, entry, "const char *");
 LIN_SDT_PROBE_DEFINE0(util, linux_msg, return);
 LIN_SDT_PROBE_DEFINE2(util, linux_driver_get_name_dev, entry, "device_t",
     "const char *");
 LIN_SDT_PROBE_DEFINE0(util, linux_driver_get_name_dev, nullcall);
 LIN_SDT_PROBE_DEFINE1(util, linux_driver_get_name_dev, return, "char *");
 LIN_SDT_PROBE_DEFINE3(util, linux_driver_get_major_minor, entry, "char *",
     "int *", "int *");
 LIN_SDT_PROBE_DEFINE0(util, linux_driver_get_major_minor, nullcall);
 LIN_SDT_PROBE_DEFINE1(util, linux_driver_get_major_minor, notfound, "char *");
 LIN_SDT_PROBE_DEFINE3(util, linux_driver_get_major_minor, return, "int",
     "int", "int");
 LIN_SDT_PROBE_DEFINE0(util, linux_get_char_devices, entry);
 LIN_SDT_PROBE_DEFINE1(util, linux_get_char_devices, return, "char *");
 LIN_SDT_PROBE_DEFINE1(util, linux_free_get_char_devices, entry, "char *");
 LIN_SDT_PROBE_DEFINE0(util, linux_free_get_char_devices, return);
 LIN_SDT_PROBE_DEFINE1(util, linux_device_register_handler, entry,
     "struct linux_device_handler *");
 LIN_SDT_PROBE_DEFINE1(util, linux_device_register_handler, return, "int");
 LIN_SDT_PROBE_DEFINE1(util, linux_device_unregister_handler, entry,
     "struct linux_device_handler *");
 LIN_SDT_PROBE_DEFINE1(util, linux_device_unregister_handler, return, "int");
 
 /*
  * Search an alternate path before passing pathname arguments on to
  * system calls. Useful for keeping a separate 'emulation tree'.
  *
  * If cflag is set, we check if an attempt can be made to create the
  * named file, i.e. we check if the directory it should be in exists.
  */
 int
 linux_emul_convpath(struct thread *td, const char *path, enum uio_seg pathseg,
     char **pbuf, int cflag, int dfd)
 {
 	int retval;
 
 	LIN_SDT_PROBE5(util, linux_emul_convpath, entry, path, pathseg, pbuf,
 	    cflag, dfd);
 
 	retval = kern_alternate_path(td, linux_emul_path, path, pathseg, pbuf,
 	    cflag, dfd);
 
 	LIN_SDT_PROBE1(util, linux_emul_convpath, return, retval);
 	return (retval);
 }
 
 void
 linux_msg(const struct thread *td, const char *fmt, ...)
 {
 	struct proc *owner;
 	va_list args;
 
 	LIN_SDT_PROBE1(util, linux_msg, entry, fmt);
 
 	/* Prefix every message with the pid and command name. */
 	owner = td->td_proc;
 	printf("linux: pid %d (%s): ", (int)owner->p_pid, owner->p_comm);
 
 	/* The caller's printf-style message, then a terminating newline. */
 	va_start(args, fmt);
 	vprintf(fmt, args);
 	va_end(args);
 	printf("\n");
 
 	LIN_SDT_PROBE0(util, linux_msg, return);
 }
 
 /*
  * One registered device handler: a list node holding a private copy of
  * the linux_device_handler passed to linux_device_register_handler().
  */
 struct device_element
 {
 	TAILQ_ENTRY(device_element) list;	/* linkage in "devices" */
 	struct linux_device_handler entry;	/* copied handler description */
 };
 
 /* All registered handlers, in registration order (inserted at the tail). */
 static TAILQ_HEAD(, device_element) devices =
 	TAILQ_HEAD_INITIALIZER(devices);
 
 /*
  * Built-in handler description.
  * NOTE(review): the initializer is positional; the meaning of each value
  * depends on the field order of struct linux_device_handler, which is
  * declared elsewhere -- confirm there.
  */
 static struct linux_device_handler null_handler =
 	{ "mem", "mem", "null", "null", 1, 3, 1};
 
 /* NOTE(review): presumably adds null_handler to a linker set -- confirm. */
 DATA_SET(linux_device_handler_set, null_handler);
 
 char *
 linux_driver_get_name_dev(device_t dev)
 {
 	struct device_element *elem;
 	const char *bsd_name = device_get_name(dev);
 
 	LIN_SDT_PROBE2(util, linux_driver_get_name_dev, entry, dev,
 	    bsd_name);
 
 	/* A device without a name cannot be matched at all. */
 	if (bsd_name == NULL) {
 		LIN_SDT_PROBE0(util, linux_driver_get_name_dev, nullcall);
 		LIN_SDT_PROBE1(util, linux_driver_get_name_dev, return, NULL);
 		return (NULL);
 	}
 
 	/* The first handler registered for this BSD driver name wins. */
 	TAILQ_FOREACH(elem, &devices, list) {
 		if (strcmp(bsd_name, elem->entry.bsd_driver_name) != 0)
 			continue;
 
 		LIN_SDT_PROBE1(util, linux_driver_get_name_dev, return,
 		    elem->entry.linux_driver_name);
 		return (elem->entry.linux_driver_name);
 	}
 
 	/* No handler knows this driver. */
 	LIN_SDT_PROBE1(util, linux_driver_get_name_dev, return, NULL);
 	return (NULL);
 }
 
 int
 linux_driver_get_major_minor(const char *node, int *major, int *minor)
 {
 	struct device_element *elem;
 	unsigned long devno;
 	size_t preflen;
 
 	LIN_SDT_PROBE3(util, linux_driver_get_major_minor, entry, node, major,
 	    minor);
 
 	/* All three pointers are required; 1 is the failure return. */
 	if (node == NULL || major == NULL || minor == NULL) {
 		LIN_SDT_PROBE0(util, linux_driver_get_major_minor, nullcall);
 		LIN_SDT_PROBE3(util, linux_driver_get_major_minor, return, 1,
 		   0, 0);
 		return (1);
 	}
 
 	preflen = strlen("pts/");
 	if (strlen(node) > preflen && strncmp(node, "pts/", preflen) == 0) {
 		/*
 		 * Linux checks major and minors of the slave device
 		 * to make sure it's a pty device, so let's make him
 		 * believe it is: the number after the prefix is split
 		 * into a major starting at 136 and a minor below 256.
 		 */
 		devno = strtoul(node + preflen, NULL, 10);
 		*major = 136 + (devno / 256);
 		*minor = devno % 256;
 
 		LIN_SDT_PROBE3(util, linux_driver_get_major_minor, return, 0,
 		    *major, *minor);
 		return (0);
 	}
 
 	/* Otherwise look the node up among the registered handlers. */
 	TAILQ_FOREACH(elem, &devices, list) {
 		if (strcmp(node, elem->entry.bsd_device_name) != 0)
 			continue;
 
 		*major = elem->entry.linux_major;
 		*minor = elem->entry.linux_minor;
 
 		LIN_SDT_PROBE3(util, linux_driver_get_major_minor,
 		    return, 0, *major, *minor);
 		return (0);
 	}
 
 	LIN_SDT_PROBE1(util, linux_driver_get_major_minor, notfound, node);
 	LIN_SDT_PROBE3(util, linux_driver_get_major_minor, return, 1, 0, 0);
 	return (1);
 }
 
 /*
  * Build a newline-separated "<major> <linux device name>" listing of the
  * registered handlers that are flagged as character devices.  A handler
  * is skipped when its BSD driver name equals that of the previously
  * listed one.
  *
  * Returns a NUL-terminated string allocated from M_LINUX; the caller
  * releases it with linux_free_get_char_devices().
  */
 char *
 linux_get_char_devices(void)
 {
 	struct device_element *de;
 	char *temp, *string, *last;
 	char formated[256];
 	int current_size = 0, string_size = 1024;
 
 	LIN_SDT_PROBE0(util, linux_get_char_devices, entry);
 
 	string = malloc(string_size, M_LINUX, M_WAITOK);
 	string[0] = '\000';
 	last = "";
 	TAILQ_FOREACH(de, &devices, list) {
 		if (!de->entry.linux_char_device)
 			continue;
 		temp = string;
 		if (strcmp(last, de->entry.bsd_driver_name) != 0) {
 			last = de->entry.bsd_driver_name;
 
 			snprintf(formated, sizeof(formated), "%3d %s\n",
 				 de->entry.linux_major,
 				 de->entry.linux_device_name);
 			/*
 			 * Grow the buffer when the new line (plus the NUL)
 			 * does not fit.  One doubling is enough: a line is
 			 * shorter than 256 bytes and the buffer is never
 			 * smaller than 1024.
 			 */
 			if (strlen(formated) + current_size
 			    >= string_size) {
 				string_size *= 2;
 				string = malloc(string_size,
 				    M_LINUX, M_WAITOK);
 				/*
 				 * Copy the terminating NUL as well: the new
 				 * buffer is not zeroed and strcat() below
 				 * needs a terminated destination.
 				 */
 				bcopy(temp, string, current_size + 1);
 				free(temp, M_LINUX);
 			}
 			strcat(string, formated);
 			current_size = strlen(string);
 		}
 	}
 
 	LIN_SDT_PROBE1(util, linux_get_char_devices, return, string);
 	return string;
 }
 
 /*
  * Release a string returned by linux_get_char_devices().
  *
  * The probes fired here are this function's own
  * (linux_free_get_char_devices): an entry probe taking the string and a
  * return probe without arguments, matching their definitions.
  */
 void
 linux_free_get_char_devices(char *string)
 {
 
 	LIN_SDT_PROBE1(util, linux_free_get_char_devices, entry, string);
 
 	free(string, M_LINUX);
 
 	LIN_SDT_PROBE0(util, linux_free_get_char_devices, return);
 }
 
 /* Next major number given to a handler registered with a negative one. */
 static int linux_major_starting = 200;
 
 int
 linux_device_register_handler(struct linux_device_handler *d)
 {
 	struct device_element *elem;
 
 	LIN_SDT_PROBE1(util, linux_device_register_handler, entry, d);
 
 	if (d == NULL) {
 		LIN_SDT_PROBE1(util, linux_device_register_handler, return,
 		    EINVAL);
 		return (EINVAL);
 	}
 
 	elem = malloc(sizeof(*elem), M_LINUX, M_WAITOK);
 
 	/*
 	 * A negative major asks for an automatically assigned one.  It is
 	 * written back into the caller's structure before the copy, so
 	 * both the caller and the list entry see the same value.
 	 */
 	if (d->linux_major < 0)
 		d->linux_major = linux_major_starting++;
 	bcopy(d, &elem->entry, sizeof(*d));
 
 	/* New handlers are appended at the end of the list. */
 	TAILQ_INSERT_TAIL(&devices, elem, list);
 
 	LIN_SDT_PROBE1(util, linux_device_register_handler, return, 0);
 	return (0);
 }
 
 int
 linux_device_unregister_handler(struct linux_device_handler *d)
 {
 	struct device_element *elem;
 
 	LIN_SDT_PROBE1(util, linux_device_unregister_handler, entry, d);
 
 	if (d == NULL) {
 		LIN_SDT_PROBE1(util, linux_device_unregister_handler, return,
 		    EINVAL);
 		return (EINVAL);
 	}
 
 	/*
 	 * A handler matches only when it is byte-for-byte equal to the
 	 * stored copy.  The first match is unlinked and freed, and the
 	 * function returns right away, so the walk never continues past a
 	 * removed element.
 	 */
 	TAILQ_FOREACH(elem, &devices, list) {
 		if (bcmp(d, &elem->entry, sizeof(*d)) != 0)
 			continue;
 
 		TAILQ_REMOVE(&devices, elem, list);
 		free(elem, M_LINUX);
 
 		LIN_SDT_PROBE1(util, linux_device_unregister_handler,
 		    return, 0);
 		return (0);
 	}
 
 	/* Nothing matched. */
 	LIN_SDT_PROBE1(util, linux_device_unregister_handler, return, EINVAL);
 	return (EINVAL);
 }
Index: head/sys/conf/options
===================================================================
--- head/sys/conf/options	(revision 258540)
+++ head/sys/conf/options	(revision 258541)
@@ -1,915 +1,915 @@
 # $FreeBSD$
 #
 #        On the handling of kernel options
 #
 # All kernel options should be listed in NOTES, with suitable
 # descriptions.  Negative options (options that make some code not
 # compile) should be commented out; LINT (generated from NOTES) should
 # compile as much code as possible.  Try to structure option-using
 # code so that a single option only switch code on, or only switch
 # code off, to make it possible to have a full compile-test.  If
 # necessary, you can check for COMPILING_LINT to get maximum code
 # coverage.
 #
 # All new options shall also be listed in either "conf/options" or
 # "conf/options.<machine>".  Options that affect a single source-file
 # <xxx>.[c|s] should be directed into "opt_<xxx>.h", while options
 # that affect multiple files should either go in "opt_global.h" if
 # this is a kernel-wide option (used just about everywhere), or in
 # "opt_<option-name-in-lower-case>.h" if it affects only some files.
 # Note that the effect of listing only an option without a
 # header-file-name in conf/options (and cousins) is that the last
 # convention is followed.
 #
 # This handling scheme is not yet fully implemented.
 #
 #
 # Format of this file:
 # Option name	filename
 #
 # If filename is missing, the default is
 # opt_<name-of-option-in-lower-case>.h
 
 AAC_DEBUG		opt_aac.h
 AACRAID_DEBUG		opt_aacraid.h
 AHC_ALLOW_MEMIO		opt_aic7xxx.h
 AHC_TMODE_ENABLE	opt_aic7xxx.h
 AHC_DUMP_EEPROM		opt_aic7xxx.h
 AHC_DEBUG		opt_aic7xxx.h
 AHC_DEBUG_OPTS		opt_aic7xxx.h
 AHC_REG_PRETTY_PRINT	opt_aic7xxx.h
 AHD_DEBUG		opt_aic79xx.h
 AHD_DEBUG_OPTS		opt_aic79xx.h
 AHD_TMODE_ENABLE	opt_aic79xx.h	
 AHD_REG_PRETTY_PRINT	opt_aic79xx.h
 ADW_ALLOW_MEMIO		opt_adw.h
 
 TWA_DEBUG		opt_twa.h
 TWA_FLASH_FIRMWARE	opt_twa.h
 
 # Debugging options.
 ALT_BREAK_TO_DEBUGGER	opt_kdb.h
 BREAK_TO_DEBUGGER	opt_kdb.h
 DDB
 DDB_BUFR_SIZE	opt_ddb.h
 DDB_CAPTURE_DEFAULTBUFSIZE	opt_ddb.h
 DDB_CAPTURE_MAXBUFSIZE	opt_ddb.h
 DDB_CTF		opt_ddb.h
 DDB_NUMSYM	opt_ddb.h
 GDB
 KDB		opt_global.h
 KDB_TRACE	opt_kdb.h
 KDB_UNATTENDED	opt_kdb.h
 KLD_DEBUG	opt_kld.h
 SYSCTL_DEBUG	opt_sysctl.h
 TEXTDUMP_PREFERRED	opt_ddb.h
 TEXTDUMP_VERBOSE	opt_ddb.h
 
 # Miscellaneous options.
 ADAPTIVE_LOCKMGRS
 ALQ
 ALTERA_SDCARD_FAST_SIM	opt_altera_sdcard.h
 ATSE_CFI_HACK	opt_cfi.h
 AUDIT		opt_global.h
 BOOTHOWTO	opt_global.h
 BOOTVERBOSE	opt_global.h
 CALLOUT_PROFILING
 CAPABILITIES	opt_capsicum.h
 CAPABILITY_MODE	opt_capsicum.h
 COMPAT_43	opt_compat.h
 COMPAT_43TTY	opt_compat.h
 COMPAT_FREEBSD4	opt_compat.h
 COMPAT_FREEBSD5	opt_compat.h
 COMPAT_FREEBSD6	opt_compat.h
 COMPAT_FREEBSD7	opt_compat.h
 COMPILING_LINT	opt_global.h
 COMPRESS_USER_CORES opt_core.h
 CY_PCI_FASTINTR
 DEADLKRES	opt_watchdog.h
 DIRECTIO
 FILEMON		opt_dontuse.h
 FFCLOCK
 FULL_PREEMPTION	opt_sched.h
 IPI_PREEMPTION	opt_sched.h
 GEOM_AES	opt_geom.h
 GEOM_BDE	opt_geom.h
 GEOM_BSD	opt_geom.h
 GEOM_CACHE	opt_geom.h
 GEOM_CONCAT	opt_geom.h
 GEOM_ELI	opt_geom.h
 GEOM_FOX	opt_geom.h
 GEOM_GATE	opt_geom.h
 GEOM_JOURNAL	opt_geom.h
 GEOM_LABEL	opt_geom.h
 GEOM_LINUX_LVM	opt_geom.h
 GEOM_MBR	opt_geom.h
 GEOM_MIRROR	opt_geom.h
 GEOM_MULTIPATH	opt_geom.h
 GEOM_NOP	opt_geom.h
 GEOM_PART_APM	opt_geom.h
 GEOM_PART_BSD	opt_geom.h
 GEOM_PART_EBR	opt_geom.h
 GEOM_PART_EBR_COMPAT	opt_geom.h
 GEOM_PART_GPT	opt_geom.h
 GEOM_PART_LDM	opt_geom.h
 GEOM_PART_MBR	opt_geom.h
 GEOM_PART_PC98	opt_geom.h
 GEOM_PART_VTOC8	opt_geom.h
 GEOM_PC98	opt_geom.h
 GEOM_RAID	opt_geom.h
 GEOM_RAID3	opt_geom.h
 GEOM_SHSEC	opt_geom.h
 GEOM_STRIPE	opt_geom.h
 GEOM_SUNLABEL	opt_geom.h
 GEOM_UNCOMPRESS	opt_geom.h
 GEOM_UZIP	opt_geom.h
 GEOM_VIRSTOR	opt_geom.h
 GEOM_VOL	opt_geom.h
 GEOM_ZERO	opt_geom.h
-KDTRACE_HOOKS	opt_kdtrace.h
+KDTRACE_HOOKS	opt_global.h
 KSTACK_MAX_PAGES
 KSTACK_PAGES
 KTRACE
 KTRACE_REQUEST_POOL	opt_ktrace.h
 LIBICONV
 MAC		opt_global.h
 MAC_BIBA	opt_dontuse.h
 MAC_BSDEXTENDED	opt_dontuse.h
 MAC_IFOFF	opt_dontuse.h
 MAC_LOMAC	opt_dontuse.h
 MAC_MLS		opt_dontuse.h
 MAC_NONE	opt_dontuse.h
 MAC_PARTITION	opt_dontuse.h
 MAC_PORTACL	opt_dontuse.h
 MAC_SEEOTHERUIDS	opt_dontuse.h
 MAC_STATIC	opt_mac.h
 MAC_STUB	opt_dontuse.h
 MAC_TEST	opt_dontuse.h
 MD_ROOT		opt_md.h
 MD_ROOT_FSTYPE	opt_md.h
 MD_ROOT_SIZE	opt_md.h
 MFI_DEBUG	opt_mfi.h
 MFI_DECODE_LOG	opt_mfi.h
 MPROF_BUFFERS	opt_mprof.h
 MPROF_HASH_SIZE	opt_mprof.h
 NEW_PCIB	opt_global.h
 NO_ADAPTIVE_MUTEXES	opt_adaptive_mutexes.h
 NO_ADAPTIVE_RWLOCKS
 NO_ADAPTIVE_SX
 NO_SYSCTL_DESCR	opt_global.h
 NSWBUF_MIN	opt_swap.h
 MBUF_PACKET_ZONE_DISABLE	opt_global.h
 PANIC_REBOOT_WAIT_TIME	opt_panic.h
 PPC_DEBUG	opt_ppc.h
 PPC_PROBE_CHIPSET	opt_ppc.h
 PPS_SYNC	opt_ntp.h
 PREEMPTION	opt_sched.h
 PROCDESC	opt_procdesc.h
 QUOTA
 SCHED_4BSD	opt_sched.h
 SCHED_STATS	opt_sched.h
 SCHED_ULE	opt_sched.h
 SLEEPQUEUE_PROFILING
 SLHCI_DEBUG	opt_slhci.h
 SPX_HACK
 STACK		opt_stack.h
 SUIDDIR
 MSGMNB		opt_sysvipc.h
 MSGMNI		opt_sysvipc.h
 MSGSEG		opt_sysvipc.h
 MSGSSZ		opt_sysvipc.h
 MSGTQL		opt_sysvipc.h
 SEMMNI		opt_sysvipc.h
 SEMMNS		opt_sysvipc.h
 SEMMNU		opt_sysvipc.h
 SEMMSL		opt_sysvipc.h
 SEMOPM		opt_sysvipc.h
 SEMUME		opt_sysvipc.h
 SHMALL		opt_sysvipc.h
 SHMMAX		opt_sysvipc.h
 SHMMAXPGS	opt_sysvipc.h
 SHMMIN		opt_sysvipc.h
 SHMMNI		opt_sysvipc.h
 SHMSEG		opt_sysvipc.h
 SYSVMSG		opt_sysvipc.h
 SYSVSEM		opt_sysvipc.h
 SYSVSHM		opt_sysvipc.h
 SW_WATCHDOG	opt_watchdog.h
 TURNSTILE_PROFILING
 UMTX_PROFILING
 VFS_AIO
 VERBOSE_SYSINIT	opt_global.h
 WLCACHE		opt_wavelan.h
 WLDEBUG		opt_wavelan.h
 
 # POSIX kernel options
 P1003_1B_MQUEUE			opt_posix.h
 P1003_1B_SEMAPHORES		opt_posix.h
 _KPOSIX_PRIORITY_SCHEDULING	opt_posix.h
 
 # Do we want the config file compiled into the kernel?
 INCLUDE_CONFIG_FILE	opt_config.h
 
 # Options for static filesystems.  These should only be used at config
 # time, since the corresponding lkms cannot work if there are any static
 # dependencies.  Unusability is enforced by hiding the defines for the
 # options in a never-included header.
 CD9660		opt_dontuse.h
 EXT2FS		opt_dontuse.h
 FDESCFS		opt_dontuse.h
 FFS		opt_dontuse.h
 FUSE		opt_dontuse.h
 MSDOSFS		opt_dontuse.h
 NANDFS		opt_dontuse.h
 NULLFS		opt_dontuse.h
 PROCFS		opt_dontuse.h
 PSEUDOFS	opt_dontuse.h
 REISERFS	opt_dontuse.h
 SMBFS		opt_dontuse.h
 TMPFS		opt_dontuse.h
 UDF		opt_dontuse.h
 UNIONFS		opt_dontuse.h
 ZFS		opt_dontuse.h
 
 # Pseudofs debugging
 PSEUDOFS_TRACE	opt_pseudofs.h
 
 # In-kernel GSS-API
 KGSSAPI		opt_kgssapi.h
 KGSSAPI_DEBUG	opt_kgssapi.h
 
 # These static filesystems have one slightly bogus static dependency in
 # sys/i386/i386/autoconf.c.  If any of these filesystems are
 # statically compiled into the kernel, code for mounting them as root
 # filesystems will be enabled - but look below.
 NFSCLIENT	opt_nfs.h
 NFSSERVER	opt_nfs.h
 
 # Use these options to compile the experimental nfs client and/or
 # server that supports NFSv4 into a kernel.
 # NFSCL - client
 # NFSD - server
 NFSCL		opt_nfs.h
 NFSD		opt_nfs.h
 
 # filesystems and libiconv bridge
 CD9660_ICONV	opt_dontuse.h
 MSDOSFS_ICONV	opt_dontuse.h
 UDF_ICONV	opt_dontuse.h
 
 # If you are following the conditions in the copyright,
 # you can enable soft-updates which will speed up a lot of things
 # and make the system safer from crashes at the same time.
 # otherwise a STUB module will be compiled in.
 SOFTUPDATES	opt_ffs.h
 
 # On small, embedded systems, it can be useful to turn off support for
 # snapshots.  It saves about 30-40k for a feature that would be lightly
 # used, if it is used at all.
 NO_FFS_SNAPSHOT	opt_ffs.h
 
 # Enabling this option turns on support for Access Control Lists in UFS,
 # which can be used to support high security configurations.  Depends on
 # UFS_EXTATTR.
 UFS_ACL		opt_ufs.h
 
 # Enabling this option turns on support for extended attributes in UFS-based
 # filesystems, which can be used to support high security configurations
 # as well as new filesystem features.
 UFS_EXTATTR	opt_ufs.h
 UFS_EXTATTR_AUTOSTART	opt_ufs.h
 
 # Enable fast hash lookups for large directories on UFS-based filesystems.
 UFS_DIRHASH	opt_ufs.h
 
 # Enable gjournal-based UFS journal.
 UFS_GJOURNAL	opt_ufs.h
 
 # The below sentence is not in English, and neither is this one.
 # We plan to remove the static dependences above, with a
 # <filesystem>_ROOT option to control if it usable as root.  This list
 # allows these options to be present in config files already (though
 # they won't make any difference yet).
 NFS_ROOT	opt_nfsroot.h
 
 # SMB/CIFS requester
 NETSMB		opt_netsmb.h
 
 # Options used only in subr_param.c.
 HZ		opt_param.h
 MAXFILES	opt_param.h
 NBUF		opt_param.h
 NSFBUFS		opt_param.h
 VM_BCACHE_SIZE_MAX	opt_param.h
 VM_SWZONE_SIZE_MAX	opt_param.h
 MAXUSERS
 DFLDSIZ		opt_param.h
 MAXDSIZ		opt_param.h
 MAXSSIZ		opt_param.h
 
 # Generic SCSI options.
 CAM_MAX_HIGHPOWER	opt_cam.h
 CAMDEBUG		opt_cam.h
 CAM_DEBUG_COMPILE	opt_cam.h
 CAM_DEBUG_DELAY		opt_cam.h
 CAM_DEBUG_BUS		opt_cam.h
 CAM_DEBUG_TARGET	opt_cam.h
 CAM_DEBUG_LUN		opt_cam.h
 CAM_DEBUG_FLAGS		opt_cam.h
 CAM_BOOT_DELAY		opt_cam.h
 SCSI_DELAY		opt_scsi.h
 SCSI_NO_SENSE_STRINGS	opt_scsi.h
 SCSI_NO_OP_STRINGS	opt_scsi.h
 
 # Options used only in cam/ata/ata_da.c
 ADA_TEST_FAILURE	opt_ada.h
 ATA_STATIC_ID		opt_ada.h
 
 # Options used only in cam/scsi/scsi_cd.c
 CHANGER_MIN_BUSY_SECONDS	opt_cd.h
 CHANGER_MAX_BUSY_SECONDS	opt_cd.h
 
 # Options used only in cam/scsi/scsi_sa.c.
 SA_IO_TIMEOUT		opt_sa.h
 SA_SPACE_TIMEOUT	opt_sa.h
 SA_REWIND_TIMEOUT	opt_sa.h
 SA_ERASE_TIMEOUT	opt_sa.h
 SA_1FM_AT_EOD		opt_sa.h
 
 # Options used only in cam/scsi/scsi_pt.c
 SCSI_PT_DEFAULT_TIMEOUT	opt_pt.h
 
 # Options used only in cam/scsi/scsi_ses.c
 SES_ENABLE_PASSTHROUGH	opt_ses.h
 
 # Options used in dev/sym/ (Symbios SCSI driver).
 SYM_SETUP_LP_PROBE_MAP	opt_sym.h	#-Low Priority Probe Map (bits)
 					# Allows the ncr to take precedence
 					# 1 (1<<0) -> 810a, 860
 					# 2 (1<<1) -> 825a, 875, 885, 895
 					# 4 (1<<2) -> 895a, 896, 1510d 
 SYM_SETUP_SCSI_DIFF	opt_sym.h	#-HVD support for 825a, 875, 885
 					# disabled:0 (default), enabled:1
 SYM_SETUP_PCI_PARITY	opt_sym.h	#-PCI parity checking
 					# disabled:0, enabled:1 (default)
 SYM_SETUP_MAX_LUN	opt_sym.h	#-Number of LUNs supported
 					# default:8, range:[1..64]
 
 # Options used only in pci/ncr.c
 SCSI_NCR_DEBUG		opt_ncr.h
 SCSI_NCR_MAX_SYNC	opt_ncr.h
 SCSI_NCR_MAX_WIDE	opt_ncr.h
 SCSI_NCR_MYADDR		opt_ncr.h
 
 # Options used only in dev/isp/*
 ISP_TARGET_MODE		opt_isp.h
 ISP_FW_CRASH_DUMP	opt_isp.h
 ISP_DEFAULT_ROLES	opt_isp.h
 ISP_INTERNAL_TARGET	opt_isp.h
 
 # Options used only in dev/iscsi
 ISCSI_INITIATOR_DEBUG	opt_iscsi_initiator.h
 
 # Net stuff.
 ACCEPT_FILTER_DATA
 ACCEPT_FILTER_DNS
 ACCEPT_FILTER_HTTP
 ALTQ			opt_global.h
 ALTQ_CBQ		opt_altq.h
 ALTQ_CDNR		opt_altq.h
 ALTQ_DEBUG		opt_altq.h
 ALTQ_HFSC		opt_altq.h
 ALTQ_NOPCC		opt_altq.h
 ALTQ_PRIQ		opt_altq.h
 ALTQ_RED		opt_altq.h
 ALTQ_RIO		opt_altq.h
 BOOTP			opt_bootp.h
 BOOTP_BLOCKSIZE		opt_bootp.h
 BOOTP_COMPAT		opt_bootp.h
 BOOTP_NFSROOT		opt_bootp.h
 BOOTP_NFSV3		opt_bootp.h
 BOOTP_WIRED_TO		opt_bootp.h
 DEVICE_POLLING
 DEV_ENC			opt_enc.h
 DEV_PF			opt_pf.h
 DEV_PFLOG		opt_pf.h
 DEV_PFSYNC		opt_pf.h
 DEV_VLAN		opt_vlan.h
 DUMMYNET		opt_ipdn.h
 ETHER_8022		opt_ef.h
 ETHER_8023		opt_ef.h
 ETHER_II		opt_ef.h
 ETHER_SNAP		opt_ef.h
 INET			opt_inet.h
 INET6			opt_inet6.h
 IPDIVERT
 IPFILTER		opt_ipfilter.h
 IPFILTER_DEFAULT_BLOCK	opt_ipfilter.h
 IPFILTER_LOG		opt_ipfilter.h
 IPFILTER_LOOKUP		opt_ipfilter.h
 IPFIREWALL		opt_ipfw.h
 IPFIREWALL_DEFAULT_TO_ACCEPT	opt_ipfw.h
 IPFIREWALL_NAT		opt_ipfw.h
 IPFIREWALL_VERBOSE	opt_ipfw.h
 IPFIREWALL_VERBOSE_LIMIT	opt_ipfw.h
 IPSEC			opt_ipsec.h
 IPSEC_DEBUG		opt_ipsec.h
 IPSEC_FILTERTUNNEL	opt_ipsec.h
 IPSEC_NAT_T		opt_ipsec.h
 IPSTEALTH
 IPX
 KRPC
 LIBALIAS
 LIBMBPOOL
 LIBMCHAIN
 MBUF_PROFILING
 MBUF_STRESS_TEST
 MROUTING		opt_mrouting.h
 NETATALK		opt_atalk.h
 NFSLOCKD
 PCBGROUP		opt_pcbgroup.h
 PF_DEFAULT_TO_DROP	opt_pf.h
 RADIX_MPATH		opt_mpath.h
 ROUTETABLES		opt_route.h
 SLIP_IFF_OPTS		opt_slip.h
 TCPDEBUG
 TCP_OFFLOAD		opt_inet.h # Enable code to dispatch TCP offloading
 TCP_SIGNATURE		opt_inet.h
 VLAN_ARRAY		opt_vlan.h
 XBONEHACK
 FLOWTABLE		opt_route.h
 
 #
 # SCTP
 #
 SCTP			opt_sctp.h
 SCTP_DEBUG		opt_sctp.h # Enable debug printfs
 SCTP_WITH_NO_CSUM	opt_sctp.h # Use this at your peril
 SCTP_LOCK_LOGGING	opt_sctp.h # Log to KTR lock activity
 SCTP_MBUF_LOGGING	opt_sctp.h # Log to KTR general mbuf alloc/free
 SCTP_MBCNT_LOGGING	opt_sctp.h # Log to KTR mbcnt activity
 SCTP_PACKET_LOGGING	opt_sctp.h # Log to a packet buffer last N packets
 SCTP_LTRACE_CHUNKS	opt_sctp.h # Log to KTR chunks processed
 SCTP_LTRACE_ERRORS	opt_sctp.h # Log to KTR error returns.
 SCTP_USE_PERCPU_STAT    opt_sctp.h # Use per cpu stats.
 SCTP_MCORE_INPUT        opt_sctp.h # Have multiple input threads for input mbufs
 #
 #
 #
 
 # Netgraph(4). Use option NETGRAPH to enable the base netgraph code.
 # Each netgraph node type can either be compiled into the kernel
 # or loaded dynamically. To get the former, include the corresponding
 # option below. Each type has its own man page, e.g. ng_async(4).
 NETGRAPH
 NETGRAPH_DEBUG		opt_netgraph.h
 NETGRAPH_ASYNC		opt_netgraph.h
 NETGRAPH_ATMLLC		opt_netgraph.h
 NETGRAPH_ATM_ATMPIF	opt_netgraph.h
 NETGRAPH_BLUETOOTH	opt_netgraph.h
 NETGRAPH_BLUETOOTH_BT3C	opt_netgraph.h
 NETGRAPH_BLUETOOTH_H4	opt_netgraph.h
 NETGRAPH_BLUETOOTH_HCI	opt_netgraph.h
 NETGRAPH_BLUETOOTH_L2CAP	opt_netgraph.h
 NETGRAPH_BLUETOOTH_SOCKET	opt_netgraph.h
 NETGRAPH_BLUETOOTH_UBT	opt_netgraph.h
 NETGRAPH_BLUETOOTH_UBTBCMFW	opt_netgraph.h
 NETGRAPH_BPF		opt_netgraph.h
 NETGRAPH_BRIDGE		opt_netgraph.h
 NETGRAPH_CAR		opt_netgraph.h
 NETGRAPH_CISCO		opt_netgraph.h
 NETGRAPH_DEFLATE	opt_netgraph.h
 NETGRAPH_DEVICE		opt_netgraph.h
 NETGRAPH_ECHO		opt_netgraph.h
 NETGRAPH_EIFACE		opt_netgraph.h
 NETGRAPH_ETHER		opt_netgraph.h
 NETGRAPH_ETHER_ECHO	opt_netgraph.h
 NETGRAPH_FEC		opt_netgraph.h
 NETGRAPH_FRAME_RELAY	opt_netgraph.h
 NETGRAPH_GIF		opt_netgraph.h
 NETGRAPH_GIF_DEMUX	opt_netgraph.h
 NETGRAPH_HOLE		opt_netgraph.h
 NETGRAPH_IFACE		opt_netgraph.h
 NETGRAPH_IP_INPUT	opt_netgraph.h
 NETGRAPH_IPFW		opt_netgraph.h
 NETGRAPH_KSOCKET	opt_netgraph.h
 NETGRAPH_L2TP		opt_netgraph.h
 NETGRAPH_LMI		opt_netgraph.h
 # MPPC compression requires proprietary files (not included)
 NETGRAPH_MPPC_COMPRESSION	opt_netgraph.h
 NETGRAPH_MPPC_ENCRYPTION	opt_netgraph.h
 NETGRAPH_NAT		opt_netgraph.h
 NETGRAPH_NETFLOW	opt_netgraph.h
 NETGRAPH_ONE2MANY	opt_netgraph.h
 NETGRAPH_PATCH		opt_netgraph.h
 NETGRAPH_PIPE		opt_netgraph.h
 NETGRAPH_PPP		opt_netgraph.h
 NETGRAPH_PPPOE		opt_netgraph.h
 NETGRAPH_PPTPGRE	opt_netgraph.h
 NETGRAPH_PRED1		opt_netgraph.h
 NETGRAPH_RFC1490	opt_netgraph.h
 NETGRAPH_SOCKET		opt_netgraph.h
 NETGRAPH_SPLIT		opt_netgraph.h
 NETGRAPH_SPPP		opt_netgraph.h
 NETGRAPH_TAG		opt_netgraph.h
 NETGRAPH_TCPMSS		opt_netgraph.h
 NETGRAPH_TEE		opt_netgraph.h
 NETGRAPH_TTY		opt_netgraph.h
 NETGRAPH_UI		opt_netgraph.h
 NETGRAPH_VJC		opt_netgraph.h
 NETGRAPH_VLAN		opt_netgraph.h
 
 # NgATM options
 NGATM_ATM		opt_netgraph.h
 NGATM_ATMBASE		opt_netgraph.h
 NGATM_SSCOP		opt_netgraph.h
 NGATM_SSCFU		opt_netgraph.h
 NGATM_UNI		opt_netgraph.h
 NGATM_CCATM		opt_netgraph.h
 
 # DRM options
 DRM_DEBUG		opt_drm.h
 
 TI_SF_BUF_JUMBO		opt_ti.h
 TI_JUMBO_HDRSPLIT	opt_ti.h
 
 # XXX Conflict: # of devices vs network protocol (Native ATM).
 # This makes "atm.h" unusable.
 NATM
 
 # DPT driver debug flags
 DPT_MEASURE_PERFORMANCE	opt_dpt.h
 DPT_RESET_HBA		opt_dpt.h
 
 # Misc debug flags.  Most of these should probably be replaced with
 # 'DEBUG', and then let people recompile just the interesting modules
 # with 'make CC="cc -DDEBUG"'.
 CLUSTERDEBUG		opt_debug_cluster.h
 DEBUG_1284		opt_ppb_1284.h
 VP0_DEBUG		opt_vpo.h
 LPT_DEBUG		opt_lpt.h
 PLIP_DEBUG		opt_plip.h
 LOCKF_DEBUG		opt_debug_lockf.h
 NETATALKDEBUG		opt_atalk.h
 SI_DEBUG		opt_debug_si.h
 
 # Fb options
 FB_DEBUG		opt_fb.h
 FB_INSTALL_CDEV		opt_fb.h
 
 # ppbus related options
 PERIPH_1284		opt_ppb_1284.h
 DONTPROBE_1284		opt_ppb_1284.h
 
 # smbus related options
 ENABLE_ALART		opt_intpm.h
 
 # These cause changes all over the kernel
 BLKDEV_IOSIZE		opt_global.h
 BURN_BRIDGES		opt_global.h
 DEBUG			opt_global.h
 DEBUG_LOCKS		opt_global.h
 DEBUG_VFS_LOCKS		opt_global.h
 DFLTPHYS		opt_global.h
 DIAGNOSTIC		opt_global.h
 INVARIANT_SUPPORT	opt_global.h
 INVARIANTS		opt_global.h
 MAXCPU			opt_global.h
 MAXMEMDOM		opt_global.h
 MAXPHYS			opt_global.h
 MCLSHIFT		opt_global.h
 MUTEX_DEBUG		opt_global.h
 MUTEX_NOINLINE		opt_global.h
 LOCK_PROFILING		opt_global.h
 LOCK_PROFILING_FAST	opt_global.h
 MSIZE			opt_global.h
 REGRESSION		opt_global.h
 RWLOCK_NOINLINE		opt_global.h
 SX_NOINLINE		opt_global.h
 VFS_BIO_DEBUG		opt_global.h
 
 # These are VM related options
 VM_KMEM_SIZE		opt_vm.h
 VM_KMEM_SIZE_SCALE	opt_vm.h
 VM_KMEM_SIZE_MAX	opt_vm.h
 VM_NRESERVLEVEL		opt_vm.h
 VM_LEVEL_0_ORDER	opt_vm.h
 NO_SWAPPING		opt_vm.h
 MALLOC_MAKE_FAILURES	opt_vm.h
 MALLOC_PROFILE		opt_vm.h
 MALLOC_DEBUG_MAXZONES	opt_vm.h
 
 # The MemGuard replacement allocator used for tamper-after-free detection
 DEBUG_MEMGUARD		opt_vm.h
 
 # The RedZone malloc(9) protection
 DEBUG_REDZONE		opt_vm.h
 
 # Standard SMP options
 SMP			opt_global.h
 
 # Size of the kernel message buffer
 MSGBUF_SIZE		opt_msgbuf.h
 
 # NFS options
 NFS_MINATTRTIMO		opt_nfs.h
 NFS_MAXATTRTIMO		opt_nfs.h
 NFS_MINDIRATTRTIMO	opt_nfs.h
 NFS_MAXDIRATTRTIMO	opt_nfs.h
 NFS_GATHERDELAY		opt_nfs.h
 NFS_WDELAYHASHSIZ	opt_nfs.h
 NFS_DEBUG		opt_nfs.h
 
 # For the Bt848/Bt848A/Bt849/Bt878/Bt879 driver
 OVERRIDE_CARD			opt_bktr.h
 OVERRIDE_TUNER			opt_bktr.h
 OVERRIDE_DBX			opt_bktr.h
 OVERRIDE_MSP			opt_bktr.h
 BROOKTREE_SYSTEM_DEFAULT	opt_bktr.h
 BROOKTREE_ALLOC_PAGES		opt_bktr.h
 BKTR_OVERRIDE_CARD		opt_bktr.h
 BKTR_OVERRIDE_TUNER		opt_bktr.h
 BKTR_OVERRIDE_DBX		opt_bktr.h
 BKTR_OVERRIDE_MSP		opt_bktr.h
 BKTR_SYSTEM_DEFAULT		opt_bktr.h
 BKTR_ALLOC_PAGES		opt_bktr.h
 BKTR_USE_PLL			opt_bktr.h	
 BKTR_GPIO_ACCESS		opt_bktr.h
 BKTR_NO_MSP_RESET		opt_bktr.h
 BKTR_430_FX_MODE		opt_bktr.h
 BKTR_SIS_VIA_MODE		opt_bktr.h
 BKTR_USE_FREEBSD_SMBUS		opt_bktr.h
 BKTR_NEW_MSP34XX_DRIVER		opt_bktr.h
 
 # Options for uart(4)
 UART_PPS_ON_CTS		opt_uart.h
 UART_POLL_FREQ		opt_uart.h
 
 # options for bus/device framework
 BUS_DEBUG		opt_bus.h
 
 # options for USB support
 USB_DEBUG		opt_usb.h
 USB_HOST_ALIGN		opt_usb.h
 USB_REQ_DEBUG		opt_usb.h
 USB_TEMPLATE		opt_usb.h
 USB_VERBOSE		opt_usb.h
 USB_EHCI_BIG_ENDIAN_DESC	opt_usb.h
 U3G_DEBUG		opt_u3g.h
 UKBD_DFLT_KEYMAP	opt_ukbd.h
 UPLCOM_INTR_INTERVAL	opt_uplcom.h
 UVSCOM_DEFAULT_OPKTSIZE	opt_uvscom.h
 UVSCOM_INTR_INTERVAL	opt_uvscom.h
 
 # Embedded system options
 INIT_PATH
 
 ROOTDEVNAME
 
 FDC_DEBUG		opt_fdc.h
 PCFCLOCK_VERBOSE	opt_pcfclock.h
 PCFCLOCK_MAX_RETRIES	opt_pcfclock.h
 
 KTR			opt_global.h
 KTR_ALQ			opt_ktr.h
 KTR_MASK		opt_ktr.h
 KTR_CPUMASK		opt_ktr.h
 KTR_COMPILE		opt_global.h
 KTR_BOOT_ENTRIES	opt_global.h
 KTR_ENTRIES		opt_global.h
 KTR_VERBOSE		opt_ktr.h
 WITNESS			opt_global.h
 WITNESS_KDB		opt_witness.h
 WITNESS_NO_VNODE	opt_witness.h
 WITNESS_SKIPSPIN	opt_witness.h
 OPENSOLARIS_WITNESS	opt_global.h
 
 # options for ACPI support
 ACPI_DEBUG		opt_acpi.h
 ACPI_MAX_TASKS		opt_acpi.h
 ACPI_MAX_THREADS	opt_acpi.h
 ACPI_DMAR		opt_acpi.h
 
 # ISA support
 DEV_ISA			opt_isa.h
 ISAPNP			opt_isa.h
 
 # various 'device presence' options.
 DEV_BPF			opt_bpf.h
 DEV_NETMAP		opt_global.h
 DEV_MCA			opt_mca.h
 DEV_CARP		opt_carp.h
 DEV_SPLASH		opt_splash.h
 
 # EISA support
 DEV_EISA		opt_eisa.h
 EISA_SLOTS		opt_eisa.h
 
 # ed driver
 ED_HPP			opt_ed.h
 ED_3C503		opt_ed.h
 ED_SIC			opt_ed.h
 
 # bce driver
 BCE_DEBUG		opt_bce.h
 BCE_NVRAM_WRITE_SUPPORT	opt_bce.h
 
 SOCKBUF_DEBUG		opt_global.h
 
 
 # options for ubsec driver
 UBSEC_DEBUG		opt_ubsec.h
 UBSEC_RNDTEST		opt_ubsec.h
 UBSEC_NO_RNG		opt_ubsec.h
 
 # options for hifn driver
 HIFN_DEBUG		opt_hifn.h
 HIFN_RNDTEST		opt_hifn.h
 
 # options for safenet driver
 SAFE_DEBUG		opt_safe.h
 SAFE_NO_RNG		opt_safe.h
 SAFE_RNDTEST		opt_safe.h
 
 # syscons options
 MAXCONS			opt_syscons.h
 SC_ALT_MOUSE_IMAGE	opt_syscons.h
 SC_CUT_SPACES2TABS	opt_syscons.h
 SC_CUT_SEPCHARS		opt_syscons.h
 SC_DEBUG_LEVEL		opt_syscons.h
 SC_DFLT_FONT		opt_syscons.h
 SC_DISABLE_KDBKEY	opt_syscons.h
 SC_DISABLE_REBOOT	opt_syscons.h
 SC_HISTORY_SIZE		opt_syscons.h
 SC_KERNEL_CONS_ATTR	opt_syscons.h
 SC_KERNEL_CONS_REV_ATTR	opt_syscons.h
 SC_MOUSE_CHAR		opt_syscons.h
 SC_NO_CUTPASTE		opt_syscons.h
 SC_NO_FONT_LOADING	opt_syscons.h
 SC_NO_HISTORY		opt_syscons.h
 SC_NO_MODE_CHANGE	opt_syscons.h
 SC_NO_SUSPEND_VTYSWITCH	opt_syscons.h
 SC_NO_SYSMOUSE		opt_syscons.h
 SC_NORM_ATTR		opt_syscons.h
 SC_NORM_REV_ATTR	opt_syscons.h
 SC_PIXEL_MODE		opt_syscons.h
 SC_RENDER_DEBUG		opt_syscons.h
 SC_TWOBUTTON_MOUSE	opt_syscons.h
 
 # teken terminal emulator options
 TEKEN_CONS25		opt_teken.h
 TEKEN_UTF8		opt_teken.h
 
 # options for printf
 PRINTF_BUFR_SIZE	opt_printf.h
 
 # kbd options
 KBD_DISABLE_KEYMAP_LOAD	opt_kbd.h
 KBD_INSTALL_CDEV	opt_kbd.h
 KBD_MAXRETRY		opt_kbd.h
 KBD_MAXWAIT		opt_kbd.h
 KBD_RESETDELAY		opt_kbd.h
 KBDIO_DEBUG		opt_kbd.h
 
 # options for the Atheros driver
 ATH_DEBUG		opt_ath.h
 ATH_TXBUF		opt_ath.h
 ATH_RXBUF		opt_ath.h
 ATH_DIAGAPI		opt_ath.h
 ATH_TX99_DIAG		opt_ath.h
 ATH_ENABLE_11N		opt_ath.h
 ATH_ENABLE_DFS		opt_ath.h
 ATH_EEPROM_FIRMWARE	opt_ath.h
 ATH_ENABLE_RADIOTAP_VENDOR_EXT	opt_ath.h
 ATH_DEBUG_ALQ		opt_ath.h
 ATH_KTR_INTR_DEBUG	opt_ath.h
 
 # options for the Atheros hal
 AH_SUPPORT_AR5416	opt_ah.h
 # XXX For now, this breaks non-AR9130 chipsets, so only use it
 # XXX when actually targeting AR9130.
 AH_SUPPORT_AR9130	opt_ah.h
 
 # This is required for AR933x SoC support
 AH_SUPPORT_AR9330	opt_ah.h
 AH_SUPPORT_AR9340	opt_ah.h
 
 AH_DEBUG		opt_ah.h
 AH_ASSERT		opt_ah.h
 AH_DEBUG_ALQ		opt_ah.h
 AH_REGOPS_FUNC		opt_ah.h
 AH_WRITE_REGDOMAIN	opt_ah.h
 AH_DEBUG_COUNTRY	opt_ah.h
 AH_WRITE_EEPROM		opt_ah.h
 AH_PRIVATE_DIAG		opt_ah.h
 AH_NEED_DESC_SWAP	opt_ah.h
 AH_USE_INIPDGAIN	opt_ah.h
 AH_MAXCHAN		opt_ah.h
 AH_RXCFG_SDMAMW_4BYTES	opt_ah.h
 AH_INTERRUPT_DEBUGGING	opt_ah.h
 # AR5416 and later interrupt mitigation
 # XXX do not use this for AR9130
 AH_AR5416_INTERRUPT_MITIGATION	opt_ah.h
 
 # options for the Broadcom BCM43xx driver (bwi)
 BWI_DEBUG		opt_bwi.h
 BWI_DEBUG_VERBOSE	opt_bwi.h
 
 # options for the Marvell 8335 wireless driver
 MALO_DEBUG		opt_malo.h
 MALO_TXBUF		opt_malo.h
 MALO_RXBUF		opt_malo.h
 
 # options for the Marvell wireless driver
 MWL_DEBUG		opt_mwl.h
 MWL_TXBUF		opt_mwl.h
 MWL_RXBUF		opt_mwl.h
 MWL_DIAGAPI		opt_mwl.h
 MWL_AGGR_SIZE		opt_mwl.h
 MWL_TX_NODROP		opt_mwl.h
 
 # Options for the Intel 802.11n wireless driver
 IWN_DEBUG		opt_iwn.h
 
 # dcons options 
 DCONS_BUF_SIZE		opt_dcons.h
 DCONS_POLL_HZ		opt_dcons.h
 DCONS_FORCE_CONSOLE	opt_dcons.h
 DCONS_FORCE_GDB		opt_dcons.h
 
 # HWPMC options
 HWPMC_HOOKS
 HWPMC_MIPS_BACKTRACE 	opt_hwpmc_hooks.h
 
 # XBOX options for FreeBSD/i386, but some files are MI
 XBOX			opt_xbox.h
 
 # Interrupt filtering
 INTR_FILTER
 
 # 802.11 support layer
 IEEE80211_DEBUG		opt_wlan.h
 IEEE80211_DEBUG_REFCNT	opt_wlan.h
 IEEE80211_AMPDU_AGE	opt_wlan.h
 IEEE80211_SUPPORT_MESH	opt_wlan.h
 IEEE80211_SUPPORT_SUPERG	opt_wlan.h
 IEEE80211_SUPPORT_TDMA	opt_wlan.h
 IEEE80211_ALQ		opt_wlan.h
 IEEE80211_DFS_DEBUG	opt_wlan.h
 
 # 802.11 TDMA support
 TDMA_SLOTLEN_DEFAULT	opt_tdma.h
 TDMA_SLOTCNT_DEFAULT	opt_tdma.h
 TDMA_BINTVAL_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11B_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11G_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11A_DEFAULT	opt_tdma.h
 TDMA_TXRATE_TURBO_DEFAULT	opt_tdma.h
 TDMA_TXRATE_HALF_DEFAULT	opt_tdma.h
 TDMA_TXRATE_QUARTER_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11NA_DEFAULT	opt_tdma.h
 TDMA_TXRATE_11NG_DEFAULT	opt_tdma.h
 
 # Network stack virtualization options
 VIMAGE			opt_global.h
 VNET_DEBUG		opt_global.h
 
 # Common Flash Interface (CFI) options
 CFI_SUPPORT_STRATAFLASH	opt_cfi.h
 CFI_ARMEDANDDANGEROUS	opt_cfi.h
 
 # Sound options
 SND_DEBUG		opt_snd.h
 SND_DIAGNOSTIC		opt_snd.h
 SND_FEEDER_MULTIFORMAT	opt_snd.h
 SND_FEEDER_FULL_MULTIFORMAT	opt_snd.h
 SND_FEEDER_RATE_HP	opt_snd.h
 SND_PCM_64		opt_snd.h
 SND_OLDSTEREO		opt_snd.h
 
 X86BIOS
 
 # Flattened device tree options
 FDT		opt_platform.h
 FDT_DTB_STATIC	opt_platform.h
 
 # OFED Infiniband stack
 OFED		opt_ofed.h
 OFED_DEBUG_INIT	opt_ofed.h
 SDP		opt_ofed.h
 SDP_DEBUG	opt_ofed.h
 IPOIB		opt_ofed.h
 IPOIB_DEBUG	opt_ofed.h
 IPOIB_CM	opt_ofed.h
 
 # Resource Accounting
 RACCT		opt_global.h
 
 # Resource Limits
 RCTL		opt_global.h
 
 # Random number generator(s)
 RANDOM_YARROW	opt_random.h
 RANDOM_FORTUNA	opt_random.h
 RANDOM_DEBUG	opt_random.h
 RANDOM_RWFILE	opt_random.h
Index: head/sys/dev/sfxge/common/efsys.h
===================================================================
--- head/sys/dev/sfxge/common/efsys.h	(revision 258540)
+++ head/sys/dev/sfxge/common/efsys.h	(revision 258541)
@@ -1,833 +1,833 @@
 /*-
  * Copyright (c) 2010-2011 Solarflare Communications, Inc.
  * All rights reserved.
  *
  * This software was developed in part by Philip Paeps under contract for
  * Solarflare Communications, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef	_SYS_EFSYS_H
 #define	_SYS_EFSYS_H
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/endian.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 
 #include <machine/bus.h>
 #include <machine/endian.h>
 
 #define	EFSYS_HAS_UINT64 1
 #define	EFSYS_USE_UINT64 0
 #if _BYTE_ORDER == _BIG_ENDIAN
 #define EFSYS_IS_BIG_ENDIAN 1
 #define EFSYS_IS_LITTLE_ENDIAN 0
 #elif _BYTE_ORDER == _LITTLE_ENDIAN
 #define EFSYS_IS_BIG_ENDIAN 0
 #define EFSYS_IS_LITTLE_ENDIAN 1
 #endif
 #include "efx_types.h"
 
 /* Common code requires this */
 #if __FreeBSD_version < 800068
 #define memmove(d, s, l) bcopy(s, d, l)
 #endif
 	
 /* FreeBSD equivalents of Solaris things */
 #ifndef _NOTE
 #define _NOTE(s)
 #endif
 
 #ifndef B_FALSE
 #define B_FALSE FALSE
 #endif
 #ifndef B_TRUE
 #define B_TRUE TRUE
 #endif
 
 #ifndef IS_P2ALIGNED
 #define	IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
 #endif
 
 #ifndef P2ROUNDUP
 #define P2ROUNDUP(x, align)             (-(-(x) & -(align)))
 #endif
 
 /* True when x has at most one bit set, i.e. x is zero or a power of two. */
 #ifndef ISP2
 #define ISP2(x)         (((x) & ((x) - 1)) == 0)
 #endif
 
 #define ENOTACTIVE EINVAL
 
 /* Memory type to use on FreeBSD */
 MALLOC_DECLARE(M_SFXGE);
 
 /* Machine-dependent prefetch wrappers */
 #if defined(__i386__) || defined(__amd64__)
 static __inline void
 prefetch_read_many(void *addr)
 {
 
 	__asm__(
 	    "prefetcht0 (%0)"
 	    :
 	    : "r" (addr));
 }
 
 static __inline void
 prefetch_read_once(void *addr)
 {
 
 	__asm__(
 	    "prefetchnta (%0)"
 	    :
 	    : "r" (addr));
 }
 #elif defined(__sparc64__)
 static __inline void
 prefetch_read_many(void *addr)
 {
 
 	__asm__(
 	    "prefetch [%0], 0"
 	    :
 	    : "r" (addr));
 }
 
 static __inline void
 prefetch_read_once(void *addr)
 {
 
 	__asm__(
 	    "prefetch [%0], 1"
 	    :
 	    : "r" (addr));
 }
 #else
 static __inline void
 prefetch_read_many(void *addr)
 {
 
 }
 
 static __inline void
 prefetch_read_once(void *addr)
 {
 
 }
 #endif
 
 #if defined(__i386__) || defined(__amd64__)
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #endif
 /*
  * Fill in one DMA segment descriptor (*seg) for the data of mbuf m.
  *
  * On i386/amd64 the segment is built directly: ds_addr is pmap_kextract()
  * of the mbuf's data pointer and ds_len is m->m_len; tag and map are not
  * used.  On other architectures the mbuf is loaded with
  * bus_dmamap_load_mbuf_sg() (flags 0); both the segment count it reports
  * and its return value are discarded.
  *
  * NOTE(review): the direct path translates only the first data address, so
  * it presumably relies on the mbuf data being physically contiguous --
  * confirm against the callers.
  */
 static __inline void
 sfxge_map_mbuf_fast(bus_dma_tag_t tag, bus_dmamap_t map,
     struct mbuf *m, bus_dma_segment_t *seg)
 {
 #if defined(__i386__) || defined(__amd64__)
 	seg->ds_addr = pmap_kextract(mtod(m, vm_offset_t));
 	seg->ds_len = m->m_len;
 #else
 	int nsegstmp;
 
 	bus_dmamap_load_mbuf_sg(tag, map, m, seg, &nsegstmp, 0);
 #endif
 }
 
 /* Modifiers used for DOS builds */
 #define	__cs
 #define	__far
 
 /* Modifiers used for Windows builds */
 #define	__in
 #define	__in_opt
 #define	__in_ecount(_n)
 #define	__in_ecount_opt(_n)
 #define	__in_bcount(_n)
 #define	__in_bcount_opt(_n)
 
 #define	__out
 #define	__out_opt
 #define	__out_ecount(_n)
 #define	__out_ecount_opt(_n)
 #define	__out_bcount(_n)
 #define	__out_bcount_opt(_n)
 
 #define	__deref_out
 
 #define	__inout
 #define	__inout_opt
 #define	__inout_ecount(_n)
 #define	__inout_ecount_opt(_n)
 #define	__inout_bcount(_n)
 #define	__inout_bcount_opt(_n)
 #define	__inout_bcount_full_opt(_n)
 
 #define	__deref_out_bcount_opt(n)
 
 #define	__checkReturn
 
 #define	__drv_when(_p, _c)
 
 /* Code inclusion options */
 
 
 #define	EFSYS_OPT_NAMES 1
 
 #define	EFSYS_OPT_FALCON 0
 #define	EFSYS_OPT_FALCON_NIC_CFG_OVERRIDE 0
 #define	EFSYS_OPT_SIENA 1
 #ifdef DEBUG
 #define	EFSYS_OPT_CHECK_REG 1
 #else
 #define	EFSYS_OPT_CHECK_REG 0
 #endif
 
 #define	EFSYS_OPT_MCDI 1
 
 #define	EFSYS_OPT_MAC_FALCON_GMAC 0
 #define	EFSYS_OPT_MAC_FALCON_XMAC 0
 #define	EFSYS_OPT_MAC_STATS 1
 
 #define	EFSYS_OPT_LOOPBACK 0
 
 #define	EFSYS_OPT_MON_NULL 0
 #define	EFSYS_OPT_MON_LM87 0
 #define	EFSYS_OPT_MON_MAX6647 0
 #define	EFSYS_OPT_MON_SIENA 0
 #define	EFSYS_OPT_MON_STATS 0
 
 #define	EFSYS_OPT_PHY_NULL 0
 #define	EFSYS_OPT_PHY_QT2022C2 0
 #define	EFSYS_OPT_PHY_SFX7101 0
 #define	EFSYS_OPT_PHY_TXC43128 0
 #define	EFSYS_OPT_PHY_PM8358 0
 #define	EFSYS_OPT_PHY_SFT9001 0
 #define	EFSYS_OPT_PHY_QT2025C 0
 #define	EFSYS_OPT_PHY_STATS 1
 #define	EFSYS_OPT_PHY_PROPS 0
 #define	EFSYS_OPT_PHY_BIST 1
 #define	EFSYS_OPT_PHY_LED_CONTROL 1
 #define EFSYS_OPT_PHY_FLAGS 0
 
 #define	EFSYS_OPT_VPD 1
 #define	EFSYS_OPT_NVRAM 1
 #define	EFSYS_OPT_NVRAM_FALCON_BOOTROM 0
 #define	EFSYS_OPT_NVRAM_SFT9001	0
 #define	EFSYS_OPT_NVRAM_SFX7101	0
 #define	EFSYS_OPT_BOOTCFG 0
 
 #define	EFSYS_OPT_PCIE_TUNE 0
 #define	EFSYS_OPT_DIAG 0
 #define	EFSYS_OPT_WOL 1
 #define	EFSYS_OPT_RX_SCALE 1
 #define	EFSYS_OPT_QSTATS 1
 #define EFSYS_OPT_FILTER 0
 #define EFSYS_OPT_RX_SCATTER 0
 #define	EFSYS_OPT_RX_HDR_SPLIT 0
 
 #define	EFSYS_OPT_EV_PREFETCH 0
 
 #define	EFSYS_OPT_DECODE_INTR_FATAL 1
 
 /* ID */
 
 typedef struct __efsys_identifier_s	efsys_identifier_t;
 
 /* PROBE */
 
-#ifndef KDTRACE_HOOKS
+#ifndef DTRACE_PROBE
 
 #define EFSYS_PROBE(_name)
 
 #define	EFSYS_PROBE1(_name, _type1, _arg1)
 
 #define	EFSYS_PROBE2(_name, _type1, _arg1, _type2, _arg2)
 
 #define	EFSYS_PROBE3(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3)
 
 #define	EFSYS_PROBE4(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4)
 
 #define	EFSYS_PROBE5(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5)
 
 #define	EFSYS_PROBE6(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5,		\
 	    _type6, _arg6)
 
 #define	EFSYS_PROBE7(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5,		\
 	    _type6, _arg6, _type7, _arg7)
 
-#else /* KDTRACE_HOOKS */
+#else /* DTRACE_PROBE */
 
 #define	EFSYS_PROBE(_name)						\
 	DTRACE_PROBE(_name)
 
 #define	EFSYS_PROBE1(_name, _type1, _arg1)				\
 	DTRACE_PROBE1(_name, _type1, _arg1)
 
 #define	EFSYS_PROBE2(_name, _type1, _arg1, _type2, _arg2)		\
 	DTRACE_PROBE2(_name, _type1, _arg1, _type2, _arg2)
 
 #define	EFSYS_PROBE3(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3)						\
 	DTRACE_PROBE3(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3)
 
 #define	EFSYS_PROBE4(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4)				\
 	DTRACE_PROBE4(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4)
 
 /*
  * Use DTRACE_PROBE5 when that macro is defined; otherwise fall back to
  * DTRACE_PROBE4 with the first four type/argument pairs, so the fifth
  * argument is not reported.
  */
 #ifdef DTRACE_PROBE5
 #define	EFSYS_PROBE5(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5)		\
 	DTRACE_PROBE5(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5)
 #else
 #define	EFSYS_PROBE5(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5)		\
 	DTRACE_PROBE4(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4)
 #endif
 
 /*
  * Use DTRACE_PROBE6 when that macro is defined; otherwise forward the first
  * five type/argument pairs to EFSYS_PROBE5, so the sixth argument is not
  * reported.
  */
 #ifdef DTRACE_PROBE6
 #define	EFSYS_PROBE6(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5,		\
 	    _type6, _arg6)						\
 	DTRACE_PROBE6(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5,		\
 	    _type6, _arg6)
 #else
 #define	EFSYS_PROBE6(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5,		\
 	    _type6, _arg6)						\
 	EFSYS_PROBE5(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5)
 #endif
 
 /*
  * Use DTRACE_PROBE7 when that macro is defined; otherwise forward the first
  * six type/argument pairs to EFSYS_PROBE6, so the seventh argument is not
  * reported.
  */
 #ifdef DTRACE_PROBE7
 #define	EFSYS_PROBE7(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5,		\
 	    _type6, _arg6, _type7, _arg7)				\
 	DTRACE_PROBE7(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5,		\
 	    _type6, _arg6, _type7, _arg7)
 #else
 #define	EFSYS_PROBE7(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5,		\
 	    _type6, _arg6, _type7, _arg7)				\
 	EFSYS_PROBE6(_name, _type1, _arg1, _type2, _arg2,		\
 	    _type3, _arg3, _type4, _arg4, _type5, _arg5,		\
 	    _type6, _arg6)
 #endif
 
-#endif /* KDTRACE_HOOKS */
+#endif /* DTRACE_PROBE */
 
 /* DMA */
 
 typedef uint64_t		efsys_dma_addr_t;
 
 /*
  * Descriptor for one memory region accessed through the EFSYS_MEM_* macros.
  */
 typedef struct efsys_mem_s {
 	/* bus_dma tag and map associated with the region. */
 	bus_dma_tag_t		esm_tag;
 	bus_dmamap_t		esm_map;
 	/* Base pointer the EFSYS_MEM_* macros add their offsets to. */
 	caddr_t			esm_base;
 	/* Value returned by EFSYS_MEM_ADDR(); presumably the bus address. */
 	efsys_dma_addr_t	esm_addr;
 	/* Presumably the size of the region in bytes -- confirm at the allocator. */
 	size_t			esm_size;
 } efsys_mem_t;
 
 
 /* Zero _size bytes of the region, starting at esm_base. */
 #define	EFSYS_MEM_ZERO(_esmp, _size)					\
 	do {								\
 		(void) memset((_esmp)->esm_base, 0, (_size));		\
 									\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Load the 32-bit word at esm_base + _offset into (_edp)->ed_u32[0], then
  * fire the mem_readd probe with the offset and the value read.  The load is
  * a plain uint32_t dereference.  _offset must be a multiple of
  * sizeof (efx_dword_t); this is checked with KASSERT.
  */
 #define	EFSYS_MEM_READD(_esmp, _offset, _edp)				\
 	do {								\
 		uint32_t *addr;						\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_dword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		addr = (void *)((_esmp)->esm_base + (_offset));		\
 									\
 		(_edp)->ed_u32[0] = *addr;				\
 									\
 		EFSYS_PROBE2(mem_readd, unsigned int, (_offset),	\
 		    uint32_t, (_edp)->ed_u32[0]);			\
 									\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Load two consecutive 32-bit words starting at esm_base + _offset into
  * (_eqp)->eq_u32[0] and eq_u32[1], in that order, then fire the mem_readq
  * probe with the offset, eq_u32[1] and eq_u32[0].  _offset must be a
  * multiple of sizeof (efx_qword_t); this is checked with KASSERT.
  */
 #define	EFSYS_MEM_READQ(_esmp, _offset, _eqp)				\
 	do {								\
 		uint32_t *addr;						\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_qword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		addr = (void *)((_esmp)->esm_base + (_offset));		\
 									\
 		(_eqp)->eq_u32[0] = *addr++;				\
 		(_eqp)->eq_u32[1] = *addr;				\
 									\
 		EFSYS_PROBE3(mem_readq, unsigned int, (_offset),	\
 		    uint32_t, (_eqp)->eq_u32[1],			\
 		    uint32_t, (_eqp)->eq_u32[0]);			\
 									\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Load four consecutive 32-bit words starting at esm_base + _offset into
  * (_eop)->eo_u32[0..3], in ascending order, then fire the mem_reado probe
  * with the offset and the four words (eo_u32[3] first).  _offset must be a
  * multiple of sizeof (efx_oword_t); this is checked with KASSERT.
  */
 #define	EFSYS_MEM_READO(_esmp, _offset, _eop)				\
 	do {								\
 		uint32_t *addr;						\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_oword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		addr = (void *)((_esmp)->esm_base + (_offset));		\
 									\
 		(_eop)->eo_u32[0] = *addr++;				\
 		(_eop)->eo_u32[1] = *addr++;				\
 		(_eop)->eo_u32[2] = *addr++;				\
 		(_eop)->eo_u32[3] = *addr;				\
 									\
 		EFSYS_PROBE5(mem_reado, unsigned int, (_offset),	\
 		    uint32_t, (_eop)->eo_u32[3],			\
 		    uint32_t, (_eop)->eo_u32[2],			\
 		    uint32_t, (_eop)->eo_u32[1],			\
 		    uint32_t, (_eop)->eo_u32[0]);			\
 									\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Fire the mem_writed probe with the offset and (_edp)->ed_u32[0], then
  * store that 32-bit word at esm_base + _offset.  _offset must be a multiple
  * of sizeof (efx_dword_t); this is checked with KASSERT.
  */
 #define	EFSYS_MEM_WRITED(_esmp, _offset, _edp)				\
 	do {								\
 		uint32_t *addr;						\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_dword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		EFSYS_PROBE2(mem_writed, unsigned int, (_offset),	\
 		    uint32_t, (_edp)->ed_u32[0]);			\
 									\
 		addr = (void *)((_esmp)->esm_base + (_offset));		\
 									\
 		*addr = (_edp)->ed_u32[0];				\
 									\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Fire the mem_writeq probe with the offset, eq_u32[1] and eq_u32[0], then
  * store (_eqp)->eq_u32[0] and eq_u32[1], in that order, as two consecutive
  * 32-bit words starting at esm_base + _offset.  _offset must be a multiple
  * of sizeof (efx_qword_t); this is checked with KASSERT.
  */
 #define	EFSYS_MEM_WRITEQ(_esmp, _offset, _eqp)				\
 	do {								\
 		uint32_t *addr;						\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_qword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		EFSYS_PROBE3(mem_writeq, unsigned int, (_offset),	\
 		    uint32_t, (_eqp)->eq_u32[1],			\
 		    uint32_t, (_eqp)->eq_u32[0]);			\
 									\
 		addr = (void *)((_esmp)->esm_base + (_offset));		\
 									\
 		*addr++ = (_eqp)->eq_u32[0];				\
 		*addr   = (_eqp)->eq_u32[1];				\
 									\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Fire the mem_writeo probe with the offset and the four words (eo_u32[3]
  * first), then store (_eop)->eo_u32[0..3], in ascending order, as four
  * consecutive 32-bit words starting at esm_base + _offset.  _offset must be
  * a multiple of sizeof (efx_oword_t); this is checked with KASSERT.
  */
 #define	EFSYS_MEM_WRITEO(_esmp, _offset, _eop)				\
 	do {								\
 		uint32_t *addr;						\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_oword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		EFSYS_PROBE5(mem_writeo, unsigned int, (_offset),	\
 		    uint32_t, (_eop)->eo_u32[3],			\
 		    uint32_t, (_eop)->eo_u32[2],			\
 		    uint32_t, (_eop)->eo_u32[1],			\
 		    uint32_t, (_eop)->eo_u32[0]);			\
 									\
 		addr = (void *)((_esmp)->esm_base + (_offset));		\
 									\
 		*addr++ = (_eop)->eo_u32[0];				\
 		*addr++ = (_eop)->eo_u32[1];				\
 		*addr++ = (_eop)->eo_u32[2];				\
 		*addr   = (_eop)->eo_u32[3];				\
 									\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* Evaluates to the esm_addr field of the memory descriptor. */
 #define	EFSYS_MEM_ADDR(_esmp)						\
 	((_esmp)->esm_addr)
 
 /* BAR */
 
 /*
  * Descriptor for one BAR accessed through the EFSYS_BAR_* macros.
  */
 typedef struct efsys_bar_s {
 	/* Mutex the EFSYS_BAR_* macros take around register accesses. */
 	struct mtx		esb_lock;
 	/* bus_space tag and handle passed to bus_space_read_4/write_4. */
 	bus_space_tag_t		esb_tag;
 	bus_space_handle_t	esb_handle;
 	/* Presumably the resource ID and resource backing the BAR -- confirm. */
 	int			esb_rid;
 	struct resource		*esb_res;
 } efsys_bar_t;
 
 /*
  * Read the 32-bit register at _offset with bus_space_read_4() into
  * (_edp)->ed_u32[0], then fire the bar_readd probe with the offset and the
  * value read.  esb_lock is held across the read and the probe only when
  * _lock is non-zero.  _offset must be a multiple of sizeof (efx_dword_t);
  * this is checked with KASSERT.
  */
 #define	EFSYS_BAR_READD(_esbp, _offset, _edp, _lock)			\
 	do {								\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_dword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		if (_lock)						\
 			mtx_lock(&((_esbp)->esb_lock));			\
 									\
 		(_edp)->ed_u32[0] = bus_space_read_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset));			\
 									\
 		EFSYS_PROBE2(bar_readd, unsigned int, (_offset),	\
 		    uint32_t, (_edp)->ed_u32[0]);			\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		if (_lock)						\
 			mtx_unlock(&((_esbp)->esb_lock));		\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Read a qword register as two bus_space_read_4() accesses, at _offset and
  * _offset + 4, into (_eqp)->eq_u32[0] and eq_u32[1], then fire the
  * bar_readq probe with the offset, eq_u32[1] and eq_u32[0].  esb_lock is
  * always held across both reads and the probe.  _offset must be a multiple
  * of sizeof (efx_qword_t); this is checked with KASSERT.
  *
  * _offset is parenthesized before 4 is added so that an expression
  * argument (e.g. "base >> 1") is not regrouped by operator precedence.
  */
 #define	EFSYS_BAR_READQ(_esbp, _offset, _eqp)				\
 	do {								\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_qword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		mtx_lock(&((_esbp)->esb_lock));				\
 									\
 		(_eqp)->eq_u32[0] = bus_space_read_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset));			\
 		(_eqp)->eq_u32[1] = bus_space_read_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset) + 4);		\
 									\
 		EFSYS_PROBE3(bar_readq, unsigned int, (_offset),	\
 		    uint32_t, (_eqp)->eq_u32[1],			\
 		    uint32_t, (_eqp)->eq_u32[0]);			\
 									\
 		mtx_unlock(&((_esbp)->esb_lock));			\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Read an oword register as four bus_space_read_4() accesses, at _offset,
  * _offset + 4, _offset + 8 and _offset + 12, into (_eop)->eo_u32[0..3],
  * then fire the bar_reado probe with the offset and the four words
  * (eo_u32[3] first).  esb_lock is held across the reads and the probe only
  * when _lock is non-zero.  _offset must be a multiple of
  * sizeof (efx_oword_t); this is checked with KASSERT.
  *
  * _offset is parenthesized before each constant is added so that an
  * expression argument (e.g. "base >> 1") is not regrouped by operator
  * precedence.
  */
 #define	EFSYS_BAR_READO(_esbp, _offset, _eop, _lock)			\
 	do {								\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED(_offset, sizeof (efx_oword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		if (_lock)						\
 			mtx_lock(&((_esbp)->esb_lock));			\
 									\
 		(_eop)->eo_u32[0] = bus_space_read_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset));			\
 		(_eop)->eo_u32[1] = bus_space_read_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset) + 4);		\
 		(_eop)->eo_u32[2] = bus_space_read_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset) + 8);		\
 		(_eop)->eo_u32[3] = bus_space_read_4((_esbp)->esb_tag,	\
 		    (_esbp)->esb_handle, (_offset) + 12);		\
 									\
 		EFSYS_PROBE5(bar_reado, unsigned int, (_offset),	\
 		    uint32_t, (_eop)->eo_u32[3],			\
 		    uint32_t, (_eop)->eo_u32[2],			\
 		    uint32_t, (_eop)->eo_u32[1],			\
 		    uint32_t, (_eop)->eo_u32[0]);			\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		if (_lock)						\
 			mtx_unlock(&((_esbp)->esb_lock));		\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Write (_src)->ed_u32[0] to the 32-bit register at byte offset _off of
  * the BAR described by _bar.  The BAR mutex is taken around the access
  * only when _locked is non-zero.
  */
 #define	EFSYS_BAR_WRITED(_bar, _off, _src, _locked)			\
 	do {								\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED(_off, sizeof (efx_dword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		if (_locked) {						\
 			mtx_lock(&((_bar)->esb_lock));			\
 		}							\
 									\
 		EFSYS_PROBE2(bar_writed, unsigned int, (_off),		\
 		    uint32_t, (_src)->ed_u32[0]);			\
 									\
 		bus_space_write_4((_bar)->esb_tag, (_bar)->esb_handle,	\
 		    (_off), (_src)->ed_u32[0]);				\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		if (_locked) {						\
 			mtx_unlock(&((_bar)->esb_lock));		\
 		}							\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Write the 64-bit value *_eqp to the register at byte offset _offset of
  * the BAR described by _esbp.  The value is stored as two 32-bit writes
  * (low dword at _offset, then high dword at _offset + 4) with the BAR
  * mutex held across both.  _offset is parenthesised everywhere so that
  * an expression argument yields the intended addresses.
  */
 #define	EFSYS_BAR_WRITEQ(_esbp, _offset, _eqp)				\
 	do {								\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED((_offset), sizeof (efx_qword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		mtx_lock(&((_esbp)->esb_lock));				\
 									\
 		EFSYS_PROBE3(bar_writeq, unsigned int, (_offset),	\
 		    uint32_t, (_eqp)->eq_u32[1],			\
 		    uint32_t, (_eqp)->eq_u32[0]);			\
 									\
 		bus_space_write_4((_esbp)->esb_tag, (_esbp)->esb_handle,\
 		    (_offset), (_eqp)->eq_u32[0]);			\
 		bus_space_write_4((_esbp)->esb_tag, (_esbp)->esb_handle,\
 		    (_offset) + 4, (_eqp)->eq_u32[1]);			\
 									\
 		mtx_unlock(&((_esbp)->esb_lock));			\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Write the 128-bit value *_eop to the register at byte offset _offset of
  * the BAR described by _esbp, as four consecutive 32-bit writes starting
  * with the lowest dword.  The BAR mutex is held across the writes only
  * when _lock is non-zero.  _offset is parenthesised everywhere so that an
  * expression argument yields the intended addresses.
  */
 #define	EFSYS_BAR_WRITEO(_esbp, _offset, _eop, _lock)			\
 	do {								\
 		_NOTE(CONSTANTCONDITION)				\
 		KASSERT(IS_P2ALIGNED((_offset), sizeof (efx_oword_t)),	\
 		    ("not power of 2 aligned"));			\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		if (_lock)						\
 			mtx_lock(&((_esbp)->esb_lock));			\
 									\
 		EFSYS_PROBE5(bar_writeo, unsigned int, (_offset),	\
 		    uint32_t, (_eop)->eo_u32[3],			\
 		    uint32_t, (_eop)->eo_u32[2],			\
 		    uint32_t, (_eop)->eo_u32[1],			\
 		    uint32_t, (_eop)->eo_u32[0]);			\
 									\
 		bus_space_write_4((_esbp)->esb_tag, (_esbp)->esb_handle,\
 		    (_offset), (_eop)->eo_u32[0]);			\
 		bus_space_write_4((_esbp)->esb_tag, (_esbp)->esb_handle,\
 		    (_offset) + 4, (_eop)->eo_u32[1]);			\
 		bus_space_write_4((_esbp)->esb_tag, (_esbp)->esb_handle,\
 		    (_offset) + 8, (_eop)->eo_u32[2]);			\
 		bus_space_write_4((_esbp)->esb_tag, (_esbp)->esb_handle,\
 		    (_offset) + 12, (_eop)->eo_u32[3]);			\
 									\
 		_NOTE(CONSTANTCONDITION)				\
 		if (_lock)						\
 			mtx_unlock(&((_esbp)->esb_lock));		\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* SPIN */
 
 /* Wait by handing _usec straight to DELAY(). */
 #define	EFSYS_SPIN(_usec)						\
 	do {								\
 		DELAY(_usec);						\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 #define	EFSYS_SLEEP	EFSYS_SPIN
 
 /* BARRIERS */
 
 /* Strict ordering guaranteed by devacc.devacc_attr_dataorder */
 #define	EFSYS_MEM_READ_BARRIER()
 #define	EFSYS_PIO_WRITE_BARRIER()
 
 /* TIMESTAMP */
 
 typedef	clock_t	efsys_timestamp_t;
 
 /*
  * Store the current tick count, converted to microseconds, in *_usp.
  *
  * ticks advances hz times per second, so microseconds = ticks * 1000000 / hz.
  * The multiplication is done in 64 bits so the intermediate product does
  * not overflow a narrow clock_t.
  *
  * NOTE(review): the result is still narrowed to the type of *_usp on
  * assignment; if efsys_timestamp_t is a 32-bit clock_t the microsecond
  * value wraps fairly quickly -- confirm callers only compare short
  * intervals.
  */
 #define	EFSYS_TIMESTAMP(_usp)						\
 	do {								\
 		clock_t now;						\
 									\
 		now = ticks;						\
 		*(_usp) = (int64_t)now * 1000000 / hz;			\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* KMEM */
 
 /*
  * Allocate _size bytes of zeroed memory of malloc type M_SFXGE and store
  * the pointer in _p.  The allocation uses M_WAITOK.  _esip is unused; it
  * is referenced through a (void) cast, as in EFSYS_KMEM_FREE, so that it
  * does not have to be a modifiable lvalue.
  */
 #define	EFSYS_KMEM_ALLOC(_esip, _size, _p)				\
 	do {								\
 		(void) (_esip);						\
 		(_p) = malloc((_size), M_SFXGE, M_WAITOK|M_ZERO);	\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Release _ptr, an allocation of malloc type M_SFXGE.  _id and _len are
  * evaluated and discarded.
  */
 #define	EFSYS_KMEM_FREE(_id, _len, _ptr)				\
 	do {								\
 		(void) (_id);						\
 		(void) (_len);						\
 		free((_ptr), M_SFXGE);					\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* LOCK */
 
 typedef struct mtx	efsys_lock_t;
 
 #define	EFSYS_LOCK_MAGIC	0x000010c4
 
 /*
  * Acquire the mutex _mtxp and then stamp _cookie with EFSYS_LOCK_MAGIC,
  * the value EFSYS_UNLOCK checks for.
  */
 #define	EFSYS_LOCK(_mtxp, _cookie)					\
 	do {								\
 		mtx_lock(_mtxp);					\
 		(_cookie) = EFSYS_LOCK_MAGIC;				\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Release the mutex _mtxp.  When _cookie is not EFSYS_LOCK_MAGIC a
  * "not locked" KASSERT is raised first; the mutex is released either way.
  */
 #define	EFSYS_UNLOCK(_mtxp, _cookie)					\
 	do {								\
 		if ((_cookie) != EFSYS_LOCK_MAGIC) {			\
 			KASSERT(B_FALSE, ("not locked"));		\
 		}							\
 		mtx_unlock(_mtxp);					\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* PREEMPT */
 
 /*
  * Enter a critical section via critical_enter().  _cookie is only
  * assigned to itself; no state is recorded in it.
  */
 #define	EFSYS_PREEMPT_DISABLE(_cookie)					\
 	do {								\
 		(_cookie) = (_cookie);					\
 		critical_enter();					\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /*
  * Leave the critical section entered by EFSYS_PREEMPT_DISABLE.
  *
  * critical_exit() takes no arguments, so _state is not passed to it;
  * _state is only assigned to itself and carries no information.
  */
 #define	EFSYS_PREEMPT_ENABLE(_state)					\
 	do {								\
 		(_state) = (_state);					\
 		critical_exit();					\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* STAT */
 
 /* Statistics counters are plain 64-bit unsigned integers. */
 typedef uint64_t		efsys_stat_t;
 
 /* Add _amount to the counter that _statp points at. */
 #define	EFSYS_STAT_INCR(_statp, _amount)				\
 	do {								\
 		*(_statp) += (_amount);					\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* Subtract _amount from the counter that _statp points at. */
 #define	EFSYS_STAT_DECR(_statp, _amount)				\
 	do {								\
 		*(_statp) -= (_amount);					\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* Overwrite the counter that _statp points at with _value. */
 #define	EFSYS_STAT_SET(_statp, _value)					\
 	do {								\
 		*(_statp) = (_value);					\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* Overwrite the counter with le64toh() of the qword's eq_u64[0]. */
 #define	EFSYS_STAT_SET_QWORD(_statp, _qwordp)				\
 	do {								\
 		*(_statp) = le64toh((_qwordp)->eq_u64[0]);		\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* Overwrite the counter with le32toh() of the dword's ed_u32[0]. */
 #define	EFSYS_STAT_SET_DWORD(_statp, _dwordp)				\
 	do {								\
 		*(_statp) = le32toh((_dwordp)->ed_u32[0]);		\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* Add le64toh() of the qword's eq_u64[0] to the counter. */
 #define	EFSYS_STAT_INCR_QWORD(_statp, _qwordp)				\
 	do {								\
 		*(_statp) += le64toh((_qwordp)->eq_u64[0]);		\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* Subtract le64toh() of the qword's eq_u64[0] from the counter. */
 #define	EFSYS_STAT_SUBR_QWORD(_statp, _qwordp)				\
 	do {								\
 		*(_statp) -= le64toh((_qwordp)->eq_u64[0]);		\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 
 /* ERR */
 
 extern void	sfxge_err(efsys_identifier_t *, unsigned int,
 		    uint32_t, uint32_t);
 
 #if EFSYS_OPT_DECODE_INTR_FATAL
 /* Forward an error code and its two dwords of detail to sfxge_err(). */
 #define	EFSYS_ERR(_id, _errcode, _d0, _d1)				\
 	do {								\
 		sfxge_err((_id), (_errcode), (_d0), (_d1));		\
 	_NOTE(CONSTANTCONDITION)					\
 	} while (B_FALSE)
 #endif
 
 /* ASSERT */
 
 /*
  * Panic with the text of _exp when _exp evaluates to false.
  *
  * The stringified expression is passed as an argument to a "%s" format
  * rather than as the format itself, so an expression containing '%'
  * (e.g. a modulo test) is reported verbatim instead of being parsed as
  * conversion specifiers.
  */
 #define	EFSYS_ASSERT(_exp) do {						\
 	if (!(_exp))							\
 		panic("%s", #_exp);					\
 	} while (0)
 
 /*
  * Cast both operands to _type, compare them with _cmp and panic with the
  * file and line of the expansion when the comparison is false.
  */
 #define EFSYS_ASSERT3(_lhs, _cmp, _rhs, _type) do {			\
 	const _type efsys_lhs_ = (_type)(_lhs);				\
 	const _type efsys_rhs_ = (_type)(_rhs);				\
 	if (!(efsys_lhs_ _cmp efsys_rhs_)) {				\
 		panic("assertion failed at %s:%u", __FILE__, __LINE__);	\
 	}								\
 	} while(0)
 
 #define EFSYS_ASSERT3U(_x, _op, _y)	EFSYS_ASSERT3(_x, _op, _y, uint64_t)
 #define EFSYS_ASSERT3S(_x, _op, _y)	EFSYS_ASSERT3(_x, _op, _y, int64_t)
 #define EFSYS_ASSERT3P(_x, _op, _y)	EFSYS_ASSERT3(_x, _op, _y, uintptr_t)
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_EFSYS_H */
Index: head/sys/dev/xen/blkback/blkback.c
===================================================================
--- head/sys/dev/xen/blkback/blkback.c	(revision 258540)
+++ head/sys/dev/xen/blkback/blkback.c	(revision 258541)
@@ -1,3973 +1,3971 @@
 /*-
  * Copyright (c) 2009-2011 Spectra Logic Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    substantially similar to the "NO WARRANTY" disclaimer below
  *    ("Disclaimer") and any redistribution must be conditioned upon
  *    including a substantially similar Disclaimer requirement for further
  *    binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  *
  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
  *          Ken Merry           (Spectra Logic Corporation)
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /**
  * \file blkback.c
  *
  * \brief Device driver supporting the vending of block storage from
  *        a FreeBSD domain to other domains.
  */
 
-#include "opt_kdtrace.h"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/devicestat.h>
 #include <sys/disk.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/kdb.h>
 #include <sys/module.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/rman.h>
 #include <sys/taskqueue.h>
 #include <sys/types.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/sysctl.h>
 #include <sys/bitstring.h>
 #include <sys/sdt.h>
 
 #include <geom/geom.h>
 
 #include <machine/_inttypes.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 
 #include <xen/xen-os.h>
 #include <xen/blkif.h>
 #include <xen/gnttab.h>
 #include <xen/xen_intr.h>
 
 #include <xen/interface/event_channel.h>
 #include <xen/interface/grant_table.h>
 
 #include <xen/xenbus/xenbusvar.h>
 
 /*--------------------------- Compile-time Tunables --------------------------*/
 /**
  * The maximum number of outstanding request blocks (request headers plus
  * additional segment blocks) we will allow in a negotiated block-front/back
  * communication channel.
  */
 #define	XBB_MAX_REQUESTS	256
 
 /**
  * \brief Define to force all I/O to be performed on memory owned by the
  *        backend device, with a copy-in/out to the remote domain's memory.
  *
  * \note  This option is currently required when this driver's domain is
  *        operating in HVM mode on a system using an IOMMU.
  *
  * This driver uses Xen's grant table API to gain access to the memory of
  * the remote domains it serves.  When our domain is operating in PV mode,
  * the grant table mechanism directly updates our domain's page table entries
  * to point to the physical pages of the remote domain.  This scheme guarantees
  * that blkback and the backing devices it uses can safely perform DMA
  * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
  * ensure that our domain cannot DMA to pages owned by another domain.  As
  * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
  * table API.  For this reason, in HVM mode, we must bounce all requests into
  * memory that is mapped into our domain at domain startup and thus has
  * valid IOMMU mappings.
  */
 #define XBB_USE_BOUNCE_BUFFERS
 
 /**
  * \brief Define to enable rudimentary request logging to the console.
  */
 #undef XBB_DEBUG
 
 /*---------------------------------- Macros ----------------------------------*/
 /**
  * Custom malloc type for all driver allocations.
  */
 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
 
 /*
  * Debug logging: with XBB_DEBUG defined, print the message prefixed with
  * the calling function and line; otherwise expand to an empty statement
  * that does not evaluate its arguments.
  */
 #ifndef XBB_DEBUG
 #define DPRINTF(fmt, args...) do {} while(0)
 #else
 #define DPRINTF(fmt, args...)					\
     printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
 #endif
 
 /**
  * The maximum mapped region size per request we will allow in a negotiated
  * block-front/back communication channel.
  *
  * This is the smaller of MAXPHYS and the size of the largest segment
  * count the blkif protocol allows in one request, at one page per segment.
  */
 #define	XBB_MAX_REQUEST_SIZE					\
 	MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
 
 /**
  * The maximum number of segments (within a request header and accompanying
  * segment blocks) per request we will allow in a negotiated block-front/back
  * communication channel.
  *
  * Bounded by UIO_MAXIOV, by the protocol limit, and by the page count of
  * XBB_MAX_REQUEST_SIZE plus one.
  * NOTE(review): the "+ 1" presumably covers a transfer that does not start
  * on a page boundary -- confirm.
  */
 #define	XBB_MAX_SEGMENTS_PER_REQUEST				\
 	(MIN(UIO_MAXIOV,					\
 	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,		\
 		 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
 
 /**
  * The maximum number of shared memory ring pages we will allow in a
  * negotiated block-front/back communication channel.  Allow enough
  * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
  */
 #define	XBB_MAX_RING_PAGES						    \
 	BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
 		       * XBB_MAX_REQUESTS)
 /**
  * The maximum number of ring pages that we can allow per request list.
  * We limit this to the maximum number of segments per request, because
  * that is already a reasonable number of segments to aggregate.  This
  * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
  * because that would leave situations where we can't dispatch even one
  * large request.
  *
  * NOTE(review): the macro name and its use as an array bound elsewhere in
  * this file suggest this counts segments rather than ring pages -- confirm
  * and reword the first sentence if so.
  */
 #define	XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
 
 /*--------------------------- Forward Declarations ---------------------------*/
 struct xbb_softc;
 struct xbb_xen_req;
 
 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
 			      ...) __attribute__((format(printf, 3, 4)));
 static int  xbb_shutdown(struct xbb_softc *xbb);
 static int  xbb_detach(device_t dev);
 
 /*------------------------------ Data Structures -----------------------------*/
 
 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
 
 /** Flag values stored in the flags member of struct xbb_xen_reqlist. */
 typedef enum {
 	XBB_REQLIST_NONE	= 0x00,
 	XBB_REQLIST_MAPPED	= 0x01
 } xbb_reqlist_flags;
 
 /**
  * \brief A list of contiguous requests with the same operation type,
  *        together with the mapping and statistics state kept for them.
  */
 struct xbb_xen_reqlist {
 	/**
 	 * Back reference to the parent block back instance for this
 	 * request.  Used during bio_done handling.
 	 */
 	struct xbb_softc        *xbb;
 
 	/**
 	 * BLKIF_OP code for this request.
 	 */
 	int			 operation;
 
 	/**
 	 * Set to BLKIF_RSP_* to indicate request status.
 	 *
 	 * This field allows an error status to be recorded even if the
 	 * delivery of this status must be deferred.  Deferred reporting
 	 * is necessary, for example, when an error is detected during
 	 * completion processing of one bio when other bios for this
 	 * request are still outstanding.
 	 */
 	int			 status;
 
 	/**
 	 * Number of 512 byte sectors not transferred.
 	 */
 	int			 residual_512b_sectors;
 
 	/**
 	 * Starting sector number of the first request in the list.
 	 */
 	off_t			 starting_sector_number;
 
 	/**
 	 * If we're going to coalesce, the next contiguous sector would be
 	 * this one.
 	 */
 	off_t			 next_contig_sector;
 
 	/**
 	 * Number of child requests in the list.
 	 */
 	int			 num_children;
 
 	/**
 	 * Number of I/O requests still pending on the backend.
 	 */
 	int			 pendcnt;
 
 	/**
 	 * Total number of segments for requests in the list.
 	 */
 	int			 nr_segments;
 
 	/**
 	 * Flags for this particular request list.
 	 */
 	xbb_reqlist_flags	 flags;
 
 	/**
 	 * Kernel virtual address space reserved for this request
 	 * list structure and used to map the remote domain's pages for
 	 * this I/O, into our domain's address space.
 	 */
 	uint8_t			*kva;
 
 	/**
 	 * Base, pseudo-physical address, corresponding to the start
 	 * of this request's kva region.
 	 */
 	uint64_t	 	 gnt_base;
 
 
 #ifdef XBB_USE_BOUNCE_BUFFERS
 	/**
 	 * Pre-allocated domain local memory used to proxy remote
 	 * domain memory during I/O operations.
 	 */
 	uint8_t			*bounce;
 #endif
 
 	/**
 	 * Array of grant handles (one per page) used to map this request.
 	 */
 	grant_handle_t		*gnt_handles;
 
 	/**
 	 * Device statistics request ordering type (ordered or simple).
 	 */
 	devstat_tag_type	 ds_tag_type;
 
 	/**
 	 * Device statistics request type (read, write, no_data).
 	 */
 	devstat_trans_flags	 ds_trans_type;
 
 	/**
 	 * The start time for this request.
 	 */
 	struct bintime		 ds_t0;
 
 	/**
 	 * Linked list of contiguous requests with the same operation type.
 	 */
 	struct xbb_xen_req_list	 contig_req_list;
 
 	/**
 	 * Linked list links used to aggregate idle requests in the
 	 * request list free pool (xbb->reqlist_free_stailq) and pending
 	 * requests waiting for execution (xbb->reqlist_pending_stailq).
 	 */
 	STAILQ_ENTRY(xbb_xen_reqlist) links;
 };
 
 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
 
 /**
  * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
  */
 struct xbb_xen_req {
 	/**
 	 * Linked list links used to aggregate requests into a reqlist
 	 * and to store them in the request free pool.
 	 */
 	STAILQ_ENTRY(xbb_xen_req) links;
 
 	/**
 	 * The remote domain's identifier for this I/O request.
 	 */
 	uint64_t		  id;
 
 	/**
 	 * The number of pages currently mapped for this request.
 	 */
 	int			  nr_pages;
 
 	/**
 	 * The number of 512 byte sectors comprising this request.
 	 */
 	int			  nr_512b_sectors;
 
 	/**
 	 * BLKIF_OP code for this request.
 	 */
 	int			  operation;
 
 	/**
 	 * Storage used for non-native ring requests.
 	 */
 	blkif_request_t		 ring_req_storage;
 
 	/**
 	 * Pointer to the Xen request in the ring.
 	 */
 	blkif_request_t		*ring_req;
 
 	/**
 	 * Consumer index for this request.
 	 */
 	RING_IDX		 req_ring_idx;
 
 	/**
 	 * The start time for this request.
 	 */
 	struct bintime		 ds_t0;
 
 	/**
 	 * Pointer back to our parent request list.
 	 */
 	struct xbb_xen_reqlist  *reqlist;
 };
 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
 
 /**
  * \brief Configuration data for the shared memory request ring
  *        used to communicate with the front-end client of this
  *        driver.
  */
 struct xbb_ring_config {
 	/** KVA address where ring memory is mapped. */
 	vm_offset_t	va;
 
 	/** The pseudo-physical address where ring memory is mapped.*/
 	uint64_t	gnt_addr;
 
 	/**
 	 * Grant table handles, one per-ring page, returned by the
 	 * hypervisor upon mapping of the ring and required to
 	 * unmap it when a connection is torn down.
 	 */
 	grant_handle_t	handle[XBB_MAX_RING_PAGES];
 
 	/**
 	 * The device bus address returned by the hypervisor when
 	 * mapping the ring and required to unmap it when a connection
 	 * is torn down.
 	 */
 	uint64_t	bus_addr[XBB_MAX_RING_PAGES];
 
 	/** The number of ring pages mapped for the current connection. */
 	u_int		ring_pages;
 
 	/**
 	 * The grant references, one per-ring page, supplied by the
 	 * front-end, allowing us to reference the ring pages in the
 	 * front-end's domain and to map these pages into our own domain.
 	 */
 	grant_ref_t	ring_ref[XBB_MAX_RING_PAGES];
 
 	/** The interrupt driven event channel used to signal ring events. */
 	evtchn_port_t   evtchn;
 };
 
 /**
  * Per-instance connection state flags.
  */
 /* Each enumerator below is a distinct single bit. */
 typedef enum
 {
 	/**
 	 * The front-end requested a read-only mount of the
 	 * back-end device/file.
 	 */
 	XBBF_READ_ONLY         = 0x01,
 
 	/** Communication with the front-end has been established. */
 	XBBF_RING_CONNECTED    = 0x02,
 
 	/**
 	 * Front-end requests exist in the ring and are waiting for
 	 * xbb_xen_req objects to free up.
 	 */
 	XBBF_RESOURCE_SHORTAGE = 0x04,
 
 	/** Connection teardown in progress. */
 	XBBF_SHUTDOWN          = 0x08,
 
 	/** A thread is already performing shutdown processing. */
 	XBBF_IN_SHUTDOWN       = 0x10
 } xbb_flag_t;
 
 /** Backend device type.  */
 typedef enum {
 	/** Backend type unknown. */
 	XBB_TYPE_NONE		= 0x00,
 
 	/**
 	 * Backend type disk (access via cdev switch
 	 * strategy routine).
 	 */
 	XBB_TYPE_DISK		= 0x01,
 
 	/** Backend type file (access via vnode operations). */
 	XBB_TYPE_FILE		= 0x02
 } xbb_type;
 
 /**
  * \brief Structure used to memoize information about a per-request
  *        scatter-gather list.
  *
  * The chief benefit of using this data structure is it avoids having
  * to reparse the possibly discontiguous S/G list in the original
  * request.  Due to the way that the mapping of the memory backing an
  * I/O transaction is handled by Xen, a second pass is unavoidable.
  * At least this way the second walk is a simple array traversal.
  *
  * \note A single Scatter/Gather element in the block interface covers
  *       at most 1 machine page.  In this context a sector (blkif
  *       nomenclature, not what I'd choose) is a 512b aligned unit
  *       of mapping within the machine page referenced by an S/G
  *       element.
  */
 struct xbb_sg {
 	/** The number of 512b data chunks mapped in this S/G element. */
 	int16_t nsect;
 
 	/**
 	 * The index (0 based) of the first 512b data chunk mapped
 	 * in this S/G element.
 	 */
 	uint8_t first_sect;
 
 	/**
 	 * The index (0 based) of the last 512b data chunk mapped
 	 * in this S/G element.
 	 * NOTE(review): presumably inclusive, i.e.
 	 * nsect == last_sect - first_sect + 1 -- confirm against the code
 	 * that fills this structure in.
 	 */
 	uint8_t last_sect;
 };
 
 /**
  * Character device backend specific configuration data.
  */
 struct xbb_dev_data {
 	/** Cdev used for device backend access.  */
 	struct cdev   *cdev;
 
 	/** Cdev switch used for device backend access.  */
 	struct cdevsw *csw;
 
 	/** Used to hold a reference on opened cdev backend devices. */
 	int	       dev_ref;
 };
 
 /**
  * File backend specific configuration data.
  */
 /* Stored as the "file" member of union xbb_backend_data. */
 struct xbb_file_data {
 	/** Credentials to use for vnode backed (file based) I/O. */
 	struct ucred   *cred;
 
 	/**
 	 * \brief Array of io vectors used to process file based I/O.
 	 *
 	 * Only a single file based request is outstanding per-xbb instance,
 	 * so we only need one of these.
 	 */
 	struct iovec	xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
 #ifdef XBB_USE_BOUNCE_BUFFERS
 
 	/**
 	 * \brief Array of io vectors used to handle bouncing of file reads.
 	 *
 	 * Vnode operations are free to modify uio data during their
 	 * execution.  In the case of a read with bounce buffering active,
 	 * we need some of the data from the original uio in order to
 	 * bounce-out the read data.  This array serves as the temporary
 	 * storage for this saved data.
 	 */
 	struct iovec	saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
 
 	/**
 	 * \brief Array of memoized bounce buffer kva offsets used
 	 *        in the file based backend.
 	 *
 	 * Due to the way that the mapping of the memory backing an
 	 * I/O transaction is handled by Xen, a second pass through
 	 * the request sg elements is unavoidable. We memoize the computed
 	 * bounce address here to reduce the cost of the second walk.
 	 */
 	void		*xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
 #endif /* XBB_USE_BOUNCE_BUFFERS */
 };
 
 /**
  * Collection of backend type specific data.
  */
 union xbb_backend_data {
 	/** Character device backend configuration. */
 	struct xbb_dev_data  dev;
 	/** File backend configuration. */
 	struct xbb_file_data file;
 };
 
 /**
  * Function signature of backend specific I/O handlers.
  */
 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
 			      struct xbb_xen_reqlist *reqlist, int operation,
 			      int flags);
 
 /**
  * Per-instance configuration data.
  */
 struct xbb_softc {
 
 	/**
 	 * Task-queue used to process I/O requests.
 	 */
 	struct taskqueue	 *io_taskqueue;
 
 	/**
 	 * Single "run the request queue" task enqueued
 	 * on io_taskqueue.
 	 */
 	struct task		  io_task;
 
 	/** Device type for this instance. */
 	xbb_type		  device_type;
 
 	/** NewBus device corresponding to this instance. */
 	device_t		  dev;
 
 	/** Backend specific dispatch routine for this instance. */
 	xbb_dispatch_t		  dispatch_io;
 
 	/** The number of requests outstanding on the backend device/file. */
 	int			  active_request_count;
 
 	/** Free pool of request tracking structures. */
 	struct xbb_xen_req_list   request_free_stailq;
 
 	/** Array, sized at connection time, of request tracking structures. */
 	struct xbb_xen_req	 *requests;
 
 	/** Free pool of request list structures. */
 	struct xbb_xen_reqlist_list reqlist_free_stailq;
 
 	/** List of pending request lists awaiting execution. */
 	struct xbb_xen_reqlist_list reqlist_pending_stailq;
 
 	/** Array, sized at connection time, of request list structures. */
 	struct xbb_xen_reqlist	 *request_lists;
 
 	/**
 	 * Global pool of kva used for mapping remote domain ring
 	 * and I/O transaction data.
 	 */
 	vm_offset_t		  kva;
 
 	/** Pseudo-physical address corresponding to kva. */
 	uint64_t		  gnt_base_addr;
 
 	/** The size of the global kva pool. */
 	int			  kva_size;
 
 	/** The size of the KVA area used for request lists. */
 	int			  reqlist_kva_size;
 
 	/** The number of pages of KVA used for request lists */
 	int			  reqlist_kva_pages;
 
 	/** Bitmap of free KVA pages */
 	bitstr_t		 *kva_free;
 
 	/**
 	 * \brief Cached value of the front-end's domain id.
 	 *
 	 * This value is used at once for each mapped page in
 	 * a transaction.  We cache it to avoid incurring the
 	 * cost of an ivar access every time this is needed.
 	 */
 	domid_t			  otherend_id;
 
 	/**
 	 * \brief The blkif protocol abi in effect.
 	 *
 	 * There are situations where the back and front ends can
 	 * have a different, native abi (e.g. intel x86_64 and
 	 * 32bit x86 domains on the same machine).  The back-end
 	 * always accommodates the front-end's native abi.  That
 	 * value is pulled from the XenStore and recorded here.
 	 */
 	int			  abi;
 
 	/**
 	 * \brief The maximum number of requests and request lists allowed
 	 *        to be in flight at a time.
 	 *
 	 * This value is negotiated via the XenStore.
 	 */
 	u_int			  max_requests;
 
 	/**
 	 * \brief The maximum number of segments (1 page per segment)
 	 *	  that can be mapped by a request.
 	 *
 	 * This value is negotiated via the XenStore.
 	 */
 	u_int			  max_request_segments;
 
 	/**
 	 * \brief Maximum number of segments per request list.
 	 *
 	 * This value is derived from and will generally be larger than
 	 * max_request_segments.
 	 */
 	u_int			  max_reqlist_segments;
 
 	/**
 	 * The maximum size of any request to this back-end
 	 * device.
 	 *
 	 * This value is negotiated via the XenStore.
 	 */
 	u_int			  max_request_size;
 
 	/**
 	 * The maximum size of any request list.  This is derived directly
 	 * from max_reqlist_segments.
 	 */
 	u_int			  max_reqlist_size;
 
 	/** Various configuration and state bit flags. */
 	xbb_flag_t		  flags;
 
 	/** Ring mapping and interrupt configuration data. */
 	struct xbb_ring_config	  ring_config;
 
 	/** Runtime, cross-abi safe, structures for ring access. */
 	blkif_back_rings_t	  rings;
 
 	/** IRQ mapping for the communication ring event channel. */
 	xen_intr_handle_t	  xen_intr_handle;
 
 	/**
 	 * \brief Backend access mode flags (e.g. write, or read-only).
 	 *
 	 * This value is passed to us by the front-end via the XenStore.
 	 */
 	char			 *dev_mode;
 
 	/**
 	 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
 	 *
 	 * This value is passed to us by the front-end via the XenStore.
 	 * Currently unused.
 	 */
 	char			 *dev_type;
 
 	/**
 	 * \brief Backend device/file identifier.
 	 *
 	 * This value is passed to us by the front-end via the XenStore.
 	 * We expect this to be a POSIX path indicating the file or
 	 * device to open.
 	 */
 	char			 *dev_name;
 
 	/**
 	 * Vnode corresponding to the backend device node or file
 	 * we are accessing.
 	 */
 	struct vnode		 *vn;
 
 	/** Backend type specific data (device or file variant). */
 	union xbb_backend_data	  backend;
 
 	/** The native sector size of the backend. */
 	u_int			  sector_size;
 
 	/** log2 of sector_size.  */
 	u_int			  sector_size_shift;
 
 	/** Size in bytes of the backend device or file.  */
 	off_t			  media_size;
 
 	/**
 	 * \brief media_size expressed in terms of the backend native
 	 *	  sector size.
 	 *
 	 * (e.g. xbb->media_size >> xbb->sector_size_shift).
 	 */
 	uint64_t		  media_num_sectors;
 
 	/**
 	 * \brief Array of memoized scatter gather data computed during the
 	 *	  conversion of blkif ring requests to internal xbb_xen_req
 	 *	  structures.
 	 *
 	 * Ring processing is serialized so we only need one of these.
 	 */
 	struct xbb_sg		  xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
 
 	/**
 	 * Temporary grant table map used in xbb_dispatch_io().  When
 	 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
 	 * stack could cause a stack overflow.
 	 */
 	struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];
 
 	/** Mutex protecting per-instance data. */
 	struct mtx		  lock;
 
 #ifdef XENHVM
 	/**
 	 * Resource representing allocated physical address space
 	 * associated with our per-instance kva region.
 	 */
 	struct resource		 *pseudo_phys_res;
 
 	/** Resource id for allocated physical address space. */
 	int			  pseudo_phys_res_id;
 #endif
 
 	/**
 	 * I/O statistics from BlockBack dispatch down.  These are
 	 * coalesced requests, and we start them right before execution.
 	 */
 	struct devstat		 *xbb_stats;
 
 	/**
 	 * I/O statistics coming into BlockBack.  These are the requests as
 	 * we get them from BlockFront.  They are started as soon as we
 	 * receive a request, and completed when the I/O is complete.
 	 */
 	struct devstat		 *xbb_stats_in;
 
 	/** Disable sending flush to the backend */
 	int			  disable_flush;
 
 	/** Send a real flush for every N flush requests */
 	int			  flush_interval;
 
 	/** Count of flush requests in the interval */
 	int			  flush_count;
 
 	/** Don't coalesce requests if this is set */
 	int			  no_coalesce_reqs;
 
 	/** Number of requests we have received */
 	uint64_t		  reqs_received;
 
 	/** Number of requests we have completed */
 	uint64_t		  reqs_completed;
 
 	/** How many forced dispatches (i.e. without coalescing) have happened */
 	uint64_t		  forced_dispatch;
 
 	/** How many normal dispatches have happened */
 	uint64_t		  normal_dispatch;
 
 	/** How many total dispatches have happened */
 	uint64_t		  total_dispatch;
 
 	/** How many times we have run out of KVA */
 	uint64_t		  kva_shortages;
 
 	/** How many times we have run out of request structures */
 	uint64_t		  request_shortages;
 };
 
 /*---------------------------- Request Processing ----------------------------*/
 /**
  * Take one request tracking structure off the head of the free pool and
  * account for it in the active request count.
  *
  * The caller must hold xbb->lock.
  *
  * \param xbb  Per-instance xbb configuration structure.
  *
  * \return  The dequeued xbb_xen_req structure, or NULL when the free
  *          pool is empty.
  */
 static inline struct xbb_xen_req *
 xbb_get_req(struct xbb_softc *xbb)
 {
 	struct xbb_xen_req *free_req;
 
 	mtx_assert(&xbb->lock, MA_OWNED);
 
 	free_req = STAILQ_FIRST(&xbb->request_free_stailq);
 	if (free_req == NULL)
 		return (NULL);
 
 	STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
 	xbb->active_request_count++;
 
 	return (free_req);
 }
 
 /**
  * Put a single request tracking structure back at the head of the free
  * pool and drop the active request count by one.
  *
  * The caller must hold xbb->lock.
  *
  * \param xbb  Per-instance xbb configuration structure.
  * \param req  The request structure to free.
  */
 static inline void
 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
 {
 	mtx_assert(&xbb->lock, MA_OWNED);
 
 	xbb->active_request_count--;
 	STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
 
 	KASSERT(xbb->active_request_count >= 0,
 		("xbb_release_req: negative active count"));
 }
 
 /**
  * Append a whole list of request tracking structures to the free pool
  * and drop the active request count by the number of entries returned.
  *
  * The caller must hold xbb->lock.
  *
  * \param xbb	    Per-instance xbb configuration structure.
  * \param req_list  The list of requests to free.
  * \param nreqs	    The number of items in the list.
  */
 static inline void
 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
 		 int nreqs)
 {
 	mtx_assert(&xbb->lock, MA_OWNED);
 
 	xbb->active_request_count -= nreqs;
 	STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
 
 	KASSERT(xbb->active_request_count >= 0,
 		("xbb_release_reqs: negative active count"));
 }
 
 /**
  * Compute an address inside a request list's kva region from a page
  * index and a 512 byte sector index within that page.
  *
  * \param reqlist The request list whose kva region is addressed.
  * \param pagenr  Index of the page within the region.
  * \param sector  Index of the 512b sector within that page.
  *
  * \return  reqlist->kva advanced by pagenr pages and sector * 512 bytes.
  */
 static inline uint8_t *
 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 {
 	uint8_t *page_start;
 
 	page_start = reqlist->kva + (PAGE_SIZE * pagenr);
 
 	return (page_start + (sector << 9));
 }
 
 #ifdef XBB_USE_BOUNCE_BUFFERS
 /**
  * Compute an address inside a request list's bounce region from a page
  * index and a 512 byte sector index within that page.
  *
  * Only compiled when XBB_USE_BOUNCE_BUFFERS is defined.
  *
  * \param reqlist The request list whose bounce region is addressed.
  * \param pagenr  Index of the page within the region.
  * \param sector  Index of the 512b sector within that page.
  *
  * \return  reqlist->bounce advanced by pagenr pages and sector * 512
  *          bytes.
  */
 static inline uint8_t *
 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 {
 	uint8_t *page_start;
 
 	page_start = reqlist->bounce + (PAGE_SIZE * pagenr);
 
 	return (page_start + (sector << 9));
 }
 #endif
 
 /**
  * Compute the address the backend device/file should use for I/O, given
  * a page index and a 512 byte sector index within that page.
  *
  * \param reqlist The request list whose I/O region is addressed.
  * \param pagenr  Index of the page within the region.
  * \param sector  Index of the 512b sector within that page.
  *
  * \return  The bounce buffer address when XBB_USE_BOUNCE_BUFFERS is
  *          defined, otherwise the address within the request list's
  *          kva region.
  */
 static inline uint8_t *
 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 {
 	uint8_t *io_addr;
 
 #ifdef XBB_USE_BOUNCE_BUFFERS
 	io_addr = xbb_reqlist_bounce_addr(reqlist, pagenr, sector);
 #else
 	io_addr = xbb_reqlist_vaddr(reqlist, pagenr, sector);
 #endif
 
 	return (io_addr);
 }
 
 /**
  * Compute an address in the local pseudo-physical (grant) address space
  * from a page index and a 512 byte sector index within that page.
  *
  * The result is xbb->gnt_base_addr plus the offset of this request
  * list's kva region from xbb->kva, plus the page and sector offsets.
  *
  * \param reqlist The request list whose pseudo-physical region is
  *                addressed.
  * \param pagenr  Index of the page within the region.
  * \param sector  Index of the 512b sector within that page.
  *
  * \return  The computed pseudo-physical address.
  */
 static inline uintptr_t
 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 {
 	struct xbb_softc *sc;
 	uintptr_t	  region_off;
 
 	sc = reqlist->xbb;
 
 	/* Offset of this request list's region from the start of the KVA. */
 	region_off = (uintptr_t)(reqlist->kva - sc->kva);
 
 	return ((uintptr_t)(sc->gnt_base_addr + region_off +
 	    (PAGE_SIZE * pagenr) + (sector << 9)));
 }
 
 /**
  * Reserve a run of contiguous pages of Kernel Virtual Address space for
  * mapping requests.
  *
  * Takes and drops xbb->lock internally.  On failure the
  * XBBF_RESOURCE_SHORTAGE flag is set and the kva_shortages counter is
  * incremented.
  *
  * \param xbb       Per-instance xbb configuration structure.
  * \param nr_pages  Number of contiguous pages needed; must be non-zero.
  *
  * \return  On success, a pointer to the reserved KVA region.
  *          Otherwise NULL.
  *
  * Note:  This should be unnecessary once we have either chaining or
  * scatter/gather support for struct bio.  At that point we'll be able to
  * put multiple addresses and lengths in one bio/bio chain and won't need
  * to map everything into one virtual segment.
  */
 static uint8_t *
 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
 {
 	uint8_t *region;
 	intptr_t run_start;
 	intptr_t run_len;
 	int      page;
 
 	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
 
 	region = NULL;
 	run_start = 0;
 
 	mtx_lock(&xbb->lock);
 
 	/* Find the lowest clear bit; -1 means no page is free at all. */
 	bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &run_start);
 	if (run_start == -1)
 		goto out;
 
 	/*
 	 * Walk forward from the first free page, tracking the current run
 	 * of free pages until one is long enough for the request.
 	 */
 	run_len = 0;
 	for (page = run_start; page < xbb->reqlist_kva_pages; page++) {
 
 		if (bit_test(xbb->kva_free, page)) {
 			/* Page in use: the run so far was too short. */
 			run_start = -1;
 			run_len = 0;
 			continue;
 		}
 
 		/* First free page after a used one starts a new run. */
 		if (run_start == -1)
 			run_start = page;
 
 		run_len++;
 		if (run_len != nr_pages)
 			continue;
 
 		/* The run is long enough: mark it used and hand it out. */
 		bit_nset(xbb->kva_free, run_start,
 		    run_start + nr_pages - 1);
 
 		region = xbb->kva +
 			(uint8_t *)(run_start * PAGE_SIZE);
 
 		KASSERT(region >= (uint8_t *)xbb->kva &&
 			region + (nr_pages * PAGE_SIZE) <=
 			(uint8_t *)xbb->ring_config.va,
 			("Free KVA %p len %d out of range, "
 			 "kva = %#jx, ring VA = %#jx\n", region,
 			 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
 			 (uintmax_t)xbb->ring_config.va));
 		break;
 	}
 
 out:
 
 	/* Record the shortage when no suitable run was found. */
 	if (region == NULL) {
 		xbb->flags |= XBBF_RESOURCE_SHORTAGE;
 		xbb->kva_shortages++;
 	}
 
 	mtx_unlock(&xbb->lock);
 
 	return (region);
 }
 
 /**
  * Mark a previously reserved KVA region as free again.
  *
  * The caller must hold xbb->lock.
  *
  * \param xbb	    Per-instance xbb configuration structure.
  * \param kva_ptr   Pointer to the start of the reserved KVA region.
  * \param nr_pages  Number of pages in the KVA region.
  */
 static void
 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
 {
 	intptr_t first_page;
 	intptr_t last_page;
 
 	mtx_assert(&xbb->lock, MA_OWNED);
 
 	/* Convert the region's byte offset within the KVA to a page index. */
 	first_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
 	last_page = first_page + nr_pages - 1;
 
 	bit_nclear(xbb->kva_free, first_page, last_page);
 }
 
 /**
  * Unmap the front-end pages associated with this I/O request.
  *
  * \param req  The request structure to unmap.
  */
 static void
 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
 {
 	struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
 	u_int			      i;
 	u_int			      invcount;
 	int			      error;
 
 	invcount = 0;
 	for (i = 0; i < reqlist->nr_segments; i++) {
 
 		if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
 			continue;
 
 		unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
 		unmap[invcount].dev_bus_addr = 0;
 		unmap[invcount].handle       = reqlist->gnt_handles[i];
 		reqlist->gnt_handles[i]	     = GRANT_REF_INVALID;
 		invcount++;
 	}
 
 	error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
 					  unmap, invcount);
 	KASSERT(error == 0, ("Grant table operation failed"));
 }
 
 /**
  * Take one request list tracking structure off the head of the free
  * pool and reset it for a new transaction.
  *
  * The caller must hold xbb->lock.
  *
  * \param xbb  Per-instance xbb configuration structure.
  *
  * \return  A pointer to the re-initialized xbb_xen_reqlist, or NULL if
  *          the free list is empty.
  */
 static inline struct xbb_xen_reqlist *
 xbb_get_reqlist(struct xbb_softc *xbb)
 {
 	struct xbb_xen_reqlist *rl;
 
 	mtx_assert(&xbb->lock, MA_OWNED);
 
 	rl = STAILQ_FIRST(&xbb->reqlist_free_stailq);
 	if (rl == NULL)
 		return (NULL);
 
 	STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
 
 	/* Start from a clean slate: no flags, KVA, children or segments. */
 	rl->flags = XBB_REQLIST_NONE;
 	rl->kva = NULL;
 	rl->status = BLKIF_RSP_OKAY;
 	rl->residual_512b_sectors = 0;
 	rl->num_children = 0;
 	rl->nr_segments = 0;
 	STAILQ_INIT(&rl->contig_req_list);
 
 	return (rl);
 }
 
 /**
  * Return a request list, its KVA and its child requests to their free
  * pools.
  *
  * Takes and drops xbb->lock internally.  If a shutdown is in progress,
  * xbb_shutdown() is called after the resources are returned.
  *
  * \param xbb        Per-instance xbb configuration structure.
  * \param reqlist    The request list structure to free.
  * \param wakeup     If set, enqueue the I/O task after freeing this
  *                   reqlist when a resource shortage had been flagged;
  *                   the shortage flag is cleared in that case.
  */
 static inline void
 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 		    int wakeup)
 {
 
 	mtx_lock(&xbb->lock);
 
 	/*
 	 * Only wake the worker if it was actually starved of resources;
 	 * remember that fact and clear the flag while the lock is held.
 	 */
 	if (wakeup != 0) {
 		wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
 		xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
 	}
 
 	if (reqlist->kva != NULL) {
 		xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
 	}
 
 	xbb_release_reqs(xbb, &reqlist->contig_req_list,
 	    reqlist->num_children);
 
 	STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
 
 	/*
 	 * With a shutdown pending, returning this request list to the
 	 * free pool may be what allows the shutdown to progress further.
 	 */
 	if (xbb->flags & XBBF_SHUTDOWN)
 		xbb_shutdown(xbb);
 
 	mtx_unlock(&xbb->lock);
 
 	if (wakeup)
 		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
 }
 
 /**
  * Request resources and do basic request setup.
  *
  * Allocates a request structure and, when *reqlist is NULL, a request
  * list as well (which is then queued on the pending list and returned
  * through *reqlist).  The new request is appended to the request list
  * and the list's child and segment counts are updated.
  *
  * \param xbb          Per-instance xbb configuration structure.
  * \param reqlist      Pointer to reqlist pointer.
  * \param ring_req     Pointer to a block ring request.
  * \param ring_idx     The ring index of this request.
  *
  * \return  0 for success, non-zero if a shutdown is in progress or a
  *          resource shortage occurred (the shortage is recorded in
  *          xbb->flags and xbb->request_shortages).
  */
 static int
 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
 		  blkif_request_t *ring_req, RING_IDX ring_idx)
 {
 	struct xbb_xen_reqlist *nreqlist;
 	struct xbb_xen_req     *nreq;
 
 	nreqlist = NULL;
 	nreq     = NULL;
 
 	mtx_lock(&xbb->lock);
 
 	/*
 	 * We don't allow new resources to be allocated if we're in the
 	 * process of shutting down.
 	 */
 	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
 		mtx_unlock(&xbb->lock);
 		return (1);
 	}
 
 	/*
 	 * Allocate a reqlist if the caller doesn't have one already.
 	 */
 	if (*reqlist == NULL) {
 		nreqlist = xbb_get_reqlist(xbb);
 		if (nreqlist == NULL)
 			goto bailout_error;
 	}
 
 	/* We always allocate a request. */
 	nreq = xbb_get_req(xbb);
 	if (nreq == NULL)
 		goto bailout_error;
 
 	mtx_unlock(&xbb->lock);
 
 	if (*reqlist == NULL) {
 		*reqlist = nreqlist;
 		nreqlist->operation = ring_req->operation;
 		nreqlist->starting_sector_number = ring_req->sector_number;
 		STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
 				   links);
 	}
 
 	nreq->reqlist = *reqlist;
 	nreq->req_ring_idx = ring_idx;
 	nreq->id = ring_req->id;
 	nreq->operation = ring_req->operation;
 
 	/*
 	 * Request structures are recycled without being cleared, and
 	 * xbb_dispatch_io() can complete a request list (error, flush,
 	 * read-only device) before it reaches the code that fills these
 	 * fields in.  xbb_send_response() sizes the response from
 	 * nr_pages and xbb_complete_reqlist() reports nr_512b_sectors,
 	 * so make sure neither is left over from a previous transaction.
 	 */
 	nreq->nr_pages = ring_req->nr_segments;
 	nreq->nr_512b_sectors = 0;
 
 	/*
 	 * For non-native ABIs the caller's request is a converted copy,
 	 * so keep our own copy of it; native requests are referenced in
 	 * place.
 	 */
 	if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
 		bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
 		nreq->ring_req = &nreq->ring_req_storage;
 	} else {
 		nreq->ring_req = ring_req;
 	}
 
 	binuptime(&nreq->ds_t0);
 	devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
 	STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
 	(*reqlist)->num_children++;
 	(*reqlist)->nr_segments += ring_req->nr_segments;
 
 	return (0);
 
 bailout_error:
 
 	/*
 	 * We're out of resources, so set the shortage flag.  The next time
 	 * a request is released, we'll try waking up the work thread to
 	 * see if we can allocate more resources.
 	 */
 	xbb->flags |= XBBF_RESOURCE_SHORTAGE;
 	xbb->request_shortages++;
 
 	if (nreq != NULL)
 		xbb_release_req(xbb, nreq);
 
 	mtx_unlock(&xbb->lock);
 
 	/* xbb_release_reqlist() takes the lock itself, so call it unlocked. */
 	if (nreqlist != NULL)
 		xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
 
 	return (1);
 }
 
 /**
  * Create and transmit a response to a blkif request.
  *
  * Takes and drops xbb->lock internally.  After the lock is dropped the
  * I/O task is enqueued if more requests are waiting on the ring, and
  * the event channel is signalled if the ring macros asked for a
  * notification.
  * 
  * \param xbb     Per-instance xbb configuration structure.
  * \param req     The request structure to which to respond.
  * \param status  The status code to report.  See BLKIF_RSP_*
  *                in sys/xen/interface/io/blkif.h.
  */
 static void
 xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
 {
 	blkif_response_t *resp;
 	int		  more_to_do;
 	int		  notify;
 
 	more_to_do = 0;
 
 	/*
 	 * Place on the response ring for the relevant domain.
 	 * For now, only the spacing between entries is different
 	 * in the different ABIs, not the response entry layout.
 	 */
 	mtx_lock(&xbb->lock);
 	switch (xbb->abi) {
 	case BLKIF_PROTOCOL_NATIVE:
 		resp = RING_GET_RESPONSE(&xbb->rings.native,
 					 xbb->rings.native.rsp_prod_pvt);
 		break;
 	case BLKIF_PROTOCOL_X86_32:
 		resp = (blkif_response_t *)
 		    RING_GET_RESPONSE(&xbb->rings.x86_32,
 				      xbb->rings.x86_32.rsp_prod_pvt);
 		break;
 	case BLKIF_PROTOCOL_X86_64:
 		resp = (blkif_response_t *)
 		    RING_GET_RESPONSE(&xbb->rings.x86_64,
 				      xbb->rings.x86_64.rsp_prod_pvt);
 		break;
 	default:
 		panic("Unexpected blkif protocol ABI.");
 	}
 
 	resp->id        = req->id;
 	resp->operation = req->operation;
 	resp->status    = status;
 
 	/*
 	 * Advance the private producer index by the number of ring blocks
 	 * that req->nr_pages segments correspond to, then publish it.
 	 * NOTE(review): notify is presumably assigned by the push macro;
 	 * it is not otherwise initialized in this function.
 	 */
 	xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
 
 	if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
 
 		/*
 		 * Tail check for pending requests. Allows frontend to avoid
 		 * notifications if requests are already in flight (lower
 		 * overheads and promotes batching).
 		 */
 		RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
 	} else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
 
 		more_to_do = 1;
 	}
 
 	xbb->reqs_completed++;
 
 	mtx_unlock(&xbb->lock);
 
 	/* Reschedule the worker and notify the front-end outside the lock. */
 	if (more_to_do)
 		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
 
 	if (notify)
 		xen_intr_signal(xbb->xen_intr_handle);
 }
 
 /**
  * Complete a request list: unmap it if it was mapped, send a response
  * for each child request, close out the devstat transactions and return
  * the request list to the free pool.
  *
  * \param xbb        Per-instance xbb configuration structure.
  * \param reqlist    Allocated internal request list structure.
  */
 static void
 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
 {
 	struct xbb_xen_req *child;
 	off_t		    total_sectors;
 	off_t		    child_sectors;
 
 	if ((reqlist->flags & XBB_REQLIST_MAPPED) != 0)
 		xbb_unmap_reqlist(reqlist);
 
 	/*
 	 * The request list is complete, so this is the only context
 	 * touching it and no lock is taken here.  The functions called
 	 * below do their own locking where they need it.
 	 */
 	total_sectors = 0;
 	STAILQ_FOREACH(child, &reqlist->contig_req_list, links) {
 
 		xbb_send_response(xbb, child, reqlist->status);
 
 		/* A failed request list reports no bytes transferred. */
 		child_sectors = 0;
 		if (reqlist->status == BLKIF_RSP_OKAY)
 			child_sectors = child->nr_512b_sectors;
 
 		total_sectors += child_sectors;
 
 		devstat_end_transaction(xbb->xbb_stats_in,
 		    /*bytes*/child_sectors << 9,
 		    reqlist->ds_tag_type,
 		    reqlist->ds_trans_type,
 		    /*now*/NULL,
 		    /*then*/&child->ds_t0);
 	}
 
 	/*
 	 * Subtract whatever was not transferred.  An error combined with
 	 * a residual can push the total below zero; clamp it to zero.
 	 */
 	total_sectors -= reqlist->residual_512b_sectors;
 	if (total_sectors < 0)
 		total_sectors = 0;
 
 	devstat_end_transaction(xbb->xbb_stats,
 	    /*bytes*/ total_sectors << 9,
 	    reqlist->ds_tag_type,
 	    reqlist->ds_trans_type,
 	    /*now*/NULL,
 	    /*then*/&reqlist->ds_t0);
 
 	xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
 }
 
 /**
  * bio completion callback for I/O issued by the device backend.
  *
  * Accumulates the bio's residual on its request list, records any error,
  * and completes the request list once its pending count drops to zero.
  * The bio itself is destroyed before returning.
  *
  * \param bio  The completed buffer I/O request; bio_caller1 holds the
  *             owning request list.
  */
 static void
 xbb_bio_done(struct bio *bio)
 {
 	struct xbb_xen_reqlist *rl;
 	struct xbb_softc       *xbb;
 
 	rl  = bio->bio_caller1;
 	xbb = rl->xbb;
 
 	rl->residual_512b_sectors += bio->bio_resid >> 9;
 
 	/*
 	 * Error and residual reporting is deliberately coarse.  A request
 	 * list may aggregate several front-end requests and several bios,
 	 * and the interface is not rich enough to attribute a failure to
 	 * one of them, so an error on any bio fails the whole request list
 	 * and the data transferred is treated as 0.
 	 *
 	 * Residuals are only reported against the aggregated device, not
 	 * against individual front-end requests, since nothing here works
 	 * out which request a residual belongs to.
 	 */
 	if (bio->bio_error != 0) {
 		DPRINTF("BIO returned error %d for operation on device %s\n",
 			bio->bio_error, xbb->dev_name);
 		rl->status = BLKIF_RSP_ERROR;
 
 		if (bio->bio_error == ENXIO) {
 			/*
 			 * The backing device is gone.  If we are still
 			 * connected, tell the front-end that this device
 			 * proxy wants to go away.
 			 */
 			if (xenbus_get_state(xbb->dev) ==
 			    XenbusStateConnected)
 				xenbus_set_state(xbb->dev,
 				    XenbusStateClosing);
 		}
 	}
 
 #ifdef XBB_USE_BOUNCE_BUFFERS
 	/* Copy data that was read into the bounce buffer back to the KVA. */
 	if (bio->bio_cmd == BIO_READ) {
 		vm_offset_t bounce_off;
 
 		bounce_off = (vm_offset_t)bio->bio_data
 			   - (vm_offset_t)rl->bounce;
 		memcpy((uint8_t *)rl->kva + bounce_off,
 		       bio->bio_data, bio->bio_bcount);
 	}
 #endif /* XBB_USE_BOUNCE_BUFFERS */
 
 	/*
 	 * Drop this bio's reference on the request list.  Whoever drops
 	 * the last one sends status back for all of its requests.
 	 */
 	if (atomic_fetchadd_int(&rl->pendcnt, -1) == 1)
 		xbb_complete_reqlist(xbb, rl);
 
 	g_destroy_bio(bio);
 }
 
 /**
  * Parse a blkif request into an internal request structure and send
  * it to the backend for processing.
  *
  * \param xbb       Per-instance xbb configuration structure.
  * \param reqlist   Allocated internal request list structure.
  *
  * \return          On success, 0.  For resource shortages, non-zero
  *                  (ENOMEM when no KVA could be reserved).  0 is also
  *                  returned when the request list is completed here with
  *                  an error or without backend I/O; the outcome is then
  *                  reported through the response status.
  *  
  * This routine performs the backend common aspects of request parsing
  * including compiling an internal request structure, parsing the S/G
  * list and any secondary ring requests in which they may reside, and
  * the mapping of front-end I/O pages into our domain.
  */
 static int
 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
 {
 	struct xbb_sg                *xbb_sg;
 	struct gnttab_map_grant_ref  *map;
 	struct blkif_request_segment *sg;
 	struct blkif_request_segment *last_block_sg;
 	struct xbb_xen_req	     *nreq;
 	u_int			      nseg;
 	u_int			      seg_idx;
 	u_int			      block_segs;
 	int			      nr_sects;
 	int			      total_sects;
 	int			      operation;
 	uint8_t			      bio_flags;
 	int			      error;
 
 	reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
 	bio_flags            = 0;
 	total_sects	     = 0;
 	nr_sects	     = 0;
 
 	/*
 	 * First determine whether we have enough free KVA to satisfy this
 	 * request list.  If not, tell xbb_run_queue() so it can go to
 	 * sleep until we have more KVA.
 	 */
 	reqlist->kva = NULL;
 	if (reqlist->nr_segments != 0) {
 		reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
 		if (reqlist->kva == NULL) {
 			/*
 			 * If we're out of KVA, return ENOMEM.
 			 */
 			return (ENOMEM);
 		}
 	}
 
 	binuptime(&reqlist->ds_t0);
 	devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
 
 	/*
 	 * Translate the blkif operation into a BIO operation and devstat
 	 * classification.  Operations that fail validation, or that are
 	 * completed without touching the backend, jump to send_response.
 	 */
 	switch (reqlist->operation) {
 	case BLKIF_OP_WRITE_BARRIER:
 		bio_flags       |= BIO_ORDERED;
 		reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
 		/* FALLTHROUGH */
 	case BLKIF_OP_WRITE:
 		operation = BIO_WRITE;
 		reqlist->ds_trans_type = DEVSTAT_WRITE;
 		if ((xbb->flags & XBBF_READ_ONLY) != 0) {
 			DPRINTF("Attempt to write to read only device %s\n",
 				xbb->dev_name);
 			reqlist->status = BLKIF_RSP_ERROR;
 			goto send_response;
 		}
 		break;
 	case BLKIF_OP_READ:
 		operation = BIO_READ;
 		reqlist->ds_trans_type = DEVSTAT_READ;
 		break;
 	case BLKIF_OP_FLUSH_DISKCACHE:
 		/*
 		 * If this is true, the user has requested that we disable
 		 * flush support.  So we just complete the requests
 		 * successfully.
 		 */
 		if (xbb->disable_flush != 0) {
 			goto send_response;
 		}
 
 		/*
 		 * The user has requested that we only send a real flush
 		 * for every N flush requests.  So keep count, and either
 		 * complete the request immediately or queue it for the
 		 * backend.
 		 */
 		if (xbb->flush_interval != 0) {
 		 	if (++(xbb->flush_count) < xbb->flush_interval) {
 				goto send_response;
 			} else
 				xbb->flush_count = 0;
 		}
 
 		/* A flush carries no data: skip S/G parsing and mapping. */
 		operation = BIO_FLUSH;
 		reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
 		reqlist->ds_trans_type = DEVSTAT_NO_DATA;
 		goto do_dispatch;
 		/*NOTREACHED*/
 	default:
 		DPRINTF("error: unknown block io operation [%d]\n",
 			reqlist->operation);
 		reqlist->status = BLKIF_RSP_ERROR;
 		goto send_response;
 	}
 
 	reqlist->xbb  = xbb;
 	xbb_sg        = xbb->xbb_sgs;
 	map	      = xbb->maps;
 	seg_idx	      = 0;
 
 	/*
 	 * Walk every front-end request in the list, validating its
 	 * segments and building one grant map operation per segment.
 	 * seg_idx, xbb_sg and map run across the whole request list;
 	 * req_seg_idx counts segments within the current request only.
 	 */
 	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
 		blkif_request_t		*ring_req;
 		RING_IDX		 req_ring_idx;
 		u_int			 req_seg_idx;
 
 		ring_req	      = nreq->ring_req;
 		req_ring_idx	      = nreq->req_ring_idx;
 		nr_sects              = 0;
 		nseg                  = ring_req->nr_segments;
 		nreq->nr_pages        = nseg;
 		nreq->nr_512b_sectors = 0;
 		req_seg_idx	      = 0;
 		sg	              = NULL;
 
 		/* Check that number of segments is sane. */
 		if (__predict_false(nseg == 0)
 		 || __predict_false(nseg > xbb->max_request_segments)) {
 			DPRINTF("Bad number of segments in request (%d)\n",
 				nseg);
 			reqlist->status = BLKIF_RSP_ERROR;
 			goto send_response;
 		}
 
 		/* The first batch of segments lives in the request header. */
 		block_segs    = MIN(nreq->nr_pages,
 				    BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
 		sg            = ring_req->seg;
 		last_block_sg = sg + block_segs;
 		while (1) {
 
 			while (sg < last_block_sg) {
 				KASSERT(seg_idx <
 					XBB_MAX_SEGMENTS_PER_REQLIST,
 					("seg_idx %d is too large, max "
 					"segs %d\n", seg_idx,
 					XBB_MAX_SEGMENTS_PER_REQLIST));
 			
 				xbb_sg->first_sect = sg->first_sect;
 				xbb_sg->last_sect  = sg->last_sect;
 				xbb_sg->nsect =
 				    (int8_t)(sg->last_sect -
 				    sg->first_sect + 1);
 
 				/*
 				 * Reject segments that run past the end of
 				 * the page or that have a non-positive
 				 * sector count.
 				 */
 				if ((sg->last_sect >= (PAGE_SIZE >> 9))
 				 || (xbb_sg->nsect <= 0)) {
 					reqlist->status = BLKIF_RSP_ERROR;
 					goto send_response;
 				}
 
 				nr_sects += xbb_sg->nsect;
 				map->host_addr = xbb_get_gntaddr(reqlist,
 							seg_idx, /*sector*/0);
 				KASSERT(map->host_addr + PAGE_SIZE <=
 					xbb->ring_config.gnt_addr,
 					("Host address %#jx len %d overlaps "
 					 "ring address %#jx\n",
 					(uintmax_t)map->host_addr, PAGE_SIZE,
 					(uintmax_t)xbb->ring_config.gnt_addr));
 					
 				map->flags     = GNTMAP_host_map;
 				map->ref       = sg->gref;
 				map->dom       = xbb->otherend_id;
 				/*
 				 * A write to the backing store only reads
 				 * the granted page, so ask for a read-only
 				 * mapping.
 				 */
 				if (operation == BIO_WRITE)
 					map->flags |= GNTMAP_readonly;
 				sg++;
 				map++;
 				xbb_sg++;
 				seg_idx++;
 				req_seg_idx++;
 			}
 
 			/* Work out how many segments are still to be read. */
 			block_segs = MIN(nseg - req_seg_idx,
 					 BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
 			if (block_segs == 0)
 				break;
 
 			/*
 			 * Fetch the next request block full of SG elements.
 			 * For now, only the spacing between entries is
 			 * different in the different ABIs, not the sg entry
 			 * layout.
 			 */
 			req_ring_idx++;
 			switch (xbb->abi) {
 			case BLKIF_PROTOCOL_NATIVE:
 				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
 							   req_ring_idx);
 				break;
 			case BLKIF_PROTOCOL_X86_32:
 			{
 				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
 							   req_ring_idx);
 				break;
 			}
 			case BLKIF_PROTOCOL_X86_64:
 			{
 				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
 							   req_ring_idx);
 				break;
 			}
 			default:
 				panic("Unexpected blkif protocol ABI.");
 				/* NOTREACHED */
 			} 
 			last_block_sg = sg + block_segs;
 		}
 
 		/* Convert to the disk's sector size */
 		nreq->nr_512b_sectors = nr_sects;
 		nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
 		total_sects += nr_sects;
 
 		/*
 		 * The request must cover a whole number of backing store
 		 * sectors.
 		 */
 		if ((nreq->nr_512b_sectors &
 		    ((xbb->sector_size >> 9) - 1)) != 0) {
 			device_printf(xbb->dev, "%s: I/O size (%d) is not "
 				      "a multiple of the backing store sector "
 				      "size (%d)\n", __func__,
 				      nreq->nr_512b_sectors << 9,
 				      xbb->sector_size);
 			reqlist->status = BLKIF_RSP_ERROR;
 			goto send_response;
 		}
 	}
 
 	/* Map every segment of the request list in one hypercall. */
 	error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
 					  xbb->maps, reqlist->nr_segments);
 	if (error != 0)
 		panic("Grant table operation failed (%d)", error);
 
 	/*
 	 * From here on completion must unmap: xbb_complete_reqlist() keys
 	 * off this flag.
 	 */
 	reqlist->flags |= XBB_REQLIST_MAPPED;
 
 	/*
 	 * Check the per-segment map status and record the grant handle of
 	 * each successfully mapped segment.
 	 */
 	for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
 	     seg_idx++, map++){
 
 		if (__predict_false(map->status != 0)) {
 			DPRINTF("invalid buffer -- could not remap "
 			        "it (%d)\n", map->status);
 			DPRINTF("Mapping(%d): Host Addr 0x%lx, flags "
 			        "0x%x ref 0x%x, dom %d\n", seg_idx,
 				map->host_addr, map->flags, map->ref,
 				map->dom);
 			reqlist->status = BLKIF_RSP_ERROR;
 			goto send_response;
 		}
 
 		reqlist->gnt_handles[seg_idx] = map->handle;
 	}
 	/* Refuse I/O that would run past the end of the media. */
 	if (reqlist->starting_sector_number + total_sects >
 	    xbb->media_num_sectors) {
 
 		DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
 			"extends past end of device %s\n",
 			operation == BIO_READ ? "read" : "write",
 			reqlist->starting_sector_number,
 			reqlist->starting_sector_number + total_sects,
 			xbb->dev_name); 
 		reqlist->status = BLKIF_RSP_ERROR;
 		goto send_response;
 	}
 
 do_dispatch:
 
 	/* Hand the request list to the backend-specific dispatch routine. */
 	error = xbb->dispatch_io(xbb,
 				 reqlist,
 				 operation,
 				 bio_flags);
 
 	if (error != 0) {
 		reqlist->status = BLKIF_RSP_ERROR;
 		goto send_response;
 	}
 
 	return (0);
 
 send_response:
 
 	/*
 	 * Complete the request list here with whatever status has been
 	 * recorded.  0 is returned so that the caller does not re-queue
 	 * the request list.
 	 */
 	xbb_complete_reqlist(xbb, reqlist);
 
 	return (0);
 }
 
 /**
  * Add up the number of 512b sectors described by the segments of a ring
  * request.
  *
  * Counting stops at the first segment whose sector count
  * (last_sect - first_sect + 1, taken as a signed 8 bit value) is not
  * positive.
  *
  * NOTE(review): the loop is bounded only by ring_req->nr_segments; no
  * check against the size of ring_req->seg[] is visible here -- confirm
  * the caller's guarantees.
  *
  * \param ring_req  The ring request whose segments are counted.
  *
  * \return  The sum of the sector counts of the segments visited.
  */
 static __inline int
 xbb_count_sects(blkif_request_t *ring_req)
 {
 	int total;
 	int seg;
 	int span;
 
 	total = 0;
 	for (seg = 0; seg < ring_req->nr_segments; seg++) {
 		span = (int8_t)(ring_req->seg[seg].last_sect -
 		    ring_req->seg[seg].first_sect + 1);
 		if (span <= 0)
 			return (total);
 
 		total += span;
 	}
 
 	return (total);
 }
 
 /**
  * Process incoming requests from the shared communication ring in response
  * to a signal on the ring's event channel.
  *
  * Each pass gathers as many ring requests as resources allow into
  * request lists, then dispatches the request list at the head of the
  * pending queue.  The loop ends when the pending queue is empty or when
  * a dispatch reports a resource shortage.
  *
  * \param context  Callback argument registered during task initialization -
  *                 the xbb_softc for this instance.
  * \param pending  The number of taskqueue_enqueue events that have
  *                 occurred since this handler was last run.
  */
 static void
 xbb_run_queue(void *context, int pending)
 {
 	struct xbb_softc       *xbb;
 	blkif_back_rings_t     *rings;
 	RING_IDX		rp;
 	uint64_t		cur_sector;
 	int			cur_operation;
 	struct xbb_xen_reqlist *reqlist;
 
 
 	xbb   = (struct xbb_softc *)context;
 	rings = &xbb->rings;
 
 	/*
 	 * Work gather and dispatch loop.  Note that we have a bias here
 	 * towards gathering I/O sent by blockfront.  We first gather up
 	 * everything in the ring, as long as we have resources.  Then we
 	 * dispatch one request, and then attempt to gather up any
 	 * additional requests that have come in while we were dispatching
 	 * the request.
 	 *
 	 * This allows us to get a clearer picture (via devstat) of how
 	 * many requests blockfront is queueing to us at any given time.
 	 */
 	for (;;) {
 		int retval;
 
 		/*
 		 * Initialize reqlist to the last element in the pending
 		 * queue, if there is one.  This allows us to add more
 		 * requests to that request list, if we have room.
 		 */
 		reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
 				      xbb_xen_reqlist, links);
 		if (reqlist != NULL) {
 			cur_sector = reqlist->next_contig_sector;
 			cur_operation = reqlist->operation;
 		} else {
 			cur_operation = 0;
 			cur_sector    = 0;
 		}
 
 		/*
 		 * Cache req_prod to avoid accessing a cache line shared
 		 * with the frontend.
 		 */
 		rp = rings->common.sring->req_prod;
 
 		/* Ensure we see queued requests up to 'rp'. */
 		rmb();
 
 		/**
 		 * Run so long as there is work to consume and the generation
 		 * of a response will not overflow the ring.
 		 *
 		 * @note There's a 1 to 1 relationship between requests and
 		 *       responses, so an overflow should never occur.  This
 		 *       test is to protect our domain from digesting bogus
 		 *       data.  Shouldn't we log this?
 		 */
 		while (rings->common.req_cons != rp
 		    && RING_REQUEST_CONS_OVERFLOW(&rings->common,
 						  rings->common.req_cons) == 0){
 			blkif_request_t	        ring_req_storage;
 			blkif_request_t	       *ring_req;
 			int			cur_size;
 
 			/*
 			 * Native requests are used in place; the other ABIs
 			 * are converted into ring_req_storage first.
 			 */
 			switch (xbb->abi) {
 			case BLKIF_PROTOCOL_NATIVE:
 				ring_req = RING_GET_REQUEST(&xbb->rings.native,
 				    rings->common.req_cons);
 				break;
 			case BLKIF_PROTOCOL_X86_32:
 			{
 				struct blkif_x86_32_request *ring_req32;
 
 				ring_req32 = RING_GET_REQUEST(
 				    &xbb->rings.x86_32, rings->common.req_cons);
 				blkif_get_x86_32_req(&ring_req_storage,
 						     ring_req32);
 				ring_req = &ring_req_storage;
 				break;
 			}
 			case BLKIF_PROTOCOL_X86_64:
 			{
 				struct blkif_x86_64_request *ring_req64;
 
 				ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64,
 				    rings->common.req_cons);
 				blkif_get_x86_64_req(&ring_req_storage,
 						     ring_req64);
 				ring_req = &ring_req_storage;
 				break;
 			}
 			default:
 				panic("Unexpected blkif protocol ABI.");
 				/* NOTREACHED */
 			} 
 
 			/*
 			 * Check for situations that would require closing
 			 * off this I/O for further coalescing:
 			 *  - Coalescing is turned off.
 			 *  - Current I/O is out of sequence with the previous
 			 *    I/O.
 			 *  - Coalesced I/O would be too large.
 			 */
 			if ((reqlist != NULL)
 			 && ((xbb->no_coalesce_reqs != 0)
 			  || ((xbb->no_coalesce_reqs == 0)
 			   && ((ring_req->sector_number != cur_sector)
 			    || (ring_req->operation != cur_operation)
 			    || ((ring_req->nr_segments + reqlist->nr_segments) >
 			         xbb->max_reqlist_segments))))) {
 				reqlist = NULL;
 			}
 
 			/*
 			 * Grab and check for all resources in one shot.
 			 * If we can't get all of the resources we need,
 			 * the shortage is noted and the thread will get
 			 * woken up when more resources are available.
 			 */
 			retval = xbb_get_resources(xbb, &reqlist, ring_req,
 						   xbb->rings.common.req_cons);
 
 			if (retval != 0) {
 				/*
 				 * Resource shortage has been recorded.
 				 * We'll be scheduled to run once a request
 				 * object frees up due to a completion.
 				 */
 				break;
 			}
 
 			/*
 			 * Signify that we can overwrite this request with
 			 * a response by incrementing our consumer index.
 			 * The response won't be generated until after
 			 * we've already consumed all necessary data out
 			 * of the version of the request in the ring buffer
 			 * (for native mode).  We must update the consumer
 			 * index before issuing back-end I/O so there is
 			 * no possibility that it will complete and a
 			 * response be generated before we make room in
 			 * the queue for that response.
 			 */
 			xbb->rings.common.req_cons +=
 			    BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments);
 			xbb->reqs_received++;
 
 			/*
 			 * Remember where this request ends so the next one
 			 * can be tested for contiguity with it.
 			 */
 			cur_size = xbb_count_sects(ring_req);
 			cur_sector = ring_req->sector_number + cur_size;
 			reqlist->next_contig_sector = cur_sector;
 			cur_operation = ring_req->operation;
 		}
 
 		/* Check for I/O to dispatch */
 		reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
 		if (reqlist == NULL) {
 			/*
 			 * We're out of work to do, put the task queue to
 			 * sleep.
 			 */
 			break;
 		}
 
 		/*
 		 * Grab the first request off the queue and attempt
 		 * to dispatch it.
 		 */
 		STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
 
 		retval = xbb_dispatch_io(xbb, reqlist);
 		if (retval != 0) {
 			/*
 			 * xbb_dispatch_io() returns non-zero only when
 			 * there is a resource shortage.  If that's the
 			 * case, re-queue this request on the head of the
 			 * queue, and go to sleep until we have more
 			 * resources.
 			 */
 			STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
 					   reqlist, links);
 			break;
 		} else {
 			/*
 			 * If we still have anything on the queue after
 			 * removing the head entry, that is because we
 			 * met one of the criteria to create a new
 			 * request list (outlined above), and we'll call
 			 * that a forced dispatch for statistical purposes.
 			 *
 			 * Otherwise, if there is only one element on the
 			 * queue, we coalesced everything available on
 			 * the ring and we'll call that a normal dispatch.
 			 */
 			reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
 
 			if (reqlist != NULL)
 				xbb->forced_dispatch++;
 			else
 				xbb->normal_dispatch++;
 
 			xbb->total_dispatch++;
 		}
 	}
 }
 
 /**
  * Filter-level interrupt handler bound to the shared ring's event channel.
  *
  * No ring processing happens here: the instance's I/O task is queued on
  * its taskqueue and the interrupt is reported as handled.
  *
  * \param arg  Callback argument registered during event channel
  *             binding - the xbb_softc for this instance.
  *
  * \return  Always FILTER_HANDLED.
  */
 static int
 xbb_filter(void *arg)
 {
 	struct xbb_softc *const sc = arg;
 
 	/* Hand the actual work off to the taskqueue thread. */
 	taskqueue_enqueue(sc->io_taskqueue, &sc->io_task);
 
 	return (FILTER_HANDLED);
 }
 
 /*
  * SDT provider for this driver and the probes fired from
  * xbb_dispatch_dev().  The "flush" probe carries a single int (the
  * device unit number); the "read" and "write" probes carry the device
  * unit number followed by the offset and length of the bio being issued.
  */
 SDT_PROVIDER_DEFINE(xbb);
 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int");
 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t",
 		  "uint64_t");
 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int",
 		  "uint64_t", "uint64_t");
 
 /*----------------------------- Backend Handlers -----------------------------*/
 /**
  * Backend handler for character device access.
  *
  * For BIO_FLUSH a single ordered flush bio is issued.  For any other
  * operation the request list's scatter/gather entries are coalesced
  * into bios: a new bio is started whenever an entry does not begin at
  * sector 0 of its page, or the previous entry did not run through the
  * last sector of its page, since the KVA is not contiguous across such
  * a boundary.  Each bio is then handed to the device's d_strategy
  * routine, with xbb_bio_done() installed as the completion callback.
  *
  * \param xbb        Per-instance xbb configuration structure.
  * \param reqlist    Allocated internal request list structure.
  * \param operation  BIO_* I/O operation code.
  * \param bio_flags  Additional bio_flag data to pass to any generated
  *                   bios (e.g. BIO_ORDERED).
  *
  * \return  0 for success, errno codes for failure.  On failure no bio
  *          has been dispatched and every bio allocated by this call
  *          has been destroyed.
  */
 static int
 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 		 int operation, int bio_flags)
 {
 	struct xbb_dev_data *dev_data;
 	struct bio          *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
 	off_t                bio_offset;
 	struct bio          *bio;
 	struct xbb_sg       *xbb_sg;
 	u_int	             nbio;
 	u_int                bio_idx;
 	u_int		     nseg;
 	u_int                seg_idx;
 	int                  error;
 
 	dev_data   = &xbb->backend.dev;
 	bio_offset = (off_t)reqlist->starting_sector_number
 		   << xbb->sector_size_shift;
 	error      = 0;
 	nbio       = 0;
 	bio_idx    = 0;
 
 	if (operation == BIO_FLUSH) {
 		bio = g_new_bio();
 		if (__predict_false(bio == NULL)) {
 			DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
 			error = ENOMEM;
 			return (error);
 		}
 
 		bio->bio_cmd	 = BIO_FLUSH;
 		bio->bio_flags	|= BIO_ORDERED;
 		bio->bio_dev	 = dev_data->cdev;
 		bio->bio_offset	 = 0;
 		bio->bio_data	 = 0;
 		bio->bio_done	 = xbb_bio_done;
 		bio->bio_caller1 = reqlist;
 		bio->bio_pblkno	 = 0;
 
 		/* Exactly one bio is outstanding for a flush. */
 		reqlist->pendcnt = 1;
 
 		SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
 			   device_get_unit(xbb->dev));
 
 		(*dev_data->csw->d_strategy)(bio);
 
 		return (0);
 	}
 
 	xbb_sg = xbb->xbb_sgs;
 	bio    = NULL;
 	nseg = reqlist->nr_segments;
 
 	for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
 
 		/*
 		 * KVA will not be contiguous, so any additional
 		 * I/O will need to be represented in a new bio.
 		 */
 		if ((bio != NULL)
 		 && (xbb_sg->first_sect != 0)) {
 			if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
 				printf("%s: Discontiguous I/O request "
 				       "from domain %d ends on "
 				       "non-sector boundary\n",
 				       __func__, xbb->otherend_id);
 				error = EINVAL;
 				goto fail_free_bios;
 			}
 			bio = NULL;
 		}
 
 		if (bio == NULL) {
 			/*
 			 * Make sure that the start of this bio is
 			 * aligned to a device sector.
 			 */
 			if ((bio_offset & (xbb->sector_size - 1)) != 0){
 				printf("%s: Misaligned I/O request "
 				       "from domain %d\n", __func__,
 				       xbb->otherend_id);
 				error = EINVAL;
 				goto fail_free_bios;
 			}
 
 			/*
 			 * Only record the bio in bios[] once the
 			 * allocation has succeeded, so that nbio always
 			 * equals the number of valid entries and the
 			 * failure path can release exactly those.
 			 */
 			bio = g_new_bio();
 			if (__predict_false(bio == NULL)) {
 				error = ENOMEM;
 				goto fail_free_bios;
 			}
 			bios[nbio++] = bio;
 
 			bio->bio_cmd     = operation;
 			bio->bio_flags  |= bio_flags;
 			bio->bio_dev     = dev_data->cdev;
 			bio->bio_offset  = bio_offset;
 			bio->bio_data    = xbb_reqlist_ioaddr(reqlist, seg_idx,
 						xbb_sg->first_sect);
 			bio->bio_done    = xbb_bio_done;
 			bio->bio_caller1 = reqlist;
 			bio->bio_pblkno  = bio_offset >> xbb->sector_size_shift;
 		}
 
 		/* Segment sector counts are in units of 512 bytes. */
 		bio->bio_length += xbb_sg->nsect << 9;
 		bio->bio_bcount  = bio->bio_length;
 		bio_offset      += xbb_sg->nsect << 9;
 
 		if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
 
 			if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
 				printf("%s: Discontiguous I/O request "
 				       "from domain %d ends on "
 				       "non-sector boundary\n",
 				       __func__, xbb->otherend_id);
 				error = EINVAL;
 				goto fail_free_bios;
 			}
 			/*
 			 * KVA will not be contiguous, so any additional
 			 * I/O will need to be represented in a new bio.
 			 */
 			bio = NULL;
 		}
 	}
 
 	/* Must be set before the first bio is issued. */
 	reqlist->pendcnt = nbio;
 
 	for (bio_idx = 0; bio_idx < nbio; bio_idx++)
 	{
 #ifdef XBB_USE_BOUNCE_BUFFERS
 		vm_offset_t kva_offset;
 
 		kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
 			   - (vm_offset_t)reqlist->bounce;
 		if (operation == BIO_WRITE) {
 			memcpy(bios[bio_idx]->bio_data,
 			       (uint8_t *)reqlist->kva + kva_offset,
 			       bios[bio_idx]->bio_bcount);
 		}
 #endif
 		if (operation == BIO_READ) {
 			SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
 				   device_get_unit(xbb->dev),
 				   bios[bio_idx]->bio_offset,
 				   bios[bio_idx]->bio_length);
 		} else if (operation == BIO_WRITE) {
 			SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
 				   device_get_unit(xbb->dev),
 				   bios[bio_idx]->bio_offset,
 				   bios[bio_idx]->bio_length);
 		}
 		(*dev_data->csw->d_strategy)(bios[bio_idx]);
 	}
 
 	return (error);
 
 fail_free_bios:
 	/* bios[0 .. nbio-1] are all valid allocations; release each one. */
 	for (bio_idx = 0; bio_idx < nbio; bio_idx++)
 		g_destroy_bio(bios[bio_idx]);
 	
 	return (error);
 }
 
 /*
  * SDT probes fired from xbb_dispatch_file().  The "flush" probe carries
  * the device unit number; the "read" and "write" probes carry the device
  * unit number followed by the uio offset and residual byte count.
  */
 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int");
 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t",
 		  "uint64_t");
 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int",
 		  "uint64_t", "uint64_t");
 
 /**
  * Backend handler for file access.
  *
  * \param xbb        Per-instance xbb configuration structure.
  * \param reqlist    Allocated internal request list.
  * \param operation  BIO_* I/O operation code.
  * \param flags      Additional bio_flag data to pass to any generated bios
  *                   (e.g. BIO_ORDERED)..
  *
  * \return  0 for success, errno codes for failure.
  */
 static int
 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 		  int operation, int flags)
 {
 	struct xbb_file_data *file_data;
 	u_int                 seg_idx;
 	u_int		      nseg;
 	off_t		      sectors_sent;
 	struct uio            xuio;
 	struct xbb_sg        *xbb_sg;
 	struct iovec         *xiovec;
 #ifdef XBB_USE_BOUNCE_BUFFERS
 	void                **p_vaddr;
 	int                   saved_uio_iovcnt;
 #endif /* XBB_USE_BOUNCE_BUFFERS */
 	int                   error;
 
 	file_data = &xbb->backend.file;
 	sectors_sent = 0;
 	error = 0;
 	bzero(&xuio, sizeof(xuio));
 
 	switch (operation) {
 	case BIO_READ:
 		xuio.uio_rw = UIO_READ;
 		break;
 	case BIO_WRITE:
 		xuio.uio_rw = UIO_WRITE;
 		break;
 	case BIO_FLUSH: {
 		struct mount *mountpoint;
 
 		SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
 			   device_get_unit(xbb->dev));
 
 		(void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
 
 		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
 		VOP_UNLOCK(xbb->vn, 0);
 
 		vn_finished_write(mountpoint);
 
 		goto bailout_send_response;
 		/* NOTREACHED */
 	}
 	default:
 		panic("invalid operation %d", operation);
 		/* NOTREACHED */
 	}
 	xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
 			<< xbb->sector_size_shift;
 	xuio.uio_segflg = UIO_SYSSPACE;
 	xuio.uio_iov = file_data->xiovecs;
 	xuio.uio_iovcnt = 0;
 	xbb_sg = xbb->xbb_sgs;
 	nseg = reqlist->nr_segments;
 
 	for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
 
 		/*
 		 * If the first sector is not 0, the KVA will
 		 * not be contiguous and we'll need to go on
 		 * to another segment.
 		 */
 		if (xbb_sg->first_sect != 0)
 			xiovec = NULL;
 
 		if (xiovec == NULL) {
 			xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
 			xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
 			    seg_idx, xbb_sg->first_sect);
 #ifdef XBB_USE_BOUNCE_BUFFERS
 			/*
 			 * Store the address of the incoming
 			 * buffer at this particular offset
 			 * as well, so we can do the copy
 			 * later without having to do more
 			 * work to recalculate this address.
 		 	 */
 			p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
 			*p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
 			    xbb_sg->first_sect);
 #endif /* XBB_USE_BOUNCE_BUFFERS */
 			xiovec->iov_len = 0;
 			xuio.uio_iovcnt++;
 		}
 
 		xiovec->iov_len += xbb_sg->nsect << 9;
 
 		xuio.uio_resid += xbb_sg->nsect << 9;
 
 		/*
 		 * If the last sector is not the full page
 		 * size count, the next segment will not be
 		 * contiguous in KVA and we need a new iovec.
 		 */
 		if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
 			xiovec = NULL;
 	}
 
 	xuio.uio_td = curthread;
 
 #ifdef XBB_USE_BOUNCE_BUFFERS
 	saved_uio_iovcnt = xuio.uio_iovcnt;
 
 	if (operation == BIO_WRITE) {
 		/* Copy the write data to the local buffer. */
 		for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
 		     xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
 		     seg_idx++, xiovec++, p_vaddr++) {
 
 			memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
 		}
 	} else {
 		/*
 		 * We only need to save off the iovecs in the case of a
 		 * read, because the copy for the read happens after the
 		 * VOP_READ().  (The uio will get modified in that call
 		 * sequence.)
 		 */
 		memcpy(file_data->saved_xiovecs, xuio.uio_iov,
 		       xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
 	}
 #endif /* XBB_USE_BOUNCE_BUFFERS */
 
 	switch (operation) {
 	case BIO_READ:
 
 		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
 			   device_get_unit(xbb->dev), xuio.uio_offset,
 			   xuio.uio_resid);
 
 		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
 
 		/*
 		 * UFS pays attention to IO_DIRECT for reads.  If the
 		 * DIRECTIO option is configured into the kernel, it calls
 		 * ffs_rawread().  But that only works for single-segment
 		 * uios with user space addresses.  In our case, with a
 		 * kernel uio, it still reads into the buffer cache, but it
 		 * will just try to release the buffer from the cache later
 		 * on in ffs_read().
 		 *
 		 * ZFS does not pay attention to IO_DIRECT for reads.
 		 *
 		 * UFS does not pay attention to IO_SYNC for reads.
 		 *
 		 * ZFS pays attention to IO_SYNC (which translates into the
 		 * Solaris define FRSYNC for zfs_read()) for reads.  It
 		 * attempts to sync the file before reading.
 		 *
 		 * So, to attempt to provide some barrier semantics in the
 		 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.  
 		 */
 		error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 
 				 (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
 
 		VOP_UNLOCK(xbb->vn, 0);
 		break;
 	case BIO_WRITE: {
 		struct mount *mountpoint;
 
 		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
 			   device_get_unit(xbb->dev), xuio.uio_offset,
 			   xuio.uio_resid);
 
 		(void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
 
 		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
 
 		/*
 		 * UFS pays attention to IO_DIRECT for writes.  The write
 		 * is done asynchronously.  (Normally the write would just
 		 * get put into cache.
 		 *
 		 * UFS pays attention to IO_SYNC for writes.  It will
 		 * attempt to write the buffer out synchronously if that
 		 * flag is set.
 		 *
 		 * ZFS does not pay attention to IO_DIRECT for writes.
 		 *
 		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
 		 * for writes.  It will flush the transaction from the
 		 * cache before returning.
 		 *
 		 * So if we've got the BIO_ORDERED flag set, we want
 		 * IO_SYNC in either the UFS or ZFS case.
 		 */
 		error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
 				  IO_SYNC : 0, file_data->cred);
 		VOP_UNLOCK(xbb->vn, 0);
 
 		vn_finished_write(mountpoint);
 
 		break;
 	}
 	default:
 		panic("invalid operation %d", operation);
 		/* NOTREACHED */
 	}
 
 #ifdef XBB_USE_BOUNCE_BUFFERS
 	/* We only need to copy here for read operations */
 	if (operation == BIO_READ) {
 
 		for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
 		     xiovec = file_data->saved_xiovecs;
 		     seg_idx < saved_uio_iovcnt; seg_idx++,
 		     xiovec++, p_vaddr++) {
 
 			/*
 			 * Note that we have to use the copy of the 
 			 * io vector we made above.  uiomove() modifies
 			 * the uio and its referenced vector as uiomove
 			 * performs the copy, so we can't rely on any
 			 * state from the original uio.
 			 */
 			memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
 		}
 	}
 #endif /* XBB_USE_BOUNCE_BUFFERS */
 
 bailout_send_response:
 
 	if (error != 0)
 		reqlist->status = BLKIF_RSP_ERROR;
 
 	xbb_complete_reqlist(xbb, reqlist);
 
 	return (0);
 }
 
 /*--------------------------- Backend Configuration --------------------------*/
 /**
  * Tear down whatever backend device or file state this block back
  * instance currently holds.
  *
  * Nothing is done when no vnode is open.  Otherwise the device switch
  * reference (disk backends) is dropped before the vnode is closed, and
  * the held credential (file backends) is released afterwards.  Any
  * other backend type panics.
  *
  * \param xbb  Per-instance xbb configuration structure.
  */
 static void
 xbb_close_backend(struct xbb_softc *xbb)
 {
 	int open_flags;
 
 	DROP_GIANT();
 	DPRINTF("closing dev=%s\n", xbb->dev_name);
 	if (xbb->vn != NULL) {
 		/* Close with the same access mode the vnode was opened with. */
 		open_flags = FREAD;
 		if ((xbb->flags & XBBF_READ_ONLY) == 0)
 			open_flags |= FWRITE;
 
 		/* Type-specific work that must precede the vnode close. */
 		if (xbb->device_type == XBB_TYPE_DISK) {
 			if (xbb->backend.dev.csw) {
 				dev_relthread(xbb->backend.dev.cdev,
 					      xbb->backend.dev.dev_ref);
 				xbb->backend.dev.csw  = NULL;
 				xbb->backend.dev.cdev = NULL;
 			}
 		} else if (xbb->device_type != XBB_TYPE_FILE) {
 			panic("Unexpected backend type.");
 		}
 
 		(void)vn_close(xbb->vn, open_flags, NOCRED, curthread);
 		xbb->vn = NULL;
 
 		/* Type-specific work that follows the vnode close. */
 		if (xbb->device_type == XBB_TYPE_FILE) {
 			if (xbb->backend.file.cred != NULL) {
 				crfree(xbb->backend.file.cred);
 				xbb->backend.file.cred = NULL;
 			}
 		} else if (xbb->device_type != XBB_TYPE_DISK) {
 			panic("Unexpected backend type.");
 		}
 	}
 	PICKUP_GIANT();
 }
 
 /**
  * Open a character device to be used for backend I/O.
  *
  * \param xbb  Per-instance xbb configuration structure.
  *
  * \return  0 for success, errno codes for failure.
  */
 static int
 xbb_open_dev(struct xbb_softc *xbb)
 {
 	struct vattr   vattr;
 	struct cdev   *dev;
 	struct cdevsw *devsw;
 	int	       error;
 
 	xbb->device_type = XBB_TYPE_DISK;
 	xbb->dispatch_io = xbb_dispatch_dev;
 	xbb->backend.dev.cdev = xbb->vn->v_rdev;
 	xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
 					     &xbb->backend.dev.dev_ref);
 	if (xbb->backend.dev.csw == NULL)
 		panic("Unable to retrieve device switch");
 
 	error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
 	if (error) {
 		xenbus_dev_fatal(xbb->dev, error, "error getting "
 				 "vnode attributes for device %s",
 				 xbb->dev_name);
 		return (error);
 	}
 
 
 	dev = xbb->vn->v_rdev;
 	devsw = dev->si_devsw;
 	if (!devsw->d_ioctl) {
 		xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
 				 "device %s!", xbb->dev_name);
 		return (ENODEV);
 	}
 
 	error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
 			       (caddr_t)&xbb->sector_size, FREAD,
 			       curthread);
 	if (error) {
 		xenbus_dev_fatal(xbb->dev, error,
 				 "error calling ioctl DIOCGSECTORSIZE "
 				 "for device %s", xbb->dev_name);
 		return (error);
 	}
 
 	error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
 			       (caddr_t)&xbb->media_size, FREAD,
 			       curthread);
 	if (error) {
 		xenbus_dev_fatal(xbb->dev, error,
 				 "error calling ioctl DIOCGMEDIASIZE "
 				 "for device %s", xbb->dev_name);
 		return (error);
 	}
 
 	return (0);
 }
 
 /**
  * Open a file to be used for backend I/O.
  *
  * Selects xbb_dispatch_file() as the dispatch routine, records the
  * file's size as the media size, holds the current thread's credential
  * for later I/O, and exports a fixed 512 byte sector size.
  *
  * \param xbb  Per-instance xbb configuration structure.
  *
  * \return  0 for success, errno codes for failure.
  */
 static int
 xbb_open_file(struct xbb_softc *xbb)
 {
 	struct xbb_file_data *file_data;
 	struct vattr          vattr;
 	int                   error;
 
 	file_data = &xbb->backend.file;
 	xbb->device_type = XBB_TYPE_FILE;
 	xbb->dispatch_io = xbb_dispatch_file;
 	error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
 	if (error != 0) {
 		/*
 		 * The adjacent literals are concatenated by the compiler,
 		 * so the separating space must be part of one of them.
 		 */
 		xenbus_dev_fatal(xbb->dev, error,
 				 "error calling VOP_GETATTR() "
 				 "for file %s", xbb->dev_name);
 		return (error);
 	}
 
 	/*
 	 * Verify that we have the ability to upgrade to exclusive
 	 * access on this file so we can trap errors at open instead
 	 * of reporting them during first access.
 	 */
 	if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
 		vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
 		if (xbb->vn->v_iflag & VI_DOOMED) {
 			error = EBADF;
 			xenbus_dev_fatal(xbb->dev, error,
 					 "error locking file %s",
 					 xbb->dev_name);
 
 			return (error);
 		}
 	}
 
 	file_data->cred = crhold(curthread->td_ucred);
 	xbb->media_size = vattr.va_size;
 
 	/*
 	 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
 	 * With ZFS, it is 131072 bytes.  Block sizes that large don't work
 	 * with disklabel and UFS on FreeBSD at least.  Large block sizes
 	 * may not work with other OSes as well.  So just export a sector
 	 * size of 512 bytes, which should work with any OS or
 	 * application.  Since our backing is a file, any block size will
 	 * work fine for the backing store.
 	 */
 #if 0
 	xbb->sector_size = vattr.va_blocksize;
 #endif
 	xbb->sector_size = 512;
 
 	/*
 	 * Sanity check.  The media size has to be at least one
 	 * sector long.
 	 */
 	if (xbb->media_size < xbb->sector_size) {
 		error = EINVAL;
 		xenbus_dev_fatal(xbb->dev, error,
 				 "file %s size %ju < block size %u",
 				 xbb->dev_name,
 				 (uintmax_t)xbb->media_size,
 				 xbb->sector_size);
 	}
 	return (error);
 }
 
 /**
  * Open the backend provider for this connection.
  *
  * \param xbb  Per-instance xbb configuration structure.
  *
  * \return  0 for success, errno codes for failure.
  */
 static int
 xbb_open_backend(struct xbb_softc *xbb)
 {
 	struct nameidata nd;
 	int		 flags;
 	int		 error;
 
 	flags = FREAD;
 	error = 0;
 
 	DPRINTF("opening dev=%s\n", xbb->dev_name);
 
 	if (rootvnode == NULL) {
 		xenbus_dev_fatal(xbb->dev, ENOENT,
 				 "Root file system not mounted");
 		return (ENOENT);
 	}
 
 	if ((xbb->flags & XBBF_READ_ONLY) == 0)
 		flags |= FWRITE;
 
 	if (!curthread->td_proc->p_fd->fd_cdir) {
 		curthread->td_proc->p_fd->fd_cdir = rootvnode;
 		VREF(rootvnode);
 	}
 	if (!curthread->td_proc->p_fd->fd_rdir) {
 		curthread->td_proc->p_fd->fd_rdir = rootvnode;
 		VREF(rootvnode);
 	}
 	if (!curthread->td_proc->p_fd->fd_jdir) {
 		curthread->td_proc->p_fd->fd_jdir = rootvnode;
 		VREF(rootvnode);
 	}
 
  again:
 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
 	error = vn_open(&nd, &flags, 0, NULL);
 	if (error) {
 		/*
 		 * This is the only reasonable guess we can make as far as
 		 * path if the user doesn't give us a fully qualified path.
 		 * If they want to specify a file, they need to specify the
 		 * full path.
 		 */
 		if (xbb->dev_name[0] != '/') {
 			char *dev_path = "/dev/";
 			char *dev_name;
 
 			/* Try adding device path at beginning of name */
 			dev_name = malloc(strlen(xbb->dev_name)
 					+ strlen(dev_path) + 1,
 					  M_XENBLOCKBACK, M_NOWAIT);
 			if (dev_name) {
 				sprintf(dev_name, "%s%s", dev_path,
 					xbb->dev_name);
 				free(xbb->dev_name, M_XENBLOCKBACK);
 				xbb->dev_name = dev_name;
 				goto again;
 			}
 		}
 		xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
 				 xbb->dev_name);
 		return (error);
 	}
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 		
 	xbb->vn = nd.ni_vp;
 
 	/* We only support disks and files. */
 	if (vn_isdisk(xbb->vn, &error)) {
 		error = xbb_open_dev(xbb);
 	} else if (xbb->vn->v_type == VREG) {
 		error = xbb_open_file(xbb);
 	} else {
 		error = EINVAL;
 		xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
 				 "or file", xbb->dev_name);
 	}
 	VOP_UNLOCK(xbb->vn, 0);
 
 	if (error != 0) {
 		xbb_close_backend(xbb);
 		return (error);
 	}
 
 	xbb->sector_size_shift = fls(xbb->sector_size) - 1;
 	xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
 
 	DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
 		(xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
 		xbb->dev_name, xbb->sector_size, xbb->media_size);
 
 	return (0);
 }
 
 /*------------------------ Inter-Domain Communication ------------------------*/
 /**
  * Release the KVA or pseudo-physical address range, and the KVA
  * allocation bitmap, used for inter-domain communication.
  *
  * The kva and gnt_base_addr fields are cleared unconditionally.
  *
  * \param xbb  Per-instance xbb configuration structure.
  */
 static void
 xbb_free_communication_mem(struct xbb_softc *xbb)
 {
 	if (xbb->kva != 0) {
 #ifdef XENHVM
 		/* The range is backed by a bus memory resource. */
 		if (xbb->pseudo_phys_res != NULL) {
 			bus_release_resource(xbb->dev, SYS_RES_MEMORY,
 					     xbb->pseudo_phys_res_id,
 					     xbb->pseudo_phys_res);
 			xbb->pseudo_phys_res = NULL;
 		}
 #else
 		kva_free(xbb->kva, xbb->kva_size);
 #endif
 	}
 	xbb->gnt_base_addr = 0;
 	xbb->kva = 0;
 
 	if (xbb->kva_free != NULL) {
 		free(xbb->kva_free, M_XENBLOCKBACK);
 		xbb->kva_free = NULL;
 	}
 }
 
 /**
  * Cleanup all inter-domain communication mechanisms.
  *
  * \param xbb  Per-instance xbb configuration structure.
  */
 static int
 xbb_disconnect(struct xbb_softc *xbb)
 {
 	struct gnttab_unmap_grant_ref  ops[XBB_MAX_RING_PAGES];
 	struct gnttab_unmap_grant_ref *op;
 	u_int			       ring_idx;
 	int			       error;
 
 	DPRINTF("\n");
 
 	if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
 		return (0);
 
 	xen_intr_unbind(&xbb->xen_intr_handle);
 
 	mtx_unlock(&xbb->lock);
 	taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 
 	mtx_lock(&xbb->lock);
 
 	/*
 	 * No new interrupts can generate work, but we must wait
 	 * for all currently active requests to drain.
 	 */
 	if (xbb->active_request_count != 0)
 		return (EAGAIN);
 	
 	for (ring_idx = 0, op = ops;
 	     ring_idx < xbb->ring_config.ring_pages;
 	     ring_idx++, op++) {
 
 		op->host_addr    = xbb->ring_config.gnt_addr
 			         + (ring_idx * PAGE_SIZE);
 		op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
 		op->handle	 = xbb->ring_config.handle[ring_idx];
 	}
 
 	error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
 					  xbb->ring_config.ring_pages);
 	if (error != 0)
 		panic("Grant table op failed (%d)", error);
 
 	xbb_free_communication_mem(xbb);
 
 	if (xbb->requests != NULL) {
 		free(xbb->requests, M_XENBLOCKBACK);
 		xbb->requests = NULL;
 	}
 
 	if (xbb->request_lists != NULL) {
 		struct xbb_xen_reqlist *reqlist;
 		int i;
 
 		/* There is one request list for ever allocated request. */
 		for (i = 0, reqlist = xbb->request_lists;
 		     i < xbb->max_requests; i++, reqlist++){
 #ifdef XBB_USE_BOUNCE_BUFFERS
 			if (reqlist->bounce != NULL) {
 				free(reqlist->bounce, M_XENBLOCKBACK);
 				reqlist->bounce = NULL;
 			}
 #endif
 			if (reqlist->gnt_handles != NULL) {
 				free(reqlist->gnt_handles, M_XENBLOCKBACK);
 				reqlist->gnt_handles = NULL;
 			}
 		}
 		free(xbb->request_lists, M_XENBLOCKBACK);
 		xbb->request_lists = NULL;
 	}
 
 	xbb->flags &= ~XBBF_RING_CONNECTED;
 	return (0);
 }
 
 /**
  * Map shared memory ring into domain local address space, initialize
  * ring control structures, and bind an interrupt to the event channel
  * used to notify us of ring changes.
  *
  * Does nothing if the ring is already marked connected.
  *
  * \param xbb  Per-instance xbb configuration structure.
  *
  * \return  0 for success (or if already connected), EACCES if mapping
  *          a ring page failed, or the error returned when binding the
  *          event channel.
  */
 static int
 xbb_connect_ring(struct xbb_softc *xbb)
 {
 	struct gnttab_map_grant_ref  gnts[XBB_MAX_RING_PAGES];
 	struct gnttab_map_grant_ref *gnt;
 	u_int			     ring_idx;
 	int			     error;
 
 	if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
 		return (0);
 
 	/*
 	 * Kva for our ring is at the tail of the region of kva allocated
 	 * by xbb_alloc_communication_mem().
 	 */
 	xbb->ring_config.va = xbb->kva
 			    + (xbb->kva_size
 			     - (xbb->ring_config.ring_pages * PAGE_SIZE));
 	xbb->ring_config.gnt_addr = xbb->gnt_base_addr
 				  + (xbb->kva_size
 				   - (xbb->ring_config.ring_pages * PAGE_SIZE));
 
 	/* Build one map request per ring page, at consecutive addresses. */
 	for (ring_idx = 0, gnt = gnts;
 	     ring_idx < xbb->ring_config.ring_pages;
 	     ring_idx++, gnt++) {
 
 		gnt->host_addr = xbb->ring_config.gnt_addr
 			       + (ring_idx * PAGE_SIZE);
 		gnt->flags     = GNTMAP_host_map;
 		gnt->ref       = xbb->ring_config.ring_ref[ring_idx];
 		gnt->dom       = xbb->otherend_id;
 	}
 
 	error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
 					  xbb->ring_config.ring_pages);
 	if (error)
 		panic("blkback: Ring page grant table op failed (%d)", error);
 
 	/*
 	 * The hypercall succeeding does not mean every page was mapped;
 	 * each entry carries its own status.  Record the handle and bus
 	 * address of each page, which xbb_disconnect() uses to unmap it.
 	 *
 	 * NOTE(review): when a page fails, pages mapped earlier in this
 	 * batch are not unmapped here before returning - confirm whether
 	 * a caller cleans them up.
 	 */
 	for (ring_idx = 0, gnt = gnts;
 	     ring_idx < xbb->ring_config.ring_pages;
 	     ring_idx++, gnt++) {
 		if (gnt->status != 0) {
 			xbb->ring_config.va = 0;
 			xenbus_dev_fatal(xbb->dev, EACCES,
 					 "Ring shared page mapping failed. "
 					 "Status %d.", gnt->status);
 			return (EACCES);
 		}
 		xbb->ring_config.handle[ring_idx]   = gnt->handle;
 		xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
 	}
 
 	/* Initialize the ring based on ABI. */
 	switch (xbb->abi) {
 	case BLKIF_PROTOCOL_NATIVE:
 	{
 		blkif_sring_t *sring;
 		sring = (blkif_sring_t *)xbb->ring_config.va;
 		BACK_RING_INIT(&xbb->rings.native, sring,
 			       xbb->ring_config.ring_pages * PAGE_SIZE);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_32:
 	{
 		blkif_x86_32_sring_t *sring_x86_32;
 		sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
 		BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
 			       xbb->ring_config.ring_pages * PAGE_SIZE);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_64:
 	{
 		blkif_x86_64_sring_t *sring_x86_64;
 		sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
 		BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
 			       xbb->ring_config.ring_pages * PAGE_SIZE);
 		break;
 	}
 	default:
 		panic("Unexpected blkif protocol ABI.");
 	}
 
 	/*
 	 * Mark the ring connected before binding the interrupt;
 	 * xbb_disconnect() returns early without this flag, so it must be
 	 * set for the error path below to tear anything down.
 	 */
 	xbb->flags |= XBBF_RING_CONNECTED;
 
 	error = xen_intr_bind_remote_port(xbb->dev,
 					  xbb->otherend_id,
 					  xbb->ring_config.evtchn,
 					  xbb_filter,
 					  /*ithread_handler*/NULL,
 					  /*arg*/xbb,
 					  INTR_TYPE_BIO | INTR_MPSAFE,
 					  &xbb->xen_intr_handle);
 	if (error) {
 		(void)xbb_disconnect(xbb);
 		xenbus_dev_fatal(xbb->dev, error, "binding event channel");
 		return (error);
 	}
 
 	DPRINTF("rings connected!\n");
 
 	return 0;
 }
 
 /*
  * Needed to make bit_alloc() macro work.  Maps calloc() onto a zeroing,
  * non-sleeping kernel malloc() charged to M_XENBLOCKBACK.
  *
  * The expansion deliberately carries no trailing semicolon, so the
  * macro can be used as an ordinary expression.
  */
 #define	calloc(count, size)						\
 	malloc((count) * (size), M_XENBLOCKBACK, M_NOWAIT | M_ZERO)
 
 /**
  * Size and reserve the KVA (or, under XENHVM, pseudo-physical address
  * range) used to map the front-end's pages and the communication ring.
  *
  * The region holds one page per segment of every possible request,
  * followed by the ring pages.  A bitmap with one bit per request page
  * is allocated alongside it.
  *
  * \param xbb  Per-instance xbb configuration structure.
  *
  * \return  0 for success, ENOMEM if the bitmap or the address range
  *          could not be allocated.
  *
  * These address spaces are used to dynamically map pages in the
  * front-end's domain into our own.
  */
 static int
 xbb_alloc_communication_mem(struct xbb_softc *xbb)
 {
 	/* Request page area first, then the ring pages behind it. */
 	xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
 	xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
 	xbb->kva_size = xbb->reqlist_kva_size +
 			(xbb->ring_config.ring_pages * PAGE_SIZE);
 
 	xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages);
 	if (xbb->kva_free == NULL)
 		return (ENOMEM);
 
 	DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
 		device_get_nameunit(xbb->dev), xbb->kva_size,
 		xbb->reqlist_kva_size);
 #ifdef XENHVM
 	/*
 	 * Reserve a range of pseudo physical memory that we can map
 	 * into kva.  These pages will only be backed by machine
 	 * pages ("real memory") during the lifetime of front-end requests
 	 * via grant table operations.
 	 */
 	xbb->pseudo_phys_res_id = 0;
 	xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY,
 						  &xbb->pseudo_phys_res_id,
 						  0, ~0, xbb->kva_size,
 						  RF_ACTIVE);
 	if (xbb->pseudo_phys_res == NULL) {
 		xbb->kva = 0;
 		return (ENOMEM);
 	}
 	xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
 	xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
 #else /* !XENHVM */
 	xbb->kva = kva_alloc(xbb->kva_size);
 	if (xbb->kva == 0)
 		return (ENOMEM);
 	/* Here grant mappings are addressed by the KVA itself. */
 	xbb->gnt_base_addr = xbb->kva;
 #endif /* XENHVM */
 
 	DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
 		device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
 		(uintmax_t)xbb->gnt_base_addr);
 	return (0);
 }
 
 /**
  * Collect front-end information from the XenStore.
  *
  * \param xbb  Per-instance xbb configuration structure.
  */
 static int
 xbb_collect_frontend_info(struct xbb_softc *xbb)
 {
 	char	    protocol_abi[64];
 	const char *otherend_path;
 	int	    error;
 	u_int	    ring_idx;
 	u_int	    ring_page_order;
 	size_t	    ring_size;
 
 	otherend_path = xenbus_get_otherend_path(xbb->dev);
 
 	/*
 	 * Protocol defaults valid even if all negotiation fails.
 	 */
 	xbb->ring_config.ring_pages = 1;
 	xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
 	xbb->max_request_size	    = xbb->max_request_segments * PAGE_SIZE;
 
 	/*
 	 * Mandatory data (used in all versions of the protocol) first.
 	 */
 	error = xs_scanf(XST_NIL, otherend_path,
 			 "event-channel", NULL, "%" PRIu32,
 			 &xbb->ring_config.evtchn);
 	if (error != 0) {
 		xenbus_dev_fatal(xbb->dev, error,
 				 "Unable to retrieve event-channel information "
 				 "from frontend %s.  Unable to connect.",
 				 xenbus_get_otherend_path(xbb->dev));
 		return (error);
 	}
 
 	/*
 	 * These fields are initialized to legacy protocol defaults
 	 * so we only need to fail if reading the updated value succeeds
 	 * and the new value is outside of its allowed range.
 	 *
 	 * \note xs_gather() returns on the first encountered error, so
 	 *       we must use independent calls in order to guarantee
 	 *       we don't miss information in a sparsely populated front-end
 	 *       tree.
 	 *
 	 * \note xs_scanf() does not update variables for unmatched
 	 *       fields.
 	 */
 	ring_page_order = 0;
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "ring-page-order", NULL, "%u",
 		       &ring_page_order);
 	xbb->ring_config.ring_pages = 1 << ring_page_order;
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "num-ring-pages", NULL, "%u",
 		       &xbb->ring_config.ring_pages);
 	ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
 	xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
 
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-requests", NULL, "%u",
 		       &xbb->max_requests);
 
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-request-segments", NULL, "%u",
 		       &xbb->max_request_segments);
 
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-request-size", NULL, "%u",
 		       &xbb->max_request_size);
 
 	if (xbb->ring_config.ring_pages	> XBB_MAX_RING_PAGES) {
 		xenbus_dev_fatal(xbb->dev, EINVAL,
 				 "Front-end specified ring-pages of %u "
 				 "exceeds backend limit of %zu.  "
 				 "Unable to connect.",
 				 xbb->ring_config.ring_pages,
 				 XBB_MAX_RING_PAGES);
 		return (EINVAL);
 	} else if (xbb->max_requests > XBB_MAX_REQUESTS) {
 		xenbus_dev_fatal(xbb->dev, EINVAL,
 				 "Front-end specified max_requests of %u "
 				 "exceeds backend limit of %u.  "
 				 "Unable to connect.",
 				 xbb->max_requests,
 				 XBB_MAX_REQUESTS);
 		return (EINVAL);
 	} else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
 		xenbus_dev_fatal(xbb->dev, EINVAL,
 				 "Front-end specified max_requests_segments "
 				 "of %u exceeds backend limit of %u.  "
 				 "Unable to connect.",
 				 xbb->max_request_segments,
 				 XBB_MAX_SEGMENTS_PER_REQUEST);
 		return (EINVAL);
 	} else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
 		xenbus_dev_fatal(xbb->dev, EINVAL,
 				 "Front-end specified max_request_size "
 				 "of %u exceeds backend limit of %u.  "
 				 "Unable to connect.",
 				 xbb->max_request_size,
 				 XBB_MAX_REQUEST_SIZE);
 		return (EINVAL);
 	}
 
 	if (xbb->ring_config.ring_pages	== 1) {
 		error = xs_gather(XST_NIL, otherend_path,
 				  "ring-ref", "%" PRIu32,
 				  &xbb->ring_config.ring_ref[0],
 				  NULL);
 		if (error != 0) {
 			xenbus_dev_fatal(xbb->dev, error,
 					 "Unable to retrieve ring information "
 					 "from frontend %s.  Unable to "
 					 "connect.",
 					 xenbus_get_otherend_path(xbb->dev));
 			return (error);
 		}
 	} else {
 		/* Multi-page ring format. */
 		for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
 		     ring_idx++) {
 			char ring_ref_name[]= "ring_refXX";
 
 			snprintf(ring_ref_name, sizeof(ring_ref_name),
 				 "ring-ref%u", ring_idx);
 			error = xs_scanf(XST_NIL, otherend_path,
 					 ring_ref_name, NULL, "%" PRIu32,
 					 &xbb->ring_config.ring_ref[ring_idx]);
 			if (error != 0) {
 				xenbus_dev_fatal(xbb->dev, error,
 						 "Failed to retriev grant "
 						 "reference for page %u of "
 						 "shared ring.  Unable "
 						 "to connect.", ring_idx);
 				return (error);
 			}
 		}
 	}
 
 	error = xs_gather(XST_NIL, otherend_path,
 			  "protocol", "%63s", protocol_abi,
 			  NULL); 
 	if (error != 0
 	 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
 		/*
 		 * Assume native if the frontend has not
 		 * published ABI data or it has published and
 		 * matches our own ABI.
 		 */
 		xbb->abi = BLKIF_PROTOCOL_NATIVE;
 	} else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
 
 		xbb->abi = BLKIF_PROTOCOL_X86_32;
 	} else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
 
 		xbb->abi = BLKIF_PROTOCOL_X86_64;
 	} else {
 
 		xenbus_dev_fatal(xbb->dev, EINVAL,
 				 "Unknown protocol ABI (%s) published by "
 				 "frontend.  Unable to connect.", protocol_abi);
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /**
  * Allocate per-request data structures given request size and number
  * information negotiated with the front-end.
  *
  * \param xbb  Per-instance xbb configuration structure.
  */
 static int
 xbb_alloc_requests(struct xbb_softc *xbb)
 {
 	struct xbb_xen_req *req;
 	struct xbb_xen_req *last_req;
 
 	/*
 	 * Allocate request book keeping datastructures.
 	 */
 	xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
 			       M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
 	if (xbb->requests == NULL) {
 		xenbus_dev_fatal(xbb->dev, ENOMEM, 
 				  "Unable to allocate request structures");
 		return (ENOMEM);
 	}
 
 	req      = xbb->requests;
 	last_req = &xbb->requests[xbb->max_requests - 1];
 	STAILQ_INIT(&xbb->request_free_stailq);
 	while (req <= last_req) {
 		STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
 		req++;
 	}
 	return (0);
 }
 
 /**
  * Allocate the pool of request lists along with the per-list grant
  * handle arrays (and bounce buffers when XBB_USE_BOUNCE_BUFFERS is
  * defined), placing each list on xbb->reqlist_free_stailq.
  *
  * \param xbb  Per-instance xbb configuration structure.
  *
  * \return  0 for success, ENOMEM if any allocation fails (the failure
  *          is also reported via xenbus_dev_fatal()).
  */
 static int
 xbb_alloc_request_lists(struct xbb_softc *xbb)
 {
 	int list_idx;
 
 	/*
 	 * If no requests can be merged, we need 1 request list per
 	 * in flight request.
 	 */
 	xbb->request_lists = malloc(xbb->max_requests *
 		sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
 	if (xbb->request_lists == NULL) {
 		xenbus_dev_fatal(xbb->dev, ENOMEM, 
 				  "Unable to allocate request list structures");
 		return (ENOMEM);
 	}
 
 	STAILQ_INIT(&xbb->reqlist_free_stailq);
 	STAILQ_INIT(&xbb->reqlist_pending_stailq);
 
 	list_idx = 0;
 	while (list_idx < xbb->max_requests) {
 		struct xbb_xen_reqlist *cur;
 		int			seg_idx;
 
 		cur = &xbb->request_lists[list_idx];
 		cur->xbb = xbb;
 
 #ifdef XBB_USE_BOUNCE_BUFFERS
 		cur->bounce = malloc(xbb->max_reqlist_size,
 				     M_XENBLOCKBACK, M_NOWAIT);
 		if (cur->bounce == NULL) {
 			xenbus_dev_fatal(xbb->dev, ENOMEM, 
 					 "Unable to allocate request "
 					 "bounce buffers");
 			return (ENOMEM);
 		}
 #endif /* XBB_USE_BOUNCE_BUFFERS */
 
 		cur->gnt_handles = malloc(xbb->max_reqlist_segments *
 					  sizeof(*cur->gnt_handles),
 					  M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
 		if (cur->gnt_handles == NULL) {
 			xenbus_dev_fatal(xbb->dev, ENOMEM,
 					  "Unable to allocate request "
 					  "grant references");
 			return (ENOMEM);
 		}
 
 		/* Every handle slot starts out marked as invalid. */
 		for (seg_idx = 0; seg_idx < xbb->max_reqlist_segments;
 		     seg_idx++)
 			cur->gnt_handles[seg_idx] = GRANT_REF_INVALID;
 
 		STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, cur, links);
 		list_idx++;
 	}
 	return (0);
 }
 
 /**
  * Supply information about the physical device to the frontend
  * via XenBus.
  *
  * \param xbb  Per-instance xbb configuration structure.
  */
 static int
 xbb_publish_backend_info(struct xbb_softc *xbb)
 {
 	struct xs_transaction xst;
 	const char	     *our_path;
 	const char	     *leaf;
 	int		      error;
 
 	our_path = xenbus_get_node(xbb->dev);
 	while (1) {
 		error = xs_transaction_start(&xst);
 		if (error != 0) {
 			xenbus_dev_fatal(xbb->dev, error,
 					 "Error publishing backend info "
 					 "(start transaction)");
 			return (error);
 		}
 
 		leaf = "sectors";
 		error = xs_printf(xst, our_path, leaf,
 				  "%"PRIu64, xbb->media_num_sectors);
 		if (error != 0)
 			break;
 
 		/* XXX Support all VBD attributes here. */
 		leaf = "info";
 		error = xs_printf(xst, our_path, leaf, "%u",
 				  xbb->flags & XBBF_READ_ONLY
 				? VDISK_READONLY : 0);
 		if (error != 0)
 			break;
 
 		leaf = "sector-size";
 		error = xs_printf(xst, our_path, leaf, "%u",
 				  xbb->sector_size);
 		if (error != 0)
 			break;
 
 		error = xs_transaction_end(xst, 0);
 		if (error == 0) {
 			return (0);
 		} else if (error != EAGAIN) {
 			xenbus_dev_fatal(xbb->dev, error, "ending transaction");
 			return (error);
 		}
 	}
 
 	xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
 			our_path, leaf);
 	xs_transaction_end(xst, 1);
 	return (error);
 }
 
 /**
  * Connect to our blkfront peer now that it has completed publishing
  * its configuration into the XenStore.
  *
  * Returns without doing anything if our XenBus state is already
  * Connected.  Each setup step that fails ends the attempt; only a
  * failure to publish back-end information triggers xbb_disconnect().
  *
  * \param xbb  Per-instance xbb configuration structure.
  */
 static void
 xbb_connect(struct xbb_softc *xbb)
 {
 	int error;
 
 	if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
 		return;
 
 	if (xbb_collect_frontend_info(xbb) != 0)
 		return;
 
 	xbb->flags &= ~XBBF_SHUTDOWN;
 
 	/*
 	 * We limit the maximum number of reqlist segments to the maximum
 	 * number of segments in the ring, or our absolute maximum,
 	 * whichever is smaller.  The maximum reqlist size then follows
 	 * directly from the number of segments we can handle.
 	 */
 	xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
 		xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
 	xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
 
 	/* Allocate resources whose size depends on front-end configuration. */
 	error = xbb_alloc_communication_mem(xbb);
 	if (error != 0) {
 		xenbus_dev_fatal(xbb->dev, error,
 				 "Unable to allocate communication memory");
 		return;
 	}
 
 	/*
 	 * Allocate request tracking structures and then connect the
 	 * communication channel.  Each of these routines reports its own
 	 * specific errors, so all that is left to do here is bail out.
 	 */
 	if (xbb_alloc_requests(xbb) != 0 ||
 	    xbb_alloc_request_lists(xbb) != 0 ||
 	    xbb_connect_ring(xbb) != 0)
 		return;
 
 	/*
 	 * If we can't publish our data, we cannot participate
 	 * in this connection, and waiting for a front-end state
 	 * change will not help the situation.
 	 */
 	if (xbb_publish_backend_info(xbb) != 0) {
 		(void)xbb_disconnect(xbb);
 		return;
 	}
 
 	/* Ready for I/O. */
 	xenbus_set_state(xbb->dev, XenbusStateConnected);
 }
 
 /*-------------------------- Device Teardown Support -------------------------*/
 /**
  * Perform device shutdown functions.
  *
  * \param xbb  Per-instance xbb configuration structure.
  *
  * Mark this instance as shutting down, wait for any active I/O on the
  * backend device/file to drain, disconnect from the front-end, and notify
  * any waiters (e.g. a thread invoking our detach method) that detach can
  * now proceed.
  *
  * \return  0 once xbb_disconnect() has succeeded and waiters have been
  *          woken.  EAGAIN if another thread is already inside this
  *          routine, or if the front-end has not yet reached the Closed
  *          state.  Otherwise the error from xbb_disconnect() (asserted
  *          to be EAGAIN).
  *
  * \note Must be called with xbb->lock held.  The lock is dropped and
  *       re-acquired around the xenbus state calls below.
  */
 static int
 xbb_shutdown(struct xbb_softc *xbb)
 {
 	XenbusState frontState;
 	int	    error;
 
 	DPRINTF("\n");
 
 	/*
 	 * Due to the need to drop our mutex during some
 	 * xenbus operations, it is possible for two threads
 	 * to attempt to close out shutdown processing at
 	 * the same time.  Tell the caller that hits this
 	 * race to try back later.
 	 */
 	if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
 		return (EAGAIN);
 
 	/*
 	 * XBBF_IN_SHUTDOWN guards the window in which the lock is
 	 * released; it is cleared again as soon as the lock is re-taken.
 	 */
 	xbb->flags |= XBBF_IN_SHUTDOWN;
 	mtx_unlock(&xbb->lock);
 
 	/* Advance our own state to Closing if we are not there yet. */
 	if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
 		xenbus_set_state(xbb->dev, XenbusStateClosing);
 
 	frontState = xenbus_get_otherend_state(xbb->dev);
 	mtx_lock(&xbb->lock);
 	xbb->flags &= ~XBBF_IN_SHUTDOWN;
 
 	/* The front can submit I/O until entering the closed state. */
 	if (frontState < XenbusStateClosed)
 		return (EAGAIN);
 
 	DPRINTF("\n");
 
 	/* Indicate shutdown is in progress. */
 	xbb->flags |= XBBF_SHUTDOWN;
 
 	/* Disconnect from the front-end. */
 	error = xbb_disconnect(xbb);
 	if (error != 0) {
 		/*
 		 * Requests still outstanding.  We'll be called again
 		 * once they complete.
 		 */
 		KASSERT(error == EAGAIN,
 			("%s: Unexpected xbb_disconnect() failure %d",
 			 __func__, error));
 
 		return (error);
 	}
 
 	DPRINTF("\n");
 
 	/* Indicate to xbb_detach() that it is safe to proceed. */
 	wakeup(xbb);
 
 	return (0);
 }
 
 /**
  * Report an attach time error to the console and Xen, and cleanup
  * this instance by forcing immediate detach processing.
  *
  * \param xbb  Per-instance xbb configuration structure.
  * \param err  Errno describing the error.
  * \param fmt  Printf style format and arguments
  */
 static void
 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
 {
 	va_list ap;
 	va_list ap_hotplug;
 
 	va_start(ap, fmt);
 	va_copy(ap_hotplug, ap);
 	xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
 		  "hotplug-error", fmt, ap_hotplug);
 	va_end(ap_hotplug);
 	xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 		  "hotplug-status", "error");
 
 	xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
 	va_end(ap);
 
 	xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 		  "online", "0");
 	xbb_detach(xbb->dev);
 }
 
 /*---------------------------- NewBus Entrypoints ----------------------------*/
 /**
  * Inspect a XenBus device and claim it if it is of the appropriate type.
  *
  * Only devices whose XenBus type is "vbd" are accepted; for those the
  * device description is set and the device is marked quiet.
  *
  * \param dev  NewBus device object representing a candidate XenBus device.
  *
  * \return  0 for success, ENXIO if the device is not of type "vbd".
  */
 static int
 xbb_probe(device_t dev)
 {
 
 	if (strcmp(xenbus_get_type(dev), "vbd") != 0)
 		return (ENXIO);
 
 	device_set_desc(dev, "Backend Virtual Block Device");
 	device_quiet(dev);
 	return (0);
 }
 
 /**
  * Register this instance's sysctl nodes: the writable flush/coalesce
  * knobs, the 64-bit statistics counters, and the read-only negotiated
  * limits.  Nothing is registered if the device has no sysctl context
  * or tree.
  *
  * \param xbb  Xen Block Back softc.
  *
  */
 static void
 xbb_setup_sysctl(struct xbb_softc *xbb)
 {
 	struct sysctl_ctx_list	*ctx;
 	struct sysctl_oid	*tree;
 	struct sysctl_oid_list	*children;
 
 	ctx = device_get_sysctl_ctx(xbb->dev);
 	if (ctx == NULL)
 		return;
 
 	tree = device_get_sysctl_tree(xbb->dev);
 	if (tree == NULL)
 		return;
 
 	/* All nodes below hang off the device's own sysctl tree. */
 	children = SYSCTL_CHILDREN(tree);
 
 	/* Behavior knobs. */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
 		       "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
 		       "fake the flush command");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
 		       "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
 		       "send a real flush for N flush requests");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
 		       "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0,
 		       "Don't coalesce contiguous requests");
 
 	/* Statistics counters. */
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
 			 "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
 			 "how many I/O requests we have received");
 
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
 			 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
 			 "how many I/O requests have been completed");
 
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
 			 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
 			 "how many I/O dispatches were forced");
 
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
 			 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
 			 "how many I/O dispatches were normal");
 
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
 			 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
 			 "total number of I/O dispatches");
 
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
 			 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
 			 "how many times we have run out of KVA");
 
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
 			 "request_shortages", CTLFLAG_RW,
 			 &xbb->request_shortages,
 			 "how many times we have run out of requests");
 
 	/* Read-only values negotiated with the front-end. */
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
 		        "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
 		        "maximum outstanding requests (negotiated)");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
 		        "max_request_segments", CTLFLAG_RD,
 		        &xbb->max_request_segments, 0,
 		        "maximum number of pages per requests (negotiated)");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
 		        "max_request_size", CTLFLAG_RD,
 		        &xbb->max_request_size, 0,
 		        "maximum size in bytes of a request (negotiated)");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
 		        "ring_pages", CTLFLAG_RD,
 		        &xbb->ring_config.ring_pages, 0,
 		        "communication channel pages (negotiated)");
 }
 
 /**
  * Attach to a XenBus device that has been claimed by our probe routine.
  *
  * Publishes the back-end's protocol features and limits to the XenStore,
  * reads the "mode" and "params" fields from our node, opens the backing
  * store, creates devstat entries, sysctl nodes and the I/O taskqueue,
  * and finally moves our XenBus state to InitWait.  Any failure after the
  * basic initialization is routed through xbb_attach_failed(), which also
  * detaches the instance.
  *
  * \param dev  NewBus device object representing this Xen Block Back instance.
  *
  * \return  0 for success, errno codes for failure.
  */
 static int
 xbb_attach(device_t dev)
 {
 	struct xbb_softc	*xbb;
 	int			 error;
 	u_int			 max_ring_page_order;
 
 	DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
 
 	/*
 	 * Basic initialization.
 	 * After this block it is safe to call xbb_detach()
 	 * to clean up any allocated data for this instance.
 	 */
 	xbb = device_get_softc(dev);
 	xbb->dev = dev;
 	xbb->otherend_id = xenbus_get_otherend_id(dev);
 	TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
 	mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
 
 	/*
 	 * Publish protocol capabilities for consumption by the
 	 * front-end.
 	 */
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "feature-barrier", "1");
 	if (error) {
 		xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
 				  xenbus_get_node(xbb->dev));
 		return (error);
 	}
 
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "feature-flush-cache", "1");
 	if (error) {
 		xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
 				  xenbus_get_node(xbb->dev));
 		return (error);
 	}
 
 	/*
 	 * Amazon EC2 client compatibility.  They refer to max-ring-pages
 	 * instead of to max-ring-page-order.
 	 */
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
 	if (error) {
 		xbb_attach_failed(xbb, error, "writing %s/max-ring-pages",
 				  xenbus_get_node(xbb->dev));
 		return (error);
 	}
 
 	max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "max-ring-page-order", "%u", max_ring_page_order);
 	if (error) {
 		xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
 				  xenbus_get_node(xbb->dev));
 		return (error);
 	}
 
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "max-requests", "%u", XBB_MAX_REQUESTS);
 	if (error) {
 		xbb_attach_failed(xbb, error, "writing %s/max-requests",
 				  xenbus_get_node(xbb->dev));
 		return (error);
 	}
 
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "max-request-segments", "%u",
 			  XBB_MAX_SEGMENTS_PER_REQUEST);
 	if (error) {
 		xbb_attach_failed(xbb, error, "writing %s/max-request-segments",
 				  xenbus_get_node(xbb->dev));
 		return (error);
 	}
 
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "max-request-size", "%u",
 			  XBB_MAX_REQUEST_SIZE);
 	if (error) {
 		xbb_attach_failed(xbb, error, "writing %s/max-request-size",
 				  xenbus_get_node(xbb->dev));
 		return (error);
 	}
 
 	/*
 	 * Collect physical device information.  A missing "device-type"
 	 * is tolerated; "mode" and "params" are mandatory.
 	 */
 	error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
 			  "device-type", NULL, &xbb->dev_type,
 			  NULL);
 	if (error != 0)
 		xbb->dev_type = NULL;
 
 	error = xs_gather(XST_NIL, xenbus_get_node(dev),
 			  "mode", NULL, &xbb->dev_mode,
 			  "params", NULL, &xbb->dev_name,
 			  NULL);
 	if (error != 0) {
 		xbb_attach_failed(xbb, error, "reading backend fields at %s",
 				  xenbus_get_node(dev));
 		return (ENXIO);
 	}
 
 	/* Parse fopen style mode flags. */
 	if (strchr(xbb->dev_mode, 'w') == NULL)
 		xbb->flags |= XBBF_READ_ONLY;
 
 	/*
 	 * Verify the physical device is present and can support
 	 * the desired I/O mode.
 	 */
 	DROP_GIANT();
 	error = xbb_open_backend(xbb);
 	PICKUP_GIANT();
 	if (error != 0) {
 		xbb_attach_failed(xbb, error, "Unable to open %s",
 				  xbb->dev_name);
 		return (ENXIO);
 	}
 
 	/* Use devstat(9) for recording statistics. */
 	xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
 					   xbb->sector_size,
 					   DEVSTAT_ALL_SUPPORTED,
 					   DEVSTAT_TYPE_DIRECT
 					 | DEVSTAT_TYPE_IF_OTHER,
 					   DEVSTAT_PRIORITY_OTHER);
 
 	xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
 					      xbb->sector_size,
 					      DEVSTAT_ALL_SUPPORTED,
 					      DEVSTAT_TYPE_DIRECT
 					    | DEVSTAT_TYPE_IF_OTHER,
 					      DEVSTAT_PRIORITY_OTHER);
 	/*
 	 * Setup sysctl variables.
 	 */
 	xbb_setup_sysctl(xbb);
 
 	/*
 	 * Create a taskqueue for doing work that must occur from a
 	 * thread context.
 	 */
 	xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev),
 						  M_NOWAIT,
 						  taskqueue_thread_enqueue,
 						  /*contxt*/&xbb->io_taskqueue);
 	if (xbb->io_taskqueue == NULL) {
 		/*
 		 * "error" is still 0 from the successful
 		 * xbb_open_backend() call above, so report the actual
 		 * cause instead of a "no error" status.
 		 */
 		xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
 		return (ENOMEM);
 	}
 
 	taskqueue_start_threads(&xbb->io_taskqueue,
 				/*num threads*/1,
 				/*priority*/PWAIT,
 				/*thread name*/
 				"%s taskq", device_get_nameunit(dev));
 
 	/* Update hot-plug status to satisfy xend. */
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "hotplug-status", "connected");
 	if (error) {
 		xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
 				  xenbus_get_node(xbb->dev));
 		return (error);
 	}
 
 	/* Tell the front end that we are ready to connect. */
 	xenbus_set_state(dev, XenbusStateInitWait);
 
 	return (0);
 }
 
 /**
  * Detach from a block back device instance.
  *
  * Repeats xbb_shutdown() (sleeping on the softc between attempts) until
  * it no longer returns EAGAIN, then releases the taskqueue, devstat
  * entries, backing store, XenStore-derived strings and the softc lock.
  *
  * \param dev  NewBus device object representing this Xen Block Back instance.
  *
  * \return  0 for success, errno codes for failure.
  * 
  * \note A block back device may be detached at any time in its life-cycle,
  *       including part way through the attach process.  For this reason,
  *       initialization order and the initialization state checks in this
  *       routine must be carefully coupled so that attach time failures
  *       are gracefully handled.
  */
 static int
 xbb_detach(device_t dev)
 {
 	struct xbb_softc *xbb;
 
 	DPRINTF("\n");
 
 	xbb = device_get_softc(dev);
 
 	mtx_lock(&xbb->lock);
 	for (;;) {
 		if (xbb_shutdown(xbb) != EAGAIN)
 			break;
 		/* Woken by the wakeup(xbb) issued from xbb_shutdown(). */
 		msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
 		       "xbb_shutdown", 0);
 	}
 	mtx_unlock(&xbb->lock);
 
 	DPRINTF("\n");
 
 	/* Resources created by attach may be absent on early failures. */
 	if (xbb->io_taskqueue != NULL)
 		taskqueue_free(xbb->io_taskqueue);
 
 	if (xbb->xbb_stats != NULL)
 		devstat_remove_entry(xbb->xbb_stats);
 
 	if (xbb->xbb_stats_in != NULL)
 		devstat_remove_entry(xbb->xbb_stats_in);
 
 	xbb_close_backend(xbb);
 
 	/* Release the strings gathered from the XenStore. */
 	if (xbb->dev_mode != NULL) {
 		free(xbb->dev_mode, M_XENBUS);
 		xbb->dev_mode = NULL;
 	}
 
 	if (xbb->dev_type != NULL) {
 		free(xbb->dev_type, M_XENBUS);
 		xbb->dev_type = NULL;
 	}
 
 	if (xbb->dev_name != NULL) {
 		free(xbb->dev_name, M_XENBUS);
 		xbb->dev_name = NULL;
 	}
 
 	mtx_destroy(&xbb->lock);
 	return (0);
 }
 
 /**
  * Prepare this block back device for suspension of this VM.
  *
  * Currently a stub: the body below is only compiled when NOT_YET is
  * defined, so as built this routine does nothing and reports success.
  * 
  * \param dev  NewBus device object representing this Xen Block Back instance.
  *
  * \return  0 for success, errno codes for failure.
  */
 static int
 xbb_suspend(device_t dev)
 {
 #ifdef NOT_YET
         /*
          * NOTE(review): this disabled code uses softc fields (xb_io_lock,
          * connected) that are not referenced anywhere else in the visible
          * part of this driver -- confirm they exist before enabling.
          */
         struct xbb_softc *sc = device_get_softc(dev);
 
         /* Prevent new requests being issued until we fix things up. */
         mtx_lock(&sc->xb_io_lock);
         sc->connected = BLKIF_STATE_SUSPENDED;
         mtx_unlock(&sc->xb_io_lock);
 #endif
 
         return (0);
 }
 
 /**
  * Perform any processing required to recover from a suspended state.
  *
  * Currently a stub that performs no work and always reports success.
  * 
  * \param dev  NewBus device object representing this Xen Block Back instance.
  *
  * \return  0 for success, errno codes for failure.
  */
 static int
 xbb_resume(device_t dev)
 {
 	/* Nothing to restore; dev is unused. */
 	return (0);
 }
 
 /**
  * Handle state changes expressed via the XenStore by our front-end peer.
  *
  * Initialised/Connected trigger a connection attempt, Closing/Closed
  * trigger shutdown processing (and, for Closed, a transition of our own
  * state to Closed), Initialising is ignored, and any other state is
  * reported as a fatal error.
  *
  * \param dev             NewBus device object representing this Xen
  *                        Block Back instance.
  * \param frontend_state  The new state of the front-end.
  */
 static void
 xbb_frontend_changed(device_t dev, XenbusState frontend_state)
 {
 	struct xbb_softc *xbb;
 
 	xbb = device_get_softc(dev);
 
 	DPRINTF("frontend_state=%s, xbb_state=%s\n",
 	        xenbus_strstate(frontend_state),
 		xenbus_strstate(xenbus_get_state(xbb->dev)));
 
 	switch (frontend_state) {
 	case XenbusStateInitialising:
 		/* Front-end is not ready yet; nothing to do. */
 		break;
 
 	case XenbusStateInitialised:
 	case XenbusStateConnected:
 		xbb_connect(xbb);
 		break;
 
 	case XenbusStateClosing:
 	case XenbusStateClosed:
 		/* xbb_shutdown() is called with our lock held. */
 		mtx_lock(&xbb->lock);
 		xbb_shutdown(xbb);
 		mtx_unlock(&xbb->lock);
 
 		if (frontend_state == XenbusStateClosed)
 			xenbus_set_state(xbb->dev, XenbusStateClosed);
 		break;
 
 	default:
 		xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
 				 frontend_state);
 		break;
 	}
 }
 
 /*---------------------------- NewBus Registration ---------------------------*/
 /*
  * Method table binding the NewBus device interface and the XenBus
  * "other end changed" notification to this driver's entry points.
  */
 static device_method_t xbb_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		xbb_probe),
 	DEVMETHOD(device_attach,	xbb_attach),
 	DEVMETHOD(device_detach,	xbb_detach),
 	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
 	DEVMETHOD(device_suspend,	xbb_suspend),
 	DEVMETHOD(device_resume,	xbb_resume),
 
 	/* Xenbus interface */
 	DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
 
 	/* Table terminator. */
 	{ 0, 0 }
 };
 
 /*
  * Driver description: driver name, method table, and the size of the
  * per-instance softc.
  */
 static driver_t xbb_driver = {
         "xbbd",
         xbb_methods,
         sizeof(struct xbb_softc),
 };
 devclass_t xbb_devclass;
 
 /* Register the driver as "xbbd" on the xenbusb_back bus. */
 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);
Index: head/sys/fs/nfs/nfs_commonkrpc.c
===================================================================
--- head/sys/fs/nfs/nfs_commonkrpc.c	(revision 258540)
+++ head/sys/fs/nfs/nfs_commonkrpc.c	(revision 258541)
@@ -1,1216 +1,1215 @@
 /*-
  * Copyright (c) 1989, 1991, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Socket operations for use by nfs
  */
 
-#include "opt_kdtrace.h"
 #include "opt_kgssapi.h"
 #include "opt_nfs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vnode.h>
 
 #include <rpc/rpc.h>
 
 #include <kgssapi/krb5/kcrypto.h>
 
 #include <fs/nfs/nfsport.h>
 
 /*
  * DTrace support state; only present when the kernel is built with
  * KDTRACE_HOOKS.
  */
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 /*
  * Start/done probe hook function pointers.
  * NOTE(review): presumably filled in by the NFS client DTrace provider
  * when it loads -- confirm against the provider module.
  */
 dtrace_nfsclient_nfs23_start_probe_func_t
 		dtrace_nfscl_nfs234_start_probe;
 
 dtrace_nfsclient_nfs23_done_probe_func_t
 		dtrace_nfscl_nfs234_done_probe;
 
 /*
  * Registered probes by RPC type.  Each NFS version has a start and a
  * done table, and all tables are sized NFSV41_NPROCS + 1.
  */
 uint32_t	nfscl_nfs2_start_probes[NFSV41_NPROCS + 1];
 uint32_t	nfscl_nfs2_done_probes[NFSV41_NPROCS + 1];
 
 uint32_t	nfscl_nfs3_start_probes[NFSV41_NPROCS + 1];
 uint32_t	nfscl_nfs3_done_probes[NFSV41_NPROCS + 1];
 
 uint32_t	nfscl_nfs4_start_probes[NFSV41_NPROCS + 1];
 uint32_t	nfscl_nfs4_done_probes[NFSV41_NPROCS + 1];
 #endif
 
 /* Lock declaration macros and state shared with other NFS source files. */
 NFSSTATESPINLOCK;
 NFSREQSPINLOCK;
 NFSDLOCKMUTEX;
 extern struct nfsstats newnfsstats;
 extern struct nfsreqhead nfsd_reqq;
 extern int nfscl_ticks;
 extern void (*ncl_call_invalcaches)(struct vnode *);
 extern int nfs_numnfscbd;
 extern int nfscl_debuglevel;
 
 /* Callback service pool; used when creating NFSv4.1 back channels. */
 SVCPOOL		*nfscbd_pool;
 static int	nfsrv_gsscallbackson = 0;
 /* Packet scale for socket reservations; clamped to [2, 64] on connect. */
 static int	nfs_bufpackets = 4;
 static int	nfs_reconnects;
 static int	nfs3_jukebox_delay = 10;
 static int	nfs_skip_wcc_data_onerr = 1;
 
 /* The knobs and counters above are exported under the vfs.nfs sysctl tree. */
 SYSCTL_DECL(_vfs_nfs);
 
 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
     "Buffer reservation size 2 < x < 64");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
     "Number of times the nfs client has had to reconnect");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
     "Number of seconds to delay a retry after receiving EJUKEBOX");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0,
     "Disable weak cache consistency checking when server returns an error");
 
 static void	nfs_down(struct nfsmount *, struct thread *, const char *,
     int, int);
 static void	nfs_up(struct nfsmount *, struct thread *, const char *,
     int, int);
 static int	nfs_msg(struct thread *, const char *, const char *, int);
 
 /*
  * A reference-counted RPC authentication handle cached for one uid.
  */
 struct nfs_cached_auth {
 	int		ca_refs; /* refcount, including 1 from the cache */
 	uid_t		ca_uid;	 /* uid that corresponds to this auth */
 	AUTH		*ca_auth; /* RPC auth handle */
 };
 
 /*
  * Table of NFSv2 procedure numbers with NFS_V3NPROCS slots.
  * NOTE(review): appears to translate an NFSv3 procedure index into the
  * corresponding NFSv2 procedure, with NFSV2PROC_NOOP marking procedures
  * that have no v2 equivalent -- confirm against the code that indexes it.
  */
 static int nfsv2_procid[NFS_V3NPROCS] = {
 	NFSV2PROC_NULL,
 	NFSV2PROC_GETATTR,
 	NFSV2PROC_SETATTR,
 	NFSV2PROC_LOOKUP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_READLINK,
 	NFSV2PROC_READ,
 	NFSV2PROC_WRITE,
 	NFSV2PROC_CREATE,
 	NFSV2PROC_MKDIR,
 	NFSV2PROC_SYMLINK,
 	NFSV2PROC_CREATE,
 	NFSV2PROC_REMOVE,
 	NFSV2PROC_RMDIR,
 	NFSV2PROC_RENAME,
 	NFSV2PROC_LINK,
 	NFSV2PROC_READDIR,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_STATFS,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 };
 
 /*
  * Initialize sockets and congestion for a new NFS connection.
  * We do not free the sockaddr if error.
  */
 int
 newnfs_connect(struct nfsmount *nmp, struct nfssockreq *nrp,
     struct ucred *cred, NFSPROC_T *p, int callback_retry_mult)
 {
 	int rcvreserve, sndreserve;
 	int pktscale;
 	struct sockaddr *saddr;
 	struct ucred *origcred;
 	CLIENT *client;
 	struct netconfig *nconf;
 	struct socket *so;
 	int one = 1, retries, error = 0;
 	struct thread *td = curthread;
 	SVCXPRT *xprt;
 	struct timeval timo;
 
 	/*
 	 * We need to establish the socket using the credentials of
 	 * the mountpoint.  Some parts of this process (such as
 	 * sobind() and soconnect()) will use the current thread's
 	 * credential instead of the socket credential.  To work
 	 * around this, temporarily change the current thread's
 	 * credential to that of the mountpoint.
 	 *
 	 * XXX: It would be better to explicitly pass the correct
 	 * credential to sobind() and soconnect().
 	 */
 	origcred = td->td_ucred;
 
 	/*
 	 * Use the credential in nr_cred, if not NULL.
 	 */
 	if (nrp->nr_cred != NULL)
 		td->td_ucred = nrp->nr_cred;
 	else
 		td->td_ucred = cred;
 	saddr = nrp->nr_nam;
 
 	if (saddr->sa_family == AF_INET)
 		if (nrp->nr_sotype == SOCK_DGRAM)
 			nconf = getnetconfigent("udp");
 		else
 			nconf = getnetconfigent("tcp");
 	else
 		if (nrp->nr_sotype == SOCK_DGRAM)
 			nconf = getnetconfigent("udp6");
 		else
 			nconf = getnetconfigent("tcp6");
 			
 	pktscale = nfs_bufpackets;
 	if (pktscale < 2)
 		pktscale = 2;
 	if (pktscale > 64)
 		pktscale = 64;
 	/*
 	 * soreserve() can fail if sb_max is too small, so shrink pktscale
 	 * and try again if there is an error.
 	 * Print a log message suggesting increasing sb_max.
 	 * Creating a socket and doing this is necessary since, if the
 	 * reservation sizes are too large and will make soreserve() fail,
 	 * the connection will work until a large send is attempted and
 	 * then it will loop in the krpc code.
 	 */
 	so = NULL;
 	saddr = NFSSOCKADDR(nrp->nr_nam, struct sockaddr *);
 	error = socreate(saddr->sa_family, &so, nrp->nr_sotype, 
 	    nrp->nr_soproto, td->td_ucred, td);
 	if (error) {
 		td->td_ucred = origcred;
 		goto out;
 	}
 	do {
 	    if (error != 0 && pktscale > 2)
 		pktscale--;
 	    if (nrp->nr_sotype == SOCK_DGRAM) {
 		if (nmp != NULL) {
 			sndreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
 			    pktscale;
 			rcvreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
 			    pktscale;
 		} else {
 			sndreserve = rcvreserve = 1024 * pktscale;
 		}
 	    } else {
 		if (nrp->nr_sotype != SOCK_STREAM)
 			panic("nfscon sotype");
 		if (nmp != NULL) {
 			sndreserve = (NFS_MAXBSIZE + NFS_MAXPKTHDR +
 			    sizeof (u_int32_t)) * pktscale;
 			rcvreserve = (NFS_MAXBSIZE + NFS_MAXPKTHDR +
 			    sizeof (u_int32_t)) * pktscale;
 		} else {
 			sndreserve = rcvreserve = 1024 * pktscale;
 		}
 	    }
 	    error = soreserve(so, sndreserve, rcvreserve);
 	} while (error != 0 && pktscale > 2);
 	soclose(so);
 	if (error) {
 		td->td_ucred = origcred;
 		goto out;
 	}
 
 	client = clnt_reconnect_create(nconf, saddr, nrp->nr_prog,
 	    nrp->nr_vers, sndreserve, rcvreserve);
 	CLNT_CONTROL(client, CLSET_WAITCHAN, "newnfsreq");
 	if (nmp != NULL) {
 		if ((nmp->nm_flag & NFSMNT_INT))
 			CLNT_CONTROL(client, CLSET_INTERRUPTIBLE, &one);
 		if ((nmp->nm_flag & NFSMNT_RESVPORT))
 			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
 		if (NFSHASSOFT(nmp)) {
 			if (nmp->nm_sotype == SOCK_DGRAM)
 				/*
 				 * For UDP, the large timeout for a reconnect
 				 * will be set to "nm_retry * nm_timeo / 2", so
 				 * we only want to do 2 reconnect timeout
 				 * retries.
 				 */
 				retries = 2;
 			else
 				retries = nmp->nm_retry;
 		} else
 			retries = INT_MAX;
 		if (NFSHASNFSV4N(nmp)) {
 			/*
 			 * Make sure the nfscbd_pool doesn't get destroyed
 			 * while doing this.
 			 */
 			NFSD_LOCK();
 			if (nfs_numnfscbd > 0) {
 				nfs_numnfscbd++;
 				NFSD_UNLOCK();
 				xprt = svc_vc_create_backchannel(nfscbd_pool);
 				CLNT_CONTROL(client, CLSET_BACKCHANNEL, xprt);
 				NFSD_LOCK();
 				nfs_numnfscbd--;
 				if (nfs_numnfscbd == 0)
 					wakeup(&nfs_numnfscbd);
 			}
 			NFSD_UNLOCK();
 		}
 	} else {
 		/*
 		 * Three cases:
 		 * - Null RPC callback to client
 		 * - Non-Null RPC callback to client, wait a little longer
 		 * - upcalls to nfsuserd and gssd (clp == NULL)
 		 */
 		if (callback_retry_mult == 0) {
 			retries = NFSV4_UPCALLRETRY;
 			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
 		} else {
 			retries = NFSV4_CALLBACKRETRY * callback_retry_mult;
 		}
 	}
 	CLNT_CONTROL(client, CLSET_RETRIES, &retries);
 
 	if (nmp != NULL) {
 		/*
 		 * For UDP, there are 2 timeouts:
 		 * - CLSET_RETRY_TIMEOUT sets the initial timeout for the timer
 		 *   that does a retransmit of an RPC request using the same 
 		 *   socket and xid. This is what you normally want to do,
 		 *   since NFS servers depend on "same xid" for their
 		 *   Duplicate Request Cache.
 		 * - timeout specified in CLNT_CALL_MBUF(), which specifies when
 		 *   retransmits on the same socket should fail and a fresh
 		 *   socket created. Each of these timeouts counts as one
 		 *   CLSET_RETRIES as set above.
 		 * Set the initial retransmit timeout for UDP. This timeout
 		 * doesn't exist for TCP and the following call just fails,
 		 * which is ok.
 		 */
 		timo.tv_sec = nmp->nm_timeo / NFS_HZ;
 		timo.tv_usec = (nmp->nm_timeo % NFS_HZ) * 1000000 / NFS_HZ;
 		CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, &timo);
 	}
 
 	mtx_lock(&nrp->nr_mtx);
 	if (nrp->nr_client != NULL) {
 		mtx_unlock(&nrp->nr_mtx);
 		/*
 		 * Someone else already connected.
 		 */
 		CLNT_RELEASE(client);
 	} else {
 		nrp->nr_client = client;
 		/*
 		 * Protocols that do not require connections may be optionally
 		 * left unconnected for servers that reply from a port other
 		 * than NFS_PORT.
 		 */
 		if (nmp == NULL || (nmp->nm_flag & NFSMNT_NOCONN) == 0) {
 			mtx_unlock(&nrp->nr_mtx);
 			CLNT_CONTROL(client, CLSET_CONNECT, &one);
 		} else
 			mtx_unlock(&nrp->nr_mtx);
 	}
 
 
 	/* Restore current thread's credentials. */
 	td->td_ucred = origcred;
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * NFS disconnect.
  * Unhook the krpc client handle from the nfssockreq while holding nr_mtx
  * and then, with the mutex dropped, purge its RPCSEC_GSS state, close it
  * and release the reference. Does nothing if there is no client handle.
  */
 void
 newnfs_disconnect(struct nfssockreq *nrp)
 {
 	CLIENT *cl;
 
 	/* Detach the handle so nobody else picks it up after the unlock. */
 	mtx_lock(&nrp->nr_mtx);
 	cl = nrp->nr_client;
 	nrp->nr_client = NULL;
 	mtx_unlock(&nrp->nr_mtx);
 
 	if (cl == NULL)
 		return;
 	rpc_gss_secpurge_call(cl);
 	CLNT_CLOSE(cl);
 	CLNT_RELEASE(cl);
 }
 
 /*
  * Get an RPC authenticator for the given security flavour.
  * - For the three Kerberos flavours, an RPCSEC_GSS handle is looked up
  *   (no clnt_principal) or created (clnt_principal given) on
  *   nrp->nr_client. If the lookup fails, a unix-style handle is used
  *   instead; if the create fails, NULL is returned.
  * - For every other flavour, a unix-style handle built from cred is
  *   returned.
  * Returns NULL if the "kerberosv5" mechanism cannot be mapped to an OID.
  */
 static AUTH *
 nfs_getauth(struct nfssockreq *nrp, int secflavour, char *clnt_principal,
     char *srv_principal, gss_OID mech_oid, struct ucred *cred)
 {
 	rpc_gss_service_t gss_svc;
 	AUTH *ap;
 
 	/* Anything that is not Kerberos gets a unix-style authenticator. */
 	if (secflavour != RPCSEC_GSS_KRB5 &&
 	    secflavour != RPCSEC_GSS_KRB5I &&
 	    secflavour != RPCSEC_GSS_KRB5P)
 		return (authunix_create(cred));
 
 	/* Look up the mechanism OID when the caller did not supply one. */
 	if (!mech_oid && !rpc_gss_mech_to_oid_call("kerberosv5", &mech_oid))
 		return (NULL);
 
 	/* Map the flavour to the matching RPCSEC_GSS service level. */
 	switch (secflavour) {
 	case RPCSEC_GSS_KRB5:
 		gss_svc = rpc_gss_svc_none;
 		break;
 	case RPCSEC_GSS_KRB5I:
 		gss_svc = rpc_gss_svc_integrity;
 		break;
 	default:
 		gss_svc = rpc_gss_svc_privacy;
 		break;
 	}
 
 	/* With a client principal, the result is returned as is. */
 	if (clnt_principal != NULL)
 		return (rpc_gss_seccreate_call(nrp->nr_client, cred,
 		    clnt_principal, srv_principal, "kerberosv5",
 		    gss_svc, NULL, NULL, NULL));
 
 	/* Otherwise try a lookup and fall back to a unix-style handle. */
 	ap = rpc_gss_secfind_call(nrp->nr_client, cred, srv_principal,
 	    mech_oid, gss_svc);
 	if (ap == NULL)
 		ap = authunix_create(cred);
 	return (ap);
 }
 
 /*
  * Callback from the RPC code to generate up/down notifications.
  */
 
 /*
  * Argument handed to nfs_feedback() through rpc_callextra.rc_feedback_arg.
  */
 struct nfs_feedback_arg {
 	struct nfsmount *nf_mount;	/* mount the RPC is being done for */
 	int		nf_lastmsg;	/* last tprintf */
 	int		nf_tprintfmsg;	/* TRUE once nfs_down() was called */
 	struct thread	*nf_td;		/* thread passed to nfs_down/nfs_up */
 };
 
 /*
  * Feedback callback: arg points at a struct nfs_feedback_arg.
  * - FEEDBACK_OK reports the server as up again via nfs_up().
  * - FEEDBACK_REXMIT2 and FEEDBACK_RECONNECT report it as not responding
  *   via nfs_down(), but only once more than nm_tprintf_delay has passed
  *   since nf_lastmsg.
  * Any other type is ignored. The proc argument is not used.
  */
 static void
 nfs_feedback(int type, int proc, void *arg)
 {
 	struct nfs_feedback_arg *fb = arg;
 	struct nfsmount *mnt = fb->nf_mount;
 	time_t curtime;
 
 	if (type == FEEDBACK_OK) {
 		nfs_up(fb->nf_mount, fb->nf_td,
 		    "is alive again", NFSSTA_TIMEO, fb->nf_tprintfmsg);
 		return;
 	}
 	if (type != FEEDBACK_REXMIT2 && type != FEEDBACK_RECONNECT)
 		return;
 
 	/* Rate limit the "not responding" message. */
 	curtime = NFSD_MONOSEC;
 	if (fb->nf_lastmsg + mnt->nm_tprintf_delay < curtime) {
 		nfs_down(mnt, fb->nf_td,
 		    "not responding", 0, NFSSTA_TIMEO);
 		fb->nf_tprintfmsg = TRUE;
 		fb->nf_lastmsg = curtime;
 	}
 }
 
 /*
  * newnfs_request - goes something like this
  *	- does the rpc by calling the krpc layer
  *	- break down rpc header and return with nfs reply
  * nb: always frees up nd_mreq mbuf list
  *
  * Returns 0 once a reply has been received and its leading status words
  * have been dissected; the NFS status is left in nd->nd_repstat and the
  * reply mbuf list in nd->nd_mrep. Otherwise returns an errno value:
  * ESTALE while a forced unmount is in progress, EACCES when no
  * authenticator could be obtained, a value mapped from the krpc
  * clnt_stat, or the error from dissecting the reply.
  * If xidp is not NULL, *xidp is set to 0.
  */
 int
 newnfs_request(struct nfsrv_descript *nd, struct nfsmount *nmp,
     struct nfsclient *clp, struct nfssockreq *nrp, vnode_t vp,
     struct thread *td, struct ucred *cred, u_int32_t prog, u_int32_t vers,
     u_char *retsum, int toplevel, u_int64_t *xidp, struct nfsclsession *sep)
 {
 	u_int32_t retseq, retval, *tl;
 	time_t waituntil;
 	int i = 0, j = 0, opcnt, set_sigset = 0, slot;
 	int trycnt, error = 0, usegssname = 0, secflavour = AUTH_SYS;
 	int freeslot, timeo;
 	u_int16_t procnum;
 	u_int trylater_delay = 1;
 	struct nfs_feedback_arg nf;
 	struct timeval timo;
 	AUTH *auth;
 	struct rpc_callextra ext;
 	enum clnt_stat stat;
 	struct nfsreq *rep = NULL;
 	char *srv_principal = NULL, *clnt_principal = NULL;
 	sigset_t oldset;
 	struct ucred *authcred;
 
 	if (xidp != NULL)
 		*xidp = 0;
 	/* Reject requests while attempting a forced unmount. */
 	if (nmp != NULL && (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)) {
 		m_freem(nd->nd_mreq);
 		return (ESTALE);
 	}
 
 	/*
 	 * Set authcred, which is used to acquire RPC credentials to
 	 * the cred argument, by default. The crhold() should not be
 	 * necessary, but will ensure that some future code change
 	 * doesn't result in the credential being free'd prematurely.
 	 */
 	authcred = crhold(cred);
 
 	/* For client side interruptible mounts, mask off the signals. */
 	if (nmp != NULL && td != NULL && NFSHASINT(nmp)) {
 		newnfs_set_sigmask(td, &oldset);
 		set_sigset = 1;
 	}
 
 	/*
 	 * XXX if not already connected call nfs_connect now. Longer
 	 * term, change nfs_mount to call nfs_connect unconditionally
 	 * and let clnt_reconnect_create handle reconnects.
 	 *
 	 * NOTE(review): the return value of newnfs_connect() is not
 	 * checked here, and nrp->nr_client is used further down without
 	 * a NULL check - confirm that a failed connect cannot leave it
 	 * NULL.
 	 */
 	if (nrp->nr_client == NULL)
 		newnfs_connect(nmp, nrp, cred, td, 0);
 
 	/*
 	 * For a client side mount, nmp is != NULL and clp == NULL. For
 	 * server calls (callbacks or upcalls), nmp == NULL.
 	 */
 	if (clp != NULL) {
 		NFSLOCKSTATE();
 		if ((clp->lc_flags & LCL_GSS) && nfsrv_gsscallbackson) {
 			secflavour = RPCSEC_GSS_KRB5;
 			if (nd->nd_procnum != NFSPROC_NULL) {
 				if (clp->lc_flags & LCL_GSSINTEGRITY)
 					secflavour = RPCSEC_GSS_KRB5I;
 				else if (clp->lc_flags & LCL_GSSPRIVACY)
 					secflavour = RPCSEC_GSS_KRB5P;
 			}
 		}
 		NFSUNLOCKSTATE();
 	} else if (nmp != NULL && NFSHASKERB(nmp) &&
 	     nd->nd_procnum != NFSPROC_NULL) {
 		if (NFSHASALLGSSNAME(nmp) && nmp->nm_krbnamelen > 0)
 			nd->nd_flag |= ND_USEGSSNAME;
 		if ((nd->nd_flag & ND_USEGSSNAME) != 0) {
 			/*
 			 * If there is a client side host based credential,
 			 * use that, otherwise use the system uid, if set.
 			 * The system uid is in the nmp->nm_sockreq.nr_cred
 			 * credentials.
 			 */
 			if (nmp->nm_krbnamelen > 0) {
 				usegssname = 1;
 				clnt_principal = nmp->nm_krbname;
 			} else if (nmp->nm_uid != (uid_t)-1) {
 				KASSERT(nmp->nm_sockreq.nr_cred != NULL,
 				    ("newnfs_request: NULL nr_cred"));
 				crfree(authcred);
 				authcred = crhold(nmp->nm_sockreq.nr_cred);
 			}
 		} else if (nmp->nm_krbnamelen == 0 &&
 		    nmp->nm_uid != (uid_t)-1 && cred->cr_uid == (uid_t)0) {
 			/*
 			 * If there is no host based principal name and
 			 * the system uid is set and this is root, use the
 			 * system uid, since root won't have user
 			 * credentials in a credentials cache file.
 			 * The system uid is in the nmp->nm_sockreq.nr_cred
 			 * credentials.
 			 */
 			KASSERT(nmp->nm_sockreq.nr_cred != NULL,
 			    ("newnfs_request: NULL nr_cred"));
 			crfree(authcred);
 			authcred = crhold(nmp->nm_sockreq.nr_cred);
 		}
 		if (NFSHASINTEGRITY(nmp))
 			secflavour = RPCSEC_GSS_KRB5I;
 		else if (NFSHASPRIVACY(nmp))
 			secflavour = RPCSEC_GSS_KRB5P;
 		else
 			secflavour = RPCSEC_GSS_KRB5;
 		srv_principal = NFSMNT_SRVKRBNAME(nmp);
 	} else if (nmp != NULL && !NFSHASKERB(nmp) &&
 	    nd->nd_procnum != NFSPROC_NULL &&
 	    (nd->nd_flag & ND_USEGSSNAME) != 0) {
 		/*
 		 * Use the uid that did the mount when the RPC is doing
 		 * NFSv4 system operations, as indicated by the
 		 * ND_USEGSSNAME flag, for the AUTH_SYS case.
 		 * The credentials in nm_sockreq.nr_cred were used for the
 		 * mount.
 		 */
 		KASSERT(nmp->nm_sockreq.nr_cred != NULL,
 		    ("newnfs_request: NULL nr_cred"));
 		crfree(authcred);
 		authcred = crhold(nmp->nm_sockreq.nr_cred);
 	}
 
 	/*
 	 * Set up the argument for nfs_feedback(). nf_lastmsg is back-dated
 	 * so that the first "not responding" message can be printed after
 	 * nm_tprintf_initial_delay instead of nm_tprintf_delay.
 	 */
 	if (nmp != NULL) {
 		bzero(&nf, sizeof(struct nfs_feedback_arg));
 		nf.nf_mount = nmp;
 		nf.nf_td = td;
 		nf.nf_lastmsg = NFSD_MONOSEC -
 		    ((nmp->nm_tprintf_delay)-(nmp->nm_tprintf_initial_delay));
 	}
 
 	if (nd->nd_procnum == NFSPROC_NULL)
 		auth = authnone_create();
 	else if (usegssname) {
 		/*
 		 * For this case, the authenticator is held in the
 		 * nfssockreq structure, so don't release the reference count
 		 * held on it. --> Don't AUTH_DESTROY() it in this function.
 		 */
 		if (nrp->nr_auth == NULL)
 			nrp->nr_auth = nfs_getauth(nrp, secflavour,
 			    clnt_principal, srv_principal, NULL, authcred);
 		else
 			rpc_gss_refresh_auth_call(nrp->nr_auth);
 		auth = nrp->nr_auth;
 	} else
 		auth = nfs_getauth(nrp, secflavour, NULL,
 		    srv_principal, NULL, authcred);
 	/* authcred was only needed to get the authenticator. */
 	crfree(authcred);
 	if (auth == NULL) {
 		m_freem(nd->nd_mreq);
 		if (set_sigset)
 			newnfs_restore_sigmask(td, &oldset);
 		return (EACCES);
 	}
 	bzero(&ext, sizeof(ext));
 	ext.rc_auth = auth;
 	if (nmp != NULL) {
 		ext.rc_feedback = nfs_feedback;
 		ext.rc_feedback_arg = &nf;
 	}
 
 	/*
 	 * Every NFSv4 procedure other than Null and the callback compound
 	 * goes on the wire as a Compound RPC.
 	 */
 	procnum = nd->nd_procnum;
 	if ((nd->nd_flag & ND_NFSV4) &&
 	    nd->nd_procnum != NFSPROC_NULL &&
 	    nd->nd_procnum != NFSV4PROC_CBCOMPOUND)
 		procnum = NFSV4PROC_COMPOUND;
 
 	if (nmp != NULL) {
 		NFSINCRGLOBAL(newnfsstats.rpcrequests);
 
 		/* Map the procnum to the old NFSv2 one, as required. */
 		if ((nd->nd_flag & ND_NFSV2) != 0) {
 			if (nd->nd_procnum < NFS_V3NPROCS)
 				procnum = nfsv2_procid[nd->nd_procnum];
 			else
 				procnum = NFSV2PROC_NOOP;
 		}
 
 		/*
 		 * Now only used for the R_DONTRECOVER case, but until that is
 		 * supported within the krpc code, I need to keep a queue of
 		 * outstanding RPCs for nfsv4 client requests.
 		 */
 		if ((nd->nd_flag & ND_NFSV4) && procnum == NFSV4PROC_COMPOUND)
 			MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq),
 			    M_NFSDREQ, M_WAITOK);
 #ifdef KDTRACE_HOOKS
 		if (dtrace_nfscl_nfs234_start_probe != NULL) {
 			uint32_t probe_id;
 			int probe_procnum;
 	
 			if (nd->nd_flag & ND_NFSV4) {
 				probe_id =
 				    nfscl_nfs4_start_probes[nd->nd_procnum];
 				probe_procnum = nd->nd_procnum;
 			} else if (nd->nd_flag & ND_NFSV3) {
 				probe_id = nfscl_nfs3_start_probes[procnum];
 				probe_procnum = procnum;
 			} else {
 				probe_id =
 				    nfscl_nfs2_start_probes[nd->nd_procnum];
 				probe_procnum = procnum;
 			}
 			if (probe_id != 0)
 				(dtrace_nfscl_nfs234_start_probe)
 				    (probe_id, vp, nd->nd_mreq, cred,
 				     probe_procnum);
 		}
 #endif
 	}
 	trycnt = 0;
 	freeslot = -1;		/* Set to slot that needs to be free'd */
 	/* The RPC is redone from here after a "try later" style reply. */
 tryagain:
 	slot = -1;		/* Slot that needs a sequence# increment. */
 	/*
 	 * This timeout specifies when a new socket should be created,
 	 * along with new xid values. For UDP, this should be done
 	 * infrequently, since retransmits of RPC requests should normally
 	 * use the same xid.
 	 */
 	if (nmp == NULL) {
 		timo.tv_usec = 0;
 		if (clp == NULL)
 			timo.tv_sec = NFSV4_UPCALLTIMEO;
 		else
 			timo.tv_sec = NFSV4_CALLBACKTIMEO;
 	} else {
 		if (nrp->nr_sotype != SOCK_DGRAM) {
 			timo.tv_usec = 0;
 			if ((nmp->nm_flag & NFSMNT_NFSV4))
 				timo.tv_sec = INT_MAX;
 			else
 				timo.tv_sec = NFS_TCPTIMEO;
 		} else {
 			if (NFSHASSOFT(nmp)) {
 				/*
 				 * CLSET_RETRIES is set to 2, so this should be
 				 * half of the total timeout required.
 				 */
 				timeo = nmp->nm_retry * nmp->nm_timeo / 2;
 				if (timeo < 1)
 					timeo = 1;
 				timo.tv_sec = timeo / NFS_HZ;
 				timo.tv_usec = (timeo % NFS_HZ) * 1000000 /
 				    NFS_HZ;
 			} else {
 				/* For UDP hard mounts, use a large value. */
 				timo.tv_sec = NFS_MAXTIMEO / NFS_HZ;
 				timo.tv_usec = 0;
 			}
 		}
 
 		if (rep != NULL) {
 			rep->r_flags = 0;
 			rep->r_nmp = nmp;
 			/*
 			 * Chain request into list of outstanding requests.
 			 */
 			NFSLOCKREQ();
 			TAILQ_INSERT_TAIL(&nfsd_reqq, rep, r_chain);
 			NFSUNLOCKREQ();
 		}
 	}
 
 	nd->nd_mrep = NULL;
 	stat = CLNT_CALL_MBUF(nrp->nr_client, &ext, procnum, nd->nd_mreq,
 	    &nd->nd_mrep, timo);
 
 	if (rep != NULL) {
 		/*
 		 * RPC done, unlink the request.
 		 */
 		NFSLOCKREQ();
 		TAILQ_REMOVE(&nfsd_reqq, rep, r_chain);
 		NFSUNLOCKREQ();
 	}
 
 	/*
 	 * Map the krpc status to an errno value and count timeouts and
 	 * invalid replies in the statistics.
 	 */
 	if (stat == RPC_SUCCESS) {
 		error = 0;
 	} else if (stat == RPC_TIMEDOUT) {
 		NFSINCRGLOBAL(newnfsstats.rpctimeouts);
 		error = ETIMEDOUT;
 	} else if (stat == RPC_VERSMISMATCH) {
 		NFSINCRGLOBAL(newnfsstats.rpcinvalid);
 		error = EOPNOTSUPP;
 	} else if (stat == RPC_PROGVERSMISMATCH) {
 		NFSINCRGLOBAL(newnfsstats.rpcinvalid);
 		error = EPROTONOSUPPORT;
 	} else if (stat == RPC_INTR) {
 		error = EINTR;
 	} else {
 		NFSINCRGLOBAL(newnfsstats.rpcinvalid);
 		error = EACCES;
 	}
 	if (error) {
 		m_freem(nd->nd_mreq);
 		if (usegssname == 0)
 			AUTH_DESTROY(auth);
 		if (rep != NULL)
 			FREE((caddr_t)rep, M_NFSDREQ);
 		if (set_sigset)
 			newnfs_restore_sigmask(td, &oldset);
 		return (error);
 	}
 
 	KASSERT(nd->nd_mrep != NULL, ("mrep shouldn't be NULL if no error\n"));
 
 	/*
 	 * Search for any mbufs that are not a multiple of 4 bytes long
 	 * or with m_data not longword aligned.
 	 * These could cause pointer alignment problems, so copy them to
 	 * well aligned mbufs.
 	 */
 	newnfs_realign(&nd->nd_mrep, M_WAITOK);
 	nd->nd_md = nd->nd_mrep;
 	nd->nd_dpos = NFSMTOD(nd->nd_md, caddr_t);
 	nd->nd_repstat = 0;
 	if (nd->nd_procnum != NFSPROC_NULL) {
 		/* If sep == NULL, set it to the default in nmp. */
 		if (sep == NULL && nmp != NULL)
 			sep = NFSMNT_MDSSESSION(nmp);
 		/*
 		 * and now the actual NFS xdr.
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		nd->nd_repstat = fxdr_unsigned(u_int32_t, *tl);
 		if (nd->nd_repstat >= 10000)
 			NFSCL_DEBUG(1, "proc=%d reps=%d\n", (int)nd->nd_procnum,
 			    (int)nd->nd_repstat);
 
 		/*
 		 * Get rid of the tag, return count and SEQUENCE result for
 		 * NFSv4.
 		 */
 		if ((nd->nd_flag & ND_NFSV4) != 0) {
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			i = fxdr_unsigned(int, *tl);
 			error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
 			if (error)
 				goto nfsmout;
 			/* opcnt, then the first op (i) and its status (j). */
 			NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 			opcnt = fxdr_unsigned(int, *tl++);
 			i = fxdr_unsigned(int, *tl++);
 			j = fxdr_unsigned(int, *tl);
 			if (j >= 10000)
 				NFSCL_DEBUG(1, "fop=%d fst=%d\n", i, j);
 			/*
 			 * If the first op is Sequence, free up the slot.
 			 */
 			if (nmp != NULL && i == NFSV4OP_SEQUENCE && j != 0)
 				NFSCL_DEBUG(1, "failed seq=%d\n", j);
 			if (nmp != NULL && i == NFSV4OP_SEQUENCE && j == 0) {
 				NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID +
 				    5 * NFSX_UNSIGNED);
 				mtx_lock(&sep->nfsess_mtx);
 				tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
 				retseq = fxdr_unsigned(uint32_t, *tl++);
 				/*
 				 * NOTE(review): slot is taken from the reply
 				 * and used to index nfsess_slotseq[] without
 				 * a range check in this function - confirm it
 				 * is validated elsewhere.
 				 */
 				slot = fxdr_unsigned(int, *tl++);
 				freeslot = slot;
 				if (retseq != sep->nfsess_slotseq[slot])
 					printf("retseq diff 0x%x\n", retseq);
 				/* Skip one word and use the one after it. */
 				retval = fxdr_unsigned(uint32_t, *++tl);
 				/* Track retval + 1, but never grow past 64. */
 				if ((retval + 1) < sep->nfsess_foreslots)
 					sep->nfsess_foreslots = (retval + 1);
 				else if ((retval + 1) > sep->nfsess_foreslots)
 					sep->nfsess_foreslots = (retval < 64) ?
 					    (retval + 1) : 64;
 				mtx_unlock(&sep->nfsess_mtx);
 
 				/* Grab the op and status for the next one. */
 				if (opcnt > 1) {
 					NFSM_DISSECT(tl, uint32_t *,
 					    2 * NFSX_UNSIGNED);
 					i = fxdr_unsigned(int, *tl++);
 					j = fxdr_unsigned(int, *tl);
 				}
 			}
 		}
 		if (nd->nd_repstat != 0) {
 			/*
 			 * Wait and redo the RPC for NFSERR_RESOURCE, for
 			 * NFSERR_DELAY on non-NFSv4 mounts and, for NFSv4,
 			 * for NFSERR_DELAY/NFSERR_GRACE unless the procedure
 			 * is one of those excluded below.
 			 */
 			if (((nd->nd_repstat == NFSERR_DELAY ||
 			      nd->nd_repstat == NFSERR_GRACE) &&
 			     (nd->nd_flag & ND_NFSV4) &&
 			     nd->nd_procnum != NFSPROC_DELEGRETURN &&
 			     nd->nd_procnum != NFSPROC_SETATTR &&
 			     nd->nd_procnum != NFSPROC_READ &&
 			     nd->nd_procnum != NFSPROC_READDS &&
 			     nd->nd_procnum != NFSPROC_WRITE &&
 			     nd->nd_procnum != NFSPROC_WRITEDS &&
 			     nd->nd_procnum != NFSPROC_OPEN &&
 			     nd->nd_procnum != NFSPROC_CREATE &&
 			     nd->nd_procnum != NFSPROC_OPENCONFIRM &&
 			     nd->nd_procnum != NFSPROC_OPENDOWNGRADE &&
 			     nd->nd_procnum != NFSPROC_CLOSE &&
 			     nd->nd_procnum != NFSPROC_LOCK &&
 			     nd->nd_procnum != NFSPROC_LOCKU) ||
 			    (nd->nd_repstat == NFSERR_DELAY &&
 			     (nd->nd_flag & ND_NFSV4) == 0) ||
 			    nd->nd_repstat == NFSERR_RESOURCE) {
 				/* Delay doubles per retry, capped at NFS_TRYLATERDEL. */
 				if (trylater_delay > NFS_TRYLATERDEL)
 					trylater_delay = NFS_TRYLATERDEL;
 				waituntil = NFSD_MONOSEC + trylater_delay;
 				while (NFSD_MONOSEC < waituntil)
 					(void) nfs_catnap(PZERO, 0, "nfstry");
 				trylater_delay *= 2;
 				/* Bump the slot's sequence# in the request being resent. */
 				if (slot != -1) {
 					mtx_lock(&sep->nfsess_mtx);
 					sep->nfsess_slotseq[slot]++;
 					*nd->nd_slotseq = txdr_unsigned(
 					    sep->nfsess_slotseq[slot]);
 					mtx_unlock(&sep->nfsess_mtx);
 				}
 				m_freem(nd->nd_mrep);
 				nd->nd_mrep = NULL;
 				goto tryagain;
 			}
 
 			/*
 			 * If the File Handle was stale, invalidate the
 			 * lookup cache, just in case.
 			 * (vp != NULL implies a client side call)
 			 */
 			if (nd->nd_repstat == ESTALE && vp != NULL) {
 				cache_purge(vp);
 				if (ncl_call_invalcaches != NULL)
 					(*ncl_call_invalcaches)(vp);
 			}
 		}
 		if ((nd->nd_flag & ND_NFSV4) != 0) {
 			/* Free the slot, as required. */
 			if (freeslot != -1)
 				nfsv4_freeslot(sep, freeslot);
 			/*
 			 * If this op is Putfh, throw its results away.
 			 */
 			if (j >= 10000)
 				NFSCL_DEBUG(1, "nop=%d nst=%d\n", i, j);
 			if (nmp != NULL && i == NFSV4OP_PUTFH && j == 0) {
 				NFSM_DISSECT(tl,u_int32_t *,2 * NFSX_UNSIGNED);
 				i = fxdr_unsigned(int, *tl++);
 				j = fxdr_unsigned(int, *tl);
 				if (j >= 10000)
 					NFSCL_DEBUG(1, "n2op=%d n2st=%d\n", i,
 					    j);
 				/*
 				 * All Compounds that do an Op that must
 				 * be in sequence consist of NFSV4OP_PUTFH
 				 * followed by one of these. As such, we
 				 * can determine if the seqid# should be
 				 * incremented, here.
 				 */
 				if ((i == NFSV4OP_OPEN ||
 				     i == NFSV4OP_OPENCONFIRM ||
 				     i == NFSV4OP_OPENDOWNGRADE ||
 				     i == NFSV4OP_CLOSE ||
 				     i == NFSV4OP_LOCK ||
 				     i == NFSV4OP_LOCKU) &&
 				    (j == 0 ||
 				     (j != NFSERR_STALECLIENTID &&
 				      j != NFSERR_STALESTATEID &&
 				      j != NFSERR_BADSTATEID &&
 				      j != NFSERR_BADSEQID &&
 				      j != NFSERR_BADXDR &&	 
 				      j != NFSERR_RESOURCE &&
 				      j != NFSERR_NOFILEHANDLE)))		 
 					nd->nd_flag |= ND_INCRSEQID;
 			}
 			/*
 			 * If this op's status is non-zero, mark
 			 * that there is no more data to process.
 			 */
 			if (j)
 				nd->nd_flag |= ND_NOMOREDATA;
 
 			/*
 			 * If R_DONTRECOVER is set, replace the stale error
 			 * reply, so that recovery isn't initiated.
 			 */
 			if ((nd->nd_repstat == NFSERR_STALECLIENTID ||
 			     nd->nd_repstat == NFSERR_BADSESSION ||
 			     nd->nd_repstat == NFSERR_STALESTATEID) &&
 			    rep != NULL && (rep->r_flags & R_DONTRECOVER))
 				nd->nd_repstat = NFSERR_STALEDONTRECOVER;
 		}
 	}
 
 #ifdef KDTRACE_HOOKS
 	if (nmp != NULL && dtrace_nfscl_nfs234_done_probe != NULL) {
 		uint32_t probe_id;
 		int probe_procnum;
 
 		if (nd->nd_flag & ND_NFSV4) {
 			probe_id = nfscl_nfs4_done_probes[nd->nd_procnum];
 			probe_procnum = nd->nd_procnum;
 		} else if (nd->nd_flag & ND_NFSV3) {
 			probe_id = nfscl_nfs3_done_probes[procnum];
 			probe_procnum = procnum;
 		} else {
 			probe_id = nfscl_nfs2_done_probes[nd->nd_procnum];
 			probe_procnum = procnum;
 		}
 		if (probe_id != 0)
 			(dtrace_nfscl_nfs234_done_probe)(probe_id, vp,
 			    nd->nd_mreq, cred, probe_procnum, 0);
 	}
 #endif
 
 	/* Success: the request is freed, the reply stays in nd->nd_mrep. */
 	m_freem(nd->nd_mreq);
 	if (usegssname == 0)
 		AUTH_DESTROY(auth);
 	if (rep != NULL)
 		FREE((caddr_t)rep, M_NFSDREQ);
 	if (set_sigset)
 		newnfs_restore_sigmask(td, &oldset);
 	return (0);
 	/*
 	 * Error exit while dissecting the reply (see the goto after
 	 * nfsm_advance(); NFSM_DISSECT presumably branches here as well).
 	 * Both the reply and the request are freed.
 	 */
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	mbuf_freem(nd->nd_mreq);
 	if (usegssname == 0)
 		AUTH_DESTROY(auth);
 	if (rep != NULL)
 		FREE((caddr_t)rep, M_NFSDREQ);
 	if (set_sigset)
 		newnfs_restore_sigmask(td, &oldset);
 	return (error);
 }
 
 /*
  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
  * wait for all requests to complete. This is used by forced unmounts
  * to terminate any outstanding RPCs.
  */
 int
 newnfs_nmcancelreqs(struct nfsmount *nmp)
 {
 
 	if (nmp->nm_sockreq.nr_client != NULL)
 		CLNT_CLOSE(nmp->nm_sockreq.nr_client);
 	return (0);
 }
 
 /*
  * Any signal that can interrupt an NFS operation in an intr mount
  * should be added to this set. SIGSTOP and SIGKILL cannot be masked.
  * The table is walked by nfs_sig_pending() and newnfs_set_sigmask(),
  * which size it with sizeof(newnfs_sig_set)/sizeof(int).
  * NOTE(review): SIGKILL is listed here even though the comment above
  * says it cannot be masked - presumably so that nfs_sig_pending()
  * reports it; confirm.
  */
 int newnfs_sig_set[] = {
 	SIGINT,
 	SIGTERM,
 	SIGHUP,
 	SIGKILL,
 	SIGQUIT
 };
 
 /*
  * Return 1 if the given signal set contains any of the signals listed
  * in newnfs_sig_set[] (the ones that may interrupt an intr mount),
  * otherwise return 0.
  */
 static int
 nfs_sig_pending(sigset_t set)
 {
 	int idx;
 
 	idx = 0;
 	while (idx < sizeof(newnfs_sig_set)/sizeof(int)) {
 		if (SIGISMEMBER(set, newnfs_sig_set[idx]))
 			return (1);
 		idx++;
 	}
 	return (0);
 }
  
 /*
  * The set/restore sigmask functions are used to (temporarily) overwrite
  * the thread td_sigmask during an RPC call (for example). These are also
  * used in other places in the NFS client that might tsleep().
  *
  * newnfs_set_sigmask() installs a mask that blocks everything except
  * those signals in newnfs_sig_set[] that the thread does not already
  * block and the process does not ignore. The previous mask is returned
  * in *oldset. A NULL td means curthread.
  */
 void
 newnfs_set_sigmask(struct thread *td, sigset_t *oldset)
 {
 	struct proc *p;
 	sigset_t newset;
 	int n, sig;
 
 	if (td == NULL)
 		td = curthread; /* XXX */
 	p = td->td_proc;
 
 	/* Start out with every signal blocked. */
 	SIGFILLSET(newset);
 
 	PROC_LOCK(p);
 	mtx_lock(&p->p_sigacts->ps_mtx);
 	for (n = 0; n < sizeof(newnfs_sig_set)/sizeof(int); n++) {
 		sig = newnfs_sig_set[n];
 		/* Leave it blocked if the thread already masks it. */
 		if (SIGISMEMBER(td->td_sigmask, sig))
 			continue;
 		/* Leave it blocked if the process ignores it. */
 		if (SIGISMEMBER(p->p_sigacts->ps_sigignore, sig))
 			continue;
 		SIGDELSET(newset, sig);
 	}
 	mtx_unlock(&p->p_sigacts->ps_mtx);
 	kern_sigprocmask(td, SIG_SETMASK, &newset, oldset,
 	    SIGPROCMASK_PROC_LOCKED);
 	PROC_UNLOCK(p);
 }
 
 /*
  * Put back the signal mask saved by newnfs_set_sigmask().
  * A NULL td means curthread.
  */
 void
 newnfs_restore_sigmask(struct thread *td, sigset_t *set)
 {
 	struct thread *target;
 
 	target = (td != NULL) ? td : curthread; /* XXX */
 	kern_sigprocmask(target, SIG_SETMASK, set, NULL, 0);
 }
 
 /*
  * NFS wrapper to msleep(), that shoves a new p_sigmask and restores the
  * old one after msleep() returns.
  * When PCATCH is not set in priority, the signal mask is left alone and
  * this is a plain msleep(). A NULL td means curthread.
  * Returns whatever msleep() returns.
  */
 int
 newnfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority,
     char *wmesg, int timo)
 {
 	sigset_t oldset;
 	int error;
 
 	/* Without PCATCH there is no need to touch the signal mask. */
 	if ((priority & PCATCH) == 0)
 		return (msleep(ident, mtx, priority, wmesg, timo));
 	if (td == NULL)
 		td = curthread; /* XXX */
 	newnfs_set_sigmask(td, &oldset);
 	error = msleep(ident, mtx, priority, wmesg, timo);
 	newnfs_restore_sigmask(td, &oldset);
 	return (error);
 }
 
 /*
  * Test for a termination condition pending on the process.
  * This is used for NFSMNT_INT mounts.
  * Returns EIO while a forced unmount is in progress, EINTR when one of
  * the signals in newnfs_sig_set[] is pending and neither masked by the
  * thread nor ignored by the process, and 0 otherwise (including for
  * mounts without NFSMNT_INT and for a NULL td).
  */
 int
 newnfs_sigintr(struct nfsmount *nmp, struct thread *td)
 {
 	struct proc *p;
 	sigset_t tmpset;
 	int result;
 
 	/* Terminate all requests while attempting a forced unmount. */
 	if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
 		return (EIO);
 	if ((nmp->nm_flag & NFSMNT_INT) == 0 || td == NULL)
 		return (0);
 
 	result = 0;
 	p = td->td_proc;
 	PROC_LOCK(p);
 	/* Pending on the process or the thread, minus masked signals... */
 	tmpset = p->p_siglist;
 	SIGSETOR(tmpset, td->td_siglist);
 	SIGSETNAND(tmpset, td->td_sigmask);
 	/* ...and minus the ignored ones. */
 	mtx_lock(&p->p_sigacts->ps_mtx);
 	SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
 	mtx_unlock(&p->p_sigacts->ps_mtx);
 	if (SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist)) {
 		if (nfs_sig_pending(tmpset))
 			result = EINTR;
 	}
 	PROC_UNLOCK(p);
 	return (result);
 }
 
 /*
  * tprintf() a LOG_INFO message about an NFS server to the process of td
  * (a NULL process pointer is passed when td is NULL). The error number
  * is appended when it is non-zero. Always returns 0.
  */
 static int
 nfs_msg(struct thread *td, const char *server, const char *msg, int error)
 {
 	struct proc *p = NULL;
 
 	if (td != NULL)
 		p = td->td_proc;
 	if (error == 0)
 		tprintf(p, LOG_INFO, "newnfs server %s: %s\n", server, msg);
 	else
 		tprintf(p, LOG_INFO, "newnfs server %s: %s, error %d\n",
 		    server, msg, error);
 	return (0);
 }
 
 /*
  * Note that the server of a mount is not responding.
  * For each of NFSSTA_TIMEO and NFSSTA_LOCKTIMEO that is requested in
  * flags and not yet set in nm_state, set it under nm_mtx and then
  * signal the matching vfs event (VQ_NOTRESP / VQ_NOTRESPLOCK) with the
  * mutex dropped. Finally print msg via nfs_msg().
  * Does nothing when nmp is NULL.
  */
 static void
 nfs_down(struct nfsmount *nmp, struct thread *td, const char *msg,
     int error, int flags)
 {
 	int notify;
 
 	if (nmp == NULL)
 		return;
 
 	mtx_lock(&nmp->nm_mtx);
 	notify = (flags & NFSSTA_TIMEO) != 0 &&
 	    (nmp->nm_state & NFSSTA_TIMEO) == 0;
 	if (notify)
 		nmp->nm_state |= NFSSTA_TIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (notify)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESP, 0);
 
 	mtx_lock(&nmp->nm_mtx);
 	notify = (flags & NFSSTA_LOCKTIMEO) != 0 &&
 	    (nmp->nm_state & NFSSTA_LOCKTIMEO) == 0;
 	if (notify)
 		nmp->nm_state |= NFSSTA_LOCKTIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (notify)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESPLOCK, 0);
 
 	nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
 }
 
 /*
  * Note that the server of a mount is responding again.
  * Print msg via nfs_msg() when tprintfmsg is non-zero. Then, for each of
  * NFSSTA_TIMEO and NFSSTA_LOCKTIMEO that is requested in flags and
  * currently set in nm_state, clear it under nm_mtx and signal the
  * matching vfs event (VQ_NOTRESP / VQ_NOTRESPLOCK) with the mutex
  * dropped. Does nothing when nmp is NULL.
  */
 static void
 nfs_up(struct nfsmount *nmp, struct thread *td, const char *msg,
     int flags, int tprintfmsg)
 {
 	int notify;
 
 	if (nmp == NULL)
 		return;
 	if (tprintfmsg)
 		nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
 
 	mtx_lock(&nmp->nm_mtx);
 	notify = (flags & NFSSTA_TIMEO) != 0 &&
 	    (nmp->nm_state & NFSSTA_TIMEO) != 0;
 	if (notify)
 		nmp->nm_state &= ~NFSSTA_TIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (notify)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESP, 1);
 
 	mtx_lock(&nmp->nm_mtx);
 	notify = (flags & NFSSTA_LOCKTIMEO) != 0 &&
 	    (nmp->nm_state & NFSSTA_LOCKTIMEO) != 0;
 	if (notify)
 		nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (notify)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESPLOCK, 1);
 }
 
Index: head/sys/fs/nfsclient/nfs_clbio.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clbio.c	(revision 258540)
+++ head/sys/fs/nfsclient/nfs_clbio.c	(revision 258541)
@@ -1,1887 +1,1885 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kernel.h>
 #include <sys/mount.h>
 #include <sys/rwlock.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 
 #include <fs/nfs/nfsport.h>
 #include <fs/nfsclient/nfsmount.h>
 #include <fs/nfsclient/nfs.h>
 #include <fs/nfsclient/nfsnode.h>
 #include <fs/nfsclient/nfs_kdtrace.h>
 
 extern int newnfs_directio_allow_mmap;
 extern struct nfsstats newnfsstats;
 extern struct mtx ncl_iod_mutex;
 extern int ncl_numasync;
 extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON];
 extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON];
 extern int newnfs_directio_enable;
 extern int nfs_keep_dirty_on_error;
 
 int ncl_pbuf_freecnt = -1;	/* start out unlimited */
 
 static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
     struct thread *td);
 static int nfs_directio_write(struct vnode *vp, struct uio *uiop,
     struct ucred *cred, int ioflag);
 
 /*
  * Vnode op for VM getpages.
  *
  * Fills the pages in ap->a_m (ap->a_count bytes, starting at the file
  * offset of pages[0]) with a single ncl_readrpc() call issued through a
  * temporary kernel mapping borrowed from a pbuf.
  *
  * Returns VM_PAGER_ERROR when the vnode has no VM object, when the vnode
  * is a regular file marked NNONCACHE while direct I/O is enabled and mmap
  * of such files is not allowed, or when the read RPC fails without
  * transferring any data.  Returns 0 otherwise.
  */
 int
 ncl_getpages(struct vop_getpages_args *ap)
 {
 	int i, error, nextoff, size, toff, count, npages;
 	struct uio uio;
 	struct iovec iov;
 	vm_offset_t kva;
 	struct buf *bp;
 	struct vnode *vp;
 	struct thread *td;
 	struct ucred *cred;
 	struct nfsmount *nmp;
 	vm_object_t object;
 	vm_page_t *pages;
 	struct nfsnode *np;
 
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 	td = curthread;				/* XXX */
 	cred = curthread->td_ucred;		/* XXX */
 	nmp = VFSTONFS(vp->v_mount);
 	pages = ap->a_m;
 	count = ap->a_count;
 
 	if ((object = vp->v_object) == NULL) {
 		ncl_printf("nfs_getpages: called with non-merged cache vnode??\n");
 		return (VM_PAGER_ERROR);
 	}
 
 	/* n_flag is examined under the nfsnode mutex. */
 	if (newnfs_directio_enable && !newnfs_directio_allow_mmap) {
 		mtx_lock(&np->n_mtx);
 		if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
 			mtx_unlock(&np->n_mtx);
 			ncl_printf("nfs_getpages: called on non-cacheable vnode??\n");
 			return (VM_PAGER_ERROR);
 		} else
 			mtx_unlock(&np->n_mtx);
 	}
 
 	/*
 	 * For an NFSv3 mount that has not fetched fsinfo yet, do so now.
 	 * The mount mutex is dropped before the call to ncl_fsinfo().
 	 */
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		/* We'll never get here for v4, because we always have fsinfo */
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 	} else
 		mtx_unlock(&nmp->nm_mtx);
 
 	npages = btoc(count);
 
 	/*
 	 * If the requested page is partially valid, just return it and
 	 * allow the pager to zero-out the blanks.  Partially valid pages
 	 * can only occur at the file EOF.
 	 */
 	VM_OBJECT_WLOCK(object);
 	if (pages[ap->a_reqpage]->valid != 0) {
 		/* Free every page except the requested one. */
 		for (i = 0; i < npages; ++i) {
 			if (i != ap->a_reqpage) {
 				vm_page_lock(pages[i]);
 				vm_page_free(pages[i]);
 				vm_page_unlock(pages[i]);
 			}
 		}
 		VM_OBJECT_WUNLOCK(object);
 		return (0);
 	}
 	VM_OBJECT_WUNLOCK(object);
 
 	/*
 	 * We use only the kva address for the buffer, but this is extremely
 	 * convenient and fast.
 	 */
 	bp = getpbuf(&ncl_pbuf_freecnt);
 
 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
 	PCPU_INC(cnt.v_vnodein);
 	PCPU_ADD(cnt.v_vnodepgsin, npages);
 
 	/* Describe the mapped pages as one kernel-space read of "count" bytes. */
 	iov.iov_base = (caddr_t) kva;
 	iov.iov_len = count;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
 	uio.uio_resid = count;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = td;
 
 	error = ncl_readrpc(vp, &uio, cred);
 	/* The mapping and the pbuf are released before the result is examined. */
 	pmap_qremove(kva, npages);
 
 	relpbuf(bp, &ncl_pbuf_freecnt);
 
 	/* Fail only if the RPC reported an error and transferred nothing. */
 	if (error && (uio.uio_resid == count)) {
 		ncl_printf("nfs_getpages: error %d\n", error);
 		VM_OBJECT_WLOCK(object);
 		for (i = 0; i < npages; ++i) {
 			if (i != ap->a_reqpage) {
 				vm_page_lock(pages[i]);
 				vm_page_free(pages[i]);
 				vm_page_unlock(pages[i]);
 			}
 		}
 		VM_OBJECT_WUNLOCK(object);
 		return (VM_PAGER_ERROR);
 	}
 
 	/*
 	 * Calculate the number of bytes read and validate only that number
 	 * of bytes.  Note that due to pending writes, size may be 0.  This
 	 * does not mean that the remaining data is invalid!
 	 */
 
 	size = count - uio.uio_resid;
 	VM_OBJECT_WLOCK(object);
 	/* toff/nextoff are the byte offsets of page i's start and end. */
 	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
 		vm_page_t m;
 		nextoff = toff + PAGE_SIZE;
 		m = pages[i];
 
 		if (nextoff <= size) {
 			/*
 			 * Read operation filled an entire page
 			 */
 			m->valid = VM_PAGE_BITS_ALL;
 			KASSERT(m->dirty == 0,
 			    ("nfs_getpages: page %p is dirty", m));
 		} else if (size > toff) {
 			/*
 			 * Read operation filled a partial page.
 			 */
 			m->valid = 0;
 			vm_page_set_valid_range(m, 0, size - toff);
 			KASSERT(m->dirty == 0,
 			    ("nfs_getpages: page %p is dirty", m));
 		} else {
 			/*
 			 * Read operation was short.  If no error
 			 * occurred we may have hit a zero-fill
 			 * section.  We leave valid set to 0, and page
 			 * is freed by vm_page_readahead_finish() if
 			 * its index is not equal to requested, or
 			 * page is zeroed and set valid by
 			 * vm_pager_get_pages() for requested page.
 			 */
 			;
 		}
 		if (i != ap->a_reqpage)
 			vm_page_readahead_finish(m);
 	}
 	VM_OBJECT_WUNLOCK(object);
 	return (0);
 }
 
 /*
  * Vnode op for VM putpages.
  *
  * Writes the pages in ap->a_m (ap->a_count bytes, starting at the file
  * offset of pages[0], clipped to the current n_size) with a single
  * ncl_writerpc() call issued through a temporary kernel mapping borrowed
  * from a pbuf.  Every entry of ap->a_rtvals starts out as VM_PAGER_ERROR;
  * the entries are updated through vnode_pager_undirty_pages() when the
  * write succeeded, or when it failed and nfs_keep_dirty_on_error is clear.
  *
  * Returns rtvals[0].
  */
 int
 ncl_putpages(struct vop_putpages_args *ap)
 {
 	struct uio uio;
 	struct iovec iov;
 	vm_offset_t kva;
 	struct buf *bp;
 	int iomode, must_commit, i, error, npages, count;
 	off_t offset;
 	int *rtvals;
 	struct vnode *vp;
 	struct thread *td;
 	struct ucred *cred;
 	struct nfsmount *nmp;
 	struct nfsnode *np;
 	vm_page_t *pages;
 
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 	td = curthread;				/* XXX */
 	/* Set the cred to n_writecred for the write rpcs. */
 	/* The reference taken here is dropped by crfree() after the RPC. */
 	if (np->n_writecred != NULL)
 		cred = crhold(np->n_writecred);
 	else
 		cred = crhold(curthread->td_ucred);	/* XXX */
 	nmp = VFSTONFS(vp->v_mount);
 	pages = ap->a_m;
 	count = ap->a_count;
 	rtvals = ap->a_rtvals;
 	npages = btoc(count);
 	offset = IDX_TO_OFF(pages[0]->pindex);
 
 	/*
 	 * For an NFSv3 mount that has not fetched fsinfo yet, do so now.
 	 * The mount mutex is dropped before the call to ncl_fsinfo().
 	 */
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 	} else
 		mtx_unlock(&nmp->nm_mtx);
 
 	/*
 	 * The non-cacheable case is only reported, not rejected; the node
 	 * mutex is dropped around the printf and re-taken afterwards.
 	 */
 	mtx_lock(&np->n_mtx);
 	if (newnfs_directio_enable && !newnfs_directio_allow_mmap &&
 	    (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
 		mtx_unlock(&np->n_mtx);
 		ncl_printf("ncl_putpages: called on noncache-able vnode??\n");
 		mtx_lock(&np->n_mtx);
 	}
 
 	for (i = 0; i < npages; i++)
 		rtvals[i] = VM_PAGER_ERROR;
 
 	/*
 	 * When putting pages, do not extend file past EOF.
 	 */
 	if (offset + count > np->n_size) {
 		count = np->n_size - offset;
 		if (count < 0)
 			count = 0;
 	}
 	mtx_unlock(&np->n_mtx);
 
 	/*
 	 * We use only the kva address for the buffer, but this is extremely
 	 * convenient and fast.
 	 */
 	bp = getpbuf(&ncl_pbuf_freecnt);
 
 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
 	PCPU_INC(cnt.v_vnodeout);
 	PCPU_ADD(cnt.v_vnodepgsout, count);
 
 	/* Describe the mapped pages as one kernel-space write of "count" bytes. */
 	iov.iov_base = (caddr_t) kva;
 	iov.iov_len = count;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = offset;
 	uio.uio_resid = count;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_WRITE;
 	uio.uio_td = td;
 
 	/* Unstable write unless the pager asked for a synchronous put. */
 	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
 	    iomode = NFSWRITE_UNSTABLE;
 	else
 	    iomode = NFSWRITE_FILESYNC;
 
 	error = ncl_writerpc(vp, &uio, cred, &iomode, &must_commit, 0);
 	crfree(cred);
 
 	pmap_qremove(kva, npages);
 	relpbuf(bp, &ncl_pbuf_freecnt);
 
 	/*
 	 * Pass the number of bytes actually written on to
 	 * vnode_pager_undirty_pages(), except when the write failed and
 	 * nfs_keep_dirty_on_error is set.
 	 */
 	if (error == 0 || !nfs_keep_dirty_on_error) {
 		vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid);
 		if (must_commit)
 			ncl_clearcommit(vp->v_mount);
 	}
 	return rtvals[0];
 }
 
 /*
  * For nfs, cache consistency can only be maintained approximately.
  * Although RFC1094 does not specify the criteria, the following is
  * believed to be compatible with the reference port.
  * For nfs:
  * If the file's modify time on the server has changed since the
  * last read rpc or you have written to the file,
  * you may have lost data cache consistency with the
  * server, so flush all of the file's data out of the cache.
  * Then force a getattr rpc to ensure that you have up to date
  * attributes.
  * NB: This implies that cache data can be read when up to
  * NFS_ATTRTIMEO seconds out of date. If you find that you need current
  * attributes this could be forced by setting n_attrstamp to 0 before
  * the VOP_GETATTR() call.
  *
  * Returns 0 on success, EBADF if the vnode is doomed, or the error from
  * ncl_vinvalbuf() / VOP_GETATTR().  The vnode lock is upgraded for the
  * duration of the check and handed back to ncl_downgrade_vnlock() on
  * every return path.
  */
 static inline int
 nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
 {
 	int error = 0;
 	struct vattr vattr;
 	struct nfsnode *np = VTONFS(vp);
 	int old_lock;
 
 	/*
 	 * Grab the exclusive lock before checking whether the cache is
 	 * consistent.
 	 * XXX - We can make this cheaper later (by acquiring cheaper locks).
 	 * But for now, this suffices.
 	 */
 	old_lock = ncl_upgrade_vnlock(vp);
 	if (vp->v_iflag & VI_DOOMED) {
 		ncl_downgrade_vnlock(vp, old_lock);
 		return (EBADF);
 	}
 
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NMODIFIED) {
 		/*
 		 * We have written to the file.  For a directory, drop the
 		 * cached directory state and flush its buffers; then force
 		 * a fresh getattr and remember the new modify time.
 		 */
 		mtx_unlock(&np->n_mtx);
 		if (vp->v_type != VREG) {
 			if (vp->v_type != VDIR)
 				panic("nfs: bioread, not dir");
 			ncl_invaldir(vp);
 			error = ncl_vinvalbuf(vp, V_SAVE, td, 1);
 			if (error)
 				goto out;
 		}
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		if (error)
 			goto out;
 		mtx_lock(&np->n_mtx);
 		np->n_mtime = vattr.va_mtime;
 		mtx_unlock(&np->n_mtx);
 	} else {
 		/*
 		 * Not modified locally.  Fetch the attributes and flush the
 		 * cache if the size changed or the modify time differs from
 		 * the one recorded in the nfsnode.
 		 */
 		mtx_unlock(&np->n_mtx);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		if (error) {
 			/*
 			 * Leave through "out" so that the vnode lock state
 			 * is restored, as on every other path taken after
 			 * the upgrade above.
 			 */
 			goto out;
 		}
 		mtx_lock(&np->n_mtx);
 		if ((np->n_flag & NSIZECHANGED)
 		    || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
 			mtx_unlock(&np->n_mtx);
 			if (vp->v_type == VDIR)
 				ncl_invaldir(vp);
 			error = ncl_vinvalbuf(vp, V_SAVE, td, 1);
 			if (error)
 				goto out;
 			mtx_lock(&np->n_mtx);
 			np->n_mtime = vattr.va_mtime;
 			np->n_flag &= ~NSIZECHANGED;
 		}
 		mtx_unlock(&np->n_mtx);
 	}
 out:
 	ncl_downgrade_vnlock(vp, old_lock);
 	return (error);
 }
 
 /*
  * Vnode op for read using bio
  *
  * Copies data for "uio" out of buffer-cache blocks obtained through
  * nfs_getcacheblk(), issuing a read via ncl_doio() for any block that is
  * not already B_CACHE.  Regular files, symbolic links and directories
  * are handled by separate cases of the switch below.  When direct I/O is
  * enabled and IO_DIRECT is set on a regular file, the cache is bypassed
  * and the request is passed straight to ncl_readrpc().
  *
  * Returns 0 on success (including a zero-length request and directory
  * EOF), EINVAL for a negative offset, EFBIG when a non-directory read
  * would exceed nm_maxfilesize or the offset arithmetic wraps, an error
  * from the consistency check or the I/O, or newnfs_sigintr()'s error
  * (EINTR if it reports none) when a cache block cannot be obtained.
  */
 int
 ncl_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
 {
 	struct nfsnode *np = VTONFS(vp);
 	int biosize, i;
 	struct buf *bp, *rabp;
 	struct thread *td;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	daddr_t lbn, rabn;
 	int bcount;
 	int seqcount;
 	int nra, error = 0, n = 0, on = 0;
 	off_t tmp_off;
 
 	KASSERT(uio->uio_rw == UIO_READ, ("ncl_read mode"));
 	if (uio->uio_resid == 0)
 		return (0);
 	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
 		return (EINVAL);
 	td = uio->uio_td;
 
 	/*
 	 * For an NFSv3 mount that has not fetched fsinfo yet, do so now.
 	 * The mount mutex is dropped around ncl_fsinfo() and re-taken, so
 	 * it is held on both paths for the checks that follow.
 	 */
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 		mtx_lock(&nmp->nm_mtx);
 	}
 	if (nmp->nm_rsize == 0 || nmp->nm_readdirsize == 0)
 		(void) newnfs_iosize(nmp);
 
 	/* Reject reads past the maximum file size or with a wrapped offset. */
 	tmp_off = uio->uio_offset + uio->uio_resid;
 	if (vp->v_type != VDIR &&
 	    (tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset)) {
 		mtx_unlock(&nmp->nm_mtx);
 		return (EFBIG);
 	}
 	mtx_unlock(&nmp->nm_mtx);
 
 	if (newnfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
 		/* No caching/ no readaheads. Just read data into the user buffer */
 		return ncl_readrpc(vp, uio, cred);
 
 	biosize = vp->v_bufobj.bo_bsize;
 	/* seqcount bounds the number of read-ahead blocks started below. */
 	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
 
 	error = nfs_bioread_check_cons(vp, td, cred);
 	if (error)
 		return error;
 
 	/*
 	 * Each pass handles one cache block: "bp" is the block, "on" the
 	 * offset within it and "n" the number of bytes to copy out.  The
 	 * loop ends on error, when the request is satisfied, or when a
 	 * pass has nothing to copy.
 	 */
 	do {
 	    u_quad_t nsize;
 
 	    /* Snapshot the file size under the node mutex. */
 	    mtx_lock(&np->n_mtx);
 	    nsize = np->n_size;
 	    mtx_unlock(&np->n_mtx);
 
 	    switch (vp->v_type) {
 	    case VREG:
 		NFSINCRGLOBAL(newnfsstats.biocache_reads);
 		lbn = uio->uio_offset / biosize;
 		on = uio->uio_offset - (lbn * biosize);
 
 		/*
 		 * Start the read ahead(s), as required.
 		 */
 		if (nmp->nm_readahead > 0) {
 		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
 			(off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
 			rabn = lbn + 1 + nra;
 			if (incore(&vp->v_bufobj, rabn) == NULL) {
 			    rabp = nfs_getcacheblk(vp, rabn, biosize, td);
 			    if (!rabp) {
 				error = newnfs_sigintr(nmp, td);
 				return (error ? error : EINTR);
 			    }
 			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
 				rabp->b_flags |= B_ASYNC;
 				rabp->b_iocmd = BIO_READ;
 				vfs_busy_pages(rabp, 0);
 				/*
 				 * If the async request cannot be queued,
 				 * invalidate the block and stop starting
 				 * further read-aheads.
 				 */
 				if (ncl_asyncio(nmp, rabp, cred, td)) {
 				    rabp->b_flags |= B_INVAL;
 				    rabp->b_ioflags |= BIO_ERROR;
 				    vfs_unbusy_pages(rabp);
 				    brelse(rabp);
 				    break;
 				}
 			    } else {
 				brelse(rabp);
 			    }
 			}
 		    }
 		}
 
 		/* Note that bcount is *not* DEV_BSIZE aligned. */
 		bcount = biosize;
 		if ((off_t)lbn * biosize >= nsize) {
 			bcount = 0;
 		} else if ((off_t)(lbn + 1) * biosize > nsize) {
 			bcount = nsize - (off_t)lbn * biosize;
 		}
 		bp = nfs_getcacheblk(vp, lbn, bcount, td);
 
 		if (!bp) {
 			error = newnfs_sigintr(nmp, td);
 			return (error ? error : EINTR);
 		}
 
 		/*
 		 * If B_CACHE is not set, we must issue the read.  If this
 		 * fails, we return an error.
 		 */
 
 		if ((bp->b_flags & B_CACHE) == 0) {
 		    bp->b_iocmd = BIO_READ;
 		    vfs_busy_pages(bp, 0);
 		    error = ncl_doio(vp, bp, cred, td, 0);
 		    if (error) {
 			brelse(bp);
 			return (error);
 		    }
 		}
 
 		/*
 		 * on is the offset into the current bp.  Figure out how many
 		 * bytes we can copy out of the bp.  Note that bcount is
 		 * NOT DEV_BSIZE aligned.
 		 *
 		 * Then figure out how many bytes we can copy into the uio.
 		 */
 
 		n = 0;
 		if (on < bcount)
 			n = MIN((unsigned)(bcount - on), uio->uio_resid);
 		break;
 	    case VLNK:
 		/* A symbolic link is cached as a single block 0. */
 		NFSINCRGLOBAL(newnfsstats.biocache_readlinks);
 		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
 		if (!bp) {
 			error = newnfs_sigintr(nmp, td);
 			return (error ? error : EINTR);
 		}
 		if ((bp->b_flags & B_CACHE) == 0) {
 		    bp->b_iocmd = BIO_READ;
 		    vfs_busy_pages(bp, 0);
 		    error = ncl_doio(vp, bp, cred, td, 0);
 		    if (error) {
 			bp->b_ioflags |= BIO_ERROR;
 			brelse(bp);
 			return (error);
 		    }
 		}
 		n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
 		on = 0;
 		break;
 	    case VDIR:
 		NFSINCRGLOBAL(newnfsstats.biocache_readdirs);
 		/* At or past the recorded directory EOF: nothing to return. */
 		if (np->n_direofoffset
 		    && uio->uio_offset >= np->n_direofoffset) {
 		    return (0);
 		}
 		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
 		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
 		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
 		if (!bp) {
 		    error = newnfs_sigintr(nmp, td);
 		    return (error ? error : EINTR);
 		}
 		if ((bp->b_flags & B_CACHE) == 0) {
 		    bp->b_iocmd = BIO_READ;
 		    vfs_busy_pages(bp, 0);
 		    error = ncl_doio(vp, bp, cred, td, 0);
 		    if (error) {
 			    brelse(bp);
 		    }
 		    while (error == NFSERR_BAD_COOKIE) {
 			ncl_invaldir(vp);
 			error = ncl_vinvalbuf(vp, 0, td, 1);
 			/*
 			 * Yuck! The directory has been modified on the
 			 * server. The only way to get the block is by
 			 * reading from the beginning to get all the
 			 * offset cookies.
 			 *
 			 * Leave the last bp intact unless there is an error.
 			 * Loop back up to the while if the error is another
 			 * NFSERR_BAD_COOKIE (double yuch!).
 			 */
 			for (i = 0; i <= lbn && !error; i++) {
 			    if (np->n_direofoffset
 				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
 				    return (0);
 			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
 			    if (!bp) {
 				error = newnfs_sigintr(nmp, td);
 				return (error ? error : EINTR);
 			    }
 			    if ((bp->b_flags & B_CACHE) == 0) {
 				    bp->b_iocmd = BIO_READ;
 				    vfs_busy_pages(bp, 0);
 				    error = ncl_doio(vp, bp, cred, td, 0);
 				    /*
 				     * no error + B_INVAL == directory EOF,
 				     * use the block.
 				     */
 				    if (error == 0 && (bp->b_flags & B_INVAL))
 					    break;
 			    }
 			    /*
 			     * An error will throw away the block and the
 			     * for loop will break out.  If no error and this
 			     * is not the block we want, we throw away the
 			     * block and go for the next one via the for loop.
 			     */
 			    if (error || i < lbn)
 				    brelse(bp);
 			}
 		    }
 		    /*
 		     * The above while is repeated if we hit another cookie
 		     * error.  If we hit an error and it wasn't a cookie error,
 		     * we give up.
 		     */
 		    if (error)
 			    return (error);
 		}
 
 		/*
 		 * If not eof and read aheads are enabled, start one.
 		 * (You need the current block first, so that you have the
 		 *  directory offset cookie of the next block.)
 		 */
 		if (nmp->nm_readahead > 0 &&
 		    (bp->b_flags & B_INVAL) == 0 &&
 		    (np->n_direofoffset == 0 ||
 		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
 		    incore(&vp->v_bufobj, lbn + 1) == NULL) {
 			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
 			if (rabp) {
 			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
 				rabp->b_flags |= B_ASYNC;
 				rabp->b_iocmd = BIO_READ;
 				vfs_busy_pages(rabp, 0);
 				if (ncl_asyncio(nmp, rabp, cred, td)) {
 				    rabp->b_flags |= B_INVAL;
 				    rabp->b_ioflags |= BIO_ERROR;
 				    vfs_unbusy_pages(rabp);
 				    brelse(rabp);
 				}
 			    } else {
 				brelse(rabp);
 			    }
 			}
 		}
 		/*
 		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
 		 * chopped for the EOF condition, we cannot tell how large
 		 * NFS directories are going to be until we hit EOF.  So
 		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
 		 * it just so happens that b_resid will effectively chop it
 		 * to EOF.  *BUT* this information is lost if the buffer goes
 		 * away and is reconstituted into a B_CACHE state ( due to
 		 * being VMIO ) later.  So we keep track of the directory eof
 		 * in np->n_direofoffset and chop it off as an extra step
 		 * right here.
 		 */
 		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
 		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
 			n = np->n_direofoffset - uio->uio_offset;
 		break;
 	    default:
 		ncl_printf(" ncl_bioread: type %x unexpected\n", vp->v_type);
 		bp = NULL;
 		break;
 	    };
 
 	    /* Copy out this pass's data, then release the block. */
 	    if (n > 0) {
 		    error = vn_io_fault_uiomove(bp->b_data + on, (int)n, uio);
 	    }
 	    /* A symbolic link is a single block, so force the loop to end. */
 	    if (vp->v_type == VLNK)
 		n = 0;
 	    if (bp != NULL)
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n > 0);
 	return (error);
 }
 
 /*
  * The NFS write path cannot handle iovecs with len > 1. So we need to
  * break up iovecs accordingly (restricting them to wsize).
  * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf).
  * For the ASYNC case, 2 copies are needed. The first a copy from the
  * user buffer to a staging buffer and then a second copy from the staging
  * buffer to mbufs. This can be optimized by copying from the user buffer
  * directly into mbufs and passing the chain down, but that requires a
  * fair amount of re-working of the relevant codepaths (and can be done
  * later).
  */
 static int
 nfs_directio_write(vp, uiop, cred, ioflag)
 	struct vnode *vp;
 	struct uio *uiop;
 	struct ucred *cred;
 	int ioflag;
 {
 	int error;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct thread *td = uiop->uio_td;
 	int size;
 	int wsize;
 
 	mtx_lock(&nmp->nm_mtx);
 	wsize = nmp->nm_wsize;
 	mtx_unlock(&nmp->nm_mtx);
 	if (ioflag & IO_SYNC) {
 		int iomode, must_commit;
 		struct uio uio;
 		struct iovec iov;
 do_sync:
 		while (uiop->uio_resid > 0) {
 			size = MIN(uiop->uio_resid, wsize);
 			size = MIN(uiop->uio_iov->iov_len, size);
 			iov.iov_base = uiop->uio_iov->iov_base;
 			iov.iov_len = size;
 			uio.uio_iov = &iov;
 			uio.uio_iovcnt = 1;
 			uio.uio_offset = uiop->uio_offset;
 			uio.uio_resid = size;
 			uio.uio_segflg = UIO_USERSPACE;
 			uio.uio_rw = UIO_WRITE;
 			uio.uio_td = td;
 			iomode = NFSWRITE_FILESYNC;
 			error = ncl_writerpc(vp, &uio, cred, &iomode,
 			    &must_commit, 0);
 			KASSERT((must_commit == 0),
 				("ncl_directio_write: Did not commit write"));
 			if (error)
 				return (error);
 			uiop->uio_offset += size;
 			uiop->uio_resid -= size;
 			if (uiop->uio_iov->iov_len <= size) {
 				uiop->uio_iovcnt--;
 				uiop->uio_iov++;
 			} else {
 				uiop->uio_iov->iov_base =
 					(char *)uiop->uio_iov->iov_base + size;
 				uiop->uio_iov->iov_len -= size;
 			}
 		}
 	} else {
 		struct uio *t_uio;
 		struct iovec *t_iov;
 		struct buf *bp;
 
 		/*
 		 * Break up the write into blocksize chunks and hand these
 		 * over to nfsiod's for write back.
 		 * Unfortunately, this incurs a copy of the data. Since
 		 * the user could modify the buffer before the write is
 		 * initiated.
 		 *
 		 * The obvious optimization here is that one of the 2 copies
 		 * in the async write path can be eliminated by copying the
 		 * data here directly into mbufs and passing the mbuf chain
 		 * down. But that will require a fair amount of re-working
 		 * of the code and can be done if there's enough interest
 		 * in NFS directio access.
 		 */
 		while (uiop->uio_resid > 0) {
 			size = MIN(uiop->uio_resid, wsize);
 			size = MIN(uiop->uio_iov->iov_len, size);
 			bp = getpbuf(&ncl_pbuf_freecnt);
 			t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
 			t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
 			t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
 			t_iov->iov_len = size;
 			t_uio->uio_iov = t_iov;
 			t_uio->uio_iovcnt = 1;
 			t_uio->uio_offset = uiop->uio_offset;
 			t_uio->uio_resid = size;
 			t_uio->uio_segflg = UIO_SYSSPACE;
 			t_uio->uio_rw = UIO_WRITE;
 			t_uio->uio_td = td;
 			KASSERT(uiop->uio_segflg == UIO_USERSPACE ||
 			    uiop->uio_segflg == UIO_SYSSPACE,
 			    ("nfs_directio_write: Bad uio_segflg"));
 			if (uiop->uio_segflg == UIO_USERSPACE) {
 				error = copyin(uiop->uio_iov->iov_base,
 				    t_iov->iov_base, size);
 				if (error != 0)
 					goto err_free;
 			} else
 				/*
 				 * UIO_SYSSPACE may never happen, but handle
 				 * it just in case it does.
 				 */
 				bcopy(uiop->uio_iov->iov_base, t_iov->iov_base,
 				    size);
 			bp->b_flags |= B_DIRECT;
 			bp->b_iocmd = BIO_WRITE;
 			if (cred != NOCRED) {
 				crhold(cred);
 				bp->b_wcred = cred;
 			} else
 				bp->b_wcred = NOCRED;
 			bp->b_caller1 = (void *)t_uio;
 			bp->b_vp = vp;
 			error = ncl_asyncio(nmp, bp, NOCRED, td);
 err_free:
 			if (error) {
 				free(t_iov->iov_base, M_NFSDIRECTIO);
 				free(t_iov, M_NFSDIRECTIO);
 				free(t_uio, M_NFSDIRECTIO);
 				bp->b_vp = NULL;
 				relpbuf(bp, &ncl_pbuf_freecnt);
 				if (error == EINTR)
 					return (error);
 				goto do_sync;
 			}
 			uiop->uio_offset += size;
 			uiop->uio_resid -= size;
 			if (uiop->uio_iov->iov_len <= size) {
 				uiop->uio_iovcnt--;
 				uiop->uio_iov++;
 			} else {
 				uiop->uio_iov->iov_base =
 					(char *)uiop->uio_iov->iov_base + size;
 				uiop->uio_iov->iov_len -= size;
 			}
 		}
 	}
 	return (0);
 }
 
 /*
  * Vnode op for write using bio
  */
 int
 ncl_write(struct vop_write_args *ap)
 {
 	int biosize;
 	struct uio *uio = ap->a_uio;
 	struct thread *td = uio->uio_td;
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct ucred *cred = ap->a_cred;
 	int ioflag = ap->a_ioflag;
 	struct buf *bp;
 	struct vattr vattr;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	daddr_t lbn;
 	int bcount;
 	int bp_cached, n, on, error = 0, error1;
 	size_t orig_resid, local_resid;
 	off_t orig_size, tmp_off;
 
 	KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode"));
 	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
 	    ("ncl_write proc"));
 	if (vp->v_type != VREG)
 		return (EIO);
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NWRITEERR) {
 		np->n_flag &= ~NWRITEERR;
 		mtx_unlock(&np->n_mtx);
 		return (np->n_error);
 	} else
 		mtx_unlock(&np->n_mtx);
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 		mtx_lock(&nmp->nm_mtx);
 	}
 	if (nmp->nm_wsize == 0)
 		(void) newnfs_iosize(nmp);
 	mtx_unlock(&nmp->nm_mtx);
 
 	/*
 	 * Synchronously flush pending buffers if we are in synchronous
 	 * mode or if we are appending.
 	 */
 	if (ioflag & (IO_APPEND | IO_SYNC)) {
 		mtx_lock(&np->n_mtx);
 		if (np->n_flag & NMODIFIED) {
 			mtx_unlock(&np->n_mtx);
 #ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */
 			/*
 			 * Require non-blocking, synchronous writes to
 			 * dirty files to inform the program it needs
 			 * to fsync(2) explicitly.
 			 */
 			if (ioflag & IO_NDELAY)
 				return (EAGAIN);
 #endif
 flush_and_restart:
 			np->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 			error = ncl_vinvalbuf(vp, V_SAVE, td, 1);
 			if (error)
 				return (error);
 		} else
 			mtx_unlock(&np->n_mtx);
 	}
 
 	orig_resid = uio->uio_resid;
 	mtx_lock(&np->n_mtx);
 	orig_size = np->n_size;
 	mtx_unlock(&np->n_mtx);
 
 	/*
 	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
 	 * get the append lock.
 	 */
 	if (ioflag & IO_APPEND) {
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		if (error)
 			return (error);
 		mtx_lock(&np->n_mtx);
 		uio->uio_offset = np->n_size;
 		mtx_unlock(&np->n_mtx);
 	}
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	tmp_off = uio->uio_offset + uio->uio_resid;
 	if (tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset)
 		return (EFBIG);
 	if (uio->uio_resid == 0)
 		return (0);
 
 	if (newnfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG)
 		return nfs_directio_write(vp, uio, cred, ioflag);
 
 	/*
 	 * Maybe this should be above the vnode op call, but so long as
 	 * file servers have no limits, i don't think it matters
 	 */
 	if (vn_rlimit_fsize(vp, uio, td))
 		return (EFBIG);
 
 	biosize = vp->v_bufobj.bo_bsize;
 	/*
 	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
 	 * would exceed the local maximum per-file write commit size when
 	 * combined with those, we must decide whether to flush,
 	 * go synchronous, or return error.  We don't bother checking
 	 * IO_UNIT -- we just make all writes atomic anyway, as there's
 	 * no point optimizing for something that really won't ever happen.
 	 */
 	if (!(ioflag & IO_SYNC)) {
 		int nflag;
 
 		mtx_lock(&np->n_mtx);
 		nflag = np->n_flag;
 		mtx_unlock(&np->n_mtx);
 		int needrestart = 0;
 		if (nmp->nm_wcommitsize < uio->uio_resid) {
 			/*
 			 * If this request could not possibly be completed
 			 * without exceeding the maximum outstanding write
 			 * commit size, see if we can convert it into a
 			 * synchronous write operation.
 			 */
 			if (ioflag & IO_NDELAY)
 				return (EAGAIN);
 			ioflag |= IO_SYNC;
 			if (nflag & NMODIFIED)
 				needrestart = 1;
 		} else if (nflag & NMODIFIED) {
 			int wouldcommit = 0;
 			BO_LOCK(&vp->v_bufobj);
 			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
 				TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
 				    b_bobufs) {
 					if (bp->b_flags & B_NEEDCOMMIT)
 						wouldcommit += bp->b_bcount;
 				}
 			}
 			BO_UNLOCK(&vp->v_bufobj);
 			/*
 			 * Since we're not operating synchronously and
 			 * bypassing the buffer cache, we are in a commit
 			 * and holding all of these buffers whether
 			 * transmitted or not.  If not limited, this
 			 * will lead to the buffer cache deadlocking,
 			 * as no one else can flush our uncommitted buffers.
 			 */
 			wouldcommit += uio->uio_resid;
 			/*
 			 * If we would initially exceed the maximum
 			 * outstanding write commit size, flush and restart.
 			 */
 			if (wouldcommit > nmp->nm_wcommitsize)
 				needrestart = 1;
 		}
 		if (needrestart)
 			goto flush_and_restart;
 	}
 
 	do {
 		NFSINCRGLOBAL(newnfsstats.biocache_writes);
 		lbn = uio->uio_offset / biosize;
 		on = uio->uio_offset - (lbn * biosize);
 		n = MIN((unsigned)(biosize - on), uio->uio_resid);
 again:
 		/*
 		 * Handle direct append and file extension cases, calculate
 		 * unaligned buffer size.
 		 */
 		mtx_lock(&np->n_mtx);
 		if (uio->uio_offset == np->n_size && n) {
 			mtx_unlock(&np->n_mtx);
 			/*
 			 * Get the buffer (in its pre-append state to maintain
 			 * B_CACHE if it was previously set).  Resize the
 			 * nfsnode after we have locked the buffer to prevent
 			 * readers from reading garbage.
 			 */
 			bcount = on;
 			bp = nfs_getcacheblk(vp, lbn, bcount, td);
 
 			if (bp != NULL) {
 				long save;
 
 				mtx_lock(&np->n_mtx);
 				np->n_size = uio->uio_offset + n;
 				np->n_flag |= NMODIFIED;
 				vnode_pager_setsize(vp, np->n_size);
 				mtx_unlock(&np->n_mtx);
 
 				save = bp->b_flags & B_CACHE;
 				bcount += n;
 				allocbuf(bp, bcount);
 				bp->b_flags |= save;
 			}
 		} else {
 			/*
 			 * Obtain the locked cache block first, and then
 			 * adjust the file's size as appropriate.
 			 */
 			bcount = on + n;
 			if ((off_t)lbn * biosize + bcount < np->n_size) {
 				if ((off_t)(lbn + 1) * biosize < np->n_size)
 					bcount = biosize;
 				else
 					bcount = np->n_size - (off_t)lbn * biosize;
 			}
 			mtx_unlock(&np->n_mtx);
 			bp = nfs_getcacheblk(vp, lbn, bcount, td);
 			mtx_lock(&np->n_mtx);
 			if (uio->uio_offset + n > np->n_size) {
 				np->n_size = uio->uio_offset + n;
 				np->n_flag |= NMODIFIED;
 				vnode_pager_setsize(vp, np->n_size);
 			}
 			mtx_unlock(&np->n_mtx);
 		}
 
 		if (!bp) {
 			error = newnfs_sigintr(nmp, td);
 			if (!error)
 				error = EINTR;
 			break;
 		}
 
 		/*
 		 * Issue a READ if B_CACHE is not set.  In special-append
 		 * mode, B_CACHE is based on the buffer prior to the write
 		 * op and is typically set, avoiding the read.  If a read
 		 * is required in special append mode, the server will
 		 * probably send us a short-read since we extended the file
 		 * on our end, resulting in b_resid == 0 and, thusly,
 		 * B_CACHE getting set.
 		 *
 		 * We can also avoid issuing the read if the write covers
 		 * the entire buffer.  We have to make sure the buffer state
 		 * is reasonable in this case since we will not be initiating
 		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
 		 * more information.
 		 *
 		 * B_CACHE may also be set due to the buffer being cached
 		 * normally.
 		 */
 
 		bp_cached = 1;
 		if (on == 0 && n == bcount) {
 			if ((bp->b_flags & B_CACHE) == 0)
 				bp_cached = 0;
 			bp->b_flags |= B_CACHE;
 			bp->b_flags &= ~B_INVAL;
 			bp->b_ioflags &= ~BIO_ERROR;
 		}
 
 		if ((bp->b_flags & B_CACHE) == 0) {
 			bp->b_iocmd = BIO_READ;
 			vfs_busy_pages(bp, 0);
 			error = ncl_doio(vp, bp, cred, td, 0);
 			if (error) {
 				brelse(bp);
 				break;
 			}
 		}
 		if (bp->b_wcred == NOCRED)
 			bp->b_wcred = crhold(cred);
 		mtx_lock(&np->n_mtx);
 		np->n_flag |= NMODIFIED;
 		mtx_unlock(&np->n_mtx);
 
 		/*
 		 * If dirtyend exceeds file size, chop it down.  This should
 		 * not normally occur but there is an append race where it
 		 * might occur XXX, so we log it.
 		 *
 		 * If the chopping creates a reverse-indexed or degenerate
 		 * situation with dirtyoff/end, we 0 both of them.
 		 */
 
 		if (bp->b_dirtyend > bcount) {
 			ncl_printf("NFS append race @%lx:%d\n",
 			    (long)bp->b_blkno * DEV_BSIZE,
 			    bp->b_dirtyend - bcount);
 			bp->b_dirtyend = bcount;
 		}
 
 		if (bp->b_dirtyoff >= bp->b_dirtyend)
 			bp->b_dirtyoff = bp->b_dirtyend = 0;
 
 		/*
 		 * If the new write will leave a contiguous dirty
 		 * area, just update the b_dirtyoff and b_dirtyend,
 		 * otherwise force a write rpc of the old dirty area.
 		 *
 		 * While it is possible to merge discontiguous writes due to
 		 * our having a B_CACHE buffer ( and thus valid read data
 		 * for the hole), we don't because it could lead to
 		 * significant cache coherency problems with multiple clients,
 		 * especially if locking is implemented later on.
 		 *
 		 * As an optimization we could theoretically maintain
 		 * a linked list of discontinuous areas, but we would still
 		 * have to commit them separately so there isn't much
 		 * advantage to it except perhaps a bit of asynchronization.
 		 */
 
 		if (bp->b_dirtyend > 0 &&
 		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
 			if (bwrite(bp) == EINTR) {
 				error = EINTR;
 				break;
 			}
 			goto again;
 		}
 
 		local_resid = uio->uio_resid;
 		error = vn_io_fault_uiomove((char *)bp->b_data + on, n, uio);
 
 		if (error != 0 && !bp_cached) {
 			/*
 			 * This block has no other content then what
 			 * possibly was written by the faulty uiomove.
 			 * Release it, forgetting the data pages, to
 			 * prevent the leak of uninitialized data to
 			 * usermode.
 			 */
 			bp->b_ioflags |= BIO_ERROR;
 			brelse(bp);
 			uio->uio_offset -= local_resid - uio->uio_resid;
 			uio->uio_resid = local_resid;
 			break;
 		}
 
 		/*
 		 * Since this block is being modified, it must be written
 		 * again and not just committed.  Since write clustering does
 		 * not work for the stage 1 data write, only the stage 2
 		 * commit rpc, we have to clear B_CLUSTEROK as well.
 		 */
 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 
 		/*
 		 * Get the partial update on the progress made from
 		 * uiomove, if an error occured.
 		 */
 		if (error != 0)
 			n = local_resid - uio->uio_resid;
 
 		/*
 		 * Only update dirtyoff/dirtyend if not a degenerate
 		 * condition.
 		 */
 		if (n > 0) {
 			if (bp->b_dirtyend > 0) {
 				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
 				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
 			} else {
 				bp->b_dirtyoff = on;
 				bp->b_dirtyend = on + n;
 			}
 			vfs_bio_set_valid(bp, on, n);
 		}
 
 		/*
 		 * If IO_SYNC do bwrite().
 		 *
 		 * IO_INVAL appears to be unused.  The idea appears to be
 		 * to turn off caching in this case.  Very odd.  XXX
 		 */
 		if ((ioflag & IO_SYNC)) {
 			if (ioflag & IO_INVAL)
 				bp->b_flags |= B_NOCACHE;
 			error1 = bwrite(bp);
 			if (error1 != 0) {
 				if (error == 0)
 					error = error1;
 				break;
 			}
 		} else if ((n + on) == biosize) {
 			bp->b_flags |= B_ASYNC;
 			(void) ncl_writebp(bp, 0, NULL);
 		} else {
 			bdwrite(bp);
 		}
 
 		if (error != 0)
 			break;
 	} while (uio->uio_resid > 0 && n > 0);
 
 	if (error != 0) {
 		if (ioflag & IO_UNIT) {
 			VATTR_NULL(&vattr);
 			vattr.va_size = orig_size;
 			/* IO_SYNC is handled implicitely */
 			(void)VOP_SETATTR(vp, &vattr, cred);
 			uio->uio_offset -= orig_resid - uio->uio_resid;
 			uio->uio_resid = orig_resid;
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Return the busy buffer cache block for logical block 'bn' of 'vp',
  * sized to 'size' bytes.  A block that is not currently in the cache
  * is allocated.
  *
  * On a mount with NFSMNT_INT set, the first getblk() sleeps with PCATCH.
  * If that yields no buffer, getblk() is retried with a 2 * hz timeout for
  * as long as newnfs_sigintr() reports nothing pending; once it reports an
  * interruption this function returns NULL.  On any other mount a single
  * plain getblk() call is made.
  *
  * The caller must carefully deal with the possible B_INVAL state of
  * the buffer.  ncl_doio() clears B_INVAL (and ncl_asyncio() clears it
  * indirectly), so synchronous reads can be issued without worrying about
  * the B_INVAL state.  Writes that extend a file past its EOF need more
  * care (see the comments in nfs_write()).
  */
 static struct buf *
 nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
 {
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct buf *blk;
 
 	if ((nmp->nm_flag & NFSMNT_INT) == 0) {
 		blk = getblk(vp, bn, size, 0, 0, 0);
 	} else {
 		sigset_t saved_mask;
 
 		newnfs_set_sigmask(td, &saved_mask);
 		blk = getblk(vp, bn, size, PCATCH, 0, 0);
 		newnfs_restore_sigmask(td, &saved_mask);
 
 		/* Keep retrying with a timeout until an interrupt is seen. */
 		for (; blk == NULL; blk = getblk(vp, bn, size, 0, 2 * hz, 0)) {
 			if (newnfs_sigintr(nmp, td))
 				return (NULL);
 		}
 	}
 
 	/* Regular files: express the block number in DEV_BSIZE units. */
 	if (vp->v_type == VREG)
 		blk->b_blkno = bn * (vp->v_bufobj.bo_bsize / DEV_BSIZE);
 	return (blk);
 }
 
 /*
  * Flush and invalidate all dirty buffers. If another process is already
  * doing the flush, just wait for completion.
  *
  * vp      - vnode whose buffers are invalidated; must be locked (asserted).
  * flags   - passed through to vinvalbuf(); when V_SAVE is set and the
  *           vnode has a VM object, its pages are cleaned synchronously
  *           first.
  * td      - thread used for the newnfs_sigintr() checks.
  * intrflg - non-zero requests interruptible waits.  It is forced to 0 when
  *           the mount lacks NFSMNT_INT and forced to 1 while the mount has
  *           MNTK_UNMOUNTF set.
  *
  * Returns 0 on success (including the VI_DOOMED short-circuit), or the
  * non-zero value reported by newnfs_sigintr() when the flush was
  * interrupted.
  */
 int
 ncl_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, slpflag, slptimeo;
 	int old_lock = 0;
 
 	ASSERT_VOP_LOCKED(vp, "ncl_vinvalbuf");
 
 	if ((nmp->nm_flag & NFSMNT_INT) == 0)
 		intrflg = 0;
 	if ((nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF))
 		intrflg = 1;
 	/* Interruptible: catch signals first, then poll with a 2 * hz timeout. */
 	if (intrflg) {
 		slpflag = PCATCH;
 		slptimeo = 2 * hz;
 	} else {
 		slpflag = 0;
 		slptimeo = 0;
 	}
 
 	/*
 	 * The previous lock state is remembered so that every exit path can
 	 * hand it back to ncl_downgrade_vnlock().
 	 */
 	old_lock = ncl_upgrade_vnlock(vp);
 	if (vp->v_iflag & VI_DOOMED) {
 		/*
 		 * Since vgonel() uses the generic vinvalbuf() to flush
 		 * dirty buffers and it does not call this function, it
 		 * is safe to just return OK when VI_DOOMED is set.
 		 */
 		ncl_downgrade_vnlock(vp, old_lock);
 		return (0);
 	}
 
 	/*
 	 * Now, flush as required.
 	 */
 	if ((flags & V_SAVE) && (vp->v_bufobj.bo_object != NULL)) {
 		VM_OBJECT_WLOCK(vp->v_bufobj.bo_object);
 		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
 		VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object);
 		/*
 		 * If the page clean was interrupted, fail the invalidation.
 		 * Not doing so, we run the risk of losing dirty pages in the
 		 * vinvalbuf() call below.
 		 */
 		if (intrflg && (error = newnfs_sigintr(nmp, td)))
 			goto out;
 	}
 
 	/*
 	 * The first attempt sleeps with slpflag; every retry uses slptimeo
 	 * instead.  The loop only ends when vinvalbuf() succeeds or, in the
 	 * interruptible case, when newnfs_sigintr() reports an interruption.
 	 */
 	error = vinvalbuf(vp, flags, slpflag, 0);
 	while (error) {
 		if (intrflg && (error = newnfs_sigintr(nmp, td)))
 			goto out;
 		error = vinvalbuf(vp, flags, 0, slptimeo);
 	}
 	/* Both branches below leave np->n_mtx held. */
 	if (NFSHASPNFS(nmp)) {
 		nfscl_layoutcommit(vp, td);
 		/*
 		 * Invalidate the attribute cache, since writes to a DS
 		 * won't update the size attribute.
 		 */
 		mtx_lock(&np->n_mtx);
 		np->n_attrstamp = 0;
 	} else
 		mtx_lock(&np->n_mtx);
 	/* Keep NMODIFIED while direct-I/O async writes are still counted. */
 	if (np->n_directio_asyncwr == 0)
 		np->n_flag &= ~NMODIFIED;
 	mtx_unlock(&np->n_mtx);
 out:
 	ncl_downgrade_vnlock(vp, old_lock);
 	return error;
 }
 
 /*
  * Initiate asynchronous I/O. Return an error if no nfsiods are available.
  * This is mainly to avoid queueing async I/O requests when the nfsiods
  * are all hung on a dead server.
  *
  * Note: ncl_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
  * is eventually dequeued by the async daemon, ncl_doio() *will*.
  *
  * nmp  - mount whose buffer queue (nm_bufq) receives the request.
  * bp   - buffer to queue; b_iocmd selects which credential slot is filled.
  * cred - credential held into b_rcred/b_wcred when that slot is NOCRED.
  * td   - thread used for sleeping and for the newnfs_sigintr() check.
  *
  * Returns 0 once bp has been appended to nmp->nm_bufq, EIO when the
  * request is refused (the caller is expected to do the I/O synchronously),
  * or the value of newnfs_sigintr() when the wait for queue space was
  * interrupted.
  */
 int
 ncl_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
 {
 	int iod;
 	int gotiod;
 	int slpflag = 0;
 	int slptimeo = 0;
 	int error, error2;
 
 	/*
 	 * Commits are usually short and sweet so lets save some cpu and
 	 * leave the async daemons for more important rpc's (such as reads
 	 * and writes).
 	 *
 	 * Readdirplus RPCs do vget()s to acquire the vnodes for entries
 	 * in the directory in order to update attributes. This can deadlock
 	 * with another thread that is waiting for async I/O to be done by
 	 * an nfsiod thread while holding a lock on one of these vnodes.
 	 * To avoid this deadlock, don't allow the async nfsiod threads to
 	 * perform Readdirplus RPCs.
 	 */
 	mtx_lock(&ncl_iod_mutex);
 	if ((bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
 	     (nmp->nm_bufqiods > ncl_numasync / 2)) ||
 	    (bp->b_vp->v_type == VDIR && (nmp->nm_flag & NFSMNT_RDIRPLUS))) {
 		mtx_unlock(&ncl_iod_mutex);
 		return(EIO);
 	}
 	/* Re-entered, with ncl_iod_mutex still held, after every sleep. */
 again:
 	if (nmp->nm_flag & NFSMNT_INT)
 		slpflag = PCATCH;
 	gotiod = FALSE;
 
 	/*
 	 * Find a free iod to process this request.
 	 */
 	for (iod = 0; iod < ncl_numasync; iod++)
 		if (ncl_iodwant[iod] == NFSIOD_AVAILABLE) {
 			gotiod = TRUE;
 			break;
 		}
 
 	/*
 	 * Try to create one if none are free.
 	 */
 	if (!gotiod)
 		ncl_nfsiodnew();
 	else {
 		/*
 		 * Found one, so wake it up and tell it which
 		 * mount to process.
 		 */
 		NFS_DPF(ASYNCIO, ("ncl_asyncio: waking iod %d for mount %p\n",
 		    iod, nmp));
 		ncl_iodwant[iod] = NFSIOD_NOT_AVAILABLE;
 		ncl_iodmount[iod] = nmp;
 		nmp->nm_bufqiods++;
 		wakeup(&ncl_iodwant[iod]);
 	}
 
 	/*
 	 * If none are free, we may already have an iod working on this mount
 	 * point.  If so, it will process our request.
 	 */
 	if (!gotiod) {
 		if (nmp->nm_bufqiods > 0) {
 			NFS_DPF(ASYNCIO,
 				("ncl_asyncio: %d iods are already processing mount %p\n",
 				 nmp->nm_bufqiods, nmp));
 			gotiod = TRUE;
 		}
 	}
 
 	/*
 	 * If we have an iod which can process the request, then queue
 	 * the buffer.
 	 */
 	if (gotiod) {
 		/*
 		 * Ensure that the queue never grows too large.  We still want
 		 * to asynchronize so we block rather than return EIO.
 		 *
 		 * Note that the body always ends in "goto again", so this
 		 * "while" runs at most once per pass and the whole iod
 		 * search is redone after each sleep.
 		 */
 		while (nmp->nm_bufqlen >= 2*ncl_numasync) {
 			NFS_DPF(ASYNCIO,
 				("ncl_asyncio: waiting for mount %p queue to drain\n", nmp));
 			nmp->nm_bufqwant = TRUE;
 			error = newnfs_msleep(td, &nmp->nm_bufq,
 			    &ncl_iod_mutex, slpflag | PRIBIO, "nfsaio",
 			   slptimeo);
 			if (error) {
 				error2 = newnfs_sigintr(nmp, td);
 				if (error2) {
 					mtx_unlock(&ncl_iod_mutex);
 					return (error2);
 				}
 				/*
 				 * The sleep failed but no interruption is
 				 * reported: stop catching signals and poll
 				 * with a 2 * hz timeout from now on.
 				 */
 				if (slpflag == PCATCH) {
 					slpflag = 0;
 					slptimeo = 2 * hz;
 				}
 			}
 			/*
 			 * We might have lost our iod while sleeping,
 			 * so check and loop if necessary.
 			 */
 			goto again;
 		}
 
 		/* We might have lost our nfsiod */
 		if (nmp->nm_bufqiods == 0) {
 			NFS_DPF(ASYNCIO,
 				("ncl_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
 			goto again;
 		}
 
 		/* Hold the credential in the slot that matches the I/O type. */
 		if (bp->b_iocmd == BIO_READ) {
 			if (bp->b_rcred == NOCRED && cred != NOCRED)
 				bp->b_rcred = crhold(cred);
 		} else {
 			if (bp->b_wcred == NOCRED && cred != NOCRED)
 				bp->b_wcred = crhold(cred);
 		}
 
 		if (bp->b_flags & B_REMFREE)
 			bremfreef(bp);
 		BUF_KERNPROC(bp);
 		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
 		nmp->nm_bufqlen++;
 		/*
 		 * Count outstanding direct-I/O async writes on the nfsnode;
 		 * ncl_doio_directwrite() decrements the counter again.
 		 */
 		if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
 			mtx_lock(&(VTONFS(bp->b_vp))->n_mtx);
 			VTONFS(bp->b_vp)->n_flag |= NMODIFIED;
 			VTONFS(bp->b_vp)->n_directio_asyncwr++;
 			mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx);
 		}
 		mtx_unlock(&ncl_iod_mutex);
 		return (0);
 	}
 
 	mtx_unlock(&ncl_iod_mutex);
 
 	/*
 	 * All the iods are busy on other mounts, so return EIO to
 	 * force the caller to process the i/o synchronously.
 	 */
 	NFS_DPF(ASYNCIO, ("ncl_asyncio: no iods available, i/o is synchronous\n"));
 	return (EIO);
 }
 
 /*
  * Carry out a queued direct write.  The uio describing the write is taken
  * from bp->b_caller1; it is written with a FILESYNC write RPC, after which
  * the data buffer, the iovec and the uio itself are released with
  * M_NFSDIRECTIO.  For a B_DIRECT BIO_WRITE buffer the nfsnode's count of
  * outstanding direct async writes is dropped, and when it reaches zero
  * NMODIFIED is cleared and any NFSYNCWAIT sleeper is woken.  Finally the
  * buffer is detached from its vnode and returned with relpbuf().
  */
 void
 ncl_doio_directwrite(struct buf *bp)
 {
 	struct uio *wuio = (struct uio *)bp->b_caller1;
 	/*
 	 * Captured before the RPC so that free() gets the original pointer.
 	 * NOTE(review): presumably the RPC advances uio_iov->iov_base - confirm.
 	 */
 	char *data_start = wuio->uio_iov->iov_base;
 	struct nfsnode *node;
 	int must_commit;
 	int iomode = NFSWRITE_FILESYNC;
 
 	wuio->uio_td = NULL;	/* no thread: we are running in an nfsiod */
 	ncl_writerpc(bp->b_vp, wuio, bp->b_wcred, &iomode, &must_commit, 0);
 	KASSERT((must_commit == 0), ("ncl_doio_directwrite: Did not commit write"));
 
 	free(data_start, M_NFSDIRECTIO);
 	free(wuio->uio_iov, M_NFSDIRECTIO);
 	free(wuio, M_NFSDIRECTIO);
 
 	if ((bp->b_flags & B_DIRECT) != 0 && bp->b_iocmd == BIO_WRITE) {
 		node = VTONFS(bp->b_vp);
 		mtx_lock(&node->n_mtx);
 		/*
 		 * Writes to a DS won't update the size attribute, so the
 		 * attribute cache is invalidated on pNFS mounts.
 		 */
 		if (NFSHASPNFS(VFSTONFS(vnode_mount(bp->b_vp))))
 			node->n_attrstamp = 0;
 		if (--node->n_directio_asyncwr == 0) {
 			node->n_flag &= ~NMODIFIED;
 			if ((node->n_flag & NFSYNCWAIT) != 0) {
 				node->n_flag &= ~NFSYNCWAIT;
 				wakeup((caddr_t)&node->n_directio_asyncwr);
 			}
 		}
 		mtx_unlock(&node->n_mtx);
 	}
 	bp->b_vp = NULL;
 	relpbuf(bp, &ncl_pbuf_freecnt);
 }
 
 /*
  * Do an I/O operation to/from a cache block. This may be called
  * synchronously or from an nfsiod.
  *
  * vp   - vnode the buffer belongs to; v_type selects the read RPC.
  * bp   - buffer to read into or write from; b_iocmd == BIO_READ selects
  *        the read path, anything else the commit/write path.
  * cr   - credential used for the read and write RPCs (a commit uses
  *        bp->b_wcred instead).
  * td   - calling thread, may be NULL; stored in the local uio and used to
  *        find the process for the "text file modification" check.
  * called_from_strategy - passed unchanged to ncl_writerpc().
  *
  * bufdone(bp) is called on every path before returning.  Returns 0 or the
  * error produced by the RPC that was issued.
  */
 int
 ncl_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td,
     int called_from_strategy)
 {
 	struct uio *uiop;
 	struct nfsnode *np;
 	struct nfsmount *nmp;
 	int error = 0, iomode, must_commit = 0;
 	struct uio uio;
 	struct iovec io;
 	struct proc *p = td ? td->td_proc : NULL;
 	uint8_t	iocmd;
 
 	/* Build a single-segment, kernel-space uio on the stack. */
 	np = VTONFS(vp);
 	nmp = VFSTONFS(vp->v_mount);
 	uiop = &uio;
 	uiop->uio_iov = &io;
 	uiop->uio_iovcnt = 1;
 	uiop->uio_segflg = UIO_SYSSPACE;
 	uiop->uio_td = td;
 
 	/*
 	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
 	 * do this here so we do not have to do it in all the code that
 	 * calls us.
 	 */
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
 
 	KASSERT(!(bp->b_flags & B_DONE), ("ncl_doio: bp %p already marked done", bp));
 	iocmd = bp->b_iocmd;
 	if (iocmd == BIO_READ) {
 	    /* Read the whole buffer (b_bcount bytes) into b_data. */
 	    io.iov_len = uiop->uio_resid = bp->b_bcount;
 	    io.iov_base = bp->b_data;
 	    uiop->uio_rw = UIO_READ;
 
 	    switch (vp->v_type) {
 	    case VREG:
 		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
 		NFSINCRGLOBAL(newnfsstats.read_bios);
 		error = ncl_readrpc(vp, uiop, cr);
 
 		if (!error) {
 		    if (uiop->uio_resid) {
 			/*
 			 * If we had a short read with no error, we must have
 			 * hit a file hole.  We should zero-fill the remainder.
 			 * This can also occur if the server hits the file EOF.
 			 *
 			 * Holes used to be able to occur due to pending
 			 * writes, but that is not possible any longer.
 			 */
 			int nread = bp->b_bcount - uiop->uio_resid;
 			ssize_t left = uiop->uio_resid;
 
 			if (left > 0)
 				bzero((char *)bp->b_data + nread, left);
 			uiop->uio_resid = 0;
 		    }
 		}
 		/* ASSERT_VOP_LOCKED(vp, "ncl_doio"); */
 		/*
 		 * For a VV_TEXT vnode, kill the calling process when the
 		 * comparison of the cached n_mtime against the attribute
 		 * mtime is non-zero.
 		 * NOTE(review): presumably NFS_TIMESPEC_COMPARE is non-zero
 		 * when the two times differ - confirm.
 		 */
 		if (p && (vp->v_vflag & VV_TEXT)) {
 			mtx_lock(&np->n_mtx);
 			if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.na_mtime)) {
 				mtx_unlock(&np->n_mtx);
 				PROC_LOCK(p);
 				killproc(p, "text file modification");
 				PROC_UNLOCK(p);
 			} else
 				mtx_unlock(&np->n_mtx);
 		}
 		break;
 	    case VLNK:
 		uiop->uio_offset = (off_t)0;
 		NFSINCRGLOBAL(newnfsstats.readlink_bios);
 		error = ncl_readlinkrpc(vp, uiop, cr);
 		break;
 	    case VDIR:
 		NFSINCRGLOBAL(newnfsstats.readdir_bios);
 		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
 		/*
 		 * Try readdirplus first when enabled.  NFSERR_NOTSUPP turns
 		 * the mount option off, so the plain readdir below runs and
 		 * its result replaces the error.
 		 */
 		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
 			error = ncl_readdirplusrpc(vp, uiop, cr, td);
 			if (error == NFSERR_NOTSUPP)
 				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
 		}
 		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
 			error = ncl_readdirrpc(vp, uiop, cr, td);
 		/*
 		 * end-of-directory sets B_INVAL but does not generate an
 		 * error.
 		 */
 		if (error == 0 && uiop->uio_resid == bp->b_bcount)
 			bp->b_flags |= B_INVAL;
 		break;
 	    default:
 		ncl_printf("ncl_doio:  type %x unexpected\n", vp->v_type);
 		break;
 	    };
 	    if (error) {
 		bp->b_ioflags |= BIO_ERROR;
 		bp->b_error = error;
 	    }
 	} else {
 	    /*
 	     * If we only need to commit, try to commit
 	     */
 	    if (bp->b_flags & B_NEEDCOMMIT) {
 		    int retv;
 		    off_t off;
 
 		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
 		    retv = ncl_commit(vp, off, bp->b_dirtyend-bp->b_dirtyoff,
 			bp->b_wcred, td);
 		    if (retv == 0) {
 			    /* Commit succeeded: the buffer is clean, finish. */
 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
 			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			    bp->b_resid = 0;
 			    bufdone(bp);
 			    return (0);
 		    }
 		    /* Any other result falls through to a full write below. */
 		    if (retv == NFSERR_STALEWRITEVERF) {
 			    ncl_clearcommit(vp->v_mount);
 		    }
 	    }
 
 	    /*
 	     * Setup for actual write
 	     */
 	    /* Clip the dirty range so that it does not extend past n_size. */
 	    mtx_lock(&np->n_mtx);
 	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
 		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
 	    mtx_unlock(&np->n_mtx);
 
 	    if (bp->b_dirtyend > bp->b_dirtyoff) {
 		io.iov_len = uiop->uio_resid = bp->b_dirtyend
 		    - bp->b_dirtyoff;
 		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
 		    + bp->b_dirtyoff;
 		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
 		uiop->uio_rw = UIO_WRITE;
 		NFSINCRGLOBAL(newnfsstats.write_bios);
 
 		/*
 		 * Only a plain B_ASYNC buffer (none of B_NEEDCOMMIT,
 		 * B_NOCACHE or B_CLUSTER set) is written UNSTABLE.
 		 */
 		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
 		    iomode = NFSWRITE_UNSTABLE;
 		else
 		    iomode = NFSWRITE_FILESYNC;
 
 		error = ncl_writerpc(vp, uiop, cr, &iomode, &must_commit,
 		    called_from_strategy);
 
 		/*
 		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
 		 * to cluster the buffers needing commit.  This will allow
 		 * the system to submit a single commit rpc for the whole
 		 * cluster.  We can do this even if the buffer is not 100%
 		 * dirty (relative to the NFS blocksize), so we optimize the
 		 * append-to-file-case.
 		 *
 		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
 		 * cleared because write clustering only works for commit
 		 * rpc's, not for the data portion of the write).
 		 */
 
 		if (!error && iomode == NFSWRITE_UNSTABLE) {
 		    bp->b_flags |= B_NEEDCOMMIT;
 		    if (bp->b_dirtyoff == 0
 			&& bp->b_dirtyend == bp->b_bcount)
 			bp->b_flags |= B_CLUSTEROK;
 		} else {
 		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 
 		/*
 		 * For an interrupted write, the buffer is still valid
 		 * and the write hasn't been pushed to the server yet,
 		 * so we can't set BIO_ERROR and report the interruption
 		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
 		 * is not relevant, so the rpc attempt is essentially
 		 * a noop.  For the case of a V3 write rpc not being
 		 * committed to stable storage, the block is still
 		 * dirty and requires either a commit rpc or another
 		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
 		 * the block is reused. This is indicated by setting
 		 * the B_DELWRI and B_NEEDCOMMIT flags.
 		 *
 		 * EIO is returned by ncl_writerpc() to indicate a recoverable
 		 * write error and is handled as above, except that
 		 * B_EINTR isn't set. One cause of this is a stale stateid
 		 * error for the RPC that indicates recovery is required,
 		 * when called with called_from_strategy != 0.
 		 *
 		 * If the buffer is marked B_PAGING, it does not reside on
 		 * the vp's paging queues so we cannot call bdirty().  The
 		 * bp in this case is not an NFS cache block so we should
 		 * be safe. XXX
 		 *
 		 * The logic below breaks up errors into recoverable and
 		 * unrecoverable. For the former, we clear B_INVAL|B_NOCACHE
 		 * and keep the buffer around for potential write retries.
 		 * For the latter (eg ESTALE), we toss the buffer away (B_INVAL)
 		 * and save the error in the nfsnode. This is less than ideal
 		 * but necessary. Keeping such buffers around could potentially
 		 * cause buffer exhaustion eventually (they can never be written
 		 * out, so will get constantly be re-dirtied). It also causes
 		 * all sorts of vfs panics. For non-recoverable write errors,
 		 * also invalidate the attrcache, so we'll be forced to go over
 		 * the wire for this object, returning an error to user on next
 		 * call (most of the time).
 		 */
 		if (error == EINTR || error == EIO || error == ETIMEDOUT
 		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
 			int s;
 
 			s = splbio();
 			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
 			if ((bp->b_flags & B_PAGING) == 0) {
 			    bdirty(bp);
 			    bp->b_flags &= ~B_DONE;
 			}
 			if ((error == EINTR || error == ETIMEDOUT) &&
 			    (bp->b_flags & B_ASYNC) == 0)
 			    bp->b_flags |= B_EINTR;
 			splx(s);
 		} else {
 		    if (error) {
 			bp->b_ioflags |= BIO_ERROR;
 			bp->b_flags |= B_INVAL;
 			/* Note: n_error is stored before n_mtx is taken. */
 			bp->b_error = np->n_error = error;
 			mtx_lock(&np->n_mtx);
 			np->n_flag |= NWRITEERR;
 			np->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 			mtx_unlock(&np->n_mtx);
 		    }
 		    bp->b_dirtyoff = bp->b_dirtyend = 0;
 		}
 	    } else {
 		/* Empty dirty range after clipping: nothing to write. */
 		bp->b_resid = 0;
 		bufdone(bp);
 		return (0);
 	    }
 	}
 	/* Common completion for reads and for writes that issued an RPC. */
 	bp->b_resid = uiop->uio_resid;
 	if (must_commit)
 	    ncl_clearcommit(vp->v_mount);
 	bufdone(bp);
 	return (error);
 }
 
 /*
  * Client-side helper for ftruncate() handling.  The cached file size in
  * the nfsnode is replaced by 'nsize'.  When the file grows (or keeps its
  * size) only the VM pager is told about the new size.  When it shrinks,
  * VM pages and buffer cache buffers beyond the new EOF have to be thrown
  * away, and the (potentially dirty) buffer that straddles the truncation
  * point has to be cut down to the new length.
  *
  * Returns 0 when not shrinking, otherwise the result of vtruncbuf(), or
  * EINTR when the buffer at the truncation point could not be obtained.
  */
 
 int
 ncl_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct buf *tailbp;
 	u_quad_t oldsize;
 	daddr_t lbn;
 	int biosize = vp->v_bufobj.bo_bsize;
 	int error, tailbytes;
 
 	/* Swap in the new size under the node mutex. */
 	mtx_lock(&np->n_mtx);
 	oldsize = np->n_size;
 	np->n_size = nsize;
 	mtx_unlock(&np->n_mtx);
 
 	if (nsize >= oldsize) {
 		vnode_pager_setsize(vp, nsize);
 		return (0);
 	}
 
 	/*
 	 * vtruncbuf() doesn't get the buffer overlapping the
 	 * truncation point.  We may have a B_DELWRI and/or B_CACHE
 	 * buffer that now needs to be truncated.
 	 */
 	error = vtruncbuf(vp, cred, nsize, biosize);
 	lbn = nsize / biosize;
 	tailbytes = nsize - (lbn * biosize);
 	tailbp = nfs_getcacheblk(vp, lbn, tailbytes, td);
 	if (tailbp == NULL)
 		return (EINTR);
 
 	/* Keep the dirty range inside the (now shorter) buffer. */
 	if (tailbp->b_dirtyoff > tailbp->b_bcount)
 		tailbp->b_dirtyoff = tailbp->b_bcount;
 	if (tailbp->b_dirtyend > tailbp->b_bcount)
 		tailbp->b_dirtyend = tailbp->b_bcount;
 	tailbp->b_flags |= B_RELBUF;	/* don't leave garbage around */
 	brelse(tailbp);
 	return (error);
 }
 
Index: head/sys/fs/nfsclient/nfs_clnode.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clnode.c	(revision 258540)
+++ head/sys/fs/nfsclient/nfs_clnode.c	(revision 258541)
@@ -1,335 +1,333 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from nfs_node.c	8.6 (Berkeley) 5/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 
 #include <vm/uma.h>
 
 #include <fs/nfs/nfsport.h>
 #include <fs/nfsclient/nfsnode.h>
 #include <fs/nfsclient/nfsmount.h>
 #include <fs/nfsclient/nfs.h>
 #include <fs/nfsclient/nfs_kdtrace.h>
 
 #include <nfs/nfs_lock.h>
 
 extern struct vop_vector newnfs_vnodeops;
 extern struct buf_ops buf_ops_newnfs;
 MALLOC_DECLARE(M_NEWNFSREQ);
 
 uma_zone_t newnfsnode_zone;
 
 static void	nfs_freesillyrename(void *arg, __unused int pending);
 
 /*
  * Create the UMA zone ("NCLNODE") from which nfsnode structures are
  * allocated.
  */
 void
 ncl_nhinit(void)
 {
 
 	newnfsnode_zone = uma_zcreate("NCLNODE", sizeof(struct nfsnode),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 
 /*
  * Destroy the UMA zone that nfsnode structures are allocated from.
  */
 void
 ncl_nhuninit(void)
 {
 
 	uma_zdestroy(newnfsnode_zone);
 }
 
 /*
  * ONLY USED FOR THE ROOT DIRECTORY. nfscl_nget() does the rest. If this
  * function is going to be used to get Regular Files, code must be added
  * to fill in the "struct nfsv4node".
  * Look up a vnode/nfsnode by file handle.
  * Callers must check for mount points!!
  * In all cases, a pointer to a
  * nfsnode structure is returned.
  *
  * mntp    - mount the vnode belongs to.
  * fhp     - file handle bytes; fhsize is their length.
  * npp     - out: the nfsnode found or created; set to NULL on entry and
  *           left NULL on every error return.
  * lkflags - lock flags handed to vfs_hash_get()/vfs_hash_insert().
  *
  * Returns 0 on success, otherwise the error from vfs_hash_get(),
  * getnewvnode(), insmntque() or vfs_hash_insert().
  */
 int
 ncl_nget(struct mount *mntp, u_int8_t *fhp, int fhsize, struct nfsnode **npp,
     int lkflags)
 {
 	struct thread *td = curthread;	/* XXX */
 	struct nfsnode *np;
 	struct vnode *vp;
 	struct vnode *nvp;
 	int error;
 	u_int hash;
 	struct nfsmount *nmp;
 	struct nfsfh *nfhp;
 
 	nmp = VFSTONFS(mntp);
 	*npp = NULL;
 
 	hash = fnv_32_buf(fhp, fhsize, FNV1_32_INIT);
 
 	/*
 	 * Build a temporary nfsfh to use as the comparison key for the
 	 * hash lookup; it is freed again right after the lookup.
 	 */
 	MALLOC(nfhp, struct nfsfh *, sizeof (struct nfsfh) + fhsize,
 	    M_NFSFH, M_WAITOK);
 	bcopy(fhp, &nfhp->nfh_fh[0], fhsize);
 	nfhp->nfh_len = fhsize;
 	error = vfs_hash_get(mntp, hash, lkflags,
 	    td, &nvp, newnfs_vncmpf, nfhp);
 	FREE(nfhp, M_NFSFH);
 	if (error)
 		return (error);
 	/* Already hashed: hand back the existing node. */
 	if (nvp != NULL) {
 		*npp = VTONFS(nvp);
 		return (0);
 	}
 	np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO);
 
 	error = getnewvnode("newnfs", mntp, &newnfs_vnodeops, &nvp);
 	if (error) {
 		uma_zfree(newnfsnode_zone, np);
 		return (error);
 	}
 	vp = nvp;
 	KASSERT(vp->v_bufobj.bo_bsize != 0, ("ncl_nget: bo_bsize == 0"));
 	vp->v_bufobj.bo_ops = &buf_ops_newnfs;
 	vp->v_data = np;
 	np->n_vnode = vp;
 	/*
 	 * Initialize the mutex even if the vnode is going to be a loser.
 	 * This simplifies the logic in reclaim, which can then unconditionally
 	 * destroy the mutex (in the case of the loser, or if hash_insert
 	 * happened to return an error no special casing is needed).
 	 */
 	mtx_init(&np->n_mtx, "NEWNFSnode lock", NULL, MTX_DEF | MTX_DUPOK);
 	/*
 	 * NFS supports recursive and shared locking.
 	 */
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_NOWITNESS, NULL);
 	VN_LOCK_AREC(vp);
 	VN_LOCK_ASHARE(vp);
 	/*
 	 * Are we getting the root? If so, make sure the vnode flags
 	 * are correct
 	 */
 	if ((fhsize == nmp->nm_fhsize) &&
 	    !bcmp(fhp, nmp->nm_fh, fhsize)) {
 		if (vp->v_type == VNON)
 			vp->v_type = VDIR;
 		vp->v_vflag |= VV_ROOT;
 	}
 	
 	/* The node keeps its own copy of the file handle. */
 	MALLOC(np->n_fhp, struct nfsfh *, sizeof (struct nfsfh) + fhsize,
 	    M_NFSFH, M_WAITOK);
 	bcopy(fhp, np->n_fhp->nfh_fh, fhsize);
 	np->n_fhp->nfh_len = fhsize;
 	error = insmntque(vp, mntp);
 	if (error != 0) {
 		/*
 		 * The node's file handle, mutex and zone memory are
 		 * released by hand on this path.
 		 * NOTE(review): presumably insmntque() has already disposed
 		 * of the vnode without calling ncl_reclaim() - confirm.
 		 */
 		*npp = NULL;
 		FREE((caddr_t)np->n_fhp, M_NFSFH);
 		mtx_destroy(&np->n_mtx);
 		uma_zfree(newnfsnode_zone, np);
 		return (error);
 	}
 	error = vfs_hash_insert(vp, hash, lkflags, 
 	    td, &nvp, newnfs_vncmpf, np->n_fhp);
 	/*
 	 * NOTE(review): np is not freed here on error; presumably the vnode
 	 * was released by vfs_hash_insert() and ncl_reclaim() cleans up
 	 * np - confirm.
 	 */
 	if (error)
 		return (error);
 	/* Lost a race: another thread inserted this file handle first. */
 	if (nvp != NULL) {
 		*npp = VTONFS(nvp);
 		/* vfs_hash_insert() vput()'s the losing vnode */
 		return (0);
 	}
 	*npp = np;
 
 	return (0);
 }
 
 /*
  * Taskqueue handler that finishes off a sillyrename record: the reference
  * on the directory vnode is dropped and the record is freed.  The
  * vrele(sp->s_dvp) is done as a separate task in order to avoid a
  * deadlock because of a LOR when vrele() locks the directory vnode.
  * The 'pending' argument is not used.
  */
 static void
 nfs_freesillyrename(void *arg, __unused int pending)
 {
 	struct sillyrename *silly = arg;
 
 	vrele(silly->s_dvp);
 	free(silly, M_NEWNFSREQ);
 }
 
 /*
  * Inactive handler for an NFS client vnode (ap->a_vp).
  *
  * For an NFSv4 regular file the dirty pages and buffers are flushed and
  * the delayed NFSv4 Close is performed.  For a non-directory that has a
  * pending sillyrename, the buffers are invalidated, the renamed file is
  * removed and the release of the directory vnode is handed to a
  * taskqueue.  Finally every nfsnode flag except NMODIFIED is cleared.
  *
  * Always returns 0; failures of the flush, close and remove steps are
  * not reported.
  */
 int
 ncl_inactive(struct vop_inactive_args *ap)
 {
 	struct nfsnode *np;
 	struct sillyrename *sp;
 	struct vnode *vp = ap->a_vp;
 	boolean_t retv;
 
 	np = VTONFS(vp);
 
 	if (NFS_ISV4(vp) && vp->v_type == VREG) {
 		/*
 		 * Since mmap()'d files do I/O after VOP_CLOSE(), the NFSv4
 		 * Close operations are delayed until now. Any dirty
 		 * buffers/pages must be flushed before the close, so that the
 		 * stateid is available for the writes.
 		 */
 		if (vp->v_object != NULL) {
 			VM_OBJECT_WLOCK(vp->v_object);
 			retv = vm_object_page_clean(vp->v_object, 0, 0,
 			    OBJPC_SYNC);
 			VM_OBJECT_WUNLOCK(vp->v_object);
 		} else
 			retv = TRUE;
 		/* Flush and close only when the page clean reported TRUE. */
 		if (retv == TRUE) {
 			(void)ncl_flush(vp, MNT_WAIT, NULL, ap->a_td, 1, 0);
 			(void)nfsrpc_close(vp, 1, ap->a_td);
 		}
 	}
 
 	/* Detach the sillyrename record (if any) while holding n_mtx. */
 	mtx_lock(&np->n_mtx);
 	if (vp->v_type != VDIR) {
 		sp = np->n_sillyrename;
 		np->n_sillyrename = NULL;
 	} else
 		sp = NULL;
 	if (sp) {
 		/* n_mtx is dropped around the calls below and retaken after. */
 		mtx_unlock(&np->n_mtx);
 		(void) ncl_vinvalbuf(vp, 0, ap->a_td, 1);
 		/*
 		 * Remove the silly file that was rename'd earlier
 		 */
 		ncl_removeit(sp, vp);
 		crfree(sp->s_cred);
 		/* nfs_freesillyrename() does the vrele() and frees sp. */
 		TASK_INIT(&sp->s_task, 0, nfs_freesillyrename, sp);
 		taskqueue_enqueue(taskqueue_thread, &sp->s_task);
 		mtx_lock(&np->n_mtx);
 	}
 	/* No '~' here: this keeps NMODIFIED and clears every other flag. */
 	np->n_flag &= NMODIFIED;
 	mtx_unlock(&np->n_mtx);
 	return (0);
 }
 
 /*
  * Reclaim an nfsnode so that it can be used for other purposes.
  *
  * Tears down everything hanging off ap->a_vp's nfsnode: the VM object,
  * any remaining NFSv4 opens, the vfs hash entry, directory cookie
  * structures, the write credential, the file handle, the NFSv4 node
  * data and the node mutex.  The nfsnode itself goes back to
  * newnfsnode_zone and vp->v_data is cleared.  Always returns 0.
  */
 int
 ncl_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsdmap *dp, *dp2;
 
 	/*
 	 * If the NLM is running, give it a chance to abort pending
 	 * locks.
 	 */
 	if (nfs_reclaim_p != NULL)
 		nfs_reclaim_p(ap);
 
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 
 	if (NFS_ISV4(vp) && vp->v_type == VREG)
 		/*
 		 * We can now safely close any remaining NFSv4 Opens for
 		 * this file. Most opens will have already been closed by
 		 * ncl_inactive(), but there are cases where it is not
 		 * called, so we need to do it again here.
 		 */
 		(void) nfsrpc_close(vp, 1, ap->a_td);
 
 	vfs_hash_remove(vp);
 
 	/*
 	 * Call nfscl_reclaimnode() to save attributes in the delegation,
 	 * as required.
 	 */
 	if (vp->v_type == VREG)
 		nfscl_reclaimnode(vp);
 
 	/*
 	 * Free up any directory cookie structures and
 	 * large file handle structures that might be associated with
 	 * this nfs node.
 	 */
 	if (vp->v_type == VDIR) {
 		dp = LIST_FIRST(&np->n_cookies);
 		while (dp) {
 			/* Advance before freeing the current entry. */
 			dp2 = dp;
 			dp = LIST_NEXT(dp, ndm_list);
 			FREE((caddr_t)dp2, M_NFSDIROFF);
 		}
 	}
 	if (np->n_writecred != NULL)
 		crfree(np->n_writecred);
 	FREE((caddr_t)np->n_fhp, M_NFSFH);
 	if (np->n_v4 != NULL)
 		FREE((caddr_t)np->n_v4, M_NFSV4NODE);
 	mtx_destroy(&np->n_mtx);
 	/* vp->v_data is the nfsnode (np); return it to its zone. */
 	uma_zfree(newnfsnode_zone, vp->v_data);
 	vp->v_data = NULL;
 	return (0);
 }
 
 /*
  * Drop the cached access and attribute information for this vnode: with
  * the node mutex held, every access cache entry's stamp and the attribute
  * stamp are reset to 0, and the matching KDTRACE flush probes are fired.
  */
 void
 ncl_invalcaches(struct vnode *vp)
 {
 	struct nfsnode *node = VTONFS(vp);
 	int slot = 0;
 
 	mtx_lock(&node->n_mtx);
 
 	/* Access cache first ... */
 	while (slot < NFS_ACCESSCACHESIZE) {
 		node->n_accesscache[slot].stamp = 0;
 		slot++;
 	}
 	KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp);
 
 	/* ... then the attribute cache. */
 	node->n_attrstamp = 0;
 	KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 
 	mtx_unlock(&node->n_mtx);
 }
 
Index: head/sys/fs/nfsclient/nfs_clport.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clport.c	(revision 258540)
+++ head/sys/fs/nfsclient/nfs_clport.c	(revision 258541)
@@ -1,1354 +1,1353 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
-#include "opt_kdtrace.h"
 
 #include <sys/capability.h>
 
 /*
  * generally, I don't like #includes inside .h files, but it seems to
  * be the easiest way to handle the port.
  */
 #include <sys/hash.h>
 #include <fs/nfs/nfsport.h>
 #include <netinet/if_ether.h>
 #include <net/if_types.h>
 
 #include <fs/nfsclient/nfs_kdtrace.h>
 
 #ifdef KDTRACE_HOOKS
 /*
  * DTrace hooks for the attribute cache: one probe function object and one
  * probe id for each of the flush-done, get-hit, get-miss and load-done
  * events. They are only defined when KDTRACE_HOOKS is defined.
  */
 dtrace_nfsclient_attrcache_flush_probe_func_t
 		dtrace_nfscl_attrcache_flush_done_probe;
 uint32_t	nfscl_attrcache_flush_done_id;
 
 dtrace_nfsclient_attrcache_get_hit_probe_func_t
 		dtrace_nfscl_attrcache_get_hit_probe;
 uint32_t	nfscl_attrcache_get_hit_id;
 
 dtrace_nfsclient_attrcache_get_miss_probe_func_t
 		dtrace_nfscl_attrcache_get_miss_probe;
 uint32_t	nfscl_attrcache_get_miss_id;
 
 dtrace_nfsclient_attrcache_load_probe_func_t
 		dtrace_nfscl_attrcache_load_done_probe;
 uint32_t	nfscl_attrcache_load_done_id;
 #endif /* KDTRACE_HOOKS */
 
 /* Objects declared here but defined outside this file. */
 extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
 extern struct vop_vector newnfs_vnodeops;
 extern struct vop_vector newnfs_fifoops;
 extern uma_zone_t newnfsnode_zone;
 extern struct buf_ops buf_ops_newnfs;
 extern int ncl_pbuf_freecnt;
 extern short nfsv4_cbport;
 extern int nfscl_enablecallb;
 extern int nfs_numnfscbd;
 extern int nfscl_inited;
 /* Mutexes defined by this file. */
 struct mtx nfs_clstate_mutex;
 struct mtx ncl_iod_mutex;
 /* NOTE(review): NFSDLOCKMUTEX is a port macro; presumably it defines one more mutex - confirm. */
 NFSDLOCKMUTEX;
 
 /*
  * Function pointer taking a vnode; presumably used to reach
  * ncl_invalcaches() indirectly - confirm where it is assigned.
  */
 extern void (*ncl_call_invalcaches)(struct vnode *);
 
 /*
  * Comparison function for vfs_hash functions.
  */
 int
 newnfs_vncmpf(struct vnode *vp, void *arg)
 {
 	struct nfsfh *nfhp = (struct nfsfh *)arg;
 	struct nfsnode *np = VTONFS(vp);
 
 	if (np->n_fhp->nfh_len != nfhp->nfh_len ||
 	    NFSBCMP(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len))
 		return (1);
 	return (0);
 }
 
 /*
  * Look up a vnode/nfsnode by file handle, creating a new pair when the
  * vfs hash has no entry for it.
  * Callers must check for mount points!!
  * On success 0 is returned and *npp points to the nfsnode; on failure an
  * errno value is returned and *npp is left NULL.
  * This variant takes a "struct nfsfh *" (nfhp) and uses that structure up,
  * either by hanging it off the nfsnode or FREEing it.
  * For NFSv4 mounts, dvp and cnp supply the directory file handle and the
  * name that get recorded in the nfsnode's n_v4 structure.
  */
 int
 nfscl_nget(struct mount *mntp, struct vnode *dvp, struct nfsfh *nfhp,
     struct componentname *cnp, struct thread *td, struct nfsnode **npp,
     void *stuff, int lkflags)
 {
 	struct nfsnode *np, *dnp;
 	struct vnode *vp, *nvp;
 	struct nfsv4node *newd, *oldd;
 	int error;
 	u_int hash;
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(mntp);
 	dnp = VTONFS(dvp);
 	*npp = NULL;
 
 	/* The vfs hash is keyed by an FNV hash of the file handle bytes. */
 	hash = fnv_32_buf(nfhp->nfh_fh, nfhp->nfh_len, FNV1_32_INIT);
 
 	error = vfs_hash_get(mntp, hash, lkflags,
 	    td, &nvp, newnfs_vncmpf, nfhp);
 	if (error == 0 && nvp != NULL) {
 		/*
 		 * I believe there is a slight chance that vgonel() could
 		 * get called on this vnode between when NFSVOPLOCK() drops
 		 * the VI_LOCK() and vget() acquires it again, so that it
 		 * hasn't yet had v_usecount incremented. If this were to
 		 * happen, the VI_DOOMED flag would be set, so check for
 		 * that here. Since we now have the v_usecount incremented,
 		 * we should be ok until we vrele() it, if the VI_DOOMED
 		 * flag isn't set now.
 		 */
 		VI_LOCK(nvp);
 		if ((nvp->v_iflag & VI_DOOMED)) {
 			VI_UNLOCK(nvp);
 			vrele(nvp);
 			error = ENOENT;
 		} else {
 			VI_UNLOCK(nvp);
 		}
 	}
 	/* Lookup failed or the vnode was doomed: nfhp is no longer needed. */
 	if (error) {
 		FREE((caddr_t)nfhp, M_NFSFH);
 		return (error);
 	}
 	if (nvp != NULL) {
 		np = VTONFS(nvp);
 		/*
 		 * For NFSv4, check to see if it is the same name and
 		 * replace the name, if it is different.
 		 * The replacement is allocated before the node lock is
 		 * taken and the comparison is repeated under the lock.
 		 */
 		oldd = newd = NULL;
 		if ((nmp->nm_flag & NFSMNT_NFSV4) && np->n_v4 != NULL &&
 		    nvp->v_type == VREG &&
 		    (np->n_v4->n4_namelen != cnp->cn_namelen ||
 		     NFSBCMP(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 		     cnp->cn_namelen) ||
 		     dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen ||
 		     NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 		     dnp->n_fhp->nfh_len))) {
 		    MALLOC(newd, struct nfsv4node *,
 			sizeof (struct nfsv4node) + dnp->n_fhp->nfh_len +
 			+ cnp->cn_namelen - 1, M_NFSV4NODE, M_WAITOK);
 		    NFSLOCKNODE(np);
 		    if (newd != NULL && np->n_v4 != NULL && nvp->v_type == VREG
 			&& (np->n_v4->n4_namelen != cnp->cn_namelen ||
 			 NFSBCMP(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 			 cnp->cn_namelen) ||
 			 dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen ||
 			 NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 			 dnp->n_fhp->nfh_len))) {
 			/* Swap in the new structure; the old one is freed below. */
 			oldd = np->n_v4;
 			np->n_v4 = newd;
 			newd = NULL;
 			np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len;
 			np->n_v4->n4_namelen = cnp->cn_namelen;
 			NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 			    dnp->n_fhp->nfh_len);
 			NFSBCOPY(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 			    cnp->cn_namelen);
 		    }
 		    NFSUNLOCKNODE(np);
 		}
 		/* Release whichever of the two structures ended up unused. */
 		if (newd != NULL)
 			FREE((caddr_t)newd, M_NFSV4NODE);
 		if (oldd != NULL)
 			FREE((caddr_t)oldd, M_NFSV4NODE);
 		*npp = np;
 		FREE((caddr_t)nfhp, M_NFSFH);
 		return (0);
 	}
 	/* No existing vnode: build a new nfsnode and vnode. */
 	np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO);
 
 	error = getnewvnode("newnfs", mntp, &newnfs_vnodeops, &nvp);
 	if (error) {
 		uma_zfree(newnfsnode_zone, np);
 		FREE((caddr_t)nfhp, M_NFSFH);
 		return (error);
 	}
 	vp = nvp;
 	KASSERT(vp->v_bufobj.bo_bsize != 0, ("nfscl_nget: bo_bsize == 0"));
 	vp->v_bufobj.bo_ops = &buf_ops_newnfs;
 	vp->v_data = np;
 	np->n_vnode = vp;
 	/*
 	 * Initialize the mutex even if the vnode is going to be a loser.
 	 * This simplifies the logic in reclaim, which can then unconditionally
 	 * destroy the mutex (in the case of the loser, or if hash_insert
 	 * happened to return an error no special casing is needed).
 	 */
 	mtx_init(&np->n_mtx, "NEWNFSnode lock", NULL, MTX_DEF | MTX_DUPOK);
 
 	/*
 	 * Are we getting the root? If so, make sure the vnode flags
 	 * are correct
 	 */
 	if ((nfhp->nfh_len == nmp->nm_fhsize) &&
 	    !bcmp(nfhp->nfh_fh, nmp->nm_fh, nfhp->nfh_len)) {
 		if (vp->v_type == VNON)
 			vp->v_type = VDIR;
 		vp->v_vflag |= VV_ROOT;
 	}
 	
 	/* From here on the file handle belongs to the nfsnode. */
 	np->n_fhp = nfhp;
 	/*
 	 * For NFSv4, we have to attach the directory file handle and
 	 * file name, so that Open Ops can be done later.
 	 */
 	if (nmp->nm_flag & NFSMNT_NFSV4) {
 		MALLOC(np->n_v4, struct nfsv4node *, sizeof (struct nfsv4node)
 		    + dnp->n_fhp->nfh_len + cnp->cn_namelen - 1, M_NFSV4NODE,
 		    M_WAITOK);
 		np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len;
 		np->n_v4->n4_namelen = cnp->cn_namelen;
 		NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 		    dnp->n_fhp->nfh_len);
 		NFSBCOPY(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 		    cnp->cn_namelen);
 	} else {
 		np->n_v4 = NULL;
 	}
 
 	/*
 	 * NFS supports recursive and shared locking.
 	 */
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_NOWITNESS, NULL);
 	VN_LOCK_AREC(vp);
 	VN_LOCK_ASHARE(vp);
 	error = insmntque(vp, mntp);
 	if (error != 0) {
 		/* Undo everything that was set up for the new nfsnode above. */
 		*npp = NULL;
 		mtx_destroy(&np->n_mtx);
 		FREE((caddr_t)nfhp, M_NFSFH);
 		if (np->n_v4 != NULL)
 			FREE((caddr_t)np->n_v4, M_NFSV4NODE);
 		uma_zfree(newnfsnode_zone, np);
 		return (error);
 	}
 	error = vfs_hash_insert(vp, hash, lkflags, 
 	    td, &nvp, newnfs_vncmpf, nfhp);
 	if (error)
 		return (error);
 	if (nvp != NULL) {
 		/* Another vnode for this file handle won; hand that one back. */
 		*npp = VTONFS(nvp);
 		/* vfs_hash_insert() vput()'s the losing vnode */
 		return (0);
 	}
 	*npp = np;
 
 	return (0);
 }
 
 /*
  * Another variant of nfs_nget(). This one is only used by reopen. It
  * takes almost the same args as nfs_nget(), but only succeeds if an entry
  * exists in the cache. (Since files should already be "open" with a
  * vnode ref cnt on the node when reopen calls this, it should always
  * succeed.)
  * Also, don't get a vnode lock, since it may already be locked by some
  * other process that is handling it. This is ok, since all other threads
  * on the client are blocked by the nfsc_lock being exclusively held by the
  * caller of this function.
  * Returns 0 with *npp set when the vnode is found, EINTR when a forced
  * dismount is in progress, EINVAL when no entry exists, or the error from
  * vfs_hash_get(). *npp is NULL whenever a non-zero value is returned.
  */
 int
 nfscl_ngetreopen(struct mount *mntp, u_int8_t *fhp, int fhsize,
     struct thread *td, struct nfsnode **npp)
 {
 	struct vnode *nvp;
 	u_int hash;
 	struct nfsfh *nfhp;
 	int error;
 
 	*npp = NULL;
 	/* For forced dismounts, just return error. */
 	if ((mntp->mnt_kern_flag & MNTK_UNMOUNTF))
 		return (EINTR);
 	/* Build a temporary nfsfh so that newnfs_vncmpf() can compare handles. */
 	MALLOC(nfhp, struct nfsfh *, sizeof (struct nfsfh) + fhsize,
 	    M_NFSFH, M_WAITOK);
 	bcopy(fhp, &nfhp->nfh_fh[0], fhsize);
 	nfhp->nfh_len = fhsize;
 
 	hash = fnv_32_buf(fhp, fhsize, FNV1_32_INIT);
 
 	/*
 	 * First, try to get the vnode locked, but don't block for the lock.
 	 */
 	error = vfs_hash_get(mntp, hash, (LK_EXCLUSIVE | LK_NOWAIT), td, &nvp,
 	    newnfs_vncmpf, nfhp);
 	if (error == 0 && nvp != NULL) {
 		/* Got it locked; unlock again, no vnode lock is kept on return. */
 		NFSVOPUNLOCK(nvp, 0);
 	} else if (error == EBUSY) {
 		/*
 		 * The LK_EXCLOTHER lock type tells nfs_lock1() to not try
 		 * and lock the vnode, but just get a v_usecount on it.
 		 * LK_NOWAIT is set so that when vget() returns ENOENT,
 		 * vfs_hash_get() fails instead of looping.
 		 * If this succeeds, it is safe so long as a vflush() with
 		 * FORCECLOSE has not been done. Since the Renew thread is
 		 * stopped and the MNTK_UNMOUNTF flag is set before doing
 		 * a vflush() with FORCECLOSE, we should be ok here.
 		 */
 		if ((mntp->mnt_kern_flag & MNTK_UNMOUNTF))
 			error = EINTR;
 		else
 			error = vfs_hash_get(mntp, hash,
 			    (LK_EXCLOTHER | LK_NOWAIT), td, &nvp,
 			    newnfs_vncmpf, nfhp);
 	}
 	/* The temporary file handle was only needed for the lookups. */
 	FREE(nfhp, M_NFSFH);
 	if (error)
 		return (error);
 	if (nvp != NULL) {
 		*npp = VTONFS(nvp);
 		return (0);
 	}
 	/* No error, but nothing in the cache either. */
 	return (EINVAL);
 }
 
 /*
  * Load the attribute cache (that lives in the nfsnode entry) with
  * the attributes of the second argument and
  * Iff vaper not NULL
  *    copy the attributes to *vaper
  * Similar to nfs_loadattrcache(), except the attributes are passed in
  * instead of being parsed out of the mbuf list.
  * When writeattr is non-zero only the filerev, size, mtime, ctime, fsid
  * and mode fields are copied into the cache; otherwise the whole nfsvattr
  * is copied. When dontshrink is non-zero a smaller size from the server
  * does not shrink a regular file; the cache is marked stale instead.
  * Always returns 0.
  */
 int
 nfscl_loadattrcache(struct vnode **vpp, struct nfsvattr *nap, void *nvaper,
     void *stuff, int writeattr, int dontshrink)
 {
 	struct vnode *vp = *vpp;
 	struct vattr *vap, *nvap = &nap->na_vattr, *vaper = nvaper;
 	struct nfsnode *np;
 	struct nfsmount *nmp;
 	struct timespec mtime_save;
 	u_quad_t nsize;
 	int setnsize;
 
 	/*
 	 * If v_type == VNON it is a new node, so fill in the v_type,
 	 * n_mtime fields. Check to see if it represents a special
 	 * device, and if so, check for a possible alias. Once the
 	 * correct vnode has been obtained, fill in the rest of the
 	 * information.
 	 */
 	np = VTONFS(vp);
 	NFSLOCKNODE(np);
 	if (vp->v_type != nvap->va_type) {
 		vp->v_type = nvap->va_type;
 		if (vp->v_type == VFIFO)
 			vp->v_op = &newnfs_fifoops;
 		np->n_mtime = nvap->va_mtime;
 	}
 	nmp = VFSTONFS(vp->v_mount);
 	vap = &np->n_vattr.na_vattr;
 	/* Remember the cached mtime so a backwards step can be detected below. */
 	mtime_save = vap->va_mtime;
 	if (writeattr) {
 		np->n_vattr.na_filerev = nap->na_filerev;
 		np->n_vattr.na_size = nap->na_size;
 		np->n_vattr.na_mtime = nap->na_mtime;
 		np->n_vattr.na_ctime = nap->na_ctime;
 		np->n_vattr.na_fsid = nap->na_fsid;
 		np->n_vattr.na_mode = nap->na_mode;
 	} else {
 		NFSBCOPY((caddr_t)nap, (caddr_t)&np->n_vattr,
 		    sizeof (struct nfsvattr));
 	}
 
 	/*
 	 * For NFSv4, if the node's fsid is not equal to the mount point's
 	 * fsid, return the low order 32bits of the node's fsid. This
 	 * allows getcwd(3) to work. There is a chance that the fsid might
 	 * be the same as a local fs, but since this is in an NFS mount
 	 * point, I don't think that will cause any problems?
 	 */
 	if (NFSHASNFSV4(nmp) && NFSHASHASSETFSID(nmp) &&
 	    (nmp->nm_fsid[0] != np->n_vattr.na_filesid[0] ||
 	     nmp->nm_fsid[1] != np->n_vattr.na_filesid[1])) {
 		/*
 		 * va_fsid needs to be set to some value derived from
 		 * np->n_vattr.na_filesid that is not equal
 		 * vp->v_mount->mnt_stat.f_fsid[0], so that it changes
 		 * from the value used for the top level server volume
 		 * in the mounted subtree.
 		 */
 		if (vp->v_mount->mnt_stat.f_fsid.val[0] !=
 		    (uint32_t)np->n_vattr.na_filesid[0])
 			vap->va_fsid = (uint32_t)np->n_vattr.na_filesid[0];
 		else
 			vap->va_fsid = (uint32_t)hash32_buf(
 			    np->n_vattr.na_filesid, 2 * sizeof(uint64_t), 0);
 	} else
 		vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	/* Stamp the cache as freshly loaded; the checks below may zero it again. */
 	np->n_attrstamp = time_second;
 	setnsize = 0;
 	nsize = 0;
 	if (vap->va_size != np->n_size) {
 		if (vap->va_type == VREG) {
 			if (dontshrink && vap->va_size < np->n_size) {
 				/*
 				 * We've been told not to shrink the file;
 				 * zero np->n_attrstamp to indicate that
 				 * the attributes are stale.
 				 */
 				vap->va_size = np->n_size;
 				np->n_attrstamp = 0;
 				KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 				vnode_pager_setsize(vp, np->n_size);
 			} else if (np->n_flag & NMODIFIED) {
 				/*
 				 * We've modified the file: Use the larger
 				 * of our size, and the server's size.
 				 */
 				if (vap->va_size < np->n_size) {
 					vap->va_size = np->n_size;
 				} else {
 					np->n_size = vap->va_size;
 					np->n_flag |= NSIZECHANGED;
 				}
 				vnode_pager_setsize(vp, np->n_size);
 			} else if (vap->va_size < np->n_size) {
 				/*
 				 * When shrinking the size, the call to
 				 * vnode_pager_setsize() cannot be done
 				 * with the mutex held, so delay it until
 				 * after the mtx_unlock call.
 				 */
 				nsize = np->n_size = vap->va_size;
 				np->n_flag |= NSIZECHANGED;
 				setnsize = 1;
 			} else {
 				np->n_size = vap->va_size;
 				np->n_flag |= NSIZECHANGED;
 				vnode_pager_setsize(vp, np->n_size);
 			}
 		} else {
 			np->n_size = vap->va_size;
 		}
 	}
 	/*
 	 * The following checks are added to prevent a race between (say)
 	 * a READDIR+ and a WRITE.
 	 * READDIR+, WRITE requests sent out.
 	 * READDIR+ resp, WRITE resp received on client.
 	 * However, the WRITE resp was handled before the READDIR+ resp
 	 * causing the post op attrs from the write to be loaded first
 	 * and the attrs from the READDIR+ to be loaded later. If this
 	 * happens, we have stale attrs loaded into the attrcache.
 	 * We detect this by checking for the mtime moving back. We
 	 * invalidate the attrcache when this happens.
 	 */
 	if (timespeccmp(&mtime_save, &vap->va_mtime, >)) {
 		/* mtime went backwards */
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	}
 	if (vaper != NULL) {
 		NFSBCOPY((caddr_t)vap, (caddr_t)vaper, sizeof(*vap));
 		/* With NCHG set, report the nfsnode's own atime/mtime instead. */
 		if (np->n_flag & NCHG) {
 			if (np->n_flag & NACC)
 				vaper->va_atime = np->n_atim;
 			if (np->n_flag & NUPD)
 				vaper->va_mtime = np->n_mtim;
 		}
 	}
 #ifdef KDTRACE_HOOKS
 	/* Only report a load when the cache was not invalidated above. */
 	if (np->n_attrstamp != 0)
 		KDTRACE_NFS_ATTRCACHE_LOAD_DONE(vp, vap, 0);
 #endif
 	NFSUNLOCKNODE(np);
 	/* Deferred shrink, now that the node mutex has been dropped. */
 	if (setnsize)
 		vnode_pager_setsize(vp, nsize);
 	return (0);
 }
 
 /*
  * Fill in the client id name. For these bytes:
  * 1 - they must be unique
  * 2 - they should be persistent across client reboots
  * 1 is more critical than 2
  * The idlen bytes at cp are built from the mount point's unique id
  * (clval), followed by the uuid string when it fits, followed by random
  * bytes for whatever space remains.
  */
 void
 nfscl_fillclid(u_int64_t clval, char *uuid, u_int8_t *cp, u_int16_t idlen)
 {
 	int ulen;
 
 	/* Lead with the 64bit mount point identifier, when there is room. */
 	if (idlen >= sizeof (u_int64_t)) {
 		NFSBCOPY((caddr_t)&clval, cp, sizeof (u_int64_t));
 		idlen -= sizeof (u_int64_t);
 		cp += sizeof (u_int64_t);
 	}
 
 	/* Append the uuid if it is non-empty and fits completely. */
 	ulen = strlen(uuid);
 	if (ulen > 0 && idlen >= ulen) {
 		NFSBCOPY(uuid, cp, ulen);
 		idlen -= ulen;
 		cp += ulen;
 	}
 
 	/* Normally only needed if the uuid isn't set: pad with random junk. */
 	for (; idlen > 0; idlen--)
 		*cp++ = (u_int8_t)(arc4random() % 256);
 }
 
 /*
  * Fill in a lock owner name at cp.
  * For F_POSIX, id is a "struct proc *" and the name is the pid followed
  * by the seconds and microseconds of the process's start time, each
  * stored as the four bytes of a u_int32_t.
  * For F_FLOCK, the name is the id pointer value itself, zero padded.
  * Anything else (or a NULL id) gets an all zero name and a console message.
  */
 void
 nfscl_filllockowner(void *id, u_int8_t *cp, int flags)
 {
 	union {
 		u_int32_t	lval;
 		u_int8_t	cval[4];
 	} word;
 	struct proc *owner;
 	u_int32_t parts[3];
 	int i, j;
 
 	if (id == NULL) {
 		printf("NULL id\n");
 		bzero(cp, NFSV4CL_LOCKNAMELEN);
 		return;
 	}
 	if ((flags & F_POSIX) == 0) {
 		if ((flags & F_FLOCK) != 0) {
 			bcopy(&id, cp, sizeof(id));
 			bzero(&cp[sizeof(id)],
 			    NFSV4CL_LOCKNAMELEN - sizeof(id));
 		} else {
 			printf("nfscl_filllockowner: not F_POSIX or F_FLOCK\n");
 			bzero(cp, NFSV4CL_LOCKNAMELEN);
 		}
 		return;
 	}
 
 	/* F_POSIX: three 32bit words, emitted in their in-memory byte order. */
 	owner = (struct proc *)id;
 	parts[0] = owner->p_pid;
 	parts[1] = owner->p_stats->p_start.tv_sec;
 	parts[2] = owner->p_stats->p_start.tv_usec;
 	for (i = 0; i < 3; i++) {
 		word.lval = parts[i];
 		for (j = 0; j < 4; j++)
 			*cp++ = word.cval[j];
 	}
 }
 
 /*
  * Find the parent process for the thread passed in as an argument.
  * Returns NULL when td is NULL, when td belongs to pid 0 or when the
  * process has no parent; otherwise returns the first thread on the
  * parent's thread list.
  * (Can be any of the threads, since it is only used for td->td_proc.)
  */
 NFSPROC_T *
 nfscl_getparent(struct thread *td)
 {
 	struct proc *parent;
 
 	if (td == NULL || td->td_proc->p_pid == 0)
 		return (NULL);
 	parent = td->td_proc->p_pptr;
 	if (parent != NULL)
 		return (TAILQ_FIRST(&parent->p_threads));
 	return (NULL);
 }
 
 /*
  * Entry point of the renew kernel process: run nfscl_renewthread() for
  * the client passed in arg, using the first thread of the renew process,
  * and exit the process when it returns.
  */
 static void
 start_nfscl(void *arg)
 {
 	struct nfsclclient *client;
 
 	client = (struct nfsclclient *)arg;
 	nfscl_renewthread(client,
 	    TAILQ_FIRST(&client->nfsc_renewthread->p_threads));
 	kproc_exit(0);
 }
 
 /*
  * Create the "nfscl" kernel process that runs start_nfscl() for this
  * client; the new process is recorded in clp->nfsc_renewthread.
  * The result of kproc_create() is not checked.
  */
 void
 nfscl_start_renewthread(struct nfsclclient *clp)
 {
 	struct proc **renewpp;
 
 	renewpp = &clp->nfsc_renewthread;
 	kproc_create(start_nfscl, (void *)clp, renewpp, 0, 0, "nfscl");
 }
 
 /*
  * Handle wcc_data.
  * For NFSv3, the optional pre-op attributes are dissected here and the
  * post-op attributes are then loaded via nfscl_postop_attr().
  * For NFSv4, it assumes that nfsv4_wccattr() was used to set up the getattr
  * as the first Op after PutFH.
  * (For NFSv4, the postop attributes are after the Op, so they can't be
  *  parsed here. A separate call to nfscl_postop_attr() is required.)
  * If wccflagp is not NULL, *wccflagp is set non-zero only when the mtime
  * in the reply matches the n_mtime saved in the nfsnode, in both seconds
  * and nanoseconds; it is 0 otherwise.
  * Returns 0, or the error from dissecting the reply or loading attributes.
  */
 int
 nfscl_wcc_data(struct nfsrv_descript *nd, struct vnode *vp,
     struct nfsvattr *nap, int *flagp, int *wccflagp, void *stuff)
 {
 	u_int32_t *tl;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsvattr nfsva;
 	int error = 0;
 
 	if (wccflagp != NULL)
 		*wccflagp = 0;
 	if (nd->nd_flag & ND_NFSV3) {
 		*flagp = 0;
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		if (*tl == newnfs_true) {
 			/*
 			 * Pre-op attributes are present: six words, of which
 			 * words 2 and 3 are compared with the saved mtime.
 			 */
 			NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
 			if (wccflagp != NULL) {
 				mtx_lock(&np->n_mtx);
 				*wccflagp = (np->n_mtime.tv_sec ==
 				    fxdr_unsigned(u_int32_t, *(tl + 2)) &&
 				    np->n_mtime.tv_nsec ==
 				    fxdr_unsigned(u_int32_t, *(tl + 3)));
 				mtx_unlock(&np->n_mtx);
 			}
 		}
 		error = nfscl_postop_attr(nd, nap, flagp, stuff);
 	} else if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR))
 	    == (ND_NFSV4 | ND_V4WCCATTR)) {
 		error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 		    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 		    NULL, NULL, NULL, NULL, NULL);
 		if (error)
 			return (error);
 		/*
 		 * Get rid of Op# and status for next op.
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		if (*++tl)
 			nd->nd_flag |= ND_NOMOREDATA;
 		if (wccflagp != NULL &&
 		    nfsva.na_vattr.va_mtime.tv_sec != 0) {
 			/*
 			 * Compare seconds with seconds and nanoseconds with
 			 * nanoseconds, exactly as the NFSv3 case above does.
 			 */
 			mtx_lock(&np->n_mtx);
 			*wccflagp = (np->n_mtime.tv_sec ==
 			    nfsva.na_vattr.va_mtime.tv_sec &&
 			    np->n_mtime.tv_nsec ==
 			    nfsva.na_vattr.va_mtime.tv_nsec);
 			mtx_unlock(&np->n_mtx);
 		}
 	}
 nfsmout:
 	return (error);
 }
 
 /*
  * Get postop attributes.
  * *retp is set to 1 when attributes were present in the reply and were
  * loaded into *nap, and to 0 otherwise (including when loading failed).
  * Returns 0, or the error from dissecting the reply or nfsm_loadattr().
  */
 int
 nfscl_postop_attr(struct nfsrv_descript *nd, struct nfsvattr *nap, int *retp,
     void *stuff)
 {
 	u_int32_t *tl;
 	int error = 0;
 
 	*retp = 0;
 	/* Nothing left in the reply to parse. */
 	if (nd->nd_flag & ND_NOMOREDATA)
 		return (error);
 	if (nd->nd_flag & ND_NFSV3) {
 		/* NFSv3 precedes the attributes with an "attributes follow" word. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		*retp = fxdr_unsigned(int, *tl);
 	} else if (nd->nd_flag & ND_NFSV4) {
 		/*
 		 * For NFSv4, the postop attr are at the end, so no point
 		 * in looking if nd_repstat != 0.
 		 */
 		if (!nd->nd_repstat) {
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			if (*(tl + 1))
 				/* should never happen since nd_repstat == 0 */
 				nd->nd_flag |= ND_NOMOREDATA;
 			else
 				*retp = 1;
 		}
 	} else if (!nd->nd_repstat) {
 		/* For NFSv2, the attributes are here iff nd_repstat == 0 */
 		*retp = 1;
 	}
 	if (*retp) {
 		error = nfsm_loadattr(nd, nap);
 		if (error)
 			*retp = 0;
 	}
 nfsmout:
 	return (error);
 }
 
 /*
  * Fill in the setable attributes for a request being built in nd.
  * The NFSSATTR_FULL bit in flags indicates whether to fill in them all
  * (uid, gid and size too) or just mode and time; it is consulted for
  * NFSv3 and NFSv4 only. For NFSv2 the NFSSATTR_SIZE0, NFSSATTR_SIZENEG1
  * and NFSSATTR_SIZERDEV bits select what goes in the size field, with
  * rdev being used for NFSSATTR_SIZERDEV.
  */
 void
 nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap,
     struct vnode *vp, int flags, u_int32_t rdev)
 {
 	u_int32_t *tl;
 	struct nfsv2_sattr *sp;
 	nfsattrbit_t attrbits;
 
 	switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) {
 	case ND_NFSV2:
 		/*
 		 * NFSv2 always sends the whole sattr structure; mode, uid
 		 * and gid that are VNOVAL are sent as newnfs_xdrneg1.
 		 */
 		NFSM_BUILD(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
 		if (vap->va_mode == (mode_t)VNOVAL)
 			sp->sa_mode = newnfs_xdrneg1;
 		else
 			sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
 		if (vap->va_uid == (uid_t)VNOVAL)
 			sp->sa_uid = newnfs_xdrneg1;
 		else
 			sp->sa_uid = txdr_unsigned(vap->va_uid);
 		if (vap->va_gid == (gid_t)VNOVAL)
 			sp->sa_gid = newnfs_xdrneg1;
 		else
 			sp->sa_gid = txdr_unsigned(vap->va_gid);
 		if (flags & NFSSATTR_SIZE0)
 			sp->sa_size = 0;
 		else if (flags & NFSSATTR_SIZENEG1)
 			sp->sa_size = newnfs_xdrneg1;
 		else if (flags & NFSSATTR_SIZERDEV)
 			sp->sa_size = txdr_unsigned(rdev);
 		else
 			sp->sa_size = txdr_unsigned(vap->va_size);
 		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 		break;
 	case ND_NFSV3:
 		/*
 		 * NFSv3: mode, uid, gid and size are each preceded by a
 		 * true/false word saying whether a value follows.
 		 */
 		if (vap->va_mode != (mode_t)VNOVAL) {
 			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = newnfs_true;
 			*tl = txdr_unsigned(vap->va_mode);
 		} else {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = newnfs_false;
 		}
 		if ((flags & NFSSATTR_FULL) && vap->va_uid != (uid_t)VNOVAL) {
 			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = newnfs_true;
 			*tl = txdr_unsigned(vap->va_uid);
 		} else {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = newnfs_false;
 		}
 		if ((flags & NFSSATTR_FULL) && vap->va_gid != (gid_t)VNOVAL) {
 			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = newnfs_true;
 			*tl = txdr_unsigned(vap->va_gid);
 		} else {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = newnfs_false;
 		}
 		if ((flags & NFSSATTR_FULL) && vap->va_size != VNOVAL) {
 			NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 			*tl++ = newnfs_true;
 			txdr_hyper(vap->va_size, tl);
 		} else {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = newnfs_false;
 		}
 		/*
 		 * Times carry a three way selector: a client supplied time,
 		 * the server's time (VA_UTIMES_NULL set) or no change.
 		 */
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			if ((vap->va_vaflags & VA_UTIMES_NULL) == 0) {
 				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 				*tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT);
 				txdr_nfsv3time(&vap->va_atime, tl);
 			} else {
 				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 				*tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER);
 			}
 		} else {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE);
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			if ((vap->va_vaflags & VA_UTIMES_NULL) == 0) {
 				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 				*tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT);
 				txdr_nfsv3time(&vap->va_mtime, tl);
 			} else {
 				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 				*tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER);
 			}
 		} else {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE);
 		}
 		break;
 	case ND_NFSV4:
 		/*
 		 * NFSv4: collect the attributes to set in a bitmap and let
 		 * nfsv4_fillattr() build them; its return value is ignored.
 		 */
 		NFSZERO_ATTRBIT(&attrbits);
 		if (vap->va_mode != (mode_t)VNOVAL)
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MODE);
 		if ((flags & NFSSATTR_FULL) && vap->va_uid != (uid_t)VNOVAL)
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNER);
 		if ((flags & NFSSATTR_FULL) && vap->va_gid != (gid_t)VNOVAL)
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNERGROUP);
 		if ((flags & NFSSATTR_FULL) && vap->va_size != VNOVAL)
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE);
 		if (vap->va_atime.tv_sec != VNOVAL)
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET);
 		if (vap->va_mtime.tv_sec != VNOVAL)
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFYSET);
 		(void) nfsv4_fillattr(nd, vp->v_mount, vp, NULL, vap, NULL, 0,
 		    &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0);
 		break;
 	};
 }
 
 /*
  * nfscl_request() - mostly a wrapper for newnfs_request().
  * It supplies the nfsmount and its socket request structure for vp's
  * mount point and the NFS version selected by nd->nd_flag, and returns
  * whatever newnfs_request() returns.
  */
 int
 nfscl_request(struct nfsrv_descript *nd, struct vnode *vp, NFSPROC_T *p,
     struct ucred *cred, void *stuff)
 {
 	struct nfsmount *mnt;
 	int rpcvers;
 
 	mnt = VFSTONFS(vp->v_mount);
 	/* NFSv4 takes precedence over NFSv3; NFSv2 is the fallback. */
 	rpcvers = NFS_VER2;
 	if (nd->nd_flag & ND_NFSV4)
 		rpcvers = NFS_VER4;
 	else if (nd->nd_flag & ND_NFSV3)
 		rpcvers = NFS_VER3;
 	return (newnfs_request(nd, mnt, NULL, &mnt->nm_sockreq, vp, p, cred,
 	    NFS_PROG, rpcvers, NULL, 1, NULL, NULL));
 }
 
 /*
  * Fill in this BSD's variant of statfs (passed as a void pointer) from
  * the nfsstatfs structure.
  */
 void
 nfscl_loadsbinfo(struct nfsmount *nmp, struct nfsstatfs *sfp, void *statfs)
 {
 	struct statfs *out;
 
 	out = (struct statfs *)statfs;
 	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) {
 		/*
 		 * NFSv2. The type casts to (int32_t) ensure that this code
 		 * is compatible with the old NFS client, in that it will
 		 * propagate bit31 to the high order bits. This may or may
 		 * not be correct for NFSv2, but since it is a legacy
 		 * environment, I'd rather retain backwards compatibility.
 		 */
 		out->f_bsize = (int32_t)sfp->sf_bsize;
 		out->f_blocks = (int32_t)sfp->sf_blocks;
 		out->f_bfree = (int32_t)sfp->sf_bfree;
 		out->f_bavail = (int32_t)sfp->sf_bavail;
 		out->f_files = 0;
 		out->f_ffree = 0;
 		return;
 	}
 
 	/* NFSv3 and NFSv4: byte counts are converted to NFS_FABLKSIZE blocks. */
 	out->f_bsize = NFS_FABLKSIZE;
 	out->f_blocks = sfp->sf_tbytes / NFS_FABLKSIZE;
 	out->f_bfree = sfp->sf_fbytes / NFS_FABLKSIZE;
 	/*
 	 * Although sf_abytes is uint64_t and f_bavail is int64_t,
 	 * the value after dividing by NFS_FABLKSIZE is small
 	 * enough that it will fit in 63bits, so it is ok to
 	 * assign it to f_bavail without fear that it will become
 	 * negative.
 	 */
 	out->f_bavail = sfp->sf_abytes / NFS_FABLKSIZE;
 	out->f_files = sfp->sf_tfiles;
 	/* Since f_ffree is int64_t, clip it to 63bits. */
 	if (sfp->sf_ffiles <= INT64_MAX)
 		out->f_ffree = sfp->sf_ffiles;
 	else
 		out->f_ffree = INT64_MAX;
 }
 
 /*
  * Use the fsinfo stuff to update the mount point.
  * The write, read and readdir transfer sizes in nmp are adjusted from the
  * server's preferred and maximum values, the maximum file size is lowered
  * if the server's is smaller, mnt_stat.f_iosize is recomputed and
  * NFSSTA_GOTFSINFO is set in nm_state.
  */
 void
 nfscl_loadfsinfo(struct nfsmount *nmp, struct nfsfsinfo *fsp)
 {
 
 	/*
 	 * Write size: take the server's preferred size when none is set or
 	 * it is smaller, rounded up to a multiple of NFS_FABLKSIZE; then
 	 * clamp to the server's maximum, rounded down, and never let it
 	 * drop below NFS_FABLKSIZE.
 	 */
 	if ((nmp->nm_wsize == 0 || fsp->fs_wtpref < nmp->nm_wsize) &&
 	    fsp->fs_wtpref >= NFS_FABLKSIZE)
 		nmp->nm_wsize = (fsp->fs_wtpref + NFS_FABLKSIZE - 1) &
 		    ~(NFS_FABLKSIZE - 1);
 	if (fsp->fs_wtmax < nmp->nm_wsize && fsp->fs_wtmax > 0) {
 		nmp->nm_wsize = fsp->fs_wtmax & ~(NFS_FABLKSIZE - 1);
 		if (nmp->nm_wsize == 0)
 			nmp->nm_wsize = fsp->fs_wtmax;
 	}
 	if (nmp->nm_wsize < NFS_FABLKSIZE)
 		nmp->nm_wsize = NFS_FABLKSIZE;
 	/* Read size: same steps, using the read preferred/maximum values. */
 	if ((nmp->nm_rsize == 0 || fsp->fs_rtpref < nmp->nm_rsize) &&
 	    fsp->fs_rtpref >= NFS_FABLKSIZE)
 		nmp->nm_rsize = (fsp->fs_rtpref + NFS_FABLKSIZE - 1) &
 		    ~(NFS_FABLKSIZE - 1);
 	if (fsp->fs_rtmax < nmp->nm_rsize && fsp->fs_rtmax > 0) {
 		nmp->nm_rsize = fsp->fs_rtmax & ~(NFS_FABLKSIZE - 1);
 		if (nmp->nm_rsize == 0)
 			nmp->nm_rsize = fsp->fs_rtmax;
 	}
 	if (nmp->nm_rsize < NFS_FABLKSIZE)
 		nmp->nm_rsize = NFS_FABLKSIZE;
 	/*
 	 * Readdir size: same steps in units of NFS_DIRBLKSIZ. Note that the
 	 * clamp uses the read maximum (fs_rtmax); no separate directory
 	 * maximum is consulted here.
 	 */
 	if ((nmp->nm_readdirsize == 0 || fsp->fs_dtpref < nmp->nm_readdirsize)
 	    && fsp->fs_dtpref >= NFS_DIRBLKSIZ)
 		nmp->nm_readdirsize = (fsp->fs_dtpref + NFS_DIRBLKSIZ - 1) &
 		    ~(NFS_DIRBLKSIZ - 1);
 	if (fsp->fs_rtmax < nmp->nm_readdirsize && fsp->fs_rtmax > 0) {
 		nmp->nm_readdirsize = fsp->fs_rtmax & ~(NFS_DIRBLKSIZ - 1);
 		if (nmp->nm_readdirsize == 0)
 			nmp->nm_readdirsize = fsp->fs_rtmax;
 	}
 	if (nmp->nm_readdirsize < NFS_DIRBLKSIZ)
 		nmp->nm_readdirsize = NFS_DIRBLKSIZ;
 	/* Only ever lower the maximum file size, and ignore a zero value. */
 	if (fsp->fs_maxfilesize > 0 &&
 	    fsp->fs_maxfilesize < nmp->nm_maxfilesize)
 		nmp->nm_maxfilesize = fsp->fs_maxfilesize;
 	nmp->nm_mountp->mnt_stat.f_iosize = newnfs_iosize(nmp);
 	nmp->nm_state |= NFSSTA_GOTFSINFO;
 }
 
 /*
  * Get a pointer to my IP address and return it.
  * Return NULL if you can't find one.
  * The address is taken from the interface address of the route to the
  * server (nmp->nm_nam), provided that interface is not a loopback one.
  * *isinet6p is set to 1 when an IPv6 address is returned, 0 otherwise.
  * The returned pointer refers to a static variable inside this function,
  * so a later call overwrites the value seen by an earlier caller.
  */
 u_int8_t *
 nfscl_getmyip(struct nfsmount *nmp, int *isinet6p)
 {
 	struct sockaddr_in sad, *sin;
 	struct rtentry *rt;
 	u_int8_t *retp = NULL;
 	static struct in_addr laddr;
 
 	*isinet6p = 0;
 	/*
 	 * Look up a route for the destination address.
 	 */
 	if (nmp->nm_nam->sa_family == AF_INET) {
 		bzero(&sad, sizeof (sad));
 		sin = (struct sockaddr_in *)nmp->nm_nam;
 		sad.sin_family = AF_INET;
 		sad.sin_len = sizeof (struct sockaddr_in);
 		sad.sin_addr.s_addr = sin->sin_addr.s_addr;
 		CURVNET_SET(CRED_TO_VNET(nmp->nm_sockreq.nr_cred));
 		rt = rtalloc1_fib((struct sockaddr *)&sad, 0, 0UL,
 		     curthread->td_proc->p_fibnum);
 		if (rt != NULL) {
 			/* Use the route's interface address unless it is loopback. */
 			if (rt->rt_ifp != NULL &&
 			    rt->rt_ifa != NULL &&
 			    ((rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) &&
 			    rt->rt_ifa->ifa_addr->sa_family == AF_INET) {
 				sin = (struct sockaddr_in *)
 				    rt->rt_ifa->ifa_addr;
 				laddr.s_addr = sin->sin_addr.s_addr;
 				retp = (u_int8_t *)&laddr;
 			}
 			RTFREE_LOCKED(rt);
 		}
 		CURVNET_RESTORE();
 #ifdef INET6
 	} else if (nmp->nm_nam->sa_family == AF_INET6) {
 		/* Same lookup for an IPv6 server address. */
 		struct sockaddr_in6 sad6, *sin6;
 		static struct in6_addr laddr6;
 
 		bzero(&sad6, sizeof (sad6));
 		sin6 = (struct sockaddr_in6 *)nmp->nm_nam;
 		sad6.sin6_family = AF_INET6;
 		sad6.sin6_len = sizeof (struct sockaddr_in6);
 		sad6.sin6_addr = sin6->sin6_addr;
 		CURVNET_SET(CRED_TO_VNET(nmp->nm_sockreq.nr_cred));
 		rt = rtalloc1_fib((struct sockaddr *)&sad6, 0, 0UL,
 		     curthread->td_proc->p_fibnum);
 		if (rt != NULL) {
 			if (rt->rt_ifp != NULL &&
 			    rt->rt_ifa != NULL &&
 			    ((rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) &&
 			    rt->rt_ifa->ifa_addr->sa_family == AF_INET6) {
 				sin6 = (struct sockaddr_in6 *)
 				    rt->rt_ifa->ifa_addr;
 				laddr6 = sin6->sin6_addr;
 				retp = (u_int8_t *)&laddr6;
 				*isinet6p = 1;
 			}
 			RTFREE_LOCKED(rt);
 		}
 		CURVNET_RESTORE();
 #endif
 	}
 	return (retp);
 }
 
 /*
  * Copy NFS uid, gids from the cred structure.
  * At most NFS_MAXGRPS + 1 groups are copied; any further groups in the
  * credential are dropped.
  */
 void
 newnfs_copyincred(struct ucred *cr, struct nfscred *nfscr)
 {
 	int idx;
 
 	KASSERT(cr->cr_ngroups >= 0,
 	    ("newnfs_copyincred: negative cr_ngroups"));
 	nfscr->nfsc_uid = cr->cr_uid;
 	nfscr->nfsc_ngroups = MIN(cr->cr_ngroups, NFS_MAXGRPS + 1);
 	idx = 0;
 	while (idx < nfscr->nfsc_ngroups) {
 		nfscr->nfsc_groups[idx] = cr->cr_groups[idx];
 		idx++;
 	}
 }
 
 
 /*
  * One-time client initialization.  Only the first call does any work;
  * later calls return without touching anything.
  */
 void
 nfscl_init(void)
 {
 	static int inited = 0;
 
 	if (!inited) {
 		inited = 1;
 		nfscl_inited = 1;
 		ncl_pbuf_freecnt = nswbuf / 2 + 1;
 	}
 }
 
 /*
  * Compare each attribute that is about to be set (vap) against the
  * current attributes (nvap) and turn off, by resetting to VNOVAL, every
  * one that already has the requested value.  Always returns 1.
  */
 int
 nfscl_checksattr(struct vattr *vap, struct nfsvattr *nvap)
 {
 
 	if (vap->va_mode != (mode_t)VNOVAL &&
 	    vap->va_mode == nvap->na_mode)
 		vap->va_mode = (mode_t)VNOVAL;
 	if (vap->va_uid != (uid_t)VNOVAL &&
 	    vap->va_uid == nvap->na_uid)
 		vap->va_uid = (uid_t)VNOVAL;
 	if (vap->va_gid != (gid_t)VNOVAL &&
 	    vap->va_gid == nvap->na_gid)
 		vap->va_gid = (gid_t)VNOVAL;
 	if (vap->va_size != VNOVAL &&
 	    vap->va_size == nvap->na_size)
 		vap->va_size = VNOVAL;
 
 	/*
 	 * The VAP handed to us is normally only partially initialized.
 	 * The NFSv3 spec lets a server keep the verifier in the file
 	 * attributes and therefore requires a SETATTR RPC.  FreeBSD
 	 * servers use atime for that, but other servers may not, so
 	 * make sure the SETATTR carries both atime and mtime.
 	 */
 	if (vap->va_mtime.tv_sec == VNOVAL)
 		vfs_timestamp(&vap->va_mtime);
 	if (vap->va_atime.tv_sec == VNOVAL)
 		vap->va_atime = vap->va_mtime;
 	return (1);
 }
 
 /*
  * Translate an NFSv4 error into an errno.h value, logging a message for
  * the ones that get translated.
  * uid and gid are only reported for NFSERR_BADOWNER, which should only
  * come back from the Open, Create and Setattr Ops, so most callers can
  * simply pass 0 for both.
  */
 APPLESTATIC int
 nfscl_maperr(struct thread *td, int error, uid_t uid, gid_t gid)
 {
 	struct proc *p;
 	int mapped;
 
 	/* Values below 10000 are handed back unchanged. */
 	if (error < 10000)
 		return (error);
 	p = (td != NULL) ? td->td_proc : NULL;
 
 	/* Everything not singled out below maps to EIO. */
 	mapped = EIO;
 	switch (error) {
 	case NFSERR_BADOWNER:
 		tprintf(p, LOG_INFO,
 		    "No name and/or group mapping for uid,gid:(%d,%d)\n",
 		    uid, gid);
 		mapped = EPERM;
 		break;
 	case NFSERR_BADNAME:
 	case NFSERR_BADCHAR:
 		printf("nfsv4 char/name not handled by server\n");
 		mapped = ENOENT;
 		break;
 	case NFSERR_STALECLIENTID:
 	case NFSERR_STALESTATEID:
 	case NFSERR_EXPIRED:
 	case NFSERR_BADSTATEID:
 	case NFSERR_BADSESSION:
 		printf("nfsv4 recover err returned %d\n", error);
 		break;
 	case NFSERR_BADHANDLE:
 	case NFSERR_SERVERFAULT:
 	case NFSERR_BADTYPE:
 	case NFSERR_FHEXPIRED:
 	case NFSERR_RESOURCE:
 	case NFSERR_MOVED:
 	case NFSERR_NOFILEHANDLE:
 	case NFSERR_MINORVERMISMATCH:
 	case NFSERR_OLDSTATEID:
 	case NFSERR_BADSEQID:
 	case NFSERR_LEASEMOVED:
 	case NFSERR_RECLAIMBAD:
 	case NFSERR_BADXDR:
 	case NFSERR_OPILLEGAL:
 		printf("nfsv4 client/server protocol prob err=%d\n",
 		    error);
 		break;
 	default:
 		tprintf(p, LOG_INFO, "nfsv4 err=%d\n", error);
 		break;
 	}
 	return (mapped);
 }
 
 /*
  * Decide whether the process that an owner string refers to is gone.
  * The owner is read as three 32-bit values: the pid, then the seconds
  * and microseconds of the process start time.  Returns 1 when no such
  * process is found or its start time differs, and 0 otherwise
  * (including when the process has no p_stats to compare against).
  */
 int
 nfscl_procdoesntexist(u_int8_t *own)
 {
 	union {
 		u_int32_t	lval;
 		u_int8_t	cval[4];
 	} tl;
 	struct proc *p;
 	pid_t pid;
 	int gone, i;
 
 	/* Bytes 0-3: the pid. */
 	for (i = 0; i < 4; i++)
 		tl.cval[i] = own[i];
 	pid = tl.lval;
 	p = pfind_locked(pid);
 	if (p == NULL)
 		return (1);
 	if (p->p_stats == NULL) {
 		PROC_UNLOCK(p);
 		return (0);
 	}
 
 	/* Assume a mismatch until both start time fields compare equal. */
 	gone = 1;
 	/* Bytes 4-7: start time, seconds. */
 	for (i = 0; i < 4; i++)
 		tl.cval[i] = own[4 + i];
 	if (tl.lval == p->p_stats->p_start.tv_sec) {
 		/* Bytes 8-11: start time, microseconds. */
 		for (i = 0; i < 4; i++)
 			tl.cval[i] = own[8 + i];
 		if (tl.lval == p->p_stats->p_start.tv_usec)
 			gone = 0;
 	}
 	PROC_UNLOCK(p);
 	return (gone);
 }
 
 /*
  * - nfs pseudo system call for the client
  */
 /*
  * MPSAFE
  */
 static int
 nfssvc_nfscl(struct thread *td, struct nfssvc_args *uap)
 {
 	struct file *fp;
 	struct nfscbd_args nfscbdarg;
 	struct nfsd_nfscbd_args nfscbdarg2;
 	struct nameidata nd;
 	struct nfscl_dumpmntopts dumpmntopts;
 	cap_rights_t rights;
 	char *buf;
 	int error;
 
 	if (uap->flag & NFSSVC_CBADDSOCK) {
 		error = copyin(uap->argp, (caddr_t)&nfscbdarg, sizeof(nfscbdarg));
 		if (error)
 			return (error);
 		/*
 		 * Since we don't know what rights might be required,
 		 * pretend that we need them all. It is better to be too
 		 * careful than too reckless.
 		 */
 		error = fget(td, nfscbdarg.sock,
 		    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
 		if (error)
 			return (error);
 		if (fp->f_type != DTYPE_SOCKET) {
 			fdrop(fp, td);
 			return (EPERM);
 		}
 		error = nfscbd_addsock(fp);
 		fdrop(fp, td);
 		if (!error && nfscl_enablecallb == 0) {
 			nfsv4_cbport = nfscbdarg.port;
 			nfscl_enablecallb = 1;
 		}
 	} else if (uap->flag & NFSSVC_NFSCBD) {
 		if (uap->argp == NULL) 
 			return (EINVAL);
 		error = copyin(uap->argp, (caddr_t)&nfscbdarg2,
 		    sizeof(nfscbdarg2));
 		if (error)
 			return (error);
 		error = nfscbd_nfsd(td, &nfscbdarg2);
 	} else if (uap->flag & NFSSVC_DUMPMNTOPTS) {
 		error = copyin(uap->argp, &dumpmntopts, sizeof(dumpmntopts));
 		if (error == 0 && (dumpmntopts.ndmnt_blen < 256 ||
 		    dumpmntopts.ndmnt_blen > 1024))
 			error = EINVAL;
 		if (error == 0)
 			error = nfsrv_lookupfilename(&nd,
 			    dumpmntopts.ndmnt_fname, td);
 		if (error == 0 && strcmp(nd.ni_vp->v_mount->mnt_vfc->vfc_name,
 		    "nfs") != 0) {
 			vput(nd.ni_vp);
 			error = EINVAL;
 		}
 		if (error == 0) {
 			buf = malloc(dumpmntopts.ndmnt_blen, M_TEMP, M_WAITOK);
 			nfscl_retopts(VFSTONFS(nd.ni_vp->v_mount), buf,
 			    dumpmntopts.ndmnt_blen);
 			vput(nd.ni_vp);
 			error = copyout(buf, dumpmntopts.ndmnt_buf,
 			    dumpmntopts.ndmnt_blen);
 			free(buf, M_TEMP);
 		}
 	} else {
 		error = EINVAL;
 	}
 	return (error);
 }
 
 extern int (*nfsd_call_nfscl)(struct thread *, struct nfssvc_args *);
 
 /*
  * Module event handler for nfscl.
  * MOD_LOAD performs the one-time setup (mutexes, client init, callback
  * init, hook pointers); a repeated load is a no-op that returns 0.
  * MOD_UNLOAD returns EBUSY while nfs_numnfscbd is non-zero and
  * otherwise, like every other event, EOPNOTSUPP.
  */
 static int
 nfscl_modevent(module_t mod, int type, void *data)
 {
 	static int loaded = 0;
 	int error;
 
 	error = 0;
 	switch (type) {
 	case MOD_LOAD:
 		if (!loaded) {
 			newnfs_portinit();
 			mtx_init(&nfs_clstate_mutex, "nfs_clstate_mutex",
 			    NULL, MTX_DEF);
 			mtx_init(&ncl_iod_mutex, "ncl_iod_mutex", NULL,
 			    MTX_DEF);
 			nfscl_init();
 			NFSD_LOCK();
 			nfsrvd_cbinit(0);
 			NFSD_UNLOCK();
 			ncl_call_invalcaches = ncl_invalcaches;
 			nfsd_call_nfscl = nfssvc_nfscl;
 			loaded = 1;
 		}
 		break;
 
 	case MOD_UNLOAD:
 		if (nfs_numnfscbd != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * XXX: Unloading of nfscl module is unsupported.
 		 */
 #if 0
 		ncl_call_invalcaches = NULL;
 		nfsd_call_nfscl = NULL;
 		/* and get rid of the mutexes */
 		mtx_destroy(&nfs_clstate_mutex);
 		mtx_destroy(&ncl_iod_mutex);
 		loaded = 0;
 		break;
 #else
 		/* FALLTHROUGH */
 #endif
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return (error);
 }
 /*
  * Module descriptor: the module name, its event handler and a NULL
  * third field (presumably the handler's extra argument -- confirm
  * against moduledata_t).
  */
 static moduledata_t nfscl_mod = {
 	"nfscl",
 	nfscl_modevent,
 	NULL,
 };
 /* Register the module at the SI_SUB_VFS stage, SI_ORDER_FIRST. */
 DECLARE_MODULE(nfscl, nfscl_mod, SI_SUB_VFS, SI_ORDER_FIRST);
 
 /* So that loader and kldload(2) can find us, wherever we are.. */
 MODULE_VERSION(nfscl, 1);
 /* Modules that nfscl depends on. */
 MODULE_DEPEND(nfscl, nfscommon, 1, 1, 1);
 MODULE_DEPEND(nfscl, krpc, 1, 1, 1);
 MODULE_DEPEND(nfscl, nfssvc, 1, 1, 1);
 MODULE_DEPEND(nfscl, nfslock, 1, 1, 1);
 
Index: head/sys/fs/nfsclient/nfs_clsubs.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clsubs.c	(revision 258540)
+++ head/sys/fs/nfsclient/nfs_clsubs.c	(revision 258541)
@@ -1,404 +1,402 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from nfs_subs.c  8.8 (Berkeley) 5/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
-
 /*
  * These functions support the macros and help fiddle mbuf chains for
  * the nfs op functions. They do things like create the rpc header and
  * copy data between mbuf chains and uio lists.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/sysent.h>
 #include <sys/syscall.h>
 #include <sys/sysproto.h>
 #include <sys/taskqueue.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <fs/nfs/nfsport.h>
 #include <fs/nfsclient/nfsnode.h>
 #include <fs/nfsclient/nfsmount.h>
 #include <fs/nfsclient/nfs.h>
 #include <fs/nfsclient/nfs_kdtrace.h>
 
 #include <netinet/in.h>
 
 /*
  * Note that stdarg.h and the ANSI style va_start macro is used for both
  * ANSI and traditional C compilers.
  */
 #include <machine/stdarg.h>
 
 extern struct mtx ncl_iod_mutex;
 extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON];
 extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON];
 extern int ncl_numasync;
 extern unsigned int ncl_iodmax;
 extern struct nfsstats newnfsstats;
 
 struct task	ncl_nfsiodnew_task;
 
 /*
  * Filesystem uninit hook.  As compiled this always fails with
  * EOPNOTSUPP; the teardown sequence under "#if 0" is kept for reference
  * only.
  */
 int
 ncl_uninit(struct vfsconf *vfsp)
 {
 	/*
 	 * XXX: Unloading of nfscl module is unsupported.
 	 */
 #if 0
 	int i;
 
 	/*
 	 * Tell all nfsiod processes to exit. Clear ncl_iodmax, and wakeup
 	 * any sleeping nfsiods so they check ncl_iodmax and exit.
 	 */
 	mtx_lock(&ncl_iod_mutex);
 	ncl_iodmax = 0;
 	for (i = 0; i < ncl_numasync; i++)
 		if (ncl_iodwant[i] == NFSIOD_AVAILABLE)
 			wakeup(&ncl_iodwant[i]);
 	/* The last nfsiod to exit will wake us up when ncl_numasync hits 0 */
 	while (ncl_numasync)
 		msleep(&ncl_numasync, &ncl_iod_mutex, PWAIT, "ioddie", 0);
 	mtx_unlock(&ncl_iod_mutex);
 	ncl_nhuninit();
 	return (0);
 #else
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
  * Take the directory cookie lock of an nfsnode.  The lock is the
  * NDIRCOOKIELK bit in n_flag; while another holder has it set, sleep on
  * &np->n_flag (under n_mtx) until it is cleared, then set it.
  */
 void 
 ncl_dircookie_lock(struct nfsnode *np)
 {
 	mtx_lock(&np->n_mtx);
 	/* Re-test after every wakeup; someone else may have taken it. */
 	while (np->n_flag & NDIRCOOKIELK)
 		(void) msleep(&np->n_flag, &np->n_mtx, PZERO, "nfsdirlk", 0);
 	np->n_flag |= NDIRCOOKIELK;
 	mtx_unlock(&np->n_mtx);
 }
 
 /*
  * Release the directory cookie lock: clear NDIRCOOKIELK in n_flag and
  * wake up anything sleeping on &np->n_flag.
  */
 void 
 ncl_dircookie_unlock(struct nfsnode *np)
 {
 	mtx_lock(&np->n_mtx);
 	np->n_flag &= ~NDIRCOOKIELK;
 	wakeup(&np->n_flag);
 	mtx_unlock(&np->n_mtx);
 }
 
 /*
  * Make sure the (already locked) vnode is held exclusively, upgrading a
  * shared lock if necessary.  Returns the lock state found on entry so
  * that the caller can later hand it to ncl_downgrade_vnlock().
  */
 int
 ncl_upgrade_vnlock(struct vnode *vp)
 {
 	int old_lock;
 
 	ASSERT_VOP_LOCKED(vp, "ncl_upgrade_vnlock");
 	old_lock = NFSVOPISLOCKED(vp);
 	if (old_lock == LK_EXCLUSIVE)
 		return (old_lock);
 
 	KASSERT(old_lock == LK_SHARED,
 	    ("ncl_upgrade_vnlock: wrong old_lock %d", old_lock));
 	/* Upgrade to exclusive lock, this might block */
 	NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY);
 	return (old_lock);
 }
 
 /*
  * Undo ncl_upgrade_vnlock(): if the lock state passed in was not
  * exclusive, downgrade the vnode lock again.
  */
 void
 ncl_downgrade_vnlock(struct vnode *vp, int old_lock)
 {
 	if (old_lock == LK_EXCLUSIVE)
 		return;
 
 	KASSERT(old_lock == LK_SHARED, ("wrong old_lock %d", old_lock));
 	/* Downgrade from exclusive lock. */
 	NFSVOPLOCK(vp, LK_DOWNGRADE | LK_RETRY);
 }
 
 /*
  * printf(9)-style console output; the vprintf() call is made with the
  * Giant mutex held.
  */
 void
 ncl_printf(const char *fmt, ...)
 {
 	va_list ap;
 
 	mtx_lock(&Giant);
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	mtx_unlock(&Giant);
 }
 
 /*
  * Optional attribute cache debugging: vfs.nfs.acdebug selects how much
  * ncl_getattrcache() prints (it tests the value for >0, >1 and >2).
  */
 #ifdef NFS_ACDEBUG
 #include <sys/sysctl.h>
 SYSCTL_DECL(_vfs_nfs);
 static int nfs_acdebug;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, acdebug, CTLFLAG_RW, &nfs_acdebug, 0, "");
 #endif
 
 /*
  * Check the time stamp of the cached attributes of vp.
  * If the cache is valid, copy its contents to *vaper and return 0,
  * otherwise return ENOENT.
  *
  * The timeout is one tenth of the time since the cached n_mtime,
  * clamped to the mount's acdirmin/acdirmax for directories and
  * acregmin/acregmax for everything else; a node with NMODIFIED set
  * always gets the minimum.
  */
 int
 ncl_getattrcache(struct vnode *vp, struct vattr *vaper)
 {
 	struct nfsnode *np;
 	struct vattr *vap;
 	struct nfsmount *nmp;
 	int timeo, mustflush;
 	
 	np = VTONFS(vp);
 	vap = &np->n_vattr.na_vattr;
 	nmp = VFSTONFS(vp->v_mount);
 	mustflush = nfscl_mustflush(vp);	/* must be before mtx_lock() */
 #ifdef NFS_ACDEBUG
 	mtx_lock(&Giant);	/* ncl_printf() */
 #endif
 	mtx_lock(&np->n_mtx);
 	/* XXX n_mtime doesn't seem to be updated on a miss-and-reload */
 	timeo = (time_second - np->n_mtime.tv_sec) / 10;
 
 #ifdef NFS_ACDEBUG
 	if (nfs_acdebug>1)
 		ncl_printf("nfs_getattrcache: initial timeo = %d\n", timeo);
 #endif
 
 	/* Clamp the timeout to the per-mount limits for this file type. */
 	if (vap->va_type == VDIR) {
 		if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acdirmin)
 			timeo = nmp->nm_acdirmin;
 		else if (timeo > nmp->nm_acdirmax)
 			timeo = nmp->nm_acdirmax;
 	} else {
 		if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acregmin)
 			timeo = nmp->nm_acregmin;
 		else if (timeo > nmp->nm_acregmax)
 			timeo = nmp->nm_acregmax;
 	}
 
 #ifdef NFS_ACDEBUG
 	if (nfs_acdebug > 2)
 		ncl_printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n",
 			   nmp->nm_acregmin, nmp->nm_acregmax,
 			   nmp->nm_acdirmin, nmp->nm_acdirmax);
 
 	if (nfs_acdebug)
 		ncl_printf("nfs_getattrcache: age = %d; final timeo = %d\n",
 			   (time_second - np->n_attrstamp), timeo);
 #endif
 
 	/*
 	 * Miss only if the attributes have aged past the timeout AND
 	 * either nfscl_mustflush() asked for a flush or the cache was
 	 * never stamped (n_attrstamp == 0).  An expired but stamped
 	 * cache is still used when mustflush is 0.
 	 */
 	if ((time_second - np->n_attrstamp) >= timeo &&
 	    (mustflush != 0 || np->n_attrstamp == 0)) {
 		newnfsstats.attrcache_misses++;
 		mtx_unlock(&np->n_mtx);
 #ifdef NFS_ACDEBUG
 		mtx_unlock(&Giant);	/* ncl_printf() */
 #endif
 		KDTRACE_NFS_ATTRCACHE_GET_MISS(vp);
 		return( ENOENT);
 	}
 	newnfsstats.attrcache_hits++;
 	/*
 	 * Reconcile the cached size with n_size.  For a locally modified
 	 * regular file the larger of the two wins; in every other case
 	 * the cached attribute size is taken.
 	 */
 	if (vap->va_size != np->n_size) {
 		if (vap->va_type == VREG) {
 			if (np->n_flag & NMODIFIED) {
 				if (vap->va_size < np->n_size)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
 			} else {
 				np->n_size = vap->va_size;
 			}
 			vnode_pager_setsize(vp, np->n_size);
 		} else {
 			np->n_size = vap->va_size;
 		}
 	}
 	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
 	/* Overlay the node's own access/update times when flagged. */
 	if (np->n_flag & NCHG) {
 		if (np->n_flag & NACC)
 			vaper->va_atime = np->n_atim;
 		if (np->n_flag & NUPD)
 			vaper->va_mtime = np->n_mtim;
 	}
 	mtx_unlock(&np->n_mtx);
 #ifdef NFS_ACDEBUG
 	mtx_unlock(&Giant);	/* ncl_printf() */
 #endif
 	KDTRACE_NFS_ATTRCACHE_GET_HIT(vp, vap);
 	return (0);
 }
 
 /* Cookie handed out for offsets in the first directory block (or < 0). */
 static nfsuint64 nfs_nullcookie = { { 0, 0 } };
 /*
  * This function finds the directory cookie that corresponds to the
  * logical byte offset given.
  *
  * The cookies live in a list of struct nfsdmap hung off np->n_cookies,
  * each holding NFSNUMCOOKIES entries, indexed by directory block number
  * (off / NFS_DIRBLKSIZ) minus one.  With "add" set, missing list
  * elements are allocated and the slot is marked in use; without it,
  * NULL is returned when no cookie has been recorded for the offset.
  */
 nfsuint64 *
 ncl_getcookie(struct nfsnode *np, off_t off, int add)
 {
 	struct nfsdmap *dp, *dp2;
 	int pos;
 	nfsuint64 *retval = NULL;
 	
 	pos = (uoff_t)off / NFS_DIRBLKSIZ;
 	if (pos == 0 || off < 0) {
 		KASSERT(!add, ("nfs getcookie add at <= 0"));
 		return (&nfs_nullcookie);
 	}
 	/* Block 1 is stored in slot 0. */
 	pos--;
 	dp = LIST_FIRST(&np->n_cookies);
 	if (!dp) {
 		if (add) {
 			MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap),
 				M_NFSDIROFF, M_WAITOK);
 			dp->ndm_eocookie = 0;
 			LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list);
 		} else
 			goto out;
 	}
 	/* Step through the list until pos falls inside the current map. */
 	while (pos >= NFSNUMCOOKIES) {
 		pos -= NFSNUMCOOKIES;
 		if (LIST_NEXT(dp, ndm_list)) {
 			/*
 			 * On a lookup, give up when the current map is not
 			 * full and the (already reduced) pos is at or past
 			 * its end-of-cookies mark.
 			 */
 			if (!add && dp->ndm_eocookie < NFSNUMCOOKIES &&
 			    pos >= dp->ndm_eocookie)
 				goto out;
 			dp = LIST_NEXT(dp, ndm_list);
 		} else if (add) {
 			MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap),
 				M_NFSDIROFF, M_WAITOK);
 			dp2->ndm_eocookie = 0;
 			LIST_INSERT_AFTER(dp, dp2, ndm_list);
 			dp = dp2;
 		} else
 			goto out;
 	}
 	/* ndm_eocookie is one past the highest slot in use in this map. */
 	if (pos >= dp->ndm_eocookie) {
 		if (add)
 			dp->ndm_eocookie = pos + 1;
 		else
 			goto out;
 	}
 	retval = &dp->ndm_cookies[pos];
 out:
 	return (retval);
 }
 
 /*
  * Invalidate cached directory information, except for the actual directory
  * blocks (which are invalidated separately).
  * Done mainly to avoid the use of stale offset cookies.
  */
 void
 ncl_invaldir(struct vnode *vp)
 {
 	struct nfsnode *np = VTONFS(vp);
 
 	KASSERT(vp->v_type == VDIR, ("nfs: invaldir not dir"));
 	ncl_dircookie_lock(np);
 	np->n_direofoffset = 0;
 	np->n_cookieverf.nfsuquad[0] = 0;
 	np->n_cookieverf.nfsuquad[1] = 0;
 	if (LIST_FIRST(&np->n_cookies))
 		LIST_FIRST(&np->n_cookies)->ndm_eocookie = 0;
 	ncl_dircookie_unlock(np);
 }
 
 /*
  * The write verifier has changed (probably due to a server reboot), so all
  * B_NEEDCOMMIT blocks will have to be written again. Since they are on the
  * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
  * and B_CLUSTEROK flags.  Once done the new write verifier can be set for the
  * mount point.
  *
  * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data
  * writes are not clusterable.
  *
  * Walks every vnode of the mount and, with the buffer object locked,
  * every buffer on its dirty list.
  */
 void
 ncl_clearcommit(struct mount *mp)
 {
 	struct vnode *vp, *nvp;
 	struct buf *bp, *nbp;
 	struct bufobj *bo;
 
 	MNT_VNODE_FOREACH_ALL(vp, mp, nvp) {
 		bo = &vp->v_bufobj;
 		/*
 		 * Hold the vnode, then drop its interlock; presumably the
 		 * iterator hands each vnode over with the interlock held
 		 * -- confirm against MNT_VNODE_FOREACH_ALL.
 		 */
 		vholdl(vp);
 		VI_UNLOCK(vp);
 		BO_LOCK(bo);
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			/* Skip locked buffers; need both flags set. */
 			if (!BUF_ISLOCKED(bp) &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
 				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 		BO_UNLOCK(bo);
 		vdrop(vp);
 	}
 }
 
 /*
  * Filesystem init hook, called once: reset the async daemon tables,
  * set up the nfsiod creation task and initialize the nfsnode table.
  * Always returns 0.
  */
 int
 ncl_init(struct vfsconf *vfsp)
 {
 	int iod;
 
 	/* Start out with every async daemon slot disabled and unbound. */
 	for (iod = 0; iod < NFS_MAXASYNCDAEMON; iod++) {
 		ncl_iodmount[iod] = NULL;
 		ncl_iodwant[iod] = NFSIOD_NOT_AVAILABLE;
 	}
 	TASK_INIT(&ncl_nfsiodnew_task, 0, ncl_nfsiodnew_tq, NULL);
 	/* Init the nfsnode table */
 	ncl_nhinit();
 
 	return (0);
 }
 
Index: head/sys/fs/nfsclient/nfs_clvnops.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clvnops.c	(revision 258540)
+++ head/sys/fs/nfsclient/nfs_clvnops.c	(revision 258541)
@@ -1,3528 +1,3527 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from nfs_vnops.c	8.16 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * vnode op calls for Sun NFS version 2, 3 and 4
  */
 
-#include "opt_kdtrace.h"
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/resourcevar.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/namei.h>
 #include <sys/socket.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/lockf.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/signalvar.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
 #include <fs/nfs/nfsport.h>
 #include <fs/nfsclient/nfsnode.h>
 #include <fs/nfsclient/nfsmount.h>
 #include <fs/nfsclient/nfs.h>
 #include <fs/nfsclient/nfs_kdtrace.h>
 
 #include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 
 #include <nfs/nfs_lock.h>
 
 /*
  * DTrace hooks for the access cache: one probe function pointer and one
  * probe id per event (flush done, get hit, get miss, load done).
  */
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 dtrace_nfsclient_accesscache_flush_probe_func_t
 		dtrace_nfscl_accesscache_flush_done_probe;
 uint32_t	nfscl_accesscache_flush_done_id;
 
 dtrace_nfsclient_accesscache_get_probe_func_t
 		dtrace_nfscl_accesscache_get_hit_probe,
 		dtrace_nfscl_accesscache_get_miss_probe;
 uint32_t	nfscl_accesscache_get_hit_id;
 uint32_t	nfscl_accesscache_get_miss_id;
 
 dtrace_nfsclient_accesscache_load_probe_func_t
 		dtrace_nfscl_accesscache_load_done_probe;
 uint32_t	nfscl_accesscache_load_done_id;
 #endif /* KDTRACE_HOOKS */
 
 /* Defs */
 #define	TRUE	1
 #define	FALSE	0
 
 extern struct nfsstats newnfsstats;
 extern int nfsrv_useacl;
 extern int nfscl_debuglevel;
 MALLOC_DECLARE(M_NEWNFSREQ);
 
 /*
  * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these
  * calls are not in getblk() and brelse() so that they would not be necessary
  * here.
  */
 #ifndef B_VMIO
 #define	vfs_busy_pages(bp, f)
 #endif
 
 /*
  * Forward declarations of the file-local vnode operations and helpers
  * that the vop vectors below refer to.
  */
 static vop_read_t	nfsfifo_read;
 static vop_write_t	nfsfifo_write;
 static vop_close_t	nfsfifo_close;
 static int	nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *,
 		    struct thread *);
 static vop_lookup_t	nfs_lookup;
 static vop_create_t	nfs_create;
 static vop_mknod_t	nfs_mknod;
 static vop_open_t	nfs_open;
 static vop_pathconf_t	nfs_pathconf;
 static vop_close_t	nfs_close;
 static vop_access_t	nfs_access;
 static vop_getattr_t	nfs_getattr;
 static vop_setattr_t	nfs_setattr;
 static vop_read_t	nfs_read;
 static vop_fsync_t	nfs_fsync;
 static vop_remove_t	nfs_remove;
 static vop_link_t	nfs_link;
 static vop_rename_t	nfs_rename;
 static vop_mkdir_t	nfs_mkdir;
 static vop_rmdir_t	nfs_rmdir;
 static vop_symlink_t	nfs_symlink;
 static vop_readdir_t	nfs_readdir;
 static vop_strategy_t	nfs_strategy;
 static vop_lock1_t	nfs_lock1;
 static	int	nfs_lookitup(struct vnode *, char *, int,
 		    struct ucred *, struct thread *, struct nfsnode **);
 static	int	nfs_sillyrename(struct vnode *, struct vnode *,
 		    struct componentname *);
 static vop_access_t	nfsspec_access;
 static vop_readlink_t	nfs_readlink;
 static vop_print_t	nfs_print;
 static vop_advlock_t	nfs_advlock;
 static vop_advlockasync_t nfs_advlockasync;
 static vop_getacl_t nfs_getacl;
 static vop_setacl_t nfs_setacl;
 /*
  * Global vfs data structures for nfs
  *
  * newnfs_vnodeops is the vnode operations vector for NFS vnodes; any
  * operation not listed falls back to default_vnodeops.
  */
 struct vop_vector newnfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		nfs_access,
 	.vop_advlock =		nfs_advlock,
 	.vop_advlockasync =	nfs_advlockasync,
 	.vop_close =		nfs_close,
 	.vop_create =		nfs_create,
 	.vop_fsync =		nfs_fsync,
 	.vop_getattr =		nfs_getattr,
 	.vop_getpages =		ncl_getpages,
 	.vop_putpages =		ncl_putpages,
 	.vop_inactive =		ncl_inactive,
 	.vop_link =		nfs_link,
 	.vop_lock1 = 		nfs_lock1,
 	.vop_lookup =		nfs_lookup,
 	.vop_mkdir =		nfs_mkdir,
 	.vop_mknod =		nfs_mknod,
 	.vop_open =		nfs_open,
 	.vop_pathconf =		nfs_pathconf,
 	.vop_print =		nfs_print,
 	.vop_read =		nfs_read,
 	.vop_readdir =		nfs_readdir,
 	.vop_readlink =		nfs_readlink,
 	.vop_reclaim =		ncl_reclaim,
 	.vop_remove =		nfs_remove,
 	.vop_rename =		nfs_rename,
 	.vop_rmdir =		nfs_rmdir,
 	.vop_setattr =		nfs_setattr,
 	.vop_strategy =		nfs_strategy,
 	.vop_symlink =		nfs_symlink,
 	.vop_write =		ncl_write,
 	.vop_getacl =		nfs_getacl,
 	.vop_setacl =		nfs_setacl,
 };
 
 /*
  * Vnode operations vector for fifos that live on an NFS mount; anything
  * not listed falls back to fifo_specops.
  */
 struct vop_vector newnfs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		nfsspec_access,
 	.vop_close =		nfsfifo_close,
 	.vop_fsync =		nfs_fsync,
 	.vop_getattr =		nfs_getattr,
 	.vop_inactive =		ncl_inactive,
 	.vop_print =		nfs_print,
 	.vop_read =		nfsfifo_read,
 	.vop_reclaim =		ncl_reclaim,
 	.vop_setattr =		nfs_setattr,
 	.vop_write =		nfsfifo_write,
 };
 
 /* Forward declarations of the file-local RPC helper routines. */
 static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp,
     struct componentname *cnp, struct vattr *vap);
 static int nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name,
     int namelen, struct ucred *cred, struct thread *td);
 static int nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp,
     char *fnameptr, int fnamelen, struct vnode *tdvp, struct vnode *tvp,
     char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td);
 static int nfs_renameit(struct vnode *sdvp, struct vnode *svp,
     struct componentname *scnp, struct sillyrename *sp);
 
 /*
  * Global variables
  */
 /* Size of a struct dirent without its name buffer (MAXNAMLEN + 1). */
 #define	DIRHDSIZ	(sizeof (struct dirent) - (MAXNAMLEN + 1))
 
 SYSCTL_DECL(_vfs_nfs);
 
 /*
  * How long a cached ACCESS result is honoured; a value <= 0 also stops
  * nfs_access() from requesting all access bits at once.
  */
 static int	nfsaccess_cache_timeout = NFS_MAXATTRTIMO;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW,
 	   &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout");
 
 /* Off by default. */
 static int	nfs_prime_access_cache = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, prime_access_cache, CTLFLAG_RW,
 	   &nfs_prime_access_cache, 0,
 	   "Prime NFS ACCESS cache when fetching attributes");
 
 /* Off by default: close only writes, without a commit. */
 static int	newnfs_commit_on_close = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, commit_on_close, CTLFLAG_RW,
     &newnfs_commit_on_close, 0, "write+commit on close, else only write");
 
 /* On by default. */
 static int	nfs_clean_pages_on_close = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
 	   &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");
 
 /* Off by default. */
 int newnfs_directio_enable = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
 	   &newnfs_directio_enable, 0, "Enable NFS directio");
 
 /* No initializer, so this starts out as 0 (off). */
 int nfs_keep_dirty_on_error;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_keep_dirty_on_error, CTLFLAG_RW,
     &nfs_keep_dirty_on_error, 0, "Retry pageout if error returned");
 
 /*
  * This sysctl allows other processes to mmap a file that has been opened
  * O_DIRECT by a process.  In general, having processes mmap the file while
  * Direct IO is in progress can lead to Data Inconsistencies.  But, we allow
  * this by default to prevent DoS attacks - to prevent a malicious user from
  * opening up files O_DIRECT preventing other users from mmap'ing these
  * files.  "Protected" environments where stricter consistency guarantees are
  * required can disable this knob.  The process that opened the file O_DIRECT
  * cannot mmap() the file, because mmap'ed IO on an O_DIRECT open() is not
  * meaningful.
  */
 int newnfs_directio_allow_mmap = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
 	   &newnfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens");
 
 /* Disabled: read-only sysctls for the access cache hit/miss counters. */
 #if 0
 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD,
 	   &newnfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count");
 
 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_misses, CTLFLAG_RD,
 	   &newnfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count");
 #endif
 
 /* All six NFS ACCESS bits. */
 #define	NFSACCESS_ALL (NFSACCESS_READ | NFSACCESS_MODIFY		\
 			 | NFSACCESS_EXTEND | NFSACCESS_EXECUTE	\
 			 | NFSACCESS_DELETE | NFSACCESS_LOOKUP)
 
 /*
  * SMP Locking Note :
  * The list of locks after the description of the lock is the ordering
  * of other locks acquired with the lock held.
  * np->n_mtx : Protects the fields in the nfsnode.
  *     VM Object Lock
  *     VI_MTX (acquired indirectly)
  * nmp->nm_mtx : Protects the fields in the nfsmount.
  *     rep->r_mtx
  * ncl_iod_mutex : Global lock, protects shared nfsiod state.
  * nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
  *     nmp->nm_mtx
  *     rep->r_mtx
  * rep->r_mtx : Protects the fields in an nfsreq.
  */
 
 /*
  * Do an ACCESS RPC "over the wire" for vp, asking about the access
  * bits in wmode, and record the answer in the node's per-uid access
  * cache.
  *
  * On success the bits granted are stored through retmode (when it is
  * not NULL) and 0 is returned.  On failure the error is returned,
  * after mapping it with nfscl_maperr() for NFSv4 mounts.
  */
 static int
 nfs34_access_otw(struct vnode *vp, int wmode, struct thread *td,
     struct ucred *cred, u_int32_t *retmode)
 {
 	int error = 0, attrflag, i, lrupos;
 	u_int32_t rmode;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsvattr nfsva;
 
 	error = nfsrpc_accessrpc(vp, wmode, cred, td, &nfsva, &attrflag,
 	    &rmode, NULL);
 	/* Attributes returned with the reply go into the attribute cache. */
 	if (attrflag)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 	if (!error) {
 		lrupos = 0;
 		mtx_lock(&np->n_mtx);
 		/*
 		 * Refresh the entry for this uid if there is one, keeping
 		 * track of the entry with the oldest stamp on the way.
 		 */
 		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
 			if (np->n_accesscache[i].uid == cred->cr_uid) {
 				np->n_accesscache[i].mode = rmode;
 				np->n_accesscache[i].stamp = time_second;
 				break;
 			}
 			if (i > 0 && np->n_accesscache[i].stamp <
 			    np->n_accesscache[lrupos].stamp)
 				lrupos = i;
 		}
 		/* No entry for this uid: reuse the oldest one. */
 		if (i == NFS_ACCESSCACHESIZE) {
 			np->n_accesscache[lrupos].uid = cred->cr_uid;
 			np->n_accesscache[lrupos].mode = rmode;
 			np->n_accesscache[lrupos].stamp = time_second;
 		}
 		mtx_unlock(&np->n_mtx);
 		if (retmode != NULL)
 			*retmode = rmode;
 		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, rmode, 0);
 	} else if (NFS_ISV4(vp)) {
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	}
 #ifdef KDTRACE_HOOKS
 	/* On failure, report the (possibly mapped) error to the probe. */
 	if (error != 0)
 		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, 0,
 		    error);
 #endif
 	return (error);
 }
 
 /*
  * nfs access vnode op.
  * Write-type requests on a read-only mount fail with EROFS for regular
  * files, directories and symlinks before anything else is checked.
  * For nfs version 2, nfsspec_access() is consulted and, for a uid 0 read
  * request on an object with a non-zero cached size, a small read-type rpc
  * is issued as a probe.
  * For nfs version 3 or 4, the per-nfsnode access cache is checked first and
  * the access rpc is used on a miss. If file modes are changed on the server,
  * accesses might still fail later.
  */
 static int
 nfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int error = 0, i, gotahit;
 	u_int32_t mode, wmode, rmode;
 	int v34 = NFS_ISV34(vp);
 	struct nfsnode *np = VTONFS(vp);
 
 	/*
 	 * Disallow write attempts on filesystems mounted read-only;
 	 * unless the file is a socket, fifo, or a block or character
 	 * device resident on the filesystem.
 	 */
 	if ((ap->a_accmode & (VWRITE | VAPPEND | VWRITE_NAMED_ATTRS |
 	    VDELETE_CHILD | VWRITE_ATTRIBUTES | VDELETE | VWRITE_ACL |
 	    VWRITE_OWNER)) != 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 	/*
 	 * For nfs v3 or v4, check to see if we have done this recently, and if
 	 * so return our cached result instead of making an ACCESS call.
 	 * If not, do an access rpc, otherwise you are stuck emulating
 	 * ufs_access() locally using the vattr. This may not be correct,
 	 * since the server may apply other access criteria such as
 	 * client uid-->server uid mapping that we do not know about.
 	 */
 	if (v34) {
 		/*
 		 * Translate the requested accmode bits into the NFSACCESS_*
 		 * mask. VEXEC maps to EXECUTE for non-directories and to
 		 * LOOKUP for directories; VDELETE_CHILD is only translated
 		 * for directories.
 		 */
 		if (ap->a_accmode & VREAD)
 			mode = NFSACCESS_READ;
 		else
 			mode = 0;
 		if (vp->v_type != VDIR) {
 			if (ap->a_accmode & VWRITE)
 				mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND);
 			if (ap->a_accmode & VAPPEND)
 				mode |= NFSACCESS_EXTEND;
 			if (ap->a_accmode & VEXEC)
 				mode |= NFSACCESS_EXECUTE;
 			if (ap->a_accmode & VDELETE)
 				mode |= NFSACCESS_DELETE;
 		} else {
 			if (ap->a_accmode & VWRITE)
 				mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND);
 			if (ap->a_accmode & VAPPEND)
 				mode |= NFSACCESS_EXTEND;
 			if (ap->a_accmode & VEXEC)
 				mode |= NFSACCESS_LOOKUP;
 			if (ap->a_accmode & VDELETE)
 				mode |= NFSACCESS_DELETE;
 			if (ap->a_accmode & VDELETE_CHILD)
 				mode |= NFSACCESS_MODIFY;
 		}
 		/* XXX safety belt, only make blanket request if caching */
 		if (nfsaccess_cache_timeout > 0) {
 			wmode = NFSACCESS_READ | NFSACCESS_MODIFY |
 				NFSACCESS_EXTEND | NFSACCESS_EXECUTE |
 				NFSACCESS_DELETE | NFSACCESS_LOOKUP;
 		} else {
 			wmode = mode;
 		}
 
 		/*
 		 * Does our cached result allow us to give a definite yes to
 		 * this request?
 		 * The scan stops at the first entry whose uid matches, even
 		 * when that entry is expired or lacks some requested bit; in
 		 * that case gotahit stays 0 and the rpc below is done.
 		 */
 		gotahit = 0;
 		mtx_lock(&np->n_mtx);
 		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
 			if (ap->a_cred->cr_uid == np->n_accesscache[i].uid) {
 			    if (time_second < (np->n_accesscache[i].stamp
 				+ nfsaccess_cache_timeout) &&
 				(np->n_accesscache[i].mode & mode) == mode) {
 				NFSINCRGLOBAL(newnfsstats.accesscache_hits);
 				gotahit = 1;
 			    }
 			    break;
 			}
 		}
 		mtx_unlock(&np->n_mtx);
 #ifdef KDTRACE_HOOKS
 		if (gotahit != 0)
 			KDTRACE_NFS_ACCESSCACHE_GET_HIT(vp,
 			    ap->a_cred->cr_uid, mode);
 		else
 			KDTRACE_NFS_ACCESSCACHE_GET_MISS(vp,
 			    ap->a_cred->cr_uid, mode);
 #endif
 		if (gotahit == 0) {
 			/*
 			 * Either a no, or a don't know.  Go to the wire.
 			 * The request uses wmode (possibly the blanket mask),
 			 * but the reply is judged against the bits actually
 			 * asked for in mode.
 			 */
 			NFSINCRGLOBAL(newnfsstats.accesscache_misses);
 		        error = nfs34_access_otw(vp, wmode, ap->a_td,
 			    ap->a_cred, &rmode);
 			if (!error &&
 			    (rmode & mode) != mode)
 				error = EACCES;
 		}
 		return (error);
 	} else {
 		if ((error = nfsspec_access(ap)) != 0) {
 			return (error);
 		}
 		/*
 		 * Attempt to prevent a mapped root from accessing a file
 		 * which it shouldn't.  We try to read a byte from the file
 		 * if the user is root and the file is not zero length.
 		 * After calling nfsspec_access, we should have the correct
 		 * file size cached.
 		 */
 		mtx_lock(&np->n_mtx);
 		if (ap->a_cred->cr_uid == 0 && (ap->a_accmode & VREAD)
 		    && VTONFS(vp)->n_size > 0) {
 			struct iovec aiov;
 			struct uio auio;
 			char buf[1];
 
 			/* Drop the node mutex before issuing any rpc. */
 			mtx_unlock(&np->n_mtx);
 			aiov.iov_base = buf;
 			aiov.iov_len = 1;
 			auio.uio_iov = &aiov;
 			auio.uio_iovcnt = 1;
 			auio.uio_offset = 0;
 			auio.uio_resid = 1;
 			auio.uio_segflg = UIO_SYSSPACE;
 			auio.uio_rw = UIO_READ;
 			auio.uio_td = ap->a_td;
 
 			/*
 			 * Pick the probe rpc by vnode type. The directory
 			 * case swaps in a temporary NFS_DIRBLKSIZ buffer,
 			 * which is freed right after the rpc. Any other
 			 * type is refused with EACCES.
 			 */
 			if (vp->v_type == VREG)
 				error = ncl_readrpc(vp, &auio, ap->a_cred);
 			else if (vp->v_type == VDIR) {
 				char* bp;
 				bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
 				aiov.iov_base = bp;
 				aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
 				error = ncl_readdirrpc(vp, &auio, ap->a_cred,
 				    ap->a_td);
 				free(bp, M_TEMP);
 			} else if (vp->v_type == VLNK)
 				error = ncl_readlinkrpc(vp, &auio, ap->a_cred);
 			else
 				error = EACCES;
 		} else
 			mtx_unlock(&np->n_mtx);
 		return (error);
 	}
 }
 
 
 /*
  * nfs open vnode op
  * Check to see if the type is ok
  * and that deletion is not in progress.
  * For paged in text files, you will need to flush the page cache
  * if consistency is lost.
  *
  * Returns EOPNOTSUPP for vnode types other than VREG, VDIR and VLNK.
  * For NFSv4 an Open is done first; every error return after that point
  * undoes it with nfsrpc_close() before returning.
  */
 /* ARGSUSED */
 static int
 nfs_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr vattr;
 	int error;
 	int fmode = ap->a_mode;
 	struct ucred *cred;
 
 	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
 		return (EOPNOTSUPP);
 
 	/*
 	 * For NFSv4, we need to do the Open Op before cache validation,
 	 * so that we conform to RFC3530 Sec. 9.3.1.
 	 */
 	if (NFS_ISV4(vp)) {
 		error = nfsrpc_open(vp, fmode, ap->a_cred, ap->a_td);
 		if (error) {
 			error = nfscl_maperr(ap->a_td, error, (uid_t)0,
 			    (gid_t)0);
 			return (error);
 		}
 	}
 
 	/*
 	 * Now, if this Open will be doing reading, re-validate/flush the
 	 * cache, so that Close/Open coherency is maintained.
 	 * n_mtx is dropped around ncl_vinvalbuf() and VOP_GETATTR() and
 	 * retaken afterwards; both arms below leave it held on exit.
 	 */
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NMODIFIED) {
 		/*
 		 * Locally modified: flush/invalidate the buffers, zap the
 		 * attribute cache and record fresh mtime/change values.
 		 */
 		mtx_unlock(&np->n_mtx);
 		error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		if (error == EINTR || error == EIO) {
 			if (NFS_ISV4(vp))
 				(void) nfsrpc_close(vp, 0, ap->a_td);
 			return (error);
 		}
 		mtx_lock(&np->n_mtx);
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 		if (vp->v_type == VDIR)
 			np->n_direofoffset = 0;
 		mtx_unlock(&np->n_mtx);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 		if (error) {
 			if (NFS_ISV4(vp))
 				(void) nfsrpc_close(vp, 0, ap->a_td);
 			return (error);
 		}
 		mtx_lock(&np->n_mtx);
 		np->n_mtime = vattr.va_mtime;
 		if (NFS_ISV4(vp))
 			np->n_change = vattr.va_filerev;
 	} else {
 		/*
 		 * Not locally modified: only invalidate the buffers when the
 		 * fetched attributes differ from the recorded mtime (or, for
 		 * NFSv4, the recorded change value).
 		 */
 		mtx_unlock(&np->n_mtx);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 		if (error) {
 			if (NFS_ISV4(vp))
 				(void) nfsrpc_close(vp, 0, ap->a_td);
 			return (error);
 		}
 		mtx_lock(&np->n_mtx);
 		if ((NFS_ISV4(vp) && np->n_change != vattr.va_filerev) ||
 		    NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 			if (vp->v_type == VDIR)
 				np->n_direofoffset = 0;
 			mtx_unlock(&np->n_mtx);
 			error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 			if (error == EINTR || error == EIO) {
 				if (NFS_ISV4(vp))
 					(void) nfsrpc_close(vp, 0, ap->a_td);
 				return (error);
 			}
 			mtx_lock(&np->n_mtx);
 			np->n_mtime = vattr.va_mtime;
 			if (NFS_ISV4(vp))
 				np->n_change = vattr.va_filerev;
 		}
 	}
 
 	/*
 	 * If the object has >= 1 O_DIRECT active opens, we disable caching.
 	 * The first O_DIRECT open flushes the buffers (with n_mtx dropped)
 	 * and sets NNONCACHE; every O_DIRECT open bumps the counter.
 	 */
 	if (newnfs_directio_enable && (fmode & O_DIRECT) &&
 	    (vp->v_type == VREG)) {
 		if (np->n_directio_opens == 0) {
 			mtx_unlock(&np->n_mtx);
 			error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 			if (error) {
 				if (NFS_ISV4(vp))
 					(void) nfsrpc_close(vp, 0, ap->a_td);
 				return (error);
 			}
 			mtx_lock(&np->n_mtx);
 			np->n_flag |= NNONCACHE;
 		}
 		np->n_directio_opens++;
 	}
 
 	/* If opened for writing via NFSv4.1 or later, mark that for pNFS. */
 	if (NFSHASPNFS(VFSTONFS(vp->v_mount)) && (fmode & FWRITE) != 0)
 		np->n_flag |= NWRITEOPENED;
 
 	/*
 	 * If this is an open for writing, capture a reference to the
 	 * credentials, so they can be used by ncl_putpages(). Using
 	 * these write credentials is preferable to the credentials of
 	 * whatever thread happens to be doing the VOP_PUTPAGES() since
 	 * the write RPCs are less likely to fail with EACCES.
 	 * The previously saved credential (if any) is remembered in cred
 	 * and released only after n_mtx has been dropped.
 	 */
 	if ((fmode & FWRITE) != 0) {
 		cred = np->n_writecred;
 		np->n_writecred = crhold(ap->a_cred);
 	} else
 		cred = NULL;
 	mtx_unlock(&np->n_mtx);
 
 	if (cred != NULL)
 		crfree(cred);
 	vnode_create_vobject(vp, vattr.va_size, ap->a_td);
 	return (0);
 }
 
 /*
  * nfs close vnode op
  * What an NFS client should do upon close after writing is a debatable issue.
  * Most NFS clients push delayed writes to the server upon close, basically for
  * two reasons:
  * 1 - So that any write errors may be reported back to the client process
  *     doing the close system call. By far the two most likely errors are
  *     NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
  * 2 - To put a worst case upper bound on cache inconsistency between
  *     multiple clients for the file.
  * There is also a consistency problem for Version 2 of the protocol w.r.t.
  * not being able to tell if other clients are writing a file concurrently,
  * since there is no way of knowing if the changed modify time in the reply
  * is only due to the write for this client.
  * (NFS Version 3 provides weak cache consistency data in the reply that
  *  should be sufficient to detect and handle this case.)
  *
  * The current code does the following:
  * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
  * for NFS Version 3 - flush dirty buffers to the server but don't invalidate
  *                     or commit them (this satisfies 1 and 2 except for the
  *                     case where the server crashes after this close but
  *                     before the commit RPC, which is felt to be "good
  *                     enough". Changing the last argument to ncl_flush() to
  *                     a 1 would force a commit operation, if it is felt a
  *                     commit is necessary now.
  * for NFS Version 4 - flush the dirty buffers and commit them, if
  *		       nfscl_mustflush() says this is necessary.
  *                     It is necessary if there is no write delegation held,
  *                     in order to satisfy open/close coherency.
  *                     If the file isn't cached on local stable storage,
  *                     it may be necessary in order to detect "out of space"
  *                     errors from the server, if the write delegation
  *                     issued by the server doesn't allow the file to grow.
  */
 /* ARGSUSED */
 static int
 nfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsvattr nfsva;
 	struct ucred *cred;
 	int error = 0, ret, localcred = 0;
 	int fmode = ap->a_fflag;
 
 	/* Nothing is done when the mount has MNTK_UNMOUNTF set. */
 	if ((vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF))
 		return (0);
 	/*
 	 * During shutdown, a_cred isn't valid, so just use root.
 	 * localcred records that cred was obtained here and must be
 	 * released with NFSFREECRED() before returning.
 	 */
 	if (ap->a_cred == NOCRED) {
 		cred = newnfs_getcred();
 		localcred = 1;
 	} else {
 		cred = ap->a_cred;
 	}
 	if (vp->v_type == VREG) {
 	    /*
 	     * Examine and clean dirty pages, regardless of NMODIFIED.
 	     * This closes a major hole in close-to-open consistency.
 	     * We want to push out all dirty pages (and buffers) on
 	     * close, regardless of whether they were dirtied by
 	     * mmap'ed writes or via write().
 	     */
 	    if (nfs_clean_pages_on_close && vp->v_object) {
 		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(vp->v_object);
 	    }
 	    mtx_lock(&np->n_mtx);
 	    if (np->n_flag & NMODIFIED) {
 		/* n_mtx is dropped for the flush and retaken afterwards. */
 		mtx_unlock(&np->n_mtx);
 		if (NFS_ISV3(vp)) {
 		    /*
 		     * Under NFSv3 we have dirty buffers to dispose of.  We
 		     * must flush them to the NFS server.  We have the option
 		     * of waiting all the way through the commit rpc or just
 		     * waiting for the initial write.  The default is to only
 		     * wait through the initial write so the data is in the
 		     * server's cache, which is roughly similar to the state
 		     * a standard disk subsystem leaves the file in on close().
 		     *
 		     * We cannot clear the NMODIFIED bit in np->n_flag due to
 		     * potential races with other processes, and certainly
 		     * cannot clear it if we don't commit.
 		     * These races occur when there is no longer the old
 		     * traditional vnode locking implemented for Vnode Ops.
 		     */
 		    int cm = newnfs_commit_on_close ? 1 : 0;
 		    error = ncl_flush(vp, MNT_WAIT, cred, ap->a_td, cm, 0);
 		    /* np->n_flag &= ~NMODIFIED; */
 		} else if (NFS_ISV4(vp)) { 
 			if (nfscl_mustflush(vp) != 0) {
 				int cm = newnfs_commit_on_close ? 1 : 0;
 				error = ncl_flush(vp, MNT_WAIT, cred, ap->a_td,
 				    cm, 0);
 				/*
 				 * as above w.r.t races when clearing
 				 * NMODIFIED.
 				 * np->n_flag &= ~NMODIFIED;
 				 */
 			}
 		} else
 		    error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		mtx_lock(&np->n_mtx);
 	    }
 	    /*
 	     * Invalidate the attribute cache in all cases.
 	     * An open is going to fetch fresh attrs any way, other procs
 	     * on this node that have file open will be forced to do an
 	     * otw attr fetch, but this is safe.
 	     * --> A user found that their RPC count dropped by 20% when
 	     *     this was commented out and I can't see any requirement
 	     *     for it, so I've disabled it when negative lookups are
 	     *     enabled. (What does this have to do with negative lookup
 	     *     caching? Well nothing, except it was reported by the
 	     *     same user that needed negative lookup caching and I wanted
 	     *     there to be a way to disable it to see if it
 	     *     is the cause of some caching/coherency issue that might
 	     *     crop up.)
 	     */
 	    if (VFSTONFS(vp->v_mount)->nm_negnametimeo == 0) {
 		    np->n_attrstamp = 0;
 		    KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	    }
 	    /*
 	     * A recorded write error replaces whatever error the flush
 	     * above produced, and the NWRITEERR flag is cleared.
 	     */
 	    if (np->n_flag & NWRITEERR) {
 		np->n_flag &= ~NWRITEERR;
 		error = np->n_error;
 	    }
 	    mtx_unlock(&np->n_mtx);
 	}
 
 	if (NFS_ISV4(vp)) {
 		/*
 		 * Get attributes so "change" is up to date.
 		 * A failure of this getattr is not reported to the caller.
 		 */
 		if (error == 0 && nfscl_mustflush(vp) != 0) {
 			ret = nfsrpc_getattr(vp, cred, ap->a_td, &nfsva,
 			    NULL);
 			if (!ret) {
 				np->n_change = nfsva.na_filerev;
 				(void) nfscl_loadattrcache(&vp, &nfsva, NULL,
 				    NULL, 0, 0);
 			}
 		}
 
 		/*
 		 * and do the close.
 		 * The close error is only reported when no earlier error
 		 * is pending.
 		 */
 		ret = nfsrpc_close(vp, 0, ap->a_td);
 		if (!error && ret)
 			error = ret;
 		if (error)
 			error = nfscl_maperr(ap->a_td, error, (uid_t)0,
 			    (gid_t)0);
 	}
 	if (newnfs_directio_enable)
 		KASSERT((np->n_directio_asyncwr == 0),
 			("nfs_close: dirty unflushed (%d) directio buffers\n",
 			 np->n_directio_asyncwr));
 	/*
 	 * Drop this open's O_DIRECT count; caching is re-enabled (NNONCACHE
 	 * cleared) once the count reaches zero.
 	 */
 	if (newnfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
 		mtx_lock(&np->n_mtx);
 		KASSERT((np->n_directio_opens > 0), 
 			("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
 		np->n_directio_opens--;
 		if (np->n_directio_opens == 0)
 			np->n_flag &= ~NNONCACHE;
 		mtx_unlock(&np->n_mtx);
 	}
 	if (localcred)
 		NFSFREECRED(cred);
 	return (error);
 }
 
 /*
  * nfs getattr call from vfs.
  */
 static int
 nfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = curthread;	/* XXX */
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 	struct nfsvattr nfsva;
 	struct vattr *vap = ap->a_vap;
 	struct vattr vattr;
 
 	/*
 	 * Update local times for special files.
 	 */
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & (NACC | NUPD))
 		np->n_flag |= NCHG;
 	mtx_unlock(&np->n_mtx);
 	/*
 	 * First look in the cache.
 	 */
 	if (ncl_getattrcache(vp, &vattr) == 0) {
 		vap->va_type = vattr.va_type;
 		vap->va_mode = vattr.va_mode;
 		vap->va_nlink = vattr.va_nlink;
 		vap->va_uid = vattr.va_uid;
 		vap->va_gid = vattr.va_gid;
 		vap->va_fsid = vattr.va_fsid;
 		vap->va_fileid = vattr.va_fileid;
 		vap->va_size = vattr.va_size;
 		vap->va_blocksize = vattr.va_blocksize;
 		vap->va_atime = vattr.va_atime;
 		vap->va_mtime = vattr.va_mtime;
 		vap->va_ctime = vattr.va_ctime;
 		vap->va_gen = vattr.va_gen;
 		vap->va_flags = vattr.va_flags;
 		vap->va_rdev = vattr.va_rdev;
 		vap->va_bytes = vattr.va_bytes;
 		vap->va_filerev = vattr.va_filerev;
 		/*
 		 * Get the local modify time for the case of a write
 		 * delegation.
 		 */
 		nfscl_deleggetmodtime(vp, &vap->va_mtime);
 		return (0);
 	}
 
 	if (NFS_ISV34(vp) && nfs_prime_access_cache &&
 	    nfsaccess_cache_timeout > 0) {
 		NFSINCRGLOBAL(newnfsstats.accesscache_misses);
 		nfs34_access_otw(vp, NFSACCESS_ALL, td, ap->a_cred, NULL);
 		if (ncl_getattrcache(vp, ap->a_vap) == 0) {
 			nfscl_deleggetmodtime(vp, &ap->a_vap->va_mtime);
 			return (0);
 		}
 	}
 	error = nfsrpc_getattr(vp, ap->a_cred, td, &nfsva, NULL);
 	if (!error)
 		error = nfscl_loadattrcache(&vp, &nfsva, vap, NULL, 0, 0);
 	if (!error) {
 		/*
 		 * Get the local modify time for the case of a write
 		 * delegation.
 		 */
 		nfscl_deleggetmodtime(vp, &vap->va_mtime);
 	} else if (NFS_ISV4(vp)) {
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	}
 	return (error);
 }
 
 /*
  * nfs setattr call.
  */
 static int
 nfs_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct thread *td = curthread;	/* XXX */
 	struct vattr *vap = ap->a_vap;
 	int error = 0;
 	/* Previous file size, used to roll back after a failed size change. */
 	u_quad_t tsize;
 
 #ifndef nolint
 	tsize = (u_quad_t)0;
 #endif
 
 	/*
 	 * Setting of flags and marking of atimes are not supported.
 	 */
 	if (vap->va_flags != VNOVAL)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Disallow write attempts if the filesystem is mounted read-only.
 	 * NOTE(review): the va_flags term below cannot be true here, since
 	 * the check above has already returned for va_flags != VNOVAL.
 	 */
   	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
 	    (vp->v_mount->mnt_flag & MNT_RDONLY))
 		return (EROFS);
 	if (vap->va_size != VNOVAL) {
  		switch (vp->v_type) {
  		case VDIR:
  			return (EISDIR);
  		case VCHR:
  		case VBLK:
  		case VSOCK:
  		case VFIFO:
 			/*
 			 * A size change on a special file is dropped: if
 			 * nothing else was requested we are done, otherwise
 			 * va_size is cleared and the rest is sent on.
 			 */
 			if (vap->va_mtime.tv_sec == VNOVAL &&
 			    vap->va_atime.tv_sec == VNOVAL &&
 			    vap->va_mode == (mode_t)VNOVAL &&
 			    vap->va_uid == (uid_t)VNOVAL &&
 			    vap->va_gid == (gid_t)VNOVAL)
 				return (0);		
  			vap->va_size = VNOVAL;
  			break;
  		default:
 			/*
 			 * Disallow write attempts if the filesystem is
 			 * mounted read-only.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			/*
 			 *  We run vnode_pager_setsize() early (why?),
 			 * we must set np->n_size now to avoid vinvalbuf
 			 * V_SAVE races that might setsize a lower
 			 * value.
 			 */
 			mtx_lock(&np->n_mtx);
 			tsize = np->n_size;
 			mtx_unlock(&np->n_mtx);
 			/*
 			 * NOTE(review): the value stored in error here is
 			 * never tested; it is overwritten by ncl_vinvalbuf()
 			 * or by nfs_setattrrpc() below.
 			 */
 			error = ncl_meta_setsize(vp, ap->a_cred, td,
 			    vap->va_size);
 			mtx_lock(&np->n_mtx);
  			if (np->n_flag & NMODIFIED) {
 			    tsize = np->n_size;
 			    mtx_unlock(&np->n_mtx);
 			    /*
 			     * Truncating to zero discards the dirty buffers;
 			     * any other size writes them out first (V_SAVE).
 			     */
  			    if (vap->va_size == 0)
  				error = ncl_vinvalbuf(vp, 0, td, 1);
  			    else
  				error = ncl_vinvalbuf(vp, V_SAVE, td, 1);
  			    if (error) {
 				vnode_pager_setsize(vp, tsize);
 				return (error);
 			    }
 			    /*
 			     * Call nfscl_delegmodtime() to set the modify time
 			     * locally, as required.
 			     */
 			    nfscl_delegmodtime(vp);
  			} else
 			    mtx_unlock(&np->n_mtx);
 			/*
 			 * np->n_size has already been set to vap->va_size
 			 * in ncl_meta_setsize(). We must set it again since
 			 * nfs_loadattrcache() could be called through
 			 * ncl_meta_setsize() and could modify np->n_size.
 			 */
 			mtx_lock(&np->n_mtx);
  			np->n_vattr.na_size = np->n_size = vap->va_size;
 			mtx_unlock(&np->n_mtx);
   		};
   	} else {
 		/*
 		 * No size change: when a time is being set on a locally
 		 * modified regular file, write the dirty buffers out first.
 		 * Only EINTR and EIO from that flush abort the setattr.
 		 */
 		mtx_lock(&np->n_mtx);
 		if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && 
 		    (np->n_flag & NMODIFIED) && vp->v_type == VREG) {
 			mtx_unlock(&np->n_mtx);
 			if ((error = ncl_vinvalbuf(vp, V_SAVE, td, 1)) != 0 &&
 			    (error == EINTR || error == EIO))
 				return (error);
 		} else
 			mtx_unlock(&np->n_mtx);
 	}
 	error = nfs_setattrrpc(vp, vap, ap->a_cred, td);
 	/* The rpc failed on a size change: restore the saved size. */
 	if (error && vap->va_size != VNOVAL) {
 		mtx_lock(&np->n_mtx);
 		np->n_size = np->n_vattr.na_size = tsize;
 		vnode_pager_setsize(vp, tsize);
 		mtx_unlock(&np->n_mtx);
 	}
 	return (error);
 }
 
 /*
  * Do an nfs setattr rpc.
  */
 static int
 nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsnode *np = VTONFS(vp);
 	int error, ret, attrflag, i;
 	struct nfsvattr nfsva;
 
 	if (NFS_ISV34(vp)) {
 		mtx_lock(&np->n_mtx);
 		for (i = 0; i < NFS_ACCESSCACHESIZE; i++)
 			np->n_accesscache[i].stamp = 0;
 		np->n_flag |= NDELEGMOD;
 		mtx_unlock(&np->n_mtx);
 		KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp);
 	}
 	error = nfsrpc_setattr(vp, vap, NULL, cred, td, &nfsva, &attrflag,
 	    NULL);
 	if (attrflag) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(td, error, vap->va_uid, vap->va_gid);
 	return (error);
 }
 
 /*
  * nfs lookup call, one step at a time...
  * First look in cache
  * If not found, unlock the directory nfsnode and do the rpc
  */
 static int
 nfs_lookup(struct vop_lookup_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct mount *mp = dvp->v_mount;
 	int flags = cnp->cn_flags;
 	struct vnode *newvp;
 	struct nfsmount *nmp;
 	struct nfsnode *np, *newnp;
 	int error = 0, attrflag, dattrflag, ltype, ncticks;
 	struct thread *td = cnp->cn_thread;
 	struct nfsfh *nfhp;
 	struct nfsvattr dnfsva, nfsva;
 	struct vattr vattr;
 	struct timespec nctime;
 	
 	*vpp = NULLVP;
 	/* Deleting or renaming the last component needs a writable mount. */
 	if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 	nmp = VFSTONFS(mp);
 	np = VTONFS(dvp);
 
 	/* For NFSv4, wait until any remove is done. */
 	mtx_lock(&np->n_mtx);
 	while (NFSHASNFSV4(nmp) && (np->n_flag & NREMOVEINPROG)) {
 		np->n_flag |= NREMOVEWANT;
 		(void) msleep((caddr_t)np, &np->n_mtx, PZERO, "nfslkup", 0);
 	}
 	mtx_unlock(&np->n_mtx);
 
 	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0)
 		return (error);
 	/*
 	 * Consult the name cache. As handled below: -1 is a positive hit
 	 * with *vpp filled in, ENOENT is a negative hit, any other positive
 	 * value is returned as an error, and everything else falls through
 	 * to the lookup rpc.
 	 */
 	error = cache_lookup(dvp, vpp, cnp, &nctime, &ncticks);
 	if (error > 0 && error != ENOENT)
 		return (error);
 	if (error == -1) {
 		/*
 		 * Lookups of "." are special and always return the
 		 * current directory.  cache_lookup() already handles
 		 * associated locking bookkeeping, etc.
 		 */
 		if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 			/* XXX: Is this really correct? */
 			if (cnp->cn_nameiop != LOOKUP &&
 			    (flags & ISLASTCN))
 				cnp->cn_flags |= SAVENAME;
 			return (0);
 		}
 
 		/*
 		 * We only accept a positive hit in the cache if the
 		 * change time of the file matches our cached copy.
 		 * Otherwise, we discard the cache entry and fallback
 		 * to doing a lookup RPC.  We also only trust cache
 		 * entries for less than nm_nametimeo seconds.
 		 *
 		 * To better handle stale file handles and attributes,
 		 * clear the attribute cache of this node if it is a
 		 * leaf component, part of an open() call, and not
 		 * locally modified before fetching the attributes.
 		 * This should allow stale file handles to be detected
 		 * here where we can fall back to a LOOKUP RPC to
 		 * recover rather than having nfs_open() detect the
 		 * stale file handle and failing open(2) with ESTALE.
 		 */
 		newvp = *vpp;
 		newnp = VTONFS(newvp);
 		if (!(nmp->nm_flag & NFSMNT_NOCTO) &&
 		    (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
 		    !(newnp->n_flag & NMODIFIED)) {
 			mtx_lock(&newnp->n_mtx);
 			newnp->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
 			mtx_unlock(&newnp->n_mtx);
 		}
 		/*
 		 * Accept the hit when nfscl_nodeleg() returns 0, or when
 		 * the entry is young enough and the ctime still matches.
 		 */
 		if (nfscl_nodeleg(newvp, 0) == 0 ||
 		    ((u_int)(ticks - ncticks) < (nmp->nm_nametimeo * hz) &&
 		    VOP_GETATTR(newvp, &vattr, cnp->cn_cred) == 0 &&
 		    timespeccmp(&vattr.va_ctime, &nctime, ==))) {
 			NFSINCRGLOBAL(newnfsstats.lookupcache_hits);
 			if (cnp->cn_nameiop != LOOKUP &&
 			    (flags & ISLASTCN))
 				cnp->cn_flags |= SAVENAME;
 			return (0);
 		}
 		/* Rejected hit: purge it and release the cached vnode. */
 		cache_purge(newvp);
 		if (dvp != newvp)
 			vput(newvp);
 		else 
 			vrele(newvp);
 		*vpp = NULLVP;
 	} else if (error == ENOENT) {
 		if (dvp->v_iflag & VI_DOOMED)
 			return (ENOENT);
 		/*
 		 * We only accept a negative hit in the cache if the
 		 * modification time of the parent directory matches
 		 * the cached copy in the name cache entry.
 		 * Otherwise, we discard all of the negative cache
 		 * entries for this directory.  We also only trust
 		 * negative cache entries for up to nm_negnametimeo
 		 * seconds.
 		 */
 		if ((u_int)(ticks - ncticks) < (nmp->nm_negnametimeo * hz) &&
 		    VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0 &&
 		    timespeccmp(&vattr.va_mtime, &nctime, ==)) {
 			NFSINCRGLOBAL(newnfsstats.lookupcache_hits);
 			return (ENOENT);
 		}
 		cache_purge_negative(dvp);
 	}
 
 	/* No usable cache entry: do the lookup rpc. */
 	error = 0;
 	newvp = NULLVP;
 	NFSINCRGLOBAL(newnfsstats.lookupcache_misses);
 	error = nfsrpc_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    cnp->cn_cred, td, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
 	    NULL);
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (error) {
 		/*
 		 * NOTE(review): newvp was set to NULLVP just above and is
 		 * not assigned in between, so this branch looks unreachable.
 		 */
 		if (newvp != NULLVP) {
 			vput(newvp);
 			*vpp = NULLVP;
 		}
 
 		if (error != ENOENT) {
 			if (NFS_ISV4(dvp))
 				error = nfscl_maperr(td, error, (uid_t)0,
 				    (gid_t)0);
 			return (error);
 		}
 
 		/* The requested file was not found. */
 		if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) &&
 		    (flags & ISLASTCN)) {
 			/*
 			 * XXX: UFS does a full VOP_ACCESS(dvp,
 			 * VWRITE) here instead of just checking
 			 * MNT_RDONLY.
 			 */
 			if (mp->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 
 		if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE &&
 		    dattrflag) {
 			/*
 			 * Cache the modification time of the parent
 			 * directory from the post-op attributes in
 			 * the name cache entry.  The negative cache
 			 * entry will be ignored once the directory
 			 * has changed.  Don't bother adding the entry
 			 * if the directory has already changed.
 			 */
 			mtx_lock(&np->n_mtx);
 			if (timespeccmp(&np->n_vattr.na_mtime,
 			    &dnfsva.na_mtime, ==)) {
 				mtx_unlock(&np->n_mtx);
 				cache_enter_time(dvp, NULL, cnp,
 				    &dnfsva.na_mtime, NULL);
 			} else
 				mtx_unlock(&np->n_mtx);
 		}
 		return (ENOENT);
 	}
 
 	/*
 	 * Handle RENAME case...
 	 * A file handle equal to the directory's own is refused with
 	 * EISDIR; otherwise the target is returned exclusively locked and
 	 * is not entered into the name cache.
 	 */
 	if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) {
 		if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) {
 			FREE((caddr_t)nfhp, M_NFSFH);
 			return (EISDIR);
 		}
 		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
 		    LK_EXCLUSIVE);
 		if (error)
 			return (error);
 		newvp = NFSTOV(np);
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 		*vpp = newvp;
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		/*
 		 * ".." lookup: dvp is unlocked while the parent vnode is
 		 * obtained and relocked (with its previous lock type)
 		 * afterwards.  The mount is kept busy across that window,
 		 * and dvp is rechecked for VI_DOOMED after each relock.
 		 */
 		ltype = NFSVOPISLOCKED(dvp);
 		error = vfs_busy(mp, MBF_NOWAIT);
 		if (error != 0) {
 			/*
 			 * Could not busy the mount without waiting: hold a
 			 * reference, drop the dvp lock and wait for it.
 			 */
 			vfs_ref(mp);
 			NFSVOPUNLOCK(dvp, 0);
 			error = vfs_busy(mp, 0);
 			NFSVOPLOCK(dvp, ltype | LK_RETRY);
 			vfs_rel(mp);
 			if (error == 0 && (dvp->v_iflag & VI_DOOMED)) {
 				vfs_unbusy(mp);
 				error = ENOENT;
 			}
 			if (error != 0)
 				return (error);
 		}
 		NFSVOPUNLOCK(dvp, 0);
 		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
 		    cnp->cn_lkflags);
 		if (error == 0)
 			newvp = NFSTOV(np);
 		vfs_unbusy(mp);
 		if (newvp != dvp)
 			NFSVOPLOCK(dvp, ltype | LK_RETRY);
 		if (dvp->v_iflag & VI_DOOMED) {
 			if (error == 0) {
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 			}
 			error = ENOENT;
 		}
 		if (error != 0)
 			return (error);
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	} else if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) {
 		/* The name resolved to the directory itself. */
 		FREE((caddr_t)nfhp, M_NFSFH);
 		VREF(dvp);
 		newvp = dvp;
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	} else {
 		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
 		    cnp->cn_lkflags);
 		if (error)
 			return (error);
 		newvp = NFSTOV(np);
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 		else if ((flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
 		    !(np->n_flag & NMODIFIED)) {			
 			/*
 			 * Flush the attribute cache when opening a
 			 * leaf node to ensure that fresh attributes
 			 * are fetched in nfs_open() since we did not
 			 * fetch attributes from the LOOKUP reply.
 			 */
 			mtx_lock(&np->n_mtx);
 			np->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
 			mtx_unlock(&np->n_mtx);
 		}
 	}
 	if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
 		cnp->cn_flags |= SAVENAME;
 	/*
 	 * Enter the result into the name cache only when the reply carried
 	 * attributes for it (and, for a directory, for the parent as well),
 	 * and not for the last component of a DELETE.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) &&
 	    (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN)) &&
 	    attrflag != 0 && (newvp->v_type != VDIR || dattrflag != 0))
 		cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime,
 		    newvp->v_type != VDIR ? NULL : &dnfsva.na_ctime);
 	*vpp = newvp;
 	return (0);
 }
 
 /*
  * nfs read vnode op.
  * Regular files are read through ncl_bioread().  Reading a directory
  * yields EISDIR and any other vnode type yields EOPNOTSUPP.
  */
 static int
 nfs_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (vp->v_type == VREG)
 		return (ncl_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
 	if (vp->v_type == VDIR)
 		return (EISDIR);
 	return (EOPNOTSUPP);
 }
 
 /*
  * nfs readlink vnode op.
  * Symbolic links are read through ncl_bioread() with no I/O flags;
  * any other vnode type yields EINVAL.
  */
 static int
 nfs_readlink(struct vop_readlink_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (vp->v_type == VLNK)
 		return (ncl_bioread(vp, ap->a_uio, 0, ap->a_cred));
 	return (EINVAL);
 }
 
 /*
  * Do a readlink rpc.
  * Called by ncl_doio() from below the buffer cache.
  */
 int
 ncl_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	int error, ret, attrflag;
 	struct nfsvattr nfsva;
 
 	error = nfsrpc_readlink(vp, uiop, cred, uiop->uio_td, &nfsva,
 	    &attrflag, NULL);
 	if (attrflag) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs read rpc call
  * Ditto above
  */
 int
 ncl_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	int error, ret, attrflag;
 	struct nfsvattr nfsva;
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(vnode_mount(vp));
 	error = EIO;
 	attrflag = 0;
 	if (NFSHASPNFS(nmp))
 		error = nfscl_doiods(vp, uiop, NULL, NULL,
 		    NFSV4OPEN_ACCESSREAD, cred, uiop->uio_td);
 	NFSCL_DEBUG(4, "readrpc: aft doiods=%d\n", error);
 	if (error != 0)
 		error = nfsrpc_read(vp, uiop, cred, uiop->uio_td, &nfsva,
 		    &attrflag, NULL);
 	if (attrflag) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs write call
  */
 int
 ncl_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
     int *iomode, int *must_commit, int called_from_strategy)
 {
 	struct nfsvattr nfsva;
 	int error, attrflag, ret;
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(vnode_mount(vp));
 	error = EIO;
 	attrflag = 0;
 	if (NFSHASPNFS(nmp))
 		error = nfscl_doiods(vp, uiop, iomode, must_commit,
 		    NFSV4OPEN_ACCESSWRITE, cred, uiop->uio_td);
 	NFSCL_DEBUG(4, "writerpc: aft doiods=%d\n", error);
 	if (error != 0)
 		error = nfsrpc_write(vp, uiop, iomode, must_commit, cred,
 		    uiop->uio_td, &nfsva, &attrflag, NULL,
 		    called_from_strategy);
 	if (attrflag) {
 		if (VTONFS(vp)->n_flag & ND_NFSV4)
 			ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 1,
 			    1);
 		else
 			ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0,
 			    1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (DOINGASYNC(vp))
 		*iomode = NFSWRITE_FILESYNC;
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs mknod rpc
  * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
  * mode set to specify the file type and the size field for rdev.
  */
 static int
 nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct vattr *vap)
 {
 	struct nfsvattr nfsva, dnfsva;
 	struct vnode *newvp = NULL;
 	struct nfsnode *np = NULL, *dnp;
 	struct nfsfh *nfhp;
 	struct vattr vattr;
 	int error = 0, attrflag, dattrflag;
 	u_int32_t rdev;
 
 	if (vap->va_type == VCHR || vap->va_type == VBLK)
 		rdev = vap->va_rdev;
 	else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
 		rdev = 0xffffffff;
 	else
 		return (EOPNOTSUPP);
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)))
 		return (error);
 	error = nfsrpc_mknod(dvp, cnp->cn_nameptr, cnp->cn_namelen, vap,
 	    rdev, vap->va_type, cnp->cn_cred, cnp->cn_thread, &dnfsva,
 	    &nfsva, &nfhp, &attrflag, &dattrflag, NULL);
 	if (!error) {
 		if (!nfhp)
 			(void) nfsrpc_lookup(dvp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread,
 			    &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
 			    NULL);
 		if (nfhp)
 			error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp,
 			    cnp->cn_thread, &np, NULL, LK_EXCLUSIVE);
 	}
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (!error) {
 		newvp = NFSTOV(np);
 		if (attrflag != 0) {
 			error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 			if (error != 0)
 				vput(newvp);
 		}
 	}
 	if (!error) {
 		*vpp = newvp;
 	} else if (NFS_ISV4(dvp)) {
 		error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid,
 		    vap->va_gid);
 	}
 	dnp = VTONFS(dvp);
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (!dattrflag) {
 		dnp->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	mtx_unlock(&dnp->n_mtx);
 	return (error);
 }
 
 /*
  * nfs mknod vop
  * Unpack the argument structure and let nfs_mknodrpc() do the work; its
  * return value is passed back unchanged.
  */
 /* ARGSUSED */
 static int
 nfs_mknod(struct vop_mknod_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct vattr *vap = ap->a_vap;
 
 	return (nfs_mknodrpc(dvp, vpp, cnp, vap));
 }
 
 /* Protects the create verifier state kept inside nfs_get_cverf(). */
 static struct mtx nfs_cverf_mtx;
 MTX_SYSINIT(nfs_cverf_mtx, &nfs_cverf_mtx, "NFS create verifier mutex",
     MTX_DEF);
 
 /*
  * Return a create verifier.  The first call seeds both 32-bit halves from
  * arc4random(); every later call increments the stored 64-bit value and
  * returns the result.  The stored value is only touched with
  * nfs_cverf_mtx held.
  */
 static nfsquad_t
 nfs_get_cverf(void)
 {
 	static nfsquad_t cverf;
 	static int cverf_initialized = 0;
 	nfsquad_t ret;
 
 	mtx_lock(&nfs_cverf_mtx);
 	if (cverf_initialized != 0) {
 		cverf.qval++;
 	} else {
 		cverf.lval[0] = arc4random();
 		cverf.lval[1] = arc4random();
 		cverf_initialized = 1;
 	}
 	ret = cverf;
 	mtx_unlock(&nfs_cverf_mtx);
 
 	return (ret);
 }
 
 /*
  * nfs file create call
  *
  * Sockets are handed to nfs_mknodrpc().  For every other type a create rpc
  * is issued with a fresh verifier from nfs_get_cverf(); VA_EXCLUSIVE in
  * va_vaflags selects an O_EXCL create.  When NFS_ISV34(dvp) holds and an
  * exclusive create fails with NFSERR_NOTSUPP, the create is retried
  * without O_EXCL.
  *
  * On success the new vnode (obtained from nfscl_nget() with LK_EXCLUSIVE)
  * is returned in *ap->a_vpp; on failure any new vnode is released with
  * vput().  The directory is always marked NMODIFIED, and its attribute
  * cache is invalidated when the reply carried no directory attributes.
  *
  * Returns 0 or an error number (passed through nfscl_maperr() for NFSv4).
  */
 static int
 nfs_create(struct vop_create_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = NULL, *dnp;
 	struct vnode *newvp = NULL;
 	struct nfsmount *nmp;
 	struct nfsvattr dnfsva, nfsva;
 	struct nfsfh *nfhp;
 	nfsquad_t cverf;
 	int error = 0, attrflag, dattrflag, fmode = 0;
 	struct vattr vattr;
 
 	/*
 	 * Oops, not for me..
 	 */
 	if (vap->va_type == VSOCK)
 		return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap));
 
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)))
 		return (error);
 	if (vap->va_vaflags & VA_EXCLUSIVE)
 		fmode |= O_EXCL;
 	dnp = VTONFS(dvp);
 	nmp = VFSTONFS(vnode_mount(dvp));
 again:
 	/* For NFSv4, wait until any remove is done. */
 	mtx_lock(&dnp->n_mtx);
 	while (NFSHASNFSV4(nmp) && (dnp->n_flag & NREMOVEINPROG)) {
 		dnp->n_flag |= NREMOVEWANT;
 		(void) msleep((caddr_t)dnp, &dnp->n_mtx, PZERO, "nfscrt", 0);
 	}
 	mtx_unlock(&dnp->n_mtx);
 
 	/* A new verifier is fetched for every attempt, including retries. */
 	cverf = nfs_get_cverf();
 	error = nfsrpc_create(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    vap, cverf, fmode, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva,
 	    &nfhp, &attrflag, &dattrflag, NULL);
 	if (!error) {
 		/* No file handle in the reply: look the new name up. */
 		if (nfhp == NULL)
 			(void) nfsrpc_lookup(dvp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread,
 			    &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
 			    NULL);
 		if (nfhp != NULL)
 			error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp,
 			    cnp->cn_thread, &np, NULL, LK_EXCLUSIVE);
 	}
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (!error) {
 		newvp = NFSTOV(np);
 		/* No attributes came back with the reply, so fetch them. */
 		if (attrflag == 0)
 			error = nfsrpc_getattr(newvp, cnp->cn_cred,
 			    cnp->cn_thread, &nfsva, NULL);
 		if (error == 0)
 			error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	}
 	if (error) {
 		if (newvp != NULL) {
 			vput(newvp);
 			newvp = NULL;
 		}
 		/* Exclusive create not supported: retry as a plain create. */
 		if (NFS_ISV34(dvp) && (fmode & O_EXCL) &&
 		    error == NFSERR_NOTSUPP) {
 			fmode &= ~O_EXCL;
 			goto again;
 		}
 	} else if (NFS_ISV34(dvp) && (fmode & O_EXCL)) {
 		if (nfscl_checksattr(vap, &nfsva)) {
 			/*
 			 * We are normally called with only a partially
 			 * initialized VAP. Since the NFSv3 spec says that
 			 * the server may use the file attributes to
 			 * store the verifier, the spec requires us to do a
 			 * SETATTR RPC. FreeBSD servers store the verifier in
 			 * atime, but we can't really assume that all servers
 			 * will so we ensure that our SETATTR sets both atime
 			 * and mtime.
 			 */
 			if (vap->va_mtime.tv_sec == VNOVAL)
 				vfs_timestamp(&vap->va_mtime);
 			if (vap->va_atime.tv_sec == VNOVAL)
 				vap->va_atime = vap->va_mtime;
 			error = nfsrpc_setattr(newvp, vap, NULL, cnp->cn_cred,
 			    cnp->cn_thread, &nfsva, &attrflag, NULL);
 			if (error && (vap->va_uid != (uid_t)VNOVAL ||
 			    vap->va_gid != (gid_t)VNOVAL)) {
 				/* try again without setting uid/gid */
 				vap->va_uid = (uid_t)VNOVAL;
 				vap->va_gid = (gid_t)VNOVAL;
 				error = nfsrpc_setattr(newvp, vap, NULL,
 				    cnp->cn_cred, cnp->cn_thread, &nfsva,
 				    &attrflag, NULL);
 			}
 			if (attrflag)
 				(void) nfscl_loadattrcache(&newvp, &nfsva, NULL,
 				    NULL, 0, 1);
 			if (error != 0)
 				vput(newvp);
 		}
 	}
 	if (!error) {
 		if ((cnp->cn_flags & MAKEENTRY) && attrflag)
 			cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime,
 			    NULL);
 		*ap->a_vpp = newvp;
 	} else if (NFS_ISV4(dvp)) {
 		error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid,
 		    vap->va_gid);
 	}
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (!dattrflag) {
 		dnp->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	mtx_unlock(&dnp->n_mtx);
 	return (error);
 }
 
 /*
  * nfs file remove call
  * To try and make nfs semantics closer to ufs semantics, a file that has
  * other processes using the vnode is renamed instead of removed and then
  * removed later on the last close.
  *
  * - Directories are refused with EPERM.
  * - If the vnode has a single reference, or it has already been silly
  *   renamed and VOP_GETATTR() reports more than one link, the name cache
  *   and cached buffers are purged and the remove rpc is done.
  * - Otherwise, if a silly rename is not already in the works,
  *   nfs_sillyrename() is called to set it up.
  *
  * The vnode's attribute cache timestamp is cleared in every case.
  */
 static int
 nfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 	struct vattr vattr;
 
 	KASSERT((cnp->cn_flags & HASBUF) != 0, ("nfs_remove: no name"));
 	KASSERT(vrefcnt(vp) > 0, ("nfs_remove: bad v_usecount"));
 	if (vp->v_type == VDIR)
 		error = EPERM;
 	else if (vrefcnt(vp) == 1 || (np->n_sillyrename &&
 	    VOP_GETATTR(vp, &vattr, cnp->cn_cred) == 0 &&
 	    vattr.va_nlink > 1)) {
 		/*
 		 * Purge the name cache so that the chance of a lookup for
 		 * the name succeeding while the remove is in progress is
 		 * minimized. Without node locking it can still happen, such
 		 * that an I/O op returns ESTALE, but since you get this if
 		 * another host removes the file..
 		 */
 		cache_purge(vp);
 		/*
 		 * throw away biocache buffers, mainly to avoid
 		 * unnecessary delayed writes later.
 		 */
 		error = ncl_vinvalbuf(vp, 0, cnp->cn_thread, 1);
 		/* Do the rpc, unless the buffer invalidation hit EINTR/EIO. */
 		if (error != EINTR && error != EIO)
 			error = nfs_removerpc(dvp, vp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread);
 		/*
 		 * Kludge City: If the first reply to the remove rpc is lost..
 		 *   the reply to the retransmitted request will be ENOENT
 		 *   since the file was in fact removed
 		 *   Therefore, we cheat and return success.
 		 */
 		if (error == ENOENT)
 			error = 0;
 	} else if (!np->n_sillyrename)
 		error = nfs_sillyrename(dvp, vp, cnp);
 	/* Invalidate the cached attributes of the file, whatever happened. */
 	mtx_lock(&np->n_mtx);
 	np->n_attrstamp = 0;
 	mtx_unlock(&np->n_mtx);
 	KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	return (error);
 }
 
 /*
  * nfs file remove rpc called from nfs_inactive
  */
 int
 ncl_removeit(struct sillyrename *sp, struct vnode *vp)
 {
 	/*
 	 * Make sure that the directory vnode is still valid.
 	 * XXX we should lock sp->s_dvp here.
 	 */
 	if (sp->s_dvp->v_type == VBAD)
 		return (0);
 	return (nfs_removerpc(sp->s_dvp, vp, sp->s_name, sp->s_namlen,
 	    sp->s_cred, NULL));
 }
 
 /*
  * Nfs remove rpc, called from nfs_remove() and ncl_removeit().
  */
 static int
 nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name,
     int namelen, struct ucred *cred, struct thread *td)
 {
 	struct nfsvattr dnfsva;
 	struct nfsnode *dnp = VTONFS(dvp);
 	int error = 0, dattrflag;
 
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NREMOVEINPROG;
 	mtx_unlock(&dnp->n_mtx);
 	error = nfsrpc_remove(dvp, name, namelen, vp, cred, td, &dnfsva,
 	    &dattrflag, NULL);
 	mtx_lock(&dnp->n_mtx);
 	if ((dnp->n_flag & NREMOVEWANT)) {
 		dnp->n_flag &= ~(NREMOVEWANT | NREMOVEINPROG);
 		mtx_unlock(&dnp->n_mtx);
 		wakeup((caddr_t)dnp);
 	} else {
 		dnp->n_flag &= ~NREMOVEINPROG;
 		mtx_unlock(&dnp->n_mtx);
 	}
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (!dattrflag) {
 		dnp->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	mtx_unlock(&dnp->n_mtx);
 	if (error && NFS_ISV4(dvp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs file rename call
  *
  * Source and target must live on the same mount, otherwise EXDEV is
  * returned.  Dirty data of the source (and of an existing target) is
  * pushed to the server first; if that fails the rename is not attempted.
  * An existing target that is still in use and is not a directory is silly
  * renamed out of the way before the rename rpc is sent.
  *
  * On every exit path tdvp and tvp are released with vput() (vrele() for
  * tdvp when it is the same vnode as tvp) and fdvp and fvp with vrele().
  * ENOENT is mapped to success.
  */
 static int
 nfs_rename(struct vop_rename_args *ap)
 {
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct nfsnode *fnp = VTONFS(ap->a_fvp);
 	struct nfsnode *tdnp = VTONFS(ap->a_tdvp);
 	struct nfsv4node *newv4 = NULL;
 	int error;
 
 	KASSERT((tcnp->cn_flags & HASBUF) != 0 &&
 	    (fcnp->cn_flags & HASBUF) != 0, ("nfs_rename: no name"));
 	/* Check for cross-device rename */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		goto out;
 	}
 
 	if (fvp == tvp) {
 		ncl_printf("nfs_rename: fvp == tvp (can't happen)\n");
 		error = 0;
 		goto out;
 	}
 	if ((error = NFSVOPLOCK(fvp, LK_EXCLUSIVE)) != 0)
 		goto out;
 
 	/*
 	 * We have to flush B_DELWRI data prior to renaming
 	 * the file.  If we don't, the delayed-write buffers
 	 * can be flushed out later after the file has gone stale
 	 * under NFSV3.  NFSV2 does not have this problem because
 	 * ( as far as I can tell ) it flushes dirty buffers more
 	 * often.
 	 *
 	 * Skip the rename operation if the fsync fails, this can happen
 	 * due to the server's volume being full, when we pushed out data
 	 * that was written back to our cache earlier. Not checking for
 	 * this condition can result in potential (silent) data loss.
 	 */
 	error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread);
 	NFSVOPUNLOCK(fvp, 0);
 	if (!error && tvp)
 		error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread);
 	if (error)
 		goto out;
 
 	/*
 	 * If the tvp exists and is in use, sillyrename it before doing the
 	 * rename of the new file over it.
 	 * XXX Can't sillyrename a directory.
 	 * When the silly rename succeeds tvp is released here and the
 	 * rename rpc is sent without a target vnode.
 	 */
 	if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename &&
 		tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
 		vput(tvp);
 		tvp = NULL;
 	}
 
 	error = nfs_renamerpc(fdvp, fvp, fcnp->cn_nameptr, fcnp->cn_namelen,
 	    tdvp, tvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
 	    tcnp->cn_thread);
 
 	if (error == 0 && NFS_ISV4(tdvp)) {
 		/*
 		 * For NFSv4, check to see if it is the same name and
 		 * replace the name, if it is different.
 		 * The replacement is allocated up front, before the node
 		 * mutexes are taken, and freed again below when unused.
 		 */
 		MALLOC(newv4, struct nfsv4node *,
 		    sizeof (struct nfsv4node) +
 		    tdnp->n_fhp->nfh_len + tcnp->cn_namelen - 1,
 		    M_NFSV4NODE, M_WAITOK);
 		mtx_lock(&tdnp->n_mtx);
 		mtx_lock(&fnp->n_mtx);
 		if (fnp->n_v4 != NULL && fvp->v_type == VREG &&
 		    (fnp->n_v4->n4_namelen != tcnp->cn_namelen ||
 		      NFSBCMP(tcnp->cn_nameptr, NFS4NODENAME(fnp->n_v4),
 		      tcnp->cn_namelen) ||
 		      tdnp->n_fhp->nfh_len != fnp->n_v4->n4_fhlen ||
 		      NFSBCMP(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data,
 			tdnp->n_fhp->nfh_len))) {
 #ifdef notdef
 { char nnn[100]; int nnnl;
 nnnl = (tcnp->cn_namelen < 100) ? tcnp->cn_namelen : 99;
 bcopy(tcnp->cn_nameptr, nnn, nnnl);
 nnn[nnnl] = '\0';
 printf("ren replace=%s\n",nnn);
 }
 #endif
 			/* Swap in the new record: target dir handle + name. */
 			FREE((caddr_t)fnp->n_v4, M_NFSV4NODE);
 			fnp->n_v4 = newv4;
 			newv4 = NULL;
 			fnp->n_v4->n4_fhlen = tdnp->n_fhp->nfh_len;
 			fnp->n_v4->n4_namelen = tcnp->cn_namelen;
 			NFSBCOPY(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data,
 			    tdnp->n_fhp->nfh_len);
 			NFSBCOPY(tcnp->cn_nameptr,
 			    NFS4NODENAME(fnp->n_v4), tcnp->cn_namelen);
 		}
 		mtx_unlock(&tdnp->n_mtx);
 		mtx_unlock(&fnp->n_mtx);
 		if (newv4 != NULL)
 			FREE((caddr_t)newv4, M_NFSV4NODE);
 	}
 
 	/* Renaming a directory: drop cached names of the directories. */
 	if (fvp->v_type == VDIR) {
 		if (tvp != NULL && tvp->v_type == VDIR)
 			cache_purge(tdvp);
 		cache_purge(fdvp);
 	}
 
 out:
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp)
 		vput(tvp);
 	vrele(fdvp);
 	vrele(fvp);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nfs file rename rpc called from nfs_sillyrename()
  *
  * Renames svp within directory sdvp from the name in scnp to the silly
  * name stored in sp, using the credentials and thread from scnp.
  */
 static int
 nfs_renameit(struct vnode *sdvp, struct vnode *svp, struct componentname *scnp,
     struct sillyrename *sp)
 {
 	struct ucred *cred = scnp->cn_cred;
 	struct thread *td = scnp->cn_thread;
 
 	/* Source and target directory are the same; there is no target vnode. */
 	return (nfs_renamerpc(sdvp, svp, scnp->cn_nameptr, scnp->cn_namelen,
 	    sdvp, NULL, sp->s_name, sp->s_namlen, cred, td));
 }
 
 /*
  * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
  */
 static int
 nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char *fnameptr,
     int fnamelen, struct vnode *tdvp, struct vnode *tvp, char *tnameptr,
     int tnamelen, struct ucred *cred, struct thread *td)
 {
 	struct nfsvattr fnfsva, tnfsva;
 	struct nfsnode *fdnp = VTONFS(fdvp);
 	struct nfsnode *tdnp = VTONFS(tdvp);
 	int error = 0, fattrflag, tattrflag;
 
 	error = nfsrpc_rename(fdvp, fvp, fnameptr, fnamelen, tdvp, tvp,
 	    tnameptr, tnamelen, cred, td, &fnfsva, &tnfsva, &fattrflag,
 	    &tattrflag, NULL, NULL);
 	mtx_lock(&fdnp->n_mtx);
 	fdnp->n_flag |= NMODIFIED;
 	if (fattrflag != 0) {
 		mtx_unlock(&fdnp->n_mtx);
 		(void) nfscl_loadattrcache(&fdvp, &fnfsva, NULL, NULL, 0, 1);
 	} else {
 		fdnp->n_attrstamp = 0;
 		mtx_unlock(&fdnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(fdvp);
 	}
 	mtx_lock(&tdnp->n_mtx);
 	tdnp->n_flag |= NMODIFIED;
 	if (tattrflag != 0) {
 		mtx_unlock(&tdnp->n_mtx);
 		(void) nfscl_loadattrcache(&tdvp, &tnfsva, NULL, NULL, 0, 1);
 	} else {
 		tdnp->n_attrstamp = 0;
 		mtx_unlock(&tdnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
 	}
 	if (error && NFS_ISV4(fdvp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs hard link create call
  */
 static int
 nfs_link(struct vop_link_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np, *tdnp;
 	struct nfsvattr nfsva, dnfsva;
 	int error = 0, attrflag, dattrflag;
 
 	if (vp->v_mount != tdvp->v_mount) {
 		return (EXDEV);
 	}
 
 	/*
 	 * Push all writes to the server, so that the attribute cache
 	 * doesn't get "out of sync" with the server.
 	 * XXX There should be a better way!
 	 */
 	VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread);
 
 	error = nfsrpc_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_namelen,
 	    cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &attrflag,
 	    &dattrflag, NULL);
 	tdnp = VTONFS(tdvp);
 	mtx_lock(&tdnp->n_mtx);
 	tdnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		mtx_unlock(&tdnp->n_mtx);
 		(void) nfscl_loadattrcache(&tdvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		tdnp->n_attrstamp = 0;
 		mtx_unlock(&tdnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
 	}
 	if (attrflag)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 	else {
 		np = VTONFS(vp);
 		mtx_lock(&np->n_mtx);
 		np->n_attrstamp = 0;
 		mtx_unlock(&np->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	}
 	/*
 	 * If negative lookup caching is enabled, I might as well
 	 * add an entry for this node. Not necessary for correctness,
 	 * but if negative caching is enabled, then the system
 	 * must care about lookup caching hit rate, so...
 	 */
 	if (VFSTONFS(vp->v_mount)->nm_negnametimeo != 0 &&
 	    (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && error == 0) {
 		cache_enter_time(tdvp, vp, cnp, &nfsva.na_ctime, NULL);
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(cnp->cn_thread, error, (uid_t)0,
 		    (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs symbolic link create call
  *
  * Issues the symlink rpc and returns the vnode of the new link in
  * *ap->a_vpp.  When the reply carries a file handle the vnode is obtained
  * with nfscl_nget(); otherwise, if the rpc succeeded, the name is looked
  * up with nfs_lookitup().  On error any vnode obtained is released with
  * vput().  The directory is marked NMODIFIED and its attribute cache is
  * refreshed from the reply or invalidated.
  */
 static int
 nfs_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsvattr nfsva, dnfsva;
 	struct nfsfh *nfhp;
 	struct nfsnode *np = NULL, *dnp;
 	struct vnode *newvp = NULL;
 	int error = 0, attrflag, dattrflag, ret;
 
 	vap->va_type = VLNK;
 	error = nfsrpc_symlink(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    ap->a_target, vap, cnp->cn_cred, cnp->cn_thread, &dnfsva,
 	    &nfsva, &nfhp, &attrflag, &dattrflag, NULL);
 	if (nfhp) {
 		ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread,
 		    &np, NULL, LK_EXCLUSIVE);
 		/* An nfscl_nget() failure never replaces an rpc error. */
 		if (!ret)
 			newvp = NFSTOV(np);
 		else if (!error)
 			error = ret;
 	}
 	if (newvp != NULL) {
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	} else if (!error) {
 		/*
 		 * If we do not have an error and we could not extract the
 		 * newvp from the response due to the request being NFSv2, we
 		 * have to do a lookup in order to obtain a newvp to return.
 		 */
 		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 		    cnp->cn_cred, cnp->cn_thread, &np);
 		if (!error)
 			newvp = NFSTOV(np);
 	}
 	if (error) {
 		if (newvp)
 			vput(newvp);
 		if (NFS_ISV4(dvp))
 			error = nfscl_maperr(cnp->cn_thread, error,
 			    vap->va_uid, vap->va_gid);
 	} else {
 		*ap->a_vpp = newvp;
 	}
 
 	/* The directory changed: refresh or invalidate its attributes. */
 	dnp = VTONFS(dvp);
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		mtx_unlock(&dnp->n_mtx);
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		dnp->n_attrstamp = 0;
 		mtx_unlock(&dnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	/*
 	 * If negative lookup caching is enabled, I might as well
 	 * add an entry for this node. Not necessary for correctness,
 	 * but if negative caching is enabled, then the system
 	 * must care about lookup caching hit rate, so...
 	 */
 	if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 &&
 	    (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && error == 0) {
 		cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime, NULL);
 	}
 	return (error);
 }
 
 /*
  * nfs make dir call
  *
  * Issues the mkdir rpc and returns the vnode of the new directory in
  * *ap->a_vpp.  The parent directory is marked NMODIFIED and its attribute
  * cache is refreshed from the reply or invalidated.  When the reply has no
  * file handle the new name is looked up with nfs_lookitup(); finding
  * something that is not a directory there yields EEXIST.  On error any
  * vnode obtained is released with vput().
  */
 static int
 nfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = NULL, *dnp;
 	struct vnode *newvp = NULL;
 	struct vattr vattr;
 	struct nfsfh *nfhp;
 	struct nfsvattr nfsva, dnfsva;
 	int error = 0, attrflag, dattrflag, ret;
 
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
 		return (error);
 	vap->va_type = VDIR;
 	error = nfsrpc_mkdir(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    vap, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &nfhp,
 	    &attrflag, &dattrflag, NULL);
 	dnp = VTONFS(dvp);
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		mtx_unlock(&dnp->n_mtx);
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		dnp->n_attrstamp = 0;
 		mtx_unlock(&dnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	if (nfhp) {
 		ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread,
 		    &np, NULL, LK_EXCLUSIVE);
 		/* An nfscl_nget() failure never replaces an rpc error. */
 		if (!ret) {
 			newvp = NFSTOV(np);
 			if (attrflag)
 			   (void) nfscl_loadattrcache(&newvp, &nfsva, NULL,
 				NULL, 0, 1);
 		} else if (!error)
 			error = ret;
 	}
 	/* The rpc succeeded but gave us no vnode: look the new name up. */
 	if (!error && newvp == NULL) {
 		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 		    cnp->cn_cred, cnp->cn_thread, &np);
 		if (!error) {
 			newvp = NFSTOV(np);
 			if (newvp->v_type != VDIR)
 				error = EEXIST;
 		}
 	}
 	if (error) {
 		if (newvp)
 			vput(newvp);
 		if (NFS_ISV4(dvp))
 			error = nfscl_maperr(cnp->cn_thread, error,
 			    vap->va_uid, vap->va_gid);
 	} else {
 		/*
 		 * If negative lookup caching is enabled, I might as well
 		 * add an entry for this node. Not necessary for correctness,
 		 * but if negative caching is enabled, then the system
 		 * must care about lookup caching hit rate, so...
 		 */
 		if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 &&
 		    (cnp->cn_flags & MAKEENTRY) &&
 		    attrflag != 0 && dattrflag != 0)
 			cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime,
 			    &dnfsva.na_ctime);
 		*ap->a_vpp = newvp;
 	}
 	return (error);
 }
 
 /*
  * nfs remove directory call
  */
 static int
 nfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *dnp;
 	struct nfsvattr dnfsva;
 	int error, dattrflag;
 
 	if (dvp == vp)
 		return (EINVAL);
 	error = nfsrpc_rmdir(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    cnp->cn_cred, cnp->cn_thread, &dnfsva, &dattrflag, NULL);
 	dnp = VTONFS(dvp);
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		mtx_unlock(&dnp->n_mtx);
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		dnp->n_attrstamp = 0;
 		mtx_unlock(&dnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 
 	cache_purge(dvp);
 	cache_purge(vp);
 	if (error && NFS_ISV4(dvp))
 		error = nfscl_maperr(cnp->cn_thread, error, (uid_t)0,
 		    (gid_t)0);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nfs readdir call
  */
 static int
 nfs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct uio *uio = ap->a_uio;
 	ssize_t tresid;
 	int error = 0;
 	struct vattr vattr;
 	
 	if (ap->a_eofflag != NULL)
 		*ap->a_eofflag = 0;
 	if (vp->v_type != VDIR) 
 		return(EPERM);
 
 	/*
 	 * First, check for hit on the EOF offset cache
 	 */
 	if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
 	    (np->n_flag & NMODIFIED) == 0) {
 		if (VOP_GETATTR(vp, &vattr, ap->a_cred) == 0) {
 			mtx_lock(&np->n_mtx);
 			if ((NFS_ISV4(vp) && np->n_change == vattr.va_filerev) ||
 			    !NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 				mtx_unlock(&np->n_mtx);
 				NFSINCRGLOBAL(newnfsstats.direofcache_hits);
 				if (ap->a_eofflag != NULL)
 					*ap->a_eofflag = 1;
 				return (0);
 			} else
 				mtx_unlock(&np->n_mtx);
 		}
 	}
 
 	/*
 	 * Call ncl_bioread() to do the real work.
 	 */
 	tresid = uio->uio_resid;
 	error = ncl_bioread(vp, uio, 0, ap->a_cred);
 
 	if (!error && uio->uio_resid == tresid) {
 		NFSINCRGLOBAL(newnfsstats.direofcache_misses);
 		if (ap->a_eofflag != NULL)
 			*ap->a_eofflag = 1;
 	}
 	return (error);
 }
 
 /*
  * Readdir rpc call.
  * Called from below the buffer cache by ncl_doio().
  *
  * The cookie for the requested offset must already be known, otherwise
  * NFSERR_BAD_COOKIE is returned.  On success either the end-of-directory
  * offset is recorded or the cookie for the next block is stored.
  */
 int
 ncl_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *dnp = VTONFS(vp);
 	struct nfsvattr nfsva;
 	nfsuint64 cookie, *cookiep;
 	int attrflag, eof, error;
 
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
 	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirrpc bad uio"));
 
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
 	ncl_dircookie_lock(dnp);
 	cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0);
 	if (cookiep == NULL) {
 		ncl_dircookie_unlock(dnp);
 		return (NFSERR_BAD_COOKIE);
 	}
 	cookie = *cookiep;
 	ncl_dircookie_unlock(dnp);
 
 	if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp))
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 
 	error = nfsrpc_readdir(vp, uiop, &cookie, cred, td, &nfsva,
 	    &attrflag, &eof, NULL);
 	if (attrflag != 0)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 
 	if (error != 0) {
 		if (NFS_ISV4(vp))
 			error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 		return (error);
 	}
 
 	/*
 	 * We are now either at the end of the directory or have filled
 	 * the block.
 	 */
 	if (eof != 0) {
 		dnp->n_direofoffset = uiop->uio_offset;
 		return (0);
 	}
 	if (uiop->uio_resid > 0)
 		ncl_printf("EEK! readdirrpc resid > 0\n");
 	/* Remember the cookie to continue from at the new offset. */
 	ncl_dircookie_lock(dnp);
 	cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1);
 	*cookiep = cookie;
 	ncl_dircookie_unlock(dnp);
 	return (0);
 }
 
 /*
  * NFS V3 readdir plus RPC. Used in place of ncl_readdirrpc().
  *
  * The cookie for the requested offset must already be known, otherwise
  * NFSERR_BAD_COOKIE is returned.  On success either the end-of-directory
  * offset is recorded or the cookie for the next block is stored.
  */
 int
 ncl_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *dnp = VTONFS(vp);
 	struct nfsvattr nfsva;
 	nfsuint64 cookie, *cookiep;
 	int attrflag, eof, error;
 
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
 	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirplusrpc bad uio"));
 
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
 	ncl_dircookie_lock(dnp);
 	cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0);
 	if (cookiep == NULL) {
 		ncl_dircookie_unlock(dnp);
 		return (NFSERR_BAD_COOKIE);
 	}
 	cookie = *cookiep;
 	ncl_dircookie_unlock(dnp);
 
 	if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp))
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 	error = nfsrpc_readdirplus(vp, uiop, &cookie, cred, td, &nfsva,
 	    &attrflag, &eof, NULL);
 	if (attrflag != 0)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 
 	if (error != 0) {
 		if (NFS_ISV4(vp))
 			error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 		return (error);
 	}
 
 	/*
 	 * We are now either at the end of the directory or have filled
 	 * the block.
 	 */
 	if (eof != 0) {
 		dnp->n_direofoffset = uiop->uio_offset;
 		return (0);
 	}
 	if (uiop->uio_resid > 0)
 		ncl_printf("EEK! readdirplusrpc resid > 0\n");
 	/* Remember the cookie to continue from at the new offset. */
 	ncl_dircookie_lock(dnp);
 	cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1);
 	*cookiep = cookie;
 	ncl_dircookie_unlock(dnp);
 	return (0);
 }
 
 /*
  * Silly rename. To make the NFS filesystem that is stateless look a little
  * more like the "ufs" a remove of an active vnode is translated to a rename
  * to a funny looking filename that is removed by nfs_inactive on the
  * nfsnode. There is the potential for another process on a different client
  * to create the same funny name between the nfs_lookitup() fails and the
  * nfs_rename() completes, but...
  */
 static int
 nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 	struct sillyrename *sp;
 	struct nfsnode *np;
 	int error;
 	short pid;
 	unsigned int lticks;
 
 	cache_purge(dvp);
 	np = VTONFS(vp);
 	KASSERT(vp->v_type != VDIR, ("nfs: sillyrename dir"));
 	MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename),
 	    M_NEWNFSREQ, M_WAITOK);
 	sp->s_cred = crhold(cnp->cn_cred);
 	sp->s_dvp = dvp;
 	VREF(dvp);
 
 	/* 
 	 * Fudge together a funny name.
 	 * Changing the format of the funny name to accomodate more 
 	 * sillynames per directory.
 	 * The name is now changed to .nfs.<ticks>.<pid>.4, where ticks is 
 	 * CPU ticks since boot.
 	 */
 	pid = cnp->cn_thread->td_proc->p_pid;
 	lticks = (unsigned int)ticks;
 	for ( ; ; ) {
 		sp->s_namlen = sprintf(sp->s_name, 
 				       ".nfs.%08x.%04x4.4", lticks, 
 				       pid);
 		if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 				 cnp->cn_thread, NULL))
 			break;
 		lticks++;
 	}
 	error = nfs_renameit(dvp, vp, cnp, sp);
 	if (error)
 		goto bad;
 	error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 		cnp->cn_thread, &np);
 	np->n_sillyrename = sp;
 	return (0);
 bad:
 	vrele(sp->s_dvp);
 	crfree(sp->s_cred);
 	free((caddr_t)sp, M_NEWNFSREQ);
 	return (error);
 }
 
 /*
  * Look up a file name and optionally either update the file handle or
  * allocate an nfsnode, depending on the value of npp.
  * npp == NULL	--> just do the lookup
  * *npp == NULL --> allocate a new nfsnode and make sure attributes are
  *			handled too
  * *npp != NULL --> update the file handle in the vnode
  *
  * For *npp == NULL the node is returned in *npp on success; if the lookup
  * reply carried no attributes the new vnode is released and ENOENT is
  * returned.  When the name resolves to the directory itself, a reference
  * on dvp is taken and the directory's own nfsnode is returned.
  *
  * Returns 0 or an error number (passed through nfscl_maperr() for NFSv4,
  * except for an nfscl_nget() failure, which is returned as is).
  */
 static int
 nfs_lookitup(struct vnode *dvp, char *name, int len, struct ucred *cred,
     struct thread *td, struct nfsnode **npp)
 {
 	struct vnode *newvp = NULL, *vp;
 	struct nfsnode *np, *dnp = VTONFS(dvp);
 	struct nfsfh *nfhp, *onfhp;
 	struct nfsvattr nfsva, dnfsva;
 	struct componentname cn;
 	int error = 0, attrflag, dattrflag;
 	u_int hash;
 
 	error = nfsrpc_lookup(dvp, name, len, cred, td, &dnfsva, &nfsva,
 	    &nfhp, &attrflag, &dattrflag, NULL);
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (npp && !error) {
 		if (*npp != NULL) {
 		    np = *npp;
 		    vp = NFSTOV(np);
 		    /*
 		     * For NFSv4, check to see if it is the same name and
 		     * replace the name, if it is different.
 		     */
 		    if (np->n_v4 != NULL && nfsva.na_type == VREG &&
 			(np->n_v4->n4_namelen != len ||
 			 NFSBCMP(name, NFS4NODENAME(np->n_v4), len) ||
 			 dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen ||
 			 NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 			 dnp->n_fhp->nfh_len))) {
 #ifdef notdef
 { char nnn[100]; int nnnl;
 nnnl = (len < 100) ? len : 99;
 bcopy(name, nnn, nnnl);
 nnn[nnnl] = '\0';
 printf("replace=%s\n",nnn);
 }
 #endif
 			    FREE((caddr_t)np->n_v4, M_NFSV4NODE);
 			    MALLOC(np->n_v4, struct nfsv4node *,
 				sizeof (struct nfsv4node) +
 				dnp->n_fhp->nfh_len + len - 1,
 				M_NFSV4NODE, M_WAITOK);
 			    np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len;
 			    np->n_v4->n4_namelen = len;
 			    NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 				dnp->n_fhp->nfh_len);
 			    NFSBCOPY(name, NFS4NODENAME(np->n_v4), len);
 		    }
 		    hash = fnv_32_buf(nfhp->nfh_fh, nfhp->nfh_len,
 			FNV1_32_INIT);
 		    onfhp = np->n_fhp;
 		    /*
 		     * Rehash node for new file handle.
 		     */
 		    vfs_hash_rehash(vp, hash);
 		    np->n_fhp = nfhp;
 		    if (onfhp != NULL)
 			FREE((caddr_t)onfhp, M_NFSFH);
 		    newvp = NFSTOV(np);
 		} else if (NFS_CMPFH(dnp, nfhp->nfh_fh, nfhp->nfh_len)) {
 		    /*
 		     * The name refers to the directory itself.  Hand back
 		     * the directory's own node so that *npp is never set
 		     * from an uninitialized pointer below.
 		     */
 		    FREE((caddr_t)nfhp, M_NFSFH);
 		    VREF(dvp);
 		    newvp = dvp;
 		    np = dnp;
 		} else {
 		    /* Only the name fields of cn are filled in. */
 		    cn.cn_nameptr = name;
 		    cn.cn_namelen = len;
 		    error = nfscl_nget(dvp->v_mount, dvp, nfhp, &cn, td,
 			&np, NULL, LK_EXCLUSIVE);
 		    if (error)
 			return (error);
 		    newvp = NFSTOV(np);
 		}
 		/* A brand new node is of no use without attributes. */
 		if (!attrflag && *npp == NULL) {
 			if (newvp == dvp)
 				vrele(newvp);
 			else
 				vput(newvp);
 			return (ENOENT);
 		}
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	}
 	if (npp && *npp == NULL) {
 		if (error) {
 			if (newvp) {
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 			}
 		} else
 			*npp = np;
 	}
 	if (error && NFS_ISV4(dvp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * Nfs Version 3 and 4 commit rpc
  *
  * Nothing is sent (and 0 is returned) while the mount does not have
  * NFSSTA_HASWRITEVERF set.  Attributes returned by the rpc are loaded
  * into the attribute cache.
  */
 int
 ncl_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
    struct thread *td)
 {
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsvattr nfsva;
 	int attrflag, error, havewriteverf;
 
 	mtx_lock(&nmp->nm_mtx);
 	havewriteverf = (nmp->nm_state & NFSSTA_HASWRITEVERF) != 0;
 	mtx_unlock(&nmp->nm_mtx);
 	if (!havewriteverf)
 		return (0);
 
 	error = nfsrpc_commit(vp, offset, cnt, cred, td, &nfsva,
 	    &attrflag, NULL);
 	if (attrflag != 0)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL,
 		    0, 1);
 	if (error != 0 && NFS_ISV4(vp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * Strategy routine.
  * For async requests when nfsiod(s) are running, queue the request by
  * calling ncl_asyncio(), otherwise just call ncl_doio() to do the
  * request.  Always returns 0.
  */
 static int
 nfs_strategy(struct vop_strategy_args *ap)
 {
 	struct buf *bp = ap->a_bp;
 	struct ucred *cr;
 
 	KASSERT(!(bp->b_flags & B_DONE),
 	    ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
 	BUF_ASSERT_HELD(bp);
 
 	/* Reads use the buffer's read credentials, everything else write. */
 	cr = (bp->b_iocmd == BIO_READ) ? bp->b_rcred : bp->b_wcred;
 
 	/*
 	 * If the op is asynchronous and an i/o daemon is waiting
 	 * queue the request, wake it up and wait for completion
 	 * otherwise just do it ourselves.
 	 */
 	if ((bp->b_flags & B_ASYNC) != 0 &&
 	    ncl_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED,
 	    curthread) == 0)
 		return (0);
 	(void) ncl_doio(ap->a_vp, bp, cr, curthread, 1);
 	return (0);
 }
 
 /*
  * fsync vnode op. Just call ncl_flush() with commit == 1.
  */
 /* ARGSUSED */
 static int
 nfs_fsync(struct vop_fsync_args *ap)
 {
 
 	if (ap->a_vp->v_type != VREG) {
 		/*
 		 * For NFS, metadata is changed synchronously on the server,
 		 * so there is nothing to flush. Also, ncl_flush() clears
 		 * the NMODIFIED flag and that shouldn't be done here for
 		 * directories.
 		 */
 		return (0);
 	}
 	return (ncl_flush(ap->a_vp, ap->a_waitfor, NULL, ap->a_td, 1, 0));
 }
 
 /*
  * Flush all the blocks associated with a vnode.
  *	Walk through the buffer pool and push any dirty pages
  *	associated with the vnode.
  *
  * Arguments:
  *	vp	vnode whose dirty buffer list (vp->v_bufobj) is flushed
  *	waitfor	when MNT_WAIT, wait for locked buffers, outstanding output
  *		and direct i/o async writes to drain before returning
  *	cred	not referenced by this function
  *	td	thread handed to the commit rpc, signal checks and sleeps
  *	commit	when non-zero, B_NEEDCOMMIT buffers are committed on the
  *		server (NFSv3/4) and NMODIFIED is cleared once nothing is
  *		left dirty or in flight
  *	called_from_renewthread
  *		if TRUE, it has been called from the NFSv4 renew thread and,
  *		as such, cannot block indefinitely waiting for a buffer write
  *		to complete; sleeps are bounded and EIO is returned so the
  *		flush is retried later.
  *
  * Returns 0 or an errno value (EINTR when interrupted, EIO for the renew
  * thread case, or the write error recorded in the nfsnode).
  */
 int
 ncl_flush(struct vnode *vp, int waitfor, struct ucred *cred, struct thread *td,
     int commit, int called_from_renewthread)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct buf *bp;
 	int i;
 	struct buf *nbp;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
 	int passone = 1, trycnt = 0;
 	u_quad_t off, endoff, toff;
 	struct ucred *wcred = NULL;
 	struct buf **bvec = NULL;
 	struct bufobj *bo;
 #ifndef NFS_COMMITBVECSIZ
 #define	NFS_COMMITBVECSIZ	20
 #endif
 	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
 	int bvecsize = 0, bveccount;
 
 	if (called_from_renewthread != 0)
 		slptimeo = hz;
 	if (nmp->nm_flag & NFSMNT_INT)
 		slpflag = PCATCH;
 	if (!commit)
 		passone = 0;
 	bo = &vp->v_bufobj;
 	/*
 	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
 	 * server, but has not been committed to stable storage on the server
 	 * yet. On the first pass, the byte range is worked out and the commit
 	 * rpc is done. On the second pass, ncl_writebp() is called to do the
 	 * job.
 	 */
 again:
 	off = (u_quad_t)-1;
 	endoff = 0;
 	bvecpos = 0;
 	if (NFS_ISV34(vp) && commit) {
 		if (bvec != NULL && bvec != bvec_on_stack)
 			free(bvec, M_TEMP);
 		/*
 		 * Count up how many buffers waiting for a commit.
 		 */
 		bveccount = 0;
 		BO_LOCK(bo);
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (!BUF_ISLOCKED(bp) &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
 				bveccount++;
 		}
 		/*
 		 * Allocate space to remember the list of bufs to commit.  It is
 		 * important to use M_NOWAIT here to avoid a race with nfs_write.
 		 * If we can't get memory (for whatever reason), we will end up
 		 * committing the buffers one-by-one in the loop below.
 		 */
 		if (bveccount > NFS_COMMITBVECSIZ) {
 			/*
 			 * Release the vnode interlock to avoid a lock
 			 * order reversal.
 			 */
 			BO_UNLOCK(bo);
 			bvec = (struct buf **)
 				malloc(bveccount * sizeof(struct buf *),
 				       M_TEMP, M_NOWAIT);
 			BO_LOCK(bo);
 			if (bvec == NULL) {
 				bvec = bvec_on_stack;
 				bvecsize = NFS_COMMITBVECSIZ;
 			} else
 				bvecsize = bveccount;
 		} else {
 			bvec = bvec_on_stack;
 			bvecsize = NFS_COMMITBVECSIZ;
 		}
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bvecpos >= bvecsize)
 				break;
 			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
 			    (B_DELWRI | B_NEEDCOMMIT)) {
 				BUF_UNLOCK(bp);
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			BO_UNLOCK(bo);
 			bremfree(bp);
 			/*
 			 * Work out if all buffers are using the same cred
 			 * so we can deal with them all with one commit.
 			 *
 			 * NOTE: we are not clearing B_DONE here, so we have
 			 * to do it later on in this routine if we intend to
 			 * initiate I/O on the bp.
 			 *
 			 * Note: to avoid loopback deadlocks, we do not
 			 * assign b_runningbufspace.
 			 */
 			if (wcred == NULL)
 				wcred = bp->b_wcred;
 			else if (wcred != bp->b_wcred)
 				wcred = NOCRED;
 			vfs_busy_pages(bp, 1);
 
 			BO_LOCK(bo);
 			/*
 			 * bp is protected by being locked, but nbp is not
 			 * and vfs_busy_pages() may sleep.  We have to
 			 * recalculate nbp.
 			 */
 			nbp = TAILQ_NEXT(bp, b_bobufs);
 
 			/*
 			 * A list of these buffers is kept so that the
 			 * second loop knows which buffers have actually
 			 * been committed. This is necessary, since there
 			 * may be a race between the commit rpc and new
 			 * uncommitted writes on the file.
 			 */
 			bvec[bvecpos++] = bp;
 			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 				bp->b_dirtyoff;
 			if (toff < off)
 				off = toff;
 			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
 			if (toff > endoff)
 				endoff = toff;
 		}
 		BO_UNLOCK(bo);
 	}
 	if (bvecpos > 0) {
 		/*
 		 * Commit data on the server, as required.
 		 * If all bufs are using the same wcred, then use that with
 		 * one call for all of them, otherwise commit each one
 		 * separately.
 		 */
 		if (wcred != NOCRED)
 			retv = ncl_commit(vp, off, (int)(endoff - off),
 					  wcred, td);
 		else {
 			retv = 0;
 			for (i = 0; i < bvecpos; i++) {
 				/*
 				 * Per-buffer range; named so that it does
 				 * not shadow the function-level "off".
 				 */
 				off_t boff, bsize;
 
 				bp = bvec[i];
 				boff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 					bp->b_dirtyoff;
 				bsize = (u_quad_t)(bp->b_dirtyend
 						  - bp->b_dirtyoff);
 				retv = ncl_commit(vp, boff, (int)bsize,
 						  bp->b_wcred, td);
 				if (retv)
 					break;
 			}
 		}
 
 		if (retv == NFSERR_STALEWRITEVERF)
 			ncl_clearcommit(vp->v_mount);
 
 		/*
 		 * Now, either mark the blocks I/O done or mark the
 		 * blocks dirty, depending on whether the commit
 		 * succeeded.
 		 */
 		for (i = 0; i < bvecpos; i++) {
 			bp = bvec[i];
 			bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			if (retv) {
 				/*
 				 * Error, leave B_DELWRI intact
 				 */
 				vfs_unbusy_pages(bp);
 				brelse(bp);
 			} else {
 				/*
 				 * Success, remove B_DELWRI ( bundirty() ).
 				 *
 				 * b_dirtyoff/b_dirtyend seem to be NFS
 				 * specific.  We should probably move that
 				 * into bundirty(). XXX
 				 */
 				bufobj_wref(bo);
 				bp->b_flags |= B_ASYNC;
 				bundirty(bp);
 				bp->b_flags &= ~B_DONE;
 				bp->b_ioflags &= ~BIO_ERROR;
 				bp->b_dirtyoff = bp->b_dirtyend = 0;
 				bufdone(bp);
 			}
 		}
 	}
 
 	/*
 	 * Start/do any write(s) that are required.
 	 */
 loop:
 	BO_LOCK(bo);
 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 			if (waitfor != MNT_WAIT || passone)
 				continue;
 
 			error = BUF_TIMELOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo), "nfsfsync", slpflag, slptimeo);
 			if (error == 0) {
 				BUF_UNLOCK(bp);
 				goto loop;
 			}
 			if (error == ENOLCK) {
 				error = 0;
 				goto loop;
 			}
 			if (called_from_renewthread != 0) {
 				/*
 				 * Return EIO so the flush will be retried
 				 * later.
 				 */
 				error = EIO;
 				goto done;
 			}
 			if (newnfs_sigintr(nmp, td)) {
 				error = EINTR;
 				goto done;
 			}
 			if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			}
 			goto loop;
 		}
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("nfs_fsync: not dirty");
 		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		BO_UNLOCK(bo);
 		bremfree(bp);
 		/* The write is always started asynchronously. */
 		bp->b_flags |= B_ASYNC;
 		bwrite(bp);
 		if (newnfs_sigintr(nmp, td)) {
 			error = EINTR;
 			goto done;
 		}
 		goto loop;
 	}
 	if (passone) {
 		passone = 0;
 		BO_UNLOCK(bo);
 		goto again;
 	}
 	if (waitfor == MNT_WAIT) {
 		while (bo->bo_numoutput) {
 			error = bufobj_wwait(bo, slpflag, slptimeo);
 			if (error) {
 			    BO_UNLOCK(bo);
 			    if (called_from_renewthread != 0) {
 				/*
 				 * Return EIO so that the flush will be
 				 * retried later.
 				 */
 				error = EIO;
 				goto done;
 			    }
 			    error = newnfs_sigintr(nmp, td);
 			    if (error)
 				goto done;
 			    if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			    }
 			    BO_LOCK(bo);
 			}
 		}
 		if (bo->bo_dirty.bv_cnt != 0 && commit) {
 			BO_UNLOCK(bo);
 			goto loop;
 		}
 		/*
 		 * Wait for all the async IO requests to drain
 		 */
 		BO_UNLOCK(bo);
 		mtx_lock(&np->n_mtx);
 		while (np->n_directio_asyncwr > 0) {
 			np->n_flag |= NFSYNCWAIT;
 			error = newnfs_msleep(td, &np->n_directio_asyncwr,
 			    &np->n_mtx, slpflag | (PRIBIO + 1),
 			    "nfsfsync", 0);
 			if (error) {
 				if (newnfs_sigintr(nmp, td)) {
 					mtx_unlock(&np->n_mtx);
 					error = EINTR;
 					goto done;
 				}
 			}
 		}
 		mtx_unlock(&np->n_mtx);
 	} else
 		BO_UNLOCK(bo);
 	if (NFSHASPNFS(nmp)) {
 		nfscl_layoutcommit(vp, td);
 		/*
 		 * Invalidate the attribute cache, since writes to a DS
 		 * won't update the size attribute.
 		 */
 		mtx_lock(&np->n_mtx);
 		np->n_attrstamp = 0;
 	} else
 		mtx_lock(&np->n_mtx);
 	/* Report (and clear) a write error recorded on the nfsnode. */
 	if (np->n_flag & NWRITEERR) {
 		error = np->n_error;
 		np->n_flag &= ~NWRITEERR;
 	}
 	if (commit && bo->bo_dirty.bv_cnt == 0 &&
 	    bo->bo_numoutput == 0 && np->n_directio_asyncwr == 0)
 		np->n_flag &= ~NMODIFIED;
 	mtx_unlock(&np->n_mtx);
 done:
 	if (bvec != NULL && bvec != bvec_on_stack)
 		free(bvec, M_TEMP);
 	/*
 	 * A successful synchronous commit flush that still sees dirty
 	 * buffers, outstanding output or direct i/o async writes starts
 	 * over from the first pass, at most 5 times.
 	 */
 	if (error == 0 && commit != 0 && waitfor == MNT_WAIT &&
 	    (bo->bo_dirty.bv_cnt != 0 || bo->bo_numoutput != 0 ||
 	     np->n_directio_asyncwr != 0) && trycnt++ < 5) {
 		/* try, try again... */
 		passone = 1;
 		wcred = NULL;
 		bvec = NULL;
 		bvecsize = 0;
 		goto again;
 	}
 	return (error);
 }
 
 /*
  * NFS advisory byte-level locks.
  *
  * For NFSv4 with F_POSIX or F_FLOCK set, the lock op is done with
  * nfsrpc_advlock() while holding the vnode lock exclusively.  For NFSv2/3
  * the request goes either to the local lf_advlock() (NFSMNT_NOLOCKD mounts)
  * or to the nfs_advlock_p hook.  An NFSv4 request with neither flag set
  * returns the initial EOPNOTSUPP.
  */
 static int
 nfs_advlock(struct vop_advlock_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct ucred *cred;
 	struct nfsnode *np = VTONFS(ap->a_vp);
 	struct proc *p = (struct proc *)ap->a_id;
 	struct thread *td = curthread;	/* XXX */
 	struct vattr va;
 	int ret, error = EOPNOTSUPP;
 	u_quad_t size;
 	
 	if (NFS_ISV4(vp) && (ap->a_flags & (F_POSIX | F_FLOCK)) != 0) {
 		if (vp->v_type != VREG)
 			return (EINVAL);
 		/*
 		 * F_POSIX requests use the credential of the process passed
 		 * in a_id; otherwise the current thread's credential is used.
 		 */
 		if ((ap->a_flags & F_POSIX) != 0)
 			cred = p->p_ucred;
 		else
 			cred = td->td_ucred;
 		NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (vp->v_iflag & VI_DOOMED) {
 			NFSVOPUNLOCK(vp, 0);
 			return (EBADF);
 		}
 
 		/*
 		 * If this is unlocking a write locked region, flush and
 		 * commit them before unlocking. This is required by
 		 * RFC3530 Sec. 9.3.2.
 		 */
 		if (ap->a_op == F_UNLCK &&
 		    nfscl_checkwritelocked(vp, ap->a_fl, cred, td, ap->a_id,
 		    ap->a_flags))
 			(void) ncl_flush(vp, MNT_WAIT, cred, td, 1, 0);
 
 		/*
 		 * Loop around doing the lock op, while a blocking lock
 		 * must wait for the lock op to succeed.
 		 */
 		do {
 			ret = nfsrpc_advlock(vp, np->n_size, ap->a_op,
 			    ap->a_fl, 0, cred, td, ap->a_id, ap->a_flags);
 			if (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) &&
 			    ap->a_op == F_SETLK) {
 				/*
 				 * The vnode lock is dropped across the nap
 				 * and re-acquired afterwards; a failed nap
 				 * returns EINTR with the vnode unlocked.
 				 */
 				NFSVOPUNLOCK(vp, 0);
 				error = nfs_catnap(PZERO | PCATCH, ret,
 				    "ncladvl");
 				if (error)
 					return (EINTR);
 				NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
 				if (vp->v_iflag & VI_DOOMED) {
 					NFSVOPUNLOCK(vp, 0);
 					return (EBADF);
 				}
 			}
 		} while (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) &&
 		     ap->a_op == F_SETLK);
 		/*
 		 * Translate the rpc result: NFSERR_DENIED becomes EAGAIN,
 		 * EINVAL/EBADF/EINTR are returned as is and any other
 		 * failure is reported as EACCES.
 		 */
 		if (ret == NFSERR_DENIED) {
 			NFSVOPUNLOCK(vp, 0);
 			return (EAGAIN);
 		} else if (ret == EINVAL || ret == EBADF || ret == EINTR) {
 			NFSVOPUNLOCK(vp, 0);
 			return (ret);
 		} else if (ret != 0) {
 			NFSVOPUNLOCK(vp, 0);
 			return (EACCES);
 		}
 
 		/*
 		 * Now, if we just got a lock, invalidate data in the buffer
 		 * cache, as required, so that the coherency conforms with
 		 * RFC3530 Sec. 9.3.2.
 		 */
 		if (ap->a_op == F_SETLK) {
 			if ((np->n_flag & NMODIFIED) == 0) {
 				np->n_attrstamp = 0;
 				KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 				ret = VOP_GETATTR(vp, &va, cred);
 			}
 			/*
 			 * va is only examined when NMODIFIED is clear and
 			 * the VOP_GETATTR() above succeeded (ret is 0 on
 			 * entry to this block).
 			 */
 			if ((np->n_flag & NMODIFIED) || ret ||
 			    np->n_change != va.va_filerev) {
 				(void) ncl_vinvalbuf(vp, V_SAVE, td, 1);
 				np->n_attrstamp = 0;
 				KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 				ret = VOP_GETATTR(vp, &va, cred);
 				if (!ret) {
 					np->n_mtime = va.va_mtime;
 					np->n_change = va.va_filerev;
 				}
 			}
 		}
 		NFSVOPUNLOCK(vp, 0);
 		return (0);
 	} else if (!NFS_ISV4(vp)) {
 		error = NFSVOPLOCK(vp, LK_SHARED);
 		if (error)
 			return (error);
 		if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
 			/* Read the size under the vnode lock, then lock locally. */
 			size = VTONFS(vp)->n_size;
 			NFSVOPUNLOCK(vp, 0);
 			error = lf_advlock(ap, &(vp->v_lockf), size);
 		} else {
 			/*
 			 * NOTE(review): the vnode is not unlocked here when
 			 * nfs_advlock_p is set; presumably the hook releases
 			 * it -- confirm against the hook's implementation.
 			 */
 			if (nfs_advlock_p != NULL)
 				error = nfs_advlock_p(ap);
 			else {
 				NFSVOPUNLOCK(vp, 0);
 				error = ENOLCK;
 			}
 		}
 	}
 	return (error);
 }
 
 /*
  * NFS advisory byte-level locks, asynchronous vnode op.
  *
  * Not supported for NFSv4.  Otherwise only mounts with NFSMNT_NOLOCKD set
  * are handled, by passing the request to lf_advlockasync().
  */
 static int
 nfs_advlockasync(struct vop_advlockasync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	u_quad_t size = 0;
 	int error, nolockd;
 
 	if (NFS_ISV4(vp))
 		return (EOPNOTSUPP);
 
 	error = NFSVOPLOCK(vp, LK_SHARED);
 	if (error)
 		return (error);
 	/* The mount flag and the file size are read with the vnode locked. */
 	nolockd = (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0;
 	if (nolockd)
 		size = VTONFS(vp)->n_size;
 	NFSVOPUNLOCK(vp, 0);
 
 	if (!nolockd)
 		return (EOPNOTSUPP);
 	return (lf_advlockasync(ap, &(vp->v_lockf), size));
 }
 
 /*
  * Print out the contents of an nfsnode: its fileid and fsid, followed by
  * the fifo information when the vnode is a VFIFO.
  */
 static int
 nfs_print(struct vop_print_args *ap)
 {
 	struct vnode *vp;
 	struct nfsnode *np;
 
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 	ncl_printf("\tfileid %ld fsid 0x%x", np->n_vattr.na_fileid,
 	    np->n_vattr.na_fsid);
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 	printf("\n");
 	return (0);
 }
 
 /*
  * This is the "real" nfs::bwrite(struct buf*).
  * We set B_CACHE if this is a VMIO buffer.
  *
  * The buffer must be held by the caller.  A B_INVAL buffer is simply
  * released.  Otherwise the buffer is undirtied and handed to bstrategy();
  * when B_ASYNC was not set on entry the routine waits for completion,
  * releases the buffer and returns the result of bufwait().  Returns 0 in
  * all other cases.  The "force" argument is unused.
  */
 int
 ncl_writebp(struct buf *bp, int force __unused, struct thread *td)
 {
 	int s;
 	int oldflags = bp->b_flags;
 	int rtval;
 
 	BUF_ASSERT_HELD(bp);
 
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * Undirty the bp.  We will redirty it later if the I/O fails.
 	 */
 
 	s = splbio();
 	bundirty(bp);
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_iocmd = BIO_WRITE;
 
 	bufobj_wref(bp->b_bufobj);
 	curthread->td_ru.ru_oublock++;
 	splx(s);
 
 	/*
 	 * Note: to avoid loopback deadlocks, we do not
 	 * assign b_runningbufspace.
 	 */
 	vfs_busy_pages(bp, 1);
 
 	BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 
 	/*
 	 * The flags sampled on entry decide whether to wait, since the
 	 * buffer itself may no longer be ours to examine once the i/o has
 	 * been started asynchronously.
 	 */
 	if ((oldflags & B_ASYNC) != 0)
 		return (0);
 
 	rtval = bufwait(bp);
 	if (oldflags & B_DELWRI) {
 		s = splbio();
 		reassignbuf(bp);
 		splx(s);
 	}
 	brelse(bp);
 	return (rtval);
 }
 
 /*
  * nfs special file access vnode op.
  * Essentially just get vattr and then imitate iaccess() since the device is
  * local to the client.
  */
 static int
 nfsspec_access(struct vop_access_args *ap)
 {
 	struct vattr *vap;
 	struct ucred *cred = ap->a_cred;
 	struct vnode *vp = ap->a_vp;
 	accmode_t accmode = ap->a_accmode;
 	struct vattr vattr;
 	int error;
 
 	/*
 	 * Disallow write attempts on filesystems mounted read-only;
 	 * unless the file is a socket, fifo, or a block or character
 	 * device resident on the filesystem.
 	 */
 	if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 	vap = &vattr;
 	error = VOP_GETATTR(vp, vap, cred);
 	if (error)
 		goto out;
 	error  = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
 	    accmode, cred, NULL);
 out:
 	return error;
 }
 
 /*
  * Read wrapper for fifos: note the access on the nfsnode, then hand the
  * request to the fifo read op.
  */
 static int
 nfsfifo_read(struct vop_read_args *ap)
 {
 	struct nfsnode *np;
 
 	np = VTONFS(ap->a_vp);
 
 	/* Set the access flag and access time under the node mutex. */
 	mtx_lock(&np->n_mtx);
 	np->n_flag |= NACC;
 	vfs_timestamp(&np->n_atim);
 	mtx_unlock(&np->n_mtx);
 
 	return (fifo_specops.vop_read(ap));
 }
 
 /*
  * Write wrapper for fifos.
  */
 static int
 nfsfifo_write(struct vop_write_args *ap)
 {
 	struct nfsnode *np = VTONFS(ap->a_vp);
 
 	/*
 	 * Set update flag.
 	 */
 	mtx_lock(&np->n_mtx);
 	np->n_flag |= NUPD;
 	vfs_timestamp(&np->n_mtim);
 	mtx_unlock(&np->n_mtx);
 	return(fifo_specops.vop_write(ap));
 }
 
 /*
  * Close wrapper for fifos.
  *
  * Update the times on the nfsnode then do fifo close.
  */
 static int
 nfsfifo_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr vattr;
 	struct timespec ts;
 	int push_times = 0;
 
 	mtx_lock(&np->n_mtx);
 	if ((np->n_flag & (NACC | NUPD)) != 0) {
 		/* Stamp whichever of the access/modify times is pending. */
 		vfs_timestamp(&ts);
 		if (np->n_flag & NACC)
 			np->n_atim = ts;
 		if (np->n_flag & NUPD)
 			np->n_mtim = ts;
 		np->n_flag |= NCHG;
 		/*
 		 * With a single reference left on a writable mount, build
 		 * the attributes to set while still holding the mutex.
 		 */
 		if (vrefcnt(vp) == 1 &&
 		    (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 			VATTR_NULL(&vattr);
 			if (np->n_flag & NACC)
 				vattr.va_atime = np->n_atim;
 			if (np->n_flag & NUPD)
 				vattr.va_mtime = np->n_mtim;
 			push_times = 1;
 		}
 	}
 	mtx_unlock(&np->n_mtx);
 
 	/* The setattr is issued only after the mutex has been dropped. */
 	if (push_times)
 		(void)VOP_SETATTR(vp, &vattr, ap->a_cred);
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Just call ncl_writebp() with the force argument set to 1.
  *
  * NOTE: B_DONE may or may not be set in a_bp on call.
  */
 static int
 nfs_bwrite(struct buf *bp)
 {
 
 	return (ncl_writebp(bp, 1, curthread));
 }
 
 /*
  * Buffer operations vector for the NFS client: only the write op is NFS
  * specific (nfs_bwrite); the remaining slots use the buf* routines named
  * below.
  */
 struct buf_ops buf_ops_newnfs = {
 	.bop_name = "buf_ops_nfs",
 	.bop_write = nfs_bwrite,
 	.bop_strategy = bufstrategy,
 	.bop_sync = bufsync,
 	.bop_bdflush = bufbdflush,
 };
 
 /*
  * Cloned from vop_stdlock(), and then the ugly hack added.
  */
 static int
 nfs_lock1(struct vop_lock1_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int error = 0;
 
 	/*
 	 * Since vfs_hash_get() calls vget() and it will no longer work
 	 * for FreeBSD8 with flags == 0, I can only think of this horrible
 	 * hack to work around it. I call vfs_hash_get() with LK_EXCLOTHER
 	 * and then handle it here. All I want for this case is a v_usecount
 	 * on the vnode to use for recovery, while another thread might
 	 * hold a lock on the vnode. I have the other threads blocked, so
 	 * there isn't any race problem.
 	 */
 	if ((ap->a_flags & LK_TYPE_MASK) == LK_EXCLOTHER) {
 		if ((ap->a_flags & LK_INTERLOCK) == 0)
 			panic("ncllock1");
 		if ((vp->v_iflag & VI_DOOMED))
 			error = ENOENT;
 		VI_UNLOCK(vp);
 		return (error);
 	}
 	return (_lockmgr_args(vp->v_vnlock, ap->a_flags, VI_MTX(vp),
 	    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file,
 	    ap->a_line));
 }
 
 static int
 nfs_getacl(struct vop_getacl_args *ap)
 {
 	int error;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EOPNOTSUPP);
 	error = nfsrpc_getacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp,
 	    NULL);
 	if (error > NFSERR_STALE) {
 		(void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0);
 		error = EPERM;
 	}
 	return (error);
 }
 
 static int
 nfs_setacl(struct vop_setacl_args *ap)
 {
 	int error;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EOPNOTSUPP);
 	error = nfsrpc_setacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp,
 	    NULL);
 	if (error > NFSERR_STALE) {
 		(void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0);
 		error = EPERM;
 	}
 	return (error);
 }
 
 /*
  * Return POSIX pathconf information applicable to nfs filesystems.
  */
 static int
 nfs_pathconf(struct vop_pathconf_args *ap)
 {
 	struct nfsv3_pathconf pc;
 	struct nfsvattr nfsva;
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = curthread;
 	int attrflag, error;
 
 	if (NFS_ISV4(vp) || (NFS_ISV3(vp) && (ap->a_name == _PC_LINK_MAX ||
 	    ap->a_name == _PC_NAME_MAX || ap->a_name == _PC_CHOWN_RESTRICTED ||
 	    ap->a_name == _PC_NO_TRUNC))) {
 		/*
 		 * Since only the above 4 a_names are returned by the NFSv3
 		 * Pathconf RPC, there is no point in doing it for others.
 		 */
 		error = nfsrpc_pathconf(vp, &pc, td->td_ucred, td, &nfsva,
 		    &attrflag, NULL);
 		if (attrflag != 0)
 			(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0,
 			    1);
 		if (error != 0)
 			return (error);
 	} else {
 		/*
 		 * For NFSv2 (or NFSv3 when not one of the above 4 a_names),
 		 * just fake them.
 		 */
 		pc.pc_linkmax = LINK_MAX;
 		pc.pc_namemax = NFS_MAXNAMLEN;
 		pc.pc_notrunc = 1;
 		pc.pc_chownrestricted = 1;
 		pc.pc_caseinsensitive = 0;
 		pc.pc_casepreserving = 1;
 		error = 0;
 	}
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*ap->a_retval = pc.pc_linkmax;
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = pc.pc_namemax;
 		break;
 	case _PC_PATH_MAX:
 		*ap->a_retval = PATH_MAX;
 		break;
 	case _PC_PIPE_BUF:
 		*ap->a_retval = PIPE_BUF;
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = pc.pc_chownrestricted;
 		break;
 	case _PC_NO_TRUNC:
 		*ap->a_retval = pc.pc_notrunc;
 		break;
 	case _PC_ACL_EXTENDED:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ACL_NFS4:
 		if (NFS_ISV4(vp) && nfsrv_useacl != 0 && attrflag != 0 &&
 		    NFSISSET_ATTRBIT(&nfsva.na_suppattr, NFSATTRBIT_ACL))
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 		break;
 	case _PC_ACL_PATH_MAX:
 		if (NFS_ISV4(vp))
 			*ap->a_retval = ACL_MAX_ENTRIES;
 		else
 			*ap->a_retval = 3;
 		break;
 	case _PC_MAC_PRESENT:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ASYNC_IO:
 		/* _PC_ASYNC_IO should have been handled by upper layers. */
 		KASSERT(0, ("_PC_ASYNC_IO should not get here"));
 		error = EINVAL;
 		break;
 	case _PC_PRIO_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_SYNC_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ALLOC_SIZE_MIN:
 		*ap->a_retval = vp->v_mount->mnt_stat.f_bsize;
 		break;
 	case _PC_FILESIZEBITS:
 		if (NFS_ISV34(vp))
 			*ap->a_retval = 64;
 		else
 			*ap->a_retval = 32;
 		break;
 	case _PC_REC_INCR_XFER_SIZE:
 		*ap->a_retval = vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_MAX_XFER_SIZE:
 		*ap->a_retval = -1; /* means ``unlimited'' */
 		break;
 	case _PC_REC_MIN_XFER_SIZE:
 		*ap->a_retval = vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_XFER_ALIGN:
 		*ap->a_retval = PAGE_SIZE;
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = NFS_MAXPATHLEN;
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
Index: head/sys/i386/i386/exception.s
===================================================================
--- head/sys/i386/i386/exception.s	(revision 258540)
+++ head/sys/i386/i386/exception.s	(revision 258541)
@@ -1,470 +1,469 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * Copyright (c) 2007 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by A. Joseph Koshy under
  * sponsorship from the FreeBSD Foundation and Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_apic.h"
 #include "opt_atpic.h"
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 #include "opt_npx.h"
 
 #include <machine/asmacros.h>
 #include <machine/psl.h>
 #include <machine/trap.h>
 
 #include "assym.s"
 
 #define	SEL_RPL_MASK	0x0003
 #define	GSEL_KPL	0x0020	/* GSEL(GCODE_SEL, SEL_KPL) */
 
 #ifdef KDTRACE_HOOKS
 /*
  * Two 4-byte, 4-byte-aligned objects in .bss used by the DTrace invalid
  * opcode hook in IDTVEC(ill): dtrace_invop_jump_addr is the address jumped
  * to when non-zero, and dtrace_invop_calltrap_addr is loaded with the
  * address of norm_ill before that jump.
  */
 	.bss
 	.globl	dtrace_invop_jump_addr
 	.align	4
 	.type	dtrace_invop_jump_addr, @object
 	.size	dtrace_invop_jump_addr, 4
 dtrace_invop_jump_addr:
 	.zero	4
 	.globl	dtrace_invop_calltrap_addr
 	.align	4
 	.type	dtrace_invop_calltrap_addr, @object
 	.size	dtrace_invop_calltrap_addr, 4
 dtrace_invop_calltrap_addr:
 	/* Reserve exactly the 4 bytes declared by .size and stored by movl. */
 	.zero	4
 #endif
 	.text
 /*
  * With HWPMC_HOOKS, emit a start_exceptions entry ahead of the handlers.
  * NOTE(review): presumably marks the start of the exception code range for
  * hwpmc -- confirm against its users.
  */
 #ifdef HWPMC_HOOKS
 	ENTRY(start_exceptions)
 #endif
 /*****************************************************************************/
 /* Trap handling                                                             */
 /*****************************************************************************/
 /*
  * Trap and fault vector routines.
  *
  * Most traps are 'trap gates', SDT_SYS386TGT.  A trap gate pushes state on
  * the stack that mostly looks like an interrupt, but does not disable
  * interrupts.  A few of the traps we use are interrupt gates,
  * SDT_SYS386IGT, which are nearly the same thing except interrupts are
  * disabled on entry.
  *
  * The cpu will push a certain amount of state onto the kernel stack for
  * the current process.  The amount of state depends on the type of trap
  * and whether the trap crossed rings or not.  See i386/include/frame.h.
  * At the very least the current EFLAGS (status register, which includes
  * the interrupt disable state prior to the trap), the code segment register,
  * and the return instruction pointer are pushed by the cpu.  The cpu
  * will also push an 'error' code for certain traps.  We push a dummy
  * error code for those traps where the cpu doesn't in order to maintain
  * a consistent frame.  We also push a contrived 'trap number'.
  *
  * The cpu does not push the general registers, we must do that, and we
  * must restore them prior to calling 'iret'.  The cpu adjusts the %cs and
  * %ss segment registers, but does not mess with %ds, %es, or %fs.  Thus we
  * must load them with appropriate values for supervisor mode operation.
  */
 
 MCOUNT_LABEL(user)
 MCOUNT_LABEL(btrap)
 
 /* Push the trap number given as the argument, then join the common path. */
 #define	TRAP(a)		pushl $(a) ; jmp alltraps
 
 /*
  * One entry stub per vector.  Stubs that begin with "pushl $0" push a dummy
  * error code ahead of the trap number; the others (tss, missing, stk, prot,
  * page, align) push only the trap number.
  * NOTE(review): presumably the cpu has already pushed an error code for the
  * latter group -- confirm against the processor manual.
  */
 IDTVEC(div)
 	pushl $0; TRAP(T_DIVIDE)
 IDTVEC(dbg)
 	pushl $0; TRAP(T_TRCTRAP)
 IDTVEC(nmi)
 	pushl $0; TRAP(T_NMI)
 IDTVEC(bpt)
 	pushl $0; TRAP(T_BPTFLT)
 IDTVEC(dtrace_ret)
 	pushl $0; TRAP(T_DTRACE_RET)
 IDTVEC(ofl)
 	pushl $0; TRAP(T_OFLOW)
 IDTVEC(bnd)
 	pushl $0; TRAP(T_BOUND)
 /* With KDTRACE_HOOKS the ill vector is defined separately further down. */
 #ifndef KDTRACE_HOOKS
 IDTVEC(ill)
 	pushl $0; TRAP(T_PRIVINFLT)
 #endif
 IDTVEC(dna)
 	pushl $0; TRAP(T_DNA)
 IDTVEC(fpusegm)
 	pushl $0; TRAP(T_FPOPFLT)
 IDTVEC(tss)
 	TRAP(T_TSSFLT)
 IDTVEC(missing)
 	TRAP(T_SEGNPFLT)
 IDTVEC(stk)
 	TRAP(T_STKFLT)
 IDTVEC(prot)
 	TRAP(T_PROTFLT)
 IDTVEC(page)
 	TRAP(T_PAGEFLT)
 IDTVEC(mchk)
 	pushl $0; TRAP(T_MCHK)
 IDTVEC(rsvd)
 	pushl $0; TRAP(T_RESERVED)
 IDTVEC(fpu)
 	pushl $0; TRAP(T_ARITHTRAP)
 IDTVEC(align)
 	TRAP(T_ALIGNFLT)
 IDTVEC(xmm)
 	pushl $0; TRAP(T_XMMFLT)
 
 	/*
 	 * All traps except ones for syscalls jump to alltraps.  If
 	 * interrupts were enabled when the trap occurred, then interrupts
 	 * are enabled now if the trap was through a trap gate, else
 	 * disabled if the trap was through an interrupt gate.  Note that
 	 * int0x80_syscall is a trap gate.   Interrupt gates are used by
 	 * page faults, non-maskable interrupts, debug and breakpoint
 	 * exceptions.
 	 */
 	SUPERALIGN_TEXT
 	.globl	alltraps
 	.type	alltraps,@function
 /*
  * Common trap entry: save the general and segment registers on top of the
  * state already on the stack, then call trap() with a pointer to the
  * resulting frame.
  */
 alltraps:
 	pushal				/* save the general registers */
 	pushl	%ds			/* save the segment registers */
 	pushl	%es
 	pushl	%fs
 /* Entry point for code that has already pushed the registers itself. */
 alltraps_with_regs_pushed:
 	SET_KERNEL_SREGS
 	cld
 	FAKE_MCOUNT(TF_EIP(%esp))
 calltrap:
 	pushl	%esp			/* argument: pointer to the frame */
 	call	trap
 	add	$4, %esp		/* discard the argument */
 
 	/*
 	 * Return via doreti to handle ASTs.
 	 */
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Privileged instruction fault.
  */
 #ifdef KDTRACE_HOOKS
 /*
  * DTrace-aware variant of the ill vector.  The fault is handed to the
  * DTrace hook only when a hook is registered and the saved code segment
  * equals GSEL_KPL; every other case goes to norm_ill.
  */
 	SUPERALIGN_TEXT
 IDTVEC(ill)
 	/* Check if there is no DTrace hook registered. */
 	cmpl	$0,dtrace_invop_jump_addr
 	je	norm_ill
 
 	/* Check if this is a user fault. */
 	cmpl	$GSEL_KPL, 4(%esp)	/* Check the code segment. */
 
 	/* If so, just handle it as a normal trap. */
 	jne	norm_ill
 
 	/*
 	 * This is a kernel instruction fault that might have been caused
 	 * by a DTrace provider.
 	 */
 	pushal				/* Push all registers onto the stack. */
 
 	/*
 	 * Set our jump address for the jump back in the event that
 	 * the exception wasn't caused by DTrace at all.
 	 */
 	movl	$norm_ill, dtrace_invop_calltrap_addr
 
 	/*
 	 * Jump to the code hooked in by DTrace.
 	 * NOTE(review): the registers pushed by pushal are still on the
 	 * stack here; how the hook unwinds them before using
 	 * dtrace_invop_calltrap_addr is not visible in this file.
 	 */
 	jmpl	*dtrace_invop_jump_addr
 
 	/*
 	 * Process the instruction fault in the normal way.
 	 */
 norm_ill:
 	pushl $0			/* dummy error code */
 	TRAP(T_PRIVINFLT)
 #endif
 
 /*
  * Call gate entry for syscalls (lcall 7,0).
  * This is used by FreeBSD 1.x a.out executables and "old" NetBSD executables.
  *
  * The intersegment call has been set up to specify one dummy parameter.
  * This leaves a place to put eflags so that the call frame can be
  * converted to a trap frame. Note that the eflags is (semi-)bogusly
  * pushed into (what will be) tf_err and then copied later into the
  * final spot. It has to be done this way because esp can't be just
  * temporarily altered for the pushfl - an interrupt might come in
  * and clobber the saved cs/eip.
  */
 	SUPERALIGN_TEXT
 /*
  * Convert the call frame into a trap frame, save the registers and call
  * syscall() with a pointer to the frame, then leave through doreti.
  */
 IDTVEC(lcall_syscall)
 	pushfl				/* save eflags */
 	popl	8(%esp)			/* shuffle into tf_eflags */
 	pushl	$7			/* sizeof "lcall 7,0" */
 	subl	$4,%esp			/* skip over tf_trapno */
 	pushal				/* save the general registers */
 	pushl	%ds			/* save the segment registers */
 	pushl	%es
 	pushl	%fs
 	SET_KERNEL_SREGS
 	cld
 	FAKE_MCOUNT(TF_EIP(%esp))
 	pushl	%esp			/* argument: pointer to the frame */
 	call	syscall
 	add	$4, %esp		/* discard the argument */
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Trap gate entry for syscalls (int 0x80).
  * This is used by FreeBSD ELF executables, "new" NetBSD executables, and all
  * Linux executables.
  *
  * Even though the name says 'int0x80', this is actually a trap gate, not an
  * interrupt gate.  Thus interrupts are enabled on entry just as they are for
  * a normal syscall.
  */
 	SUPERALIGN_TEXT
 /*
  * Build the rest of the trap frame, save the registers and call syscall()
  * with a pointer to the frame, then leave through doreti.
  */
 IDTVEC(int0x80_syscall)
 	pushl	$2			/* sizeof "int 0x80" */
 	subl	$4,%esp			/* skip over tf_trapno */
 	pushal				/* save the general registers */
 	pushl	%ds			/* save the segment registers */
 	pushl	%es
 	pushl	%fs
 	SET_KERNEL_SREGS
 	cld
 	FAKE_MCOUNT(TF_EIP(%esp))
 	pushl	%esp			/* argument: pointer to the frame */
 	call	syscall
 	add	$4, %esp		/* discard the argument */
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Call fork_exit() with the function in %esi, its argument in %ebx and
  * the trapframe pointer, then return through doreti.
  * NOTE(review): who loads %esi and %ebx before control arrives here is not
  * visible in this file.
  */
 ENTRY(fork_trampoline)
 	pushl	%esp			/* trapframe pointer */
 	pushl	%ebx			/* arg1 */
 	pushl	%esi			/* function */
 	call	fork_exit
 	addl	$12,%esp		/* discard the three arguments */
 	/* cut from syscall */
 
 	/*
 	 * Return via doreti to handle ASTs.
 	 */
 	MEXITCOUNT
 	jmp	doreti
 
 
 /*
  * To efficiently implement classification of trap and interrupt handlers
  * for profiling, there must be only trap handlers between the labels btrap
  * and bintr, and only interrupt handlers between the labels bintr and
  * eintr.  This is implemented (partly) by including files that contain
  * some of the handlers.  Before including the files, set up a normal asm
  * environment so that the included files don't need to know that they are
  * included.
  */
 
 	/* Align both the data and the text section before the bintr label. */
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 MCOUNT_LABEL(bintr)
 
 /* The interrupt vector files are pulled in per configured device option. */
 #ifdef DEV_ATPIC
 #include <i386/i386/atpic_vector.s>
 #endif
 
 /* Re-align between the two vector files when both are included. */
 #if defined(DEV_APIC) && defined(DEV_ATPIC)
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 #endif
 
 #ifdef DEV_APIC
 #include <i386/i386/apic_vector.s>
 #endif
 
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 #include <i386/i386/vm86bios.s>
 
 	/* eintr closes the region that was opened by bintr. */
 	.text
 MCOUNT_LABEL(eintr)
 
 /*
  * void doreti(struct trapframe)
  *
  * Handle return from interrupts, traps and syscalls.
  */
 	.text
 	SUPERALIGN_TEXT
 	.type	doreti,@function
 /*
  * Entered by a jump with %esp pointing at the trap frame.  Decides
  * whether ASTs may be processed, runs ast() until none are pending,
  * and then goes to doreti_exit to unwind the frame.
  */
 doreti:
 	FAKE_MCOUNT($bintr)		/* init "from" bintr -> doreti */
 doreti_next:
 	/*
 	 * Check if ASTs can be handled now.  ASTs cannot be safely
 	 * processed when returning from an NMI.
 	 */
 	cmpb	$T_NMI,TF_TRAPNO(%esp)
 #ifdef HWPMC_HOOKS
 	je	doreti_nmi
 #else
 	je	doreti_exit
 #endif
 	/*
 	 * PSL_VM must be checked first since segment registers only
 	 * have an RPL in non-VM86 mode.
 	 * ASTs can not be handled now if we are in a vm86 call.
 	 */
 	testl	$PSL_VM,TF_EFLAGS(%esp)
 	jz	doreti_notvm86
 	movl	PCPU(CURPCB),%ecx
 	testl	$PCB_VM86CALL,PCB_FLAGS(%ecx)
 	jz	doreti_ast		/* VM86 frame, but not a vm86 call */
 	jmp	doreti_exit		/* vm86 call: skip AST handling */
 
 doreti_notvm86:
 	testb	$SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */
 	jz	doreti_exit		/* can't handle ASTs now if not */
 
 doreti_ast:
 	/*
 	 * Check for ASTs atomically with returning.  Disabling CPU
 	 * interrupts provides sufficient locking even in the SMP case,
 	 * since we will be informed of any new ASTs by an IPI.
 	 */
 	cli
 	movl	PCPU(CURTHREAD),%eax
 	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax)
 	je	doreti_exit		/* nothing pending; leave with cli in effect */
 	sti
 	pushl	%esp			/* pass a pointer to the trapframe */
 	call	ast
 	add	$4,%esp			/* pop the argument */
 	jmp	doreti_ast		/* re-check: more may have been posted */
 
 	/*
 	 * doreti_exit:	pop registers, iret.
 	 *
 	 *	The segment register pop is a special case, since it may
 	 *	fault if (for example) a sigreturn specifies bad segment
 	 *	registers.  The fault is handled in trap.c.
 	 */
 doreti_exit:
 	MEXITCOUNT
 
 	/*
 	 * Each instruction below that may fault has its own global
 	 * label, so that the fault can be attributed to that exact
 	 * instruction.
 	 */
 	.globl	doreti_popl_fs
 doreti_popl_fs:
 	popl	%fs
 	.globl	doreti_popl_es
 doreti_popl_es:
 	popl	%es
 	.globl	doreti_popl_ds
 doreti_popl_ds:
 	popl	%ds
 	popal				/* restore the general registers */
 	addl	$8,%esp			/* skip the tf_trapno and tf_err slots */
 	.globl	doreti_iret
 doreti_iret:
 	iret
 
 	/*
 	 * doreti_iret_fault and friends.  Alternative return code for
 	 * the case where we get a fault in the doreti_exit code
 	 * above.  trap() (i386/i386/trap.c) catches this specific
 	 * case, sends the process a signal and continues in the
 	 * corresponding place in the code below.
 	 */
 	/*
 	 * Each entry point below re-pushes what doreti_exit had already
 	 * popped before the faulting instruction, so that a complete
 	 * frame is on the stack again by the time doreti_popl_fs_fault
 	 * is reached.
 	 */
 	ALIGN_TEXT
 	.globl	doreti_iret_fault
 doreti_iret_fault:
 	subl	$8,%esp			/* re-reserve tf_trapno and tf_err */
 	pushal
 	pushl	%ds
 	.globl	doreti_popl_ds_fault
 doreti_popl_ds_fault:
 	pushl	%es
 	.globl	doreti_popl_es_fault
 doreti_popl_es_fault:
 	pushl	%fs
 	.globl	doreti_popl_fs_fault
 doreti_popl_fs_fault:
 	sti
 	movl	$0,TF_ERR(%esp)	/* XXX should be the error code */
 	movl	$T_PROTFLT,TF_TRAPNO(%esp)
 	/* Continue as a T_PROTFLT trap on the rebuilt frame. */
 	jmp	alltraps_with_regs_pushed
 #ifdef HWPMC_HOOKS
 doreti_nmi:
 	/*
 	 * Since we are returning from an NMI, check if the current trap
 	 * was from user mode and if so whether the current thread
 	 * needs a user call chain capture.
 	 */
 	testb	$SEL_RPL_MASK,TF_CS(%esp)
 	jz	doreti_exit		/* NMI did not come from user mode */
 	movl	PCPU(CURTHREAD),%eax	/* curthread present? */
 	orl	%eax,%eax
 	jz	doreti_exit
 	testl	$TDP_CALLCHAIN,TD_PFLAGS(%eax) /* flagged for capture? */
 	jz	doreti_exit
 	/*
 	 * Take the processor out of NMI mode by executing a fake "iret".
 	 * The three pushes build the eflags/cs/eip frame that the iret
 	 * consumes, so execution simply continues at outofnmi.
 	 */
 	pushfl
 	pushl	%cs
 	pushl	$outofnmi
 	iret
 outofnmi:
 	/*
 	 * Call the callchain capture hook after turning interrupts back on.
 	 */
 	movl	pmc_hook,%ecx
 	orl	%ecx,%ecx		/* no hook installed? */
 	jz	doreti_exit
 	pushl	%esp			/* frame pointer */
 	pushl	$PMC_FN_USER_CALLCHAIN	/* command */
 	movl	PCPU(CURTHREAD),%eax
 	pushl	%eax			/* curthread */
 	sti
 	call	*%ecx
 	addl	$12,%esp		/* pop the three arguments */
 	jmp	doreti_ast		/* now go through the normal AST check */
 	/* Only emitted when HWPMC_HOOKS is defined. */
 	ENTRY(end_exceptions)
 #endif
Index: head/sys/i386/i386/machdep.c
===================================================================
--- head/sys/i386/i386/machdep.c	(revision 258540)
+++ head/sys/i386/i386/machdep.c	(revision 258541)
@@ -1,3732 +1,3731 @@
 /*-
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_apic.h"
 #include "opt_atalk.h"
 #include "opt_atpic.h"
 #include "opt_compat.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_ipx.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_npx.h"
 #include "opt_perfmon.h"
 #include "opt_platform.h"
 #include "opt_xbox.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <isa/rtc.h>
 
 #include <net/netisr.h>
 
 #include <machine/bootinfo.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/mp_watchdog.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/pcb_ext.h>
 #include <machine/proc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #include <machine/vm86.h>
 #ifdef PERFMON
 #include <machine/perfmon.h>
 #endif
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #ifdef FDT
 #include <x86/fdt.h>
 #endif
 
 #ifdef DEV_APIC
 #include <machine/apicvar.h>
 #endif
 
 #ifdef DEV_ISA
 #include <x86/isa/icu.h>
 #endif
 
 #ifdef XBOX
 #include <machine/xbox.h>
 
 int arch_i386_is_xbox = 0;
 uint32_t arch_i386_xbox_memsize = 0;
 #endif
 
 #ifdef XEN
 /* XEN includes */
 #include <xen/xen-os.h>
 #include <xen/hypervisor.h>
 #include <machine/xen/xenvar.h>
 #include <machine/xen/xenfunc.h>
 #include <xen/xen_intr.h>
 
 void Xhypervisor_callback(void);
 void failsafe_callback(void);
 
 extern trap_info_t trap_table[];
 struct proc_ldt default_proc_ldt;
 extern int init_first;
 int running_xen = 1;
 extern unsigned long physfree;
 #endif /* XEN */
 
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 extern void init386(int first);
 extern void dblfault_handler(void);
 
 extern void printcpuinfo(void);	/* XXX header file */
 extern void finishidentcpu(void);
 extern void panicifcpuunsupported(void);
 
 /* True when the privilege level of selector cs is SEL_UPL. */
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 /* True when ef and oef differ only in bits covered by PSL_USERCHANGE. */
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 
 /* SSE support is compiled in for I686_CPU unless CPU_DISABLE_SSE is set. */
 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 #define CPU_ENABLE_SSE
 #endif
 
 static void cpu_startup(void *);
 static void fpstate_drop(struct thread *td);
 static void get_fpcontext(struct thread *td, mcontext_t *mcp);
 static int  set_fpcontext(struct thread *td, const mcontext_t *mcp);
 #ifdef CPU_ENABLE_SSE
 static void set_fpregs_xmm(struct save87 *, struct savexmm *);
 static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
 #endif /* CPU_ENABLE_SSE */
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 /*
  * Intel ICH registers, accessed with inl()/outl() in cpu_startup().
  * ICH_SMI_EN is the SMI Control and Enable register, 0x30 above
  * ICH_PMBASE.  The replacement list is parenthesized so the macro
  * expands to a single operand in any expression.
  */
 #define ICH_PMBASE	0x400
 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
 
 int	_udatasel, _ucodesel;
 u_int	basemem;
 
 int cold = 1;
 
 #ifdef COMPAT_43
 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 #ifdef COMPAT_FREEBSD4
 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 
 long Maxmem = 0;
 long realmem = 0;
 
 #ifdef PAE
 FEATURE(pae, "Physical Address Extensions");
 #endif
 
 /*
  * The number of PHYSMAP entries must be one less than the number of
  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  * physical address that is accessible by ISA DMA is split into two
  * PHYSSEG entries.
  */
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 
 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
 
 /* must be 2 less so 0 0 can signal end of chunks */
 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
 
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 struct pcpu __pcpu[MAXCPU];
 
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
 /*
  * Machine-dependent startup routine.
  *
  * In order: applies the legacy-USB SMI workaround on the listed Apple
  * models, calls startrtclock(), printcpuinfo() and
  * panicifcpuunsupported(), prints the real and available memory sizes
  * (and, when bootverbose, every phys_avail[] chunk), calls
  * vm_ksubmap_init(), bufinit() and vm_pager_bufferinit(), and finally,
  * unless built for XEN, cpu_setregs().
  *
  * The argument is not used; it exists only to match the
  * "static void cpu_startup(void *)" prototype.
  */
 static void
 cpu_startup(void *dummy)
 {
 	uintmax_t memsize;
 	char *sysenv;
 
 	/*
 	 * On MacBooks, we need to disallow the legacy USB circuit to
 	 * generate an SMI# because this can cause several problems,
 	 * namely: incorrect CPU frequency detection and failure to
 	 * start the APs.
 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
 	 * Enable register) of the Intel ICH LPC Interface Bridge.
 	 */
 	sysenv = getenv("smbios.system.product");
 	if (sysenv != NULL) {
 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
 			if (bootverbose)
 				printf("Disabling LEGACY_USB_EN bit on "
 				    "Intel ICH.\n");
 			/* Read-modify-write: clear only bit 3 (0x8). */
 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
 		}
 		freeenv(sysenv);
 	}
 
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 	panicifcpuunsupported();
 #ifdef PERFMON
 	perfmon_init();
 #endif
 
 	/*
 	 * Display physical memory if SMBIOS reports reasonable amount.
 	 * The SMBIOS value is scaled by 1024 to get bytes; when it is
 	 * missing or smaller than the memory currently free, Maxmem
 	 * (in pages) is used instead.
 	 */
 	memsize = 0;
 	sysenv = getenv("smbios.memory.enabled");
 	if (sysenv != NULL) {
 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
 		freeenv(sysenv);
 	}
 	if (memsize < ptoa((uintmax_t)cnt.v_free_count))
 		memsize = ptoa((uintmax_t)Maxmem);
 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
 	realmem = atop(memsize);
 
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 * phys_avail[] holds (start, end) pairs and is terminated by an
 	 * entry whose end is 0.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)cnt.v_free_count),
 	    ptoa((uintmax_t)cnt.v_free_count) / 1048576);
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 #ifndef XEN
 	cpu_setregs();
 #endif
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by kcall
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  */
 #ifdef COMPAT_43
 /*
  * Deliver signal ksi->ksi_signo to the current thread using the old
  * "struct osigframe" layout.
  *
  * catcher is the user handler, ksi describes the signal and mask is
  * the signal mask recorded in the frame for osigreturn().
  *
  * Called with the process lock and ps_mtx held (asserted below).  Both
  * are dropped before the trap frame is saved and the signal frame is
  * copied out, and are re-acquired before returning.  If the copyout
  * fails, sigexit(td, SIGILL) is called.
  */
 static void
 osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct osigframe sf, *fp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Top of the alternate signal stack. */
 		fp = (struct osigframe *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct osigframe));
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		/* One frame below the current user stack pointer. */
 		fp = (struct osigframe *)regs->tf_esp - 1;
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	/* User-space address of the context inside the frame at fp. */
 	sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
 	bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
 		sf.sf_siginfo.si_signo = sig;
 		sf.sf_siginfo.si_code = ksi->ksi_code;
 		sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
 		sf.sf_addr = 0;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_arg2 = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Drop both locks; they are re-acquired at the end. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* Save most if not all of trap frame. */
 	sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
 	sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
 	sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
 	sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
 	sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
 	sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
 	sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
 	sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
 	sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
 	sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
 	sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
 	/* %gs is read with rgs() rather than taken from the trap frame. */
 	sf.sf_siginfo.si_sc.sc_gs = rgs();
 	sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
 
 	/* Build the signal context to be used by osigreturn(). */
 	sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
 	SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
 	sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
 	sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
 	sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
 	sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
 	sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
 	sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
 		sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
 		sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
 		sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
 
 		/* Without VME, VIF/VIP come from the saved vm86_eflags. */
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_siginfo.si_sc.sc_ps =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/* See sendsig() for comments. */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, fp, sizeof(*fp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread: new stack pointer, then new pc. */
 	regs->tf_esp = (int)fp;
 	if (p->p_sysent->sv_sigcode_base != 0) {
 		regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
 		    szosigcode;
 	} else {
 		/* a.out sysentvec does not use shared page */
 		regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode;
 	}
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	load_gs(_udatasel);
 	regs->tf_ss = _udatasel;
 	/* Return with both locks held again, as on entry. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Deliver signal ksi->ksi_signo to the current thread using the
  * "struct sigframe4" layout.
  *
  * catcher is the user handler, ksi describes the signal and mask is
  * the signal mask stored in the saved context.
  *
  * Called with the process lock and ps_mtx held (asserted below).  Both
  * are dropped before the signal frame is copied out and are re-acquired
  * before returning.  If the copyout fails, sigexit(td, SIGILL) is
  * called.
  */
 static void
 freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe4 sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_gs = rgs();
 	/*
 	 * Copies the whole trap frame into the context starting at mc_fs.
 	 * NOTE(review): this presumably relies on the mcontext fields from
 	 * mc_fs onward mirroring struct trapframe; the layouts are not
 	 * visible here.
 	 */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
 	bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
 	bzero(sf.sf_uc.uc_mcontext.__spare__,
 	    sizeof(sf.sf_uc.uc_mcontext.__spare__));
 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Top of the alternate signal stack. */
 		sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe4));
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		/* One frame below the current user stack pointer. */
 		sfp = (struct sigframe4 *)regs->tf_esp - 1;
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	/* User-space address of the context inside the frame at sfp. */
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (register_t)&sfp->sf_si;
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si.si_signo = sig;
 		sf.sf_si.si_code = ksi->ksi_code;
 		sf.sf_si.si_addr = ksi->ksi_addr;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Drop both locks; they are re-acquired at the end. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
 
 		/* Without VME, VIF/VIP come from the saved vm86_eflags. */
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_uc.uc_mcontext.mc_eflags =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/*
 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
 		 * syscalls made by the signal handler.  This just avoids
 		 * wasting time for our lazy fixup of such faults.  PSL_NT
 		 * does nothing in vm86 mode, but vm86 programs can set it
 		 * almost legitimately in probes for old cpu types.
 		 */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread: new stack pointer, then new pc. */
 	regs->tf_esp = (int)sfp;
 	regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
 	    szfreebsd4_sigcode;
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	/* Return with both locks held again, as on entry. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * Deliver signal ksi->ksi_signo to the current thread using the native
  * "struct sigframe" layout.
  *
  * Signals marked in ps_freebsd4 or ps_osigset are handed to
  * freebsd4_sendsig() or osendsig() when those compat options are built
  * in.  Otherwise the frame is filled in from the trap frame, placed on
  * the alternate signal stack or just below the current user %esp
  * (rounded down to 16 bytes) and copied out; the trap frame is then
  * rewritten so that the thread resumes at the sigcode.
  *
  * Called with the process lock and ps_mtx held (asserted below).  Both
  * are dropped before the copyout and re-acquired before returning.  If
  * the copyout fails, sigexit(td, SIGILL) is called.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	struct segment_descriptor *sdp;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 #ifdef COMPAT_FREEBSD4
 	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
 		freebsd4_sendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 #ifdef COMPAT_43
 	if (SIGISMEMBER(psp->ps_osigset, sig)) {
 		osendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_gs = rgs();
 	/*
 	 * Copies the whole trap frame into the context starting at mc_fs.
 	 * NOTE(review): this presumably relies on the mcontext fields from
 	 * mc_fs onward mirroring struct trapframe; the layouts are not
 	 * visible here.
 	 */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext);
 	fpstate_drop(td);
 	/*
 	 * Unconditionally fill the fsbase and gsbase into the mcontext.
 	 * Each base is reassembled from the descriptor's sd_hibase
 	 * (shifted left by 24) and sd_lobase fields.
 	 */
 	sdp = &td->td_pcb->pcb_fsd;
 	sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
 	    sdp->sd_lobase;
 	sdp = &td->td_pcb->pcb_gsd;
 	sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
 	    sdp->sd_lobase;
 	sf.sf_uc.uc_mcontext.mc_flags = 0;
 	bzero(sf.sf_uc.uc_mcontext.mc_spare2,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Top of the alternate signal stack. */
 		sp = td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe);
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		/* Just below the current user stack pointer. */
 		sp = (char *)regs->tf_esp - sizeof(struct sigframe);
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	/* User-space address of the context inside the frame at sfp. */
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (register_t)&sfp->sf_si;
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Drop both locks; they are re-acquired at the end. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
 
 		/* Without VME, VIF/VIP come from the saved vm86_eflags. */
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_uc.uc_mcontext.mc_eflags =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/*
 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
 		 * syscalls made by the signal handler.  This just avoids
 		 * wasting time for our lazy fixup of such faults.  PSL_NT
 		 * does nothing in vm86 mode, but vm86 programs can set it
 		 * almost legitimately in probes for old cpu types.
 		 */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread: new stack pointer, then new pc. */
 	regs->tf_esp = (int)sfp;
 	regs->tf_eip = p->p_sysent->sv_sigcode_base;
 	/* No sigcode base: fall back to the sigcode below sv_psstrings. */
 	if (regs->tf_eip == 0)
 		regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode;
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	/* Return with both locks held again, as on entry. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * MPSAFE
  */
 #ifdef COMPAT_43
 /*
  * Old-style sigreturn: restore the register state saved by osendsig().
  *
  * uap->sigcntxp is the user address of a struct osigcontext.  It is
  * copied in and validated, then written into the thread's trap frame;
  * the signal stack flag and the signal mask are restored from it too.
  *
  * Returns the copyin() error if the context cannot be read, EINVAL if
  * the context is rejected (vm86 not set up, disallowed eflags change,
  * or a %cs that fails CS_SECURE(), which also posts SIGBUS), and
  * EJUSTRETURN on success.
  */
 int
 osigreturn(struct thread *td, struct osigreturn_args *uap)
 {
 	struct osigcontext sc;
 	struct trapframe *regs;
 	struct osigcontext *scp;
 	int eflags, error;
 	ksiginfo_t ksi;
 
 	regs = td->td_frame;
 	error = copyin(uap->sigcntxp, &sc, sizeof(sc));
 	if (error != 0)
 		return (error);
 	scp = &sc;
 	eflags = scp->sc_ps;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 
 		/*
 		 * Only the bits in VME_USERCHANGE (or VM_USERCHANGE without
 		 * VME) are taken from the user's eflags; PSL_VM is forced on.
 		 */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		tf->tf_vm86_ds = scp->sc_ds;
 		tf->tf_vm86_es = scp->sc_es;
 		tf->tf_vm86_fs = scp->sc_fs;
 		tf->tf_vm86_gs = scp->sc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
 			return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		if (!CS_SECURE(scp->sc_cs)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 		regs->tf_ds = scp->sc_ds;
 		regs->tf_es = scp->sc_es;
 		regs->tf_fs = scp->sc_fs;
 	}
 
 	/* Restore remaining registers. */
 	regs->tf_eax = scp->sc_eax;
 	regs->tf_ebx = scp->sc_ebx;
 	regs->tf_ecx = scp->sc_ecx;
 	regs->tf_edx = scp->sc_edx;
 	regs->tf_esi = scp->sc_esi;
 	regs->tf_edi = scp->sc_edi;
 	regs->tf_cs = scp->sc_cs;
 	regs->tf_ss = scp->sc_ss;
 	regs->tf_isp = scp->sc_isp;
 	regs->tf_ebp = scp->sc_fp;
 	regs->tf_esp = scp->sc_sp;
 	regs->tf_eip = scp->sc_pc;
 	regs->tf_eflags = eflags;
 
 #if defined(COMPAT_43)
 	if (scp->sc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 	kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
 	    SIGPROCMASK_OLD);
 	return (EJUSTRETURN);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatible sigreturn: restore the context saved by
  * freebsd4_sendsig().
  *
  * uap->sigcntxp is the user address of a struct ucontext4.  It is
  * copied in and validated, then its register block (from mc_fs on) is
  * copied over the thread's trap frame; the signal stack flag (with
  * COMPAT_43) and the signal mask are restored from it too.
  *
  * Returns the copyin() error if the context cannot be read, EINVAL if
  * the context is rejected (vm86 not set up, disallowed eflags change,
  * or a %cs that fails CS_SECURE(), which also posts SIGBUS), and
  * EJUSTRETURN on success.
  *
  * MPSAFE
  */
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 	struct ucontext4 uc;
 	struct trapframe *regs;
 	struct ucontext4 *ucp;
 	int cs, eflags, error;
 	ksiginfo_t ksi;
 
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 		/*
 		 * Only the bits in VME_USERCHANGE (or VM_USERCHANGE without
 		 * VME) are taken from the user's eflags; PSL_VM is forced on.
 		 */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		/*
 		 * Load the saved frame, then move the user's segment values
 		 * into the vm86 slots and put _udatasel in the normal ones.
 		 */
 		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 		tf->tf_eflags = eflags;
 		tf->tf_vm86_ds = tf->tf_ds;
 		tf->tf_vm86_es = tf->tf_es;
 		tf->tf_vm86_fs = tf->tf_fs;
 		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
 			uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, eflags);
 			return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, cs);
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
 		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 	}
 
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * MPSAFE
  *
  * sigreturn system call: restore the machine context saved in a
  * user-supplied ucontext_t when a signal was delivered.
  *
  * The context is copied in from uap->sigcntxp; a copyin() failure is
  * returned to the caller.  EINVAL is returned when the saved eflags ask
  * for vm86 mode but the vm86 area was never set up, when the saved eflags
  * fail EFL_SECURE(), or when the saved %cs fails CS_SECURE() (SIGBUS is
  * also posted in that last case).  A non-zero result from set_fpcontext()
  * is returned unchanged.  On success the trapframe, optional signal-stack
  * flag and signal mask are restored and EJUSTRETURN is returned.
  */
 int
 sys_sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
 	struct trapframe *regs;
 	ucontext_t *ucp;
 	int cs, eflags, error, ret;
 	ksiginfo_t ksi;
 
 	/* Work on a kernel copy so later checks cannot race with userland. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/*
 		 * Go back to user mode if both flags are set.
 		 * NOTE(review): SIGBUS is posted but there is no early
 		 * return; the context restore below still runs.
 		 */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 
 		/*
 		 * Only the *_USERCHANGE bits are taken from the saved
 		 * eflags; every other bit keeps its current trapframe value.
 		 */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		/* Overwrite the base trapframe, then patch in the filtered eflags. */
 		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 		tf->tf_eflags = eflags;
 		/*
 		 * The saved segment values become the vm86 segment registers;
 		 * the ordinary %ds/%es/%fs slots are reset to _udatasel.
 		 */
 		tf->tf_vm86_ds = tf->tf_ds;
 		tf->tf_vm86_es = tf->tf_es;
 		tf->tf_vm86_fs = tf->tf_fs;
 		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
 			uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, eflags);
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, cs);
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
 		/*
 		 * Restore the FP context before touching the trapframe so a
 		 * failure leaves the integer registers unmodified.
 		 */
 		ret = set_fpcontext(td, &ucp->uc_mcontext);
 		if (ret != 0)
 			return (ret);
 		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 	}
 
 #if defined(COMPAT_43)
 	/* Bit 0 of mc_onstack records whether the signal stack was in use. */
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	/* Reinstate the signal mask that was saved with the context. */
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 
 /*
  * Machine dependent boot() routine
  *
  * I haven't seen anything to put here yet
  * Possibly some stuff might be grafted back here from boot()
  */
 void
 cpu_boot(int howto)
 {
 	/* Intentionally empty: no machine-dependent work is done here. */
 	(void)howto;
 }
 
 /*
  * Flush the D-cache for non-DMA I/O so that the I-cache can
  * be made coherent later.
  */
 void
 cpu_flush_dcache(void *ptr, size_t len)
 {
 	/* Not applicable: the buffer is left untouched. */
 	(void)ptr;
 	(void)len;
 }
 
 /*
  * Get current clock frequency for the given cpu id.
  *
  * The estimate is derived from the TSC delta across a DELAY(1000) and is
  * stored in *rate.  Returns EINVAL when cpu_id is unknown or rate is NULL,
  * EOPNOTSUPP when there is no TSC or when the TSC is invariant and
  * tsc_perf_stat is not set, and 0 on success.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	uint64_t tsc1, tsc2;
 	uint64_t acnt, mcnt, perf;
 	register_t reg;
 
 	if (pcpu_find(cpu_id) == NULL || rate == NULL)
 		return (EINVAL);
 	if ((cpu_feature & CPUID_TSC) == 0)
 		return (EOPNOTSUPP);
 
 	/*
 	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
 	 * DELAY(9) based logic fails.
 	 */
 	if (tsc_is_invariant && !tsc_perf_stat)
 		return (EOPNOTSUPP);
 
 #ifdef SMP
 	if (smp_cpus > 1) {
 		/* Schedule ourselves on the indicated cpu. */
 		thread_lock(curthread);
 		sched_bind(curthread, cpu_id);
 		thread_unlock(curthread);
 	}
 #endif
 
 	/* Calibrate by measuring a short delay. */
 	reg = intr_disable();
 	if (tsc_is_invariant) {
 		/*
 		 * Zero both counters, then sample them around the delay and
 		 * scale the TSC delta by 1000 * APERF / MPERF.
 		 * NOTE(review): mcnt is used as a divisor without a zero
 		 * check -- confirm MPERF cannot read back as 0 here.
 		 */
 		wrmsr(MSR_MPERF, 0);
 		wrmsr(MSR_APERF, 0);
 		tsc1 = rdtsc();
 		DELAY(1000);
 		mcnt = rdmsr(MSR_MPERF);
 		acnt = rdmsr(MSR_APERF);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		perf = 1000 * acnt / mcnt;
 		*rate = (tsc2 - tsc1) * perf;
 	} else {
 		/* TSC ticks across the delay, multiplied by 1000. */
 		tsc1 = rdtsc();
 		DELAY(1000);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		*rate = (tsc2 - tsc1) * 1000;
 	}
 
 #ifdef SMP
 	/* Undo the CPU binding taken above. */
 	if (smp_cpus > 1) {
 		thread_lock(curthread);
 		sched_unbind(curthread);
 		thread_unlock(curthread);
 	}
 #endif
 
 	return (0);
 }
 
 #ifdef XEN
 
 /* Issue the SCHEDOP_block scheduler hypercall (XEN builds only). */
 static void
 idle_block(void)
 {
 
 	HYPERVISOR_sched_op(SCHEDOP_block, 0);
 }
 
 /* XEN variant of cpu_halt(): request SHUTDOWN_poweroff from the hypervisor. */
 void
 cpu_halt(void)
 {
 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 }
 
 /* Set to 1 the first time the XEN idle method runs. */
 int scheduler_running;
 
 /*
  * XEN variant of the "hlt" idle method: enable interrupts and block in
  * the hypervisor via idle_block().  The sbt argument is not used.
  */
 static void
 cpu_idle_hlt(sbintime_t sbt)
 {
 
 	scheduler_running = 1;
 	enable_intr();
 	idle_block();
 }
 
 #else
 /*
  * Shutdown the CPU as much as possible
  */
 void
 cpu_halt(void)
 {
 	/* Never returns: re-issue halt() every time the CPU wakes up. */
 	while (1) {
 		halt();
 	}
 }
 
 #endif
 
 void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
 static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
 static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
 /* idle_mwait is exposed both as a loader tunable and as a R/W sysctl. */
 TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
     0, "Use MONITOR/MWAIT for short idle");
 
 /*
  * Values the cpu_idle_*() methods store in the per-CPU monitorbuf word;
  * cpu_idle_wakeup() reads them to decide how to wake a CPU.
  */
 #define	STATE_RUNNING	0x0
 #define	STATE_MWAIT	0x1
 #define	STATE_SLEEPING	0x2
 
 /*
  * Idle method that defers to the ACPI idle hook when one is installed
  * and otherwise halts the CPU.
  */
 static void
 cpu_idle_acpi(sbintime_t sbt)
 {
 	int *mstate = (int *)PCPU_PTR(monitorbuf);
 
 	*mstate = STATE_SLEEPING;
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (!sched_runnable()) {
 		if (cpu_idle_hook)
 			cpu_idle_hook(sbt);
 		else
 			__asm __volatile("sti; hlt");
 	} else {
 		/* Work showed up meanwhile; do not go to sleep. */
 		enable_intr();
 	}
 	*mstate = STATE_RUNNING;
 }
 
 #ifndef XEN
 /* Idle method that simply halts the CPU until the next interrupt. */
 static void
 cpu_idle_hlt(sbintime_t sbt)
 {
 	int *mstate = (int *)PCPU_PTR(monitorbuf);
 
 	*mstate = STATE_SLEEPING;
 
 	/*
 	 * cpu_idle() may have entered a critical section before calling
 	 * us.  An interrupt that fired inside it can leave a preemption
 	 * pending, and if we halt now that thread may not run until
 	 * some later interrupt wakes the CPU.  To close the race, look
 	 * for a runnable thread only after interrupts are disabled and
 	 * return at once if there is one.
 	 *
 	 * We must also guarantee that hlt is the very next instruction
 	 * after sti, so that any interrupt arriving after disable_intr()
 	 * immediately wakes the CPU from hlt.  This works on x86 because
 	 * interrupts are only recognized after the instruction that
 	 * follows sti, while IF itself is set to 1 right away, which
 	 * lets the hlt instruction acknowledge the interrupt.
 	 */
 	disable_intr();
 	if (!sched_runnable())
 		__asm __volatile("sti; hlt");
 	else
 		enable_intr();
 	*mstate = STATE_RUNNING;
 }
 #endif
 
 /*
  * MWAIT cpu power states.  Lower 4 bits are sub-states.
  */
 /* Only MWAIT_C1 is used in this part of the file (by cpu_idle_mwait()). */
 #define	MWAIT_C0	0xf0
 #define	MWAIT_C1	0x00
 #define	MWAIT_C2	0x10
 #define	MWAIT_C3	0x20
 #define	MWAIT_C4	0x30
 
 /*
  * Idle method using MONITOR/MWAIT on the per-CPU monitorbuf word.
  * cpu_idle_wakeup() changes that word from STATE_MWAIT to STATE_RUNNING,
  * which is the store this method arms the monitor for.  The sbt argument
  * is not used.
  */
 static void
 cpu_idle_mwait(sbintime_t sbt)
 {
 	int *state;
 
 	state = (int *)PCPU_PTR(monitorbuf);
 	*state = STATE_MWAIT;
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable()) {
 		enable_intr();
 		*state = STATE_RUNNING;
 		return;
 	}
 	/*
 	 * Arm the monitor first, then re-check the word: if it already
 	 * changed there is nothing to wait for and mwait is skipped.
 	 */
 	cpu_monitor(state, 0, 0);
 	if (*state == STATE_MWAIT)
 		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
 	else
 		enable_intr();
 	*state = STATE_RUNNING;
 }
 
 /* Idle method that busy-waits for a bounded number of iterations. */
 static void
 cpu_idle_spin(sbintime_t sbt)
 {
 	int *mstate = (int *)PCPU_PTR(monitorbuf);
 	int remaining;
 
 	*mstate = STATE_RUNNING;
 
 	/*
 	 * The sched_runnable() check is racy, but because it is repeated
 	 * in a loop, missing it once costs very little, if anything (and
 	 * that is far better than not checking at all).
 	 */
 	remaining = 1000;
 	while (remaining-- > 0) {
 		if (sched_runnable())
 			return;
 		cpu_spinwait();
 	}
 }
 
 /*
  * C1E renders the local APIC timer dead, so we disable it by
  * reading the Interrupt Pending Message register and clearing
  * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
  * 
  * Reference:
  *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
  *   #32559 revision 3.00+
  */
 #define	MSR_AMDK8_IPM		0xc0010055	/* Interrupt Pending Message MSR */
 #define	AMDK8_SMIONCMPHALT	(1ULL << 27)	/* SmiOnCmpHalt */
 #define	AMDK8_C1EONCMPHALT	(1ULL << 28)	/* C1eOnCmpHalt */
 /* Both bits together; cpu_idle() clears them as one mask. */
 #define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
 
 /*
  * Set cpu_ident_amdc1e when the CPU identification matches the parts
  * that may implement C1E: mostly the latest dual-core (or future)
  * k8 family.
  */
 static void
 cpu_probe_amdc1e(void)
 {
 
 	if (cpu_vendor_id != CPU_VENDOR_AMD)
 		return;
 	if ((cpu_id & 0x00000f00) != 0x00000f00)
 		return;
 	if ((cpu_id & 0x0fff0000) < 0x00040000)
 		return;
 	cpu_ident_amdc1e = 1;
 }
 
 /*
  * Currently selected idle method, called by cpu_idle() and replaceable
  * at run time through the machdep.idle sysctl handler.  The default is
  * cpu_idle_hlt on XEN and cpu_idle_acpi otherwise.
  */
 #ifdef XEN
 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
 #else
 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
 #endif
 
 /*
  * Machine-dependent idle entry point.
  *
  * When busy is non-zero and MONITOR/MWAIT is available and enabled, a
  * short mwait-based idle is used and the function returns straight away
  * (non-XEN only).  Otherwise the selected cpu_idle_fn method is called;
  * when busy is zero the timers are switched to idle mode around that
  * call, inside a critical section.
  */
 void
 cpu_idle(int busy)
 {
 #ifndef XEN
 	uint64_t msr;
 #endif
 	/* Passed to the idle method as-is unless cpu_idleclock() replaces it. */
 	sbintime_t sbt = -1;
 
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
 	    busy, curcpu);
 #if defined(MP_WATCHDOG) && !defined(XEN)
 	ap_watchdog(PCPU_GET(cpuid));
 #endif
 #ifndef XEN
 	/* If we are busy - try to use fast methods. */
 	if (busy) {
 		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
 			/*
 			 * NOTE(review): 'busy' is passed where an sbintime_t
 			 * is expected; cpu_idle_mwait() ignores its argument.
 			 */
 			cpu_idle_mwait(busy);
 			goto out;
 		}
 	}
 #endif
 
 	/* If we have time - switch timers into idle mode. */
 	if (!busy) {
 		critical_enter();
 		sbt = cpu_idleclock();
 	}
 
 #ifndef XEN
 	/* Apply AMD APIC timer C1E workaround. */
 	if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
 		msr = rdmsr(MSR_AMDK8_IPM);
 		/* Only write the MSR back when one of the bits is set. */
 		if (msr & AMDK8_CMPHALT)
 			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
 	}
 #endif
 
 	/* Call main idle method. */
 	cpu_idle_fn(sbt);
 
 	/* Switch timers back into active mode. */
 	if (!busy) {
 		cpu_activeclock();
 		critical_exit();
 	}
 #ifndef XEN
 out:
 #endif
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
 	    busy, curcpu);
 }
 
 int
 cpu_idle_wakeup(int cpu)
 {
 	struct pcpu *pcpu;
 	int *state;
 
 	pcpu = pcpu_find(cpu);
 	state = (int *)pcpu->pc_monitorbuf;
 	/*
 	 * This doesn't need to be atomic since missing the race will
 	 * simply result in unnecessary IPIs.
 	 */
 	if (*state == STATE_SLEEPING)
 		return (0);
 	if (*state == STATE_MWAIT)
 		*state = STATE_RUNNING;
 	return (1);
 }
 
 /*
  * Table of idle methods, mapping each implementation to the name used
  * by the machdep.idle and machdep.idle_available sysctls.
  * Ordered by speed/power consumption.  Terminated by a NULL entry.
  */
 struct {
 	void	*id_fn;		/* idle method, compared against cpu_idle_fn */
 	char	*id_name;	/* user-visible name */
 } idle_tbl[] = {
 	{ cpu_idle_spin, "spin" },
 	{ cpu_idle_mwait, "mwait" },
 	{ cpu_idle_hlt, "hlt" },
 	{ cpu_idle_acpi, "acpi" },
 	{ NULL, NULL }
 };
 
 /*
  * Sysctl handler that reports the usable idle methods as a
  * comma-separated list of idle_tbl names.
  */
 static int
 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
 {
 	char *list, *cursor;
 	const char *name, *sep;
 	int idx, rv;
 
 	list = malloc(256, M_TEMP, M_WAITOK);
 	cursor = list;
 	for (idx = 0; (name = idle_tbl[idx].id_name) != NULL; idx++) {
 		/* Methods named *mwait* need the MONITOR/MWAIT feature. */
 		if ((cpu_feature2 & CPUID2_MON) == 0 &&
 		    strstr(name, "mwait"))
 			continue;
 		/* "acpi" is only offered once an idle hook is installed. */
 		if (cpu_idle_hook == NULL && strcmp(name, "acpi") == 0)
 			continue;
 		sep = (cursor != list) ? ", " : "";
 		cursor += sprintf(cursor, "%s%s", sep, name);
 	}
 	rv = sysctl_handle_string(oidp, list, 0, req);
 	free(list, M_TEMP);
 	return (rv);
 }
 
 /* machdep.idle_available: read-only string served by idle_sysctl_available(). */
 SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
     0, 0, idle_sysctl_available, "A", "list of available idle functions");
 
 /*
  * Sysctl handler for the current idle method.  Reads report the name of
  * the method in cpu_idle_fn ("unknown" if it is not in idle_tbl); writes
  * select the named method, or fail with EINVAL when the name is unknown
  * or the method is not usable.
  */
 static int
 idle_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	char name[16];
 	char *current;
 	int idx, rv;
 
 	/* Look up the name of the method currently in use. */
 	current = "unknown";
 	for (idx = 0; idle_tbl[idx].id_name != NULL; idx++) {
 		if (idle_tbl[idx].id_fn != cpu_idle_fn)
 			continue;
 		current = idle_tbl[idx].id_name;
 		break;
 	}
 	strncpy(name, current, sizeof(name));
 	rv = sysctl_handle_string(oidp, name, sizeof(name), req);
 	if (rv != 0 || req->newptr == NULL)
 		return (rv);
 
 	/* A new value was supplied: find the matching usable method. */
 	for (idx = 0; idle_tbl[idx].id_name != NULL; idx++) {
 		if ((cpu_feature2 & CPUID2_MON) == 0 &&
 		    strstr(idle_tbl[idx].id_name, "mwait"))
 			continue;
 		if (cpu_idle_hook == NULL &&
 		    strcmp(idle_tbl[idx].id_name, "acpi") == 0)
 			continue;
 		if (strcmp(idle_tbl[idx].id_name, name) == 0) {
 			cpu_idle_fn = idle_tbl[idx].id_fn;
 			return (0);
 		}
 	}
 	return (EINVAL);
 }
 
 /* machdep.idle: read/write string handled by idle_sysctl(). */
 SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
     idle_sysctl, "A", "currently selected idle function");
 
 /*
  * Reset registers to default values on exec.
  *
  * td is the thread performing the exec, imgp describes the new image
  * (entry point and ps_strings are taken from it) and stack is the initial
  * user stack pointer.  The trapframe is cleared and rebuilt, the LDT (if
  * any) is released, hardware debug registers and FP state are reset.
  *
  * The single-step flag (PSL_T) is carried over from the old trapframe so
  * that a traced process keeps stepping across the exec; it has to be
  * sampled before the trapframe is zeroed.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *regs = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 	register_t saved_eflags;
 
 	/* Reset pc->pcb_gs and %gs before possibly invalidating it. */
 	pcb->pcb_gs = _udatasel;
 	load_gs(_udatasel);
 
 	/* user_ldt_free() is expected to drop dt_lock itself. */
 	mtx_lock_spin(&dt_lock);
 	if (td->td_proc->p_md.md_ldt)
 		user_ldt_free(td);
 	else
 		mtx_unlock_spin(&dt_lock);
 
 	/*
 	 * Remember PSL_T before wiping the frame; reading it back after
 	 * bzero() would always yield 0 and silently drop single-stepping.
 	 */
 	saved_eflags = regs->tf_eflags & PSL_T;
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_eip = imgp->entry_addr;
 	regs->tf_esp = stack;
 	regs->tf_eflags = PSL_USER | saved_eflags;
 	regs->tf_ss = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_cs = _ucodesel;
 
 	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
 	regs->tf_ebx = imgp->ps_strings;
 
 	/*
 	 * Reset the hardware debug registers if they were in use.
 	 * They won't have any meaning for the newly exec'd process.
 	 */
 	if (pcb->pcb_flags & PCB_DBREGS) {
 		pcb->pcb_dr0 = 0;
 		pcb->pcb_dr1 = 0;
 		pcb->pcb_dr2 = 0;
 		pcb->pcb_dr3 = 0;
 		pcb->pcb_dr6 = 0;
 		pcb->pcb_dr7 = 0;
 		if (pcb == curpcb) {
 			/*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
 			 * the next process we switch to.
 			 */
 			reset_dbregs();
 		}
 		pcb->pcb_flags &= ~PCB_DBREGS;
 	}
 
 	/*
 	 * Initialize the math emulator (if any) for the current process.
 	 * Actually, just clear the bit that says that the emulator has
 	 * been initialized.  Initialization is delayed until the process
 	 * traps to the emulator (if it is done at all) mainly because
 	 * emulators don't provide an entry point for initialization.
 	 */
 	td->td_pcb->pcb_flags &= ~FP_SOFTFP;
 	pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;
 
 	/*
 	 * Drop the FP state if we hold it, so that the process gets a
 	 * clean FP state if it uses the FPU again.
 	 */
 	fpstate_drop(td);
 
 	/*
 	 * XXX - Linux emulator
 	 * Make sure edx is 0x0 on entry. Linux binaries depend
 	 * on it.
 	 */
 	td->td_retval[1] = 0;
 }
 
 /* Program the %cr0 control bits the kernel relies on and reload %gs. */
 void
 cpu_setregs(void)
 {
 
 	/*
 	 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
 	 *
 	 * We want every ESC (i.e., NPX) instruction and every WAIT
 	 * instruction to trap.  That requires CR0_MP together with
 	 * CR0_TS as the trap control, because CR0_EM alone does not
 	 * make WAIT instructions trap.  Trapping WAIT matters: without
 	 * it the "wait" variants of no-wait control instructions would
 	 * degenerate to the "no-wait" variants after FP context
 	 * switches but work correctly otherwise.  It matters most when
 	 * there is no NPX at all, since then the "wait" variants would
 	 * always degenerate.
 	 *
 	 * CR0_NE is requested to get correct error reporting on 486DX's;
 	 * setting it should fail or do nothing on lesser processors.
 	 */
 	load_cr0(rcr0() | CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM);
 	load_gs(_udatasel);
 }
 
 /* Exported read-only as machdep.guessed_bootdev. */
 u_long bootdev;		/* not a struct cdev *- encoding is different */
 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
 	CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
 
 /*
  * Initialize 386 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 
 int _default_ldt;
 
 /*
  * Descriptor tables.  On XEN the GDT and LDT are only pointers; otherwise
  * they are statically sized arrays (the GDT has NGDT entries per CPU).
  */
 #ifdef XEN
 union descriptor *gdt;
 union descriptor *ldt;
 #else
 union descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
 union descriptor ldt[NLDT];		/* local descriptor table */
 #endif
 static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 struct region_descriptor r_gdt, r_idt;	/* table descriptors */
 struct mtx dt_lock;			/* lock for GDT and LDT */
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 extern int has_f00f_bug;
 #endif
 
 /* TSS and one-page stack referenced by the GPANIC_SEL entry in gdt_segs. */
 static struct i386tss dblfault_tss;
 static char dblfault_stack[PAGE_SIZE];
 
 extern  vm_offset_t	proc0kstack;
 
 
 /*
  * software prototypes -- in more palatable form.
  *
  * One soft_segment_descriptor per GDT slot; the comment in front of each
  * entry gives the selector name and its index.  Entries from GPROC0_SEL
  * onward are only compiled in when XEN is not defined.
  *
  * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
  * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GPRIV_SEL	1 SMP Per-Processor Private Data Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUFS_SEL	2 %fs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUGS_SEL	3 %gs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GCODE_SEL	4 Code Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GDATA_SEL	5 Data Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUCODE_SEL	6 Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUDATA_SEL	7 Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
 {	.ssd_base = 0x400,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 #ifndef XEN
 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
 {
 	.ssd_base = 0x0,
 	.ssd_limit = sizeof(struct i386tss)-1,
 	.ssd_type = SDT_SYS386TSS,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GLDT_SEL	10 LDT Descriptor */
 {	.ssd_base = (int) ldt,
 	.ssd_limit = sizeof(ldt)-1,
 	.ssd_type = SDT_SYSLDT,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	11 User LDT Descriptor per process */
 {	.ssd_base = (int) ldt,
 	.ssd_limit = (512 * sizeof(union descriptor)-1),
 	.ssd_type = SDT_SYSLDT,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GPANIC_SEL	12 Panic Tss Descriptor */
 {	.ssd_base = (int) &dblfault_tss,
 	.ssd_limit = sizeof(struct i386tss)-1,
 	.ssd_type = SDT_SYS386TSS,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GNDIS_SEL	18 NDIS Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 #endif /* !XEN */
 };
 
 /*
  * Prototype entries for the default LDT: six slots, of which slots 3 and
  * 5 are the user code and data descriptors and the remaining ones are
  * not-present placeholders.
  */
 static struct soft_segment_descriptor ldt_segs[] = {
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 };
 
 /*
  * Fill in IDT slot idx as a present gate of type typ and privilege
  * level dpl that enters func through code selector selec.
  */
 void
 setidt(idx, func, typ, dpl, selec)
 	int idx;
 	inthand_t *func;
 	int typ;
 	int dpl;
 	int selec;
 {
 	struct gate_descriptor *gate = &idt[idx];
 	int offset = (int)func;
 
 	/* The handler address is split into low and high halves. */
 	gate->gd_looffset = offset;
 	gate->gd_hioffset = offset >> 16;
 
 	gate->gd_selector = selec;
 	gate->gd_type = typ;
 	gate->gd_dpl = dpl;
 	gate->gd_p = 1;
 
 	/* Unused fields. */
 	gate->gd_stkcpy = 0;
 	gate->gd_xx = 0;
 }
 
 /* Entry points defined elsewhere that can be installed in the IDT. */
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm),
 #ifdef KDTRACE_HOOKS
 	IDTVEC(dtrace_ret),
 #endif
 #ifdef XENHVM
 	IDTVEC(xen_intr_upcall),
 #endif
 	IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
 
 #ifdef DDB
 /*
  * Display the index and function name of any IDT entries that don't use
  * the default 'rsvd' entry point.
  */
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *ip;
 	int idx;
 	uintptr_t func;
 
 	ip = idt;
 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
 		func = (ip->gd_hioffset << 16 | ip->gd_looffset);
 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
 			db_printf("%3d\t", idx);
 			db_printsym(func, DB_STGY_PROC);
 			db_printf("\n");
 		}
 		ip++;
 	}
 }
 
 /*
  * Show privileged registers: the descriptor-table registers, the task
  * register and %cr0, %cr2, %cr3 and %cr4.
  */
 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 {
 	uint64_t idtr, gdtr;
 
 	/* idtr/gdtr are printed as base/limit: limit in the low 16 bits. */
 	idtr = ridt();
 	db_printf("idtr\t0x%08x/%04x\n",
 	    (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
 	gdtr = rgdt();
 	db_printf("gdtr\t0x%08x/%04x\n",
 	    (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
 	db_printf("ldtr\t0x%04x\n", rldt());
 	db_printf("tr\t0x%04x\n", rtr());
 	db_printf("cr0\t0x%08x\n", rcr0());
 	db_printf("cr2\t0x%08x\n", rcr2());
 	db_printf("cr3\t0x%08x\n", rcr3());
 	db_printf("cr4\t0x%08x\n", rcr4());
 }
 #endif
 
 void
 sdtossd(sd, ssd)
 	struct segment_descriptor *sd;
 	struct soft_segment_descriptor *ssd;
 {
 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 	ssd->ssd_type  = sd->sd_type;
 	ssd->ssd_dpl   = sd->sd_dpl;
 	ssd->ssd_p     = sd->sd_p;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran  = sd->sd_gran;
 }
 
 #ifndef XEN
 /*
  * Add the range [base, base + length) to physmap, an array of base/bound
  * pairs kept sorted by address.  *physmap_idxp is the index of the last
  * pair in use and is advanced by 2 when a new pair is inserted.
  *
  * Returns 1 when the caller may continue adding entries (including the
  * cases where the range was ignored or merged into a neighbour) and 0
  * when the array is full.
  */
 static int
 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
     int *physmap_idxp)
 {
 	int i, insert_idx, physmap_idx;
 
 	physmap_idx = *physmap_idxp;
 	
 	/* Empty ranges are silently accepted. */
 	if (length == 0)
 		return (1);
 
 #ifndef PAE
 	/* Without PAE, ranges starting above 4GB are dropped. */
 	if (base > 0xffffffff) {
 		printf("%uK of memory above 4GB ignored\n",
 		    (u_int)(length / 1024));
 		return (1);
 	}
 #endif
 
 	/*
 	 * Find insertion point while checking for overlap.  Start off by
 	 * assuming the new entry will be added to the end.
 	 */
 	insert_idx = physmap_idx + 2;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (base < physmap[i + 1]) {
 			/* Entirely below entry i: insert in front of it. */
 			if (base + length <= physmap[i]) {
 				insert_idx = i;
 				break;
 			}
 			if (boothowto & RB_VERBOSE)
 				printf(
 		    "Overlapping memory regions, ignoring second region\n");
 			return (1);
 		}
 	}
 
 	/* See if we can prepend to the next entry. */
 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
 		physmap[insert_idx] = base;
 		return (1);
 	}
 
 	/* See if we can append to the previous entry. */
 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
 		physmap[insert_idx - 1] += length;
 		return (1);
 	}
 
 	/*
 	 * A new pair is needed.
 	 * NOTE(review): *physmap_idxp is updated before the overflow check,
 	 * so on failure the caller is left holding PHYSMAP_SIZE -- confirm
 	 * callers do not index physmap with it afterwards.
 	 */
 	physmap_idx += 2;
 	*physmap_idxp = physmap_idx;
 	if (physmap_idx == PHYSMAP_SIZE) {
 		printf(
 		"Too many segments in the physical address map, giving up\n");
 		return (0);
 	}
 
 	/*
 	 * Move the last 'N' entries down to make room for the new
 	 * entry if needed.
 	 */
 	for (i = physmap_idx; i > insert_idx; i -= 2) {
 		physmap[i] = physmap[i - 2];
 		physmap[i + 1] = physmap[i - 1];
 	}
 
 	/* Insert the new entry. */
 	physmap[insert_idx] = base;
 	physmap[insert_idx + 1] = base + length;
 	return (1);
 }
 
 /*
  * Feed one BIOS SMAP record to add_physmap_entry().  Records whose type
  * is not SMAP_TYPE_MEMORY are skipped and reported as success (1);
  * otherwise the result of add_physmap_entry() is returned.
  */
 static int
 add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
 {
 	int rv = 1;
 
 	if (boothowto & RB_VERBOSE) {
 		printf("SMAP type=%02x base=%016llx len=%016llx\n",
 		    smap->type, smap->base, smap->length);
 	}
 
 	if (smap->type == SMAP_TYPE_MEMORY) {
 		rv = add_physmap_entry(smap->base, smap->length, physmap,
 		    physmap_idxp);
 	}
 	return (rv);
 }
 
 /*
  * Add every record of a loader-supplied SMAP to physmap, stopping early
  * if add_smap_entry() reports that no more entries can be added.
  */
 static void
 add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap,
     int *physmap_idxp)
 {
 	struct bios_smap *cur, *end;
 	u_int32_t nbytes;
 
 	/*
 	 * Memory map from INT 15:E820.
 	 *
 	 * subr_module.c says:
 	 * "Consumer may safely assume that size value precedes data."
 	 * ie: an int32_t immediately precedes SMAP.
 	 */
 	nbytes = *((u_int32_t *)smapbase - 1);
 	end = (struct bios_smap *)((uintptr_t)smapbase + nbytes);
 
 	cur = smapbase;
 	while (cur < end && add_smap_entry(cur, physmap, physmap_idxp))
 		cur++;
 }
 
 /*
  * Clamp the global basemem (in kilobytes) to 640 and map the pages
  * between the end of base memory and ISA_HOLE_START, both in the kernel
  * map and in the vm86 page table.
  */
 static void
 basemem_setup(void)
 {
 	vm_paddr_t pa;
 	pt_entry_t *pte;
 	int i;
 
 	if (basemem > 640) {
 		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
 			basemem);
 		basemem = 640;
 	}
 
 	/*
 	 * XXX if biosbasemem is now < 640, there is a `hole'
 	 * between the end of base memory and the start of
 	 * ISA memory.  The hole may be empty or it may
 	 * contain BIOS code or data.  Map it read/write so
 	 * that the BIOS can write to it.  (Memory from 0 to
 	 * the physical end of the kernel is mapped read-only
 	 * to begin with and then parts of it are remapped.
 	 * The parts that aren't remapped form holes that
 	 * remain read-only and are unused by the kernel.
 	 * The base memory area is below the physical end of
 	 * the kernel and right now forms a read-only hole.
 	 * The part of it from PAGE_SIZE to
 	 * (trunc_page(biosbasemem * 1024) - 1) will be
 	 * remapped and used by the kernel later.)
 	 *
 	 * This code is similar to the code used in
 	 * pmap_mapdev, but since no memory needs to be
 	 * allocated we simply change the mapping.
 	 */
 	for (pa = trunc_page(basemem * 1024);
 	     pa < ISA_HOLE_START; pa += PAGE_SIZE)
 		pmap_kenter(KERNBASE + pa, pa);
 
 	/*
 	 * Map pages between basemem and ISA_HOLE_START, if any, r/w into
 	 * the vm86 page table so that vm86 can scribble on them using
 	 * the vm86 map too.  XXX: why 2 ways for this and only 1 way for
 	 * page 0, at least as initialized here?
 	 */
 	pte = (pt_entry_t *)vm86paddr;
 	/*
 	 * basemem / 4 and 160 look like page numbers for 4K pages
 	 * (160 * 4K == 640K) -- TODO confirm against PAGE_SIZE.
 	 */
 	for (i = basemem / 4; i < 160; i++)
 		pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
 }
 #endif
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  *
  * If we cannot accurately determine the physical memory map, then use
  * value from the 0xE801 call, and failing that, the RTC.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  * 'first' is handed to pmap_bootstrap() and is also the upper bound of
  * the range [KERNLOAD, first) that is kept out of phys_avail[] as
  * kernel memory.
  *
  * Besides phys_avail[], this routine writes Maxmem, physmem, basemem and
  * dump_avail[], and maps the message buffer pages at msgbufp.
  *
  * XXX first should be vm_paddr_t.
  */
 static void
 getmemsize(int first)
 {
 	int has_smap, off, physmap_idx, pa_indx, da_indx;
 	u_long physmem_tunable, memtest;
 	vm_paddr_t physmap[PHYSMAP_SIZE];
 	pt_entry_t *pte;
 	quad_t dcons_addr, dcons_size;
 #ifndef XEN
 	int hasbrokenint12, i, res;
 	u_int extmem;
 	struct vm86frame vmf;
 	struct vm86context vmc;
 	vm_paddr_t pa;
 	struct bios_smap *smap, *smapbase;
 	caddr_t kmdp;
 #endif
 
 	has_smap = 0;
 #if defined(XEN)
 	/* Under XEN a single segment is built from the start info. */
 	Maxmem = xen_start_info->nr_pages - init_first;
 	physmem = Maxmem;
 	basemem = 0;
 	physmap[0] = init_first << PAGE_SHIFT;
 	physmap[1] = ptoa(Maxmem) - round_page(msgbufsize);
 	physmap_idx = 0;
 #else
 #ifdef XBOX
 	if (arch_i386_is_xbox) {
 		/*
 		 * We queried the memory size before, so chop off 4MB for
 		 * the framebuffer and inform the OS of this.
 		 */
 		physmap[0] = 0;
 		physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
 		physmap_idx = 0;
 		goto physmap_done;
 	}
 #endif
 	bzero(&vmf, sizeof(vmf));
 	bzero(physmap, sizeof(physmap));
 	basemem = 0;
 
 	/*
 	 * Check if the loader supplied an SMAP memory map.  If so,
 	 * use that and do not make any VM86 calls.
 	 */
 	physmap_idx = 0;
 	smapbase = NULL;
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf32 kernel");
 	if (kmdp != NULL)
 		smapbase = (struct bios_smap *)preload_search_info(kmdp,
 		    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase != NULL) {
 		add_smap_entries(smapbase, physmap, &physmap_idx);
 		has_smap = 1;
 		goto have_smap;
 	}
 
 	/*
 	 * Some newer BIOSes have a broken INT 12H implementation
 	 * which causes a kernel panic immediately.  In this case, we
 	 * need to use the SMAP to determine the base memory size.
 	 */
 	hasbrokenint12 = 0;
 	TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
 	if (hasbrokenint12 == 0) {
 		/* Use INT12 to determine base memory size. */
 		vm86_intcall(0x12, &vmf);
 		basemem = vmf.vmf_ax;
 		basemem_setup();
 	}
 
 	/*
 	 * Fetch the memory map with INT 15:E820.  Map page 1 R/W into
 	 * the kernel page table so we can use it as a buffer.  The
 	 * kernel will unmap this page later.
 	 */
 	pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
 	vmc.npages = 0;
 	smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
 	res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
 	KASSERT(res != 0, ("vm86_getptr() failed: address not found"));
 
 	/*
 	 * %ebx carries the continuation value between calls: it starts
 	 * at 0 and the loop ends once the call hands back 0 again.
 	 */
 	vmf.vmf_ebx = 0;
 	do {
 		vmf.vmf_eax = 0xE820;
 		vmf.vmf_edx = SMAP_SIG;
 		vmf.vmf_ecx = sizeof(struct bios_smap);
 		i = vm86_datacall(0x15, &vmf, &vmc);
 		/* Stop on a failed call or a missing signature in %eax. */
 		if (i || vmf.vmf_eax != SMAP_SIG)
 			break;
 		has_smap = 1;
 		if (!add_smap_entry(smap, physmap, &physmap_idx))
 			break;
 	} while (vmf.vmf_ebx != 0);
 
 have_smap:
 	/*
 	 * If we didn't fetch the "base memory" size from INT12,
 	 * figure it out from the SMAP (or just guess).
 	 */
 	if (basemem == 0) {
 		for (i = 0; i <= physmap_idx; i += 2) {
 			if (physmap[i] == 0x00000000) {
 				basemem = physmap[i + 1] / 1024;
 				break;
 			}
 		}
 
 		/* XXX: If we couldn't find basemem from SMAP, just guess. */
 		if (basemem == 0)
 			basemem = 640;
 		basemem_setup();
 	}
 
 	/*
 	 * physmap was zeroed above, so a non-zero end for the first
 	 * segment presumably means an SMAP source filled it in; the
 	 * legacy fallbacks below are then skipped.
 	 */
 	if (physmap[1] != 0)
 		goto physmap_done;
 
 	/*
 	 * If we failed to find an SMAP, figure out the extended
 	 * memory size.  We will then build a simple memory map with
 	 * two segments, one for "base memory" and the second for
 	 * "extended memory".  Note that "extended memory" starts at a
 	 * physical address of 1MB and that both basemem and extmem
 	 * are in units of 1KB.
 	 *
 	 * First, try to fetch the extended memory size via INT 15:E801.
 	 */
 	vmf.vmf_ax = 0xE801;
 	if (vm86_intcall(0x15, &vmf) == 0) {
 		extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
 	} else {
 		/*
 		 * If INT15:E801 fails, this is our last ditch effort
 		 * to determine the extended memory size.  Currently
 		 * we prefer the RTC value over INT15:88.
 		 */
 #if 0
 		vmf.vmf_ah = 0x88;
 		vm86_intcall(0x15, &vmf);
 		extmem = vmf.vmf_ax;
 #else
 		extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
 #endif
 	}
 
 	/*
 	 * Special hack for chipsets that still remap the 384k hole when
 	 * there's 16MB of memory - this really confuses people that
 	 * are trying to use bus mastering ISA controllers with the
 	 * "16MB limit"; they only have 16MB, but the remapping puts
 	 * them beyond the limit.
 	 *
 	 * If extended memory is between 15-16MB (16-17MB phys address range),
 	 *	chop it to 15MB.
 	 */
 	if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
 		extmem = 15 * 1024;
 
 	physmap[0] = 0;
 	physmap[1] = basemem * 1024;
 	physmap_idx = 2;
 	physmap[physmap_idx] = 0x100000;
 	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
 
 physmap_done:
 #endif	
 	/*
 	 * Now, physmap contains a map of physical memory.
 	 * physmap_idx is the index of the base of the last base/bound
 	 * pair, so physmap[physmap_idx + 1] is the end of the last segment.
 	 */
 
 #ifdef SMP
 	/* make hole for AP bootstrap code */
 	physmap[1] = mp_bootaddress(physmap[1]);
 #endif
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this 
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 #ifdef MAXMEM
 	/*
 	 * NOTE(review): the division by 4 suggests MAXMEM is given in KB
 	 * and pages are 4KB - confirm against the option's documentation.
 	 */
 	Maxmem = MAXMEM / 4;
 #endif
 
 	/* The hw.physmem tunable, when present, overrides both of the above. */
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	/*
 	 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
 	 * the amount of memory in the system.
 	 */
 	if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
 		Maxmem = atop(physmap[physmap_idx + 1]);
 
 	/*
 	 * By default enable the memory test on real hardware, and disable
 	 * it if we appear to be running in a VM.  This avoids touching all
 	 * pages unnecessarily, which doesn't matter on real hardware but is
 	 * bad for shared VM hosts.  Use a general name so that
 	 * one could eventually do more with the code than just disable it.
 	 */
 	memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1;
 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/*
 	 * If Maxmem has been increased beyond what the system has detected,
 	 * extend the last memory segment to the new limit.
 	 */ 
 	if (atop(physmap[physmap_idx + 1]) < Maxmem)
 		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 *
 	 * phys_avail[] and dump_avail[] are filled with start/end pairs;
 	 * pa_indx and da_indx always index the end slot of the chunk
 	 * currently being grown.  dump_avail[0] is not written on this
 	 * path (NOTE(review): presumably it relies on being zero-initialized
 	 * so that the first dump chunk starts at address 0 - confirm).
 	 */
 	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	pte = CMAP1;
 
 	/*
 	 * Get dcons buffer address
 	 * (dcons_addr is forced to 0, i.e. "no buffer", unless both
 	 * dcons.addr and dcons.size are present in the environment).
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 #ifndef XEN
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 */
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		/* Never scan past Maxmem, whatever the segment claims. */
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR1;
 
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 * (Such pages still go into dump_avail[].)
 			 */
 			if (pa >= KERNLOAD && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 			if (memtest == 0)
 				goto skip_memtest;
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | PG_N;
 			invltlb();
 
 			/*
 			 * Only the first int of the page is exercised; its
 			 * original contents are saved here and put back below.
 			 */
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 skip_memtest:
 			/*
 			 * Adjust array of valid/good pages.
 			 * A bad page is left out of both phys_avail[] and
 			 * dump_avail[].
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we are at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 *
 			 * NOTE(review): nothing in this loop modifies
 			 * Maxmem, so the "speculative memory test" part of
 			 * this comment appears to be stale.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					/*
 					 * Record this page for dumps, then
 					 * leave the inner loop via do_next.
 					 */
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				/* Out of dump_avail[] slots: drop the page silently. */
 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa;	/* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			if (full)
 				break;
 		}
 	}
 	/* Tear down the temporary test mapping. */
 	*pte = 0;
 	invltlb();
 #else
 	phys_avail[0] = physfree;
 	phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
 	dump_avail[0] = 0;	
 	dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
 	
 #endif
 	
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).
 	 *
 	 * Chunks that are too small are discarded from the tail: their
 	 * pages are subtracted from physmem and both slots are zeroed.
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(msgbufsize);
 
 	/* Map the message buffer. */
 	for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
 		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
 		    off);
 
 	PT_UPDATES_FLUSH();
 }
 
 #ifdef XEN
 /*
  * Number of bytes added to HYPERVISOR_VIRT_START when computing the
  * segment limits in init386() below: 2^14 units of (1 << PAGE_SHIFT).
  */
 #define MTOPSIZE (1<<(14 + PAGE_SHIFT))
 
 /*
  * Early machine-dependent initialization, XEN variant.
  *
  * Sets up thread0/proc0, the boot environment, the GDT and LDT (via
  * hypervisor calls), the hypervisor callbacks, per-CPU data, the
  * console, the debugger hooks and the double-fault TSS, then sizes
  * memory with getmemsize() and finishes proc0's pcb.
  *
  * 'first' is used as a physical address: the DPCPU area is mapped
  * starting there, and the value advanced past that area is what
  * getmemsize() receives.
  */
 void
 init386(first)
 	int first;
 {
 	unsigned long gdtmachpfn;
 	int error, gsel_tss, metadata_missing, x, pa;
 	size_t kstack0_sz;
 	struct pcpu *pc;
 	struct callback_register event = {
 		.type = CALLBACKTYPE_event,
 		.address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback },
 	};
 	struct callback_register failsafe = {
 		.type = CALLBACKTYPE_failsafe,
 		.address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback },
 	};
 
 	/* The pcb sits at the very top of thread0's kernel stack. */
 	thread0.td_kstack = proc0kstack;
 	thread0.td_kstack_pages = KSTACK_PAGES;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
 
 	/*
  	 * This may be done better later if it gets more high level
  	 * components in it. If so just link td->td_proc here.
 	 */
 	proc_linkup0(&proc0, &thread0);
 
 	/* The warning about missing metadata is deferred until cninit(). */
 	metadata_missing = 0;
 	if (xen_start_info->mod_start) {
 		preload_metadata = (caddr_t)xen_start_info->mod_start;
 		preload_bootstrap_relocate(KERNBASE);
 	} else {
 		metadata_missing = 1;
 	}
 	if (envmode == 1)
 		kern_envp = static_env;
 	else if ((caddr_t)xen_start_info->cmd_line)
 	        kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line);
 
 	boothowto |= xen_boothowto(kern_envp);
 	
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	/*
 	 * XEN occupies a portion of the upper virtual address space 
 	 * At its base it manages an array mapping machine page frames 
 	 * to physical page frames - hence we need to be able to 
 	 * access 4GB - (64MB  - 4MB + 64k) 
 	 */
 	gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 	gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 	gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 	gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 	gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 	gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 	gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 	gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 
 	pc = &__pcpu[0];
 	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
 	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
 
 	/*
 	 * The GDT page is mapped writable while the descriptors are
 	 * filled in, then remapped without PG_RW before it is handed
 	 * to the hypervisor.
 	 */
 	PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW);
 	bzero(gdt, PAGE_SIZE);
 	for (x = 0; x < NGDT; x++)
 		ssdtosd(&gdt_segs[x], &gdt[x].sd);
 
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
 
 	gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT;
 	PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V);
 	PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0);	
 	lgdt(&r_gdt);
 	gdtset = 1;
 
 	if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) {
 		panic("set_trap_table failed - error %d\n", error);
 	}
 	
 	/*
 	 * Register the event and failsafe callbacks; the failsafe one is
 	 * only attempted when the event registration succeeded.
 	 */
 	error = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
 	if (error == 0)
 		error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
 #if	CONFIG_XEN_COMPAT <= 0x030002
 	/* Fall back to the older interface if callback_op is not there. */
 	if (error == -ENOXENSYS)
 		HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL),
 		    (unsigned long)Xhypervisor_callback,
 		    GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
 #endif
 	/*
 	 * Set up per-CPU data for CPU 0: map DPCPU_SIZE bytes starting at
 	 * 'first', then advance first, physfree and init_first past them.
 	 */
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
 		pmap_kenter(pa + KERNBASE, pa);
 	dpcpu_init((void *)(first + KERNBASE), 0);
 	first += DPCPU_SIZE;
 	physfree += DPCPU_SIZE;
 	init_first += DPCPU_SIZE / PAGE_SIZE;
 
 	PCPU_SET(prvspace, pc);
 	PCPU_SET(curthread, &thread0);
 	PCPU_SET(curpcb, thread0.td_pcb);
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
 
 	/* make ldt memory segments */
 	PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW);
 	bzero(ldt, PAGE_SIZE);
 	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
 	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
 	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
 		ssdtosd(&ldt_segs[x], &ldt[x].sd);
 
 	default_proc_ldt.ldt_base = (caddr_t)ldt;
 	default_proc_ldt.ldt_len = 6;
 	_default_ldt = (int)&default_proc_ldt;
 	PCPU_SET(currentldt, _default_ldt);
 	/* Drop write permission on the LDT page before installing it. */
 	PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW);
 	xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0]));
 	
 #if defined(XEN_PRIVILEGED)
 	/*
 	 * Initialize the i8254 before the console so that console
 	 * initialization can use DELAY().
 	 */
 	i8254_init();
 #endif
 	
 	/*
 	 * Initialize the console before we print anything out.
 	 */
 	cninit();
 
 	if (metadata_missing)
 		printf("WARNING: loader(8) metadata is missing!\n");
 
 #ifdef DEV_ISA
 #ifdef DEV_ATPIC
 	elcr_probe();
 	atpic_startup();
 #else
 	/* Reset and mask the atpics and leave them shut down. */
 	atpic_reset();
 
 	/*
 	 * Point the ICU spurious interrupt vectors at the APIC spurious
 	 * interrupt handler.
 	 */
 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #endif
 #endif
 
 #ifdef DDB
 	ksym_start = bootinfo.bi_symtab;
 	ksym_end = bootinfo.bi_esymtab;
 #endif
 
 	kdb_init();
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 #endif
 
 	finishidentcpu();	/* Final stage of CPU initialization */
 	/*
 	 * NOTE(review): the #UD and #GP vectors are (re)installed right
 	 * after finishidentcpu(); presumably that routine may have
 	 * replaced them temporarily - confirm.
 	 */
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	initializecpu();	/* Initialize CPU registers */
 
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
 	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
 	    kstack0_sz - sizeof(struct pcb) - 16);
 	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
 	/* gsel_tss is assigned here but not used again in this variant. */
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL),
 	    PCPU_GET(common_tss.tss_esp0));
 	
 	/* pointer to selector slot for %fs/%gs */
 	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
 
 	/*
 	 * Fill in the double-fault TSS: every stack pointer refers to the
 	 * end of dblfault_stack and execution starts at dblfault_handler.
 	 */
 	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
 	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
 	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
 	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
 #ifdef PAE
 	dblfault_tss.tss_cr3 = (int)IdlePDPT;
 #else
 	dblfault_tss.tss_cr3 = (int)IdlePTD;
 #endif
 	dblfault_tss.tss_eip = (int)dblfault_handler;
 	dblfault_tss.tss_eflags = PSL_KERNEL;
 	dblfault_tss.tss_ds = dblfault_tss.tss_es =
 	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
 	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
 	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
 	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
 
 	vm86_initialize();
 	getmemsize(first);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 	msgbufinit(msgbufp, msgbufsize);
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
 #ifdef PAE
 	thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
 #else
 	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
 #endif
 	thread0.td_pcb->pcb_ext = 0;
 	thread0.td_frame = &proc0_tf;
 	/* Seed the pcb's %fs/%gs descriptors from the two GDT slots. */
 	thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0];
 	thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1];
 
 	cpu_probe_amdc1e();
 }
 
 #else
 /*
  * Early machine-dependent initialization, native (non-XEN) variant.
  *
  * Sets up thread0/proc0, the boot environment, the GDT, LDT and IDT,
  * per-CPU data, the i8254, the console, the debugger hooks, the TSS
  * and the double-fault TSS, then sizes memory with getmemsize(),
  * builds the LDT call gate and finishes proc0's pcb.
  *
  * 'first' is used as a physical address: the DPCPU area is mapped
  * starting there, and the value advanced past that area is what
  * getmemsize() receives.
  */
 void
 init386(first)
 	int first;
 {
 	struct gate_descriptor *gdp;
 	int gsel_tss, metadata_missing, x, pa;
 	size_t kstack0_sz;
 	struct pcpu *pc;
 
 	/* The pcb sits at the very top of thread0's kernel stack. */
 	thread0.td_kstack = proc0kstack;
 	thread0.td_kstack_pages = KSTACK_PAGES;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
 
 	/*
  	 * This may be done better later if it gets more high level
  	 * components in it. If so just link td->td_proc here.
 	 */
 	proc_linkup0(&proc0, &thread0);
 
 	/*
 	 * bootinfo holds addresses that need KERNBASE added before they
 	 * can be dereferenced here.  The warning about missing metadata
 	 * is deferred until cninit() has run.
 	 */
 	metadata_missing = 0;
 	if (bootinfo.bi_modulep) {
 		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
 		preload_bootstrap_relocate(KERNBASE);
 	} else {
 		metadata_missing = 1;
 	}
 	if (envmode == 1)
 		kern_envp = static_env;
 	else if (bootinfo.bi_envp)
 		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	/*
 	 * Make gdt memory segments.  All segments cover the full 4GB
 	 * of address space and permissions are enforced at page level.
 	 */
 	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
 
 	/* The private segment is based at CPU 0's pcpu structure. */
 	pc = &__pcpu[0];
 	gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
 	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
 
 	for (x = 0; x < NGDT; x++)
 		ssdtosd(&gdt_segs[x], &gdt[x].sd);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base =  (int) gdt;
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
 	lgdt(&r_gdt);
 
 	/*
 	 * Set up per-CPU data for CPU 0: map DPCPU_SIZE bytes starting at
 	 * 'first' and advance 'first' past them.
 	 */
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
 		pmap_kenter(pa + KERNBASE, pa);
 	dpcpu_init((void *)(first + KERNBASE), 0);
 	first += DPCPU_SIZE;
 	PCPU_SET(prvspace, pc);
 	PCPU_SET(curthread, &thread0);
 	PCPU_SET(curpcb, thread0.td_pcb);
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
 
 	/* make ldt memory segments */
 	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
 	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
 	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
 		ssdtosd(&ldt_segs[x], &ldt[x].sd);
 
 	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
 	lldt(_default_ldt);
 	PCPU_SET(currentldt, _default_ldt);
 
 	/*
 	 * exceptions
 	 *
 	 * Every vector first gets the "rsvd" handler; the specific ones
 	 * are then overridden.  The IDT_BP, IDT_OF and IDT_SYSCALL gates
 	 * (plus the optional DTrace and Xen ones) are installed with
 	 * SEL_UPL instead of SEL_KPL, and IDT_DF is a task gate through
 	 * GPANIC_SEL rather than a handler address.
 	 */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
 		    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DE, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
  	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYS386IGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL
 	    , GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DF, 0,  SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
 	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_PF, &IDTVEC(page),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
  	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #ifdef KDTRACE_HOOKS
 	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #endif
 #ifdef XENHVM
 	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #endif
 
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (int) idt;
 	lidt(&r_idt);
 
 #ifdef XBOX
 	/*
 	 * The following code queries the PCI ID of 0:0:0. For the XBOX,
 	 * This should be 0x10de / 0x02a5.
 	 *
 	 * This is exactly what Linux does.
 	 */
 	outl(0xcf8, 0x80000000);
 	if (inl(0xcfc) == 0x02a510de) {
 		arch_i386_is_xbox = 1;
 		pic16l_setled(XBOX_LED_GREEN);
 
 		/*
 		 * We are an XBOX, but we may have either 64MB or 128MB of
 		 * memory. The PCI host bridge should be programmed for this,
 		 * so we just query it. 
 		 */
 		outl(0xcf8, 0x80000084);
 		arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
 	}
 #endif /* XBOX */
 
 	/*
 	 * Initialize the i8254 before the console so that console
 	 * initialization can use DELAY().
 	 */
 	i8254_init();
 
 	/*
 	 * Initialize the console before we print anything out.
 	 */
 	cninit();
 
 	if (metadata_missing)
 		printf("WARNING: loader(8) metadata is missing!\n");
 
 #ifdef DEV_ISA
 #ifdef DEV_ATPIC
 	elcr_probe();
 	atpic_startup();
 #else
 	/* Reset and mask the atpics and leave them shut down. */
 	atpic_reset();
 
 	/*
 	 * Point the ICU spurious interrupt vectors at the APIC spurious
 	 * interrupt handler.
 	 */
 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #endif
 #endif
 
 #ifdef DDB
 	ksym_start = bootinfo.bi_symtab;
 	ksym_end = bootinfo.bi_esymtab;
 #endif
 
 	kdb_init();
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 #endif
 
 	finishidentcpu();	/* Final stage of CPU initialization */
 	/*
 	 * NOTE(review): the #UD and #GP vectors are installed a second
 	 * time right after finishidentcpu(); presumably that routine may
 	 * have replaced them temporarily - confirm.
 	 */
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	initializecpu();	/* Initialize CPU registers */
 
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
 	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
 	    kstack0_sz - sizeof(struct pcb) - 16);
 	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	/* Remember the TSS descriptor slot and keep a copy of its contents. */
 	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
 	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
 	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
 	ltr(gsel_tss);
 
 	/* pointer to selector slot for %fs/%gs */
 	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
 
 	/*
 	 * Fill in the double-fault TSS: every stack pointer refers to the
 	 * end of dblfault_stack and execution starts at dblfault_handler.
 	 */
 	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
 	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
 	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
 	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
 #ifdef PAE
 	dblfault_tss.tss_cr3 = (int)IdlePDPT;
 #else
 	dblfault_tss.tss_cr3 = (int)IdlePTD;
 #endif
 	dblfault_tss.tss_eip = (int)dblfault_handler;
 	dblfault_tss.tss_eflags = PSL_KERNEL;
 	dblfault_tss.tss_ds = dblfault_tss.tss_es =
 	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
 	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
 	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
 	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
 
 	vm86_initialize();
 	getmemsize(first);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 	msgbufinit(msgbufp, msgbufsize);
 
 	/*
 	 * make a call gate to reenter kernel with
 	 * (the handler address is split into its low part and its upper
 	 * 16 bits across gd_looffset / gd_hioffset).
 	 */
 	gdp = &ldt[LSYS5CALLS_SEL].gd;
 
 	x = (int) &IDTVEC(lcall_syscall);
 	gdp->gd_looffset = x;
 	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
 	gdp->gd_stkcpy = 1;
 	gdp->gd_type = SDT_SYS386CGT;
 	gdp->gd_dpl = SEL_UPL;
 	gdp->gd_p = 1;
 	gdp->gd_hioffset = x >> 16;
 
 	/* XXX does this work? */
 	/* XXX yes! */
 	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
 	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
 
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
 #ifdef PAE
 	thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
 #else
 	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
 #endif
 	thread0.td_pcb->pcb_ext = 0;
 	thread0.td_frame = &proc0_tf;
 
 	cpu_probe_amdc1e();
 
 #ifdef FDT
 	x86_init_fdt();
 #endif
 }
 #endif
 
 /*
  * Machine-dependent part of per-CPU structure setup: store 0xffffffff
  * in the structure's ACPI id field (NOTE(review): presumably a "not yet
  * known" marker - confirm).  The cpuid and size arguments are not used.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 	struct pcpu *pc;

 	pc = pcpu;
 	pc->pc_acpi_id = 0xffffffff;
 }
 
 /*
  * Enter a spinlock section for the current thread.  The outermost entry
  * disables interrupts and records the previous interrupt state in the
  * thread; nested entries only bump the nesting count.  A critical
  * section is entered in both cases.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *self;
 	register_t saved;

 	self = curthread;
 	if (self->td_md.md_spinlock_count != 0) {
 		/* Already inside a spinlock section: just nest deeper. */
 		self->td_md.md_spinlock_count++;
 	} else {
 		/* Interrupts go off before any per-thread state is touched. */
 		saved = intr_disable();
 		self->td_md.md_spinlock_count = 1;
 		self->td_md.md_saved_flags = saved;
 	}
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section for the current thread.  The critical section
  * is exited first; when the nesting count drops to zero the interrupt
  * state recorded by the outermost spinlock_enter() is restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *self;
 	register_t saved;

 	self = curthread;
 	critical_exit();
 	/* Fetch the saved state before the count is decremented. */
 	saved = self->td_md.md_saved_flags;
 	if (--self->td_md.md_spinlock_count == 0)
 		intr_restore(saved);
 }
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 static void f00f_hack(void *unused);
 SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
 
 static void
 f00f_hack(void *unused)
 {
 	struct gate_descriptor *new_idt;
 	vm_offset_t tmp;
 
 	if (!has_f00f_bug)
 		return;
 
 	GIANT_REQUIRED;
 
 	printf("Intel Pentium detected, installing workaround for F00F bug\n");
 
 	tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO);
 	if (tmp == 0)
 		panic("kmem_malloc returned 0");
 
 	/* Put the problematic entry (#6) at the end of the lower page. */
 	new_idt = (struct gate_descriptor*)
 	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
 	bcopy(idt, new_idt, sizeof(idt0));
 	r_idt.rd_base = (u_int)new_idt;
 	lidt(&r_idt);
 	idt = new_idt;
 	pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
 }
 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
 
 /*
  * Build a PCB out of a trapframe.  kdb_trap() calls this so that a
  * backtrace can begin at the function that brought us into the
  * debugger: the register context lives in the trapframe, while the
  * trace itself is driven from the PCB.  The result only has to be good
  * enough for a backtrace, not a complete PCB.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {

 	/* Program counter and frame pointer. */
 	pcb->pcb_eip = tf->tf_eip;
 	pcb->pcb_ebp = tf->tf_ebp;

 	/* Remaining registers kept in the PCB. */
 	pcb->pcb_ebx = tf->tf_ebx;
 	pcb->pcb_esi = tf->tf_esi;
 	pcb->pcb_edi = tf->tf_edi;

 	/*
 	 * Stack pointer: taken from the frame when ISPL(tf_cs) is
 	 * non-zero, otherwise computed as 8 bytes below the end of the
 	 * trapframe (NOTE(review): presumably because a frame from a
 	 * same-privilege trap lacks the trailing esp/ss words - confirm).
 	 */
 	if (ISPL(tf->tf_cs))
 		pcb->pcb_esp = tf->tf_esp;
 	else
 		pcb->pcb_esp = (int)(tf + 1) - 8;
 }
 
 /*
  * Set the saved instruction pointer (tf_eip) in the thread's trapframe
  * to addr.  Always returns 0.
  */
 int
 ptrace_set_pc(struct thread *td, u_long addr)
 {
 	struct trapframe *frame;

 	frame = td->td_frame;
 	frame->tf_eip = addr;
 	return (0);
 }
 
 int
 ptrace_single_step(struct thread *td)
 {
 	td->td_frame->tf_eflags |= PSL_T;
 	return (0);
 }
 
 int
 ptrace_clear_single_step(struct thread *td)
 {
 	td->td_frame->tf_eflags &= ~PSL_T;
 	return (0);
 }
 
 /*
  * Export the thread's register state into *regs: %gs is read from the
  * PCB and everything else is filled in from the trapframe by
  * fill_frame_regs(), whose return value is passed through.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {

 	regs->r_gs = td->td_pcb->pcb_gs;
 	return (fill_frame_regs(td->td_frame, regs));
 }
 
 /*
  * Copy the registers held in trapframe *tp into *regs.  r_gs is not
  * touched here.  Always returns 0.
  */
 int
 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 {

 	/* General-purpose registers. */
 	regs->r_eax = tp->tf_eax;
 	regs->r_ebx = tp->tf_ebx;
 	regs->r_ecx = tp->tf_ecx;
 	regs->r_edx = tp->tf_edx;
 	regs->r_esi = tp->tf_esi;
 	regs->r_edi = tp->tf_edi;
 	regs->r_ebp = tp->tf_ebp;
 	regs->r_esp = tp->tf_esp;

 	/* Instruction pointer and flags. */
 	regs->r_eip = tp->tf_eip;
 	regs->r_eflags = tp->tf_eflags;

 	/* Segment registers. */
 	regs->r_cs = tp->tf_cs;
 	regs->r_ss = tp->tf_ss;
 	regs->r_ds = tp->tf_ds;
 	regs->r_es = tp->tf_es;
 	regs->r_fs = tp->tf_fs;
 	return (0);
 }
 
 /*
  * Load the thread's register state from *regs.  The request is refused
  * with EINVAL, and nothing is modified, unless the new eflags pass
  * EFL_SECURE() against the current ones and the new %cs passes
  * CS_SECURE().  On success the trapframe is updated, %gs is stored in
  * the PCB, and 0 is returned.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame;

 	frame = td->td_frame;
 	if (!(EFL_SECURE(regs->r_eflags, frame->tf_eflags) &&
 	    CS_SECURE(regs->r_cs)))
 		return (EINVAL);

 	/* General-purpose registers. */
 	frame->tf_eax = regs->r_eax;
 	frame->tf_ebx = regs->r_ebx;
 	frame->tf_ecx = regs->r_ecx;
 	frame->tf_edx = regs->r_edx;
 	frame->tf_esi = regs->r_esi;
 	frame->tf_edi = regs->r_edi;
 	frame->tf_ebp = regs->r_ebp;
 	frame->tf_esp = regs->r_esp;

 	/* Instruction pointer and flags. */
 	frame->tf_eip = regs->r_eip;
 	frame->tf_eflags = regs->r_eflags;

 	/* Segment registers; %gs is kept in the PCB. */
 	frame->tf_cs = regs->r_cs;
 	frame->tf_ss = regs->r_ss;
 	frame->tf_ds = regs->r_ds;
 	frame->tf_es = regs->r_es;
 	frame->tf_fs = regs->r_fs;
 	td->td_pcb->pcb_gs = regs->r_gs;
 	return (0);
 }
 
 #ifdef CPU_ENABLE_SSE
 /*
  * Translate a struct savexmm image into the struct save87 layout.  The
  * destination is zeroed first, then the control/status environment
  * words are copied one field at a time, followed by the eight FP
  * accumulators.
  */
 static void
 fill_fpregs_xmm(sv_xmm, sv_87)
 	struct savexmm *sv_xmm;
 	struct save87 *sv_87;
 {
 	struct envxmm *src;
 	struct env87 *dst;
 	int reg;

 	bzero(sv_87, sizeof(*sv_87));

 	src = &sv_xmm->sv_env;
 	dst = &sv_87->sv_env;

 	/* Control, status and tag words. */
 	dst->en_cw = src->en_cw;
 	dst->en_sw = src->en_sw;
 	dst->en_tw = src->en_tw;

 	/* Remaining environment fields. */
 	dst->en_fip = src->en_fip;
 	dst->en_fcs = src->en_fcs;
 	dst->en_opcode = src->en_opcode;
 	dst->en_foo = src->en_foo;
 	dst->en_fos = src->en_fos;

 	/* The eight accumulators. */
 	for (reg = 0; reg < 8; reg++)
 		sv_87->sv_ac[reg] = sv_xmm->sv_fp[reg].fp_acc;
 }
 
 /*
  * Load a struct savexmm image from a struct save87 one: the
  * control/status environment words are copied one field at a time,
  * followed by the eight FP accumulators.  Other parts of the
  * destination are left as they were.
  */
 static void
 set_fpregs_xmm(sv_87, sv_xmm)
 	struct save87 *sv_87;
 	struct savexmm *sv_xmm;
 {
 	struct env87 *src;
 	struct envxmm *dst;
 	int reg;

 	src = &sv_87->sv_env;
 	dst = &sv_xmm->sv_env;

 	/* Control, status and tag words. */
 	dst->en_cw = src->en_cw;
 	dst->en_sw = src->en_sw;
 	dst->en_tw = src->en_tw;

 	/* Remaining environment fields. */
 	dst->en_fip = src->en_fip;
 	dst->en_fcs = src->en_fcs;
 	dst->en_opcode = src->en_opcode;
 	dst->en_foo = src->en_foo;
 	dst->en_fos = src->en_fos;

 	/* The eight accumulators. */
 	for (reg = 0; reg < 8; reg++)
 		sv_xmm->sv_fp[reg].fp_acc = sv_87->sv_ac[reg];
 }
 #endif /* CPU_ENABLE_SSE */
 
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 	    P_SHOULDSTOP(td->td_proc),
 	    ("not suspended thread %p", td));
 #ifdef DEV_NPX
 	npxgetregs(td);
 #else
 	bzero(fpregs, sizeof(*fpregs));
 #endif
 #ifdef CPU_ENABLE_SSE
 	if (cpu_fxsr)
 		fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm,
 		    (struct save87 *)fpregs);
 	else
 #endif /* CPU_ENABLE_SSE */
 		bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs,
 		    sizeof(*fpregs));
 	return (0);
 }
 
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 #ifdef CPU_ENABLE_SSE
 	if (cpu_fxsr)
 		set_fpregs_xmm((struct save87 *)fpregs,
 		    &td->td_pcb->pcb_user_save.sv_xmm);
 	else
 #endif /* CPU_ENABLE_SSE */
 		bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87,
 		    sizeof(*fpregs));
 #ifdef DEV_NPX
 	npxuserinited(td);
 #endif
 	return (0);
 }
 
 /*
  * Get machine context.
  *
  * Fill *mcp from td's trap frame and pcb: segment and general registers,
  * eflags, the FPU state (via get_fpcontext()) and the %fs/%gs base
  * addresses taken from the pcb's segment descriptors.
  *
  * If GET_MC_CLEAR_RET is set in flags, eax and edx are reported as zero and
  * the carry bit is cleared in the saved eflags instead of copying the
  * frame's values.  Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct segment_descriptor *desc;
 	struct trapframe *frame;
 	struct pcb *pcb;
 
 	frame = td->td_frame;
 	pcb = td->td_pcb;
 
 	/* sigonstack() is called with the current process locked. */
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(frame->tf_esp);
 	PROC_UNLOCK(curthread->td_proc);
 
 	/* Segment registers; %gs lives in the pcb, not in the frame. */
 	mcp->mc_gs = pcb->pcb_gs;
 	mcp->mc_fs = frame->tf_fs;
 	mcp->mc_es = frame->tf_es;
 	mcp->mc_ds = frame->tf_ds;
 	mcp->mc_cs = frame->tf_cs;
 	mcp->mc_ss = frame->tf_ss;
 
 	/* General-purpose registers that are always copied. */
 	mcp->mc_edi = frame->tf_edi;
 	mcp->mc_esi = frame->tf_esi;
 	mcp->mc_ebp = frame->tf_ebp;
 	mcp->mc_isp = frame->tf_isp;
 	mcp->mc_ebx = frame->tf_ebx;
 	mcp->mc_ecx = frame->tf_ecx;
 	mcp->mc_eip = frame->tf_eip;
 	mcp->mc_esp = frame->tf_esp;
 
 	/* Return-value registers and flags depend on GET_MC_CLEAR_RET. */
 	mcp->mc_eflags = frame->tf_eflags;
 	if ((flags & GET_MC_CLEAR_RET) != 0) {
 		mcp->mc_eax = 0;
 		mcp->mc_edx = 0;
 		mcp->mc_eflags &= ~PSL_C;
 	} else {
 		mcp->mc_eax = frame->tf_eax;
 		mcp->mc_edx = frame->tf_edx;
 	}
 
 	mcp->mc_len = sizeof(*mcp);
 	get_fpcontext(td, mcp);
 
 	/* Reassemble the %fs and %gs bases from their split descriptor fields. */
 	desc = &pcb->pcb_fsd;
 	mcp->mc_fsbase = (desc->sd_hibase << 24) | desc->sd_lobase;
 	desc = &pcb->pcb_gsd;
 	mcp->mc_gsbase = (desc->sd_hibase << 24) | desc->sd_lobase;
 
 	mcp->mc_flags = 0;
 	bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  */
 int
 set_mcontext(struct thread *td, const mcontext_t *mcp)
 {
 	struct trapframe *tp;
 	int eflags, ret;
 
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 	eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
 	    (tp->tf_eflags & ~PSL_USERCHANGE);
 	if ((ret = set_fpcontext(td, mcp)) == 0) {
 		tp->tf_fs = mcp->mc_fs;
 		tp->tf_es = mcp->mc_es;
 		tp->tf_ds = mcp->mc_ds;
 		tp->tf_edi = mcp->mc_edi;
 		tp->tf_esi = mcp->mc_esi;
 		tp->tf_ebp = mcp->mc_ebp;
 		tp->tf_ebx = mcp->mc_ebx;
 		tp->tf_edx = mcp->mc_edx;
 		tp->tf_ecx = mcp->mc_ecx;
 		tp->tf_eax = mcp->mc_eax;
 		tp->tf_eip = mcp->mc_eip;
 		tp->tf_eflags = eflags;
 		tp->tf_esp = mcp->mc_esp;
 		tp->tf_ss = mcp->mc_ss;
 		td->td_pcb->pcb_gs = mcp->mc_gs;
 		ret = 0;
 	}
 	return (ret);
 }
 
 /*
  * Fill in the FPU portion of *mcp (mc_fpformat, mc_ownedfp, mc_fpstate)
  * for thread td.  Without DEV_NPX the context is marked as having no FPU
  * device and no owned state, and the state area is zeroed.
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp)
 {
 
 #ifdef DEV_NPX
 	/* Record the value npxgetregs() reports, then copy the pcb image. */
 	mcp->mc_ownedfp = npxgetregs(td);
 	bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate[0],
 	    sizeof(mcp->mc_fpstate));
 	mcp->mc_fpformat = npxformat();
 #else /* !DEV_NPX */
 	mcp->mc_fpformat = _MC_FPFMT_NODEV;
 	mcp->mc_ownedfp = _MC_FPOWNED_NONE;
 	bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
 #endif /* DEV_NPX */
 }
 
 /*
  * Install the FPU portion of *mcp into thread td.
  *
  * Returns 0 on success and EINVAL when mc_fpformat or mc_ownedfp holds a
  * value that is not recognised.  A context whose format is
  * _MC_FPFMT_NODEV is accepted without doing anything.
  */
 static int
 set_fpcontext(struct thread *td, const mcontext_t *mcp)
 {
 
 	/* Nothing to load for a context that recorded no FPU device. */
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 
 	/* Only the 387 and XMM save formats are accepted. */
 	if (mcp->mc_fpformat != _MC_FPFMT_387 &&
 	    mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 
 	if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		return (0);
 	}
 
 	if (mcp->mc_ownedfp != _MC_FPOWNED_FPU &&
 	    mcp->mc_ownedfp != _MC_FPOWNED_PCB)
 		return (EINVAL);
 
 #ifdef DEV_NPX
 #ifdef CPU_ENABLE_SSE
 	/*
 	 * Mask the supplied mxcsr with cpu_mxcsr_mask before loading it.
 	 * Note that this writes through the const-qualified mcp.
 	 */
 	if (cpu_fxsr)
 		((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env.
 		    en_mxcsr &= cpu_mxcsr_mask;
 #endif
 	npxsetregs(td, (union savefpu *)&mcp->mc_fpstate);
 #endif
 	return (0);
 }
 
 /*
  * Discard the user FPU state of thread td: drop the hardware state if td
  * currently owns the FPU on this CPU (DEV_NPX only), and clear the
  * PCB_NPXINITDONE and PCB_NPXUSERINITDONE pcb flags.  The whole operation
  * runs inside a critical section.  Asserts that td's pcb holds user (not
  * kernel) FPU state.
  */
 static void
 fpstate_drop(struct thread *td)
 {
 
 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 	critical_enter();
 #ifdef DEV_NPX
 	/* Only touch the hardware when td is this CPU's FPU owner. */
 	if (PCPU_GET(fpcurthread) == td)
 		npxdrop();
 #endif
 	/*
 	 * XXX force a full drop of the npx.  The above only drops it if we
 	 * owned it.  npxgetregs() has the same bug in the !cpu_fxsr case.
 	 *
 	 * XXX I don't much like npxgetregs()'s semantics of doing a full
 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
 	 * sendsig() is the only caller of npxgetregs()... perhaps we just
 	 * have too many layers.
 	 *
 	 * NOTE(review): the assertion and the ownership test above use td,
 	 * but the flags are cleared on curthread's pcb.  This is only
 	 * equivalent when td == curthread -- confirm that every caller
 	 * guarantees that.
 	 */
 	curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
 	    PCB_NPXUSERINITDONE);
 	critical_exit();
 }
 
 /*
  * Report debug register values in *dbregs.
  *
  * With a thread, the values come from the copies saved in its pcb; dr4 and
  * dr5 are not kept there and are reported as 0.  With td == NULL, all
  * eight registers are read directly from the CPU.  Always returns 0.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 
 	if (td != NULL) {
 		pcb = td->td_pcb;
 		dbregs->dr[0] = pcb->pcb_dr0;
 		dbregs->dr[1] = pcb->pcb_dr1;
 		dbregs->dr[2] = pcb->pcb_dr2;
 		dbregs->dr[3] = pcb->pcb_dr3;
 		dbregs->dr[4] = 0;
 		dbregs->dr[5] = 0;
 		dbregs->dr[6] = pcb->pcb_dr6;
 		dbregs->dr[7] = pcb->pcb_dr7;
 		return (0);
 	}
 
 	/* No thread given: read the live registers. */
 	dbregs->dr[0] = rdr0();
 	dbregs->dr[1] = rdr1();
 	dbregs->dr[2] = rdr2();
 	dbregs->dr[3] = rdr3();
 	dbregs->dr[4] = rdr4();
 	dbregs->dr[5] = rdr5();
 	dbregs->dr[6] = rdr6();
 	dbregs->dr[7] = rdr7();
 	return (0);
 }
 
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr4(dbregs->dr[4]);
 		load_dr5(dbregs->dr[5]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 	} else {
 		/*
 		 * Don't let an illegal value for dr7 get set.	Specifically,
 		 * check for undefined settings.  Setting these bit patterns
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP.
 		 */
 		for (i = 0; i < 4; i++) {
 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 			if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 		}
 		
 		pcb = td->td_pcb;
 		
 		/*
 		 * Don't let a process set a breakpoint that is not within the
 		 * process's address space.  If a process could do this, it
 		 * could halt the system by setting a breakpoint in the kernel
 		 * (if ddb was enabled).  Thus, we need to check to make sure
 		 * that no breakpoints are being enabled for addresses outside
 		 * process's address space.
 		 *
 		 * XXX - what about when the watched area of the user's
 		 * address space is written into from within the kernel
 		 * ... wouldn't that still cause a breakpoint to be generated
 		 * from within kernel mode?
 		 */
 
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 
 		pcb->pcb_dr0 = dbregs->dr[0];
 		pcb->pcb_dr1 = dbregs->dr[1];
 		pcb->pcb_dr2 = dbregs->dr[2];
 		pcb->pcb_dr3 = dbregs->dr[3];
 		pcb->pcb_dr6 = dbregs->dr[6];
 		pcb->pcb_dr7 = dbregs->dr[7];
 
 		pcb->pcb_flags |= PCB_DBREGS;
 	}
 
 	return (0);
 }
 
 /*
  * Return > 0 if a hardware breakpoint has been hit, and the
  * breakpoint was in user space.  Return 0, otherwise.
  */
 int
 user_dbreg_trap(void)
 {
         u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
         u_int32_t bp;       /* breakpoint bits extracted from dr6 */
         int nbp;            /* number of breakpoints that triggered */
         caddr_t addr[4];    /* breakpoint addresses */
         int i;
         
         dr7 = rdr7();
         if ((dr7 & 0x000000ff) == 0) {
                 /*
                  * all GE and LE bits in the dr7 register are zero,
                  * thus the trap couldn't have been caused by the
                  * hardware debug registers
                  */
                 return 0;
         }
 
         nbp = 0;
         dr6 = rdr6();
         bp = dr6 & 0x0000000f;
 
         if (!bp) {
                 /*
                  * None of the breakpoint bits are set meaning this
                  * trap was not caused by any of the debug registers
                  */
                 return 0;
         }
 
         /*
          * at least one of the breakpoints were hit, check to see
          * which ones and if any of them are user space addresses
          */
 
         if (bp & 0x01) {
                 addr[nbp++] = (caddr_t)rdr0();
         }
         if (bp & 0x02) {
                 addr[nbp++] = (caddr_t)rdr1();
         }
         if (bp & 0x04) {
                 addr[nbp++] = (caddr_t)rdr2();
         }
         if (bp & 0x08) {
                 addr[nbp++] = (caddr_t)rdr3();
         }
 
         for (i = 0; i < nbp; i++) {
                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
                         /*
                          * addr[i] is in user space
                          */
                         return nbp;
                 }
         }
 
         /*
          * None of the breakpoints are in user space.
          */
         return 0;
 }
 
 #ifdef KDB
 
 /*
  * Provide inb() and outb() as functions.  They are normally only available as
  * inline functions, thus cannot be called from the debugger.
  */
 
 /* silence compiler warnings */
 u_char inb_(u_short);
 void outb_(u_short, u_char);
 
 /* Callable (non-inline) wrapper around inb(): read one byte from port. */
 u_char
 inb_(u_short port)
 {
 	u_char value;
 
 	value = inb(port);
 	return (value);
 }
 
 /* Callable (non-inline) wrapper around outb(): write data to port. */
 void
 outb_(u_short port, u_char data)
 {
 
 	outb(port, data);
 	return;
 }
 
 #endif /* KDB */
Index: head/sys/i386/i386/trap.c
===================================================================
--- head/sys/i386/i386/trap.c	(revision 258540)
+++ head/sys/i386/i386/trap.c	(revision 258541)
@@ -1,1177 +1,1176 @@
 /*-
  * Copyright (C) 1994, David Greenman
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the University of Utah, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * 386 Trap and System call handling
  */
 
 #include "opt_clock.h"
 #include "opt_cpu.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_isa.h"
 #include "opt_kdb.h"
-#include "opt_kdtrace.h"
 #include "opt_npx.h"
 #include "opt_trap.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/ptrace.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DEFINE( , , page_fault, all);
 PMC_SOFT_DEFINE( , , page_fault, read);
 PMC_SOFT_DEFINE( , , page_fault, write);
 #endif
 #include <security/audit/audit.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <machine/tss.h>
 #include <machine/vm86.h>
 
 #ifdef POWERFAIL_NMI
 #include <sys/syslog.h>
 #include <machine/clock.h>
 #endif
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 /*
  * This is a hook which is initialised by the dtrace module
  * to handle traps which might occur during DTrace probe
  * execution.  trap() calls it for T_PROTFLT and T_PAGEFLT when it is
  * non-NULL.
  */
 dtrace_trap_func_t	dtrace_trap_func;
 
 /*
  * NOTE(review): not referenced in the visible part of this file;
  * presumably the double-fault counterpart of dtrace_trap_func -- confirm.
  */
 dtrace_doubletrap_func_t	dtrace_doubletrap_func;
 
 /*
  * This is a hook which is initialised by the systrace module
  * when it is loaded. This keeps the DTrace syscall provider
  * implementation opaque.
  */
 systrace_probe_func_t	systrace_probe_func;
 
 /*
  * These hooks are necessary for the pid and usdt providers.
  * trap() calls dtrace_pid_probe_ptr on T_BPTFLT and
  * dtrace_return_probe_ptr on T_DTRACE_RET when they are non-NULL.
  */
 dtrace_pid_probe_ptr_t		dtrace_pid_probe_ptr;
 dtrace_return_probe_ptr_t	dtrace_return_probe_ptr;
 #endif
 
 extern void trap(struct trapframe *frame);
 extern void syscall(struct trapframe *frame);
 
 static int trap_pfault(struct trapframe *, int, vm_offset_t);
 static void trap_fatal(struct trapframe *, vm_offset_t);
 void dblfault_handler(void);
 
 extern inthand_t IDTVEC(lcall_syscall);
 
 /*
  * Human-readable trap names, indexed by trap type number (the T_* value
  * noted beside each entry).  MAX_TRAP_MSG is the highest valid index, so
  * callers must check "type <= MAX_TRAP_MSG" before indexing.  Unused trap
  * numbers map to the empty string.
  */
 #define MAX_TRAP_MSG		32
 static char *trap_msg[] = {
 	"",					/*  0 unused */
 	"privileged instruction fault",		/*  1 T_PRIVINFLT */
 	"",					/*  2 unused */
 	"breakpoint instruction fault",		/*  3 T_BPTFLT */
 	"",					/*  4 unused */
 	"",					/*  5 unused */
 	"arithmetic trap",			/*  6 T_ARITHTRAP */
 	"",					/*  7 unused */
 	"",					/*  8 unused */
 	"general protection fault",		/*  9 T_PROTFLT */
 	"trace trap",				/* 10 T_TRCTRAP */
 	"",					/* 11 unused */
 	"page fault",				/* 12 T_PAGEFLT */
 	"",					/* 13 unused */
 	"alignment fault",			/* 14 T_ALIGNFLT */
 	"",					/* 15 unused */
 	"",					/* 16 unused */
 	"",					/* 17 unused */
 	"integer divide fault",			/* 18 T_DIVIDE */
 	"non-maskable interrupt trap",		/* 19 T_NMI */
 	"overflow trap",			/* 20 T_OFLOW */
 	"FPU bounds check fault",		/* 21 T_BOUND */
 	"FPU device not available",		/* 22 T_DNA */
 	"double fault",				/* 23 T_DOUBLEFLT */
 	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
 	"invalid TSS fault",			/* 25 T_TSSFLT */
 	"segment not present fault",		/* 26 T_SEGNPFLT */
 	"stack fault",				/* 27 T_STKFLT */
 	"machine check trap",			/* 28 T_MCHK */
 	"SIMD floating-point exception",	/* 29 T_XMMFLT */
 	"reserved (unknown) fault",		/* 30 T_RESERVED */
 	"",					/* 31 unused (reserved) */
 	"DTrace pid return trap",               /* 32 T_DTRACE_RET */
 };
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 extern int has_f00f_bug;
 #endif
 
 #ifdef KDB
 /* machdep.kdb_on_nmi: when non-zero, trap() enters KDB on an NMI. */
 static int kdb_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW,
 	&kdb_on_nmi, 0, "Go to KDB on NMI");
 TUNABLE_INT("machdep.kdb_on_nmi", &kdb_on_nmi);
 #endif
 /* machdep.panic_on_nmi: consulted by trap() when isa_nmi() is non-zero. */
 static int panic_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
 	&panic_on_nmi, 0, "Panic on NMI");
 TUNABLE_INT("machdep.panic_on_nmi", &panic_on_nmi);
 /*
  * machdep.prot_fault_translation: signal chosen by trap() for a user page
  * fault for which trap_pfault() did not return SIGSEGV:
  *   0     - autodetect (SIGSEGV for FreeBSD-ABI processes whose p_osrel is
  *           at least P_OSREL_SIGSEGV, SIGBUS otherwise)
  *   1     - always SIGBUS (compat mode)
  *   other - always SIGSEGV
  */
 static int prot_fault_translation = 0;
 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW,
 	&prot_fault_translation, 0, "Select signal to deliver on protection fault");
 /*
  * machdep.uprintf_signal: when non-zero, trap() prints a diagnostic line
  * with uprintf() before delivering a trap signal.
  */
 static int uprintf_signal;
 SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
     &uprintf_signal, 0,
     "Print debugging information on trap signal to ctty");
 
 /*
  * Exception, fault, and trap interface to the FreeBSD kernel.
  * This common code is called from assembly language IDT gate entry
  * routines that prepare a suitable stack frame, and restore this
  * frame after the exception has been processed.
  *
  * frame->tf_trapno selects the trap type and frame->tf_err supplies the
  * error code.  Processing happens in this order:
  *
  *  1. Traps that are consumed early: NMI IPIs (SMP), re-entry while KDB is
  *     active, T_RESERVED, PMC NMIs (HWPMC_HOOKS), machine checks and the
  *     DTrace hooks (KDTRACE_HOOKS).
  *  2. Handling of traps taken with interrupts disabled, and reading of
  *     %cr2 for page faults.
  *  3. User-mode traps: the trap is mapped to a signal number (i) and
  *     signal code (ucode), which are delivered with trapsignal() before
  *     returning through userret().
  *  4. Kernel-mode traps: either recovered from in place (the function
  *     returns with the frame possibly adjusted) or passed to trap_fatal().
  */
 
 void
 trap(struct trapframe *frame)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	int i = 0, ucode = 0, code;
 	u_int type;
 	register_t addr = 0;
 	vm_offset_t eva;
 	ksiginfo_t ksi;
 #ifdef POWERFAIL_NMI
 	static int lastalert = 0;
 #endif
 
 	PCPU_INC(cnt.v_trap);
 	type = frame->tf_trapno;
 
 #ifdef SMP
 	/* Handler for NMI IPIs used for stopping CPUs. */
 	if (type == T_NMI) {
 	         if (ipi_nmi_handler() == 0)
 	                   goto out;
 	}
 #endif /* SMP */
 
 #ifdef KDB
 	if (kdb_active) {
 		kdb_reenter();
 		goto out;
 	}
 #endif
 
 	if (type == T_RESERVED) {
 		trap_fatal(frame, 0);
 		goto out;
 	}
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * CPU PMCs interrupt using an NMI so we check for that first.
 	 * If the HWPMC module is active, 'pmc_hook' will point to
 	 * the function to be called.  A return value of '1' from the
 	 * hook means that the NMI was handled by it and that we can
 	 * return immediately.
 	 */
 	if (type == T_NMI && pmc_intr &&
 	    (*pmc_intr)(PCPU_GET(cpuid), frame))
 	    goto out;
 #endif
 
 	if (type == T_MCHK) {
 		mca_intr();
 		goto out;
 	}
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * A trap can occur while DTrace executes a probe. Before
 	 * executing the probe, DTrace blocks re-scheduling and sets
 	 * a flag in its per-cpu flags to indicate that it doesn't
 	 * want to fault. On returning from the probe, the no-fault
 	 * flag is cleared and finally re-scheduling is enabled.
 	 *
 	 * If the DTrace kernel module has registered a trap handler,
 	 * call it and if it returns non-zero, assume that it has
 	 * handled the trap and modified the trap frame so that this
 	 * function can return normally.
 	 */
 	if (type == T_DTRACE_RET || type == T_BPTFLT) {
 		struct reg regs;
 
 		/* The pid/return probe hooks return 0 when they consume the trap. */
 		fill_frame_regs(frame, &regs);
 		if (type == T_BPTFLT &&
 		    dtrace_pid_probe_ptr != NULL &&
 		    dtrace_pid_probe_ptr(&regs) == 0)
 			goto out;
 		if (type == T_DTRACE_RET &&
 		    dtrace_return_probe_ptr != NULL &&
 		    dtrace_return_probe_ptr(&regs) == 0)
 			goto out;
 	}
 	if ((type == T_PROTFLT || type == T_PAGEFLT) &&
 	    dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
 		goto out;
 #endif
 
 	if ((frame->tf_eflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled
 		 * interrupts and then trapped.  Enabling interrupts
 		 * now is wrong, but it is better than running with
 		 * interrupts disabled until they are accidentally
 		 * enabled later.
 		 */
 		if (ISPL(frame->tf_cs) == SEL_UPL || (frame->tf_eflags & PSL_VM))
 			uprintf(
 			    "pid %ld (%s): trap %d with interrupts disabled\n",
 			    (long)curproc->p_pid, curthread->td_name, type);
 		else if (type != T_NMI && type != T_BPTFLT &&
 		    type != T_TRCTRAP &&
 		    frame->tf_eip != (int)cpu_switch_load_gs) {
 			/*
 			 * XXX not quite right, since this may be for a
 			 * multiple fault in user mode.
 			 */
 			printf("kernel trap %d with interrupts disabled\n",
 			    type);
 			/*
 			 * Page faults need interrupts disabled until later,
 			 * and we shouldn't enable interrupts while holding
 			 * a spin lock.
 			 */
 			if (type != T_PAGEFLT &&
 			    td->td_md.md_spinlock_count == 0)
 				enable_intr();
 		}
 	}
 	eva = 0;
 	code = frame->tf_err;
 	if (type == T_PAGEFLT) {
 		/*
 		 * For some Cyrix CPUs, %cr2 is clobbered by
 		 * interrupts.  This problem is worked around by using
 		 * an interrupt gate for the pagefault handler.  We
 		 * are finally ready to read %cr2 and conditionally
 		 * reenable interrupts.  If we hold a spin lock, then
 		 * we must not reenable interrupts.  This might be a
 		 * spurious page fault.
 		 */
 		eva = rcr2();
 		if (td->td_md.md_spinlock_count == 0)
 			enable_intr();
 	}
 
         if ((ISPL(frame->tf_cs) == SEL_UPL) ||
 	    ((frame->tf_eflags & PSL_VM) && 
 		!(curpcb->pcb_flags & PCB_VM86CALL))) {
 		/* user trap */
 
 		td->td_pticks = 0;
 		td->td_frame = frame;
 		/* Default fault address is the trapping %eip; T_PAGEFLT overrides it. */
 		addr = frame->tf_eip;
 		if (td->td_ucred != p->p_ucred) 
 			cred_update_thread(td);
 
 		/* Map the trap type to a signal number (i) and code (ucode). */
 		switch (type) {
 		case T_PRIVINFLT:	/* privileged instruction fault */
 			i = SIGILL;
 			ucode = ILL_PRVOPC;
 			break;
 
 		case T_BPTFLT:		/* bpt instruction fault */
 		case T_TRCTRAP:		/* trace trap */
 			enable_intr();
 			frame->tf_eflags &= ~PSL_T;
 			i = SIGTRAP;
 			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 #ifdef DEV_NPX
 			ucode = npxtrap_x87();
 			if (ucode == -1)
 				goto userout;
 #else
 			ucode = 0;
 #endif
 			i = SIGFPE;
 			break;
 
 			/*
 			 * The following two traps can happen in
 			 * vm86 mode, and, if so, we want to handle
 			 * them specially.
 			 */
 		case T_PROTFLT:		/* general protection fault */
 		case T_STKFLT:		/* stack fault */
 			if (frame->tf_eflags & PSL_VM) {
 				i = vm86_emulate((struct vm86frame *)frame);
 				if (i == 0)
 					goto user;
 				break;
 			}
 			i = SIGBUS;
 			ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
 			break;
 		case T_SEGNPFLT:	/* segment not present fault */
 			i = SIGBUS;
 			ucode = BUS_ADRERR;
 			break;
 		case T_TSSFLT:		/* invalid TSS fault */
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 		case T_DOUBLEFLT:	/* double fault */
 		default:
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 
 		case T_PAGEFLT:		/* page fault */
 
 			/*
 			 * As used below, trap_pfault() returns 0 when the
 			 * fault was resolved, -1 to return without going
 			 * through userret(), -2 for the f00f workaround, and
 			 * otherwise the signal number to deliver.
 			 */
 			i = trap_pfault(frame, TRUE, eva);
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 			if (i == -2) {
 				/*
 				 * The f00f hack workaround has triggered, so
 				 * treat the fault as an illegal instruction
 				 * (T_PRIVINFLT) instead of a page fault.
 				 */
 				type = frame->tf_trapno = T_PRIVINFLT;
 
 				/* Proceed as in that case. */
 				ucode = ILL_PRVOPC;
 				i = SIGILL;
 				break;
 			}
 #endif
 			if (i == -1)
 				goto userout;
 			if (i == 0)
 				goto user;
 
 			if (i == SIGSEGV)
 				ucode = SEGV_MAPERR;
 			else {
 				if (prot_fault_translation == 0) {
 					/*
 					 * Autodetect.
 					 * This check also covers the images
 					 * without the ABI-tag ELF note.
 					 */
 					if (SV_CURPROC_ABI() == SV_ABI_FREEBSD
 					    && p->p_osrel >= P_OSREL_SIGSEGV) {
 						i = SIGSEGV;
 						ucode = SEGV_ACCERR;
 					} else {
 						i = SIGBUS;
 						ucode = BUS_PAGE_FAULT;
 					}
 				} else if (prot_fault_translation == 1) {
 					/*
 					 * Always compat mode.
 					 */
 					i = SIGBUS;
 					ucode = BUS_PAGE_FAULT;
 				} else {
 					/*
 					 * Always SIGSEGV mode.
 					 */
 					i = SIGSEGV;
 					ucode = SEGV_ACCERR;
 				}
 			}
 			addr = eva;
 			break;
 
 		case T_DIVIDE:		/* integer divide fault */
 			ucode = FPE_INTDIV;
 			i = SIGFPE;
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 #ifdef POWERFAIL_NMI
 #ifndef TIMER_FREQ
 #  define TIMER_FREQ 1193182
 #endif
 			/* Log and beep at most once every 10 seconds. */
 			if (time_second - lastalert > 10) {
 				log(LOG_WARNING, "NMI: power fail\n");
 				sysbeep(880, hz);
 				lastalert = time_second;
 			}
 			goto userout;
 #else /* !POWERFAIL_NMI */
 			/* machine/parity/power fail/"kitchen sink" faults */
 			if (isa_nmi(code) == 0) {
 #ifdef KDB
 				/*
 				 * NMI can be hooked up to a pushbutton
 				 * for debugging.
 				 */
 				if (kdb_on_nmi) {
 					printf ("NMI ... going to debugger\n");
 					kdb_trap(type, 0, frame);
 				}
 #endif /* KDB */
 				goto userout;
 			} else if (panic_on_nmi)
 				panic("NMI indicates hardware failure");
 			break;
 #endif /* POWERFAIL_NMI */
 #endif /* DEV_ISA */
 
 		case T_OFLOW:		/* integer overflow fault */
 			ucode = FPE_INTOVF;
 			i = SIGFPE;
 			break;
 
 		case T_BOUND:		/* bounds check fault */
 			ucode = FPE_FLTSUB;
 			i = SIGFPE;
 			break;
 
 		case T_DNA:
 #ifdef DEV_NPX
 			KASSERT(PCB_USER_FPU(td->td_pcb),
 			    ("kernel FPU ctx has leaked"));
 			/* transparent fault (due to context switch "late") */
 			if (npxdna())
 				goto userout;
 #endif
 			uprintf("pid %d killed due to lack of floating point\n",
 				p->p_pid);
 			i = SIGKILL;
 			ucode = 0;
 			break;
 
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			ucode = ILL_COPROC;
 			i = SIGILL;
 			break;
 
 		case T_XMMFLT:		/* SIMD floating-point exception */
 #if defined(DEV_NPX) && !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 			ucode = npxtrap_sse();
 			if (ucode == -1)
 				goto userout;
 #else
 			ucode = 0;
 #endif
 			i = SIGFPE;
 			break;
 		}
 	} else {
 		/* kernel trap */
 
 		KASSERT(cold || td->td_ucred != NULL,
 		    ("kernel trap doesn't have ucred"));
 		/*
 		 * Cases that recover "goto out"; a "break" falls through to
 		 * trap_fatal() after the switch.
 		 */
 		switch (type) {
 		case T_PAGEFLT:			/* page fault */
 			(void) trap_pfault(frame, FALSE, eva);
 			goto out;
 
 		case T_DNA:
 #ifdef DEV_NPX
 			KASSERT(!PCB_USER_FPU(td->td_pcb),
 			    ("Unregistered use of FPU in kernel"));
 			if (npxdna())
 				goto out;
 #endif
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 		case T_XMMFLT:		/* SIMD floating-point exception */
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			/*
 			 * XXXKIB for now disable any FPU traps in kernel
 			 * handler registration seems to be overkill
 			 */
 			trap_fatal(frame, 0);
 			goto out;
 
 			/*
 			 * The following two traps can happen in
 			 * vm86 mode, and, if so, we want to handle
 			 * them specially.
 			 */
 		case T_PROTFLT:		/* general protection fault */
 		case T_STKFLT:		/* stack fault */
 			if (frame->tf_eflags & PSL_VM) {
 				i = vm86_emulate((struct vm86frame *)frame);
 				if (i != 0)
 					/*
 					 * returns to original process
 					 */
 					vm86_trap((struct vm86frame *)frame);
 				goto out;
 			}
 			if (type == T_STKFLT)
 				break;
 
 			/* FALL THROUGH */
 
 		case T_SEGNPFLT:	/* segment not present fault */
 			if (curpcb->pcb_flags & PCB_VM86CALL)
 				break;
 
 			/*
 			 * Invalid %fs's and %gs's can be created using
 			 * procfs or PT_SETREGS or by invalidating the
 			 * underlying LDT entry.  This causes a fault
 			 * in kernel mode when the kernel attempts to
 			 * switch contexts.  Lose the bad context
 			 * (XXX) so that we can continue, and generate
 			 * a signal.
 			 */
 			if (frame->tf_eip == (int)cpu_switch_load_gs) {
 				curpcb->pcb_gs = 0;
 #if 0				
 				PROC_LOCK(p);
 				kern_psignal(p, SIGBUS);
 				PROC_UNLOCK(p);
 #endif				
 				goto out;
 			}
 
 			if (td->td_intr_nesting_level != 0)
 				break;
 
 			/*
 			 * Invalid segment selectors and out of bounds
 			 * %eip's and %esp's can be set up in user mode.
 			 * This causes a fault in kernel mode when the
 			 * kernel tries to return to user mode.  We want
 			 * to get this fault so that we can fix the
 			 * problem here and not have to check all the
 			 * selectors and pointers when the user changes
 			 * them.
 			 */
 			if (frame->tf_eip == (int)doreti_iret) {
 				frame->tf_eip = (int)doreti_iret_fault;
 				goto out;
 			}
 			if (frame->tf_eip == (int)doreti_popl_ds) {
 				frame->tf_eip = (int)doreti_popl_ds_fault;
 				goto out;
 			}
 			if (frame->tf_eip == (int)doreti_popl_es) {
 				frame->tf_eip = (int)doreti_popl_es_fault;
 				goto out;
 			}
 			if (frame->tf_eip == (int)doreti_popl_fs) {
 				frame->tf_eip = (int)doreti_popl_fs_fault;
 				goto out;
 			}
 			/* Otherwise resume at the registered onfault handler, if any. */
 			if (curpcb->pcb_onfault != NULL) {
 				frame->tf_eip =
 				    (int)curpcb->pcb_onfault;
 				goto out;
 			}
 			break;
 
 		case T_TSSFLT:
 			/*
 			 * PSL_NT can be set in user mode and isn't cleared
 			 * automatically when the kernel is entered.  This
 			 * causes a TSS fault when the kernel attempts to
 			 * `iret' because the TSS link is uninitialized.  We
 			 * want to get this fault so that we can fix the
 			 * problem here and not every time the kernel is
 			 * entered.
 			 */
 			if (frame->tf_eflags & PSL_NT) {
 				frame->tf_eflags &= ~PSL_NT;
 				goto out;
 			}
 			break;
 
 		case T_TRCTRAP:	 /* trace trap */
 			if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
 				/*
 				 * We've just entered system mode via the
 				 * syscall lcall.  Continue single stepping
 				 * silently until the syscall handler has
 				 * saved the flags.
 				 */
 				goto out;
 			}
 			if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
 				/*
 				 * The syscall handler has now saved the
 				 * flags.  Stop single stepping it.
 				 */
 				frame->tf_eflags &= ~PSL_T;
 				goto out;
 			}
 			/*
 			 * Ignore debug register trace traps due to
 			 * accesses in the user's address space, which
 			 * can happen under several conditions such as
 			 * if a user sets a watchpoint on a buffer and
 			 * then passes that buffer to a system call.
 			 * We still want to get TRCTRAPS for addresses
 			 * in kernel space because that is useful when
 			 * debugging the kernel.
 			 */
 			if (user_dbreg_trap() && 
 			   !(curpcb->pcb_flags & PCB_VM86CALL)) {
 				/*
 				 * Reset breakpoint bits because the
 				 * processor doesn't
 				 */
 				load_dr6(rdr6() & 0xfffffff0);
 				goto out;
 			}
 			/*
 			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
 			 */
 		case T_BPTFLT:
 			/*
 			 * If KDB is enabled, let it handle the debugger trap.
 			 * Otherwise, debugger traps "can't happen".
 			 */
 #ifdef KDB
 			if (kdb_trap(type, 0, frame))
 				goto out;
 #endif
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 #ifdef POWERFAIL_NMI
 			/* Log and beep at most once every 10 seconds. */
 			if (time_second - lastalert > 10) {
 				log(LOG_WARNING, "NMI: power fail\n");
 				sysbeep(880, hz);
 				lastalert = time_second;
 			}
 			goto out;
 #else /* !POWERFAIL_NMI */
 			/* machine/parity/power fail/"kitchen sink" faults */
 			if (isa_nmi(code) == 0) {
 #ifdef KDB
 				/*
 				 * NMI can be hooked up to a pushbutton
 				 * for debugging.
 				 */
 				if (kdb_on_nmi) {
 					printf ("NMI ... going to debugger\n");
 					kdb_trap(type, 0, frame);
 				}
 #endif /* KDB */
 				goto out;
 			} else if (panic_on_nmi == 0)
 				goto out;
 			/* FALLTHROUGH */
 #endif /* POWERFAIL_NMI */
 #endif /* DEV_ISA */
 		}
 
 		/* Unrecovered kernel trap. */
 		trap_fatal(frame, eva);
 		goto out;
 	}
 
 	/* Translate fault for emulators (e.g. Linux) */
 	if (*p->p_sysent->sv_transtrap)
 		i = (*p->p_sysent->sv_transtrap)(i, type);
 
 	/* Build the signal information and deliver it to the thread. */
 	ksiginfo_init_trap(&ksi);
 	ksi.ksi_signo = i;
 	ksi.ksi_code = ucode;
 	ksi.ksi_addr = (void *)addr;
 	ksi.ksi_trapno = type;
 	if (uprintf_signal) {
 		/* The trailing bytes are the eight code bytes at the faulting %eip. */
 		uprintf("pid %d comm %s: signal %d err %x code %d type %d "
 		    "addr 0x%x esp 0x%08x eip 0x%08x "
 		    "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
 		    p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr,
 		    frame->tf_esp, frame->tf_eip,
 		    fubyte((void *)(frame->tf_eip + 0)),
 		    fubyte((void *)(frame->tf_eip + 1)),
 		    fubyte((void *)(frame->tf_eip + 2)),
 		    fubyte((void *)(frame->tf_eip + 3)),
 		    fubyte((void *)(frame->tf_eip + 4)),
 		    fubyte((void *)(frame->tf_eip + 5)),
 		    fubyte((void *)(frame->tf_eip + 6)),
 		    fubyte((void *)(frame->tf_eip + 7)));
 	}
 	KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled"));
 	trapsignal(td, &ksi);
 
 #ifdef DEBUG
 	if (type <= MAX_TRAP_MSG) {
 		uprintf("fatal process exception: %s",
 			trap_msg[type]);
 		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
 			uprintf(", fault VA = 0x%lx", (u_long)eva);
 		uprintf("\n");
 	}
 #endif
 
 	/*
 	 * "user" is the normal user-mode exit through userret(); "userout"
 	 * and "out" return directly without calling userret().
 	 */
 user:
 	userret(td, frame);
 	KASSERT(PCB_USER_FPU(td->td_pcb),
 	    ("Return from trap with kernel FPU ctx leaked"));
 userout:
 out:
 	return;
 }
 
 /*
  * Handle a page fault at virtual address eva, taken with trap frame `frame'.
  * usermode selects the user-fault rules (no kernel addresses, signal on
  * failure) instead of the kernel-fault rules (pcb_onfault or panic).
  *
  * Return values, as produced below:
  *	 0	handled: vm_fault() succeeded, the fault was written off as
  *		spurious inside a "no faulting" section, or a kernel-mode
  *		fault was redirected to curpcb->pcb_onfault.
  *	-1	fatal; trap_fatal() has already been called.
  *	-2	the Pentium F00F workaround was hit (only compiled in with
  *		I586_CPU and without NO_F00F_HACK).
  *	SIGBUS or SIGSEGV
  *		unresolved user-mode fault: SIGBUS when rv is
  *		KERN_PROTECTION_FAILURE, SIGSEGV otherwise.
  */
 static int
 trap_pfault(frame, usermode, eva)
 	struct trapframe *frame;
 	int usermode;
 	vm_offset_t eva;
 {
 	vm_offset_t va;
 	struct vmspace *vm;
 	vm_map_t map;
 	int rv = 0;
 	vm_prot_t ftype;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 
 	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
 		/*
 		 * Due to both processor errata and lazy TLB invalidation when
 		 * access restrictions are removed from virtual pages, memory
 		 * accesses that are allowed by the physical mapping layer may
 		 * nonetheless cause one spurious page fault per virtual page.
 		 * When the thread is executing a "no faulting" section that
 		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
 		 * every page fault is treated as a spurious page fault,
 		 * unless it accesses the same virtual address as the most
 		 * recent page fault within the same "no faulting" section.
 		 */
 		if (td->td_md.md_spurflt_addr != eva ||
 		    (td->td_pflags & TDP_RESETSPUR) != 0) {
 			/*
 			 * Do nothing to the TLB.  A stale TLB entry is
 			 * flushed automatically by a page fault.
 			 * Remember the address so that a repeat fault on it
 			 * is no longer written off as spurious.
 			 */
 			td->td_md.md_spurflt_addr = eva;
 			td->td_pflags &= ~TDP_RESETSPUR;
 			return (0);
 		}
 		/* Second fault on the same address: fall through and handle it. */
 	} else {
 		/*
 		 * If we get a page fault while in a critical section, then
 		 * it is most likely a fatal kernel page fault.  The kernel
 		 * is already going to panic trying to get a sleep lock to
 		 * do the VM lookup, so just consider it a fatal trap so the
 		 * kernel can print out a useful trap message and even get
 		 * to the debugger.
 		 *
 		 * If we get a page fault while holding a non-sleepable
 		 * lock, then it is most likely a fatal kernel page fault.
 		 * If WITNESS is enabled, then it's going to whine about
 		 * bogus LORs with various VM locks, so just skip to the
 		 * fatal trap handling directly.
 		 */
 		if (td->td_critnest != 0 ||
 		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
 		    "Kernel page fault") != 0) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
 	}
 	/* vm_fault() is given the page-aligned address, not the raw one. */
 	va = trunc_page(eva);
 	if (va >= KERNBASE) {
 		/*
 		 * Don't allow user-mode faults in kernel address space.
 		 * An exception:  if the faulting address is the invalid
 		 * instruction entry in the IDT, then the Intel Pentium
 		 * F00F bug workaround was triggered, and we need to
 		 * treat it as an illegal instruction, and not a page
 		 * fault.
 		 */
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
 			return (-2);
 #endif
 		if (usermode)
 			goto nogo;
 
 		map = kernel_map;
 	} else {
 		/*
 		 * This is a fault on non-kernel virtual memory.  If either
 		 * p or p->p_vmspace is NULL, then the fault is fatal.
 		 */
 		if (p == NULL || (vm = p->p_vmspace) == NULL)
 			goto nogo;
 
 		map = &vm->vm_map;
 
 		/*
 		 * When accessing a user-space address, kernel must be
 		 * ready to accept the page fault, and provide a
 		 * handling routine.  Since accessing the address
 		 * without the handler is a bug, do not try to handle
 		 * it normally, and panic immediately.
 		 */
 		if (!usermode && (td->td_intr_nesting_level != 0 ||
 		    curpcb->pcb_onfault == NULL)) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
 	}
 
 	/*
 	 * Derive the access type from the hardware error code.
 	 * PGEX_I is defined only if the execute disable bit capability is
 	 * supported and enabled.
 	 */
 	if (frame->tf_err & PGEX_W)
 		ftype = VM_PROT_WRITE;
 #ifdef PAE
 	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
 		ftype = VM_PROT_EXECUTE;
 #endif
 	else
 		ftype = VM_PROT_READ;
 
 	if (map != kernel_map) {
 		/*
 		 * Keep swapout from messing with us during this
 		 * critical time: hold p_lock elevated across vm_fault().
 		 */
 		PROC_LOCK(p);
 		++p->p_lock;
 		PROC_UNLOCK(p);
 
 		/* Fault in the user page: */
 		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 
 		PROC_LOCK(p);
 		--p->p_lock;
 		PROC_UNLOCK(p);
 	} else {
 		/*
 		 * Don't have to worry about process locking or stacks in the
 		 * kernel.
 		 */
 		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 	}
 	if (rv == KERN_SUCCESS) {
 #ifdef HWPMC_HOOKS
 		/* Count the resolved fault: once as "all", once by type. */
 		if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
 			PMC_SOFT_CALL_TF( , , page_fault, all, frame);
 			if (ftype == VM_PROT_READ)
 				PMC_SOFT_CALL_TF( , , page_fault, read,
 				    frame);
 			else
 				PMC_SOFT_CALL_TF( , , page_fault, write,
 				    frame);
 		}
 #endif
 		return (0);
 	}
 nogo:
 	if (!usermode) {
 		/*
 		 * Kernel-mode fault that could not be resolved: resume at
 		 * the registered pcb_onfault handler when one exists and we
 		 * are not in interrupt context, otherwise it is fatal.
 		 */
 		if (td->td_intr_nesting_level == 0 &&
 		    curpcb->pcb_onfault != NULL) {
 			frame->tf_eip = (int)curpcb->pcb_onfault;
 			return (0);
 		}
 		trap_fatal(frame, eva);
 		return (-1);
 	}
 	/* User-mode failure: hand the caller the signal to deliver. */
 	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 }
 
 static void
 trap_fatal(frame, eva)
 	struct trapframe *frame;
 	vm_offset_t eva;
 {
 	int code, ss, esp;
 	u_int type;
 	struct soft_segment_descriptor softseg;
 	char *msg;
 
 	code = frame->tf_err;
 	type = frame->tf_trapno;
 	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
 
 	if (type <= MAX_TRAP_MSG)
 		msg = trap_msg[type];
 	else
 		msg = "UNKNOWN";
 	printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
 	    frame->tf_eflags & PSL_VM ? "vm86" :
 	    ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	if (type == T_PAGEFLT) {
 		printf("fault virtual address	= 0x%x\n", eva);
 		printf("fault code		= %s %s, %s\n",
 			code & PGEX_U ? "user" : "supervisor",
 			code & PGEX_W ? "write" : "read",
 			code & PGEX_P ? "protection violation" : "page not present");
 	}
 	printf("instruction pointer	= 0x%x:0x%x\n",
 	       frame->tf_cs & 0xffff, frame->tf_eip);
         if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
 		ss = frame->tf_ss & 0xffff;
 		esp = frame->tf_esp;
 	} else {
 		ss = GSEL(GDATA_SEL, SEL_KPL);
 		esp = (int)&frame->tf_esp;
 	}
 	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
 	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
 	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
 	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
 	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
 	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
 	       softseg.ssd_gran);
 	printf("processor eflags	= ");
 	if (frame->tf_eflags & PSL_T)
 		printf("trace trap, ");
 	if (frame->tf_eflags & PSL_I)
 		printf("interrupt enabled, ");
 	if (frame->tf_eflags & PSL_NT)
 		printf("nested task, ");
 	if (frame->tf_eflags & PSL_RF)
 		printf("resume, ");
 	if (frame->tf_eflags & PSL_VM)
 		printf("vm86, ");
 	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
 	printf("current process		= ");
 	if (curproc) {
 		printf("%lu (%s)\n", (u_long)curproc->p_pid, curthread->td_name);
 	} else {
 		printf("Idle\n");
 	}
 
 #ifdef KDB
 	if (debugger_on_panic || kdb_active) {
 		frame->tf_err = eva;	/* smuggle fault address to ddb */
 		if (kdb_trap(type, 0, frame)) {
 			frame->tf_err = code;	/* restore error code */
 			return;
 		}
 		frame->tf_err = code;		/* restore error code */
 	}
 #endif
 	printf("trap number		= %d\n", type);
 	if (type <= MAX_TRAP_MSG)
 		panic("%s", trap_msg[type]);
 	else
 		panic("unknown/reserved trap");
 }
 
 /*
  * Double fault handler. Called when a fault occurs while writing
  * a frame for a trap/exception onto the stack. This usually occurs
  * when the stack overflows (such is the case with infinite recursion,
  * for example).
  *
  * The handler prints the eip/esp/ebp values recorded in the per-CPU
  * common_tss and then panics; it never returns normally.
  *
  * XXX Note that the current PTD gets replaced by IdlePTD when the
  * task switch occurs. This means that the stack that was active at
  * the time of the double fault is not available at <kstack> unless
  * the machine was idle when the double fault occurred. The downside
  * of this is that "trace <ebp>" in ddb won't work.
  */
 void
 dblfault_handler()
 {
 #ifdef KDTRACE_HOOKS
 	/* Give DTrace's double-trap hook, if one is installed, first call. */
 	if (dtrace_doubletrap_func != NULL)
 		(*dtrace_doubletrap_func)();
 #endif
 	printf("\nFatal double fault:\n");
 	/* Register values come from the TSS, not from a trap frame. */
 	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
 	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
 	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	panic("double fault");
 }
 
 int
 cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
 {
 	struct proc *p;
 	struct trapframe *frame;
 	caddr_t params;
 	int error;
 
 	p = td->td_proc;
 	frame = td->td_frame;
 
 	params = (caddr_t)frame->tf_esp + sizeof(int);
 	sa->code = frame->tf_eax;
 
 	/*
 	 * Need to check if this is a 32 bit or 64 bit syscall.
 	 */
 	if (sa->code == SYS_syscall) {
 		/*
 		 * Code is first argument, followed by actual args.
 		 */
 		sa->code = fuword(params);
 		params += sizeof(int);
 	} else if (sa->code == SYS___syscall) {
 		/*
 		 * Like syscall, but code is a quad, so as to maintain
 		 * quad alignment for the rest of the arguments.
 		 */
 		sa->code = fuword(params);
 		params += sizeof(quad_t);
 	}
 
  	if (p->p_sysent->sv_mask)
  		sa->code &= p->p_sysent->sv_mask;
  	if (sa->code >= p->p_sysent->sv_size)
  		sa->callp = &p->p_sysent->sv_table[0];
   	else
  		sa->callp = &p->p_sysent->sv_table[sa->code];
 	sa->narg = sa->callp->sy_narg;
 
 	if (params != NULL && sa->narg != 0)
 		error = copyin(params, (caddr_t)sa->args,
 		    (u_int)(sa->narg * sizeof(int)));
 	else
 		error = 0;
 
 	if (error == 0) {
 		td->td_retval[0] = 0;
 		td->td_retval[1] = frame->tf_edx;
 	}
 		
 	return (error);
 }
 
 #include "../../kern/subr_syscall.c"
 
 /*
  * syscall - C-level entry point for system call requests.
  *
  * A system call arrives with the same frame layout as a trap, so the
  * frame is simply recorded in the thread and the machine-independent
  * syscallenter()/syscallret() pair does the rest.
  */
 void
 syscall(struct trapframe *frame)
 {
 	struct syscall_args sa;
 	ksiginfo_t ksi;
 	struct thread *td;
 	register_t entry_eflags;
 	int error;
 
 #ifdef DIAGNOSTIC
 	/* System calls may only come from user privilege level. */
 	if (ISPL(frame->tf_cs) != SEL_UPL)
 		panic("syscall");
 #endif
 	/* Snapshot eflags before the system call can alter the frame. */
 	entry_eflags = frame->tf_eflags;
 
 	td = curthread;
 	td->td_frame = frame;
 
 	error = syscallenter(td, &sa);
 
 	/*
 	 * The call was single-stepped (and not from vm86 mode): clear the
 	 * trace flag in the frame and post the SIGTRAP ourselves.
 	 */
 	if ((entry_eflags & PSL_T) != 0 && (entry_eflags & PSL_VM) == 0) {
 		frame->tf_eflags &= ~PSL_T;
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGTRAP;
 		ksi.ksi_code = TRAP_TRACE;
 		ksi.ksi_addr = (void *)frame->tf_eip;
 		trapsignal(td, &ksi);
 	}
 
 	/* Sanity-check the FPU save state before heading back to userland. */
 	KASSERT(PCB_USER_FPU(td->td_pcb),
 	    ("System call %s returning with kernel FPU ctx leaked",
 	     syscallname(td->td_proc, sa.code)));
 	KASSERT(td->td_pcb->pcb_save == &td->td_pcb->pcb_user_save,
 	    ("System call %s returning with mangled pcb_save",
 	     syscallname(td->td_proc, sa.code)));
 
 	syscallret(td, error, &sa);
 }
Index: head/sys/i386/linux/linux_dummy.c
===================================================================
--- head/sys/i386/linux/linux_dummy.c	(revision 258540)
+++ head/sys/i386/linux/linux_dummy.c	(revision 258541)
@@ -1,167 +1,166 @@
 /*-
  * Copyright (c) 1994-1995 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer 
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 
 #include <i386/linux/linux.h>
 #include <i386/linux/linux_proto.h>
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_util.h>
 
 /* DTrace init */
 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
 
 /*
  * Linux system calls without a real implementation on i386.
  *
  * NOTE(review): DUMMY() is not defined in this file; it presumably comes
  * from <compat/linux/linux_util.h> and expands to a stub linux_<name>()
  * handler -- confirm there before relying on its exact behaviour.
  * Entries are grouped by the Linux release that introduced the call.
  */
 DUMMY(stime);
 DUMMY(fstat);
 DUMMY(olduname);
 DUMMY(syslog);
 DUMMY(uname);
 DUMMY(vhangup);
 DUMMY(vm86old);
 DUMMY(swapoff);
 DUMMY(adjtimex);
 DUMMY(create_module);
 DUMMY(init_module);
 DUMMY(delete_module);
 DUMMY(get_kernel_syms);
 DUMMY(quotactl);
 DUMMY(bdflush);
 DUMMY(sysfs);
 DUMMY(vm86);
 DUMMY(query_module);
 DUMMY(nfsservctl);
 DUMMY(rt_sigqueueinfo);
 DUMMY(sendfile);		/* different semantics */
 DUMMY(setfsuid);
 DUMMY(setfsgid);
 DUMMY(pivot_root);
 DUMMY(mincore);
 DUMMY(lookup_dcookie);
 DUMMY(epoll_create);
 DUMMY(epoll_ctl);
 DUMMY(epoll_wait);
 DUMMY(remap_file_pages);
 DUMMY(fstatfs64);
 DUMMY(mbind);
 DUMMY(get_mempolicy);
 DUMMY(set_mempolicy);
 DUMMY(kexec_load);
 DUMMY(waitid);
 /* linux 2.6.11: */
 DUMMY(add_key);
 DUMMY(request_key);
 DUMMY(keyctl);
 /* linux 2.6.13: */
 DUMMY(ioprio_set);
 DUMMY(ioprio_get);
 DUMMY(inotify_init);
 DUMMY(inotify_add_watch);
 DUMMY(inotify_rm_watch);
 /* linux 2.6.16: */
 DUMMY(migrate_pages);
 DUMMY(pselect6);
 DUMMY(ppoll);
 DUMMY(unshare);
 /* linux 2.6.17: */
 DUMMY(splice);
 DUMMY(sync_file_range);
 DUMMY(tee);
 DUMMY(vmsplice);
 /* linux 2.6.18: */
 DUMMY(move_pages);
 /* linux 2.6.19: */
 DUMMY(getcpu);
 DUMMY(epoll_pwait);
 /* linux 2.6.22: */
 DUMMY(utimensat);
 DUMMY(signalfd);
 DUMMY(timerfd_create);
 DUMMY(eventfd);
 /* linux 2.6.23: */
 DUMMY(fallocate);
 /* linux 2.6.25: */
 DUMMY(timerfd_settime);
 DUMMY(timerfd_gettime);
 /* linux 2.6.27: */
 DUMMY(signalfd4);
 DUMMY(eventfd2);
 DUMMY(epoll_create1);
 DUMMY(dup3);
 DUMMY(inotify_init1);
 /* linux 2.6.30: */
 DUMMY(preadv);
 DUMMY(pwritev);
 /* linux 2.6.31: */
 DUMMY(rt_tsigqueueinfo);
 DUMMY(perf_event_open);
 /* linux 2.6.33: */
 DUMMY(recvmmsg);
 DUMMY(fanotify_init);
 DUMMY(fanotify_mark);
 /* linux 2.6.36: */
 DUMMY(prlimit64);
 /* later: */
 DUMMY(name_to_handle_at);
 DUMMY(open_by_handle_at);
 DUMMY(clock_adjtime);
 DUMMY(syncfs);
 DUMMY(sendmmsg);
 DUMMY(setns);
 DUMMY(process_vm_readv);
 DUMMY(process_vm_writev);
 
 /*
  * Define linux_<s>xattr(), a placeholder handler for one member of the
  * Linux extended-attribute system call family.  The generated function
  * ignores its arguments and always returns ENOATTR.
  */
 #define DUMMY_XATTR(s)						\
 int								\
 linux_ ## s ## xattr(						\
     struct thread *td, struct linux_ ## s ## xattr_args *arg)	\
 {								\
 								\
 	return (ENOATTR);					\
 }
 /* One stub per set/get/list/remove operation and its l- and f- variants. */
 DUMMY_XATTR(set);
 DUMMY_XATTR(lset);
 DUMMY_XATTR(fset);
 DUMMY_XATTR(get);
 DUMMY_XATTR(lget);
 DUMMY_XATTR(fget);
 DUMMY_XATTR(list);
 DUMMY_XATTR(llist);
 DUMMY_XATTR(flist);
 DUMMY_XATTR(remove);
 DUMMY_XATTR(lremove);
 DUMMY_XATTR(fremove);
Index: head/sys/kern/kern_clock.c
===================================================================
--- head/sys/kern/kern_clock.c	(revision 258540)
+++ head/sys/kern/kern_clock.c	(revision 258541)
@@ -1,895 +1,894 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_kdb.h"
 #include "opt_device_polling.h"
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 #include "opt_ntp.h"
 #include "opt_watchdog.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <sys/sysctl.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/limits.h>
 #include <sys/timetc.h>
 
 #ifdef GPROF
 #include <sys/gmon.h>
 #endif
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DEFINE( , , clock, hard);
 PMC_SOFT_DEFINE( , , clock, stat);
 PMC_SOFT_DEFINE_EX( , , clock, prof, \
     cpu_startprofclock, cpu_stopprofclock);
 #endif
 
 #ifdef DEVICE_POLLING
 extern void hardclock_device_poll(void);
 #endif /* DEVICE_POLLING */
 
 static void initclocks(void *dummy);
 SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL);
 
 /* Spin-lock protecting profiling statistics. */
 static struct mtx time_lock;
 
 SDT_PROVIDER_DECLARE(sched);
 SDT_PROBE_DEFINE2(sched, , , tick, tick, "struct thread *", "struct proc *");
 
 static int
 sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	long cp_time[CPUSTATES];
 #ifdef SCTL_MASK32
 	int i;
 	unsigned int cp_time32[CPUSTATES];
 #endif
 
 	read_cpu_time(cp_time);
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		if (!req->oldptr)
 			return SYSCTL_OUT(req, 0, sizeof(cp_time32));
 		for (i = 0; i < CPUSTATES; i++)
 			cp_time32[i] = (unsigned int)cp_time[i];
 		error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
 	} else
 #endif
 	{
 		if (!req->oldptr)
 			return SYSCTL_OUT(req, 0, sizeof(cp_time));
 		error = SYSCTL_OUT(req, cp_time, sizeof(cp_time));
 	}
 	return error;
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0,0, sysctl_kern_cp_time, "LU", "CPU time statistics");
 
 static long empty[CPUSTATES];
 
 static int
 sysctl_kern_cp_times(SYSCTL_HANDLER_ARGS)
 {
 	struct pcpu *pcpu;
 	int error;
 	int c;
 	long *cp_time;
 #ifdef SCTL_MASK32
 	unsigned int cp_time32[CPUSTATES];
 	int i;
 #endif
 
 	if (!req->oldptr) {
 #ifdef SCTL_MASK32
 		if (req->flags & SCTL_MASK32)
 			return SYSCTL_OUT(req, 0, sizeof(cp_time32) * (mp_maxid + 1));
 		else
 #endif
 			return SYSCTL_OUT(req, 0, sizeof(long) * CPUSTATES * (mp_maxid + 1));
 	}
 	for (error = 0, c = 0; error == 0 && c <= mp_maxid; c++) {
 		if (!CPU_ABSENT(c)) {
 			pcpu = pcpu_find(c);
 			cp_time = pcpu->pc_cp_time;
 		} else {
 			cp_time = empty;
 		}
 #ifdef SCTL_MASK32
 		if (req->flags & SCTL_MASK32) {
 			for (i = 0; i < CPUSTATES; i++)
 				cp_time32[i] = (unsigned int)cp_time[i];
 			error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
 		} else
 #endif
 			error = SYSCTL_OUT(req, cp_time, sizeof(long) * CPUSTATES);
 	}
 	return error;
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, cp_times, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0,0, sysctl_kern_cp_times, "LU", "per-CPU time statistics");
 
 #ifdef DEADLKRES
 /*
  * Wait messages that are allowed to exceed the sleep threshold without
  * being reported as a deadlock.  NULL-terminated.
  */
 static const char *blessed[] = {
 	"getblk",
 	"so_snd_sx",
 	"so_rcv_sx",
 	NULL
 };
 /* Tunables, in seconds; exported under debug.deadlkres below. */
 static int slptime_threshold = 1800;
 static int blktime_threshold = 900;
 static int sleepfreq = 3;
 
 /*
  * Body of the "deadlkres" kernel thread.  Every sleepfreq seconds it walks
  * all threads of all processes and panics when one has been blocked on a
  * turnstile for more than blktime_threshold seconds, or asleep on an
  * sx/lockmgr sleepqueue for more than slptime_threshold seconds with a
  * wait message that is not in blessed[].  Never returns.
  */
 static void
 deadlkres(void)
 {
 	struct proc *p;
 	struct thread *td;
 	void *wchan;
 	int blkticks, i, slpticks, slptype, tryl, tticks;
 
 	/* tryl counts consecutive failed attempts to get allproc_lock. */
 	tryl = 0;
 	for (;;) {
 		/* Recompute each pass so sysctl changes take effect. */
 		blkticks = blktime_threshold * hz;
 		slpticks = slptime_threshold * hz;
 
 		/*
 		 * Avoid sleeping on the sx_lock in order to avoid a possible
 		 * priority inversion problem leading to starvation.
 		 * If the lock can't be acquired after 100 tries, panic.
 		 */
 		if (!sx_try_slock(&allproc_lock)) {
 			if (tryl > 100)
 		panic("%s: possible deadlock detected on allproc_lock\n",
 				    __func__);
 			tryl++;
 			pause("allproc", sleepfreq * hz);
 			continue;
 		}
 		tryl = 0;
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			/* Skip processes that are still being created. */
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			FOREACH_THREAD_IN_PROC(p, td) {
 
 				thread_lock(td);
 				if (TD_ON_LOCK(td)) {
 
 					/*
 					 * The thread should be blocked on a
 					 * turnstile, simply check if the
 					 * turnstile channel is in good state.
 					 */
 					MPASS(td->td_blocked != NULL);
 
 					tticks = ticks - td->td_blktick;
 					thread_unlock(td);
 					if (tticks > blkticks) {
 
 						/*
 						 * According to the provided
 						 * thresholds, this thread has
 						 * been stuck for too long on a
 						 * turnstile.
 						 */
 						PROC_UNLOCK(p);
 						sx_sunlock(&allproc_lock);
 	panic("%s: possible deadlock detected for %p, blocked for %d ticks\n",
 						    __func__, td, tticks);
 					}
 				} else if (TD_IS_SLEEPING(td) &&
 				    TD_ON_SLEEPQ(td)) {
 
 					/*
 					 * Check if the thread is sleeping on a
 					 * lock, otherwise skip the check.
 					 * Drop the thread lock in order to
 					 * avoid a LOR with the sleepqueue
 					 * spinlock.
 					 */
 					wchan = td->td_wchan;
 					tticks = ticks - td->td_slptick;
 					thread_unlock(td);
 					slptype = sleepq_type(wchan);
 					if ((slptype == SLEEPQ_SX ||
 					    slptype == SLEEPQ_LK) &&
 					    tticks > slpticks) {
 
 						/*
 						 * According to the provided
 						 * thresholds, this thread has
 						 * been stuck for too long on a
 						 * sleepqueue.
 						 * However, being on a
 						 * sleepqueue, we still have
 						 * to check the blessed list.
 						 *
 						 * tryl is borrowed here as a
 						 * "found in blessed[]" flag;
 						 * it is 0 on entry and is
 						 * reset to 0 before moving on.
 						 */
 						tryl = 0;
 						for (i = 0; blessed[i] != NULL;
 						    i++) {
 							if (!strcmp(blessed[i],
 							    td->td_wmesg)) {
 								tryl = 1;
 								break;
 							}
 						}
 						if (tryl != 0) {
 							tryl = 0;
 							continue;
 						}
 						PROC_UNLOCK(p);
 						sx_sunlock(&allproc_lock);
 	panic("%s: possible deadlock detected for %p, blocked for %d ticks\n",
 						    __func__, td, tticks);
 					}
 				} else
 					thread_unlock(td);
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 
 		/* Sleep for sleepfreq seconds. */
 		pause("-", sleepfreq * hz);
 	}
 }
 
 /* Descriptor handed to kthread_start() by the SYSINIT below. */
 static struct kthread_desc deadlkres_kd = {
 	"deadlkres",
 	deadlkres,
 	(struct thread **)NULL
 };
 
 SYSINIT(deadlkres, SI_SUB_CLOCKS, SI_ORDER_ANY, kthread_start, &deadlkres_kd);
 
 static SYSCTL_NODE(_debug, OID_AUTO, deadlkres, CTLFLAG_RW, 0,
     "Deadlock resolver");
 SYSCTL_INT(_debug_deadlkres, OID_AUTO, slptime_threshold, CTLFLAG_RW,
     &slptime_threshold, 0,
     "Number of seconds within is valid to sleep on a sleepqueue");
 SYSCTL_INT(_debug_deadlkres, OID_AUTO, blktime_threshold, CTLFLAG_RW,
     &blktime_threshold, 0,
     "Number of seconds within is valid to block on a turnstile");
 SYSCTL_INT(_debug_deadlkres, OID_AUTO, sleepfreq, CTLFLAG_RW, &sleepfreq, 0,
     "Number of seconds between any deadlock resolver thread run");
 #endif	/* DEADLKRES */
 
 /*
  * Fill cp_time[0..CPUSTATES-1] with the per-state tick counters summed
  * over every CPU visited by CPU_FOREACH().  The caller supplies the array.
  */
 void
 read_cpu_time(long *cp_time)
 {
 	struct pcpu *pc;
 	int cpu, state;
 
 	/* Start from zero, then accumulate each CPU's counters. */
 	bzero(cp_time, sizeof(long) * CPUSTATES);
 	CPU_FOREACH(cpu) {
 		pc = pcpu_find(cpu);
 		for (state = 0; state < CPUSTATES; state++)
 			cp_time[state] += pc->pc_cp_time[state];
 	}
 }
 
 #ifdef SW_WATCHDOG
 #include <sys/watchdog.h>
 
 static int watchdog_ticks;
 static int watchdog_enabled;
 static void watchdog_fire(void);
 static void watchdog_config(void *, u_int, int *);
 #endif /* SW_WATCHDOG */
 
 /*
  * Clock handling routines.
  *
  * This code is written to operate with two timers that run independently of
  * each other.
  *
  * The main timer, running hz times per second, is used to trigger interval
  * timers, timeouts and rescheduling as needed.
  *
  * The second timer handles kernel and user profiling,
  * and does resource use estimation.  If the second timer is programmable,
  * it is randomized to avoid aliasing between the two clocks.  For example,
  * the randomization prevents an adversary from always giving up the cpu
  * just before its quantum expires.  Otherwise, it would never accumulate
  * cpu ticks.  The mean frequency of the second timer is stathz.
  *
  * If no second timer exists, stathz will be zero; in this case we drive
  * profiling and statistics off the main clock.  This WILL NOT be accurate;
  * do not do it unless absolutely necessary.
  *
  * The statistics clock may (or may not) be run at a higher rate while
  * profiling.  This profile clock runs at profhz.  We require that profhz
  * be an integral multiple of stathz.
  *
  * If the statistics clock is running fast, it must be divided by the ratio
  * profhz/stathz for statistics.  (For profiling, every tick counts.)
  *
  * Time-of-day is maintained using a "timecounter", which may or may
  * not be related to the hardware generating the above mentioned
  * interrupts.
  */
 
 int	stathz;
 int	profhz;
 int	profprocs;
 volatile int	ticks;
 int	psratio;
 
 static DPCPU_DEFINE(int, pcputicks);	/* Per-CPU version of ticks. */
 static int global_hardclock_run = 0;
 
 /*
  * SYSINIT hook: set up the time lock, start the machine-dependent clocks
  * and derive the profiling/statistics clock ratio.  The argument is unused.
  */
 /* ARGSUSED*/
 static void
 initclocks(dummy)
 	void *dummy;
 {
 	int statrate;
 
 	mtx_init(&time_lock, "time lock", NULL, MTX_DEF);
 
 	/* Let the machine-specific code program the hardware timers. */
 	cpu_initclocks();
 
 	/*
 	 * Without a separate statistics clock (stathz == 0) statistics run
 	 * off the main clock at hz.  A profhz that was left unset defaults
 	 * to that same rate, making psratio 1.
 	 */
 	statrate = (stathz != 0) ? stathz : hz;
 	if (profhz == 0)
 		profhz = statrate;
 	psratio = profhz / statrate;
 #ifdef SW_WATCHDOG
 	EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0);
 #endif
 }
 
 /*
  * Each time the real-time timer fires, this function is called on all CPUs.
  * Note that hardclock() calls hardclock_cpu() for the boot CPU, so only
  * the other CPUs in the system need to call this function.
  *
  * Charges one tick to the current process's virtual and profiling interval
  * timers, runs the scheduler tick, and processes callouts.  usermode is
  * nonzero when the tick interrupted user-mode execution; only then is the
  * virtual timer charged.
  */
 void
 hardclock_cpu(int usermode)
 {
 	struct pstats *pstats;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	int flags;
 
 	/*
 	 * Run current process's virtual and profile time, as needed.
 	 * Expired timers are only recorded in `flags' here; the thread
 	 * flags themselves are updated below under the thread lock.
 	 */
 	pstats = p->p_stats;
 	flags = 0;
 	if (usermode &&
 	    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
 		PROC_SLOCK(p);
 		if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
 			flags |= TDF_ALRMPEND | TDF_ASTPENDING;
 		PROC_SUNLOCK(p);
 	}
 	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
 		PROC_SLOCK(p);
 		if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
 			flags |= TDF_PROFPEND | TDF_ASTPENDING;
 		PROC_SUNLOCK(p);
 	}
 	thread_lock(td);
 	sched_tick(1);
 	td->td_flags |= flags;
 	thread_unlock(td);
 
 #ifdef HWPMC_HOOKS
 	if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
 		PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
 	/* The soft "clock.hard" event needs an interrupt frame to sample. */
 	if (td->td_intr_frame != NULL)
 		PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame);
 #endif
 	callout_process(sbinuptime());
 }
 
 /*
  * The real-time timer, interrupting hz times per second.
  *
  * Advances the global tick count by one, performs the per-CPU work via
  * hardclock_cpu(), and drives the timecounter.  usermode and pc describe
  * the interrupted context and are passed on to the profiling and
  * statistics clocks when those have no timer of their own.
  */
 void
 hardclock(int usermode, uintfptr_t pc)
 {
 
 	atomic_add_int(&ticks, 1);
 	hardclock_cpu(usermode);
 	tc_ticktock(1);
 	cpu_tick_calibration();
 	/*
 	 * If no separate statistics clock is available, run it from here.
 	 *
 	 * XXX: this only works for UP
 	 */
 	if (stathz == 0) {
 		profclock(usermode, pc);
 		statclock(usermode);
 	}
 #ifdef DEVICE_POLLING
 	hardclock_device_poll();	/* this is very short and quick */
 #endif /* DEVICE_POLLING */
 #ifdef SW_WATCHDOG
 	/* Count the software watchdog down; fire it when it reaches zero. */
 	if (watchdog_enabled > 0 && --watchdog_ticks <= 0)
 		watchdog_fire();
 #endif /* SW_WATCHDOG */
 }
 
 /*
  * Variant of the hardclock handler that accounts for cnt ticks at once.
  *
  * Each CPU keeps its own tick count (pcputicks).  Whichever CPU is ahead
  * of the global `ticks' advances it with a compare-and-set and thereby
  * becomes responsible for the global per-tick work (timecounter, device
  * polling, software watchdog) for the `newticks' it contributed.
  * usermode is nonzero when the interrupted context was user mode.
  */
 void
 hardclock_cnt(int cnt, int usermode)
 {
 	struct pstats *pstats;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	int *t = DPCPU_PTR(pcputicks);
 	int flags, global, newticks;
 #ifdef SW_WATCHDOG
 	int i;
 #endif /* SW_WATCHDOG */
 
 	/*
 	 * Update per-CPU and possibly global ticks values.
 	 * If this CPU is not ahead of the global count there is nothing to
 	 * publish; if it lags by more than one tick, pull its private count
 	 * up to global - 1.  Otherwise retry until the CAS installs *t.
 	 */
 	*t += cnt;
 	do {
 		global = ticks;
 		newticks = *t - global;
 		if (newticks <= 0) {
 			if (newticks < -1)
 				*t = global - 1;
 			newticks = 0;
 			break;
 		}
 	} while (!atomic_cmpset_int(&ticks, global, *t));
 
 	/*
 	 * Run current process's virtual and profile time, as needed.
 	 * The timers are charged for all cnt ticks in one step.
 	 */
 	pstats = p->p_stats;
 	flags = 0;
 	if (usermode &&
 	    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
 		PROC_SLOCK(p);
 		if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL],
 		    tick * cnt) == 0)
 			flags |= TDF_ALRMPEND | TDF_ASTPENDING;
 		PROC_SUNLOCK(p);
 	}
 	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
 		PROC_SLOCK(p);
 		if (itimerdecr(&pstats->p_timer[ITIMER_PROF],
 		    tick * cnt) == 0)
 			flags |= TDF_PROFPEND | TDF_ASTPENDING;
 		PROC_SUNLOCK(p);
 	}
 	thread_lock(td);
 	sched_tick(cnt);
 	td->td_flags |= flags;
 	thread_unlock(td);
 
 #ifdef	HWPMC_HOOKS
 	if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
 		PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
 	if (td->td_intr_frame != NULL)
 		PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame);
 #endif
 	/* We are in charge to handle this tick duty. */
 	if (newticks > 0) {
 		/*
 		 * Dangerous and no need to call these things concurrently.
 		 * global_hardclock_run acts as a try-lock: a CPU that finds
 		 * it taken simply skips this section.
 		 */
 		if (atomic_cmpset_acq_int(&global_hardclock_run, 0, 1)) {
 			tc_ticktock(newticks);
 #ifdef DEVICE_POLLING
 			/* This is very short and quick. */
 			hardclock_device_poll();
 #endif /* DEVICE_POLLING */
 			atomic_store_rel_int(&global_hardclock_run, 0);
 		}
 #ifdef SW_WATCHDOG
 		if (watchdog_enabled > 0) {
 			/*
 			 * Fire only on the update that takes the countdown
 			 * from positive to zero or below (i is presumably
 			 * the value before the subtraction -- see atomic(9)).
 			 */
 			i = atomic_fetchadd_int(&watchdog_ticks, -newticks);
 			if (i > 0 && i <= newticks)
 				watchdog_fire();
 		}
 #endif /* SW_WATCHDOG */
 	}
 	if (curcpu == CPU_FIRST())
 		cpu_tick_calibration();
 }
 
 void
 hardclock_sync(int cpu)
 {
 	int	*t = DPCPU_ID_PTR(cpu, pcputicks);
 
 	*t = ticks;
 }
 
 /*
  * Compute number of ticks in the specified amount of time.
  */
 int
 tvtohz(tv)
 	struct timeval *tv;
 {
 	register unsigned long ticks;
 	register long sec, usec;
 
 	/*
 	 * If the number of usecs in the whole seconds part of the time
 	 * difference fits in a long, then the total number of usecs will
 	 * fit in an unsigned long.  Compute the total and convert it to
 	 * ticks, rounding up and adding 1 to allow for the current tick
 	 * to expire.  Rounding also depends on unsigned long arithmetic
 	 * to avoid overflow.
 	 *
 	 * Otherwise, if the number of ticks in the whole seconds part of
 	 * the time difference fits in a long, then convert the parts to
 	 * ticks separately and add, using similar rounding methods and
 	 * overflow avoidance.  This method would work in the previous
 	 * case but it is slightly slower and assumes that hz is integral.
 	 *
 	 * Otherwise, round the time difference down to the maximum
 	 * representable value.
 	 *
 	 * If ints have 32 bits, then the maximum value for any timeout in
 	 * 10ms ticks is 248 days.
 	 */
 	sec = tv->tv_sec;
 	usec = tv->tv_usec;
 	if (usec < 0) {
 		sec--;
 		usec += 1000000;
 	}
 	if (sec < 0) {
 #ifdef DIAGNOSTIC
 		if (usec > 0) {
 			sec++;
 			usec -= 1000000;
 		}
 		printf("tvotohz: negative time difference %ld sec %ld usec\n",
 		       sec, usec);
 #endif
 		ticks = 1;
 	} else if (sec <= LONG_MAX / 1000000)
 		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
 			/ tick + 1;
 	else if (sec <= LONG_MAX / hz)
 		ticks = sec * hz
 			+ ((unsigned long)usec + (tick - 1)) / tick + 1;
 	else
 		ticks = LONG_MAX;
 	if (ticks > INT_MAX)
 		ticks = INT_MAX;
 	return ((int)ticks);
 }
 
 /*
  * Start profiling on a process.
  *
  * Kernel profiling passes proc0 which never exits and hence
  * keeps the profile clock running constantly.
  *
  * The process lock must be held by the caller.  Nothing is done while
  * P_STOPPROF is set or when P_PROFIL is already set.  The profiling
  * clock is started when the count of profiled processes goes from
  * zero to one.
  */
 void
 startprofclock(struct proc *p)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (p->p_flag & P_STOPPROF)
 		return;
 	if (p->p_flag & P_PROFIL)
 		return;
 	p->p_flag |= P_PROFIL;
 	/* profprocs is modified only with time_lock held. */
 	mtx_lock(&time_lock);
 	if (++profprocs == 1)
 		cpu_startprofclock();
 	mtx_unlock(&time_lock);
 }
 
 /*
  * Stop profiling on a process.
  *
  * The process lock must be held by the caller.  If threads are still
  * inside the profiling code (p_profthreads != 0), P_STOPPROF is set and
  * the caller sleeps on p_profthreads until they have drained.  The
  * profiling clock is stopped when the count of profiled processes
  * drops to zero.
  */
 void
 stopprofclock(struct proc *p)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((p->p_flag & P_PROFIL) == 0)
 		return;
 	if (p->p_profthreads != 0) {
 		p->p_flag |= P_STOPPROF;
 		while (p->p_profthreads != 0)
 			msleep(&p->p_profthreads, &p->p_mtx, PPAUSE,
 			    "stopprof", 0);
 		p->p_flag &= ~P_STOPPROF;
 	}
 	/*
 	 * msleep() was passed the process mutex, so P_PROFIL is tested
 	 * again after the wait before the counter is touched.
 	 */
 	if ((p->p_flag & P_PROFIL) == 0)
 		return;
 	p->p_flag &= ~P_PROFIL;
 	mtx_lock(&time_lock);
 	if (--profprocs == 0)
 		cpu_stopprofclock();
 	mtx_unlock(&time_lock);
 }
 
 /*
  * Statistics clock.  Updates rusage information and calls the scheduler
  * to adjust priorities of the active thread.
  *
  * This should be called by all active processors.
  *
  * Single-tick convenience wrapper: forwards to statclock_cnt() with a
  * count of 1.  'usermode' is non-zero when the tick interrupted user
  * mode execution.
  */
 void
 statclock(int usermode)
 {
 
 	statclock_cnt(1, usermode);
 }
 
 void
 statclock_cnt(int cnt, int usermode)
 {
 	struct rusage *ru;
 	struct vmspace *vm;
 	struct thread *td;
 	struct proc *p;
 	long rss;
 	long *cp_time;
 
 	td = curthread;
 	p = td->td_proc;
 
 	cp_time = (long *)PCPU_PTR(cp_time);
 	if (usermode) {
 		/*
 		 * Charge the time as appropriate.
 		 */
 		td->td_uticks += cnt;
 		if (p->p_nice > NZERO)
 			cp_time[CP_NICE] += cnt;
 		else
 			cp_time[CP_USER] += cnt;
 	} else {
 		/*
 		 * Came from kernel mode, so we were:
 		 * - handling an interrupt,
 		 * - doing syscall or trap work on behalf of the current
 		 *   user process, or
 		 * - spinning in the idle loop.
 		 * Whichever it is, charge the time as appropriate.
 		 * Note that we charge interrupts to the current process,
 		 * regardless of whether they are ``for'' that process,
 		 * so that we know how much of its real time was spent
 		 * in ``non-process'' (i.e., interrupt) work.
 		 */
 		if ((td->td_pflags & TDP_ITHREAD) ||
 		    td->td_intr_nesting_level >= 2) {
 			td->td_iticks += cnt;
 			cp_time[CP_INTR] += cnt;
 		} else {
 			td->td_pticks += cnt;
 			td->td_sticks += cnt;
 			if (!TD_IS_IDLETHREAD(td))
 				cp_time[CP_SYS] += cnt;
 			else
 				cp_time[CP_IDLE] += cnt;
 		}
 	}
 
 	/* Update resource usage integrals and maximums. */
 	MPASS(p->p_vmspace != NULL);
 	vm = p->p_vmspace;
 	ru = &td->td_ru;
 	ru->ru_ixrss += pgtok(vm->vm_tsize) * cnt;
 	ru->ru_idrss += pgtok(vm->vm_dsize) * cnt;
 	ru->ru_isrss += pgtok(vm->vm_ssize) * cnt;
 	rss = pgtok(vmspace_resident_count(vm));
 	if (ru->ru_maxrss < rss)
 		ru->ru_maxrss = rss;
 	KTR_POINT2(KTR_SCHED, "thread", sched_tdname(td), "statclock",
 	    "prio:%d", td->td_priority, "stathz:%d", (stathz)?stathz:hz);
 	SDT_PROBE2(sched, , , tick, td, td->td_proc);
 	thread_lock_flags(td, MTX_QUIET);
 	for ( ; cnt > 0; cnt--)
 		sched_clock(td);
 	thread_unlock(td);
 #ifdef HWPMC_HOOKS
 	if (td->td_intr_frame != NULL)
 		PMC_SOFT_CALL_TF( , , clock, stat, td->td_intr_frame);
 #endif
 }
 
 /*
  * Single-tick convenience wrapper: forwards to profclock_cnt() with a
  * count of 1.  'usermode' is non-zero when the tick interrupted user
  * mode execution; 'pc' is the interrupted program counter.
  */
 void
 profclock(int usermode, uintfptr_t pc)
 {
 
 	profclock_cnt(1, usermode, pc);
 }
 
 /*
  * Record 'cnt' profiling clock ticks at program counter 'pc'.
  *
  * For user mode ticks the sample is passed to addupc_intr() when the
  * current process has P_PROFIL set.  For kernel mode ticks the sample is
  * added to the kernel profiling buffer, but only when the kernel is
  * built with GPROF.
  */
 void
 profclock_cnt(int cnt, int usermode, uintfptr_t pc)
 {
 	struct thread *td;
 #ifdef GPROF
 	struct gmonparam *g;
 	uintfptr_t i;
 #endif
 
 	td = curthread;
 	if (usermode) {
 		/*
 		 * Came from user mode; CPU was in user state.
 		 * If this process is being profiled, record the tick.
 		 * if there is no related user location yet, don't
 		 * bother trying to count it.
 		 */
 		if (td->td_proc->p_flag & P_PROFIL)
 			addupc_intr(td, pc, cnt);
 	}
 #ifdef GPROF
 	/* This 'else' pairs with the 'if (usermode)' above. */
 	else {
 		/*
 		 * Kernel statistics are just like addupc_intr, only easier.
 		 */
 		g = &_gmonparam;
 		if (g->state == GMON_PROF_ON && pc >= g->lowpc) {
 			i = PC_TO_I(g, pc);
 			/* Ignore samples that fall outside the text range. */
 			if (i < g->textsize) {
 				KCOUNT(g, i) += cnt;
 			}
 		}
 	}
 #endif
 #ifdef HWPMC_HOOKS
 	if (td->td_intr_frame != NULL)
 		PMC_SOFT_CALL_TF( , , clock, prof, td->td_intr_frame);
 #endif
 }
 
 /*
  * Return information about system clocks.
  */
 static int
 sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
 {
 	struct clockinfo clkinfo;
 	/*
 	 * Construct clockinfo structure.
 	 */
 	bzero(&clkinfo, sizeof(clkinfo));
 	clkinfo.hz = hz;
 	clkinfo.tick = tick;
 	clkinfo.profhz = profhz;
 	clkinfo.stathz = stathz ? stathz : hz;
 	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
 }
 
 SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate,
 	CTLTYPE_STRUCT|CTLFLAG_RD|CTLFLAG_MPSAFE,
 	0, 0, sysctl_kern_clockrate, "S,clockinfo",
 	"Rate and period of various kernel clocks");
 
 #ifdef SW_WATCHDOG
 
 /*
  * Configure the software watchdog from a watchdog command word.
  *
  * The interval field of 'cmd' selects the timeout.  Intervals below
  * WD_TO_1SEC disable the software watchdog and leave '*error' untouched;
  * otherwise the timeout is armed in units of hz ticks and '*error' is
  * cleared.
  */
 static void
 watchdog_config(void *unused __unused, u_int cmd, int *error)
 {
 	u_int interval;
 
 	interval = cmd & WD_INTERVAL;
 	if (interval < WD_TO_1SEC) {
 		watchdog_enabled = 0;
 		return;
 	}
 	watchdog_ticks = (1 << (interval - WD_TO_1SEC)) * hz;
 	watchdog_enabled = 1;
 	*error = 0;
 }
 
 /*
  * Handle a watchdog timeout by dumping interrupt information and
  * then either dropping to DDB or panicking.
  */
 static void
 watchdog_fire(void)
 {
 	int nintr;
 	uint64_t inttotal;
 	u_long *curintr;
 	char *curname;
 
 	curintr = intrcnt;
 	curname = intrnames;
 	inttotal = 0;
 	nintr = sintrcnt / sizeof(u_long);
 
 	printf("interrupt                   total\n");
 	while (--nintr >= 0) {
 		if (*curintr)
 			printf("%-12s %20lu\n", curname, *curintr);
 		curname += strlen(curname) + 1;
 		inttotal += *curintr++;
 	}
 	printf("Total        %20ju\n", (uintmax_t)inttotal);
 
 #if defined(KDB) && !defined(KDB_UNATTENDED)
 	kdb_backtrace();
 	kdb_enter(KDB_WHY_WATCHDOG, "watchdog timeout");
 #else
 	panic("watchdog timeout");
 #endif
 }
 
 #endif /* SW_WATCHDOG */
Index: head/sys/kern/kern_clocksource.c
===================================================================
--- head/sys/kern/kern_clocksource.c	(revision 258540)
+++ head/sys/kern/kern_clocksource.c	(revision 258541)
@@ -1,949 +1,948 @@
 /*-
  * Copyright (c) 2010-2013 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Common routines to manage event timers hardware.
  */
 
 #include "opt_device_polling.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/timeet.h>
 #include <sys/timetc.h>
 
 #include <machine/atomic.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/smp.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 /* Cyclic hook called from handleevents() when set and an event is due. */
 cyclic_clock_func_t	cyclic_clock_func = NULL;
 #endif
 
 int			cpu_can_deep_sleep = 0;	/* C3 state is available. */
 int			cpu_disable_deep_sleep = 0; /* Timer dies in C3. */
 
 /* Forward declarations of the file-local helpers defined below. */
 static void		setuptimer(void);
 static void		loadtimer(sbintime_t now, int first);
 static int		doconfigtimer(void);
 static void		configtimer(int start);
 static int		round_freq(struct eventtimer *et, int freq);
 
 static sbintime_t	getnextcpuevent(int idle);
 static sbintime_t	getnextevent(void);
 static int		handleevents(sbintime_t now, int fake);
 
 /* Spin mutex used by ET_HW_LOCK()/ET_HW_UNLOCK() for non-per-CPU timers. */
 static struct mtx	et_hw_mtx;
 
 /*
  * Lock the event timer hardware state: the per-CPU mutex embedded in
  * 'state' when the active timer is per-CPU, the global et_hw_mtx
  * otherwise.  Wrapped in do { } while (0) so that the macro followed by
  * a semicolon forms exactly one statement, even in an unbraced if/else.
  */
 #define	ET_HW_LOCK(state)						\
 	do {								\
 		if (timer->et_flags & ET_FLAGS_PERCPU)			\
 			mtx_lock_spin(&(state)->et_hw_mtx);		\
 		else							\
 			mtx_lock_spin(&et_hw_mtx);			\
 	} while (0)
 
 /* Release the mutex taken by ET_HW_LOCK() for the same 'state'. */
 #define	ET_HW_UNLOCK(state)						\
 	do {								\
 		if (timer->et_flags & ET_FLAGS_PERCPU)			\
 			mtx_unlock_spin(&(state)->et_hw_mtx);		\
 		else							\
 			mtx_unlock_spin(&et_hw_mtx);			\
 	} while (0)
 
 /* Event timer currently in use; selected in cpu_initclocks_bsp(). */
 static struct eventtimer *timer = NULL;
 static sbintime_t	timerperiod;	/* Timer period for periodic mode. */
 static sbintime_t	statperiod;	/* statclock() events period. */
 static sbintime_t	profperiod;	/* profclock() events period. */
 static sbintime_t	nexttick;	/* Next global timer tick time. */
 static u_int		busy = 1;	/* Reconfiguration is in progress. */
 static int		profiling = 0;	/* Profiling events enabled. */
 
 static char		timername[32];	/* Wanted timer. */
 TUNABLE_STR("kern.eventtimer.timer", timername, sizeof(timername));
 
 static int		singlemul = 0;	/* Multiplier for periodic mode. */
 TUNABLE_INT("kern.eventtimer.singlemul", &singlemul);
 SYSCTL_INT(_kern_eventtimer, OID_AUTO, singlemul, CTLFLAG_RW, &singlemul,
     0, "Multiplier for periodic mode");
 
 static u_int		idletick = 0;	/* Run periodic events when idle. */
 TUNABLE_INT("kern.eventtimer.idletick", &idletick);
 SYSCTL_UINT(_kern_eventtimer, OID_AUTO, idletick, CTLFLAG_RW, &idletick,
     0, "Run periodic events when idle");
 
 static int		periodic = 0;	/* Periodic or one-shot mode. */
 static int		want_periodic = 0; /* What mode to prefer. */
 TUNABLE_INT("kern.eventtimer.periodic", &want_periodic);
 
 /* Per-CPU event timer bookkeeping; one instance per CPU (timerstate). */
 struct pcpu_state {
 	struct mtx	et_hw_mtx;	/* Per-CPU timer mutex. */
 	u_int		action;		/* Reconfiguration requests. */
 	u_int		handle;		/* Immediate handle requests. */
 	sbintime_t	now;		/* Last tick time. */
 	sbintime_t	nextevent;	/* Next scheduled event on this CPU. */
 	sbintime_t	nexttick;	/* Next timer tick time. */
 	sbintime_t	nexthard;	/* Next hardclock() event. */
 	sbintime_t	nextstat;	/* Next statclock() event. */
 	sbintime_t	nextprof;	/* Next profclock() event. */
 	sbintime_t	nextcall;	/* Next callout event. */
 	sbintime_t	nextcallopt;	/* Next optional callout event. */
 #ifdef KDTRACE_HOOKS
 	sbintime_t	nextcyc;	/* Next OpenSolaris cyclics event. */
 #endif
 	int		ipi;		/* This CPU needs IPI. */
 	int		idle;		/* This CPU is in idle mode. */
 };
 
 static DPCPU_DEFINE(struct pcpu_state, timerstate);
 /* Time of the latest hardclock tick accounted in handleevents(). */
 DPCPU_DEFINE(sbintime_t, hardclocktime);
 
 /*
  * Timer broadcast IPI handler.
  *
  * Returns FILTER_HANDLED when the IPI was used up by a reconfiguration
  * or immediate-handle request, when a reconfiguration is in progress,
  * or when at least one clock handler ran; FILTER_STRAY otherwise.
  */
 int
 hardclockintr(void)
 {
 	struct pcpu_state *st;
 	sbintime_t now;
 
 	/* Pending requests are serviced first and always. */
 	if (doconfigtimer())
 		return (FILTER_HANDLED);
 	if (busy)
 		return (FILTER_HANDLED);
 	st = DPCPU_PTR(timerstate);
 	now = st->now;
 	CTR3(KTR_SPARE2, "ipi  at %d:    now  %d.%08x",
 	    curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
 	if (handleevents(now, 0))
 		return (FILTER_HANDLED);
 	return (FILTER_STRAY);
 }
 
 /*
  * Handle all events for specified time on this CPU
  *
  * 'now' is the time events are processed for.  'fake' selects how much
  * work is done:
  *   0  - real interrupt: the current thread's interrupt frame is used
  *        and every handler may run;
  *   1  - no interrupt frame: profclock and the cyclic hook are skipped;
  *   2+ - additionally hardclock and statclock are not called, although
  *        their deadlines are still advanced past 'now'.
  *
  * Returns non-zero if hardclock, statclock or profclock was called.
  */
 static int
 handleevents(sbintime_t now, int fake)
 {
 	sbintime_t t, *hct;
 	struct trapframe *frame;
 	struct pcpu_state *state;
 	int usermode;
 	int done, runs;
 
 	CTR3(KTR_SPARE2, "handle at %d:  now  %d.%08x",
 	    curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
 	done = 0;
 	if (fake) {
 		frame = NULL;
 		usermode = 0;
 	} else {
 		frame = curthread->td_intr_frame;
 		usermode = TRAPF_USERMODE(frame);
 	}
 
 	state = DPCPU_PTR(timerstate);
 
 	/* Count the hardclock periods that have elapsed up to 'now'. */
 	runs = 0;
 	while (now >= state->nexthard) {
 		state->nexthard += tick_sbt;
 		runs++;
 	}
 	if (runs) {
 		/* Record the time of the latest tick being accounted. */
 		hct = DPCPU_PTR(hardclocktime);
 		*hct = state->nexthard - tick_sbt;
 		if (fake < 2) {
 			hardclock_cnt(runs, usermode);
 			done = 1;
 		}
 	}
 	/* Same for statclock periods. */
 	runs = 0;
 	while (now >= state->nextstat) {
 		state->nextstat += statperiod;
 		runs++;
 	}
 	if (runs && fake < 2) {
 		statclock_cnt(runs, usermode);
 		done = 1;
 	}
 	if (profiling) {
 		runs = 0;
 		while (now >= state->nextprof) {
 			state->nextprof += profperiod;
 			runs++;
 		}
 		/* profclock needs a real frame to read the PC from. */
 		if (runs && !fake) {
 			profclock_cnt(runs, usermode, TRAPF_PC(frame));
 			done = 1;
 		}
 	} else
 		/* Keep the profiling deadline in step while it is unused. */
 		state->nextprof = state->nextstat;
 	if (now >= state->nextcallopt) {
 		/* Clear both callout deadlines before processing callouts. */
 		state->nextcall = state->nextcallopt = INT64_MAX;
 		callout_process(now);
 	}
 
 #ifdef KDTRACE_HOOKS
 	if (fake == 0 && now >= state->nextcyc && cyclic_clock_func != NULL) {
 		state->nextcyc = INT64_MAX;
 		(*cyclic_clock_func)(frame);
 	}
 #endif
 
 	/* Compute the next deadline and reprogram the timer if allowed. */
 	t = getnextcpuevent(0);
 	ET_HW_LOCK(state);
 	if (!busy) {
 		state->idle = 0;
 		state->nextevent = t;
 		loadtimer(now, 0);
 	}
 	ET_HW_UNLOCK(state);
 	return (done);
 }
 
 /*
  * Compute the time of the next event on the current CPU.
  *
  * 'idle' is non-zero when the CPU is going idle: the hardclock deadline
  * is then pushed further out and the statclock/profclock deadlines are
  * not considered.  Returns the earliest of the remaining deadlines.
  */
 static sbintime_t
 getnextcpuevent(int idle)
 {
 	sbintime_t event;
 	struct pcpu_state *state;
 	u_int hardfreq;
 
 	state = DPCPU_PTR(timerstate);
 	/* Handle hardclock() events, skipping some if CPU is idle. */
 	event = state->nexthard;
 	if (idle) {
 		/* hardfreq is the number of tick periods to sleep across. */
 		hardfreq = (u_int)hz / 2;
 		if (tc_min_ticktock_freq > 2
 #ifdef SMP
 		    && curcpu == CPU_FIRST()
 #endif
 		    )
 			hardfreq = hz / tc_min_ticktock_freq;
 		if (hardfreq > 1)
 			event += tick_sbt * (hardfreq - 1);
 	}
 	/* Handle callout events. */
 	if (event > state->nextcall)
 		event = state->nextcall;
 	if (!idle) { /* If CPU is active - handle other types of events. */
 		if (event > state->nextstat)
 			event = state->nextstat;
 		if (profiling && event > state->nextprof)
 			event = state->nextprof;
 	}
 #ifdef KDTRACE_HOOKS
 	if (event > state->nextcyc)
 		event = state->nextcyc;
 #endif
 	return (event);
 }
 
 /*
  * Compute the time of the next event the hardware timer must fire for.
  *
  * Starts from the current CPU's nextevent.  On SMP kernels with a
  * non-per-CPU timer, the minimum over all CPUs is taken instead.
  */
 static sbintime_t
 getnextevent(void)
 {
 	struct pcpu_state *state;
 	sbintime_t event;
 #ifdef SMP
 	int	cpu;
 #endif
 	int	c;
 
 	state = DPCPU_PTR(timerstate);
 	event = state->nextevent;
 	/* 'c' names the CPU owning the earliest event; used for tracing only. */
 	c = -1;
 #ifdef SMP
 	if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) {
 		CPU_FOREACH(cpu) {
 			state = DPCPU_ID_PTR(cpu, timerstate);
 			if (event > state->nextevent) {
 				event = state->nextevent;
 				c = cpu;
 			}
 		}
 	}
 #endif
 	CTR4(KTR_SPARE2, "next at %d:    next %d.%08x by %d",
 	    curcpu, (int)(event >> 32), (u_int)(event & 0xffffffff), c);
 	return (event);
 }
 
 /*
  * Hardware timer callback function.
  *
  * Registered with et_init() as the event timer callback.  Records the
  * current time, handles the events due on this CPU and, for a
  * non-per-CPU timer on SMP, sends IPI_HARDCLOCK to every other CPU that
  * has an event due.  'arg' is unused.
  */
 static void
 timercb(struct eventtimer *et, void *arg)
 {
 	sbintime_t now;
 	sbintime_t *next;
 	struct pcpu_state *state;
 #ifdef SMP
 	int cpu, bcast;
 #endif
 
 	/* Do not touch anything if somebody reconfiguring timers. */
 	if (busy)
 		return;
 	/* Update present and next tick times. */
 	state = DPCPU_PTR(timerstate);
 	if (et->et_flags & ET_FLAGS_PERCPU) {
 		next = &state->nexttick;
 	} else
 		next = &nexttick;
 	now = sbinuptime();
 	if (periodic)
 		*next = now + timerperiod;
 	else
 		*next = -1;	/* Next tick is not scheduled yet. */
 	state->now = now;
 	CTR3(KTR_SPARE2, "intr at %d:    now  %d.%08x",
 	    curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
 
 #ifdef SMP
 	/* Prepare broadcasting to other CPUs for non-per-CPU timers. */
 	bcast = 0;
 	if ((et->et_flags & ET_FLAGS_PERCPU) == 0 && smp_started) {
 		CPU_FOREACH(cpu) {
 			state = DPCPU_ID_PTR(cpu, timerstate);
 			ET_HW_LOCK(state);
 			state->now = now;
 			if (now >= state->nextevent) {
 				/*
 				 * NOTE(review): presumably a placeholder one
 				 * second ahead until that CPU's handleevents()
 				 * stores its real next event -- confirm.
 				 */
 				state->nextevent += SBT_1S;
 				if (curcpu != cpu) {
 					state->ipi = 1;
 					bcast = 1;
 				}
 			}
 			ET_HW_UNLOCK(state);
 		}
 	}
 #endif
 
 	/* Handle events for this time on this CPU. */
 	handleevents(now, 0);
 
 #ifdef SMP
 	/* Broadcast interrupt to other CPUs for non-per-CPU timers. */
 	if (bcast) {
 		CPU_FOREACH(cpu) {
 			if (curcpu == cpu)
 				continue;
 			state = DPCPU_ID_PTR(cpu, timerstate);
 			if (state->ipi) {
 				state->ipi = 0;
 				ipi_cpu(cpu, IPI_HARDCLOCK);
 			}
 		}
 	}
 #endif
 }
 
 /*
  * Load new value into hardware timer.
  *
  * 'now' is the current time.  In periodic mode the timer is programmed
  * only when 'start' is non-zero.  In one-shot mode it is programmed for
  * the time returned by getnextevent(), unless that equals the tick time
  * already recorded.
  */
 static void
 loadtimer(sbintime_t now, int start)
 {
 	struct pcpu_state *state;
 	sbintime_t new;
 	sbintime_t *next;
 	uint64_t tmp;
 	int eq;
 
 	/* Choose the per-CPU or the global next-tick record. */
 	if (timer->et_flags & ET_FLAGS_PERCPU) {
 		state = DPCPU_PTR(timerstate);
 		next = &state->nexttick;
 	} else
 		next = &nexttick;
 	if (periodic) {
 		if (start) {
 			/*
 			 * Try to start all periodic timers aligned
 			 * to period to make events synchronous.
 			 */
 			tmp = now % timerperiod;
 			new = timerperiod - tmp;
 			if (new < tmp)		/* Left less than passed. */
 				new += timerperiod;
 			CTR5(KTR_SPARE2, "load p at %d:   now %d.%08x first in %d.%08x",
 			    curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff),
 			    (int)(new >> 32), (u_int)(new & 0xffffffff));
 			*next = new + now;
 			et_start(timer, new, timerperiod);
 		}
 	} else {
 		new = getnextevent();
 		eq = (new == *next);
 		CTR4(KTR_SPARE2, "load at %d:    next %d.%08x eq %d",
 		    curcpu, (int)(new >> 32), (u_int)(new & 0xffffffff), eq);
 		/* Reprogram only when the deadline actually changed. */
 		if (!eq) {
 			*next = new;
 			et_start(timer, new - now, 0);
 		}
 	}
 }
 
 /*
  * Recompute the operating mode and period after a configuration change:
  * fall back to the mode the timer supports, clamp singlemul to [1, 20]
  * and derive timerperiod from the resulting frequency.
  */
 static void
 setuptimer(void)
 {
 	int freq, wanted;
 
 	/* Switch modes only when the timer lacks the current one. */
 	if (periodic) {
 		if ((timer->et_flags & ET_FLAGS_PERIODIC) == 0)
 			periodic = 0;
 	} else {
 		if ((timer->et_flags & ET_FLAGS_ONESHOT) == 0)
 			periodic = 1;
 	}
 	singlemul = MIN(MAX(singlemul, 1), 20);
 
 	/* Smallest multiple of hz not below the stat/prof rate. */
 	wanted = profiling ? profhz : stathz;
 	for (freq = hz * singlemul; freq < wanted; freq += hz)
 		continue;
 	freq = round_freq(timer, freq);
 	timerperiod = SBT_1S / freq;
 }
 
 /*
  * Reconfigure specified per-CPU timer on other CPU. Called from IPI handler.
  *
  * Services the request stored in this CPU's state: action 1 starts the
  * timer, action 2 stops it; otherwise a pending 'handle' request causes
  * events to be handled immediately.  Returns 1 if a request was
  * serviced, 0 if there was nothing to do.
  */
 static int
 doconfigtimer(void)
 {
 	sbintime_t now;
 	struct pcpu_state *state;
 
 	state = DPCPU_PTR(timerstate);
 	switch (atomic_load_acq_int(&state->action)) {
 	case 1:
 		now = sbinuptime();
 		ET_HW_LOCK(state);
 		loadtimer(now, 1);
 		ET_HW_UNLOCK(state);
 		state->handle = 0;
 		/* Clearing 'action' tells configtimer() this CPU is done. */
 		atomic_store_rel_int(&state->action, 0);
 		return (1);
 	case 2:
 		ET_HW_LOCK(state);
 		et_stop(timer);
 		ET_HW_UNLOCK(state);
 		state->handle = 0;
 		atomic_store_rel_int(&state->action, 0);
 		return (1);
 	}
 	/* The handle request is consumed even if 'busy' suppresses it. */
 	if (atomic_readandclear_int(&state->handle) && !busy) {
 		now = sbinuptime();
 		handleevents(now, 0);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Reconfigure specified timer.
  * For per-CPU timers use IPI to make other CPUs to reconfigure.
  *
  * A non-zero 'start' recomputes the parameters, reinitializes the state
  * of every CPU, clears 'busy' and starts the timer.  A zero 'start' sets
  * 'busy' and stops the timer.
  */
 static void
 configtimer(int start)
 {
 	sbintime_t now, next;
 	struct pcpu_state *state;
 	int cpu;
 
 	if (start) {
 		setuptimer();
 		now = sbinuptime();
 	} else
 		now = 0;
 	critical_enter();
 	ET_HW_LOCK(DPCPU_PTR(timerstate));
 	if (start) {
 		/* Initialize time machine parameters. */
 		next = now + timerperiod;
 		if (periodic)
 			nexttick = next;
 		else
 			nexttick = -1;
 		CPU_FOREACH(cpu) {
 			state = DPCPU_ID_PTR(cpu, timerstate);
 			state->now = now;
 			/* Before SMP is started only the first CPU gets events. */
 			if (!smp_started && cpu != CPU_FIRST())
 				state->nextevent = INT64_MAX;
 			else
 				state->nextevent = next;
 			if (periodic)
 				state->nexttick = next;
 			else
 				state->nexttick = -1;
 			state->nexthard = next;
 			state->nextstat = next;
 			state->nextprof = next;
 			state->nextcall = next;
 			state->nextcallopt = next;
 			hardclock_sync(cpu);
 		}
 		busy = 0;
 		/* Start global timer or per-CPU timer of this CPU. */
 		loadtimer(now, 1);
 	} else {
 		busy = 1;
 		/* Stop global timer or per-CPU timer of this CPU. */
 		et_stop(timer);
 	}
 	ET_HW_UNLOCK(DPCPU_PTR(timerstate));
 #ifdef SMP
 	/* If timer is global or there is no other CPUs yet - we are done. */
 	if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || !smp_started) {
 		critical_exit();
 		return;
 	}
 	/* Set reconfigure flags for other CPUs. */
 	CPU_FOREACH(cpu) {
 		state = DPCPU_ID_PTR(cpu, timerstate);
 		/* Action 1 asks the CPU to start its timer, 2 to stop it. */
 		atomic_store_rel_int(&state->action,
 		    (cpu == curcpu) ? 0 : ( start ? 1 : 2));
 	}
 	/* Broadcast reconfigure IPI. */
 	ipi_all_but_self(IPI_HARDCLOCK);
 	/* Wait for reconfiguration completed. */
 restart:
 	cpu_spinwait();
 	CPU_FOREACH(cpu) {
 		if (cpu == curcpu)
 			continue;
 		state = DPCPU_ID_PTR(cpu, timerstate);
 		/* doconfigtimer() on that CPU clears 'action' when done. */
 		if (atomic_load_acq_int(&state->action))
 			goto restart;
 	}
 #endif
 	critical_exit();
 }
 
 /*
  * Calculate nearest frequency supported by hardware timer.
  *
  * 'et' is the event timer and 'freq' the wanted frequency.  When the
  * timer reports a base frequency, 'freq' is replaced by the base
  * frequency divided by the nearest integer divisor (a power of two if
  * the timer has ET_FLAGS_POW2DIV).  The result is then limited by the
  * timer's minimum and maximum periods.  Panics if the timer cannot
  * produce sub-second periods.
  */
 static int
 round_freq(struct eventtimer *et, int freq)
 {
 	uint64_t div;
 
 	if (et->et_frequency != 0) {
 		div = lmax((et->et_frequency + freq / 2) / freq, 1);
 		/*
 		 * Shift a 64-bit one: 'div' is 64 bits wide, and shifting a
 		 * plain int by 31 or more bits is undefined behavior.
 		 */
 		if (et->et_flags & ET_FLAGS_POW2DIV)
 			div = (uint64_t)1 << (flsl(div + div / 2) - 1);
 		freq = (et->et_frequency + div / 2) / div;
 	}
 	if (et->et_min_period > SBT_1S)
 		panic("Event timer \"%s\" doesn't support sub-second periods!",
 		    et->et_name);
 	else if (et->et_min_period != 0)
 		freq = min(freq, SBT2FREQ(et->et_min_period));
 	/* A maximum period of a second or more imposes no lower bound. */
 	if (et->et_max_period < SBT_1S && et->et_max_period != 0)
 		freq = max(freq, SBT2FREQ(et->et_max_period));
 	return (freq);
 }
 
 /*
  * Configure and start event timers (BSP part).
  *
  * Initializes the timer mutexes and per-CPU state, selects an event
  * timer, derives hz, stathz and profhz together with the matching
  * periods, and starts the timer through configtimer().  Panics if no
  * usable event timer is found.
  */
 void
 cpu_initclocks_bsp(void)
 {
 	struct pcpu_state *state;
 	int base, div, cpu;
 
 	mtx_init(&et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN);
 	CPU_FOREACH(cpu) {
 		state = DPCPU_ID_PTR(cpu, timerstate);
 		mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN);
 #ifdef KDTRACE_HOOKS
 		state->nextcyc = INT64_MAX;
 #endif
 		/* No callouts are scheduled yet. */
 		state->nextcall = INT64_MAX;
 		state->nextcallopt = INT64_MAX;
 	}
 	periodic = want_periodic;
 	/*
 	 * Grab requested timer or the best of present.  Search order: the
 	 * timer named by the tunable, then a timer of the preferred mode,
 	 * then a timer of the other mode.
 	 */
 	if (timername[0])
 		timer = et_find(timername, 0, 0);
 	if (timer == NULL && periodic) {
 		timer = et_find(NULL,
 		    ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC);
 	}
 	if (timer == NULL) {
 		timer = et_find(NULL,
 		    ET_FLAGS_ONESHOT, ET_FLAGS_ONESHOT);
 	}
 	if (timer == NULL && !periodic) {
 		timer = et_find(NULL,
 		    ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC);
 	}
 	if (timer == NULL)
 		panic("No usable event timer found!");
 	et_init(timer, timercb, NULL, NULL);
 
 	/* Adapt to timer capabilities. */
 	if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0)
 		periodic = 0;
 	else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0)
 		periodic = 1;
 	if (timer->et_flags & ET_FLAGS_C3STOP)
 		cpu_disable_deep_sleep++;
 
 	/*
 	 * We honor the requested 'hz' value.
 	 * We want to run stathz in the neighborhood of 128hz.
 	 * We would like profhz to run as often as possible.
 	 */
 	/* Pick a default multiplier when the tunable is out of range. */
 	if (singlemul <= 0 || singlemul > 20) {
 		if (hz >= 1500 || (hz % 128) == 0)
 			singlemul = 1;
 		else if (hz >= 750)
 			singlemul = 2;
 		else
 			singlemul = 4;
 	}
 	if (periodic) {
 		/* All rates are derived from one base interrupt rate. */
 		base = round_freq(timer, hz * singlemul);
 		singlemul = max((base + hz / 2) / hz, 1);
 		hz = (base + singlemul / 2) / singlemul;
 		if (base <= 128)
 			stathz = base;
 		else {
 			div = base / 128;
 			if (div >= singlemul && (div % singlemul) == 0)
 				div++;
 			stathz = base / div;
 		}
 		profhz = stathz;
 		while ((profhz + stathz) <= 128 * 64)
 			profhz += stathz;
 		profhz = round_freq(timer, profhz);
 	} else {
 		/* In one-shot mode each rate is rounded independently. */
 		hz = round_freq(timer, hz);
 		stathz = round_freq(timer, 127);
 		profhz = round_freq(timer, stathz * 64);
 	}
 	tick = 1000000 / hz;
 	tick_sbt = SBT_1S / hz;
 	tick_bt = sbttobt(tick_sbt);
 	statperiod = SBT_1S / stathz;
 	profperiod = SBT_1S / profhz;
 	ET_LOCK();
 	configtimer(1);
 	ET_UNLOCK();
 }
 
 /*
  * Start per-CPU event timers on APs.
  *
  * Records the current time in this CPU's state, resynchronizes its tick
  * counter and calls handleevents() with fake == 2, which advances the
  * event deadlines and loads the timer without calling hardclock,
  * statclock or profclock.
  */
 void
 cpu_initclocks_ap(void)
 {
 	sbintime_t now;
 	struct pcpu_state *state;
 	struct thread *td;
 
 	state = DPCPU_PTR(timerstate);
 	now = sbinuptime();
 	ET_HW_LOCK(state);
 	state->now = now;
 	hardclock_sync(curcpu);
 	/* spinlock_enter() is called before the timer lock is dropped. */
 	spinlock_enter();
 	ET_HW_UNLOCK(state);
 	td = curthread;
 	/* The nesting level is raised only around handleevents(). */
 	td->td_intr_nesting_level++;
 	handleevents(state->now, 2);
 	td->td_intr_nesting_level--;
 	spinlock_exit();
 }
 
 /*
  * Switch to profiling clock rates.
  *
  * 'profiling' counts the outstanding start requests.  On the first one
  * a periodic timer is stopped and restarted around the change so that
  * its period is recomputed; a one-shot timer needs no reconfiguration.
  */
 void
 cpu_startprofclock(void)
 {
 
 	ET_LOCK();
 	if (profiling != 0) {
 		profiling++;
 	} else if (periodic) {
 		configtimer(0);
 		profiling = 1;
 		configtimer(1);
 	} else {
 		profiling = 1;
 	}
 	ET_UNLOCK();
 }
 
 /*
  * Switch to regular clock rates.
  *
  * Drops one profiling request.  When the last one goes away a periodic
  * timer is stopped and restarted around the change so that its period
  * is recomputed; a one-shot timer needs no reconfiguration.
  */
 void
 cpu_stopprofclock(void)
 {
 
 	ET_LOCK();
 	if (profiling != 1) {
 		profiling--;
 	} else if (periodic) {
 		configtimer(0);
 		profiling = 0;
 		configtimer(1);
 	} else {
 		profiling = 0;
 	}
 	ET_UNLOCK();
 }
 
 /*
  * Switch to idle mode (all ticks handled).
  *
  * Returns -1 when idle mode is not entered: idletick is set, a
  * reconfiguration is in progress, the timer is periodic and per-CPU, or
  * (with DEVICE_POLLING) this is the first CPU.  Otherwise marks the CPU
  * idle, schedules the next event computed for an idle CPU and returns
  * the time remaining until it, never less than zero.
  */
 sbintime_t
 cpu_idleclock(void)
 {
 	sbintime_t now, t;
 	struct pcpu_state *state;
 
 	if (idletick || busy ||
 	    (periodic && (timer->et_flags & ET_FLAGS_PERCPU))
 #ifdef DEVICE_POLLING
 	    || curcpu == CPU_FIRST()
 #endif
 	    )
 		return (-1);
 	state = DPCPU_PTR(timerstate);
 	/* In periodic mode the time of the last tick is used as 'now'. */
 	if (periodic)
 		now = state->now;
 	else
 		now = sbinuptime();
 	CTR3(KTR_SPARE2, "idle at %d:    now  %d.%08x",
 	    curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
 	t = getnextcpuevent(1);
 	ET_HW_LOCK(state);
 	state->idle = 1;
 	state->nextevent = t;
 	if (!periodic)
 		loadtimer(now, 0);
 	ET_HW_UNLOCK(state);
 	return (MAX(t - now, 0));
 }
 
 /*
  * Switch to active mode (skip empty ticks).
  *
  * Does nothing unless this CPU is marked idle and no reconfiguration is
  * in progress.  Otherwise calls handleevents() with fake == 1, which
  * accounts the elapsed hardclock and statclock ticks without an
  * interrupt frame and clears the idle flag.
  */
 void
 cpu_activeclock(void)
 {
 	sbintime_t now;
 	struct pcpu_state *state;
 	struct thread *td;
 
 	state = DPCPU_PTR(timerstate);
 	if (state->idle == 0 || busy)
 		return;
 	/* In periodic mode the time of the last tick is used as 'now'. */
 	if (periodic)
 		now = state->now;
 	else
 		now = sbinuptime();
 	CTR3(KTR_SPARE2, "active at %d:  now  %d.%08x",
 	    curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
 	spinlock_enter();
 	td = curthread;
 	/* The nesting level is raised only around handleevents(). */
 	td->td_intr_nesting_level++;
 	handleevents(now, 1);
 	td->td_intr_nesting_level--;
 	spinlock_exit();
 }
 
 #ifdef KDTRACE_HOOKS
 /*
  * Set the time of the next cyclic event on the current CPU to '*bt'.
  *
  * If the new time is earlier than the CPU's next scheduled event, that
  * event is moved up and a one-shot timer is reprogrammed.  Does nothing
  * while a reconfiguration is in progress.
  */
 void
 clocksource_cyc_set(const struct bintime *bt)
 {
 	sbintime_t now, t;
 	struct pcpu_state *state;
 
 	/* Do not touch anything if somebody reconfiguring timers. */
 	if (busy)
 		return;
 	t = bttosbt(*bt);
 	state = DPCPU_PTR(timerstate);
 	if (periodic)
 		now = state->now;
 	else
 		now = sbinuptime();
 
 	CTR5(KTR_SPARE2, "set_cyc at %d:  now  %d.%08x  t  %d.%08x",
 	    curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff),
 	    (int)(t >> 32), (u_int)(t & 0xffffffff));
 
 	ET_HW_LOCK(state);
 	/* Nothing to do if the cyclic time is unchanged. */
 	if (t == state->nextcyc)
 		goto done;
 	state->nextcyc = t;
 	/* An earlier event is already scheduled; the timer stays as is. */
 	if (t >= state->nextevent)
 		goto done;
 	state->nextevent = t;
 	if (!periodic)
 		loadtimer(now, 0);
 done:
 	ET_HW_UNLOCK(state);
 }
 #endif
 
 /*
  * Record a new callout deadline for CPU 'cpu'.
  *
  * 'bt' is the callout time and 'bt_opt' the optional (earlier) callout
  * time.  If 'bt' becomes the earliest event of that CPU, a one-shot
  * timer is reprogrammed: directly when the timer is global or belongs
  * to the current CPU, otherwise by asking the target CPU through
  * IPI_HARDCLOCK.  Does nothing while a reconfiguration is in progress.
  */
 void
 cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt)
 {
 	struct pcpu_state *state;
 
 	/* Do not touch anything if somebody reconfiguring timers. */
 	if (busy)
 		return;
 	CTR6(KTR_SPARE2, "new co at %d:    on %d at %d.%08x - %d.%08x",
 	    curcpu, cpu, (int)(bt_opt >> 32), (u_int)(bt_opt & 0xffffffff),
 	    (int)(bt >> 32), (u_int)(bt & 0xffffffff));
 	state = DPCPU_ID_PTR(cpu, timerstate);
 	ET_HW_LOCK(state);
 
 	/*
 	 * If there is callout time already set earlier -- do nothing.
 	 * This check may appear redundant because we check already in
 	 * callout_process() but this double check guarantees we're safe
 	 * with respect to race conditions between interrupts execution
 	 * and scheduling.
 	 */
 	state->nextcallopt = bt_opt;
 	if (bt >= state->nextcall)
 		goto done;
 	state->nextcall = bt;
 	/* If there is some other event set earlier -- do nothing. */
 	if (bt >= state->nextevent)
 		goto done;
 	state->nextevent = bt;
 	/* If timer is periodic -- there is nothing to reprogram. */
 	if (periodic)
 		goto done;
 	/* If timer is global or of the current CPU -- reprogram it. */
 	if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) {
 		loadtimer(sbinuptime(), 0);
 		/* The early exits above jump here to unlock and return. */
 done:
 		ET_HW_UNLOCK(state);
 		return;
 	}
 	/* Otherwise make other CPU to reprogram it. */
 	state->handle = 1;
 	ET_HW_UNLOCK(state);
 #ifdef SMP
 	ipi_cpu(cpu, IPI_HARDCLOCK);
 #endif
 }
 
 /*
  * Report or change the active event timers hardware.
  *
  * Returns the name of the current timer.  When a new name is written
  * that differs (case-insensitively) from the current one, the named
  * timer is looked up, the current timer is stopped and released, and
  * the new one is initialized and started.  Returns ENOENT if no timer
  * has the requested name.
  */
 static int
 sysctl_kern_eventtimer_timer(SYSCTL_HANDLER_ARGS)
 {
 	char buf[32];
 	struct eventtimer *et;
 	int error;
 
 	/* Copy the current name out under the lock. */
 	ET_LOCK();
 	et = timer;
 	snprintf(buf, sizeof(buf), "%s", et->et_name);
 	ET_UNLOCK();
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	/* The lock was dropped above, so 'timer' is read again. */
 	ET_LOCK();
 	et = timer;
 	if (error != 0 || req->newptr == NULL ||
 	    strcasecmp(buf, et->et_name) == 0) {
 		ET_UNLOCK();
 		return (error);
 	}
 	et = et_find(buf, 0, 0);
 	if (et == NULL) {
 		ET_UNLOCK();
 		return (ENOENT);
 	}
 	configtimer(0);
 	et_free(timer);
 	/* Account the new timer's C3STOP flag, then drop the old one's. */
 	if (et->et_flags & ET_FLAGS_C3STOP)
 		cpu_disable_deep_sleep++;
 	if (timer->et_flags & ET_FLAGS_C3STOP)
 		cpu_disable_deep_sleep--;
 	periodic = want_periodic;
 	timer = et;
 	et_init(timer, timercb, NULL, NULL);
 	configtimer(1);
 	ET_UNLOCK();
 	return (error);
 }
 SYSCTL_PROC(_kern_eventtimer, OID_AUTO, timer,
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_eventtimer_timer, "A", "Chosen event timer");
 
 /*
  * Report or change the active event timer periodicity.
  *
  * Returns the current value of 'periodic'.  When a new value is
  * written, the timer is stopped, both the active and the preferred mode
  * are set to it, and the timer is started again.
  */
 static int
 sysctl_kern_eventtimer_periodic(SYSCTL_HANDLER_ARGS)
 {
 	int newval, rc;
 
 	newval = periodic;
 	rc = sysctl_handle_int(oidp, &newval, 0, req);
 	if (rc == 0 && req->newptr != NULL) {
 		ET_LOCK();
 		configtimer(0);
 		want_periodic = newval;
 		periodic = newval;
 		configtimer(1);
 		ET_UNLOCK();
 	}
 	return (rc);
 }
 SYSCTL_PROC(_kern_eventtimer, OID_AUTO, periodic,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_eventtimer_periodic, "I", "Enable event timer periodic mode");
Index: head/sys/kern/kern_exec.c
===================================================================
--- head/sys/kern/kern_exec.c	(revision 258540)
+++ head/sys/kern/kern_exec.c	(revision 258541)
@@ -1,1510 +1,1509 @@
 /*-
  * Copyright (c) 1993, David Greenman
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 #include "opt_ktrace.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/capability.h>
 #include <sys/systm.h>
 #include <sys/capability.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/kernel.h>
 #include <sys/mount.h>
 #include <sys/filedesc.h>
 #include <sys/fcntl.h>
 #include <sys/acct.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/wait.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/namei.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/sf_buf.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/shm.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/stat.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #include <machine/reg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 dtrace_execexit_func_t	dtrace_fasttrap_exec;
 #endif
 
 /* SDT probes of the "proc" provider that do_execve() fires below. */
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE1(proc, kernel, , exec, exec, "char *");
 SDT_PROBE_DEFINE1(proc, kernel, , exec_failure, exec-failure, "int");
 SDT_PROBE_DEFINE1(proc, kernel, , exec_success, exec-success, "char *");
 
 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
 
 /* Forward declarations for static functions defined later in this file. */
 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
 static int do_execve(struct thread *td, struct image_args *args,
     struct mac *mac_p);
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
     NULL, 0, sysctl_kern_ps_strings, "LU", "");
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD|
     CTLFLAG_CAPRD, NULL, 0, sysctl_kern_usrstack, "LU", "");
 
 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
     NULL, 0, sysctl_kern_stackprot, "I", "");
 
 /*
  * Upper bound, in bytes, on the argument strings plus the struct pargs
  * header that do_execve() will cache in p_args.
  */
 u_long ps_arg_cache_limit = PAGE_SIZE / 16;
 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 
     &ps_arg_cache_limit, 0, "");
 
 /*
  * When non-zero, do_execve() fails with ENOEXEC for an image whose p_osrel
  * major version is greater than that of __FreeBSD_version.
  */
 static int disallow_high_osrel;
 SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW,
     &disallow_high_osrel, 0,
     "Disallow execution of binaries built for higher version of the world");
 
 /*
  * When zero, exec_new_vmspace() raises the minimum user address of the new
  * address space to at least PAGE_SIZE.
  */
 static int map_at_zero = 0;
 TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero);
 SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0,
     "Permit processes to map an object at virtual address 0.");
 
 static int
 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_psstrings;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
 		   sizeof(p->p_sysent->sv_psstrings));
 	return error;
 }
 
 static int
 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_usrstack;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
 		    sizeof(p->p_sysent->sv_usrstack));
 	return error;
 }
 
 static int
 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 
 	p = curproc;
 	return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
 	    sizeof(p->p_sysent->sv_stackprot)));
 }
 
 /*
  * Table of image activators consulted by do_execve().  Each of the items
  * is a pointer to a `const struct execsw', hence the double pointer here.
  * do_execve() walks the table until it reaches a NULL entry.
  */
 static const struct execsw **execsw;
 
 #ifndef _SYS_SYSPROTO_H_
 struct execve_args {
 	char    *fname; 
 	char    **argv;
 	char    **envv; 
 };
 #endif
 
 int
 sys_execve(td, uap)
 	struct thread *td;
 	struct execve_args /* {
 		char *fname;
 		char **argv;
 		char **envv;
 	} */ *uap;
 {
 	int error;
 	struct image_args args;
 
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &args, NULL);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 /*
  * Arguments of the fexecve system call as read by sys_fexecve(): the
  * descriptor of the image plus the argument and environment vectors.
  * The definition must end with a semicolon, otherwise the function that
  * follows fails to compile whenever this fallback is enabled.
  */
 struct fexecve_args {
 	int	fd;
 	char	**argv;
 	char	**envv;
 };
 #endif
 int
 sys_fexecve(struct thread *td, struct fexecve_args *uap)
 {
 	int error;
 	struct image_args args;
 
 	error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0) {
 		args.fd = uap->fd;
 		error = kern_execve(td, &args, NULL);
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct __mac_execve_args {
 	char	*fname;
 	char	**argv;
 	char	**envv;
 	struct mac	*mac_p;
 };
 #endif
 
 /*
  * __mac_execve system call entry point.  With MAC compiled in it behaves
  * like sys_execve() but also passes the caller-supplied mac_p on to
  * kern_execve(); without MAC it returns ENOSYS.
  */
 int
 sys___mac_execve(struct thread *td, struct __mac_execve_args *uap)
 {
 #ifdef MAC
 	struct image_args iargs;
 	int error;
 
 	error = exec_copyin_args(&iargs, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error != 0)
 		return (error);
 	return (kern_execve(td, &iargs, uap->mac_p));
 #else
 	return (ENOSYS);
 #endif
 }
 
 /*
  * XXX: kern_execve has the astonishing property of not always returning to
  * the caller.  If sufficiently bad things happen during the call to
  * do_execve(), it can end up calling exit1(); as a result, callers must
  * avoid doing anything which they might need to undo (e.g., allocating
  * memory).
  */
 int
 kern_execve(td, args, mac_p)
 	struct thread *td;
 	struct image_args *args;
 	struct mac *mac_p;
 {
 	struct proc *p = td->td_proc;
 	int error;
 
 	AUDIT_ARG_ARGV(args->begin_argv, args->argc,
 	    args->begin_envv - args->begin_argv);
 	AUDIT_ARG_ENVV(args->begin_envv, args->envc,
 	    args->endp - args->begin_envv);
 	if (p->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p);
 		if (thread_single(SINGLE_BOUNDARY)) {
 			PROC_UNLOCK(p);
 	       		exec_free_args(args);
 			return (ERESTART);	/* Try again later. */
 		}
 		PROC_UNLOCK(p);
 	}
 
 	error = do_execve(td, args, mac_p);
 
 	if (p->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p);
 		/*
 		 * If success, we upgrade to SINGLE_EXIT state to
 		 * force other threads to suicide.
 		 */
 		if (error == 0)
 			thread_single(SINGLE_EXIT);
 		else
 			thread_single_end();
 		PROC_UNLOCK(p);
 	}
 
 	return (error);
 }
 
 /*
  * In-kernel implementation of execve().  All arguments are assumed to be
  * userspace pointers from the passed thread.
  * NOTE(review): args->fname is handed to namei() as UIO_SYSSPACE below, so
  * the sentence above looks stale for the path name -- confirm.
  *
  * P_INEXEC is set on the process for the duration of the exec and cleared
  * again on both the success and the failure paths.  The image_args are
  * always released with exec_free_args() before leaving.  Returns 0 on
  * success or an errno value on failure; if the failure happens after
  * exec_new_vmspace() has marked the old address space as destroyed
  * (imgp->vmspace_destroyed), the process is terminated with exit1()
  * instead of returning.
  */
 static int
 do_execve(td, args, mac_p)
 	struct thread *td;
 	struct image_args *args;
 	struct mac *mac_p;
 {
 	struct proc *p = td->td_proc;
 	struct nameidata nd;
 	struct ucred *newcred = NULL, *oldcred;
 	struct uidinfo *euip;
 	register_t *stack_base;
 	int error, i;
 	struct image_params image_params, *imgp;
 	struct vattr attr;
 	int (*img_first)(struct image_params *);
 	struct pargs *oldargs = NULL, *newargs = NULL;
 	struct sigacts *oldsigacts, *newsigacts;
 #ifdef KTRACE
 	struct vnode *tracevp = NULL;
 	struct ucred *tracecred = NULL;
 #endif
 	struct vnode *textvp = NULL, *binvp = NULL;
 	cap_rights_t rights;
 	int credential_changing;
 	int textset;
 #ifdef MAC
 	struct label *interpvplabel = NULL;
 	int will_transition;
 #endif
 #ifdef HWPMC_HOOKS
 	struct pmckern_procexec pe;
 #endif
 	static const char fexecv_proc_title[] = "(fexecv)";
 
 	imgp = &image_params;
 
 	/*
 	 * Lock the process and set the P_INEXEC flag to indicate that
 	 * it should be left alone until we're done here.  This is
 	 * necessary to avoid race conditions - e.g. in ptrace() -
 	 * that might allow a local user to illicitly obtain elevated
 	 * privileges.
 	 */
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_INEXEC) == 0,
 	    ("%s(): process already has P_INEXEC flag", __func__));
 	p->p_flag |= P_INEXEC;
 	PROC_UNLOCK(p);
 
 	/*
 	 * Initialize part of the common data
 	 */
 	imgp->proc = p;
 	imgp->execlabel = NULL;
 	imgp->attr = &attr;
 	imgp->entry_addr = 0;
 	imgp->reloc_base = 0;
 	imgp->vmspace_destroyed = 0;
 	imgp->interpreted = 0;
 	imgp->opened = 0;
 	imgp->interpreter_name = NULL;
 	imgp->auxargs = NULL;
 	imgp->vp = NULL;
 	imgp->object = NULL;
 	imgp->firstpage = NULL;
 	imgp->ps_strings = 0;
 	imgp->auxarg_size = 0;
 	imgp->args = args;
 	imgp->execpath = imgp->freepath = NULL;
 	imgp->execpathp = 0;
 	imgp->canary = 0;
 	imgp->canarylen = 0;
 	imgp->pagesizes = 0;
 	imgp->pagesizeslen = 0;
 	imgp->stack_prot = 0;
 
 #ifdef MAC
 	error = mac_execve_enter(imgp, mac_p);
 	if (error)
 		goto exec_fail;
 #endif
 
 	imgp->image_header = NULL;
 
 	/*
 	 * Translate the file name. namei() returns a vnode pointer
 	 *	in ni_vp among other things.
 	 *
 	 * XXXAUDIT: It would be desirable to also audit the name of the
 	 * interpreter if this is an interpreted binary.
 	 */
 	if (args->fname != NULL) {
 		NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
 		    | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
 	}
 
 	SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0 );
 
 	/*
 	 * Re-entered via "goto interpret" once an image activator has marked
 	 * the image as interpreted and args->fname names the interpreter.
 	 */
 interpret:
 	if (args->fname != NULL) {
 #ifdef CAPABILITY_MODE
 		/*
 		 * While capability mode can't reach this point via direct
 		 * path arguments to execve(), we also don't allow
 		 * interpreters to be used in capability mode (for now).
 		 * Catch indirect lookups and return a permissions error.
 		 */
 		if (IN_CAPABILITY_MODE(td)) {
 			error = ECAPMODE;
 			goto exec_fail;
 		}
 #endif
 		error = namei(&nd);
 		if (error)
 			goto exec_fail;
 
 		binvp  = nd.ni_vp;
 		imgp->vp = binvp;
 	} else {
 		AUDIT_ARG_FD(args->fd);
 		/*
 		 * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
 		 */
 		error = fgetvp_exec(td, args->fd,
 		    cap_rights_init(&rights, CAP_FEXECVE), &binvp);
 		if (error)
 			goto exec_fail;
 		vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY);
 		AUDIT_ARG_VNODE1(binvp);
 		imgp->vp = binvp;
 	}
 
 	/*
 	 * Check file permissions (also 'opens' file)
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->object = imgp->vp->v_object;
 	if (imgp->object != NULL)
 		vm_object_reference(imgp->object);
 
 	/*
 	 * Set VV_TEXT now so no one can write to the executable while we're
 	 * activating it.
 	 *
 	 * Remember if this was set before and unset it in case this is not
 	 * actually an executable image.
 	 */
 	textset = VOP_IS_TEXT(imgp->vp);
 	VOP_SET_TEXT(imgp->vp);
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->proc->p_osrel = 0;
 	/*
 	 *	If the current process has a special image activator it
 	 *	wants to try first, call it.   For example, emulating shell
 	 *	scripts differently.
 	 */
 	error = -1;
 	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
 		error = img_first(imgp);
 
 	/*
 	 *	Loop through the list of image activators, calling each one.
 	 *	An activator returns -1 if there is no match, 0 on success,
 	 *	and an error otherwise.
 	 */
 	for (i = 0; error == -1 && execsw[i]; ++i) {
 		if (execsw[i]->ex_imgact == NULL ||
 		    execsw[i]->ex_imgact == img_first) {
 			continue;
 		}
 		error = (*execsw[i]->ex_imgact)(imgp);
 	}
 
 	if (error) {
 		/* No activator matched: report ENOEXEC to the caller. */
 		if (error == -1) {
 			if (textset == 0)
 				VOP_UNSET_TEXT(imgp->vp);
 			error = ENOEXEC;
 		}
 		goto exec_fail_dealloc;
 	}
 
 	/*
 	 * Special interpreter operation, cleanup and loop up to try to
 	 * activate the interpreter.
 	 */
 	if (imgp->interpreted) {
 		exec_unmap_first_page(imgp);
 		/*
 		 * VV_TEXT needs to be unset for scripts.  There is a short
 		 * period before we determine that something is a script where
 		 * VV_TEXT will be set. The vnode lock is held over this
 		 * entire period so nothing should illegitimately be blocked.
 		 */
 		VOP_UNSET_TEXT(imgp->vp);
 		/* free name buffer and old vnode */
 		if (args->fname != NULL)
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 #ifdef MAC
 		mac_execve_interpreter_enter(binvp, &interpvplabel);
 #endif
 		if (imgp->opened) {
 			VOP_CLOSE(binvp, FREAD, td->td_ucred, td);
 			imgp->opened = 0;
 		}
 		vput(binvp);
 		vm_object_deallocate(imgp->object);
 		imgp->object = NULL;
 		/* set new name to that of the interpreter */
 		NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
 		    UIO_SYSSPACE, imgp->interpreter_name, td);
 		args->fname = imgp->interpreter_name;
 		goto interpret;
 	}
 
 	/*
 	 * NB: We unlock the vnode here because it is believed that none
 	 * of the sv_copyout_strings/sv_fixup operations require the vnode.
 	 */
 	VOP_UNLOCK(imgp->vp, 0);
 
 	/*
 	 * Do the best to calculate the full path to the image file.
 	 */
 	if (imgp->auxargs != NULL &&
 	    ((args->fname != NULL && args->fname[0] == '/') ||
 	     vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0))
 		imgp->execpath = args->fname;
 
 	if (disallow_high_osrel &&
 	    P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) {
 		error = ENOEXEC;
 		uprintf("Osrel %d for image %s too high\n", p->p_osrel,
 		    imgp->execpath != NULL ? imgp->execpath : "<unresolved>");
 		/* Relock the vnode: exec_fail_dealloc releases it with vput(). */
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		goto exec_fail_dealloc;
 	}
 
 	/*
 	 * Copy out strings (args and env) and initialize stack base
 	 */
 	if (p->p_sysent->sv_copyout_strings)
 		stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
 	else
 		stack_base = exec_copyout_strings(imgp);
 
 	/*
 	 * If custom stack fixup routine present for this process
 	 * let it do the stack setup.
 	 * Else stuff argument count as first item on stack
 	 */
 	if (p->p_sysent->sv_fixup != NULL)
 		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
 	else
 		suword(--stack_base, imgp->args->argc);
 
 	/*
 	 * For security and other reasons, the file descriptor table cannot
 	 * be shared after an exec.
 	 */
 	fdunshare(p, td);
 
 	/*
 	 * Malloc things before we need locks.
 	 */
 	newcred = crget();
 	euip = uifind(attr.va_uid);
 	i = imgp->args->begin_envv - imgp->args->begin_argv;
 	/* Cache arguments if they fit inside our allowance */
 	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
 		newargs = pargs_alloc(i);
 		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
 	}
 
 	/* close files on exec */
 	fdcloseexec(td);
 	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 
 	/* Get a reference to the vnode prior to locking the proc */
 	VREF(binvp);
 
 	/*
 	 * For security and other reasons, signal handlers cannot
 	 * be shared after an exec. The new process gets a copy of the old
 	 * handlers. In execsigs(), the new process will have its signals
 	 * reset.
 	 */
 	PROC_LOCK(p);
 	oldcred = crcopysafe(p, newcred);
 	if (sigacts_shared(p->p_sigacts)) {
 		oldsigacts = p->p_sigacts;
 		PROC_UNLOCK(p);
 		newsigacts = sigacts_alloc();
 		sigacts_copy(newsigacts, oldsigacts);
 		PROC_LOCK(p);
 		p->p_sigacts = newsigacts;
 	} else
 		oldsigacts = NULL;
 
 	/* Stop profiling */
 	stopprofclock(p);
 
 	/* reset caught signals */
 	execsigs(p);
 
 	/* name this process - nameiexec(p, ndp) */
 	bzero(p->p_comm, sizeof(p->p_comm));
 	if (args->fname)
 		bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
 		    min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
 	else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0)
 		bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
 	bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
 #ifdef KTR
 	sched_clear_tdname(td);
 #endif
 
 	/*
 	 * mark as execed, wakeup the process that vforked (if any) and tell
 	 * it that it now has its own resources back
 	 */
 	p->p_flag |= P_EXEC;
 	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
 		p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
 		cv_broadcast(&p->p_pwait);
 	}
 
 	/*
 	 * Implement image setuid/setgid.
 	 *
 	 * Don't honor setuid/setgid if the filesystem prohibits it or if
 	 * the process is being traced.
 	 *
 	 * We disable setuid/setgid/etc in compatibility mode on the basis
 	 * that most setugid applications are not written with that
 	 * environment in mind, and will therefore almost certainly operate
 	 * incorrectly. In principle there's no reason that setugid
 	 * applications might not be useful in capability mode, so we may want
 	 * to reconsider this conservative design choice in the future.
 	 *
 	 * XXXMAC: For the time being, use NOSUID to also prohibit
 	 * transitions on the file system.
 	 */
 	credential_changing = 0;
 	credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid !=
 	    attr.va_uid;
 	credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid !=
 	    attr.va_gid;
 #ifdef MAC
 	will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
 	    interpvplabel, imgp);
 	credential_changing |= will_transition;
 #endif
 
 	if (credential_changing &&
 #ifdef CAPABILITY_MODE
 	    ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
 #endif
 	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
 	    (p->p_flag & P_TRACED) == 0) {
 		/*
 		 * Turn off syscall tracing for set-id programs, except for
 		 * root.  Record any set-id flags first to make sure that
 		 * we do not regain any tracing during a possible block.
 		 */
 		setsugid(p);
 
 #ifdef KTRACE
 		if (p->p_tracecred != NULL &&
 		    priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0))
 			ktrprocexec(p, &tracecred, &tracevp);
 #endif
 		/*
 		 * Close any file descriptors 0..2 that reference procfs,
 		 * then make sure file descriptors 0..2 are in use.
 		 *
 		 * setugidsafety() may call closef() and then pfind()
 		 * which may grab the process lock.
 		 * fdcheckstd() may call falloc() which may block to
 		 * allocate memory, so temporarily drop the process lock.
 		 */
 		PROC_UNLOCK(p);
 		VOP_UNLOCK(imgp->vp, 0);
 		setugidsafety(td);
 		error = fdcheckstd(td);
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		/* The process lock is not held when jumping to done1 here. */
 		if (error != 0)
 			goto done1;
 		PROC_LOCK(p);
 		/*
 		 * Set the new credentials.
 		 */
 		if (attr.va_mode & S_ISUID)
 			change_euid(newcred, euip);
 		if (attr.va_mode & S_ISGID)
 			change_egid(newcred, attr.va_gid);
 #ifdef MAC
 		if (will_transition) {
 			mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
 			    interpvplabel, imgp);
 		}
 #endif
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXXMAC: Note that the current logic will save the
 		 * uid and gid if a MAC domain transition occurs, even
 		 * though maybe it shouldn't.
 		 */
 		change_svuid(newcred, newcred->cr_uid);
 		change_svgid(newcred, newcred->cr_gid);
 		p->p_ucred = newcred;
 		/* newcred == NULL tells done1 to release oldcred instead. */
 		newcred = NULL;
 	} else {
 		if (oldcred->cr_uid == oldcred->cr_ruid &&
 		    oldcred->cr_gid == oldcred->cr_rgid)
 			p->p_flag &= ~P_SUGID;
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXX: It's not clear that the existing behavior is
 		 * POSIX-compliant.  A number of sources indicate that the
 		 * saved uid/gid should only be updated if the new ruid is
 		 * not equal to the old ruid, or the new euid is not equal
 		 * to the old euid and the new euid is not equal to the old
 		 * ruid.  The FreeBSD code always updates the saved uid/gid.
 		 * Also, this code uses the new (replaced) euid and egid as
 		 * the source, which may or may not be the right ones to use.
 		 */
 		if (oldcred->cr_svuid != oldcred->cr_uid ||
 		    oldcred->cr_svgid != oldcred->cr_gid) {
 			change_svuid(newcred, newcred->cr_uid);
 			change_svgid(newcred, newcred->cr_gid);
 			p->p_ucred = newcred;
 			newcred = NULL;
 		}
 	}
 
 	/*
 	 * Store the vp for use in procfs.  This vnode was referenced prior
 	 * to locking the proc lock.
 	 */
 	textvp = p->p_textvp;
 	p->p_textvp = binvp;
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * Tell the DTrace fasttrap provider about the exec if it
 	 * has declared an interest.
 	 */
 	if (dtrace_fasttrap_exec)
 		dtrace_fasttrap_exec(p);
 #endif
 
 	/*
 	 * Notify others that we exec'd, and clear the P_INEXEC flag
 	 * as we're now a bona fide freshly-execed process.
 	 */
 	KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
 	p->p_flag &= ~P_INEXEC;
 
 	/* clear "fork but no exec" flag, as we _are_ execing */
 	p->p_acflag &= ~AFORK;
 
 	/*
 	 * Free any previous argument cache and replace it with
 	 * the new argument cache, if any.
 	 */
 	oldargs = p->p_args;
 	p->p_args = newargs;
 	newargs = NULL;
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * Check if system-wide sampling is in effect or if the
 	 * current process is using PMCs.  If so, do exec() time
 	 * processing.  This processing needs to happen AFTER the
 	 * P_INEXEC flag is cleared.
 	 *
 	 * The proc lock needs to be released before taking the PMC
 	 * SX.
 	 */
 	if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
 		PROC_UNLOCK(p);
 		VOP_UNLOCK(imgp->vp, 0);
 		pe.pm_credentialschanged = credential_changing;
 		pe.pm_entryaddr = imgp->entry_addr;
 
 		PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 	} else
 		PROC_UNLOCK(p);
 #else  /* !HWPMC_HOOKS */
 	PROC_UNLOCK(p);
 #endif
 
 	/* Set values passed into the program in registers. */
 	if (p->p_sysent->sv_setregs)
 		(*p->p_sysent->sv_setregs)(td, imgp, 
 		    (u_long)(uintptr_t)stack_base);
 	else
 		exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base);
 
 	vfs_mark_atime(imgp->vp, td->td_ucred);
 
 	SDT_PROBE(proc, kernel, , exec_success, args->fname, 0, 0, 0, 0);
 
 done1:
 	/*
 	 * Free any resources malloc'd earlier that we didn't use.
 	 */
 	uifree(euip);
 	if (newcred == NULL)
 		crfree(oldcred);
 	else
 		crfree(newcred);
 	VOP_UNLOCK(imgp->vp, 0);
 
 	/*
 	 * Handle deferred decrement of ref counts.
 	 */
 	if (textvp != NULL)
 		vrele(textvp);
 	/*
 	 * On failure, drop the reference taken with VREF() above; on
 	 * success that reference now belongs to p_textvp.
 	 */
 	if (binvp && error != 0)
 		vrele(binvp);
 #ifdef KTRACE
 	if (tracevp != NULL)
 		vrele(tracevp);
 	if (tracecred != NULL)
 		crfree(tracecred);
 #endif
 	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 	pargs_drop(oldargs);
 	pargs_drop(newargs);
 	if (oldsigacts != NULL)
 		sigacts_free(oldsigacts);
 
 exec_fail_dealloc:
 
 	/*
 	 * free various allocated resources
 	 */
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	if (imgp->vp != NULL) {
 		if (args->fname)
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (imgp->opened)
 			VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
 		vput(imgp->vp);
 	}
 
 	if (imgp->object != NULL)
 		vm_object_deallocate(imgp->object);
 
 	free(imgp->freepath, M_TEMP);
 
 	if (error == 0) {
 		PROC_LOCK(p);
 		td->td_dbgflags |= TDB_EXEC;
 		PROC_UNLOCK(p);
 
 		/*
 		 * Stop the process here if its stop event mask has
 		 * the S_EXEC bit set.
 		 */
 		STOPEVENT(p, S_EXEC, 0);
 		goto done2;
 	}
 
 exec_fail:
 	/* we're done here, clear P_INEXEC */
 	PROC_LOCK(p);
 	p->p_flag &= ~P_INEXEC;
 	PROC_UNLOCK(p);
 
 	SDT_PROBE(proc, kernel, , exec_failure, error, 0, 0, 0, 0);
 
 done2:
 #ifdef MAC
 	mac_execve_exit(imgp);
 	mac_execve_interpreter_exit(interpvplabel);
 #endif
 	exec_free_args(args);
 
 	if (error && imgp->vmspace_destroyed) {
 		/* sorry, no more process anymore. exit gracefully */
 		exit1(td, W_EXITCODE(0, SIGABRT));
 		/* NOT REACHED */
 	}
 
 #ifdef KTRACE
 	if (error == 0)
 		ktrprocctor(p);
 #endif
 
 	return (error);
 }
 
 /*
  * Map the first page of the image's VM object into kernel memory and
  * point imgp->image_header at it.  Any previous mapping held in
  * imgp->firstpage is released first.
  *
  * Returns 0 on success, EACCES if the vnode has no VM object, or EIO if
  * the pager could not supply page 0.
  */
 int
 exec_map_first_page(imgp)
 	struct image_params *imgp;
 {
 	int rv, i;
 	int initial_pagein;
 	vm_page_t ma[VM_INITIAL_PAGEIN];
 	vm_object_t object;
 
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	object = imgp->vp->v_object;
 	if (object == NULL)
 		return (EACCES);
 	VM_OBJECT_WLOCK(object);
 #if VM_NRESERVLEVEL > 0
 	if ((object->flags & OBJ_COLORED) == 0) {
 		object->flags |= OBJ_COLORED;
 		object->pg_color = 0;
 	}
 #endif
 	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL);
 	if (ma[0]->valid != VM_PAGE_BITS_ALL) {
 		/*
 		 * Page 0 is not fully valid: collect up to VM_INITIAL_PAGEIN
 		 * pages (bounded by the object size) for a single pager
 		 * request.  Collection stops at the first following page
 		 * that is already valid, cannot be exclusive-busied, or
 		 * cannot be allocated.
 		 */
 		initial_pagein = VM_INITIAL_PAGEIN;
 		if (initial_pagein > object->size)
 			initial_pagein = object->size;
 		for (i = 1; i < initial_pagein; i++) {
 			if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
 				if (ma[i]->valid)
 					break;
 				if (vm_page_tryxbusy(ma[i]))
 					break;
 			} else {
 				ma[i] = vm_page_alloc(object, i,
 				    VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
 				if (ma[i] == NULL)
 					break;
 			}
 		}
 		/* i is the number of pages actually collected. */
 		initial_pagein = i;
 		rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
 		/* Look page 0 up again after the pager call. */
 		ma[0] = vm_page_lookup(object, 0);
 		if ((rv != VM_PAGER_OK) || (ma[0] == NULL)) {
 			if (ma[0] != NULL) {
 				vm_page_lock(ma[0]);
 				vm_page_free(ma[0]);
 				vm_page_unlock(ma[0]);
 			}
 			VM_OBJECT_WUNLOCK(object);
 			return (EIO);
 		}
 	}
 	/*
 	 * Drop the exclusive busy state and hold the page instead; the hold
 	 * is released by exec_unmap_first_page().
 	 */
 	vm_page_xunbusy(ma[0]);
 	vm_page_lock(ma[0]);
 	vm_page_hold(ma[0]);
 	vm_page_unlock(ma[0]);
 	VM_OBJECT_WUNLOCK(object);
 
 	imgp->firstpage = sf_buf_alloc(ma[0], 0);
 	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
 
 	return (0);
 }
 
 /*
  * Undo exec_map_first_page(): if a first-page mapping exists, free its
  * sf_buf, clear imgp->firstpage and drop the hold on the backing page.
  * Does nothing when no page is mapped.
  */
 void
 exec_unmap_first_page(struct image_params *imgp)
 {
 	vm_page_t pg;
 
 	if (imgp->firstpage == NULL)
 		return;
 
 	pg = sf_buf_page(imgp->firstpage);
 	sf_buf_free(imgp->firstpage);
 	imgp->firstpage = NULL;
 	vm_page_lock(pg);
 	vm_page_unhold(pg);
 	vm_page_unlock(pg);
 }
 
 /*
  * Destroy old address space, and allocate a new stack
  *	The new stack is only SGROWSIZ large because it is grown
  *	automatically in trap.c.
  *
  * Sets imgp->vmspace_destroyed before anything else, so a later failure
  * is treated by do_execve() as fatal for the process.  Also records sv in
  * imgp->sysent and maps the sysentvec's shared page, if it has one.
  * Returns 0 on success or the error from vmspace_exec(), vm_map_fixed()
  * or vm_map_stack().
  */
 int
 exec_new_vmspace(imgp, sv)
 	struct image_params *imgp;
 	struct sysentvec *sv;
 {
 	int error;
 	struct proc *p = imgp->proc;
 	struct vmspace *vmspace = p->p_vmspace;
 	vm_object_t obj;
 	vm_offset_t sv_minuser, stack_addr;
 	vm_map_t map;
 	u_long ssiz;
 
 	imgp->vmspace_destroyed = 1;
 	imgp->sysent = sv;
 
 	/* May be called with Giant held */
 	EVENTHANDLER_INVOKE(process_exec, p, imgp);
 
 	/*
 	 * Blow away entire process VM, if address space not shared,
 	 * otherwise, create a new VM space so that other threads are
 	 * not disrupted
 	 */
 	map = &vmspace->vm_map;
 	/* Unless map_at_zero is set, keep the first page unmappable. */
 	if (map_at_zero)
 		sv_minuser = sv->sv_minuser;
 	else
 		sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
 	/*
 	 * The existing vmspace is reused only when it is unshared and its
 	 * bounds already match those required by the new sysentvec.
 	 */
 	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
 	    vm_map_max(map) == sv->sv_maxuser) {
 		shmexit(vmspace);
 		pmap_remove_pages(vmspace_pmap(vmspace));
 		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
 	} else {
 		error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
 		if (error)
 			return (error);
 		/* vmspace_exec() replaced p_vmspace; refresh the locals. */
 		vmspace = p->p_vmspace;
 		map = &vmspace->vm_map;
 	}
 
 	/* Map a shared page */
 	obj = sv->sv_shared_page_obj;
 	if (obj != NULL) {
 		vm_object_reference(obj);
 		error = vm_map_fixed(map, obj, 0,
 		    sv->sv_shared_page_base, sv->sv_shared_page_len,
 		    VM_PROT_READ | VM_PROT_EXECUTE,
 		    VM_PROT_READ | VM_PROT_EXECUTE,
 		    MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
 		if (error) {
 			vm_object_deallocate(obj);
 			return (error);
 		}
 	}
 
 	/* Allocate a new stack */
 	if (sv->sv_maxssiz != NULL)
 		ssiz = *sv->sv_maxssiz;
 	else
 		ssiz = maxssiz;
 	stack_addr = sv->sv_usrstack - ssiz;
 	/*
 	 * The image's own stack_prot is honoured only when it is non-zero
 	 * and a shared page object exists; otherwise the sysentvec default
 	 * protection is used.
 	 */
 	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
 		sv->sv_stackprot,
 	    VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 	if (error)
 		return (error);
 
 #ifdef __ia64__
 	/* Allocate a new register stack */
 	stack_addr = IA64_BACKINGSTORE;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
 	if (error)
 		return (error);
 #endif
 
 	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
 	 * VM_STACK case, but they are still used to monitor the size of the
 	 * process stack so we can check the stack rlimit.
 	 */
 	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
 	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
 
 	return (0);
 }
 
 /*
  * Copy out argument and environment strings from the old process address
  * space into the temporary string buffer.
  */
 int
 exec_copyin_args(struct image_args *args, char *fname,
     enum uio_seg segflg, char **argv, char **envv)
 {
 	char *argp, *envp;
 	int error;
 	size_t length;
 
 	bzero(args, sizeof(*args));
 	if (argv == NULL)
 		return (EFAULT);
 
 	/*
 	 * Allocate demand-paged memory for the file name, argument, and
 	 * environment strings.
 	 */
 	error = exec_alloc_args(args);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Copy the file name.
 	 */
 	if (fname != NULL) {
 		args->fname = args->buf;
 		error = (segflg == UIO_SYSSPACE) ?
 		    copystr(fname, args->fname, PATH_MAX, &length) :
 		    copyinstr(fname, args->fname, PATH_MAX, &length);
 		if (error != 0)
 			goto err_exit;
 	} else
 		length = 0;
 
 	args->begin_argv = args->buf + length;
 	args->endp = args->begin_argv;
 	args->stringspace = ARG_MAX;
 
 	/*
 	 * extract arguments first
 	 */
 	while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
 		if (argp == (caddr_t) -1) {
 			error = EFAULT;
 			goto err_exit;
 		}
 		if ((error = copyinstr(argp, args->endp,
 		    args->stringspace, &length))) {
 			if (error == ENAMETOOLONG) 
 				error = E2BIG;
 			goto err_exit;
 		}
 		args->stringspace -= length;
 		args->endp += length;
 		args->argc++;
 	}
 
 	args->begin_envv = args->endp;
 
 	/*
 	 * extract environment strings
 	 */
 	if (envv) {
 		while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
 			if (envp == (caddr_t)-1) {
 				error = EFAULT;
 				goto err_exit;
 			}
 			if ((error = copyinstr(envp, args->endp,
 			    args->stringspace, &length))) {
 				if (error == ENAMETOOLONG)
 					error = E2BIG;
 				goto err_exit;
 			}
 			args->stringspace -= length;
 			args->endp += length;
 			args->envc++;
 		}
 	}
 
 	return (0);
 
 err_exit:
 	exec_free_args(args);
 	return (error);
 }
 
 /*
  * Obtain PATH_MAX + ARG_MAX bytes from exec_map with kmap_alloc_wait()
  * to hold the file name, argument, and environment strings, and store
  * the address in args->buf.  Returns 0 when a buffer was obtained and
  * ENOMEM otherwise.
  */
 int
 exec_alloc_args(struct image_args *args)
 {
 	char *strbuf;
 
 	strbuf = (char *)kmap_alloc_wait(exec_map, PATH_MAX + ARG_MAX);
 	args->buf = strbuf;
 	if (strbuf == NULL)
 		return (ENOMEM);
 	return (0);
 }
 
 void
 exec_free_args(struct image_args *args)
 {
 
 	if (args->buf != NULL) {
 		kmap_free_wakeup(exec_map, (vm_offset_t)args->buf,
 		    PATH_MAX + ARG_MAX);
 		args->buf = NULL;
 	}
 	if (args->fname_buf != NULL) {
 		free(args->fname_buf, M_TEMP);
 		args->fname_buf = NULL;
 	}
 }
 
 /*
  * Copy strings out to the new process address space, constructing new arg
  * and env vector tables. Return a pointer to the base so that it can be used
  * as the initial stack pointer.
  *
  * Working downward from the ps_strings address (sv_psstrings), the areas
  * written here are: the signal trampoline (only when sv_sigcode_base is 0),
  * the executable path (only when both execpath and auxargs are set), the
  * SSP canary, the pagesizes array, the argument/environment strings, and
  * finally the argv/envp pointer vectors.  The addresses of the path, canary
  * and pagesizes copies are recorded in imgp (execpathp, canary, pagesizes)
  * together with the canary and pagesizes lengths.
  *
  * NOTE(review): none of the copyout()/suword()/suword32() results are
  * checked in this function, so a failed user-space write is not reported
  * to the caller.
  */
 register_t *
 exec_copyout_strings(imgp)
 	struct image_params *imgp;
 {
 	int argc, envc;
 	char **vectp;
 	char *stringp, *destp;
 	register_t *stack_base;
 	struct ps_strings *arginfo;
 	struct proc *p;
 	size_t execpath_len;
 	int szsigcode, szps;
 	char canary[sizeof(long) * 8];
 
 	szps = sizeof(pagesizes[0]) * MAXPAGESIZES;
 	/*
 	 * Calculate string base and vector table pointers.
 	 * Also deal with signal trampoline code for this exec type.
 	 */
 	if (imgp->execpath != NULL && imgp->auxargs != NULL)
 		execpath_len = strlen(imgp->execpath) + 1;
 	else
 		execpath_len = 0;
 	p = imgp->proc;
 	szsigcode = 0;
 	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
 	/* Space for sigcode is reserved only when sv_sigcode_base is unset. */
 	if (p->p_sysent->sv_sigcode_base == 0) {
 		if (p->p_sysent->sv_szsigcode != NULL)
 			szsigcode = *(p->p_sysent->sv_szsigcode);
 	}
 	/*
 	 * destp is where the strings start: below the sigcode, spare space,
 	 * execpath, canary and pagesizes areas, with room for every string
 	 * gathered so far (ARG_MAX - stringspace bytes).
 	 */
 	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
 	    roundup(execpath_len, sizeof(char *)) -
 	    roundup(sizeof(canary), sizeof(char *)) -
 	    roundup(szps, sizeof(char *)) -
 	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
 
 	/*
 	 * install sigcode
 	 */
 	if (szsigcode != 0)
 		copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
 		    szsigcode), szsigcode);
 
 	/*
 	 * Copy the image path for the rtld.
 	 */
 	if (execpath_len != 0) {
 		imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
 		copyout(imgp->execpath, (void *)imgp->execpathp,
 		    execpath_len);
 	}
 
 	/*
 	 * Prepare the canary for SSP.
 	 */
 	arc4rand(canary, sizeof(canary), 0);
 	imgp->canary = (uintptr_t)arginfo - szsigcode - execpath_len -
 	    sizeof(canary);
 	copyout(canary, (void *)imgp->canary, sizeof(canary));
 	imgp->canarylen = sizeof(canary);
 
 	/*
 	 * Prepare the pagesizes array.
 	 */
 	imgp->pagesizes = (uintptr_t)arginfo - szsigcode - execpath_len -
 	    roundup(sizeof(canary), sizeof(char *)) - szps;
 	copyout(pagesizes, (void *)imgp->pagesizes, szps);
 	imgp->pagesizeslen = szps;
 
 	/*
 	 * If we have a valid auxargs ptr, prepare some room
 	 * on the stack.
 	 */
 	if (imgp->auxargs) {
 		/*
 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
 		 * lower compatibility.
 		 */
 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
 		    (AT_COUNT * 2);
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets,and imgp->auxarg_size is room
 		 * for argument of Runtime loader.
 		 */
 		vectp = (char **)(destp - (imgp->args->argc +
 		    imgp->args->envc + 2 + imgp->auxarg_size)
 		    * sizeof(char *));
 	} else {
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets
 		 */
 		vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
 		    sizeof(char *));
 	}
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	stack_base = (register_t *)vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
 	suword32(&arginfo->ps_nargvstr, argc);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 *
 	 * stringp walks the kernel copy of the strings while destp tracks
 	 * the matching user address; both advance past each terminating NUL.
 	 */
 	for (; argc > 0; --argc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	suword(vectp++, 0);
 
 	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
 	suword32(&arginfo->ps_nenvstr, envc);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* end of vector table is a null pointer */
 	suword(vectp, 0);
 
 	return (stack_base);
 }
 
 /*
  * Check permissions of file to execute.
  *	Called with imgp->vp locked.
  *	Return 0 for success or error code on failure.
  */
 int
 exec_check_permissions(imgp)
 	struct image_params *imgp;
 {
 	struct vnode *vp = imgp->vp;
 	struct vattr *attr = imgp->attr;
 	struct thread *td;
 	int error, writecount;
 
 	td = curthread;
 
 	/* Get file attributes */
 	error = VOP_GETATTR(vp, attr, td->td_ucred);
 	if (error)
 		return (error);
 
 #ifdef MAC
 	error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
 	if (error)
 		return (error);
 #endif
 
 	/*
 	 * 1) Check if file execution is disabled for the filesystem that
 	 *    this file resides on.
 	 * 2) Ensure that at least one execute bit is on. Otherwise, a
 	 *    privileged user will always succeed, and we don't want this
 	 *    to happen unless the file really is executable.
 	 * 3) Ensure that the file is a regular file.
 	 */
 	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 	    (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 ||
 	    (attr->va_type != VREG))
 		return (EACCES);
 
 	/*
 	 * Zero length files can't be exec'd
 	 */
 	if (attr->va_size == 0)
 		return (ENOEXEC);
 
 	/*
 	 *  Check for execute permission to file based on current credentials.
 	 */
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	if (error)
 		return (error);
 
 	/*
 	 * Check number of open-for-writes on the file and deny execution
 	 * if there are any.
 	 */
 	error = VOP_GET_WRITECOUNT(vp, &writecount);
 	if (error != 0)
 		return (error);
 	if (writecount != 0)
 		return (ETXTBSY);
 
 	/*
 	 * Call filesystem specific open routine (which does nothing in the
 	 * general case).
 	 */
 	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 	if (error == 0)
 		imgp->opened = 1;
 	return (error);
 }
 
 /*
  * Exec handler registration.
  *
  * Builds a new NULL-terminated handler table containing every existing
  * entry followed by execsw_arg, frees the old table and installs the new
  * one in execsw.  Returns 0 on success or ENOMEM if the table could not be
  * allocated.
  */
 int
 exec_register(execsw_arg)
 	const struct execsw *execsw_arg;
 {
 	const struct execsw **src, **dst, **table;
 	int nslots;
 
 	/* One slot for the new handler and one for the trailing NULL. */
 	nslots = 2;
 	if (execsw != NULL) {
 		src = execsw;
 		while (*src != NULL) {
 			nslots++;
 			src++;
 		}
 	}
 
 	table = malloc(nslots * sizeof(*table), M_TEMP, M_WAITOK);
 	if (table == NULL)
 		return (ENOMEM);
 
 	/* Carry over the existing handlers, then append the new one. */
 	dst = table;
 	if (execsw != NULL) {
 		for (src = execsw; *src != NULL; src++, dst++)
 			*dst = *src;
 	}
 	dst[0] = execsw_arg;
 	dst[1] = NULL;
 
 	if (execsw != NULL)
 		free(execsw, M_TEMP);
 	execsw = table;
 	return (0);
 }
 
 /*
  * Exec handler removal.
  *
  * Rebuilds the NULL-terminated handler table without execsw_arg and
  * installs it in execsw.  Panics if no table exists at all.  Returns
  * ENOENT when execsw_arg is not registered, ENOMEM if the replacement
  * table could not be allocated, and 0 on success.
  */
 int
 exec_unregister(execsw_arg)
 	const struct execsw *execsw_arg;
 {
 	const struct execsw **src, **dst, **table;
 	int nslots;
 
 	if (execsw == NULL)
 		panic("unregister with no handlers left?\n");
 
 	/* Make sure the handler is actually present. */
 	src = execsw;
 	while (*src != NULL && *src != execsw_arg)
 		src++;
 	if (*src == NULL)
 		return (ENOENT);
 
 	/* Count the survivors; start at one for the trailing NULL. */
 	nslots = 1;
 	for (src = execsw; *src != NULL; src++) {
 		if (*src == execsw_arg)
 			continue;
 		nslots++;
 	}
 
 	table = malloc(nslots * sizeof(*table), M_TEMP, M_WAITOK);
 	if (table == NULL)
 		return (ENOMEM);
 
 	/* Copy every handler except the one being removed. */
 	dst = table;
 	for (src = execsw; *src != NULL; src++) {
 		if (*src == execsw_arg)
 			continue;
 		*dst = *src;
 		dst++;
 	}
 	*dst = NULL;
 
 	if (execsw != NULL)
 		free(execsw, M_TEMP);
 	execsw = table;
 	return (0);
 }
Index: head/sys/kern/kern_exit.c
===================================================================
--- head/sys/kern/kern_exit.c	(revision 258540)
+++ head/sys/kern/kern_exit.c	(revision 258541)
@@ -1,1264 +1,1263 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_exit.c	8.7 (Berkeley) 2/12/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 #include "opt_ktrace.h"
 #include "opt_procdesc.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/capability.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/procdesc.h>
 #include <sys/pioctl.h>
 #include <sys/jail.h>
 #include <sys/tty.h>
 #include <sys/wait.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sbuf.h>
 #include <sys/signalvar.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/syslog.h>
 #include <sys/ptrace.h>
 #include <sys/acct.h>		/* for acct_process() function prototype */
 #include <sys/filedesc.h>
 #include <sys/sdt.h>
 #include <sys/shm.h>
 #include <sys/sem.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 /*
  * Optional DTrace fasttrap hook; exit1() calls it with the exiting process
  * when it is non-NULL.
  */
 dtrace_execexit_func_t	dtrace_fasttrap_exit;
 #endif
 
 /*
  * The exit probe of the proc provider takes one int argument; exit1()
  * fires it with a CLD_* reason code.
  */
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE1(proc, kernel, , exit, exit, "int");
 
 /*
  * Hook for NFS teardown procedure.  exit1() invokes it only when both the
  * hook and the process's p_nlminfo are non-NULL.
  */
 void (*nlminfo_release_p)(struct proc *p);
 
 /*
  * If p is flagged P_ORPHAN, unlink it from the orphan list it sits on and
  * clear the flag.  The caller must hold p's process lock.
  */
 static void
 clear_orphan(struct proc *p)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	if ((p->p_flag & P_ORPHAN) == 0)
 		return;
 	LIST_REMOVE(p, p_orphan);
 	p->p_flag &= ~P_ORPHAN;
 }
 
 /*
  * exit system call -- death of process.  The value passed from userland is
  * packed into a wait status with no signal and handed to exit1().
  */
 void
 sys_sys_exit(struct thread *td, struct sys_exit_args *uap)
 {
 	int xstat;
 
 	xstat = W_EXITCODE(uap->rval, 0);
 	exit1(td, xstat);
 	/* NOTREACHED */
 }
 
 /*
  * Exit: deallocate address space and other resources, change proc state to
  * zombie, and unlink proc from allproc and parent's lists.  Save exit status
  * and rusage for wait().  Check for child processes and orphan them.
  *
  * td is the exiting thread and rv the wait-style status to record in
  * p_xstat; process_exit event handlers are allowed to change the recorded
  * status.  Giant must not be held on entry.  The function ends in
  * thread_exit().
  */
 void
 exit1(struct thread *td, int rv)
 {
 	struct proc *p, *nq, *q;
 	struct vnode *vtmp;
 	struct vnode *ttyvp = NULL;
 	struct plimit *plim;
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	p = td->td_proc;
 	/*
 	 * XXX in case we're rebooting we just let init die in order to
 	 * work around an unsolved stack overflow seen very late during
 	 * shutdown on sparc64 when the gmirror worker process exits.
 	 */
 	if (p == initproc && rebooting == 0) {
 		printf("init died (signal %d, exit %d)\n",
 		    WTERMSIG(rv), WEXITSTATUS(rv));
 		panic("Going nowhere without my init!");
 	}
 
 	/*
 	 * MUST abort all other threads before proceeding past here.
 	 */
 	PROC_LOCK(p);
 	while (p->p_flag & P_HADTHREADS) {
 		/*
 		 * First check if some other thread got here before us.
 		 * If so, act appropriately: exit or suspend.
 		 */
 		thread_suspend_check(0);
 
 		/*
 		 * Kill off the other threads. This requires
 		 * some co-operation from other parts of the kernel
 		 * so it may not be instantaneous.  With this state set
 		 * any thread entering the kernel from userspace will
 		 * thread_exit() in trap().  Any thread attempting to
 		 * sleep will return immediately with EINTR or EWOULDBLOCK
 		 * which will hopefully force them to back out to userland
 		 * freeing resources as they go.  Any thread attempting
 		 * to return to userland will thread_exit() from userret().
 		 * thread_exit() will unsuspend us when the last of the
 		 * other threads exits.
 		 * If there is already a thread singler after resumption,
 		 * calling thread_single will fail; in that case, we just
 		 * re-check all suspension request, the thread should
 		 * either be suspended there or exit.
 		 */
 		if (!thread_single(SINGLE_EXIT))
 			break;
 
 		/*
 		 * All other activity in this process is now stopped.
 		 * Threading support has been turned off.
 		 */
 	}
 	KASSERT(p->p_numthreads == 1,
 	    ("exit1: proc %p exiting with %d threads", p, p->p_numthreads));
 	racct_sub(p, RACCT_NTHR, 1);
 	/*
 	 * Wakeup anyone in procfs' PIOCWAIT.  They should have a hold
 	 * on our vmspace, so we should block below until they have
 	 * released their reference to us.  Note that if they have
 	 * requested S_EXIT stops we will block here until they ack
 	 * via PIOCCONT.
 	 */
 	_STOPEVENT(p, S_EXIT, rv);
 
 	/*
 	 * Ignore any pending request to stop due to a stop signal.
 	 * Once P_WEXIT is set, future requests will be ignored as
 	 * well.
 	 */
 	p->p_flag &= ~P_STOPPED_SIG;
 	KASSERT(!P_SHOULDSTOP(p), ("exiting process is stopped"));
 
 	/*
 	 * Note that we are exiting and do another wakeup of anyone in
 	 * PIOCWAIT in case they aren't listening for S_EXIT stops or
 	 * decided to wait again after we told them we are exiting.
 	 */
 	p->p_flag |= P_WEXIT;
 	wakeup(&p->p_stype);
 
 	/*
 	 * Wait for any processes that have a hold on our vmspace to
 	 * release their reference.
 	 */
 	while (p->p_lock > 0)
 		msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0);
 
 	p->p_xstat = rv;	/* Let event handler change exit status */
 	PROC_UNLOCK(p);
 	/* Drain the limit callout while we don't have the proc locked */
 	callout_drain(&p->p_limco);
 
 #ifdef AUDIT
 	/*
 	 * The Sun BSM exit token contains two components: an exit status as
 	 * passed to exit(), and a return value to indicate what sort of exit
 	 * it was.  The exit status is WEXITSTATUS(rv), but it's not clear
 	 * what the return value is.
 	 */
 	AUDIT_ARG_EXIT(WEXITSTATUS(rv), 0);
 	AUDIT_SYSCALL_EXIT(0, td);
 #endif
 
 	/*
 	 * Are we a task leader?  If so, SIGKILL every peer and sleep until
 	 * the peer list has emptied.
 	 */
 	if (p == p->p_leader) {
 		mtx_lock(&ppeers_lock);
 		q = p->p_peers;
 		while (q != NULL) {
 			PROC_LOCK(q);
 			kern_psignal(q, SIGKILL);
 			PROC_UNLOCK(q);
 			q = q->p_peers;
 		}
 		while (p->p_peers != NULL)
 			msleep(p, &ppeers_lock, PWAIT, "exit1", 0);
 		mtx_unlock(&ppeers_lock);
 	}
 
 	/*
 	 * Check if any loadable modules need anything done at process exit.
 	 * E.g. SYSV IPC stuff
 	 * XXX what if one of these generates an error?
 	 */
 	EVENTHANDLER_INVOKE(process_exit, p);
 
 	/*
 	 * If parent is waiting for us to exit or exec,
 	 * P_PPWAIT is set; we will wakeup the parent below.
 	 */
 	PROC_LOCK(p);
 	rv = p->p_xstat;	/* Event handler could change exit status */
 	stopprofclock(p);
 	p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE);
 
 	/*
 	 * Stop the real interval timer.  If the handler is currently
 	 * executing, prevent it from rearming itself and let it finish.
 	 */
 	if (timevalisset(&p->p_realtimer.it_value) &&
 	    callout_stop(&p->p_itcallout) == 0) {
 		timevalclear(&p->p_realtimer.it_interval);
 		msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0);
 		KASSERT(!timevalisset(&p->p_realtimer.it_value),
 		    ("realtime timer is still armed"));
 	}
 	PROC_UNLOCK(p);
 
 	/*
 	 * Reset any sigio structures pointing to us as a result of
 	 * F_SETOWN with our pid.
 	 */
 	funsetownlst(&p->p_sigiolst);
 
 	/*
 	 * If this process has an nlminfo data area (for lockd), release it
 	 */
 	if (nlminfo_release_p != NULL && p->p_nlminfo != NULL)
 		(*nlminfo_release_p)(p);
 
 	/*
 	 * Close open files and release open-file table.
 	 * This may block!
 	 */
 	fdescfree(td);
 
 	/*
 	 * If this thread tickled GEOM, we need to wait for the giggling to
 	 * stop before we return to userland
 	 */
 	if (td->td_pflags & TDP_GEOM)
 		g_waitidle();
 
 	/*
 	 * Remove ourself from our leader's peer list and wake our leader.
 	 */
 	mtx_lock(&ppeers_lock);
 	if (p->p_leader->p_peers) {
 		q = p->p_leader;
 		while (q->p_peers != p)
 			q = q->p_peers;
 		q->p_peers = p->p_peers;
 		wakeup(p->p_leader);
 	}
 	mtx_unlock(&ppeers_lock);
 
 	vmspace_exit(td);
 
 	sx_xlock(&proctree_lock);
 	if (SESS_LEADER(p)) {
 		struct session *sp = p->p_session;
 		struct tty *tp;
 
 		/*
 		 * s_ttyp is not zero'd; we use this to indicate that
 		 * the session once had a controlling terminal. (for
 		 * logging and informational purposes)
 		 */
 		SESS_LOCK(sp);
 		ttyvp = sp->s_ttyvp;
 		tp = sp->s_ttyp;
 		sp->s_ttyvp = NULL;
 		sp->s_ttydp = NULL;
 		sp->s_leader = NULL;
 		SESS_UNLOCK(sp);
 
 		/*
 		 * Signal foreground pgrp and revoke access to
 		 * controlling terminal if it has not been revoked
 		 * already.
 		 *
 		 * Because the TTY may have been revoked in the mean
 		 * time and could already have a new session associated
 		 * with it, make sure we don't send a SIGHUP to a
 		 * foreground process group that does not belong to this
 		 * session.
 		 */
 
 		if (tp != NULL) {
 			tty_lock(tp);
 			if (tp->t_session == sp)
 				tty_signal_pgrp(tp, SIGHUP);
 			tty_unlock(tp);
 		}
 
 		/*
 		 * proctree_lock is dropped around the vnode lock and
 		 * revoke, then taken again.
 		 */
 		if (ttyvp != NULL) {
 			sx_xunlock(&proctree_lock);
 			if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) {
 				VOP_REVOKE(ttyvp, REVOKEALL);
 				VOP_UNLOCK(ttyvp, 0);
 			}
 			sx_xlock(&proctree_lock);
 		}
 	}
 	fixjobc(p, p->p_pgrp, 0);
 	sx_xunlock(&proctree_lock);
 	(void)acct_process(td);
 
 	/* Release the TTY now we've unlocked everything. */
 	if (ttyvp != NULL)
 		vrele(ttyvp);
 #ifdef KTRACE
 	ktrprocexit(td);
 #endif
 	/*
 	 * Release reference to text vnode
 	 */
 	if ((vtmp = p->p_textvp) != NULL) {
 		p->p_textvp = NULL;
 		vrele(vtmp);
 	}
 
 	/*
 	 * Release our limits structure.
 	 */
 	PROC_LOCK(p);
 	plim = p->p_limit;
 	p->p_limit = NULL;
 	PROC_UNLOCK(p);
 	lim_free(plim);
 
 	tidhash_remove(td);
 
 	/*
 	 * Remove proc from allproc queue and pidhash chain.
 	 * Place onto zombproc.  Unlink from parent's child list.
 	 */
 	sx_xlock(&allproc_lock);
 	LIST_REMOVE(p, p_list);
 	LIST_INSERT_HEAD(&zombproc, p, p_list);
 	LIST_REMOVE(p, p_hash);
 	sx_xunlock(&allproc_lock);
 
 	/*
 	 * Call machine-dependent code to release any
 	 * machine-dependent resources other than the address space.
 	 * The address space is released by "vmspace_exitfree(p)" in
 	 * vm_waitproc().
 	 */
 	cpu_exit(td);
 
 	WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
 
 	/*
 	 * Reparent all of our children to init.
 	 */
 	sx_xlock(&proctree_lock);
 	q = LIST_FIRST(&p->p_children);
 	if (q != NULL)		/* only need this if any child is S_ZOMB */
 		wakeup(initproc);
 	/* nq is fetched first because proc_reparent() relinks q. */
 	for (; q != NULL; q = nq) {
 		nq = LIST_NEXT(q, p_sibling);
 		PROC_LOCK(q);
 		proc_reparent(q, initproc);
 		q->p_sigparent = SIGCHLD;
 		/*
 		 * Traced processes are killed
 		 * since their existence means someone is screwing up.
 		 */
 		if (q->p_flag & P_TRACED) {
 			struct thread *temp;
 
 			/*
 			 * Since q was found on our children list, the
 			 * proc_reparent() call moved q to the orphan
 			 * list due to present P_TRACED flag. Clear
 			 * orphan link for q now while q is locked.
 			 */
 			clear_orphan(q);
 			q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE);
 			FOREACH_THREAD_IN_PROC(q, temp)
 				temp->td_dbgflags &= ~TDB_SUSPEND;
 			kern_psignal(q, SIGKILL);
 		}
 		PROC_UNLOCK(q);
 	}
 
 	/*
 	 * Also get rid of our orphans.
 	 */
 	while ((q = LIST_FIRST(&p->p_orphans)) != NULL) {
 		PROC_LOCK(q);
 		clear_orphan(q);
 		PROC_UNLOCK(q);
 	}
 
 	/* Save exit status. */
 	PROC_LOCK(p);
 	p->p_xthread = td;
 
 	/* Tell the prison that we are gone. */
 	prison_proc_free(p->p_ucred->cr_prison);
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * Tell the DTrace fasttrap provider about the exit if it
 	 * has declared an interest.
 	 */
 	if (dtrace_fasttrap_exit)
 		dtrace_fasttrap_exit(p);
 #endif
 
 	/*
 	 * Notify interested parties of our demise.
 	 */
 	KNOTE_LOCKED(&p->p_klist, NOTE_EXIT);
 
 #ifdef KDTRACE_HOOKS
 	/* Classify the exit as a CLD_* code for the exit probe. */
 	int reason = CLD_EXITED;
 	if (WCOREDUMP(rv))
 		reason = CLD_DUMPED;
 	else if (WIFSIGNALED(rv))
 		reason = CLD_KILLED;
 	SDT_PROBE(proc, kernel, , exit, reason, 0, 0, 0, 0);
 #endif
 
 	/*
 	 * Just delete all entries in the p_klist. At this point we won't
 	 * report any more events, and there are nasty race conditions that
 	 * can beat us if we don't.
 	 */
 	knlist_clear(&p->p_klist, 1);
 
 	/*
 	 * If this is a process with a descriptor, we may not need to deliver
 	 * a signal to the parent.  proctree_lock is held over
 	 * procdesc_exit() to serialize concurrent calls to close() and
 	 * exit().
 	 */
 #ifdef PROCDESC
 	if (p->p_procdesc == NULL || procdesc_exit(p)) {
 #endif
 		/*
 		 * Notify parent that we're gone.  If parent has the
 		 * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN,
 		 * notify process 1 instead (and hope it will handle this
 		 * situation).
 		 */
 		PROC_LOCK(p->p_pptr);
 		mtx_lock(&p->p_pptr->p_sigacts->ps_mtx);
 		if (p->p_pptr->p_sigacts->ps_flag &
 		    (PS_NOCLDWAIT | PS_CLDSIGIGN)) {
 			struct proc *pp;
 
 			mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
 			pp = p->p_pptr;
 			PROC_UNLOCK(pp);
 			proc_reparent(p, initproc);
 			p->p_sigparent = SIGCHLD;
 			PROC_LOCK(p->p_pptr);
 
 			/*
 			 * Notify parent, so in case he was wait(2)ing or
 			 * executing waitpid(2) with our pid, he will
 			 * continue.
 			 */
 			wakeup(pp);
 		} else
 			mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
 
 		if (p->p_pptr == initproc)
 			kern_psignal(p->p_pptr, SIGCHLD);
 		else if (p->p_sigparent != 0) {
 			if (p->p_sigparent == SIGCHLD)
 				childproc_exited(p);
 			else	/* LINUX thread */
 				kern_psignal(p->p_pptr, p->p_sigparent);
 		}
 #ifdef PROCDESC
 	} else
 		PROC_LOCK(p->p_pptr);
 #endif
 	/* Either way the parent's proc lock is held from here on. */
 	sx_xunlock(&proctree_lock);
 
 	/*
 	 * The state PRS_ZOMBIE prevents other processes from sending
 	 * signal to the process, to avoid memory leak, we free memory
 	 * for signal queue at the time when the state is set.
 	 */
 	sigqueue_flush(&p->p_sigqueue);
 	sigqueue_flush(&td->td_sigqueue);
 
 	/*
 	 * We have to wait until after acquiring all locks before
 	 * changing p_state.  We need to avoid all possible context
 	 * switches (including ones from blocking on a mutex) while
 	 * marked as a zombie.  We also have to set the zombie state
 	 * before we release the parent process' proc lock to avoid
 	 * a lost wakeup.  So, we first call wakeup, then we grab the
 	 * sched lock, update the state, and release the parent process'
 	 * proc lock.
 	 */
 	wakeup(p->p_pptr);
 	cv_broadcast(&p->p_pwait);
 	sched_exit(p->p_pptr, td);
 	PROC_SLOCK(p);
 	p->p_state = PRS_ZOMBIE;
 	PROC_UNLOCK(p->p_pptr);
 
 	/*
 	 * Hopefully no one will try to deliver a signal to the process this
 	 * late in the game.
 	 */
 	knlist_destroy(&p->p_klist);
 
 	/*
 	 * Save our children's rusage information in our exit rusage.
 	 */
 	ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
 
 	/*
 	 * Make sure the scheduler takes this thread out of its tables etc.
 	 * This will also release this thread's reference to the ucred.
 	 * Other thread parts to release include pcb bits and such.
 	 */
 	thread_exit();
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 /* Arguments of the abort2 system call, as read by sys_abort2(). */
 struct abort2_args {
 	char *why;	/* user-space reason string; may be NULL */
 	int nargs;	/* number of pointers in args; accepted range 0..16 */
 	void **args;	/* user-space array of nargs pointers to log */
 };
 #endif
 
 int
 sys_abort2(struct thread *td, struct abort2_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct sbuf *sb;
 	void *uargs[16];
 	int error, i, sig;
 
 	/*
 	 * Do it right now so we can log either proper call of abort2(), or
 	 * note, that invalid argument was passed. 512 is big enough to
 	 * handle 16 arguments' descriptions with additional comments.
 	 */
 	sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN);
 	sbuf_clear(sb);
 	sbuf_printf(sb, "%s(pid %d uid %d) aborted: ",
 	    p->p_comm, p->p_pid, td->td_ucred->cr_uid);
 	/*
 	 * Since we can't return from abort2(), send SIGKILL in cases, where
 	 * abort2() was called improperly
 	 */
 	sig = SIGKILL;
 	/* Prevent from DoSes from user-space. */
 	if (uap->nargs < 0 || uap->nargs > 16)
 		goto out;
 	if (uap->nargs > 0) {
 		if (uap->args == NULL)
 			goto out;
 		error = copyin(uap->args, uargs, uap->nargs * sizeof(void *));
 		if (error != 0)
 			goto out;
 	}
 	/*
 	 * Limit size of 'reason' string to 128. Will fit even when
 	 * maximal number of arguments was chosen to be logged.
 	 */
 	if (uap->why != NULL) {
 		error = sbuf_copyin(sb, uap->why, 128);
 		if (error < 0)
 			goto out;
 	} else {
 		sbuf_printf(sb, "(null)");
 	}
 	if (uap->nargs > 0) {
 		sbuf_printf(sb, "(");
 		for (i = 0;i < uap->nargs; i++)
 			sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]);
 		sbuf_printf(sb, ")");
 	}
 	/*
 	 * Final stage: arguments were proper, string has been
 	 * successfully copied from userspace, and copying pointers
 	 * from user-space succeed.
 	 */
 	sig = SIGABRT;
 out:
 	if (sig == SIGKILL) {
 		sbuf_trim(sb);
 		sbuf_printf(sb, " (Reason text inaccessible)");
 	}
 	sbuf_cat(sb, "\n");
 	sbuf_finish(sb);
 	log(LOG_INFO, "%s", sbuf_data(sb));
 	sbuf_delete(sb);
 	exit1(td, W_EXITCODE(0, sig));
 	return (0);
 }
 
 
 #ifdef COMPAT_43
 /*
  * Old wait(): wait for any child through kern_wait() and hand the status
  * back in the second return value.
  */
 int
 owait(struct thread *td, struct owait_args *uap __unused)
 {
 	int error, status;
 
 	error = kern_wait(td, WAIT_ANY, &status, 0, NULL);
 	if (error != 0)
 		return (error);
 	td->td_retval[1] = status;
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 /*
  * wait4 system call.  kern_wait() does the real work; this wrapper only
  * copies the status and resource usage out to whichever user buffers were
  * supplied.
  */
 int
 sys_wait4(struct thread *td, struct wait4_args *uap)
 {
 	struct rusage ru, *rup;
 	int error, status;
 
 	/* Collect rusage only when the caller asked for it. */
 	rup = (uap->rusage != NULL) ? &ru : NULL;
 
 	error = kern_wait(td, uap->pid, &status, uap->options, rup);
 	if (error != 0)
 		return (error);
 
 	if (uap->status != NULL) {
 		error = copyout(&status, uap->status, sizeof(status));
 		if (error != 0)
 			return (error);
 	}
 	if (uap->rusage != NULL)
 		error = copyout(&ru, uap->rusage, sizeof(struct rusage));
 	return (error);
 }
 
 /*
  * wait6 system call.  kern_wait6() does the real work; this wrapper copies
  * the status, resource usage and siginfo out to whichever user buffers were
  * supplied.
  */
 int
 sys_wait6(struct thread *td, struct wait6_args *uap)
 {
 	struct __wrusage wru, *wrup;
 	siginfo_t si, *sip;
 	idtype_t idtype;
 	id_t id;
 	int error, status;
 
 	idtype = uap->idtype;
 	id = uap->id;
 
 	/* Only gather what the caller provided a buffer for. */
 	wrup = (uap->wrusage != NULL) ? &wru : NULL;
 	sip = (uap->info != NULL) ? &si : NULL;
 	if (sip != NULL)
 		bzero(&si, sizeof(si));
 
 	/*
 	 *  We expect all callers of wait6() to know about WEXITED and
 	 *  WTRAPPED.
 	 */
 	error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip);
 	if (error != 0)
 		return (error);
 
 	if (uap->status != NULL) {
 		error = copyout(&status, uap->status, sizeof(status));
 		if (error != 0)
 			return (error);
 	}
 	if (uap->wrusage != NULL) {
 		error = copyout(&wru, uap->wrusage, sizeof(wru));
 		if (error != 0)
 			return (error);
 	}
 	if (uap->info != NULL)
 		error = copyout(&si, uap->info, sizeof(si));
 	return (error);
 }
 
 /*
  * Reap the remains of a zombie process and optionally return status and
  * rusage.  Asserts and will release both the proctree_lock and the process
  * lock as part of its work.
  *
  * td is the waiting thread and p the zombie.  On entry proctree_lock must
  * be exclusively held together with p's process lock and spin lock.  The
  * pid of p is stored in td->td_retval[0] and, when status is non-NULL,
  * p_xstat is stored in *status.  With WNOWAIT in options only the locks are
  * released and p is left in place.
  */
 void
 proc_reap(struct thread *td, struct proc *p, int *status, int options)
 {
 	struct proc *q, *t;
 
 	sx_assert(&proctree_lock, SA_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE"));
 
 	/* q is the process doing the wait. */
 	q = td->td_proc;
 
 	PROC_SUNLOCK(p);
 	td->td_retval[0] = p->p_pid;
 	if (status)
 		*status = p->p_xstat;	/* convert to int */
 	if (options & WNOWAIT) {
 		/*
 		 *  Only poll, returning the status.  Caller does not wish to
 		 * release the proc struct just yet.
 		 */
 		PROC_UNLOCK(p);
 		sx_xunlock(&proctree_lock);
 		return;
 	}
 
 	PROC_LOCK(q);
 	sigqueue_take(p->p_ksi);
 	PROC_UNLOCK(q);
 	PROC_UNLOCK(p);
 
 	/*
 	 * If we got the child via a ptrace 'attach', we need to give it back
 	 * to the old parent.
 	 */
 	if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) {
 		PROC_LOCK(p);
 		proc_reparent(p, t);
 		p->p_oppid = 0;
 		PROC_UNLOCK(p);
 		pksignal(t, SIGCHLD, p->p_ksi);
 		wakeup(t);
 		cv_broadcast(&p->p_pwait);
 		PROC_UNLOCK(t);
 		sx_xunlock(&proctree_lock);
 		return;
 	}
 
 	/*
 	 * Remove other references to this process to ensure we have an
 	 * exclusive reference.
 	 */
 	sx_xlock(&allproc_lock);
 	LIST_REMOVE(p, p_list);	/* off zombproc */
 	sx_xunlock(&allproc_lock);
 	LIST_REMOVE(p, p_sibling);
 	PROC_LOCK(p);
 	clear_orphan(p);
 	PROC_UNLOCK(p);
 	leavepgrp(p);
 #ifdef PROCDESC
 	if (p->p_procdesc != NULL)
 		procdesc_reap(p);
 #endif
 	sx_xunlock(&proctree_lock);
 
 	/*
 	 * As a side effect of this lock, we know that all other writes to
 	 * this proc are visible now, so no more locking is needed for p.
 	 */
 	PROC_LOCK(p);
 	p->p_xstat = 0;		/* XXX: why? */
 	PROC_UNLOCK(p);
 	/* Fold the child's resource usage into the waiting process. */
 	PROC_LOCK(q);
 	ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux);
 	PROC_UNLOCK(q);
 
 	/*
 	 * Decrement the count of procs running with this uid.
 	 */
 	(void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0);
 
 	/*
 	 * Destroy resource accounting information associated with the process.
 	 */
 #ifdef RACCT
 	PROC_LOCK(p);
 	racct_sub(p, RACCT_NPROC, 1);
 	PROC_UNLOCK(p);
 #endif
 	racct_proc_exit(p);
 
 	/*
 	 * Free credentials, arguments, and sigacts.
 	 */
 	crfree(p->p_ucred);
 	p->p_ucred = NULL;
 	pargs_drop(p->p_args);
 	p->p_args = NULL;
 	sigacts_free(p->p_sigacts);
 	p->p_sigacts = NULL;
 
 	/*
 	 * Do any thread-system specific cleanups.
 	 */
 	thread_wait(p);
 
 	/*
 	 * Give vm and machine-dependent layer a chance to free anything that
 	 * cpu_exit couldn't release while still running in process context.
 	 */
 	vm_waitproc(p);
 #ifdef MAC
 	mac_proc_destroy(p);
 #endif
 	KASSERT(FIRST_THREAD_IN_PROC(p),
 	    ("proc_reap: no residual thread!"));
 	/* Return the proc structure to its zone and drop the process count. */
 	uma_zfree(proc_zone, p);
 	sx_xlock(&allproc_lock);
 	nprocs--;
 	sx_xunlock(&allproc_lock);
 }
 
 /*
  * Check whether process 'p' is a candidate for the wait request
  * (idtype, id, options) made by thread 'td'.
  *
  * Return values:
  *    0  p does not match the request; p has been unlocked again.
  *    1  p matches but is not a zombie; *siginfo and *wrusage (when
  *       supplied) have been filled in and p has been unlocked again.
  *   -1  p matches and is a zombie; it was passed to proc_reap() with
  *       the proc lock and the proc spin lock still held.
  *
  * The caller must hold proctree_lock exclusively.
  */
 static int
 proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id,
     int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo)
 {
 	int selected;
 
 	sx_assert(&proctree_lock, SA_XLOCKED);
 
 	PROC_LOCK(p);
 
 	/* Is p a member of the set named by (idtype, id)? */
 	switch (idtype) {
 	case P_ALL:
 		selected = 1;
 		break;
 	case P_PID:
 		selected = (p->p_pid == (pid_t)id);
 		break;
 	case P_PGID:
 		selected = (p->p_pgid == (pid_t)id);
 		break;
 	case P_SID:
 		selected = (p->p_session->s_sid == (pid_t)id);
 		break;
 	case P_UID:
 		selected = (p->p_ucred->cr_uid == (uid_t)id);
 		break;
 	case P_GID:
 		selected = (p->p_ucred->cr_gid == (gid_t)id);
 		break;
 	case P_JAILID:
 		selected = (p->p_ucred->cr_prison->pr_id == (int)id);
 		break;
 	default:
 		/*
 		 * It seems that the thread structures get zeroed out
 		 * at process exit.  This makes it impossible to
 		 * support P_SETID, P_CID or P_CPUID.
 		 */
 		selected = 0;
 		break;
 	}
 	if (!selected)
 		goto nomatch;
 
 	/* The waiter must be permitted to wait for this process. */
 	if (p_canwait(td, p) != 0)
 		goto nomatch;
 
 	/* Zombies are only of interest when WEXITED was requested. */
 	if (p->p_state == PRS_ZOMBIE && (options & WEXITED) == 0)
 		goto nomatch;
 
 	/*
 	 * Special case for a kthread spawned by linux_clone (see
 	 * linux_misc.c): linux_wait4 and linux_waitpid must tell
 	 * processes and threads apart.  A child is a "thread" when its
 	 * p_sigparent is not SIGCHLD, and WLINUXCLONE selects threads
 	 * rather than processes.  Skip p when the two disagree.
 	 */
 	if ((p->p_sigparent != SIGCHLD) != ((options & WLINUXCLONE) != 0))
 		goto nomatch;
 
 	PROC_SLOCK(p);
 
 	if (siginfo != NULL) {
 		bzero(siginfo, sizeof(*siginfo));
 		siginfo->si_errno = 0;
 
 		/*
 		 * SUSv4 requires si_signo to always be SIGCHLD, even
 		 * though rfork(2) lets the child request a different
 		 * exit notification signal.
 		 */
 		siginfo->si_signo = SIGCHLD;
 		siginfo->si_pid = p->p_pid;
 		siginfo->si_uid = p->p_ucred->cr_uid;
 
 		/*
 		 * Rough estimate only: the TRAPPED, STOPPED and
 		 * CONTINUED cases are fixed up later by the caller.
 		 */
 		if (WCOREDUMP(p->p_xstat) || WIFSIGNALED(p->p_xstat)) {
 			siginfo->si_code = WCOREDUMP(p->p_xstat) ?
 			    CLD_DUMPED : CLD_KILLED;
 			siginfo->si_status = WTERMSIG(p->p_xstat);
 		} else {
 			siginfo->si_code = CLD_EXITED;
 			siginfo->si_status = WEXITSTATUS(p->p_xstat);
 		}
 
 		/*
 		 * si_addr would be a useful extra detail, but the PC
 		 * value may already be lost by now; the bzero() above
 		 * leaves si_addr NULL.
 		 */
 	}
 
 	/*
 	 * Resource usage is reported for any matching process, not only
 	 * for exited ones: a snapshot of a stopped process may be
 	 * exactly what the caller wants.
 	 */
 	if (wrusage != NULL) {
 		wrusage->wru_self = p->p_ru;
 		calcru(p, &wrusage->wru_self.ru_utime,
 		    &wrusage->wru_self.ru_stime);
 
 		wrusage->wru_children = p->p_stats->p_cru;
 		calccru(p, &wrusage->wru_children.ru_utime,
 		    &wrusage->wru_children.ru_stime);
 	}
 
 	if (p->p_state != PRS_ZOMBIE) {
 		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		return (1);
 	}
 
 	/* Zombie: hand it over with both locks on p still held. */
 	proc_reap(td, p, status, options);
 	return (-1);
 
 nomatch:
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 /*
  * wait4(2)-style front end to kern_wait6().
  *
  * Translates the classic 'pid' argument into an (idtype, id) pair, adds
  * WEXITED | WTRAPPED to 'options' for backward compatibility, and, when
  * 'rusage' is not NULL, copies the wru_self part of the usage gathered
  * by kern_wait6() into *rusage.  Returns whatever kern_wait6() returns.
  */
 int
 kern_wait(struct thread *td, pid_t pid, int *status, int options,
     struct rusage *rusage)
 {
 	struct __wrusage wru, *wrup;
 	idtype_t idtype;
 	id_t id;
 	int ret;
 
 	/*
 	 * Translate the special pid values into the (idtype, pid)
 	 * pair for kern_wait6.  The WAIT_MYPGRP case is handled by
 	 * kern_wait6() on its own.
 	 */
 	if (pid == WAIT_ANY) {
 		idtype = P_ALL;
 		id = 0;
 	} else if (pid < 0) {
 		idtype = P_PGID;
 		id = (id_t)-pid;
 	} else {
 		idtype = P_PID;
 		id = (id_t)pid;
 	}
 
 	if (rusage != NULL) {
 		/*
 		 * kern_wait6() can return (e.g. EINVAL, ECHILD) without
 		 * ever writing to the usage buffer.  Start from zeroes so
 		 * that the unconditional copy below never hands stale
 		 * kernel stack contents back to the caller.
 		 */
 		bzero(&wru, sizeof(wru));
 		wrup = &wru;
 	} else
 		wrup = NULL;
 
 	/*
 	 * For backward compatibility we implicitly add flags WEXITED
 	 * and WTRAPPED here.
 	 */
 	options |= WEXITED | WTRAPPED;
 	ret = kern_wait6(td, idtype, id, status, options, wrup, NULL);
 	if (rusage != NULL)
 		*rusage = wru.wru_self;
 	return (ret);
 }
 
 /*
  * Common implementation of the wait family.
  *
  * Scans the children and then the orphans of the calling process for one
  * that matches (idtype, id) and has an event selected by 'options'.
  * Stopped, trapped and continued children are reported directly from
  * here: td->td_retval[0] is set to the pid and *status / *siginfo are
  * filled in when supplied.  Zombies are handled through proc_to_reap().
  *
  * Returns 0 when a process was reported, or when WNOHANG was given and
  * nothing was ready (td->td_retval[0] is then 0).  Returns EINVAL for
  * unknown options or an empty event set, ECHILD when no process matches,
  * and otherwise the error from msleep().
  */
 int
 kern_wait6(struct thread *td, idtype_t idtype, id_t id, int *status,
     int options, struct __wrusage *wrusage, siginfo_t *siginfo)
 {
 	struct proc *p, *q;
 	int error, nfound, ret;
 
 	AUDIT_ARG_VALUE((int)idtype);	/* XXX - This is likely wrong! */
 	AUDIT_ARG_PID((pid_t)id);	/* XXX - This may be wrong! */
 	AUDIT_ARG_VALUE(options);
 
 	q = td->td_proc;
 
 	/* WAIT_MYPGRP: substitute the caller's own process group id. */
 	if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) {
 		PROC_LOCK(q);
 		id = (id_t)q->p_pgid;
 		PROC_UNLOCK(q);
 		idtype = P_PGID;
 	}
 
 	/* If we don't know the option, just return. */
 	if ((options & ~(WUNTRACED | WNOHANG | WCONTINUED | WNOWAIT |
 	    WEXITED | WTRAPPED | WLINUXCLONE)) != 0)
 		return (EINVAL);
 	if ((options & (WEXITED | WUNTRACED | WCONTINUED | WTRAPPED)) == 0) {
 		/*
 		 * We will be unable to find any matching processes,
 		 * because there are no known events to look for.
 		 * Prefer to return error instead of blocking
 		 * indefinitely.
 		 */
 		return (EINVAL);
 	}
 
 loop:
 	/* Clear P_STATCHILD before scanning; it is re-checked before sleeping. */
 	if (q->p_flag & P_STATCHILD) {
 		PROC_LOCK(q);
 		q->p_flag &= ~P_STATCHILD;
 		PROC_UNLOCK(q);
 	}
 	nfound = 0;
 	sx_xlock(&proctree_lock);
 	LIST_FOREACH(p, &q->p_children, p_sibling) {
 		/*
 		 * proc_to_reap() returns 0 (no match), 1 (match, not a
 		 * zombie) or -1 (zombie passed to proc_reap()).
 		 * NOTE(review): on the -1 path proctree_lock is not dropped
 		 * here; presumably proc_reap() releases it -- confirm.
 		 */
 		ret = proc_to_reap(td, p, idtype, id, status, options,
 		    wrusage, siginfo);
 		if (ret == 0)
 			continue;
 		else if (ret == 1)
 			nfound++;
 		else
 			return (0);
 
 		PROC_LOCK(p);
 		PROC_SLOCK(p);
 
 		/* Traced child with every thread suspended, not yet reported. */
 		if ((options & WTRAPPED) != 0 &&
 		    (p->p_flag & P_TRACED) != 0 &&
 		    (p->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) != 0 &&
 		    (p->p_suspcount == p->p_numthreads) &&
 		    ((p->p_flag & P_WAITED) == 0)) {
 			PROC_SUNLOCK(p);
 			/* WNOWAIT leaves the event available for a later wait. */
 			if ((options & WNOWAIT) == 0)
 				p->p_flag |= P_WAITED;
 			sx_xunlock(&proctree_lock);
 			td->td_retval[0] = p->p_pid;
 
 			if (status != NULL)
 				*status = W_STOPCODE(p->p_xstat);
 			if (siginfo != NULL) {
 				siginfo->si_status = p->p_xstat;
 				siginfo->si_code = CLD_TRAPPED;
 			}
 			if ((options & WNOWAIT) == 0) {
 				PROC_LOCK(q);
 				sigqueue_take(p->p_ksi);
 				PROC_UNLOCK(q);
 			}
 
 			PROC_UNLOCK(p);
 			return (0);
 		}
 		/* Child stopped by a signal, every thread suspended, not yet reported. */
 		if ((options & WUNTRACED) != 0 &&
 		    (p->p_flag & P_STOPPED_SIG) != 0 &&
 		    (p->p_suspcount == p->p_numthreads) &&
 		    ((p->p_flag & P_WAITED) == 0)) {
 			PROC_SUNLOCK(p);
 			if ((options & WNOWAIT) == 0)
 				p->p_flag |= P_WAITED;
 			sx_xunlock(&proctree_lock);
 			td->td_retval[0] = p->p_pid;
 
 			if (status != NULL)
 				*status = W_STOPCODE(p->p_xstat);
 			if (siginfo != NULL) {
 				siginfo->si_status = p->p_xstat;
 				siginfo->si_code = CLD_STOPPED;
 			}
 			if ((options & WNOWAIT) == 0) {
 				PROC_LOCK(q);
 				sigqueue_take(p->p_ksi);
 				PROC_UNLOCK(q);
 			}
 
 			PROC_UNLOCK(p);
 			return (0);
 		}
 		PROC_SUNLOCK(p);
 		/* Child that has been continued (P_CONTINUED set). */
 		if ((options & WCONTINUED) != 0 &&
 		    (p->p_flag & P_CONTINUED) != 0) {
 			sx_xunlock(&proctree_lock);
 			td->td_retval[0] = p->p_pid;
 			if ((options & WNOWAIT) == 0) {
 				p->p_flag &= ~P_CONTINUED;
 				PROC_LOCK(q);
 				sigqueue_take(p->p_ksi);
 				PROC_UNLOCK(q);
 			}
 			PROC_UNLOCK(p);
 
 			if (status != NULL)
 				*status = SIGCONT;
 			if (siginfo != NULL) {
 				siginfo->si_status = SIGCONT;
 				siginfo->si_code = CLD_CONTINUED;
 			}
 			return (0);
 		}
 		PROC_UNLOCK(p);
 	}
 
 	/*
 	 * Look in the orphans list too, to allow the parent to
 	 * collect its child's exit status even if the child is being
 	 * debugged.
 	 *
 	 * Debugger detaches from the parent upon successful
 	 * switch-over from parent to child.  At this point due to
 	 * re-parenting the parent loses the child to debugger and a
 	 * wait4(2) call would report that it has no children to wait
 	 * for.  By maintaining a list of orphans we allow the parent
 	 * to successfully wait until the child becomes a zombie.
 	 */
 	LIST_FOREACH(p, &q->p_orphans, p_orphan) {
 		ret = proc_to_reap(td, p, idtype, id, status, options,
 		    wrusage, siginfo);
 		if (ret == 0)
 			continue;
 		else if (ret == 1)
 			nfound++;
 		else
 			return (0);
 	}
 	if (nfound == 0) {
 		sx_xunlock(&proctree_lock);
 		return (ECHILD);
 	}
 	if (options & WNOHANG) {
 		sx_xunlock(&proctree_lock);
 		td->td_retval[0] = 0;
 		return (0);
 	}
 	/*
 	 * Take q's lock before dropping proctree_lock, then either consume
 	 * a P_STATCHILD set since the scan started or sleep on q.
 	 */
 	PROC_LOCK(q);
 	sx_xunlock(&proctree_lock);
 	if (q->p_flag & P_STATCHILD) {
 		q->p_flag &= ~P_STATCHILD;
 		error = 0;
 	} else
 		error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0);
 	PROC_UNLOCK(q);
 	if (error)
 		return (error);
 	goto loop;
 }
 
 /*
  * Re-home 'child' under 'parent'.
  *
  * The caller must hold proctree_lock exclusively and own the child's
  * proc lock.  Nothing is done when 'parent' already is the parent.
  * A traced child is additionally put on the previous parent's orphan
  * list and flagged P_ORPHAN.
  */
 void
 proc_reparent(struct proc *child, struct proc *parent)
 {
 	struct proc *oldparent;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(child, MA_OWNED);
 
 	oldparent = child->p_pptr;
 	if (oldparent == parent)
 		return;
 
 	/* Withdraw the child's queued ksiginfo under the old parent's lock. */
 	PROC_LOCK(oldparent);
 	sigqueue_take(child->p_ksi);
 	PROC_UNLOCK(oldparent);
 
 	/* Switch sibling lists. */
 	LIST_REMOVE(child, p_sibling);
 	LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
 
 	/* Drop any existing orphan linkage, then re-create it if traced. */
 	clear_orphan(child);
 	if ((child->p_flag & P_TRACED) != 0) {
 		child->p_flag |= P_ORPHAN;
 		LIST_INSERT_HEAD(&oldparent->p_orphans, child, p_orphan);
 	}
 
 	child->p_pptr = parent;
 }
Index: head/sys/kern/kern_fork.c
===================================================================
--- head/sys/kern/kern_fork.c	(revision 258540)
+++ head/sys/kern/kern_fork.c	(revision 258541)
@@ -1,1058 +1,1057 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_ktrace.h"
 #include "opt_kstack_pages.h"
 #include "opt_procdesc.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/sysctl.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/procdesc.h>
 #include <sys/pioctl.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/syscall.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/acct.h>
 #include <sys/ktr.h>
 #include <sys/ktrace.h>
 #include <sys/unistd.h>	
 #include <sys/sdt.h>
 #include <sys/sx.h>
 #include <sys/sysent.h>
 #include <sys/signalvar.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 dtrace_fork_func_t	dtrace_fasttrap_fork;
 #endif
 
 /*
  * SDT probe proc:kernel::create with three arguments.  do_fork() fires
  * it with the child process, the parent process and the fork flags.
  */
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE3(proc, kernel, , create, create, "struct proc *",
     "struct proc *", "int");
 
 /*
  * Fallback definition of the fork(2) argument structure, compiled only
  * when _SYS_SYSPROTO_H_ is not defined.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fork_args {
 	int     dummy;
 };
 #endif
 
 /* ARGSUSED */
 int
 sys_fork(struct thread *td, struct fork_args *uap)
 {
 	int error;
 	struct proc *p2;
 
 	error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0);
 	if (error == 0) {
 		td->td_retval[0] = p2->p_pid;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 /*
  * pdfork(2) system call: fork and additionally create a process
  * descriptor for the child.
  *
  * On success the child's pid is placed in td->td_retval[0] and the new
  * descriptor number is copied out to uap->fdp; a copyout() failure is
  * returned as the error.  Kernels built without PROCDESC return ENOSYS.
  */
 /* ARGSUSED */
 int
 sys_pdfork(struct thread *td, struct pdfork_args *uap)
 {
 #ifdef PROCDESC
 	int error, fd;
 	struct proc *p2;
 
 	/*
 	 * It is necessary to return fd by reference because 0 is a valid file
 	 * descriptor number, and the child needs to be able to distinguish
 	 * itself from the parent using the return value.
 	 */
 	error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2,
 	    &fd, uap->flags);
 	if (error == 0) {
 		td->td_retval[0] = p2->p_pid;
 		td->td_retval[1] = 0;
 		error = copyout(&fd, uap->fdp, sizeof(fd));
 	}
 	return (error);
 #else
 	return (ENOSYS);
 #endif
 }
 
 /* ARGSUSED */
 int
 sys_vfork(struct thread *td, struct vfork_args *uap)
 {
 	int error, flags;
 	struct proc *p2;
 
 	flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
 	error = fork1(td, flags, 0, &p2, NULL, 0);
 	if (error == 0) {
 		td->td_retval[0] = p2->p_pid;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 int
 sys_rfork(struct thread *td, struct rfork_args *uap)
 {
 	struct proc *p2;
 	int error;
 
 	/* Don't allow kernel-only flags. */
 	if ((uap->flags & RFKERNELONLY) != 0)
 		return (EINVAL);
 
 	AUDIT_ARG_FFLAGS(uap->flags);
 	error = fork1(td, uap->flags, 0, &p2, NULL, 0);
 	if (error == 0) {
 		td->td_retval[0] = p2 ? p2->p_pid : 0;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 /*
  * nprocs:  current number of processes; incremented in do_fork() and
  *          compared against maxproc in fork1().
  * lastpid: most recently assigned pid; updated by fork_findpid() and
  *          exported read-only as kern.lastpid.
  */
 int	nprocs = 1;		/* process 0 */
 int	lastpid = 0;
 SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, 
     "Last used PID");
 
 /*
  * Random component to lastpid generation.  We mix in a random factor to make
  * it a little harder to predict.  We sanity check the modulus value to avoid
  * doing it in critical paths.  Don't let it be too small or we pointlessly
  * waste randomness entropy, and don't let it be impossibly large.  Using a
  * modulus that is too big causes a LOT more process table scans and slows
  * down fork processing as the pidchecked caching is defeated.
  */
 static int randompid = 0;
 
 static int
 sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
 {
 	int error, pid;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error != 0)
 		return(error);
 	sx_xlock(&allproc_lock);
 	pid = randompid;
 	error = sysctl_handle_int(oidp, &pid, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (pid < 0 || pid > pid_max - 100)	/* out of range */
 			pid = pid_max - 100;
 		else if (pid < 2)			/* NOP */
 			pid = 0;
 		else if (pid < 100)			/* Make it reasonable */
 			pid = 100;
 		randompid = pid;
 	}
 	sx_xunlock(&allproc_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
 
 /*
  * Choose the process ID for a new process and return it.
  *
  * The search starts at lastpid + 1, raised to at least 10 for RFHIGHPID
  * or otherwise advanced by a random amount when randompid is non-zero.
  * Any candidate already in use as a pid, process group id or session id
  * by an entry on allproc or zombproc is skipped.  'pidchecked' caches
  * the upper end of a range known to be free so that most calls avoid
  * the scan.  Unless RFHIGHPID is set, lastpid is updated to the result.
  */
 static int
 fork_findpid(int flags)
 {
 	struct proc *p;
 	int trypid;
 	static int pidchecked = 0;
 
 	/*
 	 * Requires allproc_lock in order to iterate over the list
 	 * of processes, and proctree_lock to access p_pgrp.
 	 */
 	sx_assert(&allproc_lock, SX_LOCKED);
 	sx_assert(&proctree_lock, SX_LOCKED);
 
 	/*
 	 * Find an unused process ID.  We remember a range of unused IDs
 	 * ready to use (from lastpid+1 through pidchecked-1).
 	 *
 	 * If RFHIGHPID is set (used during system boot), do not allocate
 	 * low-numbered pids.
 	 */
 	trypid = lastpid + 1;
 	if (flags & RFHIGHPID) {
 		if (trypid < 10)
 			trypid = 10;
 	} else {
 		if (randompid)
 			trypid += arc4random() % randompid;
 	}
 retry:
 	/*
 	 * If the process ID prototype has wrapped around,
 	 * restart somewhat above 0, as the low-numbered procs
 	 * tend to include daemons that don't exit.
 	 */
 	if (trypid >= pid_max) {
 		trypid = trypid % pid_max;
 		if (trypid < 100)
 			trypid += 100;
 		pidchecked = 0;
 	}
 	/* Outside the cached free range: rescan both process lists. */
 	if (trypid >= pidchecked) {
 		int doingzomb = 0;
 
 		pidchecked = PID_MAX;
 		/*
 		 * Scan the active and zombie procs to check whether this pid
 		 * is in use.  Remember the lowest pid that's greater
 		 * than trypid, so we can avoid checking for a while.
 		 */
 		p = LIST_FIRST(&allproc);
 again:
 		for (; p != NULL; p = LIST_NEXT(p, p_list)) {
 			/* Bump trypid past any id this process occupies. */
 			while (p->p_pid == trypid ||
 			    (p->p_pgrp != NULL &&
 			    (p->p_pgrp->pg_id == trypid ||
 			    (p->p_session != NULL &&
 			    p->p_session->s_sid == trypid)))) {
 				trypid++;
 				if (trypid >= pidchecked)
 					goto retry;
 			}
 			/* Lower pidchecked to the smallest in-use id above trypid. */
 			if (p->p_pid > trypid && pidchecked > p->p_pid)
 				pidchecked = p->p_pid;
 			if (p->p_pgrp != NULL) {
 				if (p->p_pgrp->pg_id > trypid &&
 				    pidchecked > p->p_pgrp->pg_id)
 					pidchecked = p->p_pgrp->pg_id;
 				if (p->p_session != NULL &&
 				    p->p_session->s_sid > trypid &&
 				    pidchecked > p->p_session->s_sid)
 					pidchecked = p->p_session->s_sid;
 			}
 		}
 		/* Second pass over the zombie list. */
 		if (!doingzomb) {
 			doingzomb = 1;
 			p = LIST_FIRST(&zombproc);
 			goto again;
 		}
 	}
 
 	/*
 	 * RFHIGHPID does not mess with the lastpid counter during boot.
 	 */
 	if (flags & RFHIGHPID)
 		pidchecked = 0;
 	else
 		lastpid = trypid;
 
 	return (trypid);
 }
 
 /*
  * rfork() without RFPROC: no new process is created; selected resources
  * of the calling process are detached from their sharers instead.
  *
  * When the process has had threads (and is not a system process) and
  * the descriptor table is about to change, the other threads are
  * single-threaded at the boundary first; ERESTART is returned if that
  * fails.  Otherwise the result of vm_forkproc() is returned.
  */
 static int
 fork_norfproc(struct thread *td, int flags)
 {
 	struct filedesc *newfd;
 	struct proc *self;
 	int error, singled;
 
 	KASSERT((flags & RFPROC) == 0,
 	    ("fork_norfproc called with RFPROC set"));
 	self = td->td_proc;
 
 	if ((self->p_flag & (P_HADTHREADS | P_SYSTEM)) == P_HADTHREADS &&
 	    (flags & (RFCFDG | RFFDG)) != 0) {
 		PROC_LOCK(self);
 		singled = thread_single(SINGLE_BOUNDARY);
 		PROC_UNLOCK(self);
 		if (singled != 0)
 			return (ERESTART);
 	}
 
 	error = vm_forkproc(td, NULL, NULL, NULL, flags);
 	if (error == 0) {
 		/* RFCFDG: replace the descriptor table with a fresh one. */
 		if ((flags & RFCFDG) != 0) {
 			newfd = fdinit(self->p_fd);
 			fdescfree(td);
 			self->p_fd = newfd;
 		}
 
 		/* RFFDG: stop sharing the descriptor table. */
 		if ((flags & RFFDG) != 0)
 			fdunshare(self, td);
 	}
 
 	/* Leave single-threading mode again, on success and on failure. */
 	if ((self->p_flag & (P_HADTHREADS | P_SYSTEM)) == P_HADTHREADS &&
 	    (flags & (RFCFDG | RFFDG)) != 0) {
 		PROC_LOCK(self);
 		thread_single_end();
 		PROC_UNLOCK(self);
 	}
 	return (error);
 }
 
 /*
  * Second stage of fork1(): turn the pre-allocated process 'p2' with its
  * thread 'td2' into a child of td's process.
  *
  * td      - the forking thread; its process is the parent (p1).
  * flags   - RF* flags describing what is shared, copied or cleared.
  * vm2     - passed through to vm_forkproc() for the child.
  * pdflags - passed to procdesc_new() when RFPROCDESC is set.
  *
  * Must be entered with proctree_lock held shared and allproc_lock held
  * exclusive (asserted below); both are released in here.  The function
  * cannot fail.  Before returning it waits until TDB_STOPATFORK has been
  * cleared on the child thread.
  */
 static void
 do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
     struct vmspace *vm2, int pdflags)
 {
 	struct proc *p1, *pptr;
 	int p2_held, trypid;
 	struct filedesc *fd;
 	struct filedesc_to_leader *fdtol;
 	struct sigacts *newsigacts;
 
 	sx_assert(&proctree_lock, SX_SLOCKED);
 	sx_assert(&allproc_lock, SX_XLOCKED);
 
 	p2_held = 0;
 	p1 = td->td_proc;
 
 	/*
 	 * Increment the nprocs resource before blocking can occur.  There
 	 * are hard-limits as to the number of processes that can run.
 	 */
 	nprocs++;
 
 	trypid = fork_findpid(flags);
 
 	sx_sunlock(&proctree_lock);
 
 	/* Publish the child on allproc and in the pid/tid hashes. */
 	p2->p_state = PRS_NEW;		/* protect against others */
 	p2->p_pid = trypid;
 	AUDIT_ARG_PID(p2->p_pid);
 	LIST_INSERT_HEAD(&allproc, p2, p_list);
 	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
 	tidhash_add(td2);
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
 	sx_xunlock(&allproc_lock);
 
 	/* Copy the p_startcopy..p_endcopy section from the parent. */
 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
 	    __rangeof(struct proc, p_startcopy, p_endcopy));
 	pargs_hold(p2->p_args);
 	PROC_UNLOCK(p1);
 
 	/* Clear the p_startzero..p_endzero section of the child. */
 	bzero(&p2->p_startzero,
 	    __rangeof(struct proc, p_startzero, p_endzero));
 
 	p2->p_ucred = crhold(td->td_ucred);
 
 	/* Tell the prison that we exist. */
 	prison_proc_hold(p2->p_ucred->cr_prison);
 
 	PROC_UNLOCK(p2);
 
 	/*
 	 * Malloc things while we don't hold any locks.
 	 */
 	if (flags & RFSIGSHARE)
 		newsigacts = NULL;
 	else
 		newsigacts = sigacts_alloc();
 
 	/*
 	 * Copy filedesc.
 	 */
 	if (flags & RFCFDG) {
 		fd = fdinit(p1->p_fd);
 		fdtol = NULL;
 	} else if (flags & RFFDG) {
 		fd = fdcopy(p1->p_fd);
 		fdtol = NULL;
 	} else {
 		fd = fdshare(p1->p_fd);
 		if (p1->p_fdtol == NULL)
 			p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL,
 			    p1->p_leader);
 		if ((flags & RFTHREAD) != 0) {
 			/*
 			 * Shared file descriptor table, and shared
 			 * process leaders.
 			 */
 			fdtol = p1->p_fdtol;
 			FILEDESC_XLOCK(p1->p_fd);
 			fdtol->fdl_refcount++;
 			FILEDESC_XUNLOCK(p1->p_fd);
 		} else {
 			/*
 			 * Shared file descriptor table, and different
 			 * process leaders.
 			 */
 			fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
 			    p1->p_fd, p2);
 		}
 	}
 	/*
 	 * Make a proc table entry for the new process.
 	 * Start by zeroing the section of proc that is zero-initialized,
 	 * then copy the section that is copied directly from the parent.
 	 */
 
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
 	bzero(&td2->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
 
 	bcopy(&td->td_startcopy, &td2->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
 
 	bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
 	td2->td_sigstk = td->td_sigstk;
 	td2->td_flags = TDF_INMEM;
 	td2->td_lend_user_pri = PRI_MAX;
 
 #ifdef VIMAGE
 	td2->td_vnet = NULL;
 	td2->td_vnet_lpush = NULL;
 #endif
 
 	/*
 	 * Allow the scheduler to initialize the child.
 	 */
 	thread_lock(td);
 	sched_fork(td, td2);
 	thread_unlock(td);
 
 	/*
 	 * Duplicate sub-structures as needed.
 	 * Increase reference counts on shared objects.
 	 */
 	p2->p_flag = P_INMEM;
 	p2->p_flag2 = 0;
 	p2->p_swtick = ticks;
 	if (p1->p_flag & P_PROFIL)
 		startprofclock(p2);
 	td2->td_ucred = crhold(p2->p_ucred);
 
 	if (flags & RFSIGSHARE) {
 		p2->p_sigacts = sigacts_hold(p1->p_sigacts);
 	} else {
 		sigacts_copy(newsigacts, p1->p_sigacts);
 		p2->p_sigacts = newsigacts;
 	}
 
 	/* Select the signal stored in p_sigparent for this child. */
 	if (flags & RFTSIGZMB)
 	        p2->p_sigparent = RFTSIGNUM(flags);
 	else if (flags & RFLINUXTHPN)
 	        p2->p_sigparent = SIGUSR1;
 	else
 	        p2->p_sigparent = SIGCHLD;
 
 	p2->p_textvp = p1->p_textvp;
 	p2->p_fd = fd;
 	p2->p_fdtol = fdtol;
 
 	if (p1->p_flag2 & P2_INHERIT_PROTECTED) {
 		p2->p_flag |= P_PROTECTED;
 		p2->p_flag2 |= P2_INHERIT_PROTECTED;
 	}
 
 	/*
 	 * p_limit is copy-on-write.  Bump its refcount.
 	 */
 	lim_fork(p1, p2);
 
 	pstats_fork(p1->p_stats, p2->p_stats);
 
 	PROC_UNLOCK(p1);
 	PROC_UNLOCK(p2);
 
 	/* Bump references to the text vnode (for procfs). */
 	if (p2->p_textvp)
 		vref(p2->p_textvp);
 
 	/*
 	 * Set up linkage for kernel based threading.
 	 */
 	if ((flags & RFTHREAD) != 0) {
 		mtx_lock(&ppeers_lock);
 		p2->p_peers = p1->p_peers;
 		p1->p_peers = p2;
 		p2->p_leader = p1->p_leader;
 		mtx_unlock(&ppeers_lock);
 		PROC_LOCK(p1->p_leader);
 		if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
 			PROC_UNLOCK(p1->p_leader);
 			/*
 			 * The task leader is exiting, so process p1 is
 			 * going to be killed shortly.  Since p1 obviously
 			 * isn't dead yet, we know that the leader is either
 			 * sending SIGKILL's to all the processes in this
 			 * task or is sleeping waiting for all the peers to
 			 * exit.  We let p1 complete the fork, but we need
 			 * to go ahead and kill the new process p2 since
 			 * the task leader may not get a chance to send
 			 * SIGKILL to it.  We leave it on the list so that
 			 * the task leader will wait for this new process
 			 * to commit suicide.
 			 */
 			PROC_LOCK(p2);
 			kern_psignal(p2, SIGKILL);
 			PROC_UNLOCK(p2);
 		} else
 			PROC_UNLOCK(p1->p_leader);
 	} else {
 		p2->p_peers = NULL;
 		p2->p_leader = p2;
 	}
 
 	/* Link the child into the process group and the process tree. */
 	sx_xlock(&proctree_lock);
 	PGRP_LOCK(p1->p_pgrp);
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
 	/*
 	 * Preserve some more flags in subprocess.  P_PROFIL has already
 	 * been preserved.
 	 */
 	p2->p_flag |= p1->p_flag & P_SUGID;
 	td2->td_pflags |= td->td_pflags & TDP_ALTSTACK;
 	SESS_LOCK(p1->p_session);
 	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
 		p2->p_flag |= P_CONTROLT;
 	SESS_UNLOCK(p1->p_session);
 	if (flags & RFPPWAIT)
 		p2->p_flag |= P_PPWAIT;
 
 	p2->p_pgrp = p1->p_pgrp;
 	LIST_INSERT_AFTER(p1, p2, p_pglist);
 	PGRP_UNLOCK(p1->p_pgrp);
 	LIST_INIT(&p2->p_children);
 	LIST_INIT(&p2->p_orphans);
 
 	callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0);
 
 	/*
 	 * If PF_FORK is set, the child process inherits the
 	 * procfs ioctl flags from its parent.
 	 */
 	if (p1->p_pfsflags & PF_FORK) {
 		p2->p_stops = p1->p_stops;
 		p2->p_pfsflags = p1->p_pfsflags;
 	}
 
 	/*
 	 * This begins the section where we must prevent the parent
 	 * from being swapped.
 	 */
 	_PHOLD(p1);
 	PROC_UNLOCK(p1);
 
 	/*
 	 * Attach the new process to its parent.
 	 *
 	 * If RFNOWAIT is set, the newly created process becomes a child
 	 * of init.  This effectively disassociates the child from the
 	 * parent.
 	 */
 	if (flags & RFNOWAIT)
 		pptr = initproc;
 	else
 		pptr = p1;
 	p2->p_pptr = pptr;
 	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
 	sx_xunlock(&proctree_lock);
 
 	/* Inform accounting that we have forked. */
 	p2->p_acflag = AFORK;
 	PROC_UNLOCK(p2);
 
 #ifdef KTRACE
 	ktrprocfork(p1, p2);
 #endif
 
 	/*
 	 * Finish creating the child process.  It will return via a different
 	 * execution path later.  (ie: directly into user mode)
 	 */
 	vm_forkproc(td, p2, td2, vm2, flags);
 
 	/*
 	 * Per-CPU statistics: the exact flag sets used by sys_fork() and
 	 * sys_vfork() are counted as forks and vforks; anything else is a
 	 * kthread when the parent is proc0, and an rfork otherwise.
 	 */
 	if (flags == (RFFDG | RFPROC)) {
 		PCPU_INC(cnt.v_forks);
 		PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
 		PCPU_INC(cnt.v_vforks);
 		PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else if (p1 == &proc0) {
 		PCPU_INC(cnt.v_kthreads);
 		PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else {
 		PCPU_INC(cnt.v_rforks);
 		PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	}
 
 #ifdef PROCDESC
 	/*
 	 * Associate the process descriptor with the process before anything
 	 * can happen that might cause that process to need the descriptor.
 	 * However, don't do this until after fork(2) can no longer fail.
 	 */
 	if (flags & RFPROCDESC)
 		procdesc_new(p2, pdflags);
 #endif
 
 	/*
 	 * Both processes are set up, now check if any loadable modules want
 	 * to adjust anything.
 	 */
 	EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);
 
 	/*
 	 * Set the child start time and mark the process as being complete.
 	 */
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 	microuptime(&p2->p_stats->p_start);
 	PROC_SLOCK(p2);
 	p2->p_state = PRS_NORMAL;
 	PROC_SUNLOCK(p2);
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * Tell the DTrace fasttrap provider about the new process
 	 * if it has registered an interest. We have to do this only after
 	 * p_state is PRS_NORMAL since the fasttrap module will use pfind()
 	 * later on.
 	 */
 	if (dtrace_fasttrap_fork)
 		dtrace_fasttrap_fork(p1, p2);
 #endif
 	if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED |
 	    P_FOLLOWFORK)) {
 		/*
 		 * Arrange for debugger to receive the fork event.
 		 *
 		 * We can report PL_FLAG_FORKED regardless of
 		 * P_FOLLOWFORK settings, but it does not make sense
 		 * for a runaway child.
 		 */
 		td->td_dbgflags |= TDB_FORK;
 		td->td_dbg_forked = p2->p_pid;
 		td2->td_dbgflags |= TDB_STOPATFORK;
 		/* Hold p2 until the TDB_STOPATFORK wait below has finished. */
 		_PHOLD(p2);
 		p2_held = 1;
 	}
 	if (flags & RFPPWAIT) {
 		td->td_pflags |= TDP_RFPPWAIT;
 		td->td_rfppwait_p = p2;
 	}
 	PROC_UNLOCK(p2);
 	if ((flags & RFSTOPPED) == 0) {
 		/*
 		 * If RFSTOPPED not requested, make child runnable and
 		 * add to run queue.
 		 */
 		thread_lock(td2);
 		TD_SET_CAN_RUN(td2);
 		sched_add(td2, SRQ_BORING);
 		thread_unlock(td2);
 	}
 
 	/*
 	 * Now can be swapped.
 	 */
 	_PRELE(p1);
 	PROC_UNLOCK(p1);
 
 	/*
 	 * Tell any interested parties about the new process.
 	 */
 	knote_fork(&p1->p_klist, p2->p_pid);
 	SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
 
 	/*
 	 * Wait until debugger is attached to child.
 	 */
 	PROC_LOCK(p2);
 	while ((td2->td_dbgflags & TDB_STOPATFORK) != 0)
 		cv_wait(&p2->p_dbgwait, &p2->p_mtx);
 	if (p2_held)
 		_PRELE(p2);
 	PROC_UNLOCK(p2);
 }
 
 /*
  * Common back end of the fork family of system calls.
  *
  * td        - the forking thread.
  * flags     - RF* flags; validated below.
  * pages     - kernel stack size in pages for the child thread, or 0 to
  *             use KSTACK_PAGES.
  * procp     - out: the new process on success, or NULL when RFPROC is
  *             not set.
  * procdescp - out: descriptor number of the new process descriptor;
  *             must be non-NULL when RFPROCDESC is set.
  * pdflags   - passed through to do_fork().
  *
  * Returns 0 on success.  EINVAL is returned for invalid flag
  * combinations, ENOMEM when the thread, kernel stack, vmspace or swap
  * reservation cannot be obtained, and EAGAIN when resource accounting
  * fails or a process limit would be exceeded.  An error from falloc()
  * is passed through.  Without RFPROC the result of fork_norfproc() is
  * returned.  Failures after the proc has been allocated sleep for
  * hz / 2 ticks before returning.
  */
 int
 fork1(struct thread *td, int flags, int pages, struct proc **procp,
     int *procdescp, int pdflags)
 {
 	struct proc *p1;
 	struct proc *newproc;
 	int ok;
 	struct thread *td2;
 	struct vmspace *vm2;
 	vm_ooffset_t mem_charged;
 	int error;
 	static int curfail;
 	static struct timeval lastfail;
 #ifdef PROCDESC
 	struct file *fp_procdesc = NULL;
 #endif
 
 	/* Check for the undefined or unimplemented flags. */
 	if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
 		return (EINVAL);
 
 	/* Signal value requires RFTSIGZMB. */
 	if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
 		return (EINVAL);
 
 	/* Can't copy and clear. */
 	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
 		return (EINVAL);
 
 	/* Check the validity of the signal number. */
 	if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
 		return (EINVAL);
 
 #ifdef PROCDESC
 	if ((flags & RFPROCDESC) != 0) {
 		/* A process descriptor requires that a process is created. */
 		if ((flags & RFPROC) == 0)
 			return (EINVAL);
 
 		/* Must provide a place to put a procdesc if creating one. */
 		if (procdescp == NULL)
 			return (EINVAL);
 	}
 #endif
 
 	p1 = td->td_proc;
 
 	/*
 	 * Here we don't create a new process, but we divorce
 	 * certain parts of a process from itself.
 	 */
 	if ((flags & RFPROC) == 0) {
 		*procp = NULL;
 		return (fork_norfproc(td, flags));
 	}
 
 #ifdef PROCDESC
 	/*
 	 * If required, create a process descriptor in the parent first; we
 	 * will abandon it if something goes wrong. We don't finit() until
 	 * later.
 	 */
 	if (flags & RFPROCDESC) {
 		error = falloc(td, &fp_procdesc, procdescp, 0);
 		if (error != 0)
 			return (error);
 	}
 #endif
 
 	mem_charged = 0;
 	vm2 = NULL;
 	if (pages == 0)
 		pages = KSTACK_PAGES;
 	/* Allocate new proc. */
 	newproc = uma_zalloc(proc_zone, M_WAITOK);
 	/*
 	 * The proc taken from the zone may already have a thread attached;
 	 * reuse it, replacing its kernel stack when the stack is missing
 	 * or of a different size.
 	 */
 	td2 = FIRST_THREAD_IN_PROC(newproc);
 	if (td2 == NULL) {
 		td2 = thread_alloc(pages);
 		if (td2 == NULL) {
 			error = ENOMEM;
 			goto fail1;
 		}
 		proc_linkup(newproc, td2);
 	} else {
 		if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) {
 			if (td2->td_kstack != 0)
 				vm_thread_dispose(td2);
 			if (!thread_alloc_stack(td2, pages)) {
 				error = ENOMEM;
 				goto fail1;
 			}
 		}
 	}
 
 	/* Unless the address space is shared (RFMEM), fork a new vmspace. */
 	if ((flags & RFMEM) == 0) {
 		vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
 		if (vm2 == NULL) {
 			error = ENOMEM;
 			goto fail1;
 		}
 		if (!swap_reserve(mem_charged)) {
 			/*
 			 * The swap reservation failed. The accounting
 			 * from the entries of the copied vm2 will be
 			 * subtracted in vmspace_free(), so force the
 			 * reservation there.
 			 */
 			swap_reserve_force(mem_charged);
 			error = ENOMEM;
 			goto fail1;
 		}
 	} else
 		vm2 = NULL;
 
 	/*
 	 * XXX: This is ugly; when we copy resource usage, we need to bump
 	 *      per-cred resource counters.
 	 */
 	newproc->p_ucred = p1->p_ucred;
 
 	/*
 	 * Initialize resource accounting for the child process.
 	 */
 	error = racct_proc_fork(p1, newproc);
 	if (error != 0) {
 		error = EAGAIN;
 		goto fail1;
 	}
 
 #ifdef MAC
 	mac_proc_init(newproc);
 #endif
 	knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx);
 	STAILQ_INIT(&newproc->p_ktr);
 
 	/* We have to lock the process tree while we look for a pid. */
 	sx_slock(&proctree_lock);
 
 	/*
 	 * Although process entries are dynamically created, we still keep
 	 * a global limit on the maximum number we will create.  Don't allow
 	 * a nonprivileged user to use the last ten processes; don't let root
 	 * exceed the limit. The variable nprocs is the current number of
 	 * processes, maxproc is the limit.
 	 */
 	sx_xlock(&allproc_lock);
 	if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
 	    PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
 		error = EAGAIN;
 		goto fail;
 	}
 
 	/*
 	 * Increment the count of procs running with this uid. Don't allow
 	 * a nonprivileged user to exceed their current limit.
 	 *
 	 * XXXRW: Can we avoid privilege here if it's not needed?
 	 */
 	error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
 	if (error == 0)
 		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
 	else {
 		PROC_LOCK(p1);
 		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
 		    lim_cur(p1, RLIMIT_NPROC));
 		PROC_UNLOCK(p1);
 	}
 	if (ok) {
 		/* do_fork() releases proctree_lock and allproc_lock. */
 		do_fork(td, flags, newproc, td2, vm2, pdflags);
 
 		/*
 		 * Return child proc pointer to parent.
 		 */
 		*procp = newproc;
 #ifdef PROCDESC
 		if (flags & RFPROCDESC) {
 			procdesc_finit(newproc->p_procdesc, fp_procdesc);
 			fdrop(fp_procdesc, td);
 		}
 #endif
 		racct_proc_fork_done(newproc);
 		return (0);
 	}
 
 	error = EAGAIN;
 	/*
 	 * fail: reached with proctree_lock (shared) and allproc_lock
 	 * (exclusive) still held; also undoes the MAC and racct setup.
 	 */
 fail:
 	sx_sunlock(&proctree_lock);
 	if (ppsratecheck(&lastfail, &curfail, 1))
 		printf("maxproc limit exceeded by uid %u (pid %d); see tuning(7) and login.conf(5)\n",
 		    td->td_ucred->cr_ruid, p1->p_pid);
 	sx_xunlock(&allproc_lock);
 #ifdef MAC
 	mac_proc_destroy(newproc);
 #endif
 	racct_proc_exit(newproc);
 	/*
 	 * fail1: release the vmspace and the proc, abandon the process
 	 * descriptor, then sleep for hz / 2 ticks before reporting the
 	 * error.
 	 */
 fail1:
 	if (vm2 != NULL)
 		vmspace_free(vm2);
 	uma_zfree(proc_zone, newproc);
 #ifdef PROCDESC
 	if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) {
 		fdclose(td->td_proc->p_fd, fp_procdesc, *procdescp, td);
 		fdrop(fp_procdesc, td);
 	}
 #endif
 	pause("fork", hz / 2);
 	return (error);
 }
 
 /*
  * Handle the return of a child process from fork1().  This function
  * is called from the MD fork_trampoline() entry point.
  *
  * callout is the handler to run for the new thread and must not be NULL;
  * it is invoked as callout(arg, frame).  If callout returns in a process
  * flagged P_KTHREAD, the event is logged and kproc_exit(0) is called.
  * After that Giant is asserted not owned and the ABI's sv_schedtail hook,
  * when one is set, is run on the thread before returning.
  */
 void
 fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
     struct trapframe *frame)
 {
 	struct proc *p;
 	struct thread *td;
 	struct thread *dtd;
 
 	td = curthread;
 	p = td->td_proc;
 	KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));
 
 	CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
 		td, td->td_sched, p->p_pid, td->td_name);
 
 	sched_fork_exit(td);
 	/*
 	 * Processes normally resume in mi_switch() after being
 	 * cpu_switch()'ed to, but when children start up they arrive here
 	 * instead, so we must do much the same things as mi_switch() would.
 	 *
 	 * Take over whatever thread is parked in this CPU's deadthread
 	 * slot and hand it to thread_stash().
 	 */
 	if ((dtd = PCPU_GET(deadthread))) {
 		PCPU_SET(deadthread, NULL);
 		thread_stash(dtd);
 	}
 	thread_unlock(td);
 
 	/*
 	 * cpu_set_fork_handler intercepts this function call to
 	 * have this call a non-return function to stay in kernel mode.
 	 * initproc has its own fork handler, but it does return.
 	 */
 	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
 	callout(arg, frame);
 
 	/*
 	 * Check if a kernel thread misbehaved and returned from its main
 	 * function.
 	 */
 	if (p->p_flag & P_KTHREAD) {
 		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
 		    td->td_name, p->p_pid);
 		kproc_exit(0);
 	}
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	/* Give the process ABI a chance to run its schedtail hook. */
 	if (p->p_sysent->sv_schedtail != NULL)
 		(p->p_sysent->sv_schedtail)(td);
 }
 
 /*
  * Simplified back end of syscall(), used when returning from fork()
  * directly into user mode.  Giant is not held on entry, and must not
  * be held on return.  This function is passed in to fork_exit() as the
  * first parameter and is called when returning to a new userland process.
  *
  * td is the new thread and frame its trap frame; both are forwarded to
  * userret().  When TDB_STOPATFORK is set on td, the pending debugger
  * auto-attach request is resolved first.
  */
 void
 fork_return(struct thread *td, struct trapframe *frame)
 {
 	struct proc *p, *dbg;
 
 	if (td->td_dbgflags & TDB_STOPATFORK) {
 		p = td->td_proc;
 		/*
 		 * The parent links and trace flags are examined, and the
 		 * process possibly reparented, with proctree_lock held
 		 * exclusively and the process lock held.
 		 */
 		sx_xlock(&proctree_lock);
 		PROC_LOCK(p);
 		if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) ==
 		    (P_TRACED | P_FOLLOWFORK)) {
 			/*
 			 * If debugger still wants auto-attach for the
 			 * parent's children, do it now.
 			 *
 			 * NOTE(review): the parent's parent is taken to be
 			 * the debugger here -- confirm against the ptrace
 			 * attach path.  The current parent's pid is saved in
 			 * p_oppid before the reparent.
 			 */
 			dbg = p->p_pptr->p_pptr;
 			p->p_flag |= P_TRACED;
 			p->p_oppid = p->p_pptr->p_pid;
 			proc_reparent(p, dbg);
 			sx_xunlock(&proctree_lock);
 			/* TDB_CHILD is set only for the duration of the stop. */
 			td->td_dbgflags |= TDB_CHILD;
 			ptracestop(td, SIGSTOP);
 			td->td_dbgflags &= ~TDB_CHILD;
 		} else {
 			/*
 			 * ... otherwise clear the request.
 			 */
 			sx_xunlock(&proctree_lock);
 			td->td_dbgflags &= ~TDB_STOPATFORK;
 			/* Wake anyone sleeping on p_dbgwait. */
 			cv_broadcast(&p->p_dbgwait);
 		}
 		PROC_UNLOCK(p);
 	}
 
 	userret(td, frame);
 
 #ifdef KTRACE
 	/* Emit a KTR_SYSRET record for SYS_fork when that point is traced. */
 	if (KTRPOINT(td, KTR_SYSRET))
 		ktrsysret(SYS_fork, 0, 0);
 #endif
 }
Index: head/sys/kern/kern_lock.c
===================================================================
--- head/sys/kern/kern_lock.c	(revision 258540)
+++ head/sys/kern/kern_lock.c	(revision 258541)
@@ -1,1518 +1,1517 @@
 /*-
  * Copyright (c) 2008 Attilio Rao <attilio@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified other than the possible
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  */
 
 #include "opt_adaptive_lockmgrs.h"
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/lock_profile.h>
 #include <sys/lockmgr.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sleepqueue.h>
 #ifdef DEBUG_LOCKS
 #include <sys/stack.h>
 #endif
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <machine/cpu.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DECLARE( , , lock, failed);
 #endif
 
 /*
  * lockinit() stores LK_ADAPTIVE and LK_NOSHARE directly in the lock
  * object's lo_flags, so both must fit inside LO_CLASSFLAGS.
  */
 CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) ==
     (LK_ADAPTIVE | LK_NOSHARE));
 /* The unlocked value must not carry any waiters or spinners bit. */
 CTASSERT(LK_UNLOCKED == (LK_UNLOCKED &
     ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS)));
 
 /* Sleepqueue sub-queue indices used for exclusive and shared waiters. */
 #define	SQ_EXCLUSIVE_QUEUE	0
 #define	SQ_SHARED_QUEUE		1
 
 /*
  * Without INVARIANTS the assertion helper and the per-thread td_locks
  * accounting compile to nothing.  The td_lk_slocks counter is always
  * maintained because LK_CAN_SHARE() reads it.
  */
 #ifndef INVARIANTS
 #define	_lockmgr_assert(lk, what, file, line)
 #define	TD_LOCKS_INC(td)
 #define	TD_LOCKS_DEC(td)
 #else
 #define	TD_LOCKS_INC(td)	((td)->td_locks++)
 #define	TD_LOCKS_DEC(td)	((td)->td_locks--)
 #endif
 #define	TD_SLOCKS_INC(td)	((td)->td_lk_slocks++)
 #define	TD_SLOCKS_DEC(td)	((td)->td_lk_slocks--)
 
 /*
  * Stack trace bookkeeping on the lock's lk_stack member; only present
  * when DEBUG_LOCKS is defined, otherwise the macros expand to nothing.
  */
 #ifndef DEBUG_LOCKS
 #define	STACK_PRINT(lk)
 #define	STACK_SAVE(lk)
 #define	STACK_ZERO(lk)
 #else
 #define	STACK_PRINT(lk)	stack_print_ddb(&(lk)->lk_stack)
 #define	STACK_SAVE(lk)	stack_save(&(lk)->lk_stack)
 #define	STACK_ZERO(lk)	stack_zero(&(lk)->lk_stack)
 #endif
 
 /*
  * KTR logging helpers: the record is emitted only when LOCK_LOG_TEST()
  * accepts the lock object.  Both expand to a bare `if' statement, so a
  * call site must not be directly followed by an `else'.
  */
 #define	LOCK_LOG2(lk, string, arg1, arg2)				\
 	if (LOCK_LOG_TEST(&(lk)->lock_object, 0))			\
 		CTR2(KTR_LOCK, (string), (arg1), (arg2))
 #define	LOCK_LOG3(lk, string, arg1, arg2, arg3)				\
 	if (LOCK_LOG_TEST(&(lk)->lock_object, 0))			\
 		CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3))
 
 /*
  * Helpers for dropping Giant around a sleep or spin.
  * GIANT_DECLARE declares the local counter `_i' and the WITNESS save
  * slot the other two macros rely on.
  * GIANT_SAVE() unlocks Giant until it is no longer owned, counting the
  * unlocks in `_i'.
  * GIANT_RESTORE() locks Giant `_i' times again and restores the WITNESS
  * state; it does nothing if GIANT_SAVE() released nothing.
  */
 #define	GIANT_DECLARE							\
 	int _i = 0;							\
 	WITNESS_SAVE_DECL(Giant)
 #define	GIANT_RESTORE() do {						\
 	if (_i > 0) {							\
 		while (_i--)						\
 			mtx_lock(&Giant);				\
 		WITNESS_RESTORE(&Giant.lock_object, Giant);		\
 	}								\
 } while (0)
 #define	GIANT_SAVE() do {						\
 	if (mtx_owned(&Giant)) {					\
 		WITNESS_SAVE(&Giant.lock_object, Giant);		\
 		while (mtx_owned(&Giant)) {				\
 			_i++;						\
 			mtx_unlock(&Giant);				\
 		}							\
 	}								\
 } while (0)
 
 /*
  * True when the lock word x allows curthread to take a shared hold:
  * LK_SHARE must be set and at least one of the following must hold:
  * no exclusive waiters are flagged, no exclusive spinners are flagged,
  * curthread already has a non-zero td_lk_slocks count, or curthread has
  * TDP_DEADLKTREAT set.
  */
 #define	LK_CAN_SHARE(x)							\
 	(((x) & LK_SHARE) && (((x) & LK_EXCLUSIVE_WAITERS) == 0 ||	\
 	((x) & LK_EXCLUSIVE_SPINNERS) == 0 ||				\
 	curthread->td_lk_slocks || (curthread->td_pflags & TDP_DEADLKTREAT)))
 /* Non-zero when the request must fail instead of sleeping (LK_NOWAIT). */
 #define	LK_TRYOP(x)							\
 	((x) & LK_NOWAIT)
 
 /* WITNESS order checks are done only for blocking, non-NOWITNESS requests. */
 #define	LK_CAN_WITNESS(x)						\
 	(((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x))
 /* Extra WITNESS flag for try operations. */
 #define	LK_TRYWIT(x)							\
 	(LK_TRYOP(x) ? LOP_TRYLOCK : 0)
 
 /*
  * Adaptive spinning is allowed when the lock was initialized with
  * LK_ADAPTIVE and the request does not carry LK_SLEEPFAIL.
  */
 #define	LK_CAN_ADAPT(lk, f)						\
 	(((lk)->lock_object.lo_flags & LK_ADAPTIVE) != 0 &&		\
 	((f) & LK_SLEEPFAIL) == 0)
 
 /*
  * Ownership tests on the lock word with every flag bit except LK_SHARE
  * masked off: the remainder equals LK_KERNPROC for a disowned lock and
  * curthread for a lock held exclusively by the caller.
  */
 #define	lockmgr_disowned(lk)						\
 	(((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC)
 
 #define	lockmgr_xlocked(lk)						\
 	(((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread)
 
 /* Forward declarations of the lock class methods installed below. */
 static void	assert_lockmgr(const struct lock_object *lock, int how);
 #ifdef DDB
 static void	db_show_lockmgr(const struct lock_object *lock);
 #endif
 static void	lock_lockmgr(struct lock_object *lock, uintptr_t how);
 #ifdef KDTRACE_HOOKS
 static int	owner_lockmgr(const struct lock_object *lock,
 		    struct thread **owner);
 #endif
 static uintptr_t unlock_lockmgr(struct lock_object *lock);
 
 /*
  * Lock class descriptor for lockmgr locks; lockinit() passes it to
  * lock_init().  The DDB show method and the owner method are present
  * only when DDB and KDTRACE_HOOKS, respectively, are configured.
  */
 struct lock_class lock_class_lockmgr = {
 	.lc_name = "lockmgr",
 	.lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE,
 	.lc_assert = assert_lockmgr,
 #ifdef DDB
 	.lc_ddb_show = db_show_lockmgr,
 #endif
 	.lc_lock = lock_lockmgr,
 	.lc_unlock = unlock_lockmgr,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_lockmgr,
 #endif
 };
 
 #ifdef ADAPTIVE_LOCKMGRS
 /*
  * Adaptive spinning limits, exported read-write under the debug.lockmgr
  * sysctl node as "retries" and "loops".  In __lockmgr_args() alk_retries
  * bounds how many spin rounds are attempted against shared holders and
  * alk_loops bounds the iterations of each round.
  */
 static u_int alk_retries = 10;
 static u_int alk_loops = 10000;
 static SYSCTL_NODE(_debug, OID_AUTO, lockmgr, CTLFLAG_RD, NULL,
     "lockmgr debugging");
 SYSCTL_UINT(_debug_lockmgr, OID_AUTO, retries, CTLFLAG_RW, &alk_retries, 0, "");
 SYSCTL_UINT(_debug_lockmgr, OID_AUTO, loops, CTLFLAG_RW, &alk_loops, 0, "");
 #endif
 
 /*
  * Return the thread encoded as holder in the lock word of lk, or NULL
  * when the word has LK_SHARE set.  The lock word is sampled once.
  */
 static __inline struct thread *
 lockmgr_xholder(const struct lock *lk)
 {
 	uintptr_t word;
 
 	word = lk->lk_lock;
 	if ((word & LK_SHARE) != 0)
 		return (NULL);
 	return ((struct thread *)LK_HOLDER(word));
 }
 
 /*
  * It assumes sleepq_lock held and returns with this one unheld.
  * It also assumes the generic interlock is sane and previously checked.
  * If LK_INTERLOCK is specified the interlock is not reacquired after the
  * sleep.
  *
  * lk is the lock to sleep on and queue the sleepqueue sub-queue
  * (SQ_EXCLUSIVE_QUEUE or SQ_SHARED_QUEUE).  wmesg is the wait message.
  * pri carries the sleep priority plus an optional PCATCH bit, which
  * selects an interruptible sleep.  timo is used as timeout only when
  * flags has LK_TIMELOCK and timo is non-zero.
  *
  * Returns the error of the sleepq wait primitive (0 for the plain
  * sleepq_wait() case).  A sleep that would otherwise return 0 yields
  * ENOLCK when LK_SLEEPFAIL was requested.
  */
 static __inline int
 sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk,
     const char *wmesg, int pri, int timo, int queue)
 {
 	GIANT_DECLARE;
 	struct lock_class *class;
 	int catch, error;
 
 	class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL;
 	catch = pri & PCATCH;
 	pri &= PRIMASK;
 	error = 0;
 
 	LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk,
 	    (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared");
 
 	if (flags & LK_INTERLOCK)
 		class->lc_unlock(ilk);
 	/* Count exclusive sleepers that asked for LK_SLEEPFAIL. */
 	if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0)
 		lk->lk_exslpfail++;
 	/* Giant is dropped for the duration of the sleep. */
 	GIANT_SAVE();
 	sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ?
 	    SLEEPQ_INTERRUPTIBLE : 0), queue);
 	if ((flags & LK_TIMELOCK) && timo)
 		sleepq_set_timeout(&lk->lock_object, timo);
 
 	/*
 	 * Decisional switch for real sleeping.
 	 */
 	if ((flags & LK_TIMELOCK) && timo && catch)
 		error = sleepq_timedwait_sig(&lk->lock_object, pri);
 	else if ((flags & LK_TIMELOCK) && timo)
 		error = sleepq_timedwait(&lk->lock_object, pri);
 	else if (catch)
 		error = sleepq_wait_sig(&lk->lock_object, pri);
 	else
 		sleepq_wait(&lk->lock_object, pri);
 	GIANT_RESTORE();
 	if ((flags & LK_SLEEPFAIL) && error == 0)
 		error = ENOLCK;
 
 	return (error);
 }
 
 /*
  * Drop one shared hold on lk on behalf of curthread, waking sleepers when
  * the last sharer leaves a lock that has waiters flagged.  file and line
  * identify the caller for WITNESS and lock logging.
  *
  * Returns the ORed results of the sleepq_broadcast() calls performed
  * (0 if none was needed); the caller accumulates it as wakeup_swapper.
  */
 static __inline int
 wakeupshlk(struct lock *lk, const char *file, int line)
 {
 	uintptr_t v, x;
 	u_int realexslp;
 	int queue, wakeup_swapper;
 
 	WITNESS_UNLOCK(&lk->lock_object, 0, file, line);
 	LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line);
 
 	wakeup_swapper = 0;
 	/* Retry until one of the compare-and-set attempts succeeds. */
 	for (;;) {
 		x = lk->lk_lock;
 
 		/*
 		 * If there is more than one shared lock held, just drop one
 		 * and return.
 		 */
 		if (LK_SHARERS(x) > 1) {
 			if (atomic_cmpset_rel_ptr(&lk->lk_lock, x,
 			    x - LK_ONE_SHARER))
 				break;
 			continue;
 		}
 
 		/*
 		 * If there are not waiters on the exclusive queue, drop the
 		 * lock quickly.
 		 */
 		if ((x & LK_ALL_WAITERS) == 0) {
 			MPASS((x & ~LK_EXCLUSIVE_SPINNERS) ==
 			    LK_SHARERS_LOCK(1));
 			if (atomic_cmpset_rel_ptr(&lk->lk_lock, x, LK_UNLOCKED))
 				break;
 			continue;
 		}
 
 		/*
 		 * We should have a sharer with waiters, so enter the hard
 		 * path in order to handle wakeups correctly.
 		 */
 		sleepq_lock(&lk->lock_object);
 		x = lk->lk_lock & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
 		v = LK_UNLOCKED;
 
 		/*
 		 * If the lock has exclusive waiters, give them preference in
 		 * order to avoid deadlock with shared runners up.
 		 * If interruptible sleeps left the exclusive queue empty
 		 * avoid a starvation for the threads sleeping on the shared
 		 * queue by giving them precedence and cleaning up the
 		 * exclusive waiters bit anyway.
 		 * Please note that lk_exslpfail count may be lying about
 		 * the real number of waiters with the LK_SLEEPFAIL flag on
 		 * because they may be used in conjunction with interruptible
 		 * sleeps so lk_exslpfail might be considered an 'upper limit'
 		 * bound, including the edge cases.
 		 */
 		realexslp = sleepq_sleepcnt(&lk->lock_object,
 		    SQ_EXCLUSIVE_QUEUE);
 		if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
 			if (lk->lk_exslpfail < realexslp) {
 				/*
 				 * Wake the exclusive queue and keep the
 				 * shared waiters bit in the new lock word.
 				 */
 				lk->lk_exslpfail = 0;
 				queue = SQ_EXCLUSIVE_QUEUE;
 				v |= (x & LK_SHARED_WAITERS);
 			} else {
 				/*
 				 * The exclusive queue is woken here and the
 				 * shared queue is woken after the lock word
 				 * has been updated below.
 				 */
 				lk->lk_exslpfail = 0;
 				LOCK_LOG2(lk,
 				    "%s: %p has only LK_SLEEPFAIL sleepers",
 				    __func__, lk);
 				LOCK_LOG2(lk,
 			    "%s: %p waking up threads on the exclusive queue",
 				    __func__, lk);
 				wakeup_swapper =
 				    sleepq_broadcast(&lk->lock_object,
 				    SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
 				queue = SQ_SHARED_QUEUE;
 			}
 				
 		} else {
 
 			/*
 			 * Exclusive waiters sleeping with LK_SLEEPFAIL on
 			 * and using interruptible sleeps/timeout may have
 			 * left spurious lk_exslpfail counts on, so clean
 			 * it up anyway.
 			 */
 			lk->lk_exslpfail = 0;
 			queue = SQ_SHARED_QUEUE;
 		}
 
 		/*
 		 * Publish the new lock word; if the word changed meanwhile,
 		 * drop the sleepqueue lock and start over.
 		 */
 		if (!atomic_cmpset_rel_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x,
 		    v)) {
 			sleepq_release(&lk->lock_object);
 			continue;
 		}
 		LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue",
 		    __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
 		    "exclusive");
 		wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK,
 		    0, queue);
 		sleepq_release(&lk->lock_object);
 		break;
 	}
 
 	lock_profile_release_lock(&lk->lock_object);
 	TD_LOCKS_DEC(curthread);
 	TD_SLOCKS_DEC(curthread);
 	return (wakeup_swapper);
 }
 
 /*
  * lc_assert method of the lockmgr lock class.  Generic lock assertions
  * are not supported for lockmgr locks, so this always panics.
  */
 static void
 assert_lockmgr(const struct lock_object *lock, int what)
 {
 
 	panic("lockmgr locks do not support assertions");
 }
 
 /*
  * lc_lock method of the lockmgr lock class.  A lockmgr lock cannot be
  * used as a sleep interlock, so this always panics.
  */
 static void
 lock_lockmgr(struct lock_object *lock, uintptr_t how)
 {
 
 	panic("lockmgr locks do not support sleep interlocking");
 }
 
 /*
  * lc_unlock method of the lockmgr lock class.  A lockmgr lock cannot be
  * used as a sleep interlock, so this always panics.
  */
 static uintptr_t
 unlock_lockmgr(struct lock_object *lock)
 {
 
 	panic("lockmgr locks do not support sleep interlocking");
 }
 
 #ifdef KDTRACE_HOOKS
 /*
  * lc_owner method of the lockmgr lock class, built only with
  * KDTRACE_HOOKS.  Owner inquiry is not supported; this always panics.
  */
 static int
 owner_lockmgr(const struct lock_object *lock, struct thread **owner)
 {
 
 	panic("lockmgr locks do not support owner inquiring");
 }
 #endif
 
 /*
  * Initialize the lockmgr lock lk.  pri and timo are recorded as the
  * lock's default sleep priority and timeout, wmesg is the name handed
  * to lock_init(), and flags is a mask of LK_INIT_MASK bits that gets
  * translated into the corresponding lock_object flags.
  */
 void
 lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags)
 {
 	int objflags;
 
 	MPASS((flags & ~LK_INIT_MASK) == 0);
 	ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock,
 	    ("%s: lockmgr not aligned for %s: %p", __func__, wmesg,
 	    &lk->lk_lock));
 
 	/* Every lockmgr lock is sleepable and upgradable. */
 	objflags = LO_SLEEPABLE | LO_UPGRADABLE;
 	objflags |= (flags & LK_CANRECURSE) != 0 ? LO_RECURSABLE : 0;
 	objflags |= (flags & LK_NODUP) != 0 ? 0 : LO_DUPOK;
 	objflags |= (flags & LK_NOPROFILE) != 0 ? LO_NOPROFILE : 0;
 	objflags |= (flags & LK_NOWITNESS) != 0 ? 0 : LO_WITNESS;
 	objflags |= (flags & LK_QUIET) != 0 ? LO_QUIET : 0;
 	objflags |= (flags & LK_IS_VNODE) != 0 ? LO_IS_VNODE : 0;
 	/* LK_ADAPTIVE and LK_NOSHARE are carried over unchanged. */
 	objflags |= flags & (LK_ADAPTIVE | LK_NOSHARE);
 
 	lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL,
 	    objflags);
 
 	/* Start out unlocked, unrecursed and with no sleepfail waiters. */
 	lk->lk_pri = pri;
 	lk->lk_timo = timo;
 	lk->lk_exslpfail = 0;
 	lk->lk_recurse = 0;
 	lk->lk_lock = LK_UNLOCKED;
 	STACK_ZERO(lk);
 }
 
 /*
  * XXX: Gross hacks to manipulate external lock flags after
  * initialization.  Used for certain vnode and buf locks.
  */
 /*
  * Clear LK_NOSHARE on lk.  While that flag is set __lockmgr_args()
  * turns LK_SHARED requests into LK_EXCLUSIVE ones.  The lock is
  * asserted to be exclusively held by the caller.
  */
 void
 lockallowshare(struct lock *lk)
 {
 
 	lockmgr_assert(lk, KA_XLOCKED);
 	lk->lock_object.lo_flags &= ~LK_NOSHARE;
 }
 
 /*
  * Set LO_RECURSABLE on lk after initialization, permitting exclusive
  * recursion.  The lock is asserted to be exclusively held by the caller.
  */
 void
 lockallowrecurse(struct lock *lk)
 {
 
 	lockmgr_assert(lk, KA_XLOCKED);
 	lk->lock_object.lo_flags |= LO_RECURSABLE;
 }
 
 /*
  * Clear LO_RECURSABLE on lk after initialization.  The lock is asserted
  * to be exclusively held by the caller.
  */
 void
 lockdisablerecurse(struct lock *lk)
 {
 
 	lockmgr_assert(lk, KA_XLOCKED);
 	lk->lock_object.lo_flags &= ~LO_RECURSABLE;
 }
 
 /*
  * Tear down the lockmgr lock lk.  The lock is asserted to be unlocked,
  * not recursed and free of outstanding lk_exslpfail counts before its
  * lock object is handed to lock_destroy().
  */
 void
 lockdestroy(struct lock *lk)
 {
 
 	KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held"));
 	KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed"));
 	KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters"));
 	lock_destroy(&lk->lock_object);
 }
 
 int
 __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk,
     const char *wmesg, int pri, int timo, const char *file, int line)
 {
 	GIANT_DECLARE;
 	struct lock_class *class;
 	const char *iwmesg;
 	uintptr_t tid, v, x;
 	u_int op, realexslp;
 	int error, ipri, itimo, queue, wakeup_swapper;
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 #ifdef ADAPTIVE_LOCKMGRS
 	volatile struct thread *owner;
 	u_int i, spintries = 0;
 #endif
 
 	error = 0;
 	tid = (uintptr_t)curthread;
 	op = (flags & LK_TYPE_MASK);
 	iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg;
 	ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri;
 	itimo = (timo == LK_TIMO_DEFAULT) ? lk->lk_timo : timo;
 
 	MPASS((flags & ~LK_TOTAL_MASK) == 0);
 	KASSERT((op & (op - 1)) == 0,
 	    ("%s: Invalid requested operation @ %s:%d", __func__, file, line));
 	KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 ||
 	    (op != LK_DOWNGRADE && op != LK_RELEASE),
 	    ("%s: Invalid flags in regard of the operation desired @ %s:%d",
 	    __func__, file, line));
 	KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL,
 	    ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d",
 	    __func__, file, line));
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread,
 	    lk->lock_object.lo_name, file, line));
 
 	class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL;
 	if (panicstr != NULL) {
 		if (flags & LK_INTERLOCK)
 			class->lc_unlock(ilk);
 		return (0);
 	}
 
 	if (lk->lock_object.lo_flags & LK_NOSHARE) {
 		switch (op) {
 		case LK_SHARED:
 			op = LK_EXCLUSIVE;
 			break;
 		case LK_UPGRADE:
 		case LK_TRYUPGRADE:
 		case LK_DOWNGRADE:
 			_lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED,
 			    file, line);
 			if (flags & LK_INTERLOCK)
 				class->lc_unlock(ilk);
 			return (0);
 		}
 	}
 
 	wakeup_swapper = 0;
 	switch (op) {
 	case LK_SHARED:
 		if (LK_CAN_WITNESS(flags))
 			WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER,
 			    file, line, flags & LK_INTERLOCK ? ilk : NULL);
 		for (;;) {
 			x = lk->lk_lock;
 
 			/*
 			 * If no other thread has an exclusive lock, or
 			 * no exclusive waiter is present, bump the count of
 			 * sharers.  Since we have to preserve the state of
 			 * waiters, if we fail to acquire the shared lock
 			 * loop back and retry.
 			 */
 			if (LK_CAN_SHARE(x)) {
 				if (atomic_cmpset_acq_ptr(&lk->lk_lock, x,
 				    x + LK_ONE_SHARER))
 					break;
 				continue;
 			}
 #ifdef HWPMC_HOOKS
 			PMC_SOFT_CALL( , , lock, failed);
 #endif
 			lock_profile_obtain_lock_failed(&lk->lock_object,
 			    &contested, &waittime);
 
 			/*
 			 * If the lock is already held by curthread in
 			 * exclusive way avoid a deadlock.
 			 */
 			if (LK_HOLDER(x) == tid) {
 				LOCK_LOG2(lk,
 				    "%s: %p already held in exclusive mode",
 				    __func__, lk);
 				error = EDEADLK;
 				break;
 			}
 
 			/*
 			 * If the lock is expected to not sleep just give up
 			 * and return.
 			 */
 			if (LK_TRYOP(flags)) {
 				LOCK_LOG2(lk, "%s: %p fails the try operation",
 				    __func__, lk);
 				error = EBUSY;
 				break;
 			}
 
 #ifdef ADAPTIVE_LOCKMGRS
 			/*
 			 * If the owner is running on another CPU, spin until
 			 * the owner stops running or the state of the lock
 			 * changes.  We need a double-state handle here
 			 * because for a failed acquisition the lock can be
 			 * either held in exclusive mode or shared mode
 			 * (for the writer starvation avoidance technique).
 			 */
 			if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
 			    LK_HOLDER(x) != LK_KERNPROC) {
 				owner = (struct thread *)LK_HOLDER(x);
 				if (LOCK_LOG_TEST(&lk->lock_object, 0))
 					CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 					    __func__, lk, owner);
 
 				/*
 				 * If we are holding also an interlock drop it
 				 * in order to avoid a deadlock if the lockmgr
 				 * owner is adaptively spinning on the
 				 * interlock itself.
 				 */
 				if (flags & LK_INTERLOCK) {
 					class->lc_unlock(ilk);
 					flags &= ~LK_INTERLOCK;
 				}
 				GIANT_SAVE();
 				while (LK_HOLDER(lk->lk_lock) ==
 				    (uintptr_t)owner && TD_IS_RUNNING(owner))
 					cpu_spinwait();
 				GIANT_RESTORE();
 				continue;
 			} else if (LK_CAN_ADAPT(lk, flags) &&
 			    (x & LK_SHARE) != 0 && LK_SHARERS(x) &&
 			    spintries < alk_retries) {
 				if (flags & LK_INTERLOCK) {
 					class->lc_unlock(ilk);
 					flags &= ~LK_INTERLOCK;
 				}
 				GIANT_SAVE();
 				spintries++;
 				for (i = 0; i < alk_loops; i++) {
 					if (LOCK_LOG_TEST(&lk->lock_object, 0))
 						CTR4(KTR_LOCK,
 				    "%s: shared spinning on %p with %u and %u",
 						    __func__, lk, spintries, i);
 					x = lk->lk_lock;
 					if ((x & LK_SHARE) == 0 ||
 					    LK_CAN_SHARE(x) != 0)
 						break;
 					cpu_spinwait();
 				}
 				GIANT_RESTORE();
 				if (i != alk_loops)
 					continue;
 			}
 #endif
 
 			/*
 			 * Acquire the sleepqueue chain lock because we
 			 * probably will need to manipulate waiters flags.
 			 */
 			sleepq_lock(&lk->lock_object);
 			x = lk->lk_lock;
 
 			/*
 			 * if the lock can be acquired in shared mode, try
 			 * again.
 			 */
 			if (LK_CAN_SHARE(x)) {
 				sleepq_release(&lk->lock_object);
 				continue;
 			}
 
 #ifdef ADAPTIVE_LOCKMGRS
 			/*
 			 * The current lock owner might have started executing
 			 * on another CPU (or the lock could have changed
 			 * owner) while we were waiting on the sleepqueue
 			 * chain lock.  If so, drop the sleepqueue lock and try
 			 * again.
 			 */
 			if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
 			    LK_HOLDER(x) != LK_KERNPROC) {
 				owner = (struct thread *)LK_HOLDER(x);
 				if (TD_IS_RUNNING(owner)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 			}
 #endif
 
 			/*
 			 * Try to set the LK_SHARED_WAITERS flag.  If we fail,
 			 * loop back and retry.
 			 */
 			if ((x & LK_SHARED_WAITERS) == 0) {
 				if (!atomic_cmpset_acq_ptr(&lk->lk_lock, x,
 				    x | LK_SHARED_WAITERS)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 				LOCK_LOG2(lk, "%s: %p set shared waiters flag",
 				    __func__, lk);
 			}
 
 			/*
 			 * As far as we have been unable to acquire the
 			 * shared lock and the shared waiters flag is set,
 			 * we will sleep.
 			 */
 			error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo,
 			    SQ_SHARED_QUEUE);
 			flags &= ~LK_INTERLOCK;
 			if (error) {
 				LOCK_LOG3(lk,
 				    "%s: interrupted sleep for %p with %d",
 				    __func__, lk, error);
 				break;
 			}
 			LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
 			    __func__, lk);
 		}
 		if (error == 0) {
 			lock_profile_obtain_lock_success(&lk->lock_object,
 			    contested, waittime, file, line);
 			LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file,
 			    line);
 			WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file,
 			    line);
 			TD_LOCKS_INC(curthread);
 			TD_SLOCKS_INC(curthread);
 			STACK_SAVE(lk);
 		}
 		break;
 	case LK_UPGRADE:
 	case LK_TRYUPGRADE:
 		_lockmgr_assert(lk, KA_SLOCKED, file, line);
 		v = lk->lk_lock;
 		x = v & LK_ALL_WAITERS;
 		v &= LK_EXCLUSIVE_SPINNERS;
 
 		/*
 		 * Try to switch from one shared lock to an exclusive one.
 		 * We need to preserve waiters flags during the operation.
 		 */
 		if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v,
 		    tid | x)) {
 			LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file,
 			    line);
 			WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE |
 			    LK_TRYWIT(flags), file, line);
 			TD_SLOCKS_DEC(curthread);
 			break;
 		}
 
 		/*
 		 * In LK_TRYUPGRADE mode, do not drop the lock,
 		 * returning EBUSY instead.
 		 */
 		if (op == LK_TRYUPGRADE) {
 			LOCK_LOG2(lk, "%s: %p failed the nowait upgrade",
 			    __func__, lk);
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * We have been unable to succeed in upgrading, so just
 		 * give up the shared lock.
 		 */
 		wakeup_swapper |= wakeupshlk(lk, file, line);
 
 		/* FALLTHROUGH */
 	case LK_EXCLUSIVE:
 		if (LK_CAN_WITNESS(flags))
 			WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER |
 			    LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ?
 			    ilk : NULL);
 
 		/*
 		 * If curthread already holds the lock and this one is
 		 * allowed to recurse, simply recurse on it.
 		 */
 		if (lockmgr_xlocked(lk)) {
 			if ((flags & LK_CANRECURSE) == 0 &&
 			    (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) {
 
 				/*
 				 * If the lock is expected to not panic just
 				 * give up and return.
 				 */
 				if (LK_TRYOP(flags)) {
 					LOCK_LOG2(lk,
 					    "%s: %p fails the try operation",
 					    __func__, lk);
 					error = EBUSY;
 					break;
 				}
 				if (flags & LK_INTERLOCK)
 					class->lc_unlock(ilk);
 		panic("%s: recursing on non recursive lockmgr %s @ %s:%d\n",
 				    __func__, iwmesg, file, line);
 			}
 			lk->lk_recurse++;
 			LOCK_LOG2(lk, "%s: %p recursing", __func__, lk);
 			LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0,
 			    lk->lk_recurse, file, line);
 			WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
 			    LK_TRYWIT(flags), file, line);
 			TD_LOCKS_INC(curthread);
 			break;
 		}
 
 		while (!atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED,
 		    tid)) {
 #ifdef HWPMC_HOOKS
 			PMC_SOFT_CALL( , , lock, failed);
 #endif
 			lock_profile_obtain_lock_failed(&lk->lock_object,
 			    &contested, &waittime);
 
 			/*
 			 * If the lock is expected to not sleep just give up
 			 * and return.
 			 */
 			if (LK_TRYOP(flags)) {
 				LOCK_LOG2(lk, "%s: %p fails the try operation",
 				    __func__, lk);
 				error = EBUSY;
 				break;
 			}
 
 #ifdef ADAPTIVE_LOCKMGRS
 			/*
 			 * If the owner is running on another CPU, spin until
 			 * the owner stops running or the state of the lock
 			 * changes.
 			 */
 			x = lk->lk_lock;
 			if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
 			    LK_HOLDER(x) != LK_KERNPROC) {
 				owner = (struct thread *)LK_HOLDER(x);
 				if (LOCK_LOG_TEST(&lk->lock_object, 0))
 					CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 					    __func__, lk, owner);
 
 				/*
 				 * If we are holding also an interlock drop it
 				 * in order to avoid a deadlock if the lockmgr
 				 * owner is adaptively spinning on the
 				 * interlock itself.
 				 */
 				if (flags & LK_INTERLOCK) {
 					class->lc_unlock(ilk);
 					flags &= ~LK_INTERLOCK;
 				}
 				GIANT_SAVE();
 				while (LK_HOLDER(lk->lk_lock) ==
 				    (uintptr_t)owner && TD_IS_RUNNING(owner))
 					cpu_spinwait();
 				GIANT_RESTORE();
 				continue;
 			} else if (LK_CAN_ADAPT(lk, flags) &&
 			    (x & LK_SHARE) != 0 && LK_SHARERS(x) &&
 			    spintries < alk_retries) {
 				if ((x & LK_EXCLUSIVE_SPINNERS) == 0 &&
 				    !atomic_cmpset_ptr(&lk->lk_lock, x,
 				    x | LK_EXCLUSIVE_SPINNERS))
 					continue;
 				if (flags & LK_INTERLOCK) {
 					class->lc_unlock(ilk);
 					flags &= ~LK_INTERLOCK;
 				}
 				GIANT_SAVE();
 				spintries++;
 				for (i = 0; i < alk_loops; i++) {
 					if (LOCK_LOG_TEST(&lk->lock_object, 0))
 						CTR4(KTR_LOCK,
 				    "%s: shared spinning on %p with %u and %u",
 						    __func__, lk, spintries, i);
 					if ((lk->lk_lock &
 					    LK_EXCLUSIVE_SPINNERS) == 0)
 						break;
 					cpu_spinwait();
 				}
 				GIANT_RESTORE();
 				if (i != alk_loops)
 					continue;
 			}
 #endif
 
 			/*
 			 * Acquire the sleepqueue chain lock because we
 			 * probably will need to manipulate waiters flags.
 			 */
 			sleepq_lock(&lk->lock_object);
 			x = lk->lk_lock;
 
 			/*
 			 * if the lock has been released while we spun on
 			 * the sleepqueue chain lock just try again.
 			 */
 			if (x == LK_UNLOCKED) {
 				sleepq_release(&lk->lock_object);
 				continue;
 			}
 
 #ifdef ADAPTIVE_LOCKMGRS
 			/*
 			 * The current lock owner might have started executing
 			 * on another CPU (or the lock could have changed
 			 * owner) while we were waiting on the sleepqueue
 			 * chain lock.  If so, drop the sleepqueue lock and try
 			 * again.
 			 */
 			if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
 			    LK_HOLDER(x) != LK_KERNPROC) {
 				owner = (struct thread *)LK_HOLDER(x);
 				if (TD_IS_RUNNING(owner)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 			}
 #endif
 
 			/*
 			 * The lock can be in the state where there is a
 			 * pending queue of waiters, but still no owner.
 			 * This happens when the lock is contested and an
 			 * owner is going to claim the lock.
 			 * If curthread is the one successfully acquiring it
 			 * claim lock ownership and return, preserving waiters
 			 * flags.
 			 */
 			v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
 			if ((x & ~v) == LK_UNLOCKED) {
 				v &= ~LK_EXCLUSIVE_SPINNERS;
 				if (atomic_cmpset_acq_ptr(&lk->lk_lock, x,
 				    tid | v)) {
 					sleepq_release(&lk->lock_object);
 					LOCK_LOG2(lk,
 					    "%s: %p claimed by a new writer",
 					    __func__, lk);
 					break;
 				}
 				sleepq_release(&lk->lock_object);
 				continue;
 			}
 
 			/*
 			 * Try to set the LK_EXCLUSIVE_WAITERS flag.  If we
 			 * fail, loop back and retry.
 			 */
 			if ((x & LK_EXCLUSIVE_WAITERS) == 0) {
 				if (!atomic_cmpset_ptr(&lk->lk_lock, x,
 				    x | LK_EXCLUSIVE_WAITERS)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 				LOCK_LOG2(lk, "%s: %p set excl waiters flag",
 				    __func__, lk);
 			}
 
 			/*
 			 * As far as we have been unable to acquire the
 			 * exclusive lock and the exclusive waiters flag
 			 * is set, we will sleep.
 			 */
 			error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo,
 			    SQ_EXCLUSIVE_QUEUE);
 			flags &= ~LK_INTERLOCK;
 			if (error) {
 				LOCK_LOG3(lk,
 				    "%s: interrupted sleep for %p with %d",
 				    __func__, lk, error);
 				break;
 			}
 			LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
 			    __func__, lk);
 		}
 		if (error == 0) {
 			lock_profile_obtain_lock_success(&lk->lock_object,
 			    contested, waittime, file, line);
 			LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0,
 			    lk->lk_recurse, file, line);
 			WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
 			    LK_TRYWIT(flags), file, line);
 			TD_LOCKS_INC(curthread);
 			STACK_SAVE(lk);
 		}
 		break;
 	case LK_DOWNGRADE:
 		_lockmgr_assert(lk, KA_XLOCKED, file, line);
 		LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line);
 		WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line);
 
 		/*
 		 * Panic if the lock is recursed.
 		 */
 		if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) {
 			if (flags & LK_INTERLOCK)
 				class->lc_unlock(ilk);
 			panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n",
 			    __func__, iwmesg, file, line);
 		}
 		TD_SLOCKS_INC(curthread);
 
 		/*
 		 * In order to preserve waiters flags, just spin.
 		 */
 		for (;;) {
 			x = lk->lk_lock;
 			MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
 			x &= LK_ALL_WAITERS;
 			if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x,
 			    LK_SHARERS_LOCK(1) | x))
 				break;
 			cpu_spinwait();
 		}
 		break;
 	case LK_RELEASE:
 		_lockmgr_assert(lk, KA_LOCKED, file, line);
 		x = lk->lk_lock;
 
 		if ((x & LK_SHARE) == 0) {
 
 			/*
 			 * As first option, treat the lock as if it has not
 			 * any waiter.
 			 * Fix-up the tid var if the lock has been disowned.
 			 */
 			if (LK_HOLDER(x) == LK_KERNPROC)
 				tid = LK_KERNPROC;
 			else {
 				WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE,
 				    file, line);
 				TD_LOCKS_DEC(curthread);
 			}
 			LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0,
 			    lk->lk_recurse, file, line);
 
 			/*
 			 * The lock is held in exclusive mode.
 			 * If the lock is recursed also, then unrecurse it.
 			 */
 			if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) {
 				LOCK_LOG2(lk, "%s: %p unrecursing", __func__,
 				    lk);
 				lk->lk_recurse--;
 				break;
 			}
 			if (tid != LK_KERNPROC)
 				lock_profile_release_lock(&lk->lock_object);
 
 			if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid,
 			    LK_UNLOCKED))
 				break;
 
 			sleepq_lock(&lk->lock_object);
 			x = lk->lk_lock;
 			v = LK_UNLOCKED;
 
 			/*
 		 	 * If the lock has exclusive waiters, give them
 			 * preference in order to avoid deadlock with
 			 * shared runners up.
 			 * If interruptible sleeps left the exclusive queue
 			 * empty avoid a starvation for the threads sleeping
 			 * on the shared queue by giving them precedence
 			 * and cleaning up the exclusive waiters bit anyway.
 			 * Please note that lk_exslpfail count may be lying
 			 * about the real number of waiters with the
 			 * LK_SLEEPFAIL flag on because they may be used in
 			 * conjuction with interruptible sleeps so
 			 * lk_exslpfail might be considered an 'upper limit'
 			 * bound, including the edge cases.
 			 */
 			MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
 			realexslp = sleepq_sleepcnt(&lk->lock_object,
 			    SQ_EXCLUSIVE_QUEUE);
 			if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
 				if (lk->lk_exslpfail < realexslp) {
 					lk->lk_exslpfail = 0;
 					queue = SQ_EXCLUSIVE_QUEUE;
 					v |= (x & LK_SHARED_WAITERS);
 				} else {
 					lk->lk_exslpfail = 0;
 					LOCK_LOG2(lk,
 					"%s: %p has only LK_SLEEPFAIL sleepers",
 					    __func__, lk);
 					LOCK_LOG2(lk,
 			"%s: %p waking up threads on the exclusive queue",
 					    __func__, lk);
 					wakeup_swapper =
 					    sleepq_broadcast(&lk->lock_object,
 					    SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
 					queue = SQ_SHARED_QUEUE;
 				}
 			} else {
 
 				/*
 				 * Exclusive waiters sleeping with LK_SLEEPFAIL
 				 * on and using interruptible sleeps/timeout
 				 * may have left spourious lk_exslpfail counts
 				 * on, so clean it up anyway. 
 				 */
 				lk->lk_exslpfail = 0;
 				queue = SQ_SHARED_QUEUE;
 			}
 
 			LOCK_LOG3(lk,
 			    "%s: %p waking up threads on the %s queue",
 			    __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
 			    "exclusive");
 			atomic_store_rel_ptr(&lk->lk_lock, v);
 			wakeup_swapper |= sleepq_broadcast(&lk->lock_object,
 			    SLEEPQ_LK, 0, queue);
 			sleepq_release(&lk->lock_object);
 			break;
 		} else
 			wakeup_swapper = wakeupshlk(lk, file, line);
 		break;
 	case LK_DRAIN:
 		if (LK_CAN_WITNESS(flags))
 			WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER |
 			    LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ?
 			    ilk : NULL);
 
 		/*
 		 * Trying to drain a lock we already own will result in a
 		 * deadlock.
 		 */
 		if (lockmgr_xlocked(lk)) {
 			if (flags & LK_INTERLOCK)
 				class->lc_unlock(ilk);
 			panic("%s: draining %s with the lock held @ %s:%d\n",
 			    __func__, iwmesg, file, line);
 		}
 
 		while (!atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) {
 #ifdef HWPMC_HOOKS
 			PMC_SOFT_CALL( , , lock, failed);
 #endif
 			lock_profile_obtain_lock_failed(&lk->lock_object,
 			    &contested, &waittime);
 
 			/*
 			 * If the lock is expected to not sleep just give up
 			 * and return.
 			 */
 			if (LK_TRYOP(flags)) {
 				LOCK_LOG2(lk, "%s: %p fails the try operation",
 				    __func__, lk);
 				error = EBUSY;
 				break;
 			}
 
 			/*
 			 * Acquire the sleepqueue chain lock because we
 			 * probably will need to manipulate waiters flags.
 			 */
 			sleepq_lock(&lk->lock_object);
 			x = lk->lk_lock;
 
 			/*
 			 * if the lock has been released while we spun on
 			 * the sleepqueue chain lock just try again.
 			 */
 			if (x == LK_UNLOCKED) {
 				sleepq_release(&lk->lock_object);
 				continue;
 			}
 
 			v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
 			if ((x & ~v) == LK_UNLOCKED) {
 				v = (x & ~LK_EXCLUSIVE_SPINNERS);
 
 				/*
 				 * If interruptible sleeps left the exclusive
 				 * queue empty avoid a starvation for the
 				 * threads sleeping on the shared queue by
 				 * giving them precedence and cleaning up the
 				 * exclusive waiters bit anyway.
 				 * Please note that lk_exslpfail count may be
 				 * lying about the real number of waiters with
 				 * the LK_SLEEPFAIL flag on because they may
 				 * be used in conjuction with interruptible
 				 * sleeps so lk_exslpfail might be considered
 				 * an 'upper limit' bound, including the edge
 				 * cases.
 				 */
 				if (v & LK_EXCLUSIVE_WAITERS) {
 					queue = SQ_EXCLUSIVE_QUEUE;
 					v &= ~LK_EXCLUSIVE_WAITERS;
 				} else {
 
 					/*
 					 * Exclusive waiters sleeping with
 					 * LK_SLEEPFAIL on and using
 					 * interruptible sleeps/timeout may
 					 * have left spourious lk_exslpfail
 					 * counts on, so clean it up anyway.
 					 */
 					MPASS(v & LK_SHARED_WAITERS);
 					lk->lk_exslpfail = 0;
 					queue = SQ_SHARED_QUEUE;
 					v &= ~LK_SHARED_WAITERS;
 				}
 				if (queue == SQ_EXCLUSIVE_QUEUE) {
 					realexslp =
 					    sleepq_sleepcnt(&lk->lock_object,
 					    SQ_EXCLUSIVE_QUEUE);
 					if (lk->lk_exslpfail >= realexslp) {
 						lk->lk_exslpfail = 0;
 						queue = SQ_SHARED_QUEUE;
 						v &= ~LK_SHARED_WAITERS;
 						if (realexslp != 0) {
 							LOCK_LOG2(lk,
 					"%s: %p has only LK_SLEEPFAIL sleepers",
 							    __func__, lk);
 							LOCK_LOG2(lk,
 			"%s: %p waking up threads on the exclusive queue",
 							    __func__, lk);
 							wakeup_swapper =
 							    sleepq_broadcast(
 							    &lk->lock_object,
 							    SLEEPQ_LK, 0,
 							    SQ_EXCLUSIVE_QUEUE);
 						}
 					} else
 						lk->lk_exslpfail = 0;
 				}
 				if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 				LOCK_LOG3(lk,
 				"%s: %p waking up all threads on the %s queue",
 				    __func__, lk, queue == SQ_SHARED_QUEUE ?
 				    "shared" : "exclusive");
 				wakeup_swapper |= sleepq_broadcast(
 				    &lk->lock_object, SLEEPQ_LK, 0, queue);
 
 				/*
 				 * If shared waiters have been woken up we need
 				 * to wait for one of them to acquire the lock
 				 * before to set the exclusive waiters in
 				 * order to avoid a deadlock.
 				 */
 				if (queue == SQ_SHARED_QUEUE) {
 					for (v = lk->lk_lock;
 					    (v & LK_SHARE) && !LK_SHARERS(v);
 					    v = lk->lk_lock)
 						cpu_spinwait();
 				}
 			}
 
 			/*
 			 * Try to set the LK_EXCLUSIVE_WAITERS flag.  If we
 			 * fail, loop back and retry.
 			 */
 			if ((x & LK_EXCLUSIVE_WAITERS) == 0) {
 				if (!atomic_cmpset_ptr(&lk->lk_lock, x,
 				    x | LK_EXCLUSIVE_WAITERS)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 				LOCK_LOG2(lk, "%s: %p set drain waiters flag",
 				    __func__, lk);
 			}
 
 			/*
 			 * As far as we have been unable to acquire the
 			 * exclusive lock and the exclusive waiters flag
 			 * is set, we will sleep.
 			 */
 			if (flags & LK_INTERLOCK) {
 				class->lc_unlock(ilk);
 				flags &= ~LK_INTERLOCK;
 			}
 			GIANT_SAVE();
 			sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK,
 			    SQ_EXCLUSIVE_QUEUE);
 			sleepq_wait(&lk->lock_object, ipri & PRIMASK);
 			GIANT_RESTORE();
 			LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
 			    __func__, lk);
 		}
 
 		if (error == 0) {
 			lock_profile_obtain_lock_success(&lk->lock_object,
 			    contested, waittime, file, line);
 			LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0,
 			    lk->lk_recurse, file, line);
 			WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
 			    LK_TRYWIT(flags), file, line);
 			TD_LOCKS_INC(curthread);
 			STACK_SAVE(lk);
 		}
 		break;
 	default:
 		if (flags & LK_INTERLOCK)
 			class->lc_unlock(ilk);
 		panic("%s: unknown lockmgr request 0x%x\n", __func__, op);
 	}
 
 	if (flags & LK_INTERLOCK)
 		class->lc_unlock(ilk);
 	if (wakeup_swapper)
 		kick_proc0();
 
 	return (error);
 }
 
 /*
  * Hand an exclusively held lockmgr lock over from the current thread to
  * the LK_KERNPROC pseudo-owner, keeping any waiter bits intact.
  *
  * lk:   the lock to disown; asserted to be exclusively locked.
  * file: caller's source file, used for assertions, logging and WITNESS.
  * line: caller's source line, used likewise.
  *
  * Does nothing when the scheduler is stopped or when the holder recorded
  * in the lock word is not the current thread.  Panics if the lock is
  * recursed.
  */
 void
 _lockmgr_disown(struct lock *lk, const char *file, int line)
 {
 	uintptr_t tid, x;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	tid = (uintptr_t)curthread;
 	_lockmgr_assert(lk, KA_XLOCKED, file, line);
 
 	/*
 	 * Panic if the lock is recursed.
 	 */
 	if (lockmgr_xlocked(lk) && lockmgr_recursed(lk))
 		panic("%s: disown a recursed lockmgr @ %s:%d\n",
 		    __func__,  file, line);
 
 	/*
 	 * If the owner is already LK_KERNPROC just skip the whole operation.
 	 */
 	if (LK_HOLDER(lk->lk_lock) != tid)
 		return;
 
 	/*
 	 * From the current thread's point of view the lock is released:
 	 * drop the profiling, WITNESS and per-thread lock accounting now,
 	 * before the owner field is rewritten below.
 	 */
 	lock_profile_release_lock(&lk->lock_object);
 	LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line);
 	WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line);
 	TD_LOCKS_DEC(curthread);
 	STACK_SAVE(lk);
 
 	/*
 	 * In order to preserve waiters flags, just spin.
 	 */
 	for (;;) {
 		/*
 		 * Re-read the lock word on every pass: the compare-and-set
 		 * only succeeds if the word is still "tid | waiter bits" as
 		 * sampled here, so a concurrent change to the waiter bits
 		 * simply causes another iteration.
 		 */
 		x = lk->lk_lock;
 		MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
 		x &= LK_ALL_WAITERS;
 		if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x,
 		    LK_KERNPROC | x))
 			return;
 		cpu_spinwait();
 	}
 }
 
 void
 lockmgr_printinfo(const struct lock *lk)
 {
 	struct thread *td;
 	uintptr_t x;
 
 	if (lk->lk_lock == LK_UNLOCKED)
 		printf("lock type %s: UNLOCKED\n", lk->lock_object.lo_name);
 	else if (lk->lk_lock & LK_SHARE)
 		printf("lock type %s: SHARED (count %ju)\n",
 		    lk->lock_object.lo_name,
 		    (uintmax_t)LK_SHARERS(lk->lk_lock));
 	else {
 		td = lockmgr_xholder(lk);
 		printf("lock type %s: EXCL by thread %p "
 		    "(pid %d, %s, tid %d)\n", lk->lock_object.lo_name, td,
 		    td->td_proc->p_pid, td->td_proc->p_comm, td->td_tid);
 	}
 
 	x = lk->lk_lock;
 	if (x & LK_EXCLUSIVE_WAITERS)
 		printf(" with exclusive waiters pending\n");
 	if (x & LK_SHARED_WAITERS)
 		printf(" with shared waiters pending\n");
 	if (x & LK_EXCLUSIVE_SPINNERS)
 		printf(" with exclusive spinners pending\n");
 
 	STACK_PRINT(lk);
 }
 
 /*
  * Classify the current state of a lockmgr lock from one snapshot of its
  * lock word: 0 when unlocked, LK_SHARED when share-locked, LK_EXCLUSIVE
  * when held exclusively by curthread or by LK_KERNPROC, and LK_EXCLOTHER
  * when held exclusively by anyone else.
  */
 int
 lockstatus(const struct lock *lk)
 {
 	uintptr_t owner, state;
 
 	state = lk->lk_lock;
 
 	/* The share bit is set for both the unlocked and the shared case. */
 	if ((state & LK_SHARE) != 0)
 		return (state == LK_UNLOCKED ? 0 : LK_SHARED);
 
 	owner = LK_HOLDER(state);
 	if (owner == (uintptr_t)curthread || owner == LK_KERNPROC)
 		return (LK_EXCLUSIVE);
 	return (LK_EXCLOTHER);
 }
 
 #ifdef INVARIANT_SUPPORT
 
 FEATURE(invariant_support,
     "Support for modules compiled with INVARIANTS option");
 
 #ifndef INVARIANTS
 #undef	_lockmgr_assert
 #endif
 
 /*
  * Panic unless the lockmgr lock is in the state described by `what`.
  *
  * lk:   the lock to inspect.
  * what: one of the KA_* assertions, optionally OR'ed with KA_RECURSED or
  *       KA_NOTRECURSED for the locked variants; any other value panics.
  * file: caller's source file, included in the panic message.
  * line: caller's source line, included in the panic message.
  *
  * Returns without checking anything when panicstr is set.
  */
 void
 _lockmgr_assert(const struct lock *lk, int what, const char *file, int line)
 {
 	int slocked = 0;
 
 	if (panicstr != NULL)
 		return;
 	switch (what) {
 	case KA_SLOCKED:
 	case KA_SLOCKED | KA_NOTRECURSED:
 	case KA_SLOCKED | KA_RECURSED:
 		slocked = 1;
 		/* FALLTHROUGH: shared assertions reuse the KA_LOCKED checks. */
 	case KA_LOCKED:
 	case KA_LOCKED | KA_NOTRECURSED:
 	case KA_LOCKED | KA_RECURSED:
 #ifdef WITNESS
 
 		/*
 		 * We cannot trust WITNESS if the lock is held in exclusive
 		 * mode and a call to lockmgr_disown() happened.
 		 * Workaround this skipping the check if the lock is held in
 		 * exclusive mode even for the KA_LOCKED case.
 		 */
 		if (slocked || (lk->lk_lock & LK_SHARE)) {
 			witness_assert(&lk->lock_object, what, file, line);
 			break;
 		}
 #endif
 		/*
 		 * Fail if the lock is unlocked, or if it is held exclusively
 		 * and either a shared hold was demanded or the exclusive hold
 		 * is neither xlocked nor disowned.
 		 */
 		if (lk->lk_lock == LK_UNLOCKED ||
 		    ((lk->lk_lock & LK_SHARE) == 0 && (slocked ||
 		    (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)))))
 			panic("Lock %s not %slocked @ %s:%d\n",
 			    lk->lock_object.lo_name, slocked ? "share" : "",
 			    file, line);
 
 		/* Recursion requirements are only checked for exclusive holds. */
 		if ((lk->lk_lock & LK_SHARE) == 0) {
 			if (lockmgr_recursed(lk)) {
 				if (what & KA_NOTRECURSED)
 					panic("Lock %s recursed @ %s:%d\n",
 					    lk->lock_object.lo_name, file,
 					    line);
 			} else if (what & KA_RECURSED)
 				panic("Lock %s not recursed @ %s:%d\n",
 				    lk->lock_object.lo_name, file, line);
 		}
 		break;
 	case KA_XLOCKED:
 	case KA_XLOCKED | KA_NOTRECURSED:
 	case KA_XLOCKED | KA_RECURSED:
 		if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))
 			panic("Lock %s not exclusively locked @ %s:%d\n",
 			    lk->lock_object.lo_name, file, line);
 		if (lockmgr_recursed(lk)) {
 			if (what & KA_NOTRECURSED)
 				panic("Lock %s recursed @ %s:%d\n",
 				    lk->lock_object.lo_name, file, line);
 		} else if (what & KA_RECURSED)
 			panic("Lock %s not recursed @ %s:%d\n",
 			    lk->lock_object.lo_name, file, line);
 		break;
 	case KA_UNLOCKED:
 		/*
 		 * Only xlocked and disowned holds are rejected here; a lock
 		 * that is merely share-locked passes this assertion.
 		 */
 		if (lockmgr_xlocked(lk) || lockmgr_disowned(lk))
 			panic("Lock %s exclusively locked @ %s:%d\n",
 			    lk->lock_object.lo_name, file, line);
 		break;
 	default:
 		panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file,
 		    line);
 	}
 }
 #endif
 
 #ifdef DDB
 /*
  * DDB helper: check whether the thread's wait channel is a lockmgr lock.
  *
  * td:     thread whose td_wchan is inspected.
  * ownerp: on a match, receives the value of lockmgr_xholder() for the lock.
  *
  * Returns 0 without printing anything or touching *ownerp when the wait
  * channel's lock class is not lock_class_lockmgr; otherwise prints the lock
  * name and whether it is held shared or exclusive, and returns 1.
  */
 int
 lockmgr_chain(struct thread *td, struct thread **ownerp)
 {
 	struct lock *lk;
 
 	lk = td->td_wchan;
 
 	if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr)
 		return (0);
 
 	/* Trailing space keeps the lock name apart from the state below. */
 	db_printf("blocked on lockmgr %s ", lk->lock_object.lo_name);
 	if (lk->lk_lock & LK_SHARE)
 		db_printf("SHARED (count %ju)\n",
 		    (uintmax_t)LK_SHARERS(lk->lk_lock));
 	else
 		db_printf("EXCL\n");
 	*ownerp = lockmgr_xholder(lk);
 
 	return (1);
 }
 
 static void
 db_show_lockmgr(const struct lock_object *lock)
 {
 	struct thread *td;
 	const struct lock *lk;
 
 	lk = (const struct lock *)lock;
 
 	db_printf(" state: ");
 	if (lk->lk_lock == LK_UNLOCKED)
 		db_printf("UNLOCKED\n");
 	else if (lk->lk_lock & LK_SHARE)
 		db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock));
 	else {
 		td = lockmgr_xholder(lk);
 		if (td == (struct thread *)LK_KERNPROC)
 			db_printf("XLOCK: LK_KERNPROC\n");
 		else
 			db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
 			    td->td_tid, td->td_proc->p_pid,
 			    td->td_proc->p_comm);
 		if (lockmgr_recursed(lk))
 			db_printf(" recursed: %d\n", lk->lk_recurse);
 	}
 	db_printf(" waiters: ");
 	switch (lk->lk_lock & LK_ALL_WAITERS) {
 	case LK_SHARED_WAITERS:
 		db_printf("shared\n");
 		break;
 	case LK_EXCLUSIVE_WAITERS:
 		db_printf("exclusive\n");
 		break;
 	case LK_ALL_WAITERS:
 		db_printf("shared and exclusive\n");
 		break;
 	default:
 		db_printf("none\n");
 	}
 	db_printf(" spinners: ");
 	if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS)
 		db_printf("exclusive\n");
 	else
 		db_printf("none\n");
 }
 #endif
Index: head/sys/kern/kern_lockstat.c
===================================================================
--- head/sys/kern/kern_lockstat.c	(revision 258540)
+++ head/sys/kern/kern_lockstat.c	(revision 258541)
@@ -1,64 +1,62 @@
 /*-
  * Copyright 2008-2009 Stacey Son <sson@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * Backend for the lock tracing (lockstat) kernel support. This is required 
  * to allow a module to load even though DTrace kernel support may not be 
  * present. 
  *
  */
 
-#include "opt_kdtrace.h"
-
 #ifdef KDTRACE_HOOKS
 
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/lockstat.h>
 
 /*
  * The following must match the type definition of dtrace_probe.  It is  
  * defined this way to avoid having to rely on CDDL code.
  */
 uint32_t lockstat_probemap[LS_NPROBES];
 void (*lockstat_probe_func)(uint32_t, uintptr_t, uintptr_t,
     uintptr_t, uintptr_t, uintptr_t);
 
 
 uint64_t 
 lockstat_nsecs(void)
 {
 	struct bintime bt;
 	uint64_t ns;
 
 	binuptime(&bt);
 	ns = bt.sec * (uint64_t)1000000000;
 	ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32;
 	return (ns);
 }
 
 #endif /* KDTRACE_HOOKS */
Index: head/sys/kern/kern_malloc.c
===================================================================
--- head/sys/kern/kern_malloc.c	(revision 258540)
+++ head/sys/kern/kern_malloc.c	(revision 258541)
@@ -1,1107 +1,1106 @@
 /*-
  * Copyright (c) 1987, 1991, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2005-2009 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_malloc.c	8.3 (Berkeley) 1/4/94
  */
 
 /*
  * Kernel malloc(9) implementation -- general purpose kernel memory allocator
  * based on memory types.  Back end is implemented using the UMA(9) zone
  * allocator.  A set of fixed-size buckets are used for smaller allocations,
  * and a special UMA allocation interface is used for larger allocations.
  * Callers declare memory types, and statistics are maintained independently
  * for each memory type.  Statistics are maintained per-CPU for performance
  * reasons.  See malloc(9) and comments in malloc.h for a detailed
  * description.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
-#include "opt_kdtrace.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/vmmeter.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/vmem.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 #include <vm/uma_dbg.h>
 
 #ifdef DEBUG_MEMGUARD
 #include <vm/memguard.h>
 #endif
 #ifdef DEBUG_REDZONE
 #include <vm/redzone.h>
 #endif
 
 #if defined(INVARIANTS) && defined(__i386__)
 #include <machine/cpu.h>
 #endif
 
 #include <ddb/ddb.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 dtrace_malloc_probe_func_t	dtrace_malloc_probe;
 #endif
 
 /*
  * When realloc() is called, if the new size is sufficiently smaller than
  * the old size, realloc() will allocate a new, smaller block to avoid
  * wasting memory. 'Sufficiently smaller' is defined as: newsize <=
  * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'.
  */
 #ifndef REALLOC_FRACTION
 #define	REALLOC_FRACTION	1	/* new block if <= half the size */
 #endif
 
 /*
  * Centrally define some common malloc types.
  */
 MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches");
 MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
 MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers");
 
 MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
 MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery");
 
 static struct malloc_type *kmemstatistics;
 static int kmemcount;
 
 #define KMEM_ZSHIFT	4
 #define KMEM_ZBASE	16
 #define KMEM_ZMASK	(KMEM_ZBASE - 1)
 
 #define KMEM_ZMAX	PAGE_SIZE
 #define KMEM_ZSIZE	(KMEM_ZMAX >> KMEM_ZSHIFT)
 static uint8_t kmemsize[KMEM_ZSIZE + 1];
 
 #ifndef MALLOC_DEBUG_MAXZONES
 #define	MALLOC_DEBUG_MAXZONES	1
 #endif
 static int numzones = MALLOC_DEBUG_MAXZONES;
 
 /*
  * Small malloc(9) memory allocations are allocated from a set of UMA buckets
  * of various sizes.
  *
  * XXX: The comment here used to read "These won't be powers of two for
  * long."  It's possible that a significant amount of wasted memory could be
  * recovered by tuning the sizes of these buckets.
  */
 struct {
 	int kz_size;		/* bucket size (presumably in bytes) */
 	char *kz_name;		/* the same size spelled as a string */
 	/* One UMA zone per malloc subzone; indexed by mti_zone in malloc(). */
 	uma_zone_t kz_zone[MALLOC_DEBUG_MAXZONES];
 } kmemzones[] = {
 	{16, "16", },
 	{32, "32", },
 	{64, "64", },
 	{128, "128", },
 	{256, "256", },
 	{512, "512", },
 	{1024, "1024", },
 	{2048, "2048", },
 	{4096, "4096", },
 	/* Larger buckets exist only when PAGE_SIZE is big enough for them. */
 #if PAGE_SIZE > 4096
 	{8192, "8192", },
 #if PAGE_SIZE > 8192
 	{16384, "16384", },
 #if PAGE_SIZE > 16384
 	{32768, "32768", },
 #if PAGE_SIZE > 32768
 	{65536, "65536", },
 #if PAGE_SIZE > 65536
 #error	"Unsupported PAGE_SIZE"
 #endif	/* 65536 */
 #endif	/* 32768 */
 #endif	/* 16384 */
 #endif	/* 8192 */
 #endif	/* 4096 */
 	{0, NULL},		/* terminating entry */
 };
 
 /*
  * Zone to allocate malloc type descriptions from.  For ABI reasons, memory
  * types are described by a data structure passed by the declaring code, but
  * the malloc(9) implementation has its own data structure describing the
  * type and statistics.  This permits the malloc(9)-internal data structures
  * to be modified without breaking binary-compiled kernel modules that
  * declare malloc types.
  */
 static uma_zone_t mt_zone;
 
 u_long vm_kmem_size;
 SYSCTL_ULONG(_vm, OID_AUTO, kmem_size, CTLFLAG_RDTUN, &vm_kmem_size, 0,
     "Size of kernel memory");
 
 static u_long vm_kmem_size_min;
 SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_min, CTLFLAG_RDTUN, &vm_kmem_size_min, 0,
     "Minimum size of kernel memory");
 
 static u_long vm_kmem_size_max;
 SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_max, CTLFLAG_RDTUN, &vm_kmem_size_max, 0,
     "Maximum size of kernel memory");
 
 static u_int vm_kmem_size_scale;
 SYSCTL_UINT(_vm, OID_AUTO, kmem_size_scale, CTLFLAG_RDTUN, &vm_kmem_size_scale, 0,
     "Scale factor for kernel memory size");
 
 static int sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, kmem_map_size,
     CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_kmem_map_size, "LU", "Current kmem allocation size");
 
 static int sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, kmem_map_free,
     CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_kmem_map_free, "LU", "Free space in kmem");
 
 /*
  * The malloc_mtx protects the kmemstatistics linked list.
  */
 struct mtx malloc_mtx;
 
 #ifdef MALLOC_PROFILE
 uint64_t krequests[KMEM_ZSIZE + 1];
 
 static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS);
 #endif
 
 static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS);
 
 /*
  * time_uptime of the last malloc(9) failure (induced or real).
  */
 static time_t t_malloc_fail;
 
 #if defined(MALLOC_MAKE_FAILURES) || (MALLOC_DEBUG_MAXZONES > 1)
 static SYSCTL_NODE(_debug, OID_AUTO, malloc, CTLFLAG_RD, 0,
     "Kernel malloc debugging options");
 #endif
 
 /*
  * malloc(9) fault injection -- cause malloc failures every (n) mallocs when
  * the caller specifies M_NOWAIT.  If set to 0, no failures are caused.
  */
 #ifdef MALLOC_MAKE_FAILURES
 static int malloc_failure_rate;
 static int malloc_nowait_count;
 static int malloc_failure_count;
 SYSCTL_INT(_debug_malloc, OID_AUTO, failure_rate, CTLFLAG_RW,
     &malloc_failure_rate, 0, "Every (n) mallocs with M_NOWAIT will fail");
 TUNABLE_INT("debug.malloc.failure_rate", &malloc_failure_rate);
 SYSCTL_INT(_debug_malloc, OID_AUTO, failure_count, CTLFLAG_RD,
     &malloc_failure_count, 0, "Number of imposed M_NOWAIT malloc failures");
 #endif
 
 static int
 sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS)
 {
 	u_long allocated;
 
 	/* Export the kmem arena's VMEM_ALLOC figure as a long. */
 	allocated = vmem_size(kmem_arena, VMEM_ALLOC);
 
 	return (sysctl_handle_long(oidp, &allocated, 0, req));
 }
 
 static int
 sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS)
 {
 	u_long available;
 
 	/* Export the kmem arena's VMEM_FREE figure as a long. */
 	available = vmem_size(kmem_arena, VMEM_FREE);
 
 	return (sysctl_handle_long(oidp, &available, 0, req));
 }
 
 /*
  * malloc(9) uma zone separation -- sub-page buffer overruns in one
  * malloc type will affect only a subset of other malloc types.
  */
 #if MALLOC_DEBUG_MAXZONES > 1
 static void
 tunable_set_numzones(void)
 {
 
 	/* Pick up the loader tunable, if one was set. */
 	TUNABLE_INT_FETCH("debug.malloc.numzones", &numzones);
 
 	/* Clamp the result into the range [1, MALLOC_DEBUG_MAXZONES]. */
 	if (numzones <= 0)
 		numzones = 1;
 	else if (numzones > MALLOC_DEBUG_MAXZONES)
 		numzones = MALLOC_DEBUG_MAXZONES;
 }
 SYSINIT(numzones, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_set_numzones, NULL);
 SYSCTL_INT(_debug_malloc, OID_AUTO, numzones, CTLFLAG_RDTUN,
     &numzones, 0, "Number of malloc uma subzones");
 
 /*
  * Any number that changes regularly is an okay choice for the
  * offset.  Build numbers are pretty good if you have them.
  */
 static u_int zone_offset = __FreeBSD_version;
 TUNABLE_INT("debug.malloc.zone_offset", &zone_offset);
 SYSCTL_UINT(_debug_malloc, OID_AUTO, zone_offset, CTLFLAG_RDTUN,
     &zone_offset, 0, "Separate malloc types by examining the "
     "Nth character in the malloc type short description.");
 
 static u_int
 mtp_get_subzone(const char *desc)
 {
 	size_t len;
 	u_int val;
 
 	if (desc == NULL || (len = strlen(desc)) == 0)
 		return (0);
 	val = desc[zone_offset % len];
 	return (val % numzones);
 }
 #elif MALLOC_DEBUG_MAXZONES == 0
 #error "MALLOC_DEBUG_MAXZONES must be positive."
 #else
 /*
  * Variant compiled when MALLOC_DEBUG_MAXZONES is not greater than 1:
  * the description is ignored and every malloc type maps to subzone 0.
  */
 static inline u_int
 mtp_get_subzone(const char *desc)
 {
 
 	return (0);
 }
 #endif /* MALLOC_DEBUG_MAXZONES > 1 */
 
 /*
  * Return the difference between time_uptime and the time_uptime value
  * recorded at the most recent malloc(9) failure.
  */
 int
 malloc_last_fail(void)
 {
 	int elapsed;
 
 	elapsed = time_uptime - t_malloc_fail;
 	return (elapsed);
 }
 
 /*
  * An allocation has succeeded -- update malloc type statistics for the
  * amount of bucket size.  Occurs within a critical section so that the
  * thread isn't preempted and doesn't migrate while updating per-CPU
  * statistics.
  */
 static void
 malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size,
     int zindx)
 {
 	struct malloc_type_internal *internal;
 	struct malloc_type_stats *cpustats;
 
 	/* Stay on this CPU while its statistics slot is being updated. */
 	critical_enter();
 	internal = mtp->ks_handle;
 	cpustats = &internal->mti_stats[curcpu];
 
 	/* A size of zero leaves the allocation counters untouched. */
 	if (size != 0) {
 		cpustats->mts_numallocs++;
 		cpustats->mts_memalloced += size;
 	}
 
 	/* An index of -1 means there is no bucket bit to record. */
 	if (zindx != -1)
 		cpustats->mts_size |= 1 << zindx;
 
 #ifdef KDTRACE_HOOKS
 	/* Fire the per-type DTrace malloc probe if one is enabled. */
 	if (dtrace_malloc_probe != NULL) {
 		uint32_t id = internal->mti_probes[DTMALLOC_PROBE_MALLOC];
 
 		if (id != 0) {
 			(dtrace_malloc_probe)(id, (uintptr_t) mtp,
 			    (uintptr_t) internal, (uintptr_t) cpustats,
 			    size, zindx);
 		}
 	}
 #endif
 
 	critical_exit();
 }
 
 void
 malloc_type_allocated(struct malloc_type *mtp, unsigned long size)
 {
 
 	/* Nothing to account for a zero-sized allocation. */
 	if (size == 0)
 		return;
 
 	/* -1: the allocation is not tied to a bucket index. */
 	malloc_type_zone_allocated(mtp, size, -1);
 }
 
 /*
  * A free operation has occurred -- update malloc type statistics for the
  * amount of the bucket size.  Occurs within a critical section so that the
  * thread isn't preempted and doesn't migrate while updating per-CPU
  * statistics.
  */
 void
 malloc_type_freed(struct malloc_type *mtp, unsigned long size)
 {
 	struct malloc_type_internal *internal;
 	struct malloc_type_stats *cpustats;
 
 	/* Stay on this CPU while its statistics slot is being updated. */
 	critical_enter();
 	internal = mtp->ks_handle;
 	cpustats = &internal->mti_stats[curcpu];
 	cpustats->mts_numfrees++;
 	cpustats->mts_memfreed += size;
 
 #ifdef KDTRACE_HOOKS
 	/* Fire the per-type DTrace free probe if one is enabled. */
 	if (dtrace_malloc_probe != NULL) {
 		uint32_t id = internal->mti_probes[DTMALLOC_PROBE_FREE];
 
 		if (id != 0) {
 			(dtrace_malloc_probe)(id, (uintptr_t) mtp,
 			    (uintptr_t) internal, (uintptr_t) cpustats,
 			    size, 0);
 		}
 	}
 #endif
 
 	critical_exit();
 }
 
 /*
  *	contigmalloc:
  *
  *	Allocate a block of physically contiguous memory.
  *
  *	If M_NOWAIT is set, this routine will not block and return NULL if
  *	the allocation fails.
  */
 void *
 contigmalloc(unsigned long size, struct malloc_type *type, int flags,
     vm_paddr_t low, vm_paddr_t high, unsigned long alignment,
     vm_paddr_t boundary)
 {
 	void *block;
 
 	block = (void *)kmem_alloc_contig(kernel_arena, size, flags, low, high,
 	    alignment, boundary, VM_MEMATTR_DEFAULT);
 	if (block == NULL)
 		return (NULL);
 
 	/* Charge the page-rounded size to the caller's malloc type. */
 	malloc_type_allocated(type, round_page(size));
 	return (block);
 }
 
 /*
  *	contigfree:
  *
  *	Free a block of memory allocated by contigmalloc.
  *
  *	This routine may not block.
  */
 void
 contigfree(void *addr, unsigned long size, struct malloc_type *type)
 {
 
 	kmem_free(kernel_arena, (vm_offset_t)addr, size);
 	malloc_type_freed(type, round_page(size));
 }
 
 /*
  *	malloc:
  *
  *	Allocate a block of memory.
  *
  *	If M_NOWAIT is set, this routine will not block and return NULL if
  *	the allocation fails.
  */
 void *
 malloc(unsigned long size, struct malloc_type *mtp, int flags)
 {
 	int indx;
 	struct malloc_type_internal *mtip;
 	caddr_t va;
 	uma_zone_t zone;
 #if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE)
 	/* Requested size, saved before any rounding or redzone padding. */
 	unsigned long osize = size;
 #endif
 
 #ifdef INVARIANTS
 	KASSERT(mtp->ks_magic == M_MAGIC, ("malloc: bad malloc type magic"));
 	/*
 	 * Check that exactly one of M_WAITOK or M_NOWAIT is specified.
 	 */
 	indx = flags & (M_WAITOK | M_NOWAIT);
 	if (indx != M_NOWAIT && indx != M_WAITOK) {
 		/*
 		 * Complain at most once (and rate-limited), then force
 		 * M_WAITOK for this call only.
 		 */
 		static	struct timeval lasterr;
 		static	int curerr, once;
 		if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) {
 			printf("Bad malloc flags: %x\n", indx);
 			kdb_backtrace();
 			flags |= M_WAITOK;
 			once++;
 		}
 	}
 #endif
 #ifdef MALLOC_MAKE_FAILURES
 	/* Fault injection: fail every malloc_failure_rate'th M_NOWAIT call. */
 	if ((flags & M_NOWAIT) && (malloc_failure_rate != 0)) {
 		atomic_add_int(&malloc_nowait_count, 1);
 		if ((malloc_nowait_count % malloc_failure_rate) == 0) {
 			atomic_add_int(&malloc_failure_count, 1);
 			t_malloc_fail = time_uptime;
 			return (NULL);
 		}
 	}
 #endif
 	if (flags & M_WAITOK)
 		KASSERT(curthread->td_intr_nesting_level == 0,
 		   ("malloc(M_WAITOK) in interrupt context"));
 
 #ifdef DEBUG_MEMGUARD
 	if (memguard_cmp_mtp(mtp, size)) {
 		va = memguard_alloc(size, flags);
 		if (va != NULL)
 			return (va);
 		/* This is unfortunate but should not be fatal. */
 	}
 #endif
 
 #ifdef DEBUG_REDZONE
 	size = redzone_size_ntor(size);
 #endif
 
 	if (size <= KMEM_ZMAX) {
 		mtip = mtp->ks_handle;
 		/* Round up to the next multiple of KMEM_ZBASE. */
 		if (size & KMEM_ZMASK)
 			size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
 		/* kmemsize[] maps the rounded size to a kmemzones[] index. */
 		indx = kmemsize[size >> KMEM_ZSHIFT];
 		KASSERT(mtip->mti_zone < numzones,
 		    ("mti_zone %u out of range %d",
 		    mtip->mti_zone, numzones));
 		zone = kmemzones[indx].kz_zone[mtip->mti_zone];
 #ifdef MALLOC_PROFILE
 		krequests[size >> KMEM_ZSHIFT]++;
 #endif
 		va = uma_zalloc(zone, flags);
 		/* On success account the zone's item size, not the request. */
 		if (va != NULL)
 			size = zone->uz_size;
 		malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
 	} else {
 		/* Requests above KMEM_ZMAX are served in whole pages. */
 		size = roundup(size, PAGE_SIZE);
 		zone = NULL;
 		va = uma_large_malloc(size, flags);
 		malloc_type_allocated(mtp, va == NULL ? 0 : size);
 	}
 	if (flags & M_WAITOK)
 		KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
 	else if (va == NULL)
 		t_malloc_fail = time_uptime;
 #ifdef DIAGNOSTIC
 	/* Fill memory that was not requested zeroed with a 0x70 pattern. */
 	if (va != NULL && !(flags & M_ZERO)) {
 		memset(va, 0x70, osize);
 	}
 #endif
 #ifdef DEBUG_REDZONE
 	if (va != NULL)
 		va = redzone_setup(va, osize);
 #endif
 	return ((void *) va);
 }
 
 /*
  *	free:
  *
  *	Free a block of memory allocated by malloc.
  *
  *	This routine may not block.
  */
 void
 free(void *addr, struct malloc_type *mtp)
 {
 	uma_slab_t slab;
 	u_long size;
 
 	KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic"));
 
 	/* free(NULL, ...) does nothing */
 	if (addr == NULL)
 		return;
 
 #ifdef DEBUG_MEMGUARD
 	/* MemGuard-owned addresses are released by MemGuard itself. */
 	if (is_memguard_addr(addr)) {
 		memguard_free(addr);
 		return;
 	}
 #endif
 
 #ifdef DEBUG_REDZONE
 	/* Check the block, then translate addr with redzone_addr_ntor(). */
 	redzone_check(addr);
 	addr = redzone_addr_ntor(addr);
 #endif
 
 	/* Look up the slab header using the slab-aligned base of addr. */
 	slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
 
 	if (slab == NULL)
 		panic("free: address %p(%p) has not been allocated.\n",
 		    addr, (void *)((u_long)addr & (~UMA_SLAB_MASK)));
 
 	/*
 	 * Slabs without UMA_SLAB_MALLOC hold zone items: the size comes from
 	 * the keg and the item returns to the keg's first zone.  Slabs with
 	 * the flag are large allocations whose size is kept in the slab.
 	 */
 	if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
 #ifdef INVARIANTS
 		struct malloc_type **mtpp = addr;
 #endif
 		size = slab->us_keg->uk_size;
 #ifdef INVARIANTS
 		/*
 		 * Cache a pointer to the malloc_type that most recently freed
 		 * this memory here.  This way we know who is most likely to
 		 * have stepped on it later.
 		 *
 		 * This code assumes that size is a multiple of 8 bytes for
 		 * 64 bit machines
 		 */
 		mtpp = (struct malloc_type **)
 		    ((unsigned long)mtpp & ~UMA_ALIGN_PTR);
 		/* Advance to the last pointer-sized slot of the item. */
 		mtpp += (size - sizeof(struct malloc_type *)) /
 		    sizeof(struct malloc_type *);
 		*mtpp = mtp;
 #endif
 		uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
 	} else {
 		size = slab->us_size;
 		uma_large_free(slab);
 	}
 	/* Credit the freed bytes to the malloc type's statistics. */
 	malloc_type_freed(mtp, size);
 }
 
 /*
  *	realloc: change the size of a memory block
  */
 void *
 realloc(void *addr, unsigned long size, struct malloc_type *mtp, int flags)
 {
 	uma_slab_t slab;
 	unsigned long alloc;	/* usable size of the existing block */
 	void *newaddr;
 
 	KASSERT(mtp->ks_magic == M_MAGIC,
 	    ("realloc: bad malloc type magic"));
 
 	/* realloc(NULL, ...) is equivalent to malloc(...) */
 	if (addr == NULL)
 		return (malloc(size, mtp, flags));
 
 	/*
 	 * XXX: Should report free of old memory and alloc of new memory to
 	 * per-CPU stats.
 	 */
 
 #ifdef DEBUG_MEMGUARD
 	if (is_memguard_addr(addr))
 		return (memguard_realloc(addr, size, mtp, flags));
 #endif
 
 #ifdef DEBUG_REDZONE
 	/*
 	 * With DEBUG_REDZONE the old size comes from the redzone metadata
 	 * and the in-place reuse below is compiled out, so a new block is
 	 * always allocated.
 	 */
 	slab = NULL;
 	alloc = redzone_get_size(addr);
 #else
 	slab = vtoslab((vm_offset_t)addr & ~(UMA_SLAB_MASK));
 
 	/* Sanity check */
 	KASSERT(slab != NULL,
 	    ("realloc: address %p out of range", (void *)addr));
 
 	/* Get the size of the original block */
 	if (!(slab->us_flags & UMA_SLAB_MALLOC))
 		alloc = slab->us_keg->uk_size;
 	else
 		alloc = slab->us_size;
 
 	/*
 	 * Reuse the original block if appropriate: the new size must fit,
 	 * and must either exceed alloc >> REALLOC_FRACTION or the block
 	 * must already be of size MINALLOCSIZE.
 	 */
 	if (size <= alloc
 	    && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE))
 		return (addr);
 #endif /* !DEBUG_REDZONE */
 
 	/*
 	 * Allocate a new, bigger (or smaller) block.  On failure the
 	 * original block is left allocated and untouched.
 	 */
 	if ((newaddr = malloc(size, mtp, flags)) == NULL)
 		return (NULL);
 
 	/* Copy over original contents */
 	bcopy(addr, newaddr, min(size, alloc));
 	free(addr, mtp);
 	return (newaddr);
 }
 
 /*
  *	reallocf: same as realloc() but free memory on failure.
  */
 void *
 reallocf(void *addr, unsigned long size, struct malloc_type *mtp, int flags)
 {
 	void *resized;
 
 	resized = realloc(addr, size, mtp, flags);
 	if (resized != NULL)
 		return (resized);
 
 	/* The resize failed: release the caller's original block too. */
 	free(addr, mtp);
 	return (NULL);
 }
 
 /*
  * Wake the page daemon when we exhaust KVA.  It will call the lowmem handler
  * and uma_reclaim() callbacks in a context that is safe.
  */
 static void
 kmem_reclaim(vmem_t *vm, int flags)
 {
 
 	/* Neither argument is consulted; only the page daemon is woken. */
 	(void)vm;
 	(void)flags;
 	pagedaemon_wakeup();
 }
 
 CTASSERT(VM_KMEM_SIZE_SCALE >= 1);
 
 /*
  * Initialize the kernel memory (kmem) arena.
  */
 void
 kmeminit(void)
 {
 	u_long mem_size, tmp;
 
 	/*
 	 * Calculate the amount of kernel virtual address (KVA) space that is
 	 * preallocated to the kmem arena.  In order to support a wide range
 	 * of machines, it is a function of the physical memory size,
 	 * specifically,
 	 *
 	 *	min(max(physical memory size / VM_KMEM_SIZE_SCALE,
 	 *	    VM_KMEM_SIZE_MIN), VM_KMEM_SIZE_MAX)
 	 *
 	 * Every architecture must define an integral value for
 	 * VM_KMEM_SIZE_SCALE.  However, the definitions of VM_KMEM_SIZE_MIN
 	 * and VM_KMEM_SIZE_MAX, which represent respectively the floor and
 	 * ceiling on this preallocation, are optional.  Typically,
 	 * VM_KMEM_SIZE_MAX is itself a function of the available KVA space on
 	 * a given architecture.
 	 */
 	/* mem_size is a page count; it is scaled by PAGE_SIZE below. */
 	mem_size = cnt.v_page_count;
 
 	/* A tunable value below 1 is replaced by the compile-time scale. */
 	vm_kmem_size_scale = VM_KMEM_SIZE_SCALE;
 	TUNABLE_INT_FETCH("vm.kmem_size_scale", &vm_kmem_size_scale);
 	if (vm_kmem_size_scale < 1)
 		vm_kmem_size_scale = VM_KMEM_SIZE_SCALE;
 
 	vm_kmem_size = (mem_size / vm_kmem_size_scale) * PAGE_SIZE;
 
 	/* Apply the floor, if one is configured (non-zero). */
 #if defined(VM_KMEM_SIZE_MIN)
 	vm_kmem_size_min = VM_KMEM_SIZE_MIN;
 #endif
 	TUNABLE_ULONG_FETCH("vm.kmem_size_min", &vm_kmem_size_min);
 	if (vm_kmem_size_min > 0 && vm_kmem_size < vm_kmem_size_min)
 		vm_kmem_size = vm_kmem_size_min;
 
 	/* Apply the ceiling, if one is configured (non-zero). */
 #if defined(VM_KMEM_SIZE_MAX)
 	vm_kmem_size_max = VM_KMEM_SIZE_MAX;
 #endif
 	TUNABLE_ULONG_FETCH("vm.kmem_size_max", &vm_kmem_size_max);
 	if (vm_kmem_size_max > 0 && vm_kmem_size >= vm_kmem_size_max)
 		vm_kmem_size = vm_kmem_size_max;
 
 	/*
 	 * Alternatively, the amount of KVA space that is preallocated to the
 	 * kmem arena can be set statically at compile-time or manually
 	 * through the kernel environment.  However, it is still limited to
 	 * twice the physical memory size, which has been sufficient to handle
 	 * the most severe cases of external fragmentation in the kmem arena.
 	 */
 #if defined(VM_KMEM_SIZE)
 	vm_kmem_size = VM_KMEM_SIZE;
 #endif
 	TUNABLE_ULONG_FETCH("vm.kmem_size", &vm_kmem_size);
 	if (vm_kmem_size / 2 / PAGE_SIZE > mem_size)
 		vm_kmem_size = 2 * mem_size * PAGE_SIZE;
 
 	vm_kmem_size = round_page(vm_kmem_size);
 	/*
 	 * tmp is the amount of KVA actually reserved for the arena; with
 	 * MemGuard compiled in it is whatever memguard_fudge() returns.
 	 */
 #ifdef DEBUG_MEMGUARD
 	tmp = memguard_fudge(vm_kmem_size, kernel_map);
 #else
 	tmp = vm_kmem_size;
 #endif
 	vmem_init(kmem_arena, "kmem arena", kva_alloc(tmp), tmp, PAGE_SIZE,
 	    0, 0);
 	vmem_set_reclaim(kmem_arena, kmem_reclaim);
 
 #ifdef DEBUG_MEMGUARD
 	/*
 	 * Initialize MemGuard if support compiled in.  MemGuard is a
 	 * replacement allocator used for detecting tamper-after-free
 	 * scenarios as they occur.  It is only used for debugging.
 	 */
 	memguard_init(kmem_arena);
 #endif
 }
 
 /*
  * Initialize the kernel memory allocator
  */
 /* ARGSUSED*/
 static void
 mallocinit(void *dummy)
 {
 	int i;
 	uint8_t indx;
 
 	mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF);
 
 	kmeminit();
 
 	uma_startup2();
 
 	/* Zone that backs the per-type malloc_type_internal records. */
 	mt_zone = uma_zcreate("mt_zone", sizeof(struct malloc_type_internal),
 #ifdef INVARIANTS
 	    mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
 #else
 	    NULL, NULL, NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, UMA_ZONE_MALLOC);
 	/*
 	 * Walk the kmemzones[] table up to its kz_size == 0 terminator.
 	 * Note that i is NOT reset per entry: it keeps advancing so that
 	 * each kmemsize[] slot is filled exactly once.
 	 */
 	for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) {
 		int size = kmemzones[indx].kz_size;
 		char *name = kmemzones[indx].kz_name;
 		int subzone;
 
 		/* Create numzones identically configured zones per size. */
 		for (subzone = 0; subzone < numzones; subzone++) {
 			kmemzones[indx].kz_zone[subzone] =
 			    uma_zcreate(name, size,
 #ifdef INVARIANTS
 			    mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
 #else
 			    NULL, NULL, NULL, NULL,
 #endif
 			    UMA_ALIGN_PTR, UMA_ZONE_MALLOC);
 		}		    
 		/* Map every request-size slot up to 'size' to this entry. */
 		for (;i <= size; i+= KMEM_ZBASE)
 			kmemsize[i >> KMEM_ZSHIFT] = indx;
 		
 	}
 }
 SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, mallocinit, NULL);
 
 void
 malloc_init(void *data)
 {
 	struct malloc_type *type = data;
 	struct malloc_type_internal *priv;
 
 	KASSERT(cnt.v_page_count != 0, ("malloc_register before vm_init"));
 	if (type->ks_magic != M_MAGIC)
 		panic("malloc_init: bad malloc type magic");
 
 	/* Attach zeroed per-type state and choose the type's subzone. */
 	priv = uma_zalloc(mt_zone, M_WAITOK | M_ZERO);
 	type->ks_handle = priv;
 	priv->mti_zone = mtp_get_subzone(type->ks_shortdesc);
 
 	/* Push the type onto the head of the global list. */
 	mtx_lock(&malloc_mtx);
 	type->ks_next = kmemstatistics;
 	kmemstatistics = type;
 	kmemcount += 1;
 	mtx_unlock(&malloc_mtx);
 }
 
 void
 malloc_uninit(void *data)
 {
 	struct malloc_type_internal *mtip;
 	struct malloc_type_stats *mtsp;
 	struct malloc_type *mtp, *temp;
 	uma_slab_t slab;
 	long temp_allocs, temp_bytes;
 	int i;
 
 	mtp = data;
 	KASSERT(mtp->ks_magic == M_MAGIC,
 	    ("malloc_uninit: bad malloc type magic"));
 	KASSERT(mtp->ks_handle != NULL, ("malloc_deregister: cookie NULL"));
 
 	mtx_lock(&malloc_mtx);
 	mtip = mtp->ks_handle;
 	mtp->ks_handle = NULL;
 	if (mtp != kmemstatistics) {
 		for (temp = kmemstatistics; temp != NULL;
 		    temp = temp->ks_next) {
 			if (temp->ks_next == mtp) {
 				temp->ks_next = mtp->ks_next;
 				break;
 			}
 		}
 		KASSERT(temp,
 		    ("malloc_uninit: type '%s' not found", mtp->ks_shortdesc));
 	} else
 		kmemstatistics = mtp->ks_next;
 	kmemcount--;
 	mtx_unlock(&malloc_mtx);
 
 	/*
 	 * Look for memory leaks.
 	 */
 	temp_allocs = temp_bytes = 0;
 	for (i = 0; i < MAXCPU; i++) {
 		mtsp = &mtip->mti_stats[i];
 		temp_allocs += mtsp->mts_numallocs;
 		temp_allocs -= mtsp->mts_numfrees;
 		temp_bytes += mtsp->mts_memalloced;
 		temp_bytes -= mtsp->mts_memfreed;
 	}
 	if (temp_allocs > 0 || temp_bytes > 0) {
 		printf("Warning: memory type %s leaked memory on destroy "
 		    "(%ld allocations, %ld bytes leaked).\n", mtp->ks_shortdesc,
 		    temp_allocs, temp_bytes);
 	}
 
 	slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK));
 	uma_zfree_arg(mt_zone, mtip, slab);
 }
 
 struct malloc_type *
 malloc_desc2type(const char *desc)
 {
 	struct malloc_type *cur;
 
 	/* The caller must hold malloc_mtx while the list is walked. */
 	mtx_assert(&malloc_mtx, MA_OWNED);
 
 	/* Stop at the first exact description match, or at the list end. */
 	cur = kmemstatistics;
 	while (cur != NULL && strcmp(cur->ks_shortdesc, desc) != 0)
 		cur = cur->ks_next;
 	return (cur);
 }
 
 static int
 sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct malloc_type_stream_header mtsh;
 	struct malloc_type_internal *mtip;
 	struct malloc_type_header mth;
 	struct malloc_type *mtp;
 	int error, i;
 	struct sbuf sbuf;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	mtx_lock(&malloc_mtx);
 
 	/*
 	 * Insert stream header.
 	 */
 	bzero(&mtsh, sizeof(mtsh));
 	mtsh.mtsh_version = MALLOC_TYPE_STREAM_VERSION;
 	mtsh.mtsh_maxcpus = MAXCPU;
 	mtsh.mtsh_count = kmemcount;
 	(void)sbuf_bcat(&sbuf, &mtsh, sizeof(mtsh));
 
 	/*
 	 * Insert alternating sequence of type headers and type statistics.
 	 */
 	for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
 		mtip = (struct malloc_type_internal *)mtp->ks_handle;
 
 		/*
 		 * Insert type header.
 		 */
 		bzero(&mth, sizeof(mth));
 		strlcpy(mth.mth_name, mtp->ks_shortdesc, MALLOC_MAX_NAME);
 		(void)sbuf_bcat(&sbuf, &mth, sizeof(mth));
 
 		/*
 		 * Insert type statistics for each CPU.
 		 */
 		for (i = 0; i < MAXCPU; i++) {
 			(void)sbuf_bcat(&sbuf, &mtip->mti_stats[i],
 			    sizeof(mtip->mti_stats[i]));
 		}
 	}
 	mtx_unlock(&malloc_mtx);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, malloc_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
     0, 0, sysctl_kern_malloc_stats, "s,malloc_type_ustats",
     "Return malloc types");
 
 SYSCTL_INT(_kern, OID_AUTO, malloc_count, CTLFLAG_RD, &kmemcount, 0,
     "Count of kernel malloc types");
 
 /*
  * Call func(type, arg) once for every registered malloc type.
  *
  * The list is first snapshotted into a temporary array while malloc_mtx is
  * held; the callbacks then run with the mutex dropped.  The array is
  * allocated with M_WAITOK, so this function may sleep.
  */
 void
 malloc_type_list(malloc_type_list_func_t *func, void *arg)
 {
 	struct malloc_type *mtp, **bufmtp;
 	int count, i;
 	size_t buflen;
 
 	mtx_lock(&malloc_mtx);
 restart:
 	mtx_assert(&malloc_mtx, MA_OWNED);
 	count = kmemcount;
 	mtx_unlock(&malloc_mtx);
 
 	/* The mutex cannot be held across a sleeping allocation. */
 	buflen = sizeof(struct malloc_type *) * count;
 	bufmtp = malloc(buflen, M_TEMP, M_WAITOK);
 
 	mtx_lock(&malloc_mtx);
 
 	/* Types were added while unlocked: the buffer is too small. */
 	if (count < kmemcount) {
 		free(bufmtp, M_TEMP);
 		goto restart;
 	}
 
 	/* Never store past the end of the buffer. */
 	for (mtp = kmemstatistics, i = 0; mtp != NULL && i < count;
 	    mtp = mtp->ks_next, i++)
 		bufmtp[i] = mtp;
 
 	/*
 	 * The list may have shrunk while the mutex was dropped.  The buffer
 	 * was not zeroed, so only the i entries just stored are valid; limit
 	 * the callback loop to them instead of the stale count.
 	 */
 	count = i;
 
 	mtx_unlock(&malloc_mtx);
 
 	for (i = 0; i < count; i++)
 		(func)(bufmtp[i], arg);
 
 	free(bufmtp, M_TEMP);
 }
 
 #ifdef DDB
 DB_SHOW_COMMAND(malloc, db_show_malloc)
 {
 	struct malloc_type_internal *mtip;
 	struct malloc_type *mtp;
 	uint64_t allocs, frees;
 	uint64_t alloced, freed;
 	int i;
 
 	db_printf("%18s %12s  %12s %12s\n", "Type", "InUse", "MemUse",
 	    "Requests");
 	for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
 		mtip = (struct malloc_type_internal *)mtp->ks_handle;
 		allocs = 0;
 		frees = 0;
 		alloced = 0;
 		freed = 0;
 		for (i = 0; i < MAXCPU; i++) {
 			allocs += mtip->mti_stats[i].mts_numallocs;
 			frees += mtip->mti_stats[i].mts_numfrees;
 			alloced += mtip->mti_stats[i].mts_memalloced;
 			freed += mtip->mti_stats[i].mts_memfreed;
 		}
 		db_printf("%18s %12ju %12juK %12ju\n",
 		    mtp->ks_shortdesc, allocs - frees,
 		    (alloced - freed + 1023) / 1024, allocs);
 		if (db_pager_quit)
 			break;
 	}
 }
 
 #if MALLOC_DEBUG_MAXZONES > 1
 DB_SHOW_COMMAND(multizone_matches, db_show_multizone_matches)
 {
 	struct malloc_type_internal *priv;
 	struct malloc_type *type;
 	u_int wanted;
 
 	if (!have_addr) {
 		db_printf("Usage: show multizone_matches <malloc type/addr>\n");
 		return;
 	}
 
 	/* The command argument is taken as a struct malloc_type address. */
 	type = (void *)addr;
 	if (type->ks_magic != M_MAGIC) {
 		db_printf("Magic %lx does not match expected %x\n",
 		    type->ks_magic, M_MAGIC);
 		return;
 	}
 
 	priv = type->ks_handle;
 	wanted = priv->mti_zone;
 
 	/* Print every registered type that shares the same subzone. */
 	for (type = kmemstatistics; type != NULL; type = type->ks_next) {
 		priv = type->ks_handle;
 		if (priv->mti_zone == wanted) {
 			db_printf("%s\n", type->ks_shortdesc);
 			if (db_pager_quit)
 				break;
 		}
 	}
 }
 #endif /* MALLOC_DEBUG_MAXZONES > 1 */
 #endif /* DDB */
 
 #ifdef MALLOC_PROFILE
 
 static int
 sysctl_kern_mprof(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	uint64_t count;
 	uint64_t waste;
 	uint64_t mem;
 	int error;
 	int rsize;
 	int size;
 	int i;
 
 	waste = 0;
 	mem = 0;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	sbuf_printf(&sbuf, 
 	    "\n  Size                    Requests  Real Size\n");
 	for (i = 0; i < KMEM_ZSIZE; i++) {
 		size = i << KMEM_ZSHIFT;
 		rsize = kmemzones[kmemsize[i]].kz_size;
 		count = (long long unsigned)krequests[i];
 
 		sbuf_printf(&sbuf, "%6d%28llu%11d\n", size,
 		    (unsigned long long)count, rsize);
 
 		if ((rsize * count) > (size * count))
 			waste += (rsize * count) - (size * count);
 		mem += (rsize * count);
 	}
 	sbuf_printf(&sbuf,
 	    "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n",
 	    (unsigned long long)mem, (unsigned long long)waste);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 SYSCTL_OID(_kern, OID_AUTO, mprof, CTLTYPE_STRING|CTLFLAG_RD,
     NULL, 0, sysctl_kern_mprof, "A", "Malloc Profiling");
 #endif /* MALLOC_PROFILE */
Index: head/sys/kern/kern_mutex.c
===================================================================
--- head/sys/kern/kern_mutex.c	(revision 258540)
+++ head/sys/kern/kern_mutex.c	(revision 258541)
@@ -1,1009 +1,1006 @@
 /*-
  * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  *    promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
  *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
  */
 
 /*
  * Machine independent bits of mutex implementation.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_adaptive_mutexes.h"
 #include "opt_ddb.h"
 #include "opt_global.h"
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/turnstile.h>
 #include <sys/vmmeter.h>
 #include <sys/lock_profile.h>
 
 #include <machine/atomic.h>
 #include <machine/bus.h>
 #include <machine/cpu.h>
 
 #include <ddb/ddb.h>
 
 #include <fs/devfs/devfs_int.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
 #define	ADAPTIVE_MUTEXES
 #endif
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DEFINE( , , lock, failed);
 #endif
 
 /*
  * Return the mutex address when the lock cookie address is provided.
  * This functionality assumes that struct mtx* have a member named mtx_lock.
  */
 #define	mtxlock2mtx(c)	(__containerof(c, struct mtx, mtx_lock))
 
 /*
  * Internal utility macros.
  */
 #define mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)
 
 #define	mtx_destroyed(m) ((m)->mtx_lock == MTX_DESTROYED)
 
 #define	mtx_owner(m)	((struct thread *)((m)->mtx_lock & ~MTX_FLAGMASK))
 
 static void	assert_mtx(const struct lock_object *lock, int what);
 #ifdef DDB
 static void	db_show_mtx(const struct lock_object *lock);
 #endif
 static void	lock_mtx(struct lock_object *lock, uintptr_t how);
 static void	lock_spin(struct lock_object *lock, uintptr_t how);
 #ifdef KDTRACE_HOOKS
 static int	owner_mtx(const struct lock_object *lock,
 		    struct thread **owner);
 #endif
 static uintptr_t unlock_mtx(struct lock_object *lock);
 static uintptr_t unlock_spin(struct lock_object *lock);
 
 /*
  * Lock classes for sleep and spin mutexes.
  */
 /* Lock class describing sleep mutexes; its hooks are defined below. */
 struct lock_class lock_class_mtx_sleep = {
 	.lc_name = "sleep mutex",
 	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
 	.lc_assert = assert_mtx,
 #ifdef DDB
 	.lc_ddb_show = db_show_mtx,
 #endif
 	.lc_lock = lock_mtx,
 	.lc_unlock = unlock_mtx,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_mtx,
 #endif
 };
 /*
  * Lock class describing spin mutexes.  Its lc_lock/lc_unlock hooks
  * (lock_spin/unlock_spin) unconditionally panic.
  */
 struct lock_class lock_class_mtx_spin = {
 	.lc_name = "spin mutex",
 	.lc_flags = LC_SPINLOCK | LC_RECURSABLE,
 	.lc_assert = assert_mtx,
 #ifdef DDB
 	.lc_ddb_show = db_show_mtx,
 #endif
 	.lc_lock = lock_spin,
 	.lc_unlock = unlock_spin,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_mtx,
 #endif
 };
 
 /*
  * System-wide mutexes
  */
 struct mtx blocked_lock;
 struct mtx Giant;
 
 /*
  * lc_assert hook shared by both mutex lock classes: forwards the generic
  * lock object, viewed as a struct mtx, to mtx_assert().
  */
 void
 assert_mtx(const struct lock_object *lock, int what)
 {
 
 	mtx_assert((const struct mtx *)lock, what);
 }
 
 /*
  * lc_lock hook of the sleep mutex class: acquire the mutex behind the
  * generic lock object.
  */
 void
 lock_mtx(struct lock_object *lock, uintptr_t how)
 {
 	struct mtx *m;
 
 	/* 'how' is not consulted here. */
 	m = (struct mtx *)lock;
 	mtx_lock(m);
 }
 
 /*
  * lc_lock hook of the spin mutex class.  It must never be reached: the
  * function does nothing but panic.
  */
 void
 lock_spin(struct lock_object *lock, uintptr_t how)
 {
 
 	panic("spin locks can only use msleep_spin");
 }
 
 /*
  * lc_unlock hook of the sleep mutex class: release the mutex behind the
  * generic lock object.  Always returns 0.
  */
 uintptr_t
 unlock_mtx(struct lock_object *lock)
 {
 	struct mtx *m = (struct mtx *)lock;
 
 	/* The mutex must be held by the caller and not recursed. */
 	mtx_assert(m, MA_OWNED | MA_NOTRECURSED);
 	mtx_unlock(m);
 
 	return (0);
 }
 
 /*
  * lc_unlock hook of the spin mutex class.  It must never be reached: the
  * function does nothing but panic, so no value is ever returned.
  */
 uintptr_t
 unlock_spin(struct lock_object *lock)
 {
 
 	panic("spin locks can only use msleep_spin");
 }
 
 #ifdef KDTRACE_HOOKS
 /*
  * lc_owner hook: store the owning thread pointer decoded from the lock
  * word in *owner and return non-zero iff the mutex is currently owned.
  */
 int
 owner_mtx(const struct lock_object *lock, struct thread **owner)
 {
 	const struct mtx *mutex;
 
 	mutex = (const struct mtx *)lock;
 	*owner = mtx_owner(mutex);
 	if (mtx_unowned(mutex))
 		return (0);
 	return (1);
 }
 #endif
 
 /*
  * Function versions of the inlined __mtx_* macros.  These are used by
  * modules and can also be called from assembly language if needed.
  */
 void
 __mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
 {
 	struct mtx *m;
 
 	/* Locking is a no-op once the scheduler has been stopped. */
 	if (SCHEDULER_STOPPED())
 		return;
 
 	/* Recover the enclosing struct mtx from its mtx_lock word. */
 	m = mtxlock2mtx(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("mtx_lock() by idle thread %p on sleep mutex %s @ %s:%d",
 	    curthread, m->lock_object.lo_name, file, line));
 	KASSERT(m->mtx_lock != MTX_DESTROYED,
 	    ("mtx_lock() of destroyed mutex @ %s:%d", file, line));
 	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
 	    ("mtx_lock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
 	    file, line));
 	/* MTX_RECURSE is masked out of the options passed to WITNESS. */
 	WITNESS_CHECKORDER(&m->lock_object, (opts & ~MTX_RECURSE) |
 	    LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL);
 
 	__mtx_lock(m, curthread, opts, file, line);
 	LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	WITNESS_LOCK(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_EXCLUSIVE,
 	    file, line);
 	/* Count the acquisition in the current thread's td_locks. */
 	curthread->td_locks++;
 }
 
 /*
  * Function version of the sleep mutex unlock: checks the mutex state,
  * informs WITNESS and the lock log, releases the mutex and decrements
  * the current thread's td_locks.
  */
 void
 __mtx_unlock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
 {
 	struct mtx *m;
 
 	/* Unlocking is a no-op once the scheduler has been stopped. */
 	if (SCHEDULER_STOPPED())
 		return;
 
 	/* Recover the enclosing struct mtx from its mtx_lock word. */
 	m = mtxlock2mtx(c);
 
 	KASSERT(m->mtx_lock != MTX_DESTROYED,
 	    ("mtx_unlock() of destroyed mutex @ %s:%d", file, line));
 	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
 	    ("mtx_unlock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
 	    file, line));
 	WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
 	LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	mtx_assert(m, MA_OWNED);
 
-	if (m->mtx_recurse == 0)
-		LOCKSTAT_PROFILE_RELEASE_LOCK(LS_MTX_UNLOCK_RELEASE, m);
 	__mtx_unlock(m, curthread, opts, file, line);
 	curthread->td_locks--;
 }
 
 /*
  * Function version of the spin mutex lock: validates the mutex, checks
  * lock order with WITNESS, acquires the lock and records the event.
  */
 void
 __mtx_lock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
     int line)
 {
 	struct mtx *m;
 
 	/* Locking is a no-op once the scheduler has been stopped. */
 	if (SCHEDULER_STOPPED())
 		return;
 
 	/* Recover the enclosing struct mtx from its mtx_lock word. */
 	m = mtxlock2mtx(c);
 
 	KASSERT(m->mtx_lock != MTX_DESTROYED,
 	    ("mtx_lock_spin() of destroyed mutex @ %s:%d", file, line));
 	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
 	    ("mtx_lock_spin() of sleep mutex %s @ %s:%d",
 	    m->lock_object.lo_name, file, line));
 	/*
 	 * Re-acquiring a mutex we already own is only legal when the lock
 	 * is recursable or the caller passed MTX_RECURSE.
 	 */
 	if (mtx_owned(m))
 		KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
 		    (opts & MTX_RECURSE) != 0,
 	    ("mtx_lock_spin: recursed on non-recursive mutex %s @ %s:%d\n",
 		    m->lock_object.lo_name, file, line));
 	/* MTX_RECURSE has been consumed; drop it before passing opts on. */
 	opts &= ~MTX_RECURSE;
 	WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
 	    file, line, NULL);
 	__mtx_lock_spin(m, curthread, opts, file, line);
 	LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
 }
 
 /*
  * Function version of the spin mutex unlock: validates the mutex,
  * informs WITNESS and the lock log, then releases the lock.
  */
 void
 __mtx_unlock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
     int line)
 {
 	struct mtx *m;
 
 	/* Unlocking is a no-op once the scheduler has been stopped. */
 	if (SCHEDULER_STOPPED())
 		return;
 
 	/* Recover the enclosing struct mtx from its mtx_lock word. */
 	m = mtxlock2mtx(c);
 
 	KASSERT(m->mtx_lock != MTX_DESTROYED,
 	    ("mtx_unlock_spin() of destroyed mutex @ %s:%d", file, line));
 	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
 	    ("mtx_unlock_spin() of sleep mutex %s @ %s:%d",
 	    m->lock_object.lo_name, file, line));
 	WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
 	LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	mtx_assert(m, MA_OWNED);
 
 	__mtx_unlock_spin(m);
 }
 
 /*
  * The important part of mtx_trylock{,_flags}()
  * Tries to acquire lock `m.'  If this function is called on a mutex that
  * is already owned, it will recursively acquire the lock.
  */
 int
 _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line)
 {
 	struct mtx *m;
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 	int rval;	/* non-zero when the lock was obtained */
 
 	/* With the scheduler stopped, report success without locking. */
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	/* Recover the enclosing struct mtx from its mtx_lock word. */
 	m = mtxlock2mtx(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("mtx_trylock() by idle thread %p on sleep mutex %s @ %s:%d",
 	    curthread, m->lock_object.lo_name, file, line));
 	KASSERT(m->mtx_lock != MTX_DESTROYED,
 	    ("mtx_trylock() of destroyed mutex @ %s:%d", file, line));
 	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
 	    ("mtx_trylock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
 	    file, line));
 
 	/*
 	 * Already owned by us and recursion permitted: take the recursive
 	 * path.  Otherwise make a single attempt to obtain the lock; note
 	 * that this includes the case of owning a non-recursable mutex.
 	 */
 	if (mtx_owned(m) && ((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
 	    (opts & MTX_RECURSE) != 0)) {
 		m->mtx_recurse++;
 		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
 		rval = 1;
 	} else
 		rval = _mtx_obtain_lock(m, (uintptr_t)curthread);
 	opts &= ~MTX_RECURSE;
 
 	LOCK_LOG_TRY("LOCK", &m->lock_object, opts, rval, file, line);
 	if (rval) {
 		WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
 		    file, line);
 		curthread->td_locks++;
 		/* Only a first (non-recursive) acquisition is profiled. */
 		if (m->mtx_recurse == 0)
 			LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_LOCK_ACQUIRE,
 			    m, contested, waittime, file, line);
 
 	}
 
 	return (rval);
 }
 
 /*
  * __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
  *
  * We call this if the lock is either contested (i.e. we need to go to
  * sleep waiting for it), or if we need to recurse on it.
  */
 void
 __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
     const char *file, int line)
 {
 	struct mtx *m;
 	struct turnstile *ts;
 	uintptr_t v;		/* snapshot of the lock word */
 #ifdef ADAPTIVE_MUTEXES
 	volatile struct thread *owner;
 #endif
 #ifdef KTR
 	int cont_logged = 0;	/* contention trace already emitted */
 #endif
 #ifdef LOCK_PROFILING
 	int contested = 0;
 	uint64_t waittime = 0;
 #endif
 #ifdef KDTRACE_HOOKS
 	/* Counters reported through lockstat once the lock is held. */
 	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 #endif
 
 	/* Locking is a no-op once the scheduler has been stopped. */
 	if (SCHEDULER_STOPPED())
 		return;
 
 	/* Recover the enclosing struct mtx from its mtx_lock word. */
 	m = mtxlock2mtx(c);
 
 	/*
 	 * We already own the mutex: this is a recursive acquisition.  Bump
 	 * the recursion count, flag the lock word and return at once.
 	 */
 	if (mtx_owned(m)) {
 		KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
 		    (opts & MTX_RECURSE) != 0,
 	    ("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n",
 		    m->lock_object.lo_name, file, line));
 		opts &= ~MTX_RECURSE;
 		m->mtx_recurse++;
 		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
 		if (LOCK_LOG_TEST(&m->lock_object, opts))
 			CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
 		return;
 	}
 	opts &= ~MTX_RECURSE;
 
 #ifdef HWPMC_HOOKS
 	PMC_SOFT_CALL( , , lock, failed);
 #endif
 	lock_profile_obtain_lock_failed(&m->lock_object,
 		    &contested, &waittime);
 	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR4(KTR_LOCK,
 		    "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
 		    m->lock_object.lo_name, (void *)m->mtx_lock, file, line);
 
 	/* Keep trying until _mtx_obtain_lock() succeeds for tid. */
 	while (!_mtx_obtain_lock(m, tid)) {
 #ifdef KDTRACE_HOOKS
 		spin_cnt++;
 #endif
 #ifdef ADAPTIVE_MUTEXES
 		/*
 		 * If the owner is running on another CPU, spin until the
 		 * owner stops running or the state of the lock changes.
 		 */
 		v = m->mtx_lock;
 		if (v != MTX_UNOWNED) {
 			owner = (struct thread *)(v & ~MTX_FLAGMASK);
 			if (TD_IS_RUNNING(owner)) {
 				if (LOCK_LOG_TEST(&m->lock_object, 0))
 					CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 					    __func__, m, owner);
 				while (mtx_owner(m) == owner &&
 				    TD_IS_RUNNING(owner)) {
 					cpu_spinwait();
 #ifdef KDTRACE_HOOKS
 					spin_cnt++;
 #endif
 				}
 				continue;
 			}
 		}
 #endif
 
 		ts = turnstile_trywait(&m->lock_object);
 		/* Re-read the lock word now that turnstile_trywait() is done. */
 		v = m->mtx_lock;
 
 		/*
 		 * Check if the lock has been released while spinning for
 		 * the turnstile chain lock.
 		 */
 		if (v == MTX_UNOWNED) {
 			turnstile_cancel(ts);
 			continue;
 		}
 
 #ifdef ADAPTIVE_MUTEXES
 		/*
 		 * The current lock owner might have started executing
 		 * on another CPU (or the lock could have changed
 		 * owners) while we were waiting on the turnstile
 		 * chain lock.  If so, drop the turnstile lock and try
 		 * again.
 		 */
 		owner = (struct thread *)(v & ~MTX_FLAGMASK);
 		if (TD_IS_RUNNING(owner)) {
 			turnstile_cancel(ts);
 			continue;
 		}
 #endif
 
 		/*
 		 * If the mutex isn't already contested and a failure occurs
 		 * setting the contested bit, the mutex was either released
 		 * or the state of the MTX_RECURSED bit changed.
 		 */
 		if ((v & MTX_CONTESTED) == 0 &&
 		    !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) {
 			turnstile_cancel(ts);
 			continue;
 		}
 
 		/*
 		 * We definitely must sleep for this lock.
 		 */
 		mtx_assert(m, MA_NOTOWNED);
 
 #ifdef KTR
 		/* Emit the contention trace once per call, not per sleep. */
 		if (!cont_logged) {
 			CTR6(KTR_CONTENTION,
 			    "contention: %p at %s:%d wants %s, taken by %s:%d",
 			    (void *)tid, file, line, m->lock_object.lo_name,
 			    WITNESS_FILE(&m->lock_object),
 			    WITNESS_LINE(&m->lock_object));
 			cont_logged = 1;
 		}
 #endif
 
 		/*
 		 * Block on the turnstile.
 		 */
 #ifdef KDTRACE_HOOKS
 		/* Subtract the start time; the end time is added below. */
 		sleep_time -= lockstat_nsecs();
 #endif
 		turnstile_wait(ts, mtx_owner(m), TS_EXCLUSIVE_QUEUE);
 #ifdef KDTRACE_HOOKS
 		sleep_time += lockstat_nsecs();
 		sleep_cnt++;
 #endif
 	}
 #ifdef KTR
 	if (cont_logged) {
 		CTR4(KTR_CONTENTION,
 		    "contention end: %s acquired by %p at %s:%d",
 		    m->lock_object.lo_name, (void *)tid, file, line);
 	}
 #endif
 	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_LOCK_ACQUIRE, m, contested,
 	    waittime, file, line);
 #ifdef KDTRACE_HOOKS
 	if (sleep_time)
 		LOCKSTAT_RECORD1(LS_MTX_LOCK_BLOCK, m, sleep_time);
 
 	/*
 	 * Only record the loops spinning and not sleeping.
 	 */
 	if (spin_cnt > sleep_cnt)
 		LOCKSTAT_RECORD1(LS_MTX_LOCK_SPIN, m, (spin_cnt - sleep_cnt));
 #endif
 }
 
 /*
  * Called when a spin mutex could not be obtained within the spin budget.
  * Returns quietly if the lock word no longer names an owner; otherwise
  * reports the holder and panics.
  */
 static void
 _mtx_lock_spin_failed(struct mtx *m)
 {
 	struct thread *holder;
 
 	holder = mtx_owner(m);
 	if (holder != NULL) {
 		printf( "spin lock %p (%s) held by %p (tid %d) too long\n",
 		    m, m->lock_object.lo_name, holder, holder->td_tid);
 #ifdef WITNESS
 		witness_display_spinlock(&m->lock_object, holder, printf);
 #endif
 		panic("spin lock held too long");
 	}
 
 	/* The mutex is unlocked: let the caller try again. */
 }
 
 #ifdef SMP
 /*
  * _mtx_lock_spin_cookie: the tougher part of acquiring an MTX_SPIN lock.
  *
  * This is only called if we need to actually spin for the lock. Recursion
  * is handled inline.
  */
 void
 _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t tid, int opts,
     const char *file, int line)
 {
 	struct mtx *m;
 	int i = 0;	/* spin iterations so far, across all attempts */
 #ifdef LOCK_PROFILING
 	int contested = 0;
 	uint64_t waittime = 0;
 #endif
 
 	/* Locking is a no-op once the scheduler has been stopped. */
 	if (SCHEDULER_STOPPED())
 		return;
 
 	/* Recover the enclosing struct mtx from its mtx_lock word. */
 	m = mtxlock2mtx(c);
 
 	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m);
 
 #ifdef HWPMC_HOOKS
 	PMC_SOFT_CALL( , , lock, failed);
 #endif
 	lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime);
 	while (!_mtx_obtain_lock(m, tid)) {
 
 		/* Give interrupts a chance while we spin. */
 		spinlock_exit();
 		/*
 		 * Wait for the lock word to read as unowned.  The first
 		 * 10000000 iterations only call cpu_spinwait(); after that
 		 * each iteration also does DELAY(1).  From 60000000
 		 * iterations on -- unless the debugger is active or a panic
 		 * is in progress -- _mtx_lock_spin_failed() is called instead.
 		 */
 		while (m->mtx_lock != MTX_UNOWNED) {
 			if (i++ < 10000000) {
 				cpu_spinwait();
 				continue;
 			}
 			if (i < 60000000 || kdb_active || panicstr != NULL)
 				DELAY(1);
 			else
 				_mtx_lock_spin_failed(m);
 			cpu_spinwait();
 		}
 		/* Pairs with the spinlock_exit() above before retrying. */
 		spinlock_enter();
 	}
 
 	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);
 
 	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_SPIN_LOCK_ACQUIRE, m,
 	    contested, waittime, (file), (line));
 	LOCKSTAT_RECORD1(LS_MTX_SPIN_LOCK_SPIN, m, i);
 }
 #endif /* SMP */
 
 /*
  * Acquire the spin mutex that td->td_lock currently points to.
  *
  * td_lock may be changed by another CPU while we wait, so the pointer is
  * sampled, the sampled mutex is acquired, and the pointer is checked again;
  * if it moved, the mutex is dropped (or the spin abandoned) and the whole
  * sequence restarts.  Recursive acquisition by the current thread bumps
  * mtx_recurse instead of spinning.  Returns without locking anything if
  * SCHEDULER_STOPPED() is true.
  *
  * `opts', `file' and `line' are passed to the WITNESS, logging and
  * profiling hooks.
  */
 void
 thread_lock_flags_(struct thread *td, int opts, const char *file, int line)
 {
 	struct mtx *m;
 	uintptr_t tid;
 	int i;
 #ifdef LOCK_PROFILING
 	int contested = 0;
 	uint64_t waittime = 0;
 #endif
 #ifdef KDTRACE_HOOKS
 	uint64_t spin_cnt = 0;
 #endif
 
 	/* i counts polls of the lock word across all retries. */
 	i = 0;
 	tid = (uintptr_t)curthread;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	for (;;) {
 retry:
 		spinlock_enter();
 		/* Sample the thread's current lock; it is re-validated below. */
 		m = td->td_lock;
 		KASSERT(m->mtx_lock != MTX_DESTROYED,
 		    ("thread_lock() of destroyed mutex @ %s:%d", file, line));
 		KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
 		    ("thread_lock() of sleep mutex %s @ %s:%d",
 		    m->lock_object.lo_name, file, line));
 		if (mtx_owned(m))
 			KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0,
 	    ("thread_lock: recursed on non-recursive mutex %s @ %s:%d\n",
 			    m->lock_object.lo_name, file, line));
 		WITNESS_CHECKORDER(&m->lock_object,
 		    opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL);
 		while (!_mtx_obtain_lock(m, tid)) {
 #ifdef KDTRACE_HOOKS
 			spin_cnt++;
 #endif
 			/* Already ours: record the recursion and stop. */
 			if (m->mtx_lock == tid) {
 				m->mtx_recurse++;
 				break;
 			}
 #ifdef HWPMC_HOOKS
 			PMC_SOFT_CALL( , , lock, failed);
 #endif
 			lock_profile_obtain_lock_failed(&m->lock_object,
 			    &contested, &waittime);
 			/* Give interrupts a chance while we spin. */
 			spinlock_exit();
 			/*
 			 * Poll until the lock word reads as unowned.  Same
 			 * thresholds as _mtx_lock_spin_cookie(): busy-wait
 			 * first, then DELAY(1) per poll, and finally
 			 * _mtx_lock_spin_failed() unless the debugger is
 			 * active or a panic is in progress.
 			 */
 			while (m->mtx_lock != MTX_UNOWNED) {
 				if (i++ < 10000000)
 					cpu_spinwait();
 				else if (i < 60000000 ||
 				    kdb_active || panicstr != NULL)
 					DELAY(1);
 				else
 					_mtx_lock_spin_failed(m);
 				cpu_spinwait();
 				/*
 				 * The thread switched to a different lock
 				 * while we were waiting; start over on the
 				 * new one.  spinlock_exit() was already
 				 * called above, and retry re-enters.
 				 */
 				if (m != td->td_lock)
 					goto retry;
 			}
 			spinlock_enter();
 		}
 		/* We hold m; done only if it is still the thread's lock. */
 		if (m == td->td_lock)
 			break;
 		__mtx_unlock_spin(m);	/* does spinlock_exit() */
 #ifdef KDTRACE_HOOKS
 		spin_cnt++;
 #endif
 	}
 	/* Only report a fresh acquisition, not a recursion. */
 	if (m->mtx_recurse == 0)
 		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_SPIN_LOCK_ACQUIRE,
 		    m, contested, waittime, (file), (line));
 	LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
 	LOCKSTAT_RECORD1(LS_THREAD_LOCK_SPIN, m, spin_cnt);
 }
 
 /*
  * Park `td' on blocked_lock: td_lock is redirected to &blocked_lock, the
  * mutex that was the thread lock is released, and that mutex is returned
  * to the caller.  The caller must own the thread lock on entry.
  */
 struct mtx *
 thread_lock_block(struct thread *td)
 {
 	struct mtx *prev;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/* Swap in blocked_lock before letting go of the old lock. */
 	prev = td->td_lock;
 	td->td_lock = &blocked_lock;
 	mtx_unlock_spin(prev);
 	return (prev);
 }
 
 /*
  * Counterpart of thread_lock_block(): make `new' the thread lock of `td'.
  * The caller must own `new', and td_lock must currently be &blocked_lock.
  */
 void
 thread_lock_unblock(struct thread *td, struct mtx *new)
 {
 	mtx_assert(new, MA_OWNED);
 	MPASS(td->td_lock == &blocked_lock);
 	/*
 	 * Publish the new lock pointer with a release store.
 	 * NOTE(review): presumably this pairs with spinners in
 	 * thread_lock_flags_() re-reading td_lock -- confirm.
 	 */
 	atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new);
 }
 
 /*
  * Replace the thread lock of `td' with `new' and release the previous
  * thread lock.  The caller must own both `new' and the current thread lock.
  */
 void
 thread_lock_set(struct thread *td, struct mtx *new)
 {
 	struct mtx *old;
 
 	mtx_assert(new, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/* Install the new lock first, then drop the one it replaces. */
 	old = td->td_lock;
 	td->td_lock = new;
 	mtx_unlock_spin(old);
 }
 
 /*
  * __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
  *
  * We are only called here if the lock is recursed or contested (i.e. we
  * need to wake up a blocked thread).
  *
  * `c' is the lock cookie, mapped back to its mutex with mtxlock2mtx().
  * `opts' only gates the KTR logging; `file' and `line' are not used in
  * this function.  Does nothing if SCHEDULER_STOPPED() is true.
  */
 void
 __mtx_unlock_sleep(volatile uintptr_t *c, int opts, const char *file, int line)
 {
 	struct mtx *m;
 	struct turnstile *ts;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	m = mtxlock2mtx(c);
 
 	/*
 	 * Recursed: drop one level and clear MTX_RECURSED when the count
 	 * reaches zero.  The mutex itself stays owned.
 	 */
 	if (mtx_recursed(m)) {
 		if (--(m->mtx_recurse) == 0)
 			atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
 		if (LOCK_LOG_TEST(&m->lock_object, opts))
 			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
 		return;
 	}
 
 	/*
 	 * We have to lock the chain before the turnstile so this turnstile
 	 * can be removed from the hash list if it is empty.
 	 */
 	turnstile_chain_lock(&m->lock_object);
 	ts = turnstile_lookup(&m->lock_object);
 	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);
 	MPASS(ts != NULL);
 	/* Mark every waiter on the exclusive queue, then release the lock. */
 	turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
 	_mtx_release_lock_quick(m);
 
 	/*
 	 * This turnstile is now no longer associated with the mutex.  We can
 	 * unlock the chain lock so a new turnstile may take its place.
 	 */
 	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
 	turnstile_chain_unlock(&m->lock_object);
 }
 
 /*
  * All the unlocking of MTX_SPIN locks is done inline.
  * See the __mtx_unlock_spin() macro for the details.
  */
 
 /*
  * The backing function for the INVARIANTS-enabled mtx_assert()
  */
 #ifdef INVARIANT_SUPPORT
 /*
  * Check the ownership state of the mutex behind cookie `c' against `what'
  * and panic, naming `file':`line', when it does not match.  `what' must be
  * MA_NOTOWNED, or MA_OWNED optionally combined with exactly one of
  * MA_RECURSED / MA_NOTRECURSED; anything else panics as well.
  */
 void
 __mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line)
 {
 	const struct mtx *m;
 
 	/* Skip all checks while panicking or dumping. */
 	if (panicstr != NULL || dumping)
 		return;
 
 	m = mtxlock2mtx(c);
 
 	if (what == MA_NOTOWNED) {
 		if (mtx_owned(m))
 			panic("mutex %s owned at %s:%d",
 			    m->lock_object.lo_name, file, line);
 	} else if (what == MA_OWNED ||
 	    what == (MA_OWNED | MA_RECURSED) ||
 	    what == (MA_OWNED | MA_NOTRECURSED)) {
 		if (!mtx_owned(m))
 			panic("mutex %s not owned at %s:%d",
 			    m->lock_object.lo_name, file, line);
 		if (mtx_recursed(m)) {
 			/* Recursed, but the caller demanded it is not. */
 			if ((what & MA_NOTRECURSED) != 0)
 				panic("mutex %s recursed at %s:%d",
 				    m->lock_object.lo_name, file, line);
 		} else if ((what & MA_RECURSED) != 0) {
 			/* Not recursed, but the caller demanded it is. */
 			panic("mutex %s unrecursed at %s:%d",
 			    m->lock_object.lo_name, file, line);
 		}
 	} else {
 		panic("unknown mtx_assert at %s:%d", file, line);
 	}
 }
 #endif
 
 /*
  * The MUTEX_DEBUG-enabled mtx_validate()
  *
  * Most of these checks have been moved off into the LO_INITIALIZED flag
  * maintained by the witness code.
  */
 #ifdef MUTEX_DEBUG
 
 void	mtx_validate(struct mtx *);
 
 /*
  * Sanity-check the memory backing mutex `m'.  The only check is compiled
  * out for now (see below), so the function currently does nothing.
  */
 void
 mtx_validate(struct mtx *m)
 {
 
 /*
  * XXX: When kernacc() does not require Giant we can reenable this check
  */
 #ifdef notyet
 	/*
 	 * Can't call kernacc() from early init386(), especially when
 	 * initializing Giant mutex, because some stuff in kernacc()
 	 * requires Giant itself.
 	 *
 	 * The length passed is that of the mutex itself, sizeof(*m), not
 	 * that of the pointer to it.
 	 */
 	if (!cold)
 		if (!kernacc((caddr_t)m, sizeof(*m),
 		    VM_PROT_READ | VM_PROT_WRITE))
 			panic("Can't read and write to mutex %p", m);
 #endif
 }
 #endif
 
 /*
  * Initialization callback used by the MTX_SYSINIT() macro.  `arg' points
  * at a struct mtx_args carrying the mutex, its description and its options.
  */
 void
 mtx_sysinit(void *arg)
 {
 	struct mtx_args *args;
 	struct mtx *m;
 
 	args = arg;
 	m = (struct mtx *)args->ma_mtx;
 	mtx_init(m, args->ma_desc, NULL, args->ma_opts);
 }
 
 /*
  * Initialize the mutex behind lock cookie `c'.  `name' is the mutex name
  * and `opts' selects both the kind of mutex (MTX_SPIN or not) and its
  * behaviour flags.  `type', which may be NULL, is passed to lock_init() as
  * the general lock category name used by witness.
  */
 void
 _mtx_init(volatile uintptr_t *c, const char *name, const char *type, int opts)
 {
 	struct lock_class *class;
 	struct mtx *m;
 	int flags;
 
 	m = mtxlock2mtx(c);
 
 	/* Reject option bits this function does not understand. */
 	MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE |
 		MTX_NOWITNESS | MTX_DUPOK | MTX_NOPROFILE)) == 0);
 	ASSERT_ATOMIC_LOAD_PTR(m->mtx_lock,
 	    ("%s: mtx_lock not aligned for %s: %p", __func__, name,
 	    &m->mtx_lock));
 
 #ifdef MUTEX_DEBUG
 	/* Diagnostic and error correction */
 	mtx_validate(m);
 #endif
 
 	/* Pick the lock class from MTX_SPIN. */
 	class = (opts & MTX_SPIN) != 0 ?
 	    &lock_class_mtx_spin : &lock_class_mtx_sleep;
 
 	/* Translate MTX_* options to LO_* flags; witness is on by default. */
 	flags = (opts & MTX_NOWITNESS) != 0 ? 0 : LO_WITNESS;
 	if ((opts & MTX_QUIET) != 0)
 		flags |= LO_QUIET;
 	if ((opts & MTX_RECURSE) != 0)
 		flags |= LO_RECURSABLE;
 	if ((opts & MTX_DUPOK) != 0)
 		flags |= LO_DUPOK;
 	if ((opts & MTX_NOPROFILE) != 0)
 		flags |= LO_NOPROFILE;
 
 	/* Set up the generic lock object, then the mutex-specific state. */
 	lock_init(&m->lock_object, class, name, type, flags);
 
 	m->mtx_lock = MTX_UNOWNED;
 	m->mtx_recurse = 0;
 }
 
 /*
  * Remove lock `m' from all_mtx queue.  We don't allow MTX_QUIET to be
  * passed in as a flag here because if the corresponding mtx_init() was
  * called with MTX_QUIET set, then it will already be set in the mutex's
  * flags.
  *
  * NOTE(review): no all_mtx queue is manipulated in this function; the
  * sentence above looks historical -- confirm.
  *
  * What the code does: the mutex behind cookie `c' must be either unowned
  * or owned by the caller without being recursed or contested.  If it is
  * owned, the per-thread bookkeeping normally done on unlock is performed
  * here.  The lock word is then set to MTX_DESTROYED and the lock object
  * is torn down with lock_destroy().
  */
 void
 _mtx_destroy(volatile uintptr_t *c)
 {
 	struct mtx *m;
 
 	m = mtxlock2mtx(c);
 
 	if (!mtx_owned(m))
 		MPASS(mtx_unowned(m));
 	else {
 		MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
 
 		/* Perform the non-mtx related part of mtx_unlock_spin(). */
 		if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin)
 			spinlock_exit();
 		else
 			curthread->td_locks--;
 
 		lock_profile_release_lock(&m->lock_object);
 		/* Tell witness this isn't locked to make it happy. */
 		WITNESS_UNLOCK(&m->lock_object, LOP_EXCLUSIVE, __FILE__,
 		    __LINE__);
 	}
 
 	/* Poison the lock word so later use can be detected. */
 	m->mtx_lock = MTX_DESTROYED;
 	lock_destroy(&m->lock_object);
 }
 
 /*
  * Initialize the mutex code and system mutexes.  This is called from the MD
  * startup code prior to mi_startup().  The per-CPU data space needs to be
  * setup before this is called.
  */
 void
 mutex_init(void)
 {
 
 	/* Setup turnstiles so that sleep mutexes work. */
 	init_turnstiles();
 
 	/*
 	 * Initialize mutexes.
 	 */
 	mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
 	/*
 	 * blocked_lock is the mutex thread_lock_block() points td_lock at.
 	 * Its lock word is overwritten right after initialization so that
 	 * it never reads as MTX_UNOWNED.
 	 */
 	mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN);
 	blocked_lock.mtx_lock = 0xdeadc0de;	/* Always blocked. */
 	/* Same names and options as proc_init() uses for every other proc. */
 	mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
 	mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
 	mtx_init(&devmtx, "cdev", NULL, MTX_DEF);
 	/* Giant is left held when this function returns. */
 	mtx_lock(&Giant);
 }
 
 #ifdef DDB
 /*
  * DDB helper: print the flags and the state of the mutex embedding `lock',
  * followed by its owner and recursion count when it is currently owned.
  */
 void
 db_show_mtx(const struct lock_object *lock)
 {
 	const struct mtx *m;
 	struct thread *owner;
 	int held;
 
 	m = (const struct mtx *)lock;
 
 	/* Static properties of the lock. */
 	db_printf(" flags: {");
 	if (LOCK_CLASS(lock) != &lock_class_mtx_spin)
 		db_printf("DEF");
 	else
 		db_printf("SPIN");
 	if (m->lock_object.lo_flags & LO_RECURSABLE)
 		db_printf(", RECURSE");
 	if (m->lock_object.lo_flags & LO_DUPOK)
 		db_printf(", DUPOK");
 	db_printf("}\n");
 
 	/* Dynamic state: the lock is held when neither unowned nor destroyed. */
 	db_printf(" state: {");
 	held = !mtx_unowned(m) && !mtx_destroyed(m);
 	if (held) {
 		db_printf("OWNED");
 		if (m->mtx_lock & MTX_CONTESTED)
 			db_printf(", CONTESTED");
 		if (m->mtx_lock & MTX_RECURSED)
 			db_printf(", RECURSED");
 	} else if (mtx_unowned(m))
 		db_printf("UNOWNED");
 	else
 		db_printf("DESTROYED");
 	db_printf("}\n");
 
 	if (!held)
 		return;
 
 	owner = mtx_owner(m);
 	db_printf(" owner: %p (tid %d, pid %d, \"%s\")\n", owner,
 	    owner->td_tid, owner->td_proc->p_pid, owner->td_name);
 	if (mtx_recursed(m))
 		db_printf(" recursed: %d\n", m->mtx_recurse);
 }
 #endif
Index: head/sys/kern/kern_priv.c
===================================================================
--- head/sys/kern/kern_priv.c	(revision 258540)
+++ head/sys/kern/kern_priv.c	(revision 258541)
@@ -1,185 +1,183 @@
 /*-
  * Copyright (c) 2006 nCircle Network Security, Inc.
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson for the TrustedBSD
  * Project under contract to nCircle Network Security, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY,
  * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "opt_kdtrace.h"
-
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * `suser_enabled' (which can be set by the security.bsd.suser_enabled
  * sysctl) determines whether the system 'super-user' policy is in effect.  If
  * it is nonzero, an effective uid of 0 connotes special privilege,
  * overriding many mandatory and discretionary protections.  If it is zero,
  * uid 0 is offered no special privilege in the kernel security policy.
  * Setting it to zero may seriously impact the functionality of many existing
  * userland programs, and should not be done without careful consideration of
  * the consequences.
  */
 static int	suser_enabled = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RW,
     &suser_enabled, 0, "processes with uid 0 have privilege");
 TUNABLE_INT("security.bsd.suser_enabled", &suser_enabled);
 
 /*
  * When nonzero, priv_check_cred() grants PRIV_VM_MLOCK and PRIV_VM_MUNLOCK
  * to any credential that was not vetoed by the MAC and jail checks.
  * Exposed as the security.bsd.unprivileged_mlock sysctl and tunable.
  */
 static int	unprivileged_mlock = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_mlock, CTLFLAG_RW|CTLFLAG_TUN,
     &unprivileged_mlock, 0, "Allow non-root users to call mlock(2)");
 TUNABLE_INT("security.bsd.unprivileged_mlock", &unprivileged_mlock);
 
 /*
  * SDT probes fired at the end of priv_check_cred() with the privilege
  * number as their single "int" argument: priv-ok when the check succeeds,
  * priv-err when it fails.
  */
 SDT_PROVIDER_DEFINE(priv);
 SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv_ok, priv-ok, "int");
 SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv_err, priv-err, "int");
 
 /*
  * Check a credential for privilege.  Lots of good reasons to deny privilege;
  * only a few to grant it.
  */
 int
 priv_check_cred(struct ucred *cred, int priv, int flags)
 {
 	int error;
 
 	KASSERT(PRIV_VALID(priv), ("priv_check_cred: invalid privilege %d",
 	    priv));
 
 	/*
 	 * We first evaluate policies that may deny the granting of
 	 * privilege unilaterally.
 	 */
 #ifdef MAC
 	error = mac_priv_check(cred, priv);
 	if (error)
 		goto out;
 #endif
 
 	/*
 	 * Jail policy will restrict certain privileges that may otherwise be
 	 * be granted.
 	 */
 	error = prison_priv_check(cred, priv);
 	if (error)
 		goto out;
 
 	if (unprivileged_mlock) {
 		/*
 		 * Allow unprivileged users to call mlock(2)/munlock(2) and
 		 * mlockall(2)/munlockall(2).
 		 */
 		switch (priv) {
 		case PRIV_VM_MLOCK:
 		case PRIV_VM_MUNLOCK:
 			error = 0;
 			goto out;
 		}
 	}
 
 	/*
 	 * Having determined if privilege is restricted by various policies,
 	 * now determine if privilege is granted.  At this point, any policy
 	 * may grant privilege.  For now, we allow short-circuit boolean
 	 * evaluation, so may not call all policies.  Perhaps we should.
 	 *
 	 * Superuser policy grants privilege based on the effective (or in
 	 * the case of specific privileges, real) uid being 0.  We allow the
 	 * superuser policy to be globally disabled, although this is
 	 * currenty of limited utility.
 	 */
 	if (suser_enabled) {
 		switch (priv) {
 		case PRIV_MAXFILES:
 		case PRIV_MAXPROC:
 		case PRIV_PROC_LIMIT:
 			if (cred->cr_ruid == 0) {
 				error = 0;
 				goto out;
 			}
 			break;
 		default:
 			if (cred->cr_uid == 0) {
 				error = 0;
 				goto out;
 			}
 			break;
 		}
 	}
 
 	/*
 	 * Writes to kernel/physical memory are a typical root-only operation,
 	 * but non-root users are expected to be able to read it (provided they
 	 * have permission to access /dev/[k]mem).
 	 */
 	if (priv == PRIV_KMEM_READ) {
 		error = 0;
 		goto out;
 	}
 
 	/*
 	 * Now check with MAC, if enabled, to see if a policy module grants
 	 * privilege.
 	 */
 #ifdef MAC
 	if (mac_priv_grant(cred, priv) == 0) {
 		error = 0;
 		goto out;
 	}
 #endif
 
 	/*
 	 * The default is deny, so if no policies have granted it, reject
 	 * with a privilege error here.
 	 */
 	error = EPERM;
 out:
 	if (error)
 		SDT_PROBE1(priv, kernel, priv_check, priv_err, priv);
 	else
 		SDT_PROBE1(priv, kernel, priv_check, priv_ok, priv);
 	return (error);
 }
 
 /*
  * Check privilege `priv' against the credential of thread `td', which must
  * be the current thread.  Returns whatever priv_check_cred() returns.
  */
 int
 priv_check(struct thread *td, int priv)
 {
 	struct ucred *cred;
 
 	KASSERT(td == curthread, ("priv_check: td != curthread"));
 
 	cred = td->td_ucred;
 	return (priv_check_cred(cred, priv, 0));
 }
Index: head/sys/kern/kern_proc.c
===================================================================
--- head/sys/kern/kern_proc.c	(revision 258540)
+++ head/sys/kern/kern_proc.c	(revision 258541)
@@ -1,2742 +1,2741 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_proc.c	8.7 (Berkeley) 2/14/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
-#include "opt_kdtrace.h"
 #include "opt_ktrace.h"
 #include "opt_kstack_pages.h"
 #include "opt_stack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/elf.h>
 #include <sys/exec.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/loginclass.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sysent.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/stack.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/filedesc.h>
 #include <sys/tty.h>
 #include <sys/signalvar.h>
 #include <sys/sdt.h>
 #include <sys/sx.h>
 #include <sys/user.h>
 #include <sys/jail.h>
 #include <sys/vnode.h>
 #include <sys/eventhandler.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_util.h>
 #endif
 
 /*
  * SDT probes for the "proc" provider.  They are fired on entry to and on
  * return from the proc zone callbacks proc_ctor(), proc_dtor() and
  * proc_init() below; the strings give the types of the probe arguments.
  */
 SDT_PROVIDER_DEFINE(proc);
 SDT_PROBE_DEFINE4(proc, kernel, ctor, entry, entry, "struct proc *", "int",
     "void *", "int");
 SDT_PROBE_DEFINE4(proc, kernel, ctor, return, return, "struct proc *", "int",
     "void *", "int");
 SDT_PROBE_DEFINE4(proc, kernel, dtor, entry, entry, "struct proc *", "int",
     "void *", "struct thread *");
 SDT_PROBE_DEFINE3(proc, kernel, dtor, return, return, "struct proc *", "int",
     "void *");
 SDT_PROBE_DEFINE3(proc, kernel, init, entry, entry, "struct proc *", "int",
     "int");
 SDT_PROBE_DEFINE3(proc, kernel, init, return, return, "struct proc *", "int",
     "int");
 
 /* Malloc types; only M_PROC is private to this file. */
 MALLOC_DEFINE(M_PGRP, "pgrp", "process group header");
 MALLOC_DEFINE(M_SESSION, "session", "session header");
 static MALLOC_DEFINE(M_PROC, "proc", "Proc structures");
 MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures");
 
 /* Forward declarations of the file-local helpers. */
 static void doenterpgrp(struct proc *, struct pgrp *);
 static void orphanpg(struct pgrp *pg);
 static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp);
 static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp);
 static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp,
     int preferthread);
 static void pgadjustjobc(struct pgrp *pgrp, int entering);
 static void pgdelete(struct pgrp *);
 /* Callbacks handed to uma_zcreate() for the "PROC" zone in procinit(). */
 static int proc_ctor(void *mem, int size, void *arg, int flags);
 static void proc_dtor(void *mem, int size, void *arg);
 static int proc_init(void *mem, int size, int flags);
 static void proc_fini(void *mem, int size);
 static void pargs_free(struct pargs *pa);
 static struct proc *zpfind_locked(pid_t pid);
 
 /*
  * Other process lists
  *
  * All of the objects below are set up by procinit().
  */
 struct pidhashhead *pidhashtbl;		/* pid hash table, from hashinit() */
 u_long pidhash;				/* filled in by hashinit() for pidhashtbl */
 struct pgrphashhead *pgrphashtbl;	/* pgrp hash table, from hashinit() */
 u_long pgrphash;			/* filled in by hashinit() for pgrphashtbl */
 struct proclist allproc;
 struct proclist zombproc;
 struct sx allproc_lock;			/* asserted held by the pfind*_locked() lookups */
 struct sx proctree_lock;		/* asserted held by the pgrp routines below */
 struct mtx ppeers_lock;
 uma_zone_t proc_zone;			/* the "PROC" UMA zone */
 
 /* Read-only sysctl kern.kstack_pages, initialized from KSTACK_PAGES. */
 int kstack_pages = KSTACK_PAGES;
 SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0,
     "Kernel stack size in pages");
 
 /* Compile-time checks pinning the size of struct kinfo_proc. */
 CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
 #ifdef COMPAT_FREEBSD32
 CTASSERT(sizeof(struct kinfo_proc32) == KINFO_PROC32_SIZE);
 #endif
 
 /*
  * Initialize global process hashing structures.
  *
  * Besides the pid and pgrp hash tables this also sets up allproc_lock,
  * proctree_lock and ppeers_lock, the allproc and zombproc lists and the
  * "PROC" UMA zone, and finally calls uihashinit().
  */
 void
 procinit(void)
 {
 
 	sx_init(&allproc_lock, "allproc");
 	sx_init(&proctree_lock, "proctree");
 	mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
 	LIST_INIT(&allproc);
 	LIST_INIT(&zombproc);
 	/* Both hash tables are sized from maxproc / 4. */
 	pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
 	pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
 	/*
 	 * The item size comes from sched_sizeof_proc().  UMA_ZONE_NOFREE:
 	 * proc_fini() panics if an item is ever handed back.
 	 */
 	proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
 	    proc_ctor, proc_dtor, proc_init, proc_fini,
 	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uihashinit();
 }
 
 /*
  * Zone constructor: prepare a proc for use.  Runs the process_ctor event
  * handlers, bracketed by the ctor entry/return probes.  Always returns 0.
  */
 static int
 proc_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct proc *p = mem;
 
 	SDT_PROBE(proc, kernel, ctor , entry, p, size, arg, flags, 0);
 	EVENTHANDLER_INVOKE(process_ctor, p);
 	SDT_PROBE(proc, kernel, ctor , return, p, size, arg, flags, 0);
 
 	return (0);
 }
 
 /*
  * Zone destructor: reclaim a proc after use.  Releases the OSD of the
  * first thread, if the proc still has one, and runs the process_dtor
  * event handlers, bracketed by the dtor entry/return probes.
  */
 static void
 proc_dtor(void *mem, int size, void *arg)
 {
 	struct proc *p = mem;
 	struct thread *first;
 
 	/* INVARIANTS checks go here */
 	first = FIRST_THREAD_IN_PROC(p);
 	SDT_PROBE(proc, kernel, dtor, entry, p, size, arg, first, 0);
 	if (first != NULL) {
 #ifdef INVARIANTS
 		KASSERT(p->p_numthreads == 1,
 		    ("bad number of threads in exiting process"));
 		KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr"));
 #endif
 		/* Free all OSD associated to this thread. */
 		osd_thread_exit(first);
 	}
 	EVENTHANDLER_INVOKE(process_dtor, p);
 	/* A SIGCHLD ksiginfo, when present, must not be queued any more. */
 	KASSERT(p->p_ksi == NULL || !KSI_ONQ(p->p_ksi), ("SIGCHLD queue"));
 	SDT_PROBE(proc, kernel, dtor, return, p, size, arg, 0, 0);
 }
 
 /*
  * Initialize type-stable parts of a proc (when newly created).
  *
  * Sets up the scheduler data pointer, the two per-process mutexes, the
  * condition variables, the thread list and the pstats, and runs the
  * process_init event handlers.  Always returns 0.
  */
 static int
 proc_init(void *mem, int size, int flags)
 {
 	struct proc *p;
 
 	p = (struct proc *)mem;
 	SDT_PROBE(proc, kernel, init, entry, p, size, flags, 0, 0);
 	/*
 	 * p_sched points at the memory directly following the struct proc;
 	 * the zone item size is sched_sizeof_proc() (see procinit()).
 	 */
 	p->p_sched = (struct p_sched *)&p[1];
 	/* Zero the mutex storage before initializing it. */
 	bzero(&p->p_mtx, sizeof(struct mtx));
 	mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
 	mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
 	cv_init(&p->p_pwait, "ppwait");
 	cv_init(&p->p_dbgwait, "dbgwait");
 	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
 	EVENTHANDLER_INVOKE(process_init, p);
 	p->p_stats = pstats_alloc();
 	SDT_PROBE(proc, kernel, init, return, p, size, flags, 0, 0);
 	return (0);
 }
 
 /*
  * UMA should ensure that this function is never called.
  * Freeing a proc structure would violate type stability.
  *
  * The zone is created with UMA_ZONE_NOFREE in procinit(), so reaching
  * this function panics.  The teardown code under `notnow' is compiled out.
  */
 static void
 proc_fini(void *mem, int size)
 {
 #ifdef notnow
 	struct proc *p;
 
 	p = (struct proc *)mem;
 	EVENTHANDLER_INVOKE(process_fini, p);
 	pstats_free(p->p_stats);
 	thread_free(FIRST_THREAD_IN_PROC(p));
 	mtx_destroy(&p->p_mtx);
 	if (p->p_ksi != NULL)
 		ksiginfo_free(p->p_ksi);
 #else
 	panic("proc reclaimed");
 #endif
 }
 
 /*
  * Is p an inferior of the current process?
  *
  * Follows the parent chain starting at `p'.  Returns 1 if curproc is
  * reached (including when `p' is curproc itself) and 0 if the process with
  * pid 0 is reached first.  The caller must hold proctree_lock.
  */
 int
 inferior(struct proc *p)
 {
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	for (; p != curproc; p = p->p_pptr)
 		if (p->p_pid == 0)
 			return (0);
 	return (1);
 }
 
 /*
  * Look up `pid' in the pid hash.  The caller must hold allproc_lock.
  * Returns the process with its proc lock held, or NULL if there is no
  * such pid or the process is still in the PRS_NEW state.
  */
 struct proc *
 pfind_locked(pid_t pid)
 {
 	struct proc *p;
 
 	sx_assert(&allproc_lock, SX_LOCKED);
 	LIST_FOREACH(p, PIDHASH(pid), p_hash) {
 		if (p->p_pid == pid) {
 			PROC_LOCK(p);
 			/* Hide processes that are not fully set up yet. */
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				p = NULL;
 			}
 			break;
 		}
 	}
 	/* p is NULL here when the hash chain was exhausted without a match. */
 	return (p);
 }
 
 /*
  * Look a process up by pid, taking allproc_lock around the search.
  * Only "live" processes are returned -- neither zombies nor newly born
  * processes that are still in the PRS_NEW state -- so callers need not
  * test for that state before dereferencing p_ucred and friends.
  * The process is returned by pfind_locked() with its proc lock held.
  */
 struct proc *
 pfind(pid_t pid)
 {
 	struct proc *found;
 
 	sx_slock(&allproc_lock);
 	found = pfind_locked(pid);
 	sx_sunlock(&allproc_lock);
 
 	return (found);
 }
 
 /*
  * Find the process that owns the thread with id `tid' by scanning every
  * process in the system.  The caller must hold allproc_lock.  Returns the
  * process with its proc lock held, or NULL if no thread matches.
  * Processes in the PRS_NEW state are skipped.
  */
 static struct proc *
 pfind_tid_locked(pid_t tid)
 {
 	struct proc *p;
 	struct thread *td;
 
 	sx_assert(&allproc_lock, SX_LOCKED);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		FOREACH_THREAD_IN_PROC(p, td) {
 			/* Leave with the proc lock still held. */
 			if (td->td_tid == tid)
 				goto found;
 		}
 		PROC_UNLOCK(p);
 	}
 found:
 	return (p);
 }
 
 /*
  * Locate a process group by number.
  * The caller must hold proctree_lock.
  *
  * Returns the process group with its pgrp lock held, or NULL if no group
  * with id `pgid' is in the hash.
  */
 struct pgrp *
 pgfind(pid_t pgid)
 {
 	struct pgrp *pgrp;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 
 	LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
 		if (pgrp->pg_id == pgid) {
 			PGRP_LOCK(pgrp);
 			return (pgrp);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Locate process and do additional manipulations, depending on flags.
  *
  * `pid' is looked up as a process id when it is <= PID_MAX; a larger value
  * is treated as a thread id unless PGET_NOTID is set.  On success 0 is
  * returned and the process is stored in *pp.  With PGET_HOLD the process
  * is held with _PHOLD() and returned unlocked; otherwise PROC_UNLOCK() is
  * not called here and the process is handed back as the lookup left it.
  * On failure an errno value is returned, the process is unlocked, and *pp
  * is left untouched.
  */
 int
 pget(pid_t pid, int flags, struct proc **pp)
 {
 	struct proc *p;
 	int error;
 
 	sx_slock(&allproc_lock);
 	if (pid <= PID_MAX) {
 		p = pfind_locked(pid);
 		/* Fall back to the zombie lookup unless exiting procs are unwanted. */
 		if (p == NULL && (flags & PGET_NOTWEXIT) == 0)
 			p = zpfind_locked(pid);
 	} else if ((flags & PGET_NOTID) == 0) {
 		p = pfind_tid_locked(pid);
 	} else {
 		p = NULL;
 	}
 	sx_sunlock(&allproc_lock);
 	if (p == NULL)
 		return (ESRCH);
 	/* Optional checks requested by the caller; each failure unlocks p. */
 	if ((flags & PGET_CANSEE) != 0) {
 		error = p_cansee(curthread, p);
 		if (error != 0)
 			goto errout;
 	}
 	if ((flags & PGET_CANDEBUG) != 0) {
 		error = p_candebug(curthread, p);
 		if (error != 0)
 			goto errout;
 	}
 	if ((flags & PGET_ISCURRENT) != 0 && curproc != p) {
 		error = EPERM;
 		goto errout;
 	}
 	if ((flags & PGET_NOTWEXIT) != 0 && (p->p_flag & P_WEXIT) != 0) {
 		error = ESRCH;
 		goto errout;
 	}
 	if ((flags & PGET_NOTINEXEC) != 0 && (p->p_flag & P_INEXEC) != 0) {
 		/*
 		 * XXXRW: Not clear ESRCH is the right error during proc
 		 * execve().
 		 */
 		error = ESRCH;
 		goto errout;
 	}
 	/* With PGET_HOLD the hold replaces the proc lock. */
 	if ((flags & PGET_HOLD) != 0) {
 		_PHOLD(p);
 		PROC_UNLOCK(p);
 	}
 	*pp = p;
 	return (0);
 errout:
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Create a new process group.
  * pgid must be equal to the pid of p.
  * Begin a new session if required.
  *
  * `pgrp' is caller-supplied storage for the new group and must not be
  * NULL.  `sess' is caller-supplied storage for a new session, or NULL to
  * keep the process in its current session.  The caller must hold
  * proctree_lock exclusively.  Always returns 0.
  */
 int
 enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess)
 {
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 
 	KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
 	KASSERT(p->p_pid == pgid,
 	    ("enterpgrp: new pgrp and pid != pgid"));
 	KASSERT(pgfind(pgid) == NULL,
 	    ("enterpgrp: pgrp with pgid exists"));
 	KASSERT(!SESS_LEADER(p),
 	    ("enterpgrp: session leader attempted setpgrp"));
 
 	mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
 
 	if (sess != NULL) {
 		/*
 		 * new session
 		 */
 		mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
 		PROC_LOCK(p);
 		p->p_flag &= ~P_CONTROLT;
 		PROC_UNLOCK(p);
 		PGRP_LOCK(pgrp);
 		sess->s_leader = p;
 		sess->s_sid = p->p_pid;
 		refcount_init(&sess->s_count, 1);
 		sess->s_ttyvp = NULL;
 		sess->s_ttydp = NULL;
 		sess->s_ttyp = NULL;
 		/* The login name is inherited from the old session. */
 		bcopy(p->p_session->s_login, sess->s_login,
 			    sizeof(sess->s_login));
 		pgrp->pg_session = sess;
 		KASSERT(p == curproc,
 		    ("enterpgrp: mksession and p != curproc"));
 	} else {
 		/* Same session: the new group takes a reference on it. */
 		pgrp->pg_session = p->p_session;
 		sess_hold(pgrp->pg_session);
 		PGRP_LOCK(pgrp);
 	}
 	pgrp->pg_id = pgid;
 	LIST_INIT(&pgrp->pg_members);
 
 	/*
 	 * As we have an exclusive lock of proctree_lock,
 	 * this should not deadlock.
 	 */
 	LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
 	pgrp->pg_jobc = 0;
 	SLIST_INIT(&pgrp->pg_sigiolst);
 	PGRP_UNLOCK(pgrp);
 
 	doenterpgrp(p, pgrp);
 
 	return (0);
 }
 
 /*
  * Move p to an existing process group
  *
  * `pgrp' must belong to the session of `p' and must not already be the
  * group of `p'.  The caller must hold proctree_lock exclusively and none
  * of the proc, pgrp or session locks involved.  Always returns 0.
  */
 int
 enterthispgrp(struct proc *p, struct pgrp *pgrp)
 {
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
 	KASSERT(pgrp->pg_session == p->p_session,
 		("%s: pgrp's session %p, p->p_session %p.\n",
 		__func__,
 		pgrp->pg_session,
 		p->p_session));
 	KASSERT(pgrp != p->p_pgrp,
 		("%s: p belongs to pgrp.", __func__));
 
 	doenterpgrp(p, pgrp);
 
 	return (0);
 }
 
 /*
  * Move p to a process group
  *
  * Common tail of enterpgrp() and enterthispgrp(): fixes up the job control
  * counts, relinks `p' from its old group into `pgrp', and deletes the old
  * group if it became empty.  The caller must hold proctree_lock
  * exclusively and none of the proc, pgrp or session locks involved.
  */
 static void
 doenterpgrp(struct proc *p, struct pgrp *pgrp)
 {
 	struct pgrp *savepgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
 
 	savepgrp = p->p_pgrp;
 
 	/*
 	 * Adjust eligibility of affected pgrps to participate in job control.
 	 * Increment eligibility counts before decrementing, otherwise we
 	 * could reach 0 spuriously during the first call.
 	 */
 	fixjobc(p, pgrp, 1);
 	fixjobc(p, p->p_pgrp, 0);
 
 	/* Lock the new group, then the old one, then the process. */
 	PGRP_LOCK(pgrp);
 	PGRP_LOCK(savepgrp);
 	PROC_LOCK(p);
 	LIST_REMOVE(p, p_pglist);
 	p->p_pgrp = pgrp;
 	PROC_UNLOCK(p);
 	LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
 	PGRP_UNLOCK(savepgrp);
 	PGRP_UNLOCK(pgrp);
 	if (LIST_EMPTY(&savepgrp->pg_members))
 		pgdelete(savepgrp);
 }
 
 /*
  * remove process from process group
  *
  * Unlinks `p' from its process group, clears p->p_pgrp, and deletes the
  * group if it became empty.  The caller must hold proctree_lock
  * exclusively.  Always returns 0.
  */
 int
 leavepgrp(struct proc *p)
 {
 	struct pgrp *savepgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	savepgrp = p->p_pgrp;
 	PGRP_LOCK(savepgrp);
 	PROC_LOCK(p);
 	LIST_REMOVE(p, p_pglist);
 	p->p_pgrp = NULL;
 	PROC_UNLOCK(p);
 	PGRP_UNLOCK(savepgrp);
 	if (LIST_EMPTY(&savepgrp->pg_members))
 		pgdelete(savepgrp);
 	return (0);
 }
 
 /*
  * delete a process group
  *
  * Removes `pgrp' from the pgrp hash, detaches it from its session's tty
  * if there is one, destroys its mutex, frees it, and drops the reference
  * it held on its session.  The caller must hold proctree_lock exclusively
  * and neither the pgrp lock nor the session lock.
  */
 static void
 pgdelete(struct pgrp *pgrp)
 {
 	struct session *savesess;
 	struct tty *tp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
 
 	/*
 	 * Reset any sigio structures pointing to us as a result of
 	 * F_SETOWN with our pgid.
 	 */
 	funsetownlst(&pgrp->pg_sigiolst);
 
 	PGRP_LOCK(pgrp);
 	tp = pgrp->pg_session->s_ttyp;
 	LIST_REMOVE(pgrp, pg_hash);
 	/* Remember the session; pgrp is freed before it is released. */
 	savesess = pgrp->pg_session;
 	PGRP_UNLOCK(pgrp);
 
 	/* Remove the reference to the pgrp before deallocating it. */
 	if (tp != NULL) {
 		/*
 		 * NOTE(review): there is no matching unlock here;
 		 * presumably tty_rel_pgrp() drops the tty lock -- confirm.
 		 */
 		tty_lock(tp);
 		tty_rel_pgrp(tp, pgrp);
 	}
 
 	mtx_destroy(&pgrp->pg_mtx);
 	free(pgrp, M_PGRP);
 	sess_release(savesess);
 }
 
 /*
  * Adjust the job control count of `pgrp' under its pgrp lock: increment
  * it when `entering' is nonzero, otherwise decrement it and call
  * orphanpg() if it dropped to zero.
  */
 static void
 pgadjustjobc(struct pgrp *pgrp, int entering)
 {
 
 	PGRP_LOCK(pgrp);
 	if (entering)
 		pgrp->pg_jobc++;
 	else {
 		--pgrp->pg_jobc;
 		if (pgrp->pg_jobc == 0)
 			orphanpg(pgrp);
 	}
 	PGRP_UNLOCK(pgrp);
 }
 
 /*
  * Adjust pgrp jobc counters when specified process changes process group.
  * We count the number of processes in each process group that "qualify"
  * the group for terminal job control (those with a parent in a different
  * process group of the same session).  If that count reaches zero, the
  * process group becomes orphaned.  Check both the specified process'
  * process group and that of its children.
  * entering == 0 => p is leaving specified group.
  * entering == 1 => p is entering specified group.
  *
  * The caller must hold proctree_lock and must not hold the proc lock of
  * 'p', the pgrp lock of 'pgrp' or the lock of pgrp's session.
  */
 void
 fixjobc(struct proc *p, struct pgrp *pgrp, int entering)
 {
 	struct pgrp *hispgrp;
 	struct session *mysession;
 	struct proc *child;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
 
 	/*
 	 * Check p's parent to see whether p qualifies its own process
 	 * group; if so, adjust count for p's process group.
 	 */
 	mysession = pgrp->pg_session;
 	if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
 	    hispgrp->pg_session == mysession)
 		pgadjustjobc(pgrp, entering);
 
 	/*
 	 * Check this process' children to see whether they qualify
 	 * their process groups; if so, adjust counts for children's
 	 * process groups.  Zombie children are not counted.
 	 */
 	LIST_FOREACH(child, &p->p_children, p_sibling) {
 		hispgrp = child->p_pgrp;
 		if (hispgrp == pgrp ||
 		    hispgrp->pg_session != mysession)
 			continue;
 		PROC_LOCK(child);
 		if (child->p_state == PRS_ZOMBIE) {
 			PROC_UNLOCK(child);
 			continue;
 		}
 		PROC_UNLOCK(child);
 		pgadjustjobc(hispgrp, entering);
 	}
 }
 
 /*
  * A process group has become orphaned;
  * if there are any stopped processes in the group,
  * hang-up all process in that group.
  *
  * Must be called with the pgrp lock of 'pg' held.  As soon as one member
  * satisfying P_SHOULDSTOP() is found, every member of the group is sent
  * SIGHUP followed by SIGCONT and the function returns.
  */
 static void
 orphanpg(struct pgrp *pg)
 {
 	struct proc *p, *member;
 
 	PGRP_LOCK_ASSERT(pg, MA_OWNED);
 
 	LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 		PROC_LOCK(p);
 		if (P_SHOULDSTOP(p)) {
 			PROC_UNLOCK(p);
 			/* One stopped member is enough: signal them all. */
 			LIST_FOREACH(member, &pg->pg_members, p_pglist) {
 				PROC_LOCK(member);
 				kern_psignal(member, SIGHUP);
 				kern_psignal(member, SIGCONT);
 				PROC_UNLOCK(member);
 			}
 			return;
 		}
 		PROC_UNLOCK(p);
 	}
 }
 
 /*
  * Acquire an additional reference on session 's' (s_count).
  */
 void
 sess_hold(struct session *s)
 {
 
 	refcount_acquire(&s->s_count);
 }
 
 /*
  * Drop one reference on session 's'.  When the last reference goes away
  * the session's tty (if any) is asked to forget the session, and the
  * session mutex and memory are released.
  */
 void
 sess_release(struct session *s)
 {
 
 	/* Somebody else still holds a reference: nothing more to do. */
 	if (!refcount_release(&s->s_count))
 		return;
 
 	if (s->s_ttyp != NULL) {
 		tty_lock(s->s_ttyp);
 		tty_rel_sess(s->s_ttyp, s);
 	}
 	mtx_destroy(&s->s_mtx);
 	free(s, M_SESSION);
 }
 
 #ifdef DDB
 
 /*
  * DDB "show pgrpdump": walk every bucket of the process group hash table
  * and print each group found there followed by its member processes.
  */
 DB_SHOW_COMMAND(pgrpdump, pgrpdump)
 {
 	struct pgrp *pg;
 	struct proc *member;
 	int idx;
 
 	for (idx = 0; idx <= pgrphash; idx++) {
 		/* Empty buckets produce no output at all. */
 		if (LIST_EMPTY(&pgrphashtbl[idx]))
 			continue;
 		printf("\tindx %d\n", idx);
 		LIST_FOREACH(pg, &pgrphashtbl[idx], pg_hash) {
 			printf(
 		"\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
 			    (void *)pg, (long)pg->pg_id,
 			    (void *)pg->pg_session,
 			    pg->pg_session->s_count,
 			    (void *)LIST_FIRST(&pg->pg_members));
 			LIST_FOREACH(member, &pg->pg_members, p_pglist) {
 				printf("\t\tpid %ld addr %p pgrp %p\n",
 				    (long)member->p_pid, (void *)member,
 				    (void *)member->p_pgrp);
 			}
 		}
 	}
 }
 #endif /* DDB */
 
 /*
  * Compute the kinfo_proc fields that are sums over all threads of the
  * process: ki_pctcpu and ki_estcpu.
  * The target process must be locked by the caller.
  */
 static void
 fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp)
 {
 	struct thread *thr;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* Start from zero, then add each thread's share under its lock. */
 	kp->ki_pctcpu = 0;
 	kp->ki_estcpu = 0;
 	FOREACH_THREAD_IN_PROC(p, thr) {
 		thread_lock(thr);
 		kp->ki_estcpu += thr->td_estcpu;
 		kp->ki_pctcpu += sched_pctcpu(thr);
 		thread_unlock(thr);
 	}
 }
 
 /*
  * Clear kinfo_proc and fill in any information that is common
  * to all threads in the process.
  * Must be called with the target process locked.
  *
  * The structure is zeroed first, so any field not explicitly assigned
  * below is reported as zero.
  */
 static void
 fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
 {
 	struct thread *td0;
 	struct tty *tp;
 	struct session *sp;
 	struct ucred *cred;
 	struct sigacts *ps;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	bzero(kp, sizeof(*kp));
 
 	/* Raw kernel pointers and flags copied straight from the proc. */
 	kp->ki_structsize = sizeof(*kp);
 	kp->ki_paddr = p;
 	kp->ki_addr =/* p->p_addr; */0; /* XXX */
 	kp->ki_args = p->p_args;
 	kp->ki_textvp = p->p_textvp;
 #ifdef KTRACE
 	kp->ki_tracep = p->p_tracevp;
 	kp->ki_traceflag = p->p_traceflag;
 #endif
 	kp->ki_fd = p->p_fd;
 	kp->ki_vmspace = p->p_vmspace;
 	kp->ki_flag = p->p_flag;
 	kp->ki_flag2 = p->p_flag2;
 	/* Credentials: ids, group list (capped at KI_NGROUPS), jail, class. */
 	cred = p->p_ucred;
 	if (cred) {
 		kp->ki_uid = cred->cr_uid;
 		kp->ki_ruid = cred->cr_ruid;
 		kp->ki_svuid = cred->cr_svuid;
 		kp->ki_cr_flags = 0;
 		if (cred->cr_flags & CRED_FLAG_CAPMODE)
 			kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE;
 		/* XXX bde doesn't like KI_NGROUPS */
 		if (cred->cr_ngroups > KI_NGROUPS) {
 			/* Truncate the list and tell the consumer we did. */
 			kp->ki_ngroups = KI_NGROUPS;
 			kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW;
 		} else
 			kp->ki_ngroups = cred->cr_ngroups;
 		bcopy(cred->cr_groups, kp->ki_groups,
 		    kp->ki_ngroups * sizeof(gid_t));
 		kp->ki_rgid = cred->cr_rgid;
 		kp->ki_svgid = cred->cr_svgid;
 		/* If jailed(cred), emulate the old P_JAILED flag. */
 		if (jailed(cred)) {
 			kp->ki_flag |= P_JAILED;
 			/* If inside the jail, use 0 as a jail ID. */
 			if (cred->cr_prison != curthread->td_ucred->cr_prison)
 				kp->ki_jid = cred->cr_prison->pr_id;
 		}
 		strlcpy(kp->ki_loginclass, cred->cr_loginclass->lc_name,
 		    sizeof(kp->ki_loginclass));
 	}
 	/* Ignored/caught signal sets, read under the sigacts mutex. */
 	ps = p->p_sigacts;
 	if (ps) {
 		mtx_lock(&ps->ps_mtx);
 		kp->ki_sigignore = ps->ps_sigignore;
 		kp->ki_sigcatch = ps->ps_sigcatch;
 		mtx_unlock(&ps->ps_mtx);
 	}
 	/*
 	 * Address space sizes are only reported for processes that are
 	 * neither new nor zombies and that still have a vmspace.
 	 */
 	if (p->p_state != PRS_NEW &&
 	    p->p_state != PRS_ZOMBIE &&
 	    p->p_vmspace != NULL) {
 		struct vmspace *vm = p->p_vmspace;
 
 		kp->ki_size = vm->vm_map.size;
 		kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/
 		/* Add the kernel stack pages of threads not swapped out. */
 		FOREACH_THREAD_IN_PROC(p, td0) {
 			if (!TD_IS_SWAPPED(td0))
 				kp->ki_rssize += td0->td_kstack_pages;
 		}
 		kp->ki_swrss = vm->vm_swrss;
 		kp->ki_tsize = vm->vm_tsize;
 		kp->ki_dsize = vm->vm_dsize;
 		kp->ki_ssize = vm->vm_ssize;
 	} else if (p->p_state == PRS_ZOMBIE)
 		kp->ki_stat = SZOMB;
 	if (kp->ki_flag & P_INMEM)
 		kp->ki_sflag = PS_INMEM;
 	else
 		kp->ki_sflag = 0;
 	/* Calculate legacy swtime as seconds since 'swtick'. */
 	kp->ki_swtime = (ticks - p->p_swtick) / hz;
 	kp->ki_pid = p->p_pid;
 	kp->ki_nice = p->p_nice;
 	kp->ki_fibnum = p->p_fibnum;
 	/* p_start is adjusted by boottime before being reported. */
 	kp->ki_start = p->p_stats->p_start;
 	timevaladd(&kp->ki_start, &boottime);
 	/* Resource usage and run time are sampled under the proc spin lock. */
 	PROC_SLOCK(p);
 	rufetch(p, &kp->ki_rusage);
 	kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime);
 	calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime);
 	PROC_SUNLOCK(p);
 	calccru(p, &kp->ki_childutime, &kp->ki_childstime);
 	/* Some callers want child times in a single value. */
 	kp->ki_childtime = kp->ki_childstime;
 	timevaladd(&kp->ki_childtime, &kp->ki_childutime);
 
 	/* ki_cow is the sum of td_cow over all threads. */
 	FOREACH_THREAD_IN_PROC(p, td0)
 		kp->ki_cow += td0->td_cow;
 
 	/* Process group, session and controlling terminal information. */
 	tp = NULL;
 	if (p->p_pgrp) {
 		kp->ki_pgid = p->p_pgrp->pg_id;
 		kp->ki_jobc = p->p_pgrp->pg_jobc;
 		sp = p->p_pgrp->pg_session;
 
 		if (sp != NULL) {
 			kp->ki_sid = sp->s_sid;
 			SESS_LOCK(sp);
 			strlcpy(kp->ki_login, sp->s_login,
 			    sizeof(kp->ki_login));
 			if (sp->s_ttyvp)
 				kp->ki_kiflag |= KI_CTTY;
 			if (SESS_LEADER(p))
 				kp->ki_kiflag |= KI_SLEADER;
 			/* XXX proctree_lock */
 			tp = sp->s_ttyp;
 			SESS_UNLOCK(sp);
 		}
 	}
 	/* Without a controlling tty the device is reported as NODEV. */
 	if ((p->p_flag & P_CONTROLT) && tp != NULL) {
 		kp->ki_tdev = tty_udev(tp);
 		kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
 		if (tp->t_session)
 			kp->ki_tsid = tp->t_session->s_sid;
 	} else
 		kp->ki_tdev = NODEV;
 	if (p->p_comm[0] != '\0')
 		strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm));
 	if (p->p_sysent && p->p_sysent->sv_name != NULL &&
 	    p->p_sysent->sv_name[0] != '\0')
 		strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul));
 	kp->ki_siglist = p->p_siglist;
 	kp->ki_xstat = p->p_xstat;
 	kp->ki_acflag = p->p_acflag;
 	kp->ki_lock = p->p_lock;
 	if (p->p_pptr)
 		kp->ki_ppid = p->p_pptr->p_pid;
 }
 
 /*
  * Fill in information that is thread specific.  Must be called with
  * target process locked.  If 'preferthread' is set, overwrite certain
  * process-related fields that are maintained for both threads and
  * processes.
  *
  * When 'preferthread' is set the proc spin lock is taken before, and
  * released after, the thread lock.
  */
 static void
 fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	kp->ki_tdaddr = td;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	if (preferthread)
 		PROC_SLOCK(p);
 	thread_lock(td);
 	/* Wait message, thread name and (if blocked on a lock) lock name. */
 	if (td->td_wmesg != NULL)
 		strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg));
 	else
 		bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg));
 	strlcpy(kp->ki_tdname, td->td_name, sizeof(kp->ki_tdname));
 	if (TD_ON_LOCK(td)) {
 		kp->ki_kiflag |= KI_LOCKBLOCK;
 		strlcpy(kp->ki_lockname, td->td_lockname,
 		    sizeof(kp->ki_lockname));
 	} else {
 		kp->ki_kiflag &= ~KI_LOCKBLOCK;
 		bzero(kp->ki_lockname, sizeof(kp->ki_lockname));
 	}
 
 	/* Derive the legacy process state from process and thread state. */
 	if (p->p_state == PRS_NORMAL) { /* approximate. */
 		if (TD_ON_RUNQ(td) ||
 		    TD_CAN_RUN(td) ||
 		    TD_IS_RUNNING(td)) {
 			kp->ki_stat = SRUN;
 		} else if (P_SHOULDSTOP(p)) {
 			kp->ki_stat = SSTOP;
 		} else if (TD_IS_SLEEPING(td)) {
 			kp->ki_stat = SSLEEP;
 		} else if (TD_ON_LOCK(td)) {
 			kp->ki_stat = SLOCK;
 		} else {
 			kp->ki_stat = SWAIT;
 		}
 	} else if (p->p_state == PRS_ZOMBIE) {
 		kp->ki_stat = SZOMB;
 	} else {
 		kp->ki_stat = SIDL;
 	}
 
 	/* Things in the thread */
 	kp->ki_wchan = td->td_wchan;
 	kp->ki_pri.pri_level = td->td_priority;
 	kp->ki_pri.pri_native = td->td_base_pri;
 	kp->ki_lastcpu = td->td_lastcpu;
 	kp->ki_oncpu = td->td_oncpu;
 	kp->ki_tdflags = td->td_flags;
 	kp->ki_tid = td->td_tid;
 	kp->ki_numthreads = p->p_numthreads;
 	kp->ki_pcb = td->td_pcb;
 	kp->ki_kstack = (void *)td->td_kstack;
 	kp->ki_slptime = (ticks - td->td_slptick) / hz;
 	kp->ki_pri.pri_class = td->td_pri_class;
 	kp->ki_pri.pri_user = td->td_user_pri;
 
 	/* Replace the process-wide values with this thread's own. */
 	if (preferthread) {
 		rufetchtd(td, &kp->ki_rusage);
 		kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime);
 		kp->ki_pctcpu = sched_pctcpu(td);
 		kp->ki_estcpu = td->td_estcpu;
 		kp->ki_cow = td->td_cow;
 	}
 
 	/* We can't get this anymore but ps etc never used it anyway. */
 	kp->ki_rqindex = 0;
 
 	if (preferthread)
 		kp->ki_siglist = td->td_siglist;
 	kp->ki_sigmask = td->td_sigmask;
 	thread_unlock(td);
 	if (preferthread)
 		PROC_SUNLOCK(p);
 }
 
 /*
  * Fill in a kinfo_proc structure for the specified process.
  * Must be called with the target process locked.
  *
  * The process-wide fields are filled first, then the thread fields of the
  * first thread (without overriding process-wide values), and finally the
  * per-thread CPU figures are summed over all threads.  The process must
  * have at least one thread.
  */
 void
 fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
 {
 
 	MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
 
 	fill_kinfo_proc_only(p, kp);
 	fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp, 0);
 	fill_kinfo_aggregate(p, kp);
 }
 
 /*
  * Allocate a zero-filled struct pstats from M_SUBPROC.  The allocation is
  * made with M_WAITOK.
  */
 struct pstats *
 pstats_alloc(void)
 {
 
 	return (malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO|M_WAITOK));
 }
 
 /*
  * Copy parts of p_stats; zero the rest of p_stats (statistics).
  *
  * In 'dst' the range pstat_startzero..pstat_endzero is cleared and the
  * range pstat_startcopy..pstat_endcopy is copied from 'src'.
  */
 void
 pstats_fork(struct pstats *src, struct pstats *dst)
 {
 
 	bzero(&dst->pstat_startzero,
 	    __rangeof(struct pstats, pstat_startzero, pstat_endzero));
 	bcopy(&src->pstat_startcopy, &dst->pstat_startcopy,
 	    __rangeof(struct pstats, pstat_startcopy, pstat_endcopy));
 }
 
 /*
  * Release a struct pstats back to M_SUBPROC.
  */
 void
 pstats_free(struct pstats *ps)
 {
 
 	free(ps, M_SUBPROC);
 }
 
 /*
  * Search the zombie list for a process with the given pid.  The caller
  * must hold allproc_lock.  On a match the process is returned with its
  * proc lock held; otherwise NULL is returned.
  */
 static struct proc *
 zpfind_locked(pid_t pid)
 {
 	struct proc *zp;
 
 	sx_assert(&allproc_lock, SX_LOCKED);
 	LIST_FOREACH(zp, &zombproc, p_list) {
 		if (zp->p_pid != pid)
 			continue;
 		PROC_LOCK(zp);
 		return (zp);
 	}
 	return (NULL);
 }
 
 /*
  * Locate a zombie process by number
  *
  * Takes allproc_lock shared around the lookup.  Returns whatever
  * zpfind_locked() returns: the matching process with its proc lock held,
  * or NULL when no zombie has that pid.
  */
 struct proc *
 zpfind(pid_t pid)
 {
 	struct proc *p;
 
 	sx_slock(&allproc_lock);
 	p = zpfind_locked(pid);
 	sx_sunlock(&allproc_lock);
 	return (p);
 }
 
 #ifdef COMPAT_FREEBSD32
 
 /*
  * Reduce a pointer to a 32-bit value for export.  A value that does not
  * fit in 32 bits is reported as zero instead of being truncated.  This is
  * typically applied to kernel addresses being copied out, so replacing
  * them with zero is acceptable.
  */
 static inline uint32_t
 ptr32_trim(void *ptr)
 {
 	const uintptr_t addr = (uintptr_t)ptr;
 
 	if (addr > UINT_MAX)
 		return (0);
 	return ((uint32_t)addr);
 }
 
 /* Copy pointer field 'fld' from 'src' to 'dst' through ptr32_trim(). */
 #define PTRTRIM_CP(src,dst,fld) \
 	do { (dst).fld = ptr32_trim((src).fld); } while (0)
 
 /*
  * Convert a native kinfo_proc into the layout handed to 32-bit consumers.
  *
  * The destination is zeroed first, so fields that are not copied below
  * read as zero.  Pointer fields go through PTRTRIM_CP(); the remaining
  * fields are copied with CP()/TV_CP(), bcopy() for the fixed-size name
  * buffers, and freebsd32_rusage_out() for the two rusage structures.
  */
 static void
 freebsd32_kinfo_proc_out(const struct kinfo_proc *ki, struct kinfo_proc32 *ki32)
 {
 	int i;
 
 	bzero(ki32, sizeof(struct kinfo_proc32));
 	/* The size reported is that of the 32-bit structure. */
 	ki32->ki_structsize = sizeof(struct kinfo_proc32);
 	CP(*ki, *ki32, ki_layout);
 	PTRTRIM_CP(*ki, *ki32, ki_args);
 	PTRTRIM_CP(*ki, *ki32, ki_paddr);
 	PTRTRIM_CP(*ki, *ki32, ki_addr);
 	PTRTRIM_CP(*ki, *ki32, ki_tracep);
 	PTRTRIM_CP(*ki, *ki32, ki_textvp);
 	PTRTRIM_CP(*ki, *ki32, ki_fd);
 	PTRTRIM_CP(*ki, *ki32, ki_vmspace);
 	PTRTRIM_CP(*ki, *ki32, ki_wchan);
 	CP(*ki, *ki32, ki_pid);
 	CP(*ki, *ki32, ki_ppid);
 	CP(*ki, *ki32, ki_pgid);
 	CP(*ki, *ki32, ki_tpgid);
 	CP(*ki, *ki32, ki_sid);
 	CP(*ki, *ki32, ki_tsid);
 	CP(*ki, *ki32, ki_jobc);
 	CP(*ki, *ki32, ki_tdev);
 	CP(*ki, *ki32, ki_siglist);
 	CP(*ki, *ki32, ki_sigmask);
 	CP(*ki, *ki32, ki_sigignore);
 	CP(*ki, *ki32, ki_sigcatch);
 	CP(*ki, *ki32, ki_uid);
 	CP(*ki, *ki32, ki_ruid);
 	CP(*ki, *ki32, ki_svuid);
 	CP(*ki, *ki32, ki_rgid);
 	CP(*ki, *ki32, ki_svgid);
 	CP(*ki, *ki32, ki_ngroups);
 	/* All KI_NGROUPS slots are copied, not just the first ki_ngroups. */
 	for (i = 0; i < KI_NGROUPS; i++)
 		CP(*ki, *ki32, ki_groups[i]);
 	CP(*ki, *ki32, ki_size);
 	CP(*ki, *ki32, ki_rssize);
 	CP(*ki, *ki32, ki_swrss);
 	CP(*ki, *ki32, ki_tsize);
 	CP(*ki, *ki32, ki_dsize);
 	CP(*ki, *ki32, ki_ssize);
 	CP(*ki, *ki32, ki_xstat);
 	CP(*ki, *ki32, ki_acflag);
 	CP(*ki, *ki32, ki_pctcpu);
 	CP(*ki, *ki32, ki_estcpu);
 	CP(*ki, *ki32, ki_slptime);
 	CP(*ki, *ki32, ki_swtime);
 	CP(*ki, *ki32, ki_cow);
 	CP(*ki, *ki32, ki_runtime);
 	TV_CP(*ki, *ki32, ki_start);
 	TV_CP(*ki, *ki32, ki_childtime);
 	CP(*ki, *ki32, ki_flag);
 	CP(*ki, *ki32, ki_kiflag);
 	CP(*ki, *ki32, ki_traceflag);
 	CP(*ki, *ki32, ki_stat);
 	CP(*ki, *ki32, ki_nice);
 	CP(*ki, *ki32, ki_lock);
 	CP(*ki, *ki32, ki_rqindex);
 	CP(*ki, *ki32, ki_oncpu);
 	CP(*ki, *ki32, ki_lastcpu);
 	/* Fixed-size character buffers are copied whole. */
 	bcopy(ki->ki_tdname, ki32->ki_tdname, TDNAMLEN + 1);
 	bcopy(ki->ki_wmesg, ki32->ki_wmesg, WMESGLEN + 1);
 	bcopy(ki->ki_login, ki32->ki_login, LOGNAMELEN + 1);
 	bcopy(ki->ki_lockname, ki32->ki_lockname, LOCKNAMELEN + 1);
 	bcopy(ki->ki_comm, ki32->ki_comm, COMMLEN + 1);
 	bcopy(ki->ki_emul, ki32->ki_emul, KI_EMULNAMELEN + 1);
 	bcopy(ki->ki_loginclass, ki32->ki_loginclass, LOGINCLASSLEN + 1);
 	CP(*ki, *ki32, ki_flag2);
 	CP(*ki, *ki32, ki_fibnum);
 	CP(*ki, *ki32, ki_cr_flags);
 	CP(*ki, *ki32, ki_jid);
 	CP(*ki, *ki32, ki_numthreads);
 	CP(*ki, *ki32, ki_tid);
 	CP(*ki, *ki32, ki_pri);
 	freebsd32_rusage_out(&ki->ki_rusage, &ki32->ki_rusage);
 	freebsd32_rusage_out(&ki->ki_rusage_ch, &ki32->ki_rusage_ch);
 	PTRTRIM_CP(*ki, *ki32, ki_pcb);
 	PTRTRIM_CP(*ki, *ki32, ki_kstack);
 	PTRTRIM_CP(*ki, *ki32, ki_udata);
 	CP(*ki, *ki32, ki_sflag);
 	CP(*ki, *ki32, ki_tdflags);
 }
 #endif
 
 /*
  * Append kinfo_proc record(s) describing process 'p' to sbuf 'sb'.
  *
  * With KERN_PROC_NOTHREADS in 'flags' one per-process record is written;
  * otherwise one record is written for every thread of the process.  On
  * COMPAT_FREEBSD32 kernels, KERN_PROC_MASK32 selects the kinfo_proc32
  * layout for the records.
  *
  * Must be called with 'p' locked; 'p' is unlocked before returning.
  * Returns 0 or the error from the failing sbuf_bcat() call.
  */
 int
 kern_proc_out(struct proc *p, struct sbuf *sb, int flags)
 {
 	struct thread *td;
 	struct kinfo_proc ki;
 #ifdef COMPAT_FREEBSD32
 	struct kinfo_proc32 ki32;
 #endif
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
 
 	error = 0;
 	fill_kinfo_proc(p, &ki);
 	if ((flags & KERN_PROC_NOTHREADS) != 0) {
 #ifdef COMPAT_FREEBSD32
 		if ((flags & KERN_PROC_MASK32) != 0) {
 			freebsd32_kinfo_proc_out(&ki, &ki32);
 			error = sbuf_bcat(sb, &ki32, sizeof(ki32));
 		} else
 #endif
 			error = sbuf_bcat(sb, &ki, sizeof(ki));
 	} else {
 		/*
 		 * Reuse 'ki' for every thread: the process-wide part was
 		 * filled above, only the thread part is refreshed.
 		 */
 		FOREACH_THREAD_IN_PROC(p, td) {
 			fill_kinfo_thread(td, &ki, 1);
 #ifdef COMPAT_FREEBSD32
 			if ((flags & KERN_PROC_MASK32) != 0) {
 				freebsd32_kinfo_proc_out(&ki, &ki32);
 				error = sbuf_bcat(sb, &ki32, sizeof(ki32));
 			} else
 #endif
 				error = sbuf_bcat(sb, &ki, sizeof(ki));
 			if (error)
 				break;
 		}
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Export the kinfo_proc record(s) of 'p' to a sysctl request and then
  * verify that the pid still refers to the same process.
  *
  * 'p' must be locked on entry; kern_proc_out() unlocks it.  After the
  * export the pid is looked up again (on the zombie list when 'doingzomb'
  * is set, otherwise with pfind()); ESRCH is returned if it is gone or now
  * names a different process.  Pid 0 is not re-checked on the non-zombie
  * path.
  */
 static int
 sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags,
     int doingzomb)
 {
 	struct kinfo_proc ki;
 	struct sbuf sb;
 	struct proc *found;
 	pid_t pid;
 	int out_error, finish_error;
 
 	pid = p->p_pid;
 	sbuf_new_for_sysctl(&sb, (char *)&ki, sizeof(ki), req);
 	out_error = kern_proc_out(p, &sb, flags);
 	finish_error = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	/* An export error takes precedence over a finish error. */
 	if (out_error != 0)
 		return (out_error);
 	if (finish_error != 0)
 		return (finish_error);
 
 	if (doingzomb)
 		found = zpfind(pid);
 	else if (pid == 0)
 		return (0);
 	else
 		found = pfind(pid);
 	if (found == NULL)
 		return (ESRCH);
 	/* The lookup returned the process locked; only identity matters. */
 	PROC_UNLOCK(found);
 	return (found == p ? 0 : ESRCH);
 }
 
 /*
  * Sysctl handler behind the kern.proc.* process listing nodes.
  *
  * The node's oid number selects the filter (all, pid, pgrp, session, tty,
  * uid, ruid, gid, rgid, proc); KERN_PROC_INC_THREAD in the oid number, or
  * KERN_PROC_ALL, requests per-thread records.  For KERN_PROC_PID a single
  * process is looked up with pget(); for every other selector both the
  * allproc and the zombproc lists are scanned under allproc_lock and each
  * visible, matching process is exported with sysctl_out_proc().
  *
  * Returns EINVAL when the number of name components does not fit the
  * selector, otherwise 0 or the first error encountered.
  */
 static int
 sysctl_kern_proc(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	int flags, doingzomb, oid_number;
 	int error = 0;
 
 	oid_number = oidp->oid_number;
 	if (oid_number != KERN_PROC_ALL &&
 	    (oid_number & KERN_PROC_INC_THREAD) == 0)
 		flags = KERN_PROC_NOTHREADS;
 	else {
 		flags = 0;
 		oid_number &= ~KERN_PROC_INC_THREAD;
 	}
 #ifdef COMPAT_FREEBSD32
 	if (req->flags & SCTL_MASK32)
 		flags |= KERN_PROC_MASK32;
 #endif
 	/* Single-pid lookup: no list walk is needed. */
 	if (oid_number == KERN_PROC_PID) {
 		if (namelen != 1)
 			return (EINVAL);
 		error = sysctl_wire_old_buffer(req, 0);
 		if (error)
 			return (error);
 		error = pget((pid_t)name[0], PGET_CANSEE, &p);
 		if (error != 0)
 			return (error);
 		error = sysctl_out_proc(p, req, flags, 0);
 		return (error);
 	}
 
 	/* Validate the number of name components for the selector. */
 	switch (oid_number) {
 	case KERN_PROC_ALL:
 		if (namelen != 0)
 			return (EINVAL);
 		break;
 	case KERN_PROC_PROC:
 		if (namelen != 0 && namelen != 1)
 			return (EINVAL);
 		break;
 	default:
 		if (namelen != 1)
 			return (EINVAL);
 		break;
 	}
 
 	if (!req->oldptr) {
 		/* overestimate by 5 procs */
 		error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
 		if (error)
 			return (error);
 	}
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sx_slock(&allproc_lock);
 	/* Pass 0 walks allproc, pass 1 walks zombproc. */
 	for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
 		if (!doingzomb)
 			p = LIST_FIRST(&allproc);
 		else
 			p = LIST_FIRST(&zombproc);
 		for (; p != 0; p = LIST_NEXT(p, p_list)) {
 			/*
 			 * Skip embryonic processes.
 			 */
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			KASSERT(p->p_ucred != NULL,
 			    ("process credential is NULL for non-NEW proc"));
 			/*
 			 * Show a user only appropriate processes.
 			 */
 			if (p_cansee(curthread, p)) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * TODO - make more efficient (see notes below).
 			 * do by session.
 			 */
 			/*
 			 * Every filter that rejects the process must drop
 			 * the proc lock before moving on.
 			 */
 			switch (oid_number) {
 
 			case KERN_PROC_GID:
 				if (p->p_ucred->cr_gid != (gid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_PGRP:
 				/* could do this by traversing pgrp */
 				if (p->p_pgrp == NULL ||
 				    p->p_pgrp->pg_id != (pid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_RGID:
 				if (p->p_ucred->cr_rgid != (gid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_SESSION:
 				if (p->p_session == NULL ||
 				    p->p_session->s_sid != (pid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_TTY:
 				if ((p->p_flag & P_CONTROLT) == 0 ||
 				    p->p_session == NULL) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				/* XXX proctree_lock */
 				SESS_LOCK(p->p_session);
 				if (p->p_session->s_ttyp == NULL ||
 				    tty_udev(p->p_session->s_ttyp) !=
 				    (dev_t)name[0]) {
 					SESS_UNLOCK(p->p_session);
 					PROC_UNLOCK(p);
 					continue;
 				}
 				SESS_UNLOCK(p->p_session);
 				break;
 
 			case KERN_PROC_UID:
 				if (p->p_ucred->cr_uid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_RUID:
 				if (p->p_ucred->cr_ruid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_PROC:
 				break;
 
 			default:
 				break;
 
 			}
 
 			/*
 			 * The proc lock is handed over here: the export path
 			 * (kern_proc_out()) unlocks p.
 			 */
 			error = sysctl_out_proc(p, req, flags, doingzomb);
 			if (error) {
 				sx_sunlock(&allproc_lock);
 				return (error);
 			}
 		}
 	}
 	sx_sunlock(&allproc_lock);
 	return (0);
 }
 
 /*
  * Allocate a pargs structure with room for 'len' bytes of argument data.
  * The structure is returned holding one reference and with ar_length set
  * to 'len'; the argument bytes themselves are left uninitialized.
  */
 struct pargs *
 pargs_alloc(int len)
 {
 	struct pargs *args;
 
 	args = malloc(sizeof(*args) + len, M_PARGS, M_WAITOK);
 	args->ar_length = len;
 	refcount_init(&args->ar_ref, 1);
 	return (args);
 }
 
 /*
  * Release the memory of a pargs structure back to M_PARGS.  The reference
  * count is not consulted here.
  */
 static void
 pargs_free(struct pargs *pa)
 {
 
 	free(pa, M_PARGS);
 }
 
 /*
  * Take an additional reference on 'pa'.  A NULL 'pa' is accepted and
  * ignored.
  */
 void
 pargs_hold(struct pargs *pa)
 {
 
 	if (pa != NULL)
 		refcount_acquire(&pa->ar_ref);
 }
 
 /*
  * Drop one reference on 'pa' and free it when that was the last one.
  * A NULL 'pa' is accepted and ignored.
  */
 void
 pargs_drop(struct pargs *pa)
 {
 
 	if (pa != NULL && refcount_release(&pa->ar_ref))
 		pargs_free(pa);
 }
 
 /*
  * Read 'len' bytes at 'offset' in the address space of process 'p' into
  * the kernel buffer 'buf'.  A single-segment UIO_READ request targeting
  * system space is built and handed to proc_rwmem(), whose result is
  * returned.
  */
 static int
 proc_read_mem(struct thread *td, struct proc *p, vm_offset_t offset, void* buf,
     size_t len)
 {
 	struct uio request;
 	struct iovec seg;
 
 	/* The one and only segment: the caller's buffer. */
 	seg.iov_len = len;
 	seg.iov_base = (caddr_t)buf;
 
 	request.uio_td = td;
 	request.uio_rw = UIO_READ;
 	request.uio_segflg = UIO_SYSSPACE;
 	request.uio_resid = (ssize_t)len;
 	request.uio_offset = offset;
 	request.uio_iovcnt = 1;
 	request.uio_iov = &seg;
 
 	return (proc_rwmem(p, &request));
 }
 
 /*
  * Read up to 'len' bytes of a string located at 'sptr' in process 'p'
  * into 'buf'.
  *
  * The whole chunk is tried first.  That read may legitimately fail with
  * EFAULT when the string is shorter than the chunk and ends right before
  * an unmapped page, so in that case the data is re-read one byte at a
  * time, stopping after the terminating NUL.  Any other result of the
  * chunk read is returned unchanged.
  */
 static int
 proc_read_string(struct thread *td, struct proc *p, const char *sptr, char *buf,
     size_t len)
 {
 	size_t pos;
 	int error;
 
 	error = proc_read_mem(td, p, (vm_offset_t)sptr, buf, len);
 	if (error != EFAULT)
 		return (error);
 
 	/* Byte-wise fallback. */
 	for (pos = 0; pos < len; pos++) {
 		error = proc_read_mem(td, p, (vm_offset_t)(sptr + pos),
 		    buf + pos, 1);
 		if (error != 0)
 			return (error);
 		if (buf[pos] == '\0')
 			break;
 	}
 	return (0);
 }
 
 #define PROC_AUXV_MAX	256	/* Safety limit on auxv size. */
 
 /* Selects which per-process vector get_proc_vector() reads. */
 enum proc_vector_type {
 	PROC_ARG,	/* Argument vector (ps_argvstr / ps_nargvstr). */
 	PROC_ENV,	/* Environment vector (ps_envstr / ps_nenvstr). */
 	PROC_AUX,	/* ELF auxiliary vector located after the env array. */
 };
 
 #ifdef COMPAT_FREEBSD32
 /*
  * 32-bit counterpart of get_proc_vector(): read the argv, envv or auxv
  * vector of a 32-bit process.
  *
  * For PROC_ARG and PROC_ENV the 32-bit pointers read from the process are
  * widened into a newly allocated array of char pointers, which is handed
  * to the caller; the temporary 32-bit array is freed.  For PROC_AUX the
  * raw buffer of Elf32_Auxinfo entries itself is handed to the caller.  In
  * both cases the caller owns the returned M_TEMP buffer.
  *
  * Returns 0 on success, ENOEXEC when the ps_strings data is implausible,
  * EINVAL for an unknown 'type', or the error from proc_read_mem().
  */
 static int
 get_proc_vector32(struct thread *td, struct proc *p, char ***proc_vectorp,
     size_t *vsizep, enum proc_vector_type type)
 {
 	struct freebsd32_ps_strings pss;
 	Elf32_Auxinfo aux;
 	vm_offset_t vptr, ptr;
 	uint32_t *proc_vector32;
 	char **proc_vector;
 	size_t vsize, size;
 	int i, error;
 
 	error = proc_read_mem(td, p, (vm_offset_t)(p->p_sysent->sv_psstrings),
 	    &pss, sizeof(pss));
 	if (error != 0)
 		return (error);
 	switch (type) {
 	case PROC_ARG:
 		vptr = (vm_offset_t)PTRIN(pss.ps_argvstr);
 		vsize = pss.ps_nargvstr;
 		if (vsize > ARG_MAX)
 			return (ENOEXEC);
 		size = vsize * sizeof(int32_t);
 		break;
 	case PROC_ENV:
 		vptr = (vm_offset_t)PTRIN(pss.ps_envstr);
 		vsize = pss.ps_nenvstr;
 		if (vsize > ARG_MAX)
 			return (ENOEXEC);
 		size = vsize * sizeof(int32_t);
 		break;
 	case PROC_AUX:
 		/* The aux array starts one slot past the end of the env array. */
 		vptr = (vm_offset_t)PTRIN(pss.ps_envstr) +
 		    (pss.ps_nenvstr + 1) * sizeof(int32_t);
 		if (vptr % 4 != 0)
 			return (ENOEXEC);
 		/* Count entries up to AT_NULL, at most PROC_AUXV_MAX. */
 		for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) {
 			error = proc_read_mem(td, p, ptr, &aux, sizeof(aux));
 			if (error != 0)
 				return (error);
 			if (aux.a_type == AT_NULL)
 				break;
 			ptr += sizeof(aux);
 		}
 		/* No terminator within the limit: treat the data as bogus. */
 		if (aux.a_type != AT_NULL)
 			return (ENOEXEC);
 		vsize = i + 1;
 		size = vsize * sizeof(aux);
 		break;
 	default:
 		KASSERT(0, ("Wrong proc vector type: %d", type));
 		return (EINVAL);
 	}
 	proc_vector32 = malloc(size, M_TEMP, M_WAITOK);
 	error = proc_read_mem(td, p, vptr, proc_vector32, size);
 	if (error != 0)
 		goto done;
 	if (type == PROC_AUX) {
 		/* Ownership of the raw buffer passes to the caller. */
 		*proc_vectorp = (char **)proc_vector32;
 		*vsizep = vsize;
 		return (0);
 	}
 	/* Widen each 32-bit user pointer to a native pointer. */
 	proc_vector = malloc(vsize * sizeof(char *), M_TEMP, M_WAITOK);
 	for (i = 0; i < (int)vsize; i++)
 		proc_vector[i] = PTRIN(proc_vector32[i]);
 	*proc_vectorp = proc_vector;
 	*vsizep = vsize;
 done:
 	free(proc_vector32, M_TEMP);
 	return (error);
 }
 #endif
 
 /*
  * Read one of the target process' vectors (argv, envv or the ELF
  * auxiliary vector) into a freshly allocated M_TEMP buffer.
  *
  * On success 0 is returned, the buffer is stored in *proc_vectorp and the
  * number of elements in *vsizep; the caller must free the buffer.
  * Returns ENOEXEC when the ps_strings data looks implausible, EINVAL for
  * an unknown 'type', or the error from proc_read_mem().  On
  * COMPAT_FREEBSD32 kernels, SV_ILP32 processes are handed to
  * get_proc_vector32().
  */
 static int
 get_proc_vector(struct thread *td, struct proc *p, char ***proc_vectorp,
     size_t *vsizep, enum proc_vector_type type)
 {
 	struct ps_strings pss;
 	Elf_Auxinfo aux;
 	vm_offset_t vptr, ptr;
 	char **proc_vector;
 	size_t vsize, size;
 	int error, i;
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(p, SV_ILP32) != 0)
 		return (get_proc_vector32(td, p, proc_vectorp, vsizep, type));
 #endif
 	error = proc_read_mem(td, p, (vm_offset_t)(p->p_sysent->sv_psstrings),
 	    &pss, sizeof(pss));
 	if (error != 0)
 		return (error);
 	switch (type) {
 	case PROC_ARG:
 		vptr = (vm_offset_t)pss.ps_argvstr;
 		vsize = pss.ps_nargvstr;
 		if (vsize > ARG_MAX)
 			return (ENOEXEC);
 		size = vsize * sizeof(char *);
 		break;
 	case PROC_ENV:
 		vptr = (vm_offset_t)pss.ps_envstr;
 		vsize = pss.ps_nenvstr;
 		if (vsize > ARG_MAX)
 			return (ENOEXEC);
 		size = vsize * sizeof(char *);
 		break;
 	case PROC_AUX:
 		/*
 		 * The aux array is just above env array on the stack. Check
 		 * that the address is naturally aligned.
 		 */
 		vptr = (vm_offset_t)pss.ps_envstr + (pss.ps_nenvstr + 1)
 		    * sizeof(char *);
 #if __ELF_WORD_SIZE == 64
 		if (vptr % sizeof(uint64_t) != 0)
 #else
 		if (vptr % sizeof(uint32_t) != 0)
 #endif
 			return (ENOEXEC);
 		/*
 		 * We count the array size reading the aux vectors from the
 		 * stack until AT_NULL vector is returned.  So (to keep the code
 		 * simple) we read the process stack twice: the first time here
 		 * to find the size and the second time when copying the vectors
 		 * to the allocated proc_vector.
 		 */
 		for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) {
 			error = proc_read_mem(td, p, ptr, &aux, sizeof(aux));
 			if (error != 0)
 				return (error);
 			if (aux.a_type == AT_NULL)
 				break;
 			ptr += sizeof(aux);
 		}
 		/*
 		 * If the PROC_AUXV_MAX entries are iterated over, and we have
 		 * not reached AT_NULL, it is most likely we are reading wrong
 		 * data: either the process doesn't have auxv array or data has
 		 * been modified. Return the error in this case.
 		 */
 		if (aux.a_type != AT_NULL)
 			return (ENOEXEC);
 		vsize = i + 1;
 		size = vsize * sizeof(aux);
 		break;
 	default:
 		KASSERT(0, ("Wrong proc vector type: %d", type));
 		return (EINVAL); /* In case we are built without INVARIANTS. */
 	}
 	/*
 	 * An M_WAITOK allocation sleeps until memory is available instead
 	 * of failing, so the result needs no NULL check.
 	 */
 	proc_vector = malloc(size, M_TEMP, M_WAITOK);
 	error = proc_read_mem(td, p, vptr, proc_vector, size);
 	if (error != 0) {
 		free(proc_vector, M_TEMP);
 		return (error);
 	}
 	*proc_vectorp = proc_vector;
 	*vsizep = vsize;
 
 	return (0);
 }
 
 #define GET_PS_STRINGS_CHUNK_SZ	256	/* Chunk size (bytes) for ps_strings operations. */
 
 /*
  * Append the strings of the argument or environment vector of process
  * 'p' to sbuf 'sb', each followed by a NUL byte.
  *
  * The vector of string pointers is fetched with get_proc_vector() and
  * each string is then read from the process in chunks of
  * GET_PS_STRINGS_CHUNK_SZ bytes.  Output is limited to
  * 2 * (PATH_MAX + ARG_MAX) bytes, and a NULL pointer in the vector ends
  * the walk early.  Returns 0 or the first error from get_proc_vector() or
  * proc_read_string(); the results of the sbuf_bcat() calls are not
  * examined here.
  */
 static int
 get_ps_strings(struct thread *td, struct proc *p, struct sbuf *sb,
     enum proc_vector_type type)
 {
 	size_t done, len, nchr, vsize;
 	int error, i;
 	char **proc_vector, *sptr;
 	char pss_string[GET_PS_STRINGS_CHUNK_SZ];
 
 	PROC_ASSERT_HELD(p);
 
 	/*
 	 * We are not going to read more than 2 * (PATH_MAX + ARG_MAX) bytes.
 	 */
 	nchr = 2 * (PATH_MAX + ARG_MAX);
 
 	error = get_proc_vector(td, p, &proc_vector, &vsize, type);
 	if (error != 0)
 		return (error);
 	/* 'done' counts the bytes emitted so far, terminators included. */
 	for (done = 0, i = 0; i < (int)vsize && done < nchr; i++) {
 		/*
 		 * The program may have scribbled into its argv array, e.g. to
 		 * remove some arguments.  If that has happened, break out
 		 * before trying to read from NULL.
 		 */
 		if (proc_vector[i] == NULL)
 			break;
 		for (sptr = proc_vector[i]; ; sptr += GET_PS_STRINGS_CHUNK_SZ) {
 			error = proc_read_string(td, p, sptr, pss_string,
 			    sizeof(pss_string));
 			if (error != 0)
 				goto done;
 			len = strnlen(pss_string, GET_PS_STRINGS_CHUNK_SZ);
 			/* Clamp so that the terminator still fits the limit. */
 			if (done + len >= nchr)
 				len = nchr - done - 1;
 			sbuf_bcat(sb, pss_string, len);
 			/* A short chunk means the string (or budget) ended. */
 			if (len != GET_PS_STRINGS_CHUNK_SZ)
 				break;
 			done += GET_PS_STRINGS_CHUNK_SZ;
 		}
 		/* Terminate the string in the output. */
 		sbuf_bcat(sb, "", 1);
 		done += len + 1;
 	}
 done:
 	free(proc_vector, M_TEMP);
 	return (error);
 }
 
 /*
  * Append the argument strings of process 'p' to 'sb'; see
  * get_ps_strings() for the output format and limits.
  *
  * NOTE(review): the 'td' argument is not used; curthread is passed to
  * get_ps_strings() instead -- confirm this is intended.
  */
 int
 proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb)
 {
 
 	return (get_ps_strings(curthread, p, sb, PROC_ARG));
 }
 
 /*
  * Append the environment strings of process 'p' to 'sb'; see
  * get_ps_strings() for the output format and limits.
  *
  * NOTE(review): the 'td' argument is not used; curthread is passed to
  * get_ps_strings() instead -- confirm this is intended.
  */
 int
 proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb)
 {
 
 	return (get_ps_strings(curthread, p, sb, PROC_ENV));
 }
 
 int
 proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb)
 {
 	size_t vsize, size;
 	char **auxv;
 	int error;
 
 	error = get_proc_vector(td, p, &auxv, &vsize, PROC_AUX);
 	if (error == 0) {
 #ifdef COMPAT_FREEBSD32
 		if (SV_PROC_FLAG(p, SV_ILP32) != 0)
 			size = vsize * sizeof(Elf32_Auxinfo);
 		else
 #endif
 			size = vsize * sizeof(Elf_Auxinfo);
 		error = sbuf_bcat(sb, auxv, size);
 		free(auxv, M_TEMP);
 	}
 	return (error);
 }
 
 /*
  * This sysctl allows a process to retrieve the argument list or process
  * title for another process without groping around in the address space
  * of the other process.  It also allows a process to set its own "process
  * title" to a string of its own choice.
  *
  * Reading: the cached p_args are returned when present; otherwise, for
  * processes that are neither exiting nor system processes, the arguments
  * are read from the process with proc_getargv().
  * Writing (req->newptr set): the lookup additionally requires
  * PGET_ISCURRENT, the new string must fit within ps_arg_cache_limit, and
  * it replaces p_args.
  */
 static int
 sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct pargs *newpa, *pa;
 	struct proc *p;
 	struct sbuf sb;
 	int flags, error = 0, error2;
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	flags = PGET_CANSEE;
 	if (req->newptr != NULL)
 		flags |= PGET_ISCURRENT;
 	error = pget((pid_t)name[0], flags, &p);
 	if (error)
 		return (error);
 
 	pa = p->p_args;
 	if (pa != NULL) {
 		/* Hold the cached args so they survive dropping the lock. */
 		pargs_hold(pa);
 		PROC_UNLOCK(p);
 		error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length);
 		pargs_drop(pa);
 	} else if ((p->p_flag & (P_WEXIT | P_SYSTEM)) == 0) {
 		/* No cache: hold the process and read its argv directly. */
 		_PHOLD(p);
 		PROC_UNLOCK(p);
 		sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
 		error = proc_getargv(curthread, p, &sb);
 		error2 = sbuf_finish(&sb);
 		PRELE(p);
 		sbuf_delete(&sb);
 		if (error == 0 && error2 != 0)
 			error = error2;
 	} else {
 		PROC_UNLOCK(p);
 	}
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	/* From here on the request is setting a new process title. */
 	if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit)
 		return (ENOMEM);
 	newpa = pargs_alloc(req->newlen);
 	error = SYSCTL_IN(req, newpa->ar_args, req->newlen);
 	if (error != 0) {
 		pargs_free(newpa);
 		return (error);
 	}
 	/* Swap in the new args under the proc lock, then drop the old. */
 	PROC_LOCK(p);
 	pa = p->p_args;
 	p->p_args = newpa;
 	PROC_UNLOCK(p);
 	pargs_drop(pa);
 	return (0);
 }
 
 /*
  * This sysctl allows a process to retrieve environment of another process.
  */
 static int
 sysctl_kern_proc_env(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	struct sbuf sb;
 	int error, error2;
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = pget((pid_t)name[0], PGET_WANTREAD, &p);
 	if (error != 0)
 		return (error);
 	if ((p->p_flag & P_SYSTEM) != 0) {
 		PRELE(p);
 		return (0);
 	}
 
 	sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
 	error = proc_getenvv(curthread, p, &sb);
 	error2 = sbuf_finish(&sb);
 	PRELE(p);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 /*
  * This sysctl allows a process to retrieve ELF auxiliary vector of
  * another process.
  */
 static int
 sysctl_kern_proc_auxv(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	struct sbuf sb;
 	int error, error2;
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = pget((pid_t)name[0], PGET_WANTREAD, &p);
 	if (error != 0)
 		return (error);
 	if ((p->p_flag & P_SYSTEM) != 0) {
 		PRELE(p);
 		return (0);
 	}
 	sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
 	error = proc_getauxv(curthread, p, &sb);
 	error2 = sbuf_finish(&sb);
 	PRELE(p);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 /*
  * This sysctl allows a process to retrieve the path of the executable for
  * itself or another process.
  */
 static int
 sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS)
 {
 	pid_t *pidp = (pid_t *)arg1;
 	unsigned int arglen = arg2;
 	struct proc *p;
 	struct vnode *vp;
 	char *retbuf, *freebuf;
 	int error;
 
 	if (arglen != 1)
 		return (EINVAL);
 	if (*pidp == -1) {	/* -1 means this process */
 		p = req->td->td_proc;
 	} else {
 		error = pget(*pidp, PGET_CANSEE, &p);
 		if (error != 0)
 			return (error);
 	}
 
 	vp = p->p_textvp;
 	if (vp == NULL) {
 		if (*pidp != -1)
 			PROC_UNLOCK(p);
 		return (0);
 	}
 	vref(vp);
 	if (*pidp != -1)
 		PROC_UNLOCK(p);
 	error = vn_fullpath(req->td, vp, &retbuf, &freebuf);
 	vrele(vp);
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1);
 	free(freebuf, M_TEMP);
 	return (error);
 }
 
 static int
 sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	char *sv_name;
 	int *name;
 	int namelen;
 	int error;
 
 	namelen = arg2;
 	if (namelen != 1)
 		return (EINVAL);
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_CANSEE, &p);
 	if (error != 0)
 		return (error);
 	sv_name = p->p_sysent->sv_name;
 	PROC_UNLOCK(p);
 	return (sysctl_handle_string(oidp, sv_name, 0, req));
 }
 
 #ifdef KINFO_OVMENTRY_SIZE
 CTASSERT(sizeof(struct kinfo_ovmentry) == KINFO_OVMENTRY_SIZE);
 #endif
 
 #ifdef COMPAT_FREEBSD7
 /*
  * Sysctl handler (COMPAT_FREEBSD7 only): emit one fixed-size
  * struct kinfo_ovmentry per VM map entry of the process whose pid is the
  * single remaining name component.
  *
  * Returns EINVAL unless exactly one name component was supplied, the
  * pget() error on lookup failure, ESRCH if the process has no vmspace to
  * reference, and otherwise the status of the last SYSCTL_OUT().
  */
 static int
 sysctl_kern_proc_ovmmap(SYSCTL_HANDLER_ARGS)
 {
 	vm_map_entry_t entry, tmp_entry;
 	unsigned int last_timestamp;
 	char *fullpath, *freepath;
 	struct kinfo_ovmentry *kve;
 	struct vattr va;
 	struct ucred *cred;
 	int error, *name;
 	u_int namelen;
 	struct vnode *vp;
 	struct proc *p;
 	vm_map_t map;
 	struct vmspace *vm;
 
 	/*
 	 * name[0] is only meaningful when the caller supplied exactly one
 	 * trailing component; reject anything else as the sibling handlers
 	 * in this file do.
 	 */
 	namelen = arg2;
 	if (namelen != 1)
 		return (EINVAL);
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_WANTREAD, &p);
 	if (error != 0)
 		return (error);
 	vm = vmspace_acquire_ref(p);
 	if (vm == NULL) {
 		PRELE(p);
 		return (ESRCH);
 	}
 	kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
 
 	map = &vm->vm_map;
 	vm_map_lock_read(map);
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		vm_object_t obj, tobj, lobj;
 		vm_offset_t addr;
 
 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 			continue;
 
 		bzero(kve, sizeof(*kve));
 		kve->kve_structsize = sizeof(*kve);
 
 		kve->kve_private_resident = 0;
 		obj = entry->object.vm_object;
 		if (obj != NULL) {
 			VM_OBJECT_RLOCK(obj);
 			if (obj->shadow_count == 1)
 				kve->kve_private_resident =
 				    obj->resident_page_count;
 		}
 		/* Count the pages of this entry that the pmap has mapped. */
 		kve->kve_resident = 0;
 		addr = entry->start;
 		while (addr < entry->end) {
 			if (pmap_extract(map->pmap, addr))
 				kve->kve_resident++;
 			addr += PAGE_SIZE;
 		}
 
 		/*
 		 * Walk to the bottom of the backing-object chain, locking
 		 * hand over hand.  On exit obj (if any) is still read-locked
 		 * and lobj is the last object in the chain, also read-locked.
 		 */
 		for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
 			if (tobj != obj)
 				VM_OBJECT_RLOCK(tobj);
 			if (lobj != obj)
 				VM_OBJECT_RUNLOCK(lobj);
 			lobj = tobj;
 		}
 
 		kve->kve_start = (void*)entry->start;
 		kve->kve_end = (void*)entry->end;
 		kve->kve_offset = (off_t)entry->offset;
 
 		if (entry->protection & VM_PROT_READ)
 			kve->kve_protection |= KVME_PROT_READ;
 		if (entry->protection & VM_PROT_WRITE)
 			kve->kve_protection |= KVME_PROT_WRITE;
 		if (entry->protection & VM_PROT_EXECUTE)
 			kve->kve_protection |= KVME_PROT_EXEC;
 
 		if (entry->eflags & MAP_ENTRY_COW)
 			kve->kve_flags |= KVME_FLAG_COW;
 		if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
 			kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
 		if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
 			kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
 
 		/*
 		 * The map lock is dropped for the vnode work and the copyout
 		 * below; remember the timestamp so a concurrent change can
 		 * be detected once the lock is retaken.
 		 */
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
 
 		kve->kve_fileid = 0;
 		kve->kve_fsid = 0;
 		freepath = NULL;
 		fullpath = "";
 		if (lobj) {
 			vp = NULL;
 			switch (lobj->type) {
 			case OBJT_DEFAULT:
 				kve->kve_type = KVME_TYPE_DEFAULT;
 				break;
 			case OBJT_VNODE:
 				kve->kve_type = KVME_TYPE_VNODE;
 				vp = lobj->handle;
 				vref(vp);
 				break;
 			case OBJT_SWAP:
 				kve->kve_type = KVME_TYPE_SWAP;
 				break;
 			case OBJT_DEVICE:
 				kve->kve_type = KVME_TYPE_DEVICE;
 				break;
 			case OBJT_PHYS:
 				kve->kve_type = KVME_TYPE_PHYS;
 				break;
 			case OBJT_DEAD:
 				kve->kve_type = KVME_TYPE_DEAD;
 				break;
 			case OBJT_SG:
 				kve->kve_type = KVME_TYPE_SG;
 				break;
 			default:
 				kve->kve_type = KVME_TYPE_UNKNOWN;
 				break;
 			}
 			if (lobj != obj)
 				VM_OBJECT_RUNLOCK(lobj);
 
 			kve->kve_ref_count = obj->ref_count;
 			kve->kve_shadow_count = obj->shadow_count;
 			VM_OBJECT_RUNLOCK(obj);
 			if (vp != NULL) {
 				/*
 				 * The vn_fullpath() result is not checked;
 				 * fullpath was preset to "" above.
 				 */
 				vn_fullpath(curthread, vp, &fullpath,
 				    &freepath);
 				cred = curthread->td_ucred;
 				vn_lock(vp, LK_SHARED | LK_RETRY);
 				if (VOP_GETATTR(vp, &va, cred) == 0) {
 					kve->kve_fileid = va.va_fileid;
 					kve->kve_fsid = va.va_fsid;
 				}
 				vput(vp);
 			}
 		} else {
 			kve->kve_type = KVME_TYPE_NONE;
 			kve->kve_ref_count = 0;
 			kve->kve_shadow_count = 0;
 		}
 
 		strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
 		if (freepath != NULL)
 			free(freepath, M_TEMP);
 
 		error = SYSCTL_OUT(req, kve, sizeof(*kve));
 		vm_map_lock_read(map);
 		if (error)
 			break;
 		/*
 		 * If the map changed while unlocked, re-find our position
 		 * from the last address covered by the entry just reported
 		 * (addr was advanced to the entry's end by the loop above).
 		 */
 		if (last_timestamp != map->timestamp) {
 			vm_map_lookup_entry(map, addr - 1, &tmp_entry);
 			entry = tmp_entry;
 		}
 	}
 	vm_map_unlock_read(map);
 	vmspace_free(vm);
 	PRELE(p);
 	free(kve, M_TEMP);
 	return (error);
 }
 #endif	/* COMPAT_FREEBSD7 */
 
 #ifdef KINFO_VMENTRY_SIZE
 CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE);
 #endif
 
 /*
  * Append one packed struct kinfo_vmentry record per VM map entry of
  * process p to the sbuf sb.
  *
  * Must be called with the process locked and will return unlocked.
  *
  * Returns ESRCH if the process has no vmspace to reference; otherwise 0,
  * or the first non-zero status returned by sbuf_bcat().
  */
 int
 kern_proc_vmmap_out(struct proc *p, struct sbuf *sb)
 {
 	vm_map_entry_t entry, tmp_entry;
 	unsigned int last_timestamp;
 	char *fullpath, *freepath;
 	struct kinfo_vmentry *kve;
 	struct vattr va;
 	struct ucred *cred;
 	int error;
 	struct vnode *vp;
 	struct vmspace *vm;
 	vm_map_t map;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* Trade the caller's process lock for a hold that PRELE() drops. */
 	_PHOLD(p);
 	PROC_UNLOCK(p);
 	vm = vmspace_acquire_ref(p);
 	if (vm == NULL) {
 		PRELE(p);
 		return (ESRCH);
 	}
 	kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
 
 	error = 0;
 	map = &vm->vm_map;
 	vm_map_lock_read(map);
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		vm_object_t obj, tobj, lobj;
 		vm_offset_t addr;
 		vm_paddr_t locked_pa;
 		int mincoreinfo;
 
 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 			continue;
 
 		bzero(kve, sizeof(*kve));
 
 		kve->kve_private_resident = 0;
 		obj = entry->object.vm_object;
 		if (obj != NULL) {
 			VM_OBJECT_RLOCK(obj);
 			if (obj->shadow_count == 1)
 				kve->kve_private_resident =
 				    obj->resident_page_count;
 		}
 		/*
 		 * Probe every page of the entry with pmap_mincore() to count
 		 * resident pages and to note whether any is reported as part
 		 * of a superpage mapping.
 		 */
 		kve->kve_resident = 0;
 		addr = entry->start;
 		while (addr < entry->end) {
 			locked_pa = 0;
 			mincoreinfo = pmap_mincore(map->pmap, addr, &locked_pa);
 			/* Unlock the page if pmap_mincore() set locked_pa. */
 			if (locked_pa != 0)
 				vm_page_unlock(PHYS_TO_VM_PAGE(locked_pa));
 			if (mincoreinfo & MINCORE_INCORE)
 				kve->kve_resident++;
 			if (mincoreinfo & MINCORE_SUPER)
 				kve->kve_flags |= KVME_FLAG_SUPER;
 			addr += PAGE_SIZE;
 		}
 
 		/*
 		 * Walk to the bottom of the backing-object chain, locking
 		 * hand over hand.  On exit obj (if any) is still read-locked
 		 * and lobj is the last object in the chain, also read-locked.
 		 */
 		for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
 			if (tobj != obj)
 				VM_OBJECT_RLOCK(tobj);
 			if (lobj != obj)
 				VM_OBJECT_RUNLOCK(lobj);
 			lobj = tobj;
 		}
 
 		kve->kve_start = entry->start;
 		kve->kve_end = entry->end;
 		kve->kve_offset = entry->offset;
 
 		if (entry->protection & VM_PROT_READ)
 			kve->kve_protection |= KVME_PROT_READ;
 		if (entry->protection & VM_PROT_WRITE)
 			kve->kve_protection |= KVME_PROT_WRITE;
 		if (entry->protection & VM_PROT_EXECUTE)
 			kve->kve_protection |= KVME_PROT_EXEC;
 
 		if (entry->eflags & MAP_ENTRY_COW)
 			kve->kve_flags |= KVME_FLAG_COW;
 		if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
 			kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
 		if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
 			kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
 		if (entry->eflags & MAP_ENTRY_GROWS_UP)
 			kve->kve_flags |= KVME_FLAG_GROWS_UP;
 		if (entry->eflags & MAP_ENTRY_GROWS_DOWN)
 			kve->kve_flags |= KVME_FLAG_GROWS_DOWN;
 
 		/*
 		 * The map lock is dropped for the vnode work and the sbuf
 		 * append below; remember the timestamp so a concurrent
 		 * change can be detected once the lock is retaken.
 		 */
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
 
 		freepath = NULL;
 		fullpath = "";
 		if (lobj) {
 			vp = NULL;
 			switch (lobj->type) {
 			case OBJT_DEFAULT:
 				kve->kve_type = KVME_TYPE_DEFAULT;
 				break;
 			case OBJT_VNODE:
 				kve->kve_type = KVME_TYPE_VNODE;
 				vp = lobj->handle;
 				vref(vp);
 				break;
 			case OBJT_SWAP:
 				kve->kve_type = KVME_TYPE_SWAP;
 				break;
 			case OBJT_DEVICE:
 				kve->kve_type = KVME_TYPE_DEVICE;
 				break;
 			case OBJT_PHYS:
 				kve->kve_type = KVME_TYPE_PHYS;
 				break;
 			case OBJT_DEAD:
 				kve->kve_type = KVME_TYPE_DEAD;
 				break;
 			case OBJT_SG:
 				kve->kve_type = KVME_TYPE_SG;
 				break;
 			default:
 				kve->kve_type = KVME_TYPE_UNKNOWN;
 				break;
 			}
 			if (lobj != obj)
 				VM_OBJECT_RUNLOCK(lobj);
 
 			kve->kve_ref_count = obj->ref_count;
 			kve->kve_shadow_count = obj->shadow_count;
 			VM_OBJECT_RUNLOCK(obj);
 			if (vp != NULL) {
 				/*
 				 * The vn_fullpath() result is not checked;
 				 * fullpath was preset to "" above.
 				 */
 				vn_fullpath(curthread, vp, &fullpath,
 				    &freepath);
 				kve->kve_vn_type = vntype_to_kinfo(vp->v_type);
 				cred = curthread->td_ucred;
 				vn_lock(vp, LK_SHARED | LK_RETRY);
 				/*
 				 * Attributes are reported, and flagged valid
 				 * via kve_status, only if VOP_GETATTR()
 				 * succeeds.
 				 */
 				if (VOP_GETATTR(vp, &va, cred) == 0) {
 					kve->kve_vn_fileid = va.va_fileid;
 					kve->kve_vn_fsid = va.va_fsid;
 					kve->kve_vn_mode =
 					    MAKEIMODE(va.va_type, va.va_mode);
 					kve->kve_vn_size = va.va_size;
 					kve->kve_vn_rdev = va.va_rdev;
 					kve->kve_status = KF_ATTR_VALID;
 				}
 				vput(vp);
 			}
 		} else {
 			kve->kve_type = KVME_TYPE_NONE;
 			kve->kve_ref_count = 0;
 			kve->kve_shadow_count = 0;
 		}
 
 		strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
 		if (freepath != NULL)
 			free(freepath, M_TEMP);
 
 		/*
 		 * Pack record size down: emit only the bytes up to and
 		 * including the path's NUL, rounded up to a multiple of
 		 * sizeof(uint64_t).
 		 */
 		kve->kve_structsize = offsetof(struct kinfo_vmentry, kve_path) +
 		    strlen(kve->kve_path) + 1;
 		kve->kve_structsize = roundup(kve->kve_structsize,
 		    sizeof(uint64_t));
 		error = sbuf_bcat(sb, kve, kve->kve_structsize);
 		vm_map_lock_read(map);
 		if (error)
 			break;
 		/*
 		 * If the map changed while unlocked, re-find our position
 		 * from the last address covered by the entry just reported
 		 * (addr was advanced to the entry's end by the loop above).
 		 */
 		if (last_timestamp != map->timestamp) {
 			vm_map_lookup_entry(map, addr - 1, &tmp_entry);
 			entry = tmp_entry;
 		}
 	}
 	vm_map_unlock_read(map);
 	vmspace_free(vm);
 	PRELE(p);
 	free(kve, M_TEMP);
 	return (error);
 }
 
 static int
 sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	struct sbuf sb;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_vmentry), req);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	error = kern_proc_vmmap_out(p, &sb);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 #if defined(STACK) || defined(DDB)
 /*
  * Sysctl handler: emit one struct kinfo_kstack per thread of the process
  * whose pid is the single remaining name component.
  *
  * Returns EINVAL unless exactly one name component was supplied, the
  * pget() error on lookup failure, and otherwise the status of the last
  * SYSCTL_OUT() (0 if no record was written).
  */
 static int
 sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_kstack *kkstp;
 	int error, i, *name, numthreads;
 	lwpid_t *lwpidarray;
 	struct thread *td;
 	struct stack *st;
 	struct sbuf sb;
 	struct proc *p;
 	u_int namelen;
 
 	/*
 	 * name[0] is only meaningful when the caller supplied exactly one
 	 * trailing component; reject anything else as the sibling handlers
 	 * in this file do.  Checked before anything is allocated or held.
 	 */
 	namelen = arg2;
 	if (namelen != 1)
 		return (EINVAL);
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_NOTINEXEC | PGET_WANTREAD, &p);
 	if (error != 0)
 		return (error);
 
 	kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK);
 	st = stack_create();
 
 	/*
 	 * Size the tid array to the thread count.  The allocation is done
 	 * with the process unlocked, so the count is re-checked after
 	 * relocking and the array regrown if more threads appeared.
 	 */
 	lwpidarray = NULL;
 	numthreads = 0;
 	PROC_LOCK(p);
 repeat:
 	if (numthreads < p->p_numthreads) {
 		if (lwpidarray != NULL) {
 			free(lwpidarray, M_TEMP);
 			lwpidarray = NULL;
 		}
 		numthreads = p->p_numthreads;
 		PROC_UNLOCK(p);
 		lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP,
 		    M_WAITOK | M_ZERO);
 		PROC_LOCK(p);
 		goto repeat;
 	}
 	i = 0;
 
 	/*
 	 * XXXRW: During the below loop, execve(2) and countless other sorts
 	 * of changes could have taken place.  Should we check to see if the
 	 * vmspace has been replaced, or the like, in order to prevent
 	 * giving a snapshot that spans, say, execve(2), with some threads
 	 * before and some after?  Among other things, the credentials could
 	 * have changed, in which case the right to extract debug info might
 	 * no longer be assured.
 	 */
 	/* Snapshot the thread ids while the process lock is held. */
 	FOREACH_THREAD_IN_PROC(p, td) {
 		KASSERT(i < numthreads,
 		    ("sysctl_kern_proc_kstack: numthreads"));
 		lwpidarray[i] = td->td_tid;
 		i++;
 	}
 	numthreads = i;
 	for (i = 0; i < numthreads; i++) {
 		/*
 		 * The process lock is dropped on every iteration for the
 		 * copyout, so each thread is looked up again by id and
 		 * skipped if it can no longer be found.
 		 */
 		td = thread_find(p, lwpidarray[i]);
 		if (td == NULL) {
 			continue;
 		}
 		bzero(kkstp, sizeof(*kkstp));
 		(void)sbuf_new(&sb, kkstp->kkst_trace,
 		    sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN);
 		thread_lock(td);
 		kkstp->kkst_tid = td->td_tid;
 		/*
 		 * NOTE(review): st is refreshed only in the STACKOK branch;
 		 * for swapped or running threads stack_sbuf_print() below
 		 * formats whatever st held from an earlier iteration --
 		 * confirm consumers ignore kkst_trace in those states.
 		 */
 		if (TD_IS_SWAPPED(td))
 			kkstp->kkst_state = KKST_STATE_SWAPPED;
 		else if (TD_IS_RUNNING(td))
 			kkstp->kkst_state = KKST_STATE_RUNNING;
 		else {
 			kkstp->kkst_state = KKST_STATE_STACKOK;
 			stack_save_td(st, td);
 		}
 		thread_unlock(td);
 		PROC_UNLOCK(p);
 		stack_sbuf_print(&sb, st);
 		sbuf_finish(&sb);
 		sbuf_delete(&sb);
 		error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp));
 		PROC_LOCK(p);
 		if (error)
 			break;
 	}
 	_PRELE(p);
 	PROC_UNLOCK(p);
 	if (lwpidarray != NULL)
 		free(lwpidarray, M_TEMP);
 	stack_destroy(st);
 	free(kkstp, M_TEMP);
 	return (error);
 }
 #endif
 
 /*
  * This sysctl allows a process to retrieve the full list of groups from
  * itself or another process.
  */
 static int
 sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS)
 {
 	pid_t *pidp = (pid_t *)arg1;
 	unsigned int arglen = arg2;
 	struct proc *p;
 	struct ucred *cred;
 	int error;
 
 	if (arglen != 1)
 		return (EINVAL);
 	if (*pidp == -1) {	/* -1 means this process */
 		p = req->td->td_proc;
 	} else {
 		error = pget(*pidp, PGET_CANSEE, &p);
 		if (error != 0)
 			return (error);
 	}
 
 	cred = crhold(p->p_ucred);
 	if (*pidp != -1)
 		PROC_UNLOCK(p);
 
 	error = SYSCTL_OUT(req, cred->cr_groups,
 	    cred->cr_ngroups * sizeof(gid_t));
 	crfree(cred);
 	return (error);
 }
 
 /*
  * This sysctl allows a process to retrieve or/and set the resource limit for
  * another process.
  */
 static int
 sysctl_kern_proc_rlimit(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct rlimit rlim;
 	struct proc *p;
 	u_int which;
 	int flags, error;
 
 	if (namelen != 2)
 		return (EINVAL);
 
 	which = (u_int)name[1];
 	if (which >= RLIM_NLIMITS)
 		return (EINVAL);
 
 	if (req->newptr != NULL && req->newlen != sizeof(rlim))
 		return (EINVAL);
 
 	flags = PGET_HOLD | PGET_NOTWEXIT;
 	if (req->newptr != NULL)
 		flags |= PGET_CANDEBUG;
 	else
 		flags |= PGET_CANSEE;
 	error = pget((pid_t)name[0], flags, &p);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Retrieve limit.
 	 */
 	if (req->oldptr != NULL) {
 		PROC_LOCK(p);
 		lim_rlimit(p, which, &rlim);
 		PROC_UNLOCK(p);
 	}
 	error = SYSCTL_OUT(req, &rlim, sizeof(rlim));
 	if (error != 0)
 		goto errout;
 
 	/*
 	 * Set limit.
 	 */
 	if (req->newptr != NULL) {
 		error = SYSCTL_IN(req, &rlim, sizeof(rlim));
 		if (error == 0)
 			error = kern_proc_setrlimit(curthread, p, which, &rlim);
 	}
 
 errout:
 	PRELE(p);
 	return (error);
 }
 
 /*
  * This sysctl allows a process to retrieve ps_strings structure location of
  * another process.
  */
 static int
 sysctl_kern_proc_ps_strings(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	vm_offset_t ps_strings;
 	int error;
 #ifdef COMPAT_FREEBSD32
 	uint32_t ps_strings32;
 #endif
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
 	if (error != 0)
 		return (error);
 #ifdef COMPAT_FREEBSD32
 	if ((req->flags & SCTL_MASK32) != 0) {
 		/*
 		 * We return 0 if the 32 bit emulation request is for a 64 bit
 		 * process.
 		 */
 		ps_strings32 = SV_PROC_FLAG(p, SV_ILP32) != 0 ?
 		    PTROUT(p->p_sysent->sv_psstrings) : 0;
 		PROC_UNLOCK(p);
 		error = SYSCTL_OUT(req, &ps_strings32, sizeof(ps_strings32));
 		return (error);
 	}
 #endif
 	ps_strings = p->p_sysent->sv_psstrings;
 	PROC_UNLOCK(p);
 	error = SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings));
 	return (error);
 }
 
 /*
  * This sysctl allows a process to retrieve umask of another process.
  */
 static int
 sysctl_kern_proc_umask(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	int error;
 	u_short fd_cmask;
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = pget((pid_t)name[0], PGET_WANTREAD, &p);
 	if (error != 0)
 		return (error);
 
 	FILEDESC_SLOCK(p->p_fd);
 	fd_cmask = p->p_fd->fd_cmask;
 	FILEDESC_SUNLOCK(p->p_fd);
 	PRELE(p);
 	error = SYSCTL_OUT(req, &fd_cmask, sizeof(fd_cmask));
 	return (error);
 }
 
 /*
  * This sysctl allows a process to set and retrieve binary osreldate of
  * another process.
  */
 static int
 sysctl_kern_proc_osrel(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	int flags, error, osrel;
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	if (req->newptr != NULL && req->newlen != sizeof(osrel))
 		return (EINVAL);
 
 	flags = PGET_HOLD | PGET_NOTWEXIT;
 	if (req->newptr != NULL)
 		flags |= PGET_CANDEBUG;
 	else
 		flags |= PGET_CANSEE;
 	error = pget((pid_t)name[0], flags, &p);
 	if (error != 0)
 		return (error);
 
 	error = SYSCTL_OUT(req, &p->p_osrel, sizeof(p->p_osrel));
 	if (error != 0)
 		goto errout;
 
 	if (req->newptr != NULL) {
 		error = SYSCTL_IN(req, &osrel, sizeof(osrel));
 		if (error != 0)
 			goto errout;
 		if (osrel < 0) {
 			error = EINVAL;
 			goto errout;
 		}
 		p->p_osrel = osrel;
 	}
 errout:
 	PRELE(p);
 	return (error);
 }
 
 /*
  * Registration of the kern.proc sysctl tree.  Each entry binds a KERN_PROC_*
  * identifier and a leaf name to the handler that services it.
  */
 SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD,  0, "Process table");
 
 SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT|
 	CTLFLAG_MPSAFE, 0, 0, sysctl_kern_proc, "S,proc",
 	"Return entire process table");
 
 /* Process-table queries, all served by sysctl_kern_proc(). */
 static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Return process table, no threads");
 
 /* Per-process arguments, environment, auxv, executable path and ABI name. */
 static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args,
 	CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE,
 	sysctl_kern_proc_args, "Process argument list");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_ENV, env, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc_env, "Process environment");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_AUXV, auxv, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_auxv, "Process ELF auxiliary vector");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_pathname, "Process executable path");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_sv_name,
 	"Process syscall vector name (ABI type)");
 
 /*
  * The same process-table queries with KERN_PROC_INC_THREAD or'ed into the
  * identifier, registered under "_td" names.
  */
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD),
 	sid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 /*
  * NOTE(review): the description below says "no threads" although this is
  * the KERN_PROC_INC_THREAD variant -- confirm whether the text is intended.
  */
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc,
 	"Return process table, no threads");
 
 /* VM map, kernel stack, credential, limit and miscellaneous queries. */
 #ifdef COMPAT_FREEBSD7
 static SYSCTL_NODE(_kern_proc, KERN_PROC_OVMMAP, ovmmap, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_ovmmap, "Old Process vm map entries");
 #endif
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_vmmap, "Process vm map entries");
 
 #if defined(STACK) || defined(DDB)
 static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_kstack, "Process kernel stacks");
 #endif
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_GROUPS, groups, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_groups, "Process groups");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_RLIMIT, rlimit, CTLFLAG_RW |
 	CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_rlimit,
 	"Process resource limits");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PS_STRINGS, ps_strings, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_ps_strings,
 	"Process ps_strings location");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_UMASK, umask, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_umask, "Process umask");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_OSREL, osrel, CTLFLAG_RW |
 	CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_osrel,
 	"Process binary osreldate");
Index: head/sys/kern/kern_racct.c
===================================================================
--- head/sys/kern/kern_racct.c	(revision 258540)
+++ head/sys/kern/kern_racct.c	(revision 258541)
@@ -1,1291 +1,1290 @@
 /*-
  * Copyright (c) 2010 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/loginclass.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/umtx.h>
 #include <machine/smp.h>
 
 #ifdef RCTL
 #include <sys/rctl.h>
 #endif
 
 #ifdef RACCT
 
 FEATURE(racct, "Resource Accounting");
 
 /*
  * Do not block processes that have their %cpu usage <= pcpu_threshold.
  *
  * NOTE(review): the variable is declared int but exported through
  * SYSCTL_UINT below -- confirm the signedness mismatch is intended.
  */
 static int pcpu_threshold = 1;
 
 SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
 SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
     0, "Processes with higher %cpu usage than this value can be throttled.");
 
 /*
  * How many seconds it takes to use the scheduler %cpu calculations.  When a
  * process starts, we compute its %cpu usage by dividing its runtime by the
  * process wall clock time.  After RACCT_PCPU_SECS pass, we use the value
  * provided by the scheduler.
  */
 #define RACCT_PCPU_SECS		3
 
 /* File-local mutex "racct lock", initialized at boot via MTX_SYSINIT. */
 static struct mtx racct_lock;
 MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
 
 /* UMA zone handle; presumably backs struct racct allocations -- confirm. */
 static uma_zone_t racct_zone;
 
 /* Forward declarations of helpers defined later in this file. */
 static void racct_sub_racct(struct racct *dest, const struct racct *src);
 static void racct_sub_cred_locked(struct ucred *cred, int resource,
 		uint64_t amount);
 static void racct_add_cred_locked(struct ucred *cred, int resource,
 		uint64_t amount);
 
 /*
  * Statically defined tracing probes of the "racct" provider.  The rusage
  * probes take (subject, int, uint64_t) arguments; the racct probes take one
  * or two struct racct pointers.
  */
 SDT_PROVIDER_DEFINE(racct);
 SDT_PROBE_DEFINE3(racct, kernel, rusage, add, add, "struct proc *", "int",
     "uint64_t");
 SDT_PROBE_DEFINE3(racct, kernel, rusage, add_failure, add-failure,
     "struct proc *", "int", "uint64_t");
 SDT_PROBE_DEFINE3(racct, kernel, rusage, add_cred, add-cred, "struct ucred *",
     "int", "uint64_t");
 SDT_PROBE_DEFINE3(racct, kernel, rusage, add_force, add-force, "struct proc *",
     "int", "uint64_t");
 SDT_PROBE_DEFINE3(racct, kernel, rusage, set, set, "struct proc *", "int",
     "uint64_t");
 SDT_PROBE_DEFINE3(racct, kernel, rusage, set_failure, set-failure,
     "struct proc *", "int", "uint64_t");
 SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, sub, "struct proc *", "int",
     "uint64_t");
 SDT_PROBE_DEFINE3(racct, kernel, rusage, sub_cred, sub-cred, "struct ucred *",
     "int", "uint64_t");
 SDT_PROBE_DEFINE1(racct, kernel, racct, create, create, "struct racct *");
 SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, destroy, "struct racct *");
 SDT_PROBE_DEFINE2(racct, kernel, racct, join, join, "struct racct *",
     "struct racct *");
 SDT_PROBE_DEFINE2(racct, kernel, racct, join_failure, join-failure,
     "struct racct *", "struct racct *");
 SDT_PROBE_DEFINE2(racct, kernel, racct, leave, leave, "struct racct *",
     "struct racct *");
 
 /*
  * Property flags for each accounted resource, indexed by RACCT_* resource
  * identifier.  Each element is an OR of RACCT_* property bits
  * (RECLAIMABLE, INHERITABLE, DENIABLE, SLOPPY, DECAYING, IN_MILLIONS).
  */
 int racct_types[] = {
 	[RACCT_CPU] =
 		RACCT_IN_MILLIONS,
 	[RACCT_DATA] =
 		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
 	[RACCT_STACK] =
 		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
 	[RACCT_CORE] =
 		RACCT_DENIABLE,
 	[RACCT_RSS] =
 		RACCT_RECLAIMABLE,
 	[RACCT_MEMLOCK] =
 		RACCT_RECLAIMABLE | RACCT_DENIABLE,
 	[RACCT_NPROC] =
 		RACCT_RECLAIMABLE | RACCT_DENIABLE,
 	[RACCT_NOFILE] =
 		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
 	[RACCT_VMEM] =
 		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
 	[RACCT_NPTS] =
 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
 	[RACCT_SWAP] =
 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
 	[RACCT_NTHR] =
 		RACCT_RECLAIMABLE | RACCT_DENIABLE,
 	[RACCT_MSGQQUEUED] =
 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
 	[RACCT_MSGQSIZE] =
 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
 	[RACCT_NMSGQ] =
 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
 	[RACCT_NSEM] =
 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
 	[RACCT_NSEMOP] =
 		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
 	[RACCT_NSHM] =
 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
 	[RACCT_SHMSIZE] =
 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
 	[RACCT_WALLCLOCK] =
 		RACCT_IN_MILLIONS,
 	[RACCT_PCTCPU] =
 		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
 
 /* The factor 0.3 expressed in fixpt_t fixed point (scaled by FSCALE). */
 static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
 
 #ifdef SCHED_4BSD
 /*
  * Contains intermediate values for %cpu calculations to avoid using floating
  * point in the kernel.
  * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
  * It is needed only for the 4BSD scheduler, because in ULE ccpu is zero, so
  * the calculations are more straightforward.
  *
  * The table holds entries for k = 0 through 110; CCPU_EXP_MAX, defined
  * after the table, names that last index.
  */
 fixpt_t ccpu_exp[] = {
 	[0] = FSCALE * 1,
 	[1] = FSCALE * 0.95122942450071400909,
 	[2] = FSCALE * 0.90483741803595957316,
 	[3] = FSCALE * 0.86070797642505780722,
 	[4] = FSCALE * 0.81873075307798185866,
 	[5] = FSCALE * 0.77880078307140486824,
 	[6] = FSCALE * 0.74081822068171786606,
 	[7] = FSCALE * 0.70468808971871343435,
 	[8] = FSCALE * 0.67032004603563930074,
 	[9] = FSCALE * 0.63762815162177329314,
 	[10] = FSCALE * 0.60653065971263342360,
 	[11] = FSCALE * 0.57694981038048669531,
 	[12] = FSCALE * 0.54881163609402643262,
 	[13] = FSCALE * 0.52204577676101604789,
 	[14] = FSCALE * 0.49658530379140951470,
 	[15] = FSCALE * 0.47236655274101470713,
 	[16] = FSCALE * 0.44932896411722159143,
 	[17] = FSCALE * 0.42741493194872666992,
 	[18] = FSCALE * 0.40656965974059911188,
 	[19] = FSCALE * 0.38674102345450120691,
 	[20] = FSCALE * 0.36787944117144232159,
 	[21] = FSCALE * 0.34993774911115535467,
 	[22] = FSCALE * 0.33287108369807955328,
 	[23] = FSCALE * 0.31663676937905321821,
 	[24] = FSCALE * 0.30119421191220209664,
 	[25] = FSCALE * 0.28650479686019010032,
 	[26] = FSCALE * 0.27253179303401260312,
 	[27] = FSCALE * 0.25924026064589150757,
 	[28] = FSCALE * 0.24659696394160647693,
 	[29] = FSCALE * 0.23457028809379765313,
 	[30] = FSCALE * 0.22313016014842982893,
 	[31] = FSCALE * 0.21224797382674305771,
 	[32] = FSCALE * 0.20189651799465540848,
 	[33] = FSCALE * 0.19204990862075411423,
 	[34] = FSCALE * 0.18268352405273465022,
 	[35] = FSCALE * 0.17377394345044512668,
 	[36] = FSCALE * 0.16529888822158653829,
 	[37] = FSCALE * 0.15723716631362761621,
 	[38] = FSCALE * 0.14956861922263505264,
 	[39] = FSCALE * 0.14227407158651357185,
 	[40] = FSCALE * 0.13533528323661269189,
 	[41] = FSCALE * 0.12873490358780421886,
 	[42] = FSCALE * 0.12245642825298191021,
 	[43] = FSCALE * 0.11648415777349695786,
 	[44] = FSCALE * 0.11080315836233388333,
 	[45] = FSCALE * 0.10539922456186433678,
 	[46] = FSCALE * 0.10025884372280373372,
 	[47] = FSCALE * 0.09536916221554961888,
 	[48] = FSCALE * 0.09071795328941250337,
 	[49] = FSCALE * 0.08629358649937051097,
 	[50] = FSCALE * 0.08208499862389879516,
 	[51] = FSCALE * 0.07808166600115315231,
 	[52] = FSCALE * 0.07427357821433388042,
 	[53] = FSCALE * 0.07065121306042958674,
 	[54] = FSCALE * 0.06720551273974976512,
 	[55] = FSCALE * 0.06392786120670757270,
 	[56] = FSCALE * 0.06081006262521796499,
 	[57] = FSCALE * 0.05784432087483846296,
 	[58] = FSCALE * 0.05502322005640722902,
 	[59] = FSCALE * 0.05233970594843239308,
 	[60] = FSCALE * 0.04978706836786394297,
 	[61] = FSCALE * 0.04735892439114092119,
 	[62] = FSCALE * 0.04504920239355780606,
 	[63] = FSCALE * 0.04285212686704017991,
 	[64] = FSCALE * 0.04076220397836621516,
 	[65] = FSCALE * 0.03877420783172200988,
 	[66] = FSCALE * 0.03688316740124000544,
 	[67] = FSCALE * 0.03508435410084502588,
 	[68] = FSCALE * 0.03337326996032607948,
 	[69] = FSCALE * 0.03174563637806794323,
 	[70] = FSCALE * 0.03019738342231850073,
 	[71] = FSCALE * 0.02872463965423942912,
 	[72] = FSCALE * 0.02732372244729256080,
 	[73] = FSCALE * 0.02599112877875534358,
 	[74] = FSCALE * 0.02472352647033939120,
 	[75] = FSCALE * 0.02351774585600910823,
 	[76] = FSCALE * 0.02237077185616559577,
 	[77] = FSCALE * 0.02127973643837716938,
 	[78] = FSCALE * 0.02024191144580438847,
 	[79] = FSCALE * 0.01925470177538692429,
 	[80] = FSCALE * 0.01831563888873418029,
 	[81] = FSCALE * 0.01742237463949351138,
 	[82] = FSCALE * 0.01657267540176124754,
 	[83] = FSCALE * 0.01576441648485449082,
 	[84] = FSCALE * 0.01499557682047770621,
 	[85] = FSCALE * 0.01426423390899925527,
 	[86] = FSCALE * 0.01356855901220093175,
 	[87] = FSCALE * 0.01290681258047986886,
 	[88] = FSCALE * 0.01227733990306844117,
 	[89] = FSCALE * 0.01167856697039544521,
 	[90] = FSCALE * 0.01110899653824230649,
 	[91] = FSCALE * 0.01056720438385265337,
 	[92] = FSCALE * 0.01005183574463358164,
 	[93] = FSCALE * 0.00956160193054350793,
 	[94] = FSCALE * 0.00909527710169581709,
 	[95] = FSCALE * 0.00865169520312063417,
 	[96] = FSCALE * 0.00822974704902002884,
 	[97] = FSCALE * 0.00782837754922577143,
 	[98] = FSCALE * 0.00744658307092434051,
 	[99] = FSCALE * 0.00708340892905212004,
 	[100] = FSCALE * 0.00673794699908546709,
 	[101] = FSCALE * 0.00640933344625638184,
 	[102] = FSCALE * 0.00609674656551563610,
 	[103] = FSCALE * 0.00579940472684214321,
 	[104] = FSCALE * 0.00551656442076077241,
 	[105] = FSCALE * 0.00524751839918138427,
 	[106] = FSCALE * 0.00499159390691021621,
 	[107] = FSCALE * 0.00474815099941147558,
 	[108] = FSCALE * 0.00451658094261266798,
 	[109] = FSCALE * 0.00429630469075234057,
 	[110] = FSCALE * 0.00408677143846406699,
 };
 #endif
 
 /* Highest index present in ccpu_exp[] above. */
 #define	CCPU_EXP_MAX	110
 
 /*
  * This function is analogous to the getpcpu() function in the ps(1) command.
  * They should both calculate in the same way so that the racct %cpu
  * calculations are consistent with the values shown by the ps(1) tool.
  * The calculations are more complex in the 4BSD scheduler because of the value
  * of the ccpu variable.  In ULE it is defined to be zero which saves us some
  * work.
  */
 /*
  * Arguments: 'p' is the process to inspect; 'pcpu' is the caller's own %cpu
  * estimate, which is returned unchanged when the process has been in memory
  * for less than RACCT_PCPU_SECS seconds.
  * Returns 0 for a process without P_INMEM; otherwise the summed per-thread
  * sched_pctcpu() values converted from FSCALE fixed point to percent
  * multiplied by 1000000.
  */
 static uint64_t
 racct_getpcpu(struct proc *p, u_int pcpu)
 {
 	u_int swtime;
 #ifdef SCHED_4BSD
 	fixpt_t pctcpu, pctcpu_next;
 #endif
 #ifdef SMP
 	struct pcpu *pc;
 	int found;
 #endif
 	fixpt_t p_pctcpu;
 	struct thread *td;
 
 	/*
 	 * If the process is swapped out, we count its %cpu usage as zero.
 	 * This behaviour is consistent with the userland ps(1) tool.
 	 */
 	if ((p->p_flag & P_INMEM) == 0)
 		return (0);
 	/* Whole seconds elapsed since p_swtick, measured in scheduler ticks. */
 	swtime = (ticks - p->p_swtick) / hz;
 
 	/*
 	 * For short-lived processes, the sched_pctcpu() returns small
 	 * values even for cpu intensive processes.  Therefore we use
 	 * our own estimate in this case.
 	 */
 	if (swtime < RACCT_PCPU_SECS)
 		return (pcpu);
 
 	p_pctcpu = 0;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		/* Idle threads are not counted: skip this CPU's idle thread... */
 		if (td == PCPU_GET(idlethread))
 			continue;
 #ifdef SMP
 		/* ...and, on SMP, the idle thread of every other CPU as well. */
 		found = 0;
 		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 			if (td == pc->pc_idlethread) {
 				found = 1;
 				break;
 			}
 		}
 		if (found)
 			continue;
 #endif
 		thread_lock(td);
 #ifdef SCHED_4BSD
 		pctcpu = sched_pctcpu(td);
 		/* Count also the yet unfinished second. */
 		pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
 		pctcpu_next += sched_pctcpu_delta(td);
 		/* Take whichever of the two estimates is larger. */
 		p_pctcpu += max(pctcpu, pctcpu_next);
 #else
 		/*
 		 * In ULE the %cpu statistics are updated on every
 		 * sched_pctcpu() call.  So special calculations to
 		 * account for the latest (unfinished) second are
 		 * not needed.
 		 */
 		p_pctcpu += sched_pctcpu(td);
 #endif
 		thread_unlock(td);
 	}
 
 #ifdef SCHED_4BSD
 	/*
 	 * Within the range covered by the ccpu_exp[] table, divide by
 	 * (FSCALE - ccpu_exp[swtime]) instead of FSCALE.  swtime is at least
 	 * RACCT_PCPU_SECS here because of the early return above.
 	 */
 	if (swtime <= CCPU_EXP_MAX)
 		return ((100 * (uint64_t)p_pctcpu * 1000000) /
 		    (FSCALE - ccpu_exp[swtime]));
 #endif
 
 	return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
 }
 
 /*
  * Add every resource counter of 'src' to the matching counter of 'dest'.
  * The caller must hold racct_lock.
  */
 static void
 racct_add_racct(struct racct *dest, const struct racct *src)
 {
 	int res;
 
 	mtx_assert(&racct_lock, MA_OWNED);
 
 	res = 0;
 	while (res <= RACCT_MAX) {
 		/* Neither side may already be negative before the merge. */
 		KASSERT(dest->r_resources[res] >= 0,
 		    ("%s: resource %d propagation meltdown: dest < 0",
 		    __func__, res));
 		KASSERT(src->r_resources[res] >= 0,
 		    ("%s: resource %d propagation meltdown: src < 0",
 		    __func__, res));
 		dest->r_resources[res] += src->r_resources[res];
 		res++;
 	}
 }
 
 /*
  * Subtract the counters of 'src' from 'dest' for every resource that is
  * allowed to drop.  The caller must hold racct_lock.
  */
 static void
 racct_sub_racct(struct racct *dest, const struct racct *src)
 {
 	int res;
 
 	mtx_assert(&racct_lock, MA_OWNED);
 
 	for (res = 0; res <= RACCT_MAX; res++) {
 		/*
 		 * Exact bookkeeping is only asserted for resources that are
 		 * neither sloppy nor decaying.
 		 */
 		if (!(RACCT_IS_SLOPPY(res) || RACCT_IS_DECAYING(res))) {
 			KASSERT(dest->r_resources[res] >= 0,
 			    ("%s: resource %d propagation meltdown: dest < 0",
 			    __func__, res));
 			KASSERT(src->r_resources[res] >= 0,
 			    ("%s: resource %d propagation meltdown: src < 0",
 			    __func__, res));
 			KASSERT(src->r_resources[res] <= dest->r_resources[res],
 			    ("%s: resource %d propagation meltdown: src > dest",
 			    __func__, res));
 		}
 
 		/* Resources that cannot drop are left untouched. */
 		if (!RACCT_CAN_DROP(res))
 			continue;
 
 		dest->r_resources[res] -= src->r_resources[res];
 		if (dest->r_resources[res] >= 0)
 			continue;
 
 		/* Went negative: tolerated only for sloppy/decaying ones. */
 		KASSERT(RACCT_IS_SLOPPY(res) ||
 		    RACCT_IS_DECAYING(res),
 		    ("%s: resource %d usage < 0", __func__, res));
 		dest->r_resources[res] = 0;
 	}
 }
 
 /*
  * Allocate a zero-filled racct from racct_zone and store it in '*racctp'.
  * '*racctp' must be NULL on entry.  The allocation uses M_WAITOK.
  */
 void
 racct_create(struct racct **racctp)
 {
 
 	SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0);
 
 	KASSERT(*racctp == NULL, ("racct already allocated"));
 
 	*racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
 }
 
 static void
 racct_destroy_locked(struct racct **racctp)
 {
 	int i;
 	struct racct *racct;
 
 	SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0);
 
 	mtx_assert(&racct_lock, MA_OWNED);
 	KASSERT(racctp != NULL, ("NULL racctp"));
 	KASSERT(*racctp != NULL, ("NULL racct"));
 
 	racct = *racctp;
 
 	for (i = 0; i <= RACCT_MAX; i++) {
 		if (RACCT_IS_SLOPPY(i))
 			continue;
 		if (!RACCT_IS_RECLAIMABLE(i))
 			continue;
 		KASSERT(racct->r_resources[i] == 0,
 		    ("destroying non-empty racct: "
 		    "%ju allocated for resource %d\n",
 		    racct->r_resources[i], i));
 	}
 	uma_zfree(racct_zone, racct);
 	*racctp = NULL;
 }
 
 /*
  * Locked wrapper: takes racct_lock around racct_destroy_locked(), which
  * frees '*racct' and sets it to NULL.
  */
 void
 racct_destroy(struct racct **racct)
 {
 
 	mtx_lock(&racct_lock);
 	racct_destroy_locked(racct);
 	mtx_unlock(&racct_lock);
 }
 
 /*
  * Increase consumption of 'resource' by 'amount' for 'racct'.
  * Unlike in the other functions here, 'amount' may be less than zero:
  * callers pass a negated value to decrease the consumption.
  */
 static void
 racct_alloc_resource(struct racct *racct, int resource,
     uint64_t amount)
 {
 
 	mtx_assert(&racct_lock, MA_OWNED);
 	KASSERT(racct != NULL, ("NULL racct"));
 
 	/* Apply the change, then clamp a negative result back to zero. */
 	racct->r_resources[resource] += amount;
 	if (racct->r_resources[resource] < 0) {
 		KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
 		    ("%s: resource %d usage < 0", __func__, resource));
 		racct->r_resources[resource] = 0;
 	}
 
 	/* Only the %cpu resource has an upper bound applied below. */
 	if (resource != RACCT_PCTCPU)
 		return;
 
 	/*
 	 * The racct %cpu resource can grow beyond 100% in some cases.
 	 * racct_proc_exit() adds the %cpu usage of the exiting process to
 	 * the ucred racct containers, so many processes terminating within
 	 * a short time span could inflate the ucred %cpu resource.  The 4BSD
 	 * scheduler also sometimes reports more than 100% cpu usage for a
 	 * thread.  Hence the value is capped at 100%.
 	 */
 	if (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000)
 		racct->r_resources[RACCT_PCTCPU] = 100 * 1000000;
 }
 
 /*
  * Charge 'amount' of 'resource' to process 'p' and to the containers of its
  * credential.  The process lock must be held; racct_lock must be held too,
  * since racct_alloc_resource() asserts it.
  * With RCTL, rctl_enforce() is consulted first: if it reports an error and
  * the resource is deniable, that error is returned and nothing is charged.
  * Returns 0 otherwise.
  */
 static int
 racct_add_locked(struct proc *p, int resource, uint64_t amount)
 {
 #ifdef RCTL
 	int error;
 #endif
 
 	SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0);
 
 	/*
 	 * We need proc lock to dereference p->p_ucred.
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 #ifdef RCTL
 	error = rctl_enforce(p, resource, amount);
 	if (error && RACCT_IS_DENIABLE(resource)) {
 		SDT_PROBE(racct, kernel, rusage, add_failure, p, resource,
 		    amount, 0, 0);
 		return (error);
 	}
 #endif
 	/* Charge the process itself, then every container of its ucred. */
 	racct_alloc_resource(p->p_racct, resource, amount);
 	racct_add_cred_locked(p->p_ucred, resource, amount);
 
 	return (0);
 }
 
 /*
  * Increase allocation of 'resource' by 'amount' for process 'p'.
  * Return 0 if it's below limits, or errno, if it's not.
  */
 int
 racct_add(struct proc *p, int resource, uint64_t amount)
 {
 	int error;
 
 	/* racct_add_locked() does the work and needs racct_lock held. */
 	mtx_lock(&racct_lock);
 	error = racct_add_locked(p, resource, amount);
 	mtx_unlock(&racct_lock);
 	return (error);
 }
 
 /*
  * Charge 'amount' of 'resource' to every container reachable from 'cred':
  * the real-uid container, the prison and each of its ancestors, and the
  * login class.  racct_lock must be held (racct_alloc_resource() asserts it).
  */
 static void
 racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
 {
 	struct prison *jail;
 
 	SDT_PROBE(racct, kernel, rusage, add_cred, cred, resource, amount,
 	    0, 0);
 
 	racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
 
 	/* Walk from the credential's prison up through all its parents. */
 	jail = cred->cr_prison;
 	while (jail != NULL) {
 		racct_alloc_resource(jail->pr_prison_racct->prr_racct,
 		    resource, amount);
 		jail = jail->pr_parent;
 	}
 
 	racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount);
 }
 
 /*
  * Increase allocation of 'resource' by 'amount' for credential 'cred'.
  * Doesn't check for limits and never fails.
  *
  * XXX: Shouldn't this ever return an error?
  */
 void
 racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
 {
 
 	/* Locked wrapper around racct_add_cred_locked(). */
 	mtx_lock(&racct_lock);
 	racct_add_cred_locked(cred, resource, amount);
 	mtx_unlock(&racct_lock);
 }
 
 /*
  * Increase allocation of 'resource' by 'amount' for process 'p'.
  * Doesn't check for limits and never fails.
  */
 void
 racct_add_force(struct proc *p, int resource, uint64_t amount)
 {
 
 	SDT_PROBE(racct, kernel, rusage, add_force, p, resource, amount, 0, 0);
 
 	/*
 	 * We need proc lock to dereference p->p_ucred.
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * Update the process and its credential containers under a single
 	 * hold of racct_lock, so that no other thread can observe (or
 	 * modify) the counters between the two updates.
 	 */
 	mtx_lock(&racct_lock);
 	racct_alloc_resource(p->p_racct, resource, amount);
 	racct_add_cred_locked(p->p_ucred, resource, amount);
 	mtx_unlock(&racct_lock);
 }
 
 /*
  * Set the usage of 'resource' for process 'p' to 'amount' and propagate the
  * difference to the containers of its credential.  The process lock must be
  * held; racct_lock must be held too (racct_alloc_resource() asserts it).
  * With RCTL, an increase is first checked with rctl_enforce(); if that
  * reports an error and the resource is deniable, the error is returned and
  * nothing is changed.  Returns 0 otherwise.
  */
 static int
 racct_set_locked(struct proc *p, int resource, uint64_t amount)
 {
 	int64_t old_amount, decayed_amount;
 	int64_t diff_proc, diff_cred;
 #ifdef RCTL
 	int error;
 #endif
 
 	SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
 
 	/*
 	 * We need proc lock to dereference p->p_ucred.
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	old_amount = p->p_racct->r_resources[resource];
 	/*
 	 * The diffs may be negative.
 	 */
 	diff_proc = amount - old_amount;
 	if (RACCT_IS_DECAYING(resource)) {
 		/*
 		 * Resources in per-credential racct containers may decay.
 		 * If this is the case, we need to calculate the difference
 		 * between the new amount and the proportional value of the
 		 * old amount that has decayed in the ucred racct containers.
 		 */
 		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
 		diff_cred = amount - decayed_amount;
 	} else
 		diff_cred = diff_proc;
 #ifdef notyet
 	KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
 	    ("%s: usage of non-droppable resource %d dropping", __func__,
 	     resource));
 #endif
 #ifdef RCTL
 	/* Only an increase of the process usage is subject to enforcement. */
 	if (diff_proc > 0) {
 		error = rctl_enforce(p, resource, diff_proc);
 		if (error && RACCT_IS_DENIABLE(resource)) {
 			SDT_PROBE(racct, kernel, rusage, set_failure, p,
 			    resource, amount, 0, 0);
 			return (error);
 		}
 	}
 #endif
 	racct_alloc_resource(p->p_racct, resource, diff_proc);
 	/* The *_cred_locked() helpers take a magnitude; the sign picks one. */
 	if (diff_cred > 0)
 		racct_add_cred_locked(p->p_ucred, resource, diff_cred);
 	else if (diff_cred < 0)
 		racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
 
 	return (0);
 }
 
 /*
  * Set allocation of 'resource' to 'amount' for process 'p'.
  * Return 0 if it's below limits, or errno, if it's not.
  *
  * Note that decreasing the allocation always returns 0,
  * even if it's above the limit.
  */
 int
 racct_set(struct proc *p, int resource, uint64_t amount)
 {
 	int error;
 
 	/* Locked wrapper around racct_set_locked(). */
 	mtx_lock(&racct_lock);
 	error = racct_set_locked(p, resource, amount);
 	mtx_unlock(&racct_lock);
 	return (error);
 }
 
 /*
  * Set the usage of 'resource' for process 'p' to 'amount' without consulting
  * any limits, and propagate the change to the containers of its credential.
  * Both the process lock and racct_lock must be held.
  */
 static void
 racct_set_force_locked(struct proc *p, int resource, uint64_t amount)
 {
 	int64_t prev, decayed;
 	int64_t delta_proc, delta_cred;
 
 	SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
 
 	/*
 	 * We need proc lock to dereference p->p_ucred.
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	prev = p->p_racct->r_resources[resource];
 
 	/* Either delta may come out negative. */
 	delta_proc = amount - prev;
 	delta_cred = delta_proc;
 	if (RACCT_IS_DECAYING(resource)) {
 		/*
 		 * The per-credential containers may have decayed in the
 		 * meantime, so compare the new amount against the decayed
 		 * share of the old amount rather than the old amount itself.
 		 */
 		decayed = prev * RACCT_DECAY_FACTOR / FSCALE;
 		delta_cred = amount - decayed;
 	}
 
 	racct_alloc_resource(p->p_racct, resource, delta_proc);
 
 	if (delta_cred > 0) {
 		racct_add_cred_locked(p->p_ucred, resource, delta_cred);
 	} else if (delta_cred < 0) {
 		racct_sub_cred_locked(p->p_ucred, resource, -delta_cred);
 	}
 }
 
 /*
  * Set allocation of 'resource' to 'amount' for process 'p' without checking
  * limits.  Locked wrapper around racct_set_force_locked().
  */
 void
 racct_set_force(struct proc *p, int resource, uint64_t amount)
 {
 	mtx_lock(&racct_lock);
 	racct_set_force_locked(p, resource, amount);
 	mtx_unlock(&racct_lock);
 }
 
 /*
  * Returns amount of 'resource' the process 'p' can keep allocated.
  * Allocating more than that would be denied, unless the resource
  * is marked undeniable.  Amount of already allocated resource does
  * not matter.
  */
 uint64_t
 racct_get_limit(struct proc *p, int resource)
 {
 
 #ifdef RCTL
 	return (rctl_get_limit(p, resource));
 #else
 	/* Without RCTL compiled in, report the largest possible value. */
 	return (UINT64_MAX);
 #endif
 }
 
 /*
  * Returns amount of 'resource' the process 'p' can keep allocated.
  * Allocating more than that would be denied, unless the resource
  * is marked undeniable.  Amount of already allocated resource does
  * matter.
  */
 uint64_t
 racct_get_available(struct proc *p, int resource)
 {
 
 #ifdef RCTL
 	return (rctl_get_available(p, resource));
 #else
 	/* Without RCTL compiled in, report the largest possible value. */
 	return (UINT64_MAX);
 #endif
 }
 
 /*
  * Returns amount of the %cpu resource that process 'p' can add to its %cpu
  * utilization.  Adding more than that would lead to the process being
  * throttled.
  */
 static int64_t
 racct_pcpu_available(struct proc *p)
 {
 
 #ifdef RCTL
 	return (rctl_pcpu_available(p));
 #else
 	/* Without RCTL compiled in, report the largest possible value. */
 	return (INT64_MAX);
 #endif
 }
 
 /*
  * Decrease allocation of 'resource' by 'amount' for process 'p'.
  */
 void
 racct_sub(struct proc *p, int resource, uint64_t amount)
 {
 
 	SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0);
 
 	/*
 	 * We need proc lock to dereference p->p_ucred.
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(RACCT_CAN_DROP(resource),
 	    ("%s: called for non-droppable resource %d", __func__, resource));
 
 	mtx_lock(&racct_lock);
 	/* The process must not release more than it currently accounts for. */
 	KASSERT(amount <= p->p_racct->r_resources[resource],
 	    ("%s: freeing %ju of resource %d, which is more "
 	     "than allocated %jd for %s (pid %d)", __func__, amount, resource,
 	    (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
 
 	/*
 	 * racct_alloc_resource() adds its argument, so the negated (wrapped)
 	 * unsigned amount is passed to obtain a subtraction.
 	 */
 	racct_alloc_resource(p->p_racct, resource, -amount);
 	racct_sub_cred_locked(p->p_ucred, resource, amount);
 	mtx_unlock(&racct_lock);
 }
 
 /*
  * Remove 'amount' of 'resource' from every container reachable from 'cred':
  * the real-uid container, the prison and each of its ancestors, and the
  * login class.  racct_lock must be held (racct_alloc_resource() asserts it).
  */
 static void
 racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
 {
 	struct prison *jail;
 
 	SDT_PROBE(racct, kernel, rusage, sub_cred, cred, resource, amount,
 	    0, 0);
 
 #ifdef notyet
 	KASSERT(RACCT_CAN_DROP(resource),
 	    ("%s: called for resource %d which can not drop", __func__,
 	     resource));
 #endif
 
 	/* racct_alloc_resource() adds, so hand it the negated amount. */
 	racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
 
 	jail = cred->cr_prison;
 	while (jail != NULL) {
 		racct_alloc_resource(jail->pr_prison_racct->prr_racct,
 		    resource, -amount);
 		jail = jail->pr_parent;
 	}
 
 	racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount);
 }
 
 /*
  * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
  */
 void
 racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
 {
 
 	/* Locked wrapper around racct_sub_cred_locked(). */
 	mtx_lock(&racct_lock);
 	racct_sub_cred_locked(cred, resource, amount);
 	mtx_unlock(&racct_lock);
 }
 
 /*
  * Inherit resource usage information from the parent process.
  */
 int
 racct_proc_fork(struct proc *parent, struct proc *child)
 {
 	int i, error = 0;
 
 	/*
 	 * Create racct for the child process.
 	 */
 	racct_create(&child->p_racct);
 
 	PROC_LOCK(parent);
 	PROC_LOCK(child);
 	mtx_lock(&racct_lock);
 
 #ifdef RCTL
 	error = rctl_proc_fork(parent, child);
 	if (error != 0)
 		goto out;
 #endif
 
 	/* Init process cpu time. */
 	child->p_prev_runtime = 0;
 	child->p_throttled = 0;
 
 	/*
 	 * Inherit resource usage.
 	 */
 	for (i = 0; i <= RACCT_MAX; i++) {
 		if (parent->p_racct->r_resources[i] == 0 ||
 		    !RACCT_IS_INHERITABLE(i))
 			continue;
 
 		error = racct_set_locked(child, i,
 		    parent->p_racct->r_resources[i]);
 		if (error != 0)
 			goto out;
 	}
 
 	error = racct_add_locked(child, RACCT_NPROC, 1);
 	error += racct_add_locked(child, RACCT_NTHR, 1);
 
 out:
 	mtx_unlock(&racct_lock);
 	PROC_UNLOCK(child);
 	PROC_UNLOCK(parent);
 
 	if (error != 0)
 		racct_proc_exit(child);
 
 	return (error);
 }
 
 /*
  * Called at the end of fork1(), to handle rules that require the process
  * to be fully initialized.
  */
 void
 racct_proc_fork_done(struct proc *child)
 {
 
 #ifdef RCTL
 	PROC_LOCK(child);
 	mtx_lock(&racct_lock);
 	/*
 	 * Re-run enforcement with a zero increment; the return values are
 	 * deliberately ignored.
 	 */
 	rctl_enforce(child, RACCT_NPROC, 0);
 	rctl_enforce(child, RACCT_NTHR, 0);
 	mtx_unlock(&racct_lock);
 	PROC_UNLOCK(child);
 #endif
 }
 
 /*
  * Final accounting for process 'p': record its total CPU time, add its last
  * %cpu figure to the containers of its credential, set every reclaimable
  * resource of the process back to zero and destroy its racct.
  */
 void
 racct_proc_exit(struct proc *p)
 {
 	int i;
 	uint64_t runtime;
 	struct timeval wallclock;
 	uint64_t pct_estimate, pct;
 
 	PROC_LOCK(p);
 	/*
 	 * We don't need to calculate rux, proc_reap() has already done this.
 	 */
 	runtime = cputick2usec(p->p_rux.rux_runtime);
 #ifdef notyet
 	KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
 #else
 	/* Never let the reported runtime go backwards. */
 	if (runtime < p->p_prev_runtime)
 		runtime = p->p_prev_runtime;
 #endif
 	/* Time elapsed since the process was started. */
 	microuptime(&wallclock);
 	timevalsub(&wallclock, &p->p_stats->p_start);
 	/*
 	 * Estimate: CPU time over elapsed time, as percent times 1000000.
 	 * NOTE(review): 1000000 * runtime * 100 is evaluated in 64-bit
 	 * unsigned arithmetic and wraps for very large runtimes -- confirm
 	 * whether that can matter here.
 	 */
 	if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
 		pct_estimate = (1000000 * runtime * 100) /
 		    ((uint64_t)wallclock.tv_sec * 1000000 +
 		    wallclock.tv_usec);
 	} else
 		pct_estimate = 0;
 	pct = racct_getpcpu(p, pct_estimate);
 
 	mtx_lock(&racct_lock);
 	racct_set_locked(p, RACCT_CPU, runtime);
 	/* Only the credential containers receive the final %cpu value. */
 	racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
 
 	/* Give back everything reclaimable that the process still holds. */
 	for (i = 0; i <= RACCT_MAX; i++) {
 		if (p->p_racct->r_resources[i] == 0)
 			continue;
 	    	if (!RACCT_IS_RECLAIMABLE(i))
 			continue;
 		racct_set_locked(p, i, 0);
 	}
 
 	mtx_unlock(&racct_lock);
 	PROC_UNLOCK(p);
 
 #ifdef RCTL
 	rctl_racct_release(p->p_racct);
 #endif
 	racct_destroy(&p->p_racct);
 }
 
 /*
  * Called after credentials change, to move resource utilisation
  * between raccts.
  */
 void
 racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
     struct ucred *newcred)
 {
 	struct uidinfo *olduip, *newuip;
 	struct loginclass *oldlc, *newlc;
 	struct prison *oldpr, *newpr, *pr;
 
 	/* The caller must not hold the process lock. */
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 
 	newuip = newcred->cr_ruidinfo;
 	olduip = oldcred->cr_ruidinfo;
 	newlc = newcred->cr_loginclass;
 	oldlc = oldcred->cr_loginclass;
 	newpr = newcred->cr_prison;
 	oldpr = oldcred->cr_prison;
 
 	/*
 	 * For each kind of container that differs between the two
 	 * credentials, take the process usage out of the old container and
 	 * add it to the new one.
 	 */
 	mtx_lock(&racct_lock);
 	if (newuip != olduip) {
 		racct_sub_racct(olduip->ui_racct, p->p_racct);
 		racct_add_racct(newuip->ui_racct, p->p_racct);
 	}
 	if (newlc != oldlc) {
 		racct_sub_racct(oldlc->lc_racct, p->p_racct);
 		racct_add_racct(newlc->lc_racct, p->p_racct);
 	}
 	if (newpr != oldpr) {
 		/* Prisons are hierarchical: adjust every ancestor as well. */
 		for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
 			racct_sub_racct(pr->pr_prison_racct->prr_racct,
 			    p->p_racct);
 		for (pr = newpr; pr != NULL; pr = pr->pr_parent)
 			racct_add_racct(pr->pr_prison_racct->prr_racct,
 			    p->p_racct);
 	}
 	mtx_unlock(&racct_lock);
 
 #ifdef RCTL
 	rctl_proc_ucred_changed(p, newcred);
 #endif
 }
 
 /*
  * Add the usage recorded in 'src' to 'dest', then subtract 'src' from
  * itself, all under racct_lock.  racct_sub_racct() only touches resources
  * for which RACCT_CAN_DROP() holds, so other counters of 'src' keep their
  * values.
  */
 void
 racct_move(struct racct *dest, struct racct *src)
 {
 
 	mtx_lock(&racct_lock);
 
 	racct_add_racct(dest, src);
 	racct_sub_racct(src, src);
 
 	mtx_unlock(&racct_lock);
 }
 
 /*
  * Mark process 'p' as throttled and ask each of its runnable or running
  * threads to reschedule.  The process lock must be held.
  */
 static void
 racct_proc_throttle(struct proc *p)
 {
 	struct thread *td;
 #ifdef SMP
 	int cpuid;
 #endif
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * Do not block kernel processes.  Also do not block processes with
 	 * low %cpu utilization to improve interactivity.
 	 */
 	if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) ||
 	    (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
 		return;
 	p->p_throttled = 1;
 
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		switch (td->td_state) {
 		case TDS_RUNQ:
 			/*
 			 * If the thread is on the scheduler run-queue, we can
 			 * not just remove it from there.  So we set the flag
 			 * TDF_NEEDRESCHED for the thread, so that once it is
 			 * running, it is taken off the cpu as soon as possible.
 			 */
 			td->td_flags |= TDF_NEEDRESCHED;
 			break;
 		case TDS_RUNNING:
 			/*
 			 * If the thread is running, we request a context
 			 * switch for it by setting the TDF_NEEDRESCHED flag.
 			 */
 			td->td_flags |= TDF_NEEDRESCHED;
 #ifdef SMP
 			/*
 			 * A thread other than the current one that is on a
 			 * CPU is sent an IPI_AST on that CPU.
 			 */
 			cpuid = td->td_oncpu;
 			if ((cpuid != NOCPU) && (td != curthread))
 				ipi_cpu(cpuid, IPI_AST);
 #endif
 			break;
 		default:
 			/* Threads in any other state are left alone. */
 			break;
 		}
 		thread_unlock(td);
 	}
 }
 
 /*
  * Clear the throttled flag of process 'p', if it is set, and wake up
  * whatever sleeps on its racct.  The process lock must be held.
  */
 static void
 racct_proc_wakeup(struct proc *p)
 {
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* Nothing to do for a process that is not throttled. */
 	if (!p->p_throttled)
 		return;
 
 	p->p_throttled = 0;
 	wakeup(p->p_racct);
 }
 
 /*
  * Callback for the *_racct_foreach() iterators: scale one resource of
  * 'racct' by RACCT_DECAY_FACTOR / FSCALE.
  *
  * 'res' points to an int holding the resource index; 'dummy' is unused.
  * The counter is read and written under the same hold of racct_lock so that
  * an update made concurrently by another thread cannot be overwritten with
  * a value derived from a stale read.
  */
 static void
 racct_decay_resource(struct racct *racct, void * res, void* dummy)
 {
 	int resource;
 	int64_t r_old, r_new;
 
 	resource = *(int *)res;
 
 	mtx_lock(&racct_lock);
 	r_old = racct->r_resources[resource];
 
 	/* If there is nothing to decay, leave the counter untouched. */
 	if (r_old > 0) {
 		r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
 		racct->r_resources[resource] = r_new;
 	}
 	mtx_unlock(&racct_lock);
 }
 
 /*
  * Apply racct_decay_resource() for 'resource' to every racct visited by the
  * uidinfo, login class and prison iterators.
  */
 static void
 racct_decay(int resource)
 {
 	/* &resource is handed to the callback as its 'res' argument. */
 	ui_racct_foreach(racct_decay_resource, &resource, NULL);
 	loginclass_racct_foreach(racct_decay_resource, &resource, NULL);
 	prison_racct_foreach(racct_decay_resource, &resource, NULL);
 }
 
 /*
  * Main loop of the racctd kernel process (started through racctd_kp).
  * Each pass decays the %cpu resource, refreshes the %cpu, CPU time and
  * wallclock figures of every process in the PRS_NORMAL state, throttles or
  * wakes up processes according to racct_pcpu_available(), and then pauses
  * for 'hz' ticks.
  */
 static void
 racctd(void)
 {
 	struct thread *td;
 	struct proc *p;
 	struct timeval wallclock;
 	uint64_t runtime;
 	uint64_t pct, pct_estimate;
 
 	for (;;) {
 		racct_decay(RACCT_PCTCPU);
 
 		sx_slock(&allproc_lock);
 
 		/* Processes on the zombie list no longer use any cpu. */
 		LIST_FOREACH(p, &zombproc, p_list) {
 			PROC_LOCK(p);
 			racct_set(p, RACCT_PCTCPU, 0);
 			PROC_UNLOCK(p);
 		}
 
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_state != PRS_NORMAL) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 
 			/* Time elapsed since the process was started. */
 			microuptime(&wallclock);
 			timevalsub(&wallclock, &p->p_stats->p_start);
 			/* Run ruxagg() over each thread, then read p_rux. */
 			PROC_SLOCK(p);
 			FOREACH_THREAD_IN_PROC(p, td)
 				ruxagg(p, td);
 			runtime = cputick2usec(p->p_rux.rux_runtime);
 			PROC_SUNLOCK(p);
 #ifdef notyet
 			KASSERT(runtime >= p->p_prev_runtime,
 			    ("runtime < p_prev_runtime"));
 #else
 			/* Never let the reported runtime go backwards. */
 			if (runtime < p->p_prev_runtime)
 				runtime = p->p_prev_runtime;
 #endif
 			p->p_prev_runtime = runtime;
 			/*
 			 * Estimate: CPU time over elapsed time, as percent
 			 * times 1000000.
 			 * NOTE(review): 1000000 * runtime * 100 is evaluated
 			 * in 64-bit unsigned arithmetic and wraps for very
 			 * large runtimes -- confirm whether that can matter.
 			 */
 			if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
 				pct_estimate = (1000000 * runtime * 100) /
 				    ((uint64_t)wallclock.tv_sec * 1000000 +
 				    wallclock.tv_usec);
 			} else
 				pct_estimate = 0;
 			pct = racct_getpcpu(p, pct_estimate);
 			mtx_lock(&racct_lock);
 			/* %cpu is set unconditionally; the other two may fail. */
 			racct_set_force_locked(p, RACCT_PCTCPU, pct);
 			racct_set_locked(p, RACCT_CPU, runtime);
 			racct_set_locked(p, RACCT_WALLCLOCK,
 			    (uint64_t)wallclock.tv_sec * 1000000 +
 			    wallclock.tv_usec);
 			mtx_unlock(&racct_lock);
 			PROC_UNLOCK(p);
 		}
 
 		/*
 		 * To ensure that processes are throttled in a fair way, we need
 		 * to iterate over all processes again and check the limits
 		 * for %cpu resource only after ucred racct containers have been
 		 * properly filled.
 		 */
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_state != PRS_NORMAL) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 
 			if (racct_pcpu_available(p) <= 0)
 				racct_proc_throttle(p);
 			else if (p->p_throttled)
 				racct_proc_wakeup(p);
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		/* Sleep for 'hz' ticks before the next pass. */
 		pause("-", hz);
 	}
 }
 
 /*
  * Descriptor handed to kproc_start() by the SYSINIT below: a kernel process
  * named "racctd" whose entry point is racctd().
  */
 static struct kproc_desc racctd_kp = {
 	"racctd",
 	racctd,
 	NULL
 };
 SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp);
 
 /*
  * Boot-time initialization, registered via SYSINIT at SI_SUB_RACCT: create
  * the UMA zone that racct structures are allocated from and look up the
  * prison racct named "0" for prison0.
  */
 static void
 racct_init(void)
 {
 
 	racct_zone = uma_zcreate("racct", sizeof(struct racct),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	/*
 	 * XXX: Move this somewhere.
 	 */
 	prison0.pr_prison_racct = prison_racct_find("0");
 }
 SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
 
 #else /* !RACCT */
 
 /*
  * Stub implementations used when the kernel is built without RACCT.  They
  * keep the public racct_*() interface available to callers: nothing is
  * accounted, operations that return a status report success (0), and the
  * limit queries return UINT64_MAX.
  */
 
 /* Always succeeds; nothing is charged. */
 int
 racct_add(struct proc *p, int resource, uint64_t amount)
 {
 
 	return (0);
 }
 
 void
 racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
 {
 }
 
 void
 racct_add_force(struct proc *p, int resource, uint64_t amount)
 {
 
 	return;
 }
 
 /* Always succeeds; nothing is recorded. */
 int
 racct_set(struct proc *p, int resource, uint64_t amount)
 {
 
 	return (0);
 }
 
 void
 racct_set_force(struct proc *p, int resource, uint64_t amount)
 {
 }
 
 void
 racct_sub(struct proc *p, int resource, uint64_t amount)
 {
 }
 
 void
 racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
 {
 }
 
 /* Reports the largest possible value. */
 uint64_t
 racct_get_limit(struct proc *p, int resource)
 {
 
 	return (UINT64_MAX);
 }
 
 /* Reports the largest possible value. */
 uint64_t
 racct_get_available(struct proc *p, int resource)
 {
 
 	return (UINT64_MAX);
 }
 
 /* No racct is allocated; '*racctp' is left untouched. */
 void
 racct_create(struct racct **racctp)
 {
 }
 
 void
 racct_destroy(struct racct **racctp)
 {
 }
 
 /* Always succeeds; the child inherits nothing. */
 int
 racct_proc_fork(struct proc *parent, struct proc *child)
 {
 
 	return (0);
 }
 
 void
 racct_proc_fork_done(struct proc *child)
 {
 }
 
 void
 racct_proc_exit(struct proc *p)
 {
 }
 
 #endif /* !RACCT */
Index: head/sys/kern/kern_rmlock.c
===================================================================
--- head/sys/kern/kern_rmlock.c	(revision 258540)
+++ head/sys/kern/kern_rmlock.c	(revision 258541)
@@ -1,858 +1,857 @@
 /*-
  * Copyright (c) 2007 Stephan Uphoff <ups@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Machine independent bits of reader/writer lock implementation.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/kernel.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rmlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/turnstile.h>
 #include <sys/lock_profile.h>
 #include <machine/cpu.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 /*
  * A cookie to mark destroyed rmlocks.  This is stored in the head of
  * rm_activeReaders.
  */
 #define	RM_DESTROYED	((void *)0xdead)
 
 #define	rm_destroyed(rm)						\
 	(LIST_FIRST(&(rm)->rm_activeReaders) == RM_DESTROYED)
 
 #define RMPF_ONQUEUE	1
 #define RMPF_SIGNAL	2
 
 #ifndef INVARIANTS
 #define	_rm_assert(c, what, file, line)
 #endif
 
 static void	assert_rm(const struct lock_object *lock, int what);
 #ifdef DDB
 static void	db_show_rm(const struct lock_object *lock);
 #endif
 static void	lock_rm(struct lock_object *lock, uintptr_t how);
 #ifdef KDTRACE_HOOKS
 static int	owner_rm(const struct lock_object *lock, struct thread **owner);
 #endif
 static uintptr_t unlock_rm(struct lock_object *lock);
 
 /*
  * Lock class used by rm_init_flags() for rmlocks created without
  * RM_SLEEPABLE.  The DDB and KDTRACE_HOOKS methods are only present when
  * the corresponding option is compiled in.
  */
 struct lock_class lock_class_rm = {
 	.lc_name = "rm",
 	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
 	.lc_assert = assert_rm,
 #ifdef DDB
 	.lc_ddb_show = db_show_rm,
 #endif
 	.lc_lock = lock_rm,
 	.lc_unlock = unlock_rm,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_rm,
 #endif
 };
 
 /*
  * Lock class used by rm_init_flags() for rmlocks created with RM_SLEEPABLE.
  * It shares its methods with lock_class_rm and differs only in its name and
  * the additional LC_SLEEPABLE flag.
  */
 struct lock_class lock_class_rm_sleepable = {
 	.lc_name = "sleepable rm",
 	.lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE,
 	.lc_assert = assert_rm,
 #ifdef DDB
 	.lc_ddb_show = db_show_rm,
 #endif
 	.lc_lock = lock_rm,
 	.lc_unlock = unlock_rm,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_rm,
 #endif
 };
 
 /*
  * lc_assert method of the rm lock classes: forwards to rm_assert() after
  * casting the generic lock object to an rmlock.
  */
 static void
 assert_rm(const struct lock_object *lock, int what)
 {
 
 	rm_assert((const struct rmlock *)lock, what);
 }
 
 /*
  * lc_lock method of the rm lock classes.  A 'how' of zero requests the
  * write lock; any other value is taken to be a pointer to the
  * rm_priotracker to use for a read lock.
  */
 static void
 lock_rm(struct lock_object *lock, uintptr_t how)
 {
 	struct rmlock *rm = (struct rmlock *)lock;
 	struct rm_priotracker *pt;
 
 	if (how != 0) {
 		pt = (struct rm_priotracker *)how;
 		rm_rlock(rm, pt);
 		return;
 	}
 	rm_wlock(rm);
 }
 
 /*
  * lc_unlock method of the rm lock classes: release 'lock' in whichever mode
  * the current thread holds it.
  *
  * Returns 0 when the write lock was released, or the address of the
  * thread's rm_priotracker (as a uintptr_t) when a read lock was released;
  * lock_rm() accepts the same encoding in its 'how' argument.
  */
 static uintptr_t
 unlock_rm(struct lock_object *lock)
 {
 	struct thread *td;
 	struct pcpu *pc;
 	struct rmlock *rm;
 	struct rm_queue *queue;
 	struct rm_priotracker *tracker;
 	uintptr_t how;
 
 	rm = (struct rmlock *)lock;
 	tracker = NULL;
 	how = 0;
 	rm_assert(rm, RA_LOCKED | RA_NOTRECURSED);
 	if (rm_wowned(rm))
 		rm_wunlock(rm);
 	else {
 		/*
 		 * Find the right rm_priotracker structure for curthread.
 		 * The guarantee about its uniqueness is given by the fact
 		 * we already asserted the lock wasn't recursively acquired.
 		 */
 		critical_enter();
 		td = curthread;
 		pc = pcpu_find(curcpu);
 		for (queue = pc->pc_rm_queue.rmq_next;
 		    queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
 			tracker = (struct rm_priotracker *)queue;
 			if ((tracker->rmp_rmlock == rm) &&
 			    (tracker->rmp_thread == td)) {
 				how = (uintptr_t)tracker;
 				break;
 			}
 		}
 		/*
 		 * 'tracker' is overwritten on every iteration, so it is
 		 * non-NULL whenever the queue is non-empty, even if nothing
 		 * matched.  'how' is only set on a match, so assert on that.
 		 */
 		KASSERT(how != 0,
 		    ("rm_priotracker not found when lock held in read mode"));
 		critical_exit();
 		rm_runlock(rm, tracker);
 	}
 	return (how);
 }
 
 #ifdef KDTRACE_HOOKS
 /*
  * lc_owner method of the rm lock classes (KDTRACE_HOOKS only): delegates to
  * the lc_owner method of the lock class of rm_wlock_object and returns its
  * result.
  */
 static int
 owner_rm(const struct lock_object *lock, struct thread **owner)
 {
 	const struct rmlock *rm;
 	struct lock_class *lc;
 
 	rm = (const struct rmlock *)lock;
 	lc = LOCK_CLASS(&rm->rm_wlock_object);
 	return (lc->lc_owner(&rm->rm_wlock_object, owner));
 }
 #endif
 
 static struct mtx rm_spinlock;
 
 MTX_SYSINIT(rm_spinlock, &rm_spinlock, "rm_spinlock", MTX_SPIN);
 
 /*
  * Add or remove tracker from per-cpu list.
  *
  * The per-cpu list can be traversed at any time in forward direction from an
  * interrupt on the *local* cpu.
  */
 static void inline
 rm_tracker_add(struct pcpu *pc, struct rm_priotracker *tracker)
 {
 	struct rm_queue *next;
 
 	/*
 	 * Initialize all tracker pointers.  The tracker's own links are set
 	 * up before the list head is pointed at it at the end of this
 	 * function.
 	 */
 	tracker->rmp_cpuQueue.rmq_prev = &pc->pc_rm_queue;
 	next = pc->pc_rm_queue.rmq_next;
 	tracker->rmp_cpuQueue.rmq_next = next;
 
 	/* rmq_prev is not used during forward traversal. */
 	next->rmq_prev = &tracker->rmp_cpuQueue;
 
 	/* Update pointer to first element. */
 	pc->pc_rm_queue.rmq_next = &tracker->rmp_cpuQueue;
 }
 
 /*
  * Return a count of the number of trackers the thread 'td' already
  * has on this CPU for the lock 'rm'.
  */
 static int
 rm_trackers_present(const struct pcpu *pc, const struct rmlock *rm,
     const struct thread *td)
 {
 	struct rm_queue *cur;
 	struct rm_priotracker *pt;
 	int matches;
 
 	matches = 0;
 
 	/* The per-CPU queue is circular; stop when back at the head. */
 	cur = pc->pc_rm_queue.rmq_next;
 	while (cur != &pc->pc_rm_queue) {
 		pt = (struct rm_priotracker *)cur;
 		if (pt->rmp_rmlock == rm && pt->rmp_thread == td)
 			matches++;
 		cur = cur->rmq_next;
 	}
 	return (matches);
 }
 
 /*
  * Unlink 'tracker' from the per-cpu queue it is on.  'pc' is not referenced
  * by the body; only the tracker's own neighbour pointers are used.
  */
 static void inline
 rm_tracker_remove(struct pcpu *pc, struct rm_priotracker *tracker)
 {
 	struct rm_queue *next, *prev;
 
 	next = tracker->rmp_cpuQueue.rmq_next;
 	prev = tracker->rmp_cpuQueue.rmq_prev;
 
 	/* Not used during forward traversal. */
 	next->rmq_prev = prev;
 
 	/* Remove from list. */
 	prev->rmq_next = next;
 }
 
 /*
  * Per-CPU action run on behalf of a writer (see _rm_wlock()); 'arg' is
  * the rmlock being write-locked.
  *
  * Walks the current CPU's tracker queue.  Every tracker that belongs to
  * 'rm' and still has rmp_flags == 0 is flagged RMPF_ONQUEUE and inserted
  * at the head of rm->rm_activeReaders, with rm_spinlock held around the
  * list insertion.
  */
 static void
 rm_cleanIPI(void *arg)
 {
 	struct pcpu *pc;
 	struct rmlock *rm = arg;
 	struct rm_priotracker *tracker;
 	struct rm_queue *queue;
 	pc = pcpu_find(curcpu);
 
 	for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
 	    queue = queue->rmq_next) {
 		tracker = (struct rm_priotracker *)queue;
 		/* Trackers with a non-zero flag word are skipped. */
 		if (tracker->rmp_rmlock == rm && tracker->rmp_flags == 0) {
 			tracker->rmp_flags = RMPF_ONQUEUE;
 			mtx_lock_spin(&rm_spinlock);
 			LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
 			    rmp_qentry);
 			mtx_unlock_spin(&rm_spinlock);
 		}
 	}
 }
 
 /*
  * Initialize the rmlock 'rm' under 'name'.
  *
  * 'opts' is a mask of RM_* options: RM_NOWITNESS suppresses LO_WITNESS,
  * RM_RECURSE adds LO_RECURSABLE, and RM_SLEEPABLE selects the sleepable
  * lock class with an sx lock as the backing write lock instead of a
  * mutex.  The lock starts with every CPU in rm_writecpus and an empty
  * active-readers list.
  */
 void
 rm_init_flags(struct rmlock *rm, const char *name, int opts)
 {
 	struct lock_class *lc;
 	int liflags;
 
 	liflags = (opts & RM_NOWITNESS) ? 0 : LO_WITNESS;
 	if ((opts & RM_RECURSE) != 0)
 		liflags |= LO_RECURSABLE;
 
 	rm->rm_writecpus = all_cpus;
 	LIST_INIT(&rm->rm_activeReaders);
 
 	if ((opts & RM_SLEEPABLE) == 0) {
 		lc = &lock_class_rm;
 		mtx_init(&rm->rm_lock_mtx, name, "rmlock_mtx", MTX_NOWITNESS);
 	} else {
 		liflags |= LO_SLEEPABLE;
 		lc = &lock_class_rm_sleepable;
 		sx_init_flags(&rm->rm_lock_sx, "rmlock_sx", SX_NOWITNESS);
 	}
 	lock_init(&rm->lock_object, lc, name, NULL, liflags);
 }
 
 /* Initialize 'rm' under 'name' with no options (opts == 0). */
 void
 rm_init(struct rmlock *rm, const char *name)
 {
 
 	rm_init_flags(rm, name, 0);
 }
 
 /*
  * Tear down 'rm'.  The lock is asserted to be unlocked, the head of the
  * active-readers list is overwritten with the RM_DESTROYED marker, the
  * backing write lock (sx when LO_SLEEPABLE is set, mutex otherwise) is
  * destroyed, and finally the lock_object itself.
  */
 void
 rm_destroy(struct rmlock *rm)
 {
 
 	rm_assert(rm, RA_UNLOCKED);
 	/* NOTE(review): presumably what rm_destroyed() tests - confirm. */
 	LIST_FIRST(&rm->rm_activeReaders) = RM_DESTROYED;
 	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
 		sx_destroy(&rm->rm_lock_sx);
 	else
 		mtx_destroy(&rm->rm_lock_mtx);
 	lock_destroy(&rm->lock_object);
 }
 
 /*
  * Report whether the backing write lock of 'rm' is held by the caller,
  * asking the sx lock for sleepable rmlocks and the mutex otherwise.
  */
 int
 rm_wowned(const struct rmlock *rm)
 {
 
 	if ((rm->lock_object.lo_flags & LO_SLEEPABLE) == 0)
 		return (mtx_owned(&rm->rm_lock_mtx));
 	return (sx_xlocked(&rm->rm_lock_sx));
 }
 
 /*
  * Initialization callback taking a 'struct rm_args *' as 'arg':
  * initializes args->ra_rm with the description args->ra_desc.
  */
 void
 rm_sysinit(void *arg)
 {
 	struct rm_args *args = arg;
 
 	rm_init(args->ra_rm, args->ra_desc);
 }
 
 /*
  * Initialization callback taking a 'struct rm_args_flags *' as 'arg':
  * like rm_sysinit() but also passes args->ra_opts to rm_init_flags().
  */
 void
 rm_sysinit_flags(void *arg)
 {
 	struct rm_args_flags *args = arg;
 
 	rm_init_flags(args->ra_rm, args->ra_desc, args->ra_opts);
 }
 
 /*
  * Slow path of _rm_rlock().  Called after _rm_rlock() has already put
  * 'tracker' on the current CPU's queue and pinned the thread.
  *
  * Returns 1 once the read lock is held.  Returns 0 only when 'trylock'
  * is non-zero and the backing write lock could not be taken without
  * blocking; in that case the tracker has been removed from the per-cpu
  * queue and the thread has been unpinned.
  */
 static int
 _rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
 {
 	struct pcpu *pc;
 
 	critical_enter();
 	pc = pcpu_find(curcpu);
 
 	/* Check if we just need to do a proper critical_exit. */
 	if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) {
 		critical_exit();
 		return (1);
 	}
 
 	/* Remove our tracker from the per-cpu list. */
 	rm_tracker_remove(pc, tracker);
 
 	/* Check to see if the IPI granted us the lock after all. */
 	if (tracker->rmp_flags) {
 		/* Just add back tracker - we hold the lock. */
 		rm_tracker_add(pc, tracker);
 		critical_exit();
 		return (1);
 	}
 
 	/*
 	 * We allow readers to acquire a lock even if a writer is blocked if
 	 * the lock is recursive and the reader already holds the lock.
 	 */
 	if ((rm->lock_object.lo_flags & LO_RECURSABLE) != 0) {
 		/*
 		 * Just grant the lock if this thread already has a tracker
 		 * for this lock on the per-cpu queue.
 		 */
 		if (rm_trackers_present(pc, rm, curthread) != 0) {
 			mtx_lock_spin(&rm_spinlock);
 			LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
 			    rmp_qentry);
 			tracker->rmp_flags = RMPF_ONQUEUE;
 			mtx_unlock_spin(&rm_spinlock);
 			rm_tracker_add(pc, tracker);
 			critical_exit();
 			return (1);
 		}
 	}
 
 	/*
 	 * Drop the pin taken in _rm_rlock() before acquiring the backing
 	 * write lock below.
 	 */
 	sched_unpin();
 	critical_exit();
 
 	if (trylock) {
 		if (rm->lock_object.lo_flags & LO_SLEEPABLE) {
 			if (!sx_try_xlock(&rm->rm_lock_sx))
 				return (0);
 		} else {
 			if (!mtx_trylock(&rm->rm_lock_mtx))
 				return (0);
 		}
 	} else {
 		if (rm->lock_object.lo_flags & LO_SLEEPABLE)
 			sx_xlock(&rm->rm_lock_sx);
 		else
 			mtx_lock(&rm->rm_lock_mtx);
 	}
 
 	/*
 	 * With the backing lock held, clear the bit of the CPU we are on
 	 * now in rm_writecpus, queue the tracker there and pin again.
 	 */
 	critical_enter();
 	pc = pcpu_find(curcpu);
 	CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus);
 	rm_tracker_add(pc, tracker);
 	sched_pin();
 	critical_exit();
 
 	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
 		sx_xunlock(&rm->rm_lock_sx);
 	else
 		mtx_unlock(&rm->rm_lock_mtx);
 
 	return (1);
 }
 
 /*
  * Acquire 'rm' for reading, recording the acquisition in the
  * caller-supplied 'tracker'.
  *
  * The tracker is filled in, queued on the current CPU's list and the
  * thread is pinned.  If no preemption is owed and this CPU's bit is
  * clear in rm->rm_writecpus the function returns 1 immediately;
  * otherwise the result of _rm_rlock_hard() is returned (0 is possible
  * only when 'trylock' is non-zero).  Returns 1 without doing anything
  * when SCHEDULER_STOPPED() is true.
  */
 int
 _rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
 {
 	struct thread *td = curthread;
 	struct pcpu *pc;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	tracker->rmp_flags  = 0;
 	tracker->rmp_thread = td;
 	tracker->rmp_rmlock = rm;
 
 	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
 		THREAD_NO_SLEEPING();
 
 	td->td_critnest++;	/* critical_enter(); */
 
 	/* Keep the compiler from moving the queue update out of the section. */
 	__compiler_membar();
 
 	pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */
 
 	rm_tracker_add(pc, tracker);
 
 	sched_pin();
 
 	__compiler_membar();
 
 	/*
 	 * Open-coded exit from the critical section; an owed preemption
 	 * (td_owepreempt) is picked up by the check below instead.
 	 */
 	td->td_critnest--;
 
 	/*
 	 * Fast path to combine two common conditions into a single
 	 * conditional jump.
 	 */
 	if (0 == (td->td_owepreempt |
 	    CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)))
 		return (1);
 
 	/* We do not have a read token and need to acquire one. */
 	return _rm_rlock_hard(rm, tracker, trylock);
 }
 
 /*
  * Slow path of _rm_runlock(), taken when a preemption is owed or the
  * tracker's flag word is non-zero.
  *
  * A tracker with flags set is removed from its rmlock's active-readers
  * list under rm_spinlock.  If RMPF_SIGNAL is also set (it is set by
  * _rm_wlock() before the writer blocks), the exclusive-queue waiter on
  * the lock's turnstile is signalled and released.
  */
 static void
 _rm_unlock_hard(struct thread *td,struct rm_priotracker *tracker)
 {
 
 	/*
 	 * NOTE(review): re-entering and leaving a critical section here
 	 * presumably lets critical_exit() act on the owed preemption that
 	 * the open-coded exit in _rm_runlock() skipped - confirm.
 	 */
 	if (td->td_owepreempt) {
 		td->td_critnest++;
 		critical_exit();
 	}
 
 	if (!tracker->rmp_flags)
 		return;
 
 	mtx_lock_spin(&rm_spinlock);
 	LIST_REMOVE(tracker, rmp_qentry);
 
 	if (tracker->rmp_flags & RMPF_SIGNAL) {
 		struct rmlock *rm;
 		struct turnstile *ts;
 
 		rm = tracker->rmp_rmlock;
 
 		/* Take the turnstile chain lock before dropping rm_spinlock. */
 		turnstile_chain_lock(&rm->lock_object);
 		mtx_unlock_spin(&rm_spinlock);
 
 		ts = turnstile_lookup(&rm->lock_object);
 
 		turnstile_signal(ts, TS_EXCLUSIVE_QUEUE);
 		turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
 		turnstile_chain_unlock(&rm->lock_object);
 	} else
 		mtx_unlock_spin(&rm_spinlock);
 }
 
 /*
  * Release a read lock on 'rm' that was taken with 'tracker'.
  *
  * The tracker is unlinked from the per-cpu queue inside an open-coded
  * critical section and the thread is unpinned.  The thread used is the
  * one recorded in tracker->rmp_thread.  _rm_unlock_hard() is called
  * only when a preemption is owed or the tracker's flag word is set.
  * Does nothing when SCHEDULER_STOPPED() is true.
  */
 void
 _rm_runlock(struct rmlock *rm, struct rm_priotracker *tracker)
 {
 	struct pcpu *pc;
 	struct thread *td = tracker->rmp_thread;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	td->td_critnest++;	/* critical_enter(); */
 	pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */
 	rm_tracker_remove(pc, tracker);
 	td->td_critnest--;
 	sched_unpin();
 
 	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
 		THREAD_SLEEPING_OK();
 
 	/* Fast path: nothing owed and the tracker was never flagged. */
 	if (0 == (td->td_owepreempt | tracker->rmp_flags))
 		return;
 
 	_rm_unlock_hard(td, tracker);
 }
 
 /*
  * Acquire 'rm' for writing.
  *
  * Takes the backing write lock (sx for sleepable rmlocks, mutex
  * otherwise).  If rm_writecpus is not already equal to all_cpus, it is
  * reset to all_cpus, rm_cleanIPI() is run on the CPUs selected in
  * 'readcpus' (or locally on non-SMP kernels), and the writer then
  * blocks on the lock's turnstile once per tracker found on
  * rm_activeReaders until that list is empty.  Does nothing when
  * SCHEDULER_STOPPED() is true.
  */
 void
 _rm_wlock(struct rmlock *rm)
 {
 	struct rm_priotracker *prio;
 	struct turnstile *ts;
 	cpuset_t readcpus;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
 		sx_xlock(&rm->rm_lock_sx);
 	else
 		mtx_lock(&rm->rm_lock_mtx);
 
 	/* NOTE(review): CPU_CMP presumably is non-zero when the sets differ. */
 	if (CPU_CMP(&rm->rm_writecpus, &all_cpus)) {
 		/* Get all read tokens back */
 		readcpus = all_cpus;
 		/*
 		 * NOTE(review): CPU_NAND presumably clears from readcpus the
 		 * CPUs present in rm_writecpus - confirm.
 		 */
 		CPU_NAND(&readcpus, &rm->rm_writecpus);
 		rm->rm_writecpus = all_cpus;
 
 		/*
 		 * Assumes rm->rm_writecpus update is visible on other CPUs
 		 * before rm_cleanIPI is called.
 		 */
 #ifdef SMP
 		smp_rendezvous_cpus(readcpus,
 		    smp_no_rendevous_barrier,
 		    rm_cleanIPI,
 		    smp_no_rendevous_barrier,
 		    rm);
 
 #else
 		rm_cleanIPI(rm);
 #endif
 
 		/*
 		 * Wait for each reader that rm_cleanIPI() (or the recursion
 		 * path of _rm_rlock_hard()) put on the active list.  The
 		 * reader is flagged RMPF_SIGNAL so that _rm_unlock_hard()
 		 * signals the turnstile when it drops the lock.
 		 */
 		mtx_lock_spin(&rm_spinlock);
 		while ((prio = LIST_FIRST(&rm->rm_activeReaders)) != NULL) {
 			ts = turnstile_trywait(&rm->lock_object);
 			prio->rmp_flags = RMPF_ONQUEUE | RMPF_SIGNAL;
 			mtx_unlock_spin(&rm_spinlock);
 			turnstile_wait(ts, prio->rmp_thread,
 			    TS_EXCLUSIVE_QUEUE);
 			mtx_lock_spin(&rm_spinlock);
 		}
 		mtx_unlock_spin(&rm_spinlock);
 	}
 }
 
 /*
  * Drop the write lock on 'rm' by releasing its backing lock: the mutex
  * for ordinary rmlocks, the sx lock for sleepable ones.
  */
 void
 _rm_wunlock(struct rmlock *rm)
 {
 
 	if ((rm->lock_object.lo_flags & LO_SLEEPABLE) == 0)
 		mtx_unlock(&rm->rm_lock_mtx);
 	else
 		sx_xunlock(&rm->rm_lock_sx);
 }
 
 #ifdef LOCK_DEBUG
 
 /*
  * LOCK_DEBUG variant of the write-lock entry point.  'file' and 'line'
  * identify the call site for assertion messages, witness and lock
  * logging.  Asserts that the caller is not the idle thread (unless kdb
  * is active), that the lock is not destroyed and that it is currently
  * unlocked by the caller, then acquires it via _rm_wlock() and bumps
  * curthread->td_locks.
  */
 void
 _rm_wlock_debug(struct rmlock *rm, const char *file, int line)
 {
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("rm_wlock() by idle thread %p on rmlock %s @ %s:%d",
 	    curthread, rm->lock_object.lo_name, file, line));
 	KASSERT(!rm_destroyed(rm),
 	    ("rm_wlock() of destroyed rmlock @ %s:%d", file, line));
 	_rm_assert(rm, RA_UNLOCKED, file, line);
 
 	WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE,
 	    file, line, NULL);
 
 	_rm_wlock(rm);
 
 	LOCK_LOG_LOCK("RMWLOCK", &rm->lock_object, 0, 0, file, line);
 
 	WITNESS_LOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
 
 	curthread->td_locks++;
 
 }
 
 /*
  * LOCK_DEBUG variant of the write-unlock entry point.  Asserts that the
  * lock is not destroyed and is write-locked, informs witness and the
  * lock log, releases the lock via _rm_wunlock() and decrements
  * curthread->td_locks.
  */
 void
 _rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
 {
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	KASSERT(!rm_destroyed(rm),
 	    ("rm_wunlock() of destroyed rmlock @ %s:%d", file, line));
 	_rm_assert(rm, RA_WLOCKED, file, line);
 	WITNESS_UNLOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
 	LOCK_LOG_LOCK("RMWUNLOCK", &rm->lock_object, 0, 0, file, line);
 	_rm_wunlock(rm);
 	curthread->td_locks--;
 }
 
 /*
  * LOCK_DEBUG variant of the read-lock entry point.
  *
  * Returns 1 when the read lock was obtained (or the scheduler is
  * stopped) and 0 when it was not; 0 is only expected with 'trylock'
  * set.  On success witness is told about the lock and
  * curthread->td_locks is incremented.  With INVARIANTS, a blocking
  * acquire of a non-recursable lock asserts that the thread has no
  * tracker for it on this CPU yet.
  */
 int
 _rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
     int trylock, const char *file, int line)
 {
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 #ifdef INVARIANTS
 	if (!(rm->lock_object.lo_flags & LO_RECURSABLE) && !trylock) {
 		critical_enter();
 		KASSERT(rm_trackers_present(pcpu_find(curcpu), rm,
 		    curthread) == 0,
 		    ("rm_rlock: recursed on non-recursive rmlock %s @ %s:%d\n",
 		    rm->lock_object.lo_name, file, line));
 		critical_exit();
 	}
 #endif
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("rm_rlock() by idle thread %p on rmlock %s @ %s:%d",
 	    curthread, rm->lock_object.lo_name, file, line));
 	KASSERT(!rm_destroyed(rm),
 	    ("rm_rlock() of destroyed rmlock @ %s:%d", file, line));
 	/* The write-owner and lock-order checks apply to blocking acquires only. */
 	if (!trylock) {
 		KASSERT(!rm_wowned(rm),
 		    ("rm_rlock: wlock already held for %s @ %s:%d",
 		    rm->lock_object.lo_name, file, line));
 		WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER, file, line,
 		    NULL);
 	}
 
 	if (_rm_rlock(rm, tracker, trylock)) {
 		if (trylock)
 			LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 1, file,
 			    line);
 		else
 			LOCK_LOG_LOCK("RMRLOCK", &rm->lock_object, 0, 0, file,
 			    line);
 		WITNESS_LOCK(&rm->lock_object, 0, file, line);
 
 		curthread->td_locks++;
 
 		return (1);
 	} else if (trylock)
 		LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 0, file, line);
 
 	return (0);
 }
 
 /*
  * LOCK_DEBUG variant of the read-unlock entry point.  Asserts that the
  * lock is not destroyed and is read-locked by the caller, informs
  * witness and the lock log, releases the lock via _rm_runlock() and
  * decrements curthread->td_locks.
  */
 void
 _rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
     const char *file, int line)
 {
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	KASSERT(!rm_destroyed(rm),
 	    ("rm_runlock() of destroyed rmlock @ %s:%d", file, line));
 	_rm_assert(rm, RA_RLOCKED, file, line);
 	WITNESS_UNLOCK(&rm->lock_object, 0, file, line);
 	LOCK_LOG_LOCK("RMRUNLOCK", &rm->lock_object, 0, 0, file, line);
 	_rm_runlock(rm, tracker);
 	curthread->td_locks--;
 }
 
 #else
 
 /*
  * Just strip out file and line arguments if no lock debugging is enabled in
  * the kernel - we are called from a kernel module.
  *
  * 'file' and 'line' are accepted and ignored.
  */
 void
 _rm_wlock_debug(struct rmlock *rm, const char *file, int line)
 {
 
 	_rm_wlock(rm);
 }
 
 /* Non-debug build: forward to _rm_wunlock(), ignoring 'file' and 'line'. */
 void
 _rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
 {
 
 	_rm_wunlock(rm);
 }
 
 /*
  * Non-debug build: forward to _rm_rlock() and return its result,
  * ignoring 'file' and 'line'.
  */
 int
 _rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
     int trylock, const char *file, int line)
 {
 
 	return _rm_rlock(rm, tracker, trylock);
 }
 
 /* Non-debug build: forward to _rm_runlock(), ignoring 'file' and 'line'. */
 void
 _rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
     const char *file, int line)
 {
 
 	_rm_runlock(rm, tracker);
 }
 
 #endif
 
 #ifdef INVARIANT_SUPPORT
 #ifndef INVARIANTS
 #undef _rm_assert
 #endif
 
 /*
  * Note that this does not need to use witness_assert() for read lock
  * assertions since an exact count of read locks held by this thread
  * is computable.
  *
  * Panics if 'rm' is not in the state described by 'what' (an RA_*
  * value) with respect to the current thread; 'file' and 'line' name
  * the asserting call site in the panic message.  Read holds are
  * counted from the trackers on the current CPU's queue only.  The
  * function returns without checking anything once panicstr is set.
  */
 void
 _rm_assert(const struct rmlock *rm, int what, const char *file, int line)
 {
 	int count;
 
 	if (panicstr != NULL)
 		return;
 	switch (what) {
 	case RA_LOCKED:
 	case RA_LOCKED | RA_RECURSED:
 	case RA_LOCKED | RA_NOTRECURSED:
 	case RA_RLOCKED:
 	case RA_RLOCKED | RA_RECURSED:
 	case RA_RLOCKED | RA_NOTRECURSED:
 		/*
 		 * Handle the write-locked case.  Unlike other
 		 * primitives, writers can never recurse.
 		 */
 		if (rm_wowned(rm)) {
 			if (what & RA_RLOCKED)
 				panic("Lock %s exclusively locked @ %s:%d\n",
 				    rm->lock_object.lo_name, file, line);
 			if (what & RA_RECURSED)
 				panic("Lock %s not recursed @ %s:%d\n",
 				    rm->lock_object.lo_name, file, line);
 			break;
 		}
 
 		/* Count this thread's read holds on the current CPU. */
 		critical_enter();
 		count = rm_trackers_present(pcpu_find(curcpu), rm, curthread);
 		critical_exit();
 
 		if (count == 0)
 			panic("Lock %s not %slocked @ %s:%d\n",
 			    rm->lock_object.lo_name, (what & RA_RLOCKED) ?
 			    "read " : "", file, line);
 		if (count > 1) {
 			if (what & RA_NOTRECURSED)
 				panic("Lock %s recursed @ %s:%d\n",
 				    rm->lock_object.lo_name, file, line);
 		} else if (what & RA_RECURSED)
 			panic("Lock %s not recursed @ %s:%d\n",
 			    rm->lock_object.lo_name, file, line);
 		break;
 	case RA_WLOCKED:
 		if (!rm_wowned(rm))
 			panic("Lock %s not exclusively locked @ %s:%d\n",
 			    rm->lock_object.lo_name, file, line);
 		break;
 	case RA_UNLOCKED:
 		/* Neither write-owned nor read-held by the current thread. */
 		if (rm_wowned(rm))
 			panic("Lock %s exclusively locked @ %s:%d\n",
 			    rm->lock_object.lo_name, file, line);
 
 		critical_enter();
 		count = rm_trackers_present(pcpu_find(curcpu), rm, curthread);
 		critical_exit();
 
 		if (count != 0)
 			panic("Lock %s read locked @ %s:%d\n",
 			    rm->lock_object.lo_name, file, line);
 		break;
 	default:
 		panic("Unknown rm lock assertion: %d @ %s:%d", what, file,
 		    line);
 	}
 }
 #endif /* INVARIANT_SUPPORT */
 
 #ifdef DDB
 /*
  * DDB helper: print one line for tracker 'tr' naming the thread that
  * owns it, followed by the tracker's flags in braces ("0" when
  * RMPF_ONQUEUE is not set).
  */
 static void
 print_tracker(struct rm_priotracker *tr)
 {
 	struct thread *owner = tr->rmp_thread;
 
 	db_printf("   thread %p (tid %d, pid %d, \"%s\") {", owner,
 	    owner->td_tid, owner->td_proc->p_pid, owner->td_name);
 	if ((tr->rmp_flags & RMPF_ONQUEUE) == 0) {
 		db_printf("0");
 	} else {
 		db_printf("ONQUEUE");
 		if ((tr->rmp_flags & RMPF_SIGNAL) != 0)
 			db_printf(",SIGNAL");
 	}
 	db_printf("}\n");
 }
 
 /*
  * DDB "show lock" handler for rmlocks.  Prints the rm_writecpus set,
  * every tracker for this lock found on any CPU's per-cpu queue, every
  * tracker on the rm_activeReaders list, and finally delegates to the
  * lc_ddb_show method of the backing write lock's class.
  */
 static void
 db_show_rm(const struct lock_object *lock)
 {
 	struct rm_priotracker *tr;
 	struct rm_queue *queue;
 	const struct rmlock *rm;
 	struct lock_class *lc;
 	struct pcpu *pc;
 
 	rm = (const struct rmlock *)lock;
 	db_printf(" writecpus: ");
 	ddb_display_cpuset(__DEQUALIFY(const cpuset_t *, &rm->rm_writecpus));
 	db_printf("\n");
 	db_printf(" per-CPU readers:\n");
 	/* Visit the tracker queue of every CPU on the cpuhead list. */
 	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu)
 		for (queue = pc->pc_rm_queue.rmq_next;
 		    queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
 			tr = (struct rm_priotracker *)queue;
 			if (tr->rmp_rmlock == rm)
 				print_tracker(tr);
 		}
 	db_printf(" active readers:\n");
 	LIST_FOREACH(tr, &rm->rm_activeReaders, rmp_qentry)
 		print_tracker(tr);
 	lc = LOCK_CLASS(&rm->rm_wlock_object);
 	db_printf("Backing write-lock (%s):\n", lc->lc_name);
 	lc->lc_ddb_show(&rm->rm_wlock_object);
 }
 #endif
Index: head/sys/kern/kern_rwlock.c
===================================================================
--- head/sys/kern/kern_rwlock.c	(revision 258540)
+++ head/sys/kern/kern_rwlock.c	(revision 258541)
@@ -1,1232 +1,1229 @@
 /*-
  * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Machine independent bits of reader/writer lock implementation.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 #include "opt_no_adaptive_rwlocks.h"
 
 #include <sys/param.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/turnstile.h>
 
 #include <machine/cpu.h>
 
 #if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS)
 #define	ADAPTIVE_RWLOCKS
 #endif
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DECLARE( , , lock, failed);
 #endif
 
 /*
  * Return the rwlock address when the lock cookie address is provided.
  * This functionality assumes that struct rwlock* have a member named rw_lock.
  */
 #define	rwlock2rw(c)	(__containerof(c, struct rwlock, rw_lock))
 
 #ifdef ADAPTIVE_RWLOCKS
 static int rowner_retries = 10;
 static int rowner_loops = 10000;
 static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
     "rwlock debugging");
 SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
 SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
 #endif
 
 #ifdef DDB
 #include <ddb/ddb.h>
 
 static void	db_show_rwlock(const struct lock_object *lock);
 #endif
 static void	assert_rw(const struct lock_object *lock, int what);
 static void	lock_rw(struct lock_object *lock, uintptr_t how);
 #ifdef KDTRACE_HOOKS
 static int	owner_rw(const struct lock_object *lock, struct thread **owner);
 #endif
 static uintptr_t unlock_rw(struct lock_object *lock);
 
 /*
  * Lock class descriptor for rwlocks.  It wires the generic lock_object
  * operations to the assert_rw/lock_rw/unlock_rw adapters in this file;
  * the DDB show hook and the owner hook are only present when DDB and
  * KDTRACE_HOOKS, respectively, are defined.
  */
 struct lock_class lock_class_rw = {
 	.lc_name = "rw",
 	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE,
 	.lc_assert = assert_rw,
 #ifdef DDB
 	.lc_ddb_show = db_show_rwlock,
 #endif
 	.lc_lock = lock_rw,
 	.lc_unlock = unlock_rw,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_rw,
 #endif
 };
 
 /*
  * Return a pointer to the owning thread if the lock is write-locked or
  * NULL if the lock is unlocked or read-locked.
  */
 #define	rw_wowner(rw)							\
 	((rw)->rw_lock & RW_LOCK_READ ? NULL :				\
 	    (struct thread *)RW_OWNER((rw)->rw_lock))
 
 /*
  * Returns if a write owner is recursed.  Write ownership is not assured
  * here and should be previously checked.
  */
 #define	rw_recursed(rw)		((rw)->rw_recurse != 0)
 
 /*
  * Return true if curthread holds the lock.
  */
 #define	rw_wlocked(rw)		(rw_wowner((rw)) == curthread)
 
 /*
  * Return a pointer to the owning thread for this lock who should receive
  * any priority lent by threads that block on this lock.  Currently this
  * is identical to rw_wowner().
  */
 #define	rw_owner(rw)		rw_wowner(rw)
 
 #ifndef INVARIANTS
 #define	__rw_assert(c, what, file, line)
 #endif
 
 /* lc_assert adapter: apply rw_assert() with 'what' to the rwlock 'lock'. */
 void
 assert_rw(const struct lock_object *lock, int what)
 {
 
 	rw_assert((const struct rwlock *)lock, what);
 }
 
 /*
  * lc_lock adapter: acquire the rwlock 'lock', as a read lock when 'how'
  * is non-zero and as a write lock when it is zero.
  */
 void
 lock_rw(struct lock_object *lock, uintptr_t how)
 {
 	struct rwlock *rw = (struct rwlock *)lock;
 
 	if (how == 0)
 		rw_wlock(rw);
 	else
 		rw_rlock(rw);
 }
 
 /*
  * lc_unlock adapter: release the rwlock 'lock', which must be held and
  * not recursed.  Returns 1 if it was held as a read lock and 0 if it was
  * held as a write lock, i.e. the 'how' value lock_rw() needs to take it
  * back in the same mode.
  */
 uintptr_t
 unlock_rw(struct lock_object *lock)
 {
 	struct rwlock *rw = (struct rwlock *)lock;
 
 	rw_assert(rw, RA_LOCKED | LA_NOTRECURSED);
 	if ((rw->rw_lock & RW_LOCK_READ) == 0) {
 		rw_wunlock(rw);
 		return (0);
 	}
 	rw_runlock(rw);
 	return (1);
 }
 
 #ifdef KDTRACE_HOOKS
 /*
  * lc_owner adapter.  Stores the write owner of 'lock' (NULL when there
  * is none) in '*owner'.  The return value is non-zero when the lock is
  * held: for a lock word in read mode that means a non-zero reader
  * count, otherwise it means a non-NULL write owner.
  */
 int
 owner_rw(const struct lock_object *lock, struct thread **owner)
 {
 	const struct rwlock *rw;
 	uintptr_t snap;
 
 	rw = (const struct rwlock *)lock;
 	snap = rw->rw_lock;
 	*owner = rw_wowner(rw);
 	if ((snap & RW_LOCK_READ) != 0)
 		return (RW_READERS(snap) != 0);
 	return (*owner != NULL);
 }
 #endif
 
 /*
  * Initialize the rwlock whose rw_lock word is at 'c' under 'name'.
  *
  * 'opts' may contain only RW_DUPOK, RW_NOPROFILE, RW_NOWITNESS,
  * RW_QUIET and RW_RECURSE (checked with MPASS); each is translated to
  * the matching LO_* flag, with LO_WITNESS set unless RW_NOWITNESS is
  * given and LO_UPGRADABLE always set.  The lock starts out RW_UNLOCKED
  * with a recursion count of zero.
  */
 void
 _rw_init_flags(volatile uintptr_t *c, const char *name, int opts)
 {
 	struct rwlock *rw;
 	int flags;
 
 	rw = rwlock2rw(c);
 
 	MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET |
 	    RW_RECURSE)) == 0);
 	ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock,
 	    ("%s: rw_lock not aligned for %s: %p", __func__, name,
 	    &rw->rw_lock));
 
 	flags = LO_UPGRADABLE;
 	if (opts & RW_DUPOK)
 		flags |= LO_DUPOK;
 	if (opts & RW_NOPROFILE)
 		flags |= LO_NOPROFILE;
 	if (!(opts & RW_NOWITNESS))
 		flags |= LO_WITNESS;
 	if (opts & RW_RECURSE)
 		flags |= LO_RECURSABLE;
 	if (opts & RW_QUIET)
 		flags |= LO_QUIET;
 
 	lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags);
 	rw->rw_lock = RW_UNLOCKED;
 	rw->rw_recurse = 0;
 }
 
 /*
  * Destroy the rwlock whose rw_lock word is at 'c'.  The lock must be
  * unlocked and not recursed (asserted).  Its lock word is set to
  * RW_DESTROYED, which the lock/unlock entry points assert against.
  */
 void
 _rw_destroy(volatile uintptr_t *c)
 {
 	struct rwlock *rw;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw));
 	KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw));
 	rw->rw_lock = RW_DESTROYED;
 	lock_destroy(&rw->lock_object);
 }
 
 /*
  * Initialization callback taking a 'struct rw_args *' as 'arg':
  * initializes args->ra_rw with the description args->ra_desc.
  */
 void
 rw_sysinit(void *arg)
 {
 	struct rw_args *args = arg;
 
 	rw_init((struct rwlock *)args->ra_rw, args->ra_desc);
 }
 
 /*
  * Initialization callback taking a 'struct rw_args_flags *' as 'arg':
  * like rw_sysinit() but also passes args->ra_flags to rw_init_flags().
  */
 void
 rw_sysinit_flags(void *arg)
 {
 	struct rw_args_flags *args = arg;
 
 	rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc,
 	    args->ra_flags);
 }
 
 /*
  * Return non-zero if the current thread is the write owner of the
  * rwlock whose rw_lock word is at 'c'.
  */
 int
 _rw_wowned(const volatile uintptr_t *c)
 {
 
 	return (rw_wowner(rwlock2rw(c)) == curthread);
 }
 
 /*
  * Acquire a write lock on the rwlock whose rw_lock word is at 'c'.
  * 'file' and 'line' identify the call site for assertions, witness and
  * lock logging.  The acquisition itself is done by __rw_wlock();
  * afterwards witness is informed and curthread->td_locks is bumped.
  * Does nothing when SCHEDULER_STOPPED() is true.
  */
 void
 _rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d",
 	    curthread, rw->lock_object.lo_name, file, line));
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_wlock() of destroyed rwlock @ %s:%d", file, line));
 	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
 	    line, NULL);
 	__rw_wlock(rw, curthread, file, line);
 	LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line);
 	WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
 	curthread->td_locks++;
 }
 
 /*
  * Try to take a write lock on the rwlock whose rw_lock word is at 'c'
  * without blocking.
  *
  * If the current thread already write-owns the lock and the lock is
  * LO_RECURSABLE, the recursion count is bumped.  Otherwise a single
  * atomic compare-and-set from RW_UNLOCKED to curthread is attempted.
  * Returns non-zero on success (also when SCHEDULER_STOPPED() is true)
  * and 0 on failure; witness and td_locks are updated only on success.
  */
 int
 __rw_try_wlock(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	int rval;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	rw = rwlock2rw(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d",
 	    curthread, rw->lock_object.lo_name, file, line));
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line));
 
 	if (rw_wlocked(rw) &&
 	    (rw->lock_object.lo_flags & LO_RECURSABLE) != 0) {
 		rw->rw_recurse++;
 		rval = 1;
 	} else
 		rval = atomic_cmpset_acq_ptr(&rw->rw_lock, RW_UNLOCKED,
 		    (uintptr_t)curthread);
 
 	LOCK_LOG_TRY("WLOCK", &rw->lock_object, 0, rval, file, line);
 	if (rval) {
 		WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
 		    file, line);
 		curthread->td_locks++;
 	}
 	return (rval);
 }
 
 /*
  * Release a write lock on the rwlock whose rw_lock word is at 'c'.
  * 'file' and 'line' identify the call site.  Asserts that the lock is
  * not destroyed and is write-locked, informs witness and the lock log,
  * releases through __rw_wunlock() and decrements curthread->td_locks.
  * Does nothing when SCHEDULER_STOPPED() is true.
  */
 void
 _rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line));
 	__rw_assert(c, RA_WLOCKED, file, line);
 	WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
 	LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file,
 	    line);
-	if (!rw_recursed(rw))
-		LOCKSTAT_PROFILE_RELEASE_LOCK(LS_RW_WUNLOCK_RELEASE, rw);
 	__rw_wunlock(rw, curthread, file, line);
 	curthread->td_locks--;
 }
 /*
  * Determines whether a new reader can acquire a lock.  Succeeds if the
  * reader already owns a read lock and the lock is locked for read to
  * prevent deadlock from reader recursion.  Also succeeds if the lock
  * is unlocked and has no writer waiters or spinners.  Failing otherwise
  * prioritizes writers before readers.
  */
 #define	RW_CAN_READ(_rw)						\
     ((curthread->td_rw_rlocks && (_rw) & RW_LOCK_READ) || ((_rw) &	\
     (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER)) ==	\
     RW_LOCK_READ)
 
 /*
  * Acquire a read lock on the rwlock whose rw_lock word is at 'c',
  * blocking if necessary.  'file' and 'line' identify the call site for
  * assertions, witness, logging and profiling.
  *
  * Each pass of the loop first tries to add one reader to the lock word
  * when RW_CAN_READ() allows it.  Failing that, with ADAPTIVE_RWLOCKS it
  * may spin (while a write owner is running, or for a bounded number of
  * rounds while the lock is read-held), and otherwise it sets
  * RW_LOCK_READ_WAITERS and blocks on the shared queue of the lock's
  * turnstile.  On return the lock is held and both td_locks and
  * td_rw_rlocks of the current thread have been incremented.  Does
  * nothing when SCHEDULER_STOPPED() is true.
  */
 void
 __rw_rlock(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	struct turnstile *ts;
 #ifdef ADAPTIVE_RWLOCKS
 	volatile struct thread *owner;
 	int spintries = 0;
 	int i;
 #endif
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 	uintptr_t v;
 #ifdef KDTRACE_HOOKS
 	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 #endif
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d",
 	    curthread, rw->lock_object.lo_name, file, line));
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_rlock() of destroyed rwlock @ %s:%d", file, line));
 	KASSERT(rw_wowner(rw) != curthread,
 	    ("rw_rlock: wlock already held for %s @ %s:%d",
 	    rw->lock_object.lo_name, file, line));
 	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL);
 
 	for (;;) {
 #ifdef KDTRACE_HOOKS
 		spin_cnt++;
 #endif
 		/*
 		 * Handle the easy case.  If no other thread has a write
 		 * lock, then try to bump up the count of read locks.  Note
 		 * that we have to preserve the current state of the
 		 * RW_LOCK_WRITE_WAITERS flag.  If we fail to acquire a
 		 * read lock, then rw_lock must have changed, so restart
 		 * the loop.  Note that this handles the case of a
 		 * completely unlocked rwlock since such a lock is encoded
 		 * as a read lock with no waiters.
 		 */
 		v = rw->rw_lock;
 		if (RW_CAN_READ(v)) {
 			/*
 			 * The RW_LOCK_READ_WAITERS flag should only be set
 			 * if the lock has been unlocked and write waiters
 			 * were present.
 			 */
 			if (atomic_cmpset_acq_ptr(&rw->rw_lock, v,
 			    v + RW_ONE_READER)) {
 				if (LOCK_LOG_TEST(&rw->lock_object, 0))
 					CTR4(KTR_LOCK,
 					    "%s: %p succeed %p -> %p", __func__,
 					    rw, (void *)v,
 					    (void *)(v + RW_ONE_READER));
 				break;
 			}
 			continue;
 		}
 #ifdef HWPMC_HOOKS
 		PMC_SOFT_CALL( , , lock, failed);
 #endif
 		lock_profile_obtain_lock_failed(&rw->lock_object,
 		    &contested, &waittime);
 
 #ifdef ADAPTIVE_RWLOCKS
 		/*
 		 * If the owner is running on another CPU, spin until
 		 * the owner stops running or the state of the lock
 		 * changes.
 		 */
 		if ((v & RW_LOCK_READ) == 0) {
 			owner = (struct thread *)RW_OWNER(v);
 			if (TD_IS_RUNNING(owner)) {
 				if (LOCK_LOG_TEST(&rw->lock_object, 0))
 					CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 					    __func__, rw, owner);
 				while ((struct thread*)RW_OWNER(rw->rw_lock) ==
 				    owner && TD_IS_RUNNING(owner)) {
 					cpu_spinwait();
 #ifdef KDTRACE_HOOKS
 					spin_cnt++;
 #endif
 				}
 				continue;
 			}
 		} else if (spintries < rowner_retries) {
 			/*
 			 * Read-held but not readable by us: spin for at most
 			 * rowner_loops iterations, at most rowner_retries
 			 * times per call, before falling through to block.
 			 */
 			spintries++;
 			for (i = 0; i < rowner_loops; i++) {
 				v = rw->rw_lock;
 				if ((v & RW_LOCK_READ) == 0 || RW_CAN_READ(v))
 					break;
 				cpu_spinwait();
 			}
 			if (i != rowner_loops)
 				continue;
 		}
 #endif
 
 		/*
 		 * Okay, now it's the hard case.  Some other thread already
 		 * has a write lock or there are write waiters present,
 		 * acquire the turnstile lock so we can begin the process
 		 * of blocking.
 		 */
 		ts = turnstile_trywait(&rw->lock_object);
 
 		/*
 		 * The lock might have been released while we spun, so
 		 * recheck its state and restart the loop if needed.
 		 */
 		v = rw->rw_lock;
 		if (RW_CAN_READ(v)) {
 			turnstile_cancel(ts);
 			continue;
 		}
 
 #ifdef ADAPTIVE_RWLOCKS
 		/*
 		 * The current lock owner might have started executing
 		 * on another CPU (or the lock could have changed
 		 * owners) while we were waiting on the turnstile
 		 * chain lock.  If so, drop the turnstile lock and try
 		 * again.
 		 */
 		if ((v & RW_LOCK_READ) == 0) {
 			owner = (struct thread *)RW_OWNER(v);
 			if (TD_IS_RUNNING(owner)) {
 				turnstile_cancel(ts);
 				continue;
 			}
 		}
 #endif
 
 		/*
 		 * The lock is held in write mode or it already has waiters.
 		 */
 		MPASS(!RW_CAN_READ(v));
 
 		/*
 		 * If the RW_LOCK_READ_WAITERS flag is already set, then
 		 * we can go ahead and block.  If it is not set then try
 		 * to set it.  If we fail to set it drop the turnstile
 		 * lock and restart the loop.
 		 */
 		if (!(v & RW_LOCK_READ_WAITERS)) {
 			if (!atomic_cmpset_ptr(&rw->rw_lock, v,
 			    v | RW_LOCK_READ_WAITERS)) {
 				turnstile_cancel(ts);
 				continue;
 			}
 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
 				CTR2(KTR_LOCK, "%s: %p set read waiters flag",
 				    __func__, rw);
 		}
 
 		/*
 		 * We were unable to acquire the lock and the read waiters
 		 * flag is set, so we must block on the turnstile.
 		 */
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
 			    rw);
 #ifdef KDTRACE_HOOKS
 		sleep_time -= lockstat_nsecs();
 #endif
 		turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE);
 #ifdef KDTRACE_HOOKS
 		sleep_time += lockstat_nsecs();
 		sleep_cnt++;
 #endif
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
 			    __func__, rw);
 	}
 
 	/*
 	 * TODO: acquire "owner of record" here.  Here be turnstile dragons
 	 * however.  turnstiles don't like owners changing between calls to
 	 * turnstile_wait() currently.
 	 */
 	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_RLOCK_ACQUIRE, rw, contested,
 	    waittime, file, line);
 	LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line);
 	WITNESS_LOCK(&rw->lock_object, 0, file, line);
 	curthread->td_locks++;
 	curthread->td_rw_rlocks++;
 #ifdef KDTRACE_HOOKS
 	if (sleep_time)
 		LOCKSTAT_RECORD1(LS_RW_RLOCK_BLOCK, rw, sleep_time);
 
 	/*
 	 * Record only the loops spinning and not sleeping. 
 	 */
 	if (spin_cnt > sleep_cnt)
 		LOCKSTAT_RECORD1(LS_RW_RLOCK_SPIN, rw, (spin_cnt - sleep_cnt));
 #endif
 }
 
 /*
  * Try to take a read lock on the rwlock whose rw_lock word is at 'c'
  * without blocking.  'file' and 'line' identify the call site.
  *
  * The lock word is sampled and, as long as the sample has RW_LOCK_READ
  * set, an atomic compare-and-set adding one reader is attempted; a
  * failed compare-and-set resamples and retries.  As soon as a sample
  * has RW_LOCK_READ clear the attempt is abandoned.  Waiter bits are not
  * consulted.
  *
  * Returns 1 on success (also when SCHEDULER_STOPPED() is true), in
  * which case td_locks and td_rw_rlocks of the current thread have been
  * incremented, and 0 on failure.
  */
 int
 __rw_try_rlock(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	uintptr_t x;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	rw = rwlock2rw(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d",
 	    curthread, rw->lock_object.lo_name, file, line));
 
 	for (;;) {
 		x = rw->rw_lock;
 		/*
 		 * Assert on the sampled value rather than re-reading the
 		 * volatile lock word, so the value checked is the value the
 		 * rest of this iteration acts on.
 		 */
 		KASSERT(x != RW_DESTROYED,
 		    ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line));
 		if (!(x & RW_LOCK_READ))
 			break;
 		if (atomic_cmpset_acq_ptr(&rw->rw_lock, x, x + RW_ONE_READER)) {
 			LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file,
 			    line);
 			WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line);
 			curthread->td_locks++;
 			curthread->td_rw_rlocks++;
 			return (1);
 		}
 	}
 
 	LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line);
 	return (0);
 }
 
 /*
  * Release one read (shared) hold on the rwlock identified by cookie 'c'.
  *
  * The loop below retries until one of three compare-and-set transitions
  * succeeds:
  *   1. more than one reader is recorded: subtract RW_ONE_READER;
  *   2. we are the last reader and no waiter bits are set: store
  *      RW_UNLOCKED;
  *   3. we are the last reader and waiter bits are set: take the turnstile
  *      chain lock, release the lock word and wake one queue of waiters.
  *
  * 'file' and 'line' identify the caller for assertions, WITNESS and lock
  * logging.  Nothing is done when SCHEDULER_STOPPED() is true.
  */
 void
 _rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	struct turnstile *ts;
 	uintptr_t x, v, queue;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_runlock() of destroyed rwlock @ %s:%d", file, line));
 	__rw_assert(c, RA_RLOCKED, file, line);
 	WITNESS_UNLOCK(&rw->lock_object, 0, file, line);
 	LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line);
 
 	/* TODO: drop "owner of record" here. */
 
 	for (;;) {
 		/*
 		 * See if there is more than one read lock held.  If so,
 		 * just drop one and return.
 		 */
 		x = rw->rw_lock;
 		if (RW_READERS(x) > 1) {
 			if (atomic_cmpset_rel_ptr(&rw->rw_lock, x,
 			    x - RW_ONE_READER)) {
 				if (LOCK_LOG_TEST(&rw->lock_object, 0))
 					CTR4(KTR_LOCK,
 					    "%s: %p succeeded %p -> %p",
 					    __func__, rw, (void *)x,
 					    (void *)(x - RW_ONE_READER));
 				break;
 			}
 			/* The lock word changed under us; sample it again. */
 			continue;
 		}
 		/*
 		 * If there aren't any waiters for a write lock, then try
 		 * to drop it quickly.
 		 */
 		if (!(x & RW_LOCK_WAITERS)) {
 			/*
 			 * Apart from the write-spinner bit, the word must
 			 * describe exactly one reader at this point.
 			 */
 			MPASS((x & ~RW_LOCK_WRITE_SPINNER) ==
 			    RW_READERS_LOCK(1));
 			if (atomic_cmpset_rel_ptr(&rw->rw_lock, x,
 			    RW_UNLOCKED)) {
 				if (LOCK_LOG_TEST(&rw->lock_object, 0))
 					CTR2(KTR_LOCK, "%s: %p last succeeded",
 					    __func__, rw);
 				break;
 			}
 			continue;
 		}
 		/*
 		 * Ok, we know we have waiters and we think we are the
 		 * last reader, so grab the turnstile lock.
 		 */
 		turnstile_chain_lock(&rw->lock_object);
 		/* Re-sample only the flag bits now that the chain is locked. */
 		v = rw->rw_lock & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
 		MPASS(v & RW_LOCK_WAITERS);
 
 		/*
 		 * Try to drop our lock leaving the lock in an unlocked
 		 * state.
 		 *
 		 * If you wanted to do explicit lock handoff you'd have to
 		 * do it here.  You'd also want to use turnstile_signal()
 		 * and you'd have to handle the race where a higher
 		 * priority thread blocks on the write lock before the
 		 * thread you wakeup actually runs and have the new thread
 		 * "steal" the lock.  For now it's a lot simpler to just
 		 * wakeup all of the waiters.
 		 *
 		 * As above, if we fail, then another thread might have
 		 * acquired a read lock, so drop the turnstile lock and
 		 * restart.
 		 */
 		x = RW_UNLOCKED;
 		/*
 		 * Write waiters take precedence: wake the exclusive queue and
 		 * carry the read-waiters flag over into the new lock word.
 		 * Otherwise wake the shared queue and leave no flags behind.
 		 */
 		if (v & RW_LOCK_WRITE_WAITERS) {
 			queue = TS_EXCLUSIVE_QUEUE;
 			x |= (v & RW_LOCK_READ_WAITERS);
 		} else
 			queue = TS_SHARED_QUEUE;
 		/* Expect exactly one reader plus the flags sampled above. */
 		if (!atomic_cmpset_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v,
 		    x)) {
 			turnstile_chain_unlock(&rw->lock_object);
 			continue;
 		}
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p last succeeded with waiters",
 			    __func__, rw);
 
 		/*
 		 * Ok.  The lock is released and all that's left is to
 		 * wake up the waiters.  Note that the lock might not be
 		 * free anymore, but in that case the writers will just
 		 * block again if they run before the new lock holder(s)
 		 * release the lock.
 		 */
 		ts = turnstile_lookup(&rw->lock_object);
 		MPASS(ts != NULL);
 		turnstile_broadcast(ts, queue);
 		turnstile_unpend(ts, TS_SHARED_LOCK);
 		turnstile_chain_unlock(&rw->lock_object);
 		break;
 	}
 	LOCKSTAT_PROFILE_RELEASE_LOCK(LS_RW_RUNLOCK_RELEASE, rw);
 	/* Undo the per-thread accounting done when the read lock was taken. */
 	curthread->td_locks--;
 	curthread->td_rw_rlocks--;
 }
 
 /*
  * This function is called when we are unable to obtain a write lock on the
  * first try.  This means that at least one other thread holds either a
  * read or write lock.
  *
  * 'c' is the lock cookie, 'tid' is the value stored in the lock word to
  * mark the caller as the write owner, and 'file'/'line' identify the
  * caller for assertions and logging.
  *
  * If the caller already holds the write lock the recursion count is bumped
  * and the function returns.  Otherwise it loops until the lock is obtained,
  * each iteration either spinning (ADAPTIVE_RWLOCKS only) or blocking on the
  * turnstile's exclusive queue.  Nothing is done when SCHEDULER_STOPPED()
  * is true.
  */
 void
 __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
     int line)
 {
 	struct rwlock *rw;
 	struct turnstile *ts;
 #ifdef ADAPTIVE_RWLOCKS
 	volatile struct thread *owner;
 	int spintries = 0;
 	int i;
 #endif
 	uintptr_t v, x;
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 #ifdef KDTRACE_HOOKS
 	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 #endif
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	/* Recursive acquisition by the current write owner. */
 	if (rw_wlocked(rw)) {
 		KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE,
 		    ("%s: recursing but non-recursive rw %s @ %s:%d\n",
 		    __func__, rw->lock_object.lo_name, file, line));
 		rw->rw_recurse++;
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw);
 		return;
 	}
 
 	if (LOCK_LOG_TEST(&rw->lock_object, 0))
 		CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
 		    rw->lock_object.lo_name, (void *)rw->rw_lock, file, line);
 
 	while (!_rw_write_lock(rw, tid)) {
 #ifdef KDTRACE_HOOKS
 		spin_cnt++;
 #endif
 #ifdef HWPMC_HOOKS
 		PMC_SOFT_CALL( , , lock, failed);
 #endif
 		lock_profile_obtain_lock_failed(&rw->lock_object,
 		    &contested, &waittime);
 #ifdef ADAPTIVE_RWLOCKS
 		/*
 		 * If the lock is write locked and the owner is
 		 * running on another CPU, spin until the owner stops
 		 * running or the state of the lock changes.
 		 */
 		v = rw->rw_lock;
 		owner = (struct thread *)RW_OWNER(v);
 		if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) {
 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
 				CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
 				    __func__, rw, owner);
 			while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
 			    TD_IS_RUNNING(owner)) {
 				cpu_spinwait();
 #ifdef KDTRACE_HOOKS
 				spin_cnt++;
 #endif
 			}
 			continue;
 		}
 		/*
 		 * The lock is read locked with at least one reader: spin a
 		 * bounded number of times (at most rowner_retries rounds of
 		 * rowner_loops iterations) with RW_LOCK_WRITE_SPINNER set in
 		 * the lock word, leaving early once that bit has been cleared.
 		 */
 		if ((v & RW_LOCK_READ) && RW_READERS(v) &&
 		    spintries < rowner_retries) {
 			if (!(v & RW_LOCK_WRITE_SPINNER)) {
 				if (!atomic_cmpset_ptr(&rw->rw_lock, v,
 				    v | RW_LOCK_WRITE_SPINNER)) {
 					continue;
 				}
 			}
 			spintries++;
 			for (i = 0; i < rowner_loops; i++) {
 				if ((rw->rw_lock & RW_LOCK_WRITE_SPINNER) == 0)
 					break;
 				cpu_spinwait();
 			}
 #ifdef KDTRACE_HOOKS
 			spin_cnt += rowner_loops - i;
 #endif
 			/* The spinner bit was cleared: retry the acquisition. */
 			if (i != rowner_loops)
 				continue;
 		}
 #endif
 		ts = turnstile_trywait(&rw->lock_object);
 		v = rw->rw_lock;
 
 #ifdef ADAPTIVE_RWLOCKS
 		/*
 		 * The current lock owner might have started executing
 		 * on another CPU (or the lock could have changed
 		 * owners) while we were waiting on the turnstile
 		 * chain lock.  If so, drop the turnstile lock and try
 		 * again.
 		 */
 		if (!(v & RW_LOCK_READ)) {
 			owner = (struct thread *)RW_OWNER(v);
 			if (TD_IS_RUNNING(owner)) {
 				turnstile_cancel(ts);
 				continue;
 			}
 		}
 #endif
 		/*
 		 * Check the waiters flags of this rwlock.
 		 * If the lock was released without maintaining any pending
 		 * waiters queue, simply try to acquire it.
 		 * If a pending waiters queue is present, claim the lock
 		 * ownership and maintain the pending queue.
 		 */
 		x = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
 		if ((v & ~x) == RW_UNLOCKED) {
 			/* The spinner bit is not carried into the owned state. */
 			x &= ~RW_LOCK_WRITE_SPINNER;
 			if (atomic_cmpset_acq_ptr(&rw->rw_lock, v, tid | x)) {
 				if (x)
 					turnstile_claim(ts);
 				else
 					turnstile_cancel(ts);
 				break;
 			}
 			turnstile_cancel(ts);
 			continue;
 		}
 		/*
 		 * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to
 		 * set it.  If we fail to set it, then loop back and try
 		 * again.
 		 */
 		if (!(v & RW_LOCK_WRITE_WAITERS)) {
 			if (!atomic_cmpset_ptr(&rw->rw_lock, v,
 			    v | RW_LOCK_WRITE_WAITERS)) {
 				turnstile_cancel(ts);
 				continue;
 			}
 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
 				CTR2(KTR_LOCK, "%s: %p set write waiters flag",
 				    __func__, rw);
 		}
 		/*
 		 * We were unable to acquire the lock and the write waiters
 		 * flag is set, so we must block on the turnstile.
 		 */
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
 			    rw);
 #ifdef KDTRACE_HOOKS
 		/* Accumulate the time spent blocked as (end - start). */
 		sleep_time -= lockstat_nsecs();
 #endif
 		turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE);
 #ifdef KDTRACE_HOOKS
 		sleep_time += lockstat_nsecs();
 		sleep_cnt++;
 #endif
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
 			    __func__, rw);
 #ifdef ADAPTIVE_RWLOCKS
 		/* Having slept, allow a fresh round of reader spinning. */
 		spintries = 0;
 #endif
 	}
 	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_WLOCK_ACQUIRE, rw, contested,
 	    waittime, file, line);
 #ifdef KDTRACE_HOOKS
 	if (sleep_time)
 		LOCKSTAT_RECORD1(LS_RW_WLOCK_BLOCK, rw, sleep_time);
 
 	/*
 	 * Record only the loops spinning and not sleeping.
 	 */
 	if (spin_cnt > sleep_cnt)
 		LOCKSTAT_RECORD1(LS_RW_WLOCK_SPIN, rw, (spin_cnt - sleep_cnt));
 #endif
 }
 
 /*
  * This function is called if the first try at releasing a write lock failed.
  * This means that one of the 2 waiter bits must be set indicating that at
  * least one thread is waiting on this lock.
  *
  * A recursed write lock only has its recursion count decremented.
  * Otherwise one queue of waiters is woken (the exclusive queue if write
  * waiters exist, else the shared queue) and the lock word is released
  * with a store-release.  Nothing is done when SCHEDULER_STOPPED() is true.
  *
  * 'tid', 'file' and 'line' are accepted but not referenced in this body.
  */
 void
 __rw_wunlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
     int line)
 {
 	struct rwlock *rw;
 	struct turnstile *ts;
 	uintptr_t v;
 	int queue;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	/* Unwind one level of recursion; the lock stays write-held. */
 	if (rw_wlocked(rw) && rw_recursed(rw)) {
 		rw->rw_recurse--;
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw);
 		return;
 	}
 
 	KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS),
 	    ("%s: neither of the waiter flags are set", __func__));
 
 	if (LOCK_LOG_TEST(&rw->lock_object, 0))
 		CTR2(KTR_LOCK, "%s: %p contested", __func__, rw);
 
 	turnstile_chain_lock(&rw->lock_object);
 	ts = turnstile_lookup(&rw->lock_object);
 	MPASS(ts != NULL);
 
 	/*
 	 * Use the same algo as sx locks for now.  Prefer waking up shared
 	 * waiters if we have any over writers.  This is probably not ideal.
 	 *
 	 * 'v' is the value we are going to write back to rw_lock.  If we
 	 * have waiters on both queues, we need to preserve the state of
 	 * the waiter flag for the queue we don't wake up.  For now this is
 	 * hardcoded for the algorithm mentioned above.
 	 *
 	 * In the case of both readers and writers waiting we wakeup the
 	 * readers but leave the RW_LOCK_WRITE_WAITERS flag set.  If a
 	 * new writer comes in before a reader it will claim the lock up
 	 * above.  There is probably a potential priority inversion in
 	 * there that could be worked around either by waking both queues
 	 * of waiters or doing some complicated lock handoff gymnastics.
 	 *
 	 * NOTE(review): the code below selects TS_EXCLUSIVE_QUEUE whenever
 	 * RW_LOCK_WRITE_WAITERS is set and preserves RW_LOCK_READ_WAITERS,
 	 * which is the opposite preference to the one described above --
 	 * confirm which is intended and update this comment.
 	 */
 	v = RW_UNLOCKED;
 	if (rw->rw_lock & RW_LOCK_WRITE_WAITERS) {
 		queue = TS_EXCLUSIVE_QUEUE;
 		v |= (rw->rw_lock & RW_LOCK_READ_WAITERS);
 	} else
 		queue = TS_SHARED_QUEUE;
 
 	/* Wake up all waiters for the specific queue. */
 	if (LOCK_LOG_TEST(&rw->lock_object, 0))
 		CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw,
 		    queue == TS_SHARED_QUEUE ? "read" : "write");
 	turnstile_broadcast(ts, queue);
 	/* Publish the released lock word before unpending the waiters. */
 	atomic_store_rel_ptr(&rw->rw_lock, v);
 	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
 	turnstile_chain_unlock(&rw->lock_object);
 }
 
 /*
  * Attempt to do a non-blocking upgrade from a read lock to a write
  * lock.  This will only succeed if this thread holds a single read
  * lock.  Returns true if the upgrade succeeded and false otherwise.
  *
  * The attempt is abandoned (returning 0) as soon as the lock word shows
  * more than one reader.  When SCHEDULER_STOPPED() is true the function
  * returns 1 without touching the lock.  'file' and 'line' identify the
  * caller for assertions, WITNESS and lock logging.
  */
 int
 __rw_try_upgrade(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	uintptr_t v, x, tid;
 	struct turnstile *ts;
 	int success;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	rw = rwlock2rw(c);
 
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line));
 	__rw_assert(c, RA_RLOCKED, file, line);
 
 	/*
 	 * Attempt to switch from one reader to a writer.  If there
 	 * are any write waiters, then we will have to lock the
 	 * turnstile first to prevent races with another writer
 	 * calling turnstile_wait() before we have claimed this
 	 * turnstile.  So, do the simple case of no waiters first.
 	 */
 	tid = (uintptr_t)curthread;
 	success = 0;
 	for (;;) {
 		v = rw->rw_lock;
 		/* Other readers are present: the upgrade cannot succeed. */
 		if (RW_READERS(v) > 1)
 			break;
 		if (!(v & RW_LOCK_WAITERS)) {
 			success = atomic_cmpset_ptr(&rw->rw_lock, v, tid);
 			if (!success)
 				continue;
 			break;
 		}
 
 		/*
 		 * Ok, we think we have waiters, so lock the turnstile.
 		 */
 		ts = turnstile_trywait(&rw->lock_object);
 		v = rw->rw_lock;
 		if (RW_READERS(v) > 1) {
 			turnstile_cancel(ts);
 			break;
 		}
 		/*
 		 * Try to switch from one reader to a writer again.  This time
 		 * we honor the current state of the waiters flags.
 		 * If we obtain the lock with the flags set, then claim
 		 * ownership of the turnstile.
 		 */
 		x = rw->rw_lock & RW_LOCK_WAITERS;
 		success = atomic_cmpset_ptr(&rw->rw_lock, v, tid | x);
 		if (success) {
 			if (x)
 				turnstile_claim(ts);
 			else
 				turnstile_cancel(ts);
 			break;
 		}
 		/* Lost a race on the lock word: drop the turnstile and retry. */
 		turnstile_cancel(ts);
 	}
 	LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line);
 	if (success) {
 		/*
 		 * The hold is no longer a read lock; td_locks is left as is
 		 * since the thread still holds this lock.
 		 */
 		curthread->td_rw_rlocks--;
 		WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
 		    file, line);
 		LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, rw);
 	}
 	return (success);
 }
 
 /*
  * Downgrade a write lock into a single read lock.
  *
  * The caller must hold the write lock without recursion.  With no waiters
  * the lock word is switched in one compare-and-set; otherwise the
  * turnstile chain is locked, the word is rewritten with the waiter flags
  * that must survive, and either the read waiters are woken or the
  * turnstile is disowned.  Nothing is done when SCHEDULER_STOPPED() is
  * true.  'file' and 'line' identify the caller for assertions, WITNESS
  * and lock logging.
  */
 void
 __rw_downgrade(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	struct turnstile *ts;
 	uintptr_t tid, v;
 	int rwait, wwait;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line));
 	__rw_assert(c, RA_WLOCKED | RA_NOTRECURSED, file, line);
 	/*
 	 * NOTE(review): presumably the assertion above is compiled out
 	 * without INVARIANTS, hence this explicit check -- confirm against
 	 * the rw_assert definitions in the rwlock header.
 	 */
 #ifndef INVARIANTS
 	if (rw_recursed(rw))
 		panic("downgrade of a recursed lock");
 #endif
 
 	WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line);
 
 	/*
 	 * Convert from a writer to a single reader.  First we handle
 	 * the easy case with no waiters.  If there are any waiters, we
 	 * lock the turnstile and "disown" the lock.
 	 */
 	tid = (uintptr_t)curthread;
 	if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1)))
 		goto out;
 
 	/*
 	 * Ok, we think we have waiters, so lock the turnstile so we can
 	 * read the waiter flags without any races.
 	 */
 	turnstile_chain_lock(&rw->lock_object);
 	v = rw->rw_lock & RW_LOCK_WAITERS;
 	rwait = v & RW_LOCK_READ_WAITERS;
 	wwait = v & RW_LOCK_WRITE_WAITERS;
 	MPASS(rwait | wwait);
 
 	/*
 	 * Downgrade from a write lock while preserving waiters flag
 	 * and give up ownership of the turnstile.
 	 */
 	ts = turnstile_lookup(&rw->lock_object);
 	MPASS(ts != NULL);
 	/*
 	 * With no write waiters the read waiters are woken below, so their
 	 * flag is dropped from the new lock word.
 	 */
 	if (!wwait)
 		v &= ~RW_LOCK_READ_WAITERS;
 	atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v);
 	/*
 	 * Wake other readers if there are no writers pending.  Otherwise they
 	 * won't be able to acquire the lock anyway.
 	 */
 	if (rwait && !wwait) {
 		turnstile_broadcast(ts, TS_SHARED_QUEUE);
 		turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
 	} else
 		turnstile_disown(ts);
 	turnstile_chain_unlock(&rw->lock_object);
 out:
 	/* The hold now counts as a read lock for this thread. */
 	curthread->td_rw_rlocks++;
 	LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line);
 	LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, rw);
 }
 
 #ifdef INVARIANT_SUPPORT
 #ifndef INVARIANTS
 #undef __rw_assert
 #endif
 
 /*
  * Check that the rwlock identified by cookie 'c' is in the state described
  * by 'what' (one of the RA_* combinations handled below) and panic if it
  * is not.  'file' and 'line' identify the caller in panic messages.  An
  * unrecognized 'what' value also panics.  No checking is performed once
  * panicstr is set.
  *
  * In the non-WITNESS case, rw_assert() can only detect that at least
  * *some* thread owns an rlock, but it cannot guarantee that *this*
  * thread owns an rlock.
  */
 void
 __rw_assert(const volatile uintptr_t *c, int what, const char *file, int line)
 {
 	const struct rwlock *rw;
 
 	if (panicstr != NULL)
 		return;
 
 	rw = rwlock2rw(c);
 
 	switch (what) {
 	case RA_LOCKED:
 	case RA_LOCKED | RA_RECURSED:
 	case RA_LOCKED | RA_NOTRECURSED:
 	case RA_RLOCKED:
 	case RA_RLOCKED | RA_RECURSED:
 	case RA_RLOCKED | RA_NOTRECURSED:
 #ifdef WITNESS
 		witness_assert(&rw->lock_object, what, file, line);
 #else
 		/*
 		 * If some other thread has a write lock or we have one
 		 * and are asserting a read lock, fail.  Also, if no one
 		 * has a lock at all, fail.
 		 */
 		if (rw->rw_lock == RW_UNLOCKED ||
 		    (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED ||
 		    rw_wowner(rw) != curthread)))
 			panic("Lock %s not %slocked @ %s:%d\n",
 			    rw->lock_object.lo_name, (what & RA_RLOCKED) ?
 			    "read " : "", file, line);
 
 		/*
 		 * Recursion is only checked when the lock is write-held and
 		 * a read lock was not specifically requested.
 		 */
 		if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) {
 			if (rw_recursed(rw)) {
 				if (what & RA_NOTRECURSED)
 					panic("Lock %s recursed @ %s:%d\n",
 					    rw->lock_object.lo_name, file,
 					    line);
 			} else if (what & RA_RECURSED)
 				panic("Lock %s not recursed @ %s:%d\n",
 				    rw->lock_object.lo_name, file, line);
 		}
 #endif
 		break;
 	case RA_WLOCKED:
 	case RA_WLOCKED | RA_RECURSED:
 	case RA_WLOCKED | RA_NOTRECURSED:
 		/* The write owner can be checked exactly, WITNESS or not. */
 		if (rw_wowner(rw) != curthread)
 			panic("Lock %s not exclusively locked @ %s:%d\n",
 			    rw->lock_object.lo_name, file, line);
 		if (rw_recursed(rw)) {
 			if (what & RA_NOTRECURSED)
 				panic("Lock %s recursed @ %s:%d\n",
 				    rw->lock_object.lo_name, file, line);
 		} else if (what & RA_RECURSED)
 			panic("Lock %s not recursed @ %s:%d\n",
 			    rw->lock_object.lo_name, file, line);
 		break;
 	case RA_UNLOCKED:
 #ifdef WITNESS
 		witness_assert(&rw->lock_object, what, file, line);
 #else
 		/*
 		 * If we hold a write lock fail.  We can't reliably check
 		 * to see if we hold a read lock or not.
 		 */
 		if (rw_wowner(rw) == curthread)
 			panic("Lock %s exclusively locked @ %s:%d\n",
 			    rw->lock_object.lo_name, file, line);
 #endif
 		break;
 	default:
 		panic("Unknown rw lock assertion: %d @ %s:%d", what, file,
 		    line);
 	}
 }
 #endif /* INVARIANT_SUPPORT */
 
 #ifdef DDB
 void
 db_show_rwlock(const struct lock_object *lock)
 {
 	const struct rwlock *rw;
 	struct thread *td;
 
 	rw = (const struct rwlock *)lock;
 
 	db_printf(" state: ");
 	if (rw->rw_lock == RW_UNLOCKED)
 		db_printf("UNLOCKED\n");
 	else if (rw->rw_lock == RW_DESTROYED) {
 		db_printf("DESTROYED\n");
 		return;
 	} else if (rw->rw_lock & RW_LOCK_READ)
 		db_printf("RLOCK: %ju locks\n",
 		    (uintmax_t)(RW_READERS(rw->rw_lock)));
 	else {
 		td = rw_wowner(rw);
 		db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
 		    td->td_tid, td->td_proc->p_pid, td->td_name);
 		if (rw_recursed(rw))
 			db_printf(" recursed: %u\n", rw->rw_recurse);
 	}
 	db_printf(" waiters: ");
 	switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) {
 	case RW_LOCK_READ_WAITERS:
 		db_printf("readers\n");
 		break;
 	case RW_LOCK_WRITE_WAITERS:
 		db_printf("writers\n");
 		break;
 	case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS:
 		db_printf("readers and writers\n");
 		break;
 	default:
 		db_printf("none\n");
 		break;
 	}
 }
 
 #endif
Index: head/sys/kern/kern_sdt.c
===================================================================
--- head/sys/kern/kern_sdt.c	(revision 258540)
+++ head/sys/kern/kern_sdt.c	(revision 258541)
@@ -1,51 +1,49 @@
 /*-
  * Copyright 2006-2008 John Birrell <jb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
-#include "opt_kdtrace.h"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sdt.h>
 
 /*
  * Hook for the DTrace probe function. The SDT provider will set this to
  * dtrace_probe() when it loads.
  */
 sdt_probe_func_t sdt_probe_func = sdt_probe_stub;
 
 /*
  * Default probe handler: sdt_probe_func points here until something
  * replaces it.  All six arguments are ignored; the function only prints
  * a diagnostic, because it is not expected to be reached when kernel
  * DTrace support is absent and nothing can enable a probe.
  */
 void
 sdt_probe_stub(uint32_t id,
     uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
     uintptr_t arg3, uintptr_t arg4)
 {
 
 	printf("sdt_probe_stub: Why did this get called?\n");
 }
Index: head/sys/kern/kern_sig.c
===================================================================
--- head/sys/kern/kern_sig.c	(revision 258540)
+++ head/sys/kern/kern_sig.c	(revision 258541)
@@ -1,3472 +1,3471 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_sig.c	8.7 (Berkeley) 4/18/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 #include "opt_ktrace.h"
 #include "opt_core.h"
 #include "opt_procdesc.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/acct.h>
 #include <sys/capability.h>
 #include <sys/condvar.h>
 #include <sys/event.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/ktrace.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/procdesc.h>
 #include <sys/posix4.h>
 #include <sys/pioctl.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sdt.h>
 #include <sys/sbuf.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/timers.h>
 #include <sys/unistd.h>
 #include <sys/wait.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <sys/jail.h>
 
 #include <machine/cpu.h>
 
 #include <security/audit/audit.h>
 
 #define	ONSIG	32		/* NSIG for osig* syscalls.  XXX. */
 
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE3(proc, kernel, , signal_send, signal-send, "struct thread *",
     "struct proc *", "int");
 SDT_PROBE_DEFINE2(proc, kernel, , signal_clear, signal-clear, "int",
     "ksiginfo_t *");
 SDT_PROBE_DEFINE3(proc, kernel, , signal_discard, signal-discard,
     "struct thread *", "struct proc *", "int");
 
 static int	coredump(struct thread *);
 static int	killpg1(struct thread *td, int sig, int pgid, int all,
 		    ksiginfo_t *ksi);
 static int	issignal(struct thread *td);
 static int	sigprop(int sig);
 static void	tdsigwakeup(struct thread *, int, sig_t, int);
 static void	sig_suspend_threads(struct thread *, struct proc *, int);
 static int	filt_sigattach(struct knote *kn);
 static void	filt_sigdetach(struct knote *kn);
 static int	filt_signal(struct knote *kn, long hint);
 static struct thread *sigtd(struct proc *p, int sig, int prop);
 static void	sigqueue_start(void);
 
 static uma_zone_t	ksiginfo_zone = NULL;
 struct filterops sig_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_sigattach,
 	.f_detach = filt_sigdetach,
 	.f_event = filt_signal,
 };
 
 static int	kern_logsigexit = 1;
 SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
     &kern_logsigexit, 0,
     "Log processes quitting on abnormal signals to syslog(3)");
 
 static int	kern_forcesigexit = 1;
 SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
     &kern_forcesigexit, 0, "Force trap signal to be handled");
 
 static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0,
     "POSIX real time signal");
 
 static int	max_pending_per_proc = 128;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
     &max_pending_per_proc, 0, "Max pending signals per proc");
 
 static int	preallocate_siginfo = 1024;
 TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo);
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD,
     &preallocate_siginfo, 0, "Preallocated signal memory size");
 
 static int	signal_overflow = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
     &signal_overflow, 0, "Number of signals overflew");
 
 static int	signal_alloc_fail = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
     &signal_alloc_fail, 0, "signals failed to be allocated");
 
 SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
 
 /*
  * Policy -- Can ucred cr1 send SIGIO to process cr2?
  * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
  * in the right situations.
  */
 #define CANSIGIO(cr1, cr2) \
 	((cr1)->cr_uid == 0 || \
 	    (cr1)->cr_ruid == (cr2)->cr_ruid || \
 	    (cr1)->cr_uid == (cr2)->cr_ruid || \
 	    (cr1)->cr_ruid == (cr2)->cr_uid || \
 	    (cr1)->cr_uid == (cr2)->cr_uid)
 
 static int	sugid_coredump;
 TUNABLE_INT("kern.sugid_coredump", &sugid_coredump);
 SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW,
     &sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
 
 static int	capmode_coredump;
 TUNABLE_INT("kern.capmode_coredump", &capmode_coredump);
 SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RW,
     &capmode_coredump, 0, "Allow processes in capability mode to dump core");
 
 static int	do_coredump = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
 	&do_coredump, 0, "Enable/Disable coredumps");
 
 static int	set_core_nodump_flag = 0;
 SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
 	0, "Enable setting the NODUMP flag on coredump files");
 
 /*
  * Signal properties and actions.
  * The array below categorizes the signals and their default actions
  * according to the following properties.  The SA_* values are bit flags
  * and an entry may combine several of them.
  */
 #define	SA_KILL		0x01		/* terminates process by default */
 #define	SA_CORE		0x02		/* ditto and coredumps */
 #define	SA_STOP		0x04		/* suspend process */
 #define	SA_TTYSTOP	0x08		/* ditto, from tty */
 #define	SA_IGNORE	0x10		/* ignore by default */
 #define	SA_CONT		0x20		/* continue if suspended */
 #define	SA_CANTMASK	0x40		/* non-maskable, catchable */
 
 /*
  * One entry per signal, listed in the order given by the per-entry
  * comments, starting with SIGHUP at index 0.  Only the entries spelled out
  * here have explicit initializers; any remaining elements up to NSIG are
  * zero-initialized.
  * NOTE(review): presumably indexed by (signal number - 1) -- confirm
  * against the lookup code that reads this table.
  */
 static int sigproptbl[NSIG] = {
 	SA_KILL,			/* SIGHUP */
 	SA_KILL,			/* SIGINT */
 	SA_KILL|SA_CORE,		/* SIGQUIT */
 	SA_KILL|SA_CORE,		/* SIGILL */
 	SA_KILL|SA_CORE,		/* SIGTRAP */
 	SA_KILL|SA_CORE,		/* SIGABRT */
 	SA_KILL|SA_CORE,		/* SIGEMT */
 	SA_KILL|SA_CORE,		/* SIGFPE */
 	SA_KILL,			/* SIGKILL */
 	SA_KILL|SA_CORE,		/* SIGBUS */
 	SA_KILL|SA_CORE,		/* SIGSEGV */
 	SA_KILL|SA_CORE,		/* SIGSYS */
 	SA_KILL,			/* SIGPIPE */
 	SA_KILL,			/* SIGALRM */
 	SA_KILL,			/* SIGTERM */
 	SA_IGNORE,			/* SIGURG */
 	SA_STOP,			/* SIGSTOP */
 	SA_STOP|SA_TTYSTOP,		/* SIGTSTP */
 	SA_IGNORE|SA_CONT,		/* SIGCONT */
 	SA_IGNORE,			/* SIGCHLD */
 	SA_STOP|SA_TTYSTOP,		/* SIGTTIN */
 	SA_STOP|SA_TTYSTOP,		/* SIGTTOU */
 	SA_IGNORE,			/* SIGIO */
 	SA_KILL,			/* SIGXCPU */
 	SA_KILL,			/* SIGXFSZ */
 	SA_KILL,			/* SIGVTALRM */
 	SA_KILL,			/* SIGPROF */
 	SA_IGNORE,			/* SIGWINCH  */
 	SA_IGNORE,			/* SIGINFO */
 	SA_KILL,			/* SIGUSR1 */
 	SA_KILL,			/* SIGUSR2 */
 };
 
 static void reschedule_signals(struct proc *p, sigset_t block, int flags);
 
 static void
 sigqueue_start(void)
 {
 	ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_prealloc(ksiginfo_zone, preallocate_siginfo);
 	p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
 	p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1);
 	p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
 }
 
 /*
  * Allocate a zeroed ksiginfo from ksiginfo_zone.
  *
  * A zero 'wait' adds M_NOWAIT to the allocation flags.  Returns NULL when
  * the zone has not been created yet, otherwise whatever uma_zalloc()
  * returns.
  */
 ksiginfo_t *
 ksiginfo_alloc(int wait)
 {
 	int zflags;
 
 	if (ksiginfo_zone == NULL)
 		return (NULL);
 
 	zflags = wait ? M_ZERO : (M_ZERO | M_NOWAIT);
 	return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, zflags));
 }
 
 /*
  * Return 'ksi' to ksiginfo_zone unconditionally (no KSI_EXT check is made
  * here, unlike ksiginfo_tryfree()).
  */
 void
 ksiginfo_free(ksiginfo_t *ksi)
 {
 
 	uma_zfree(ksiginfo_zone, ksi);
 }
 
 /*
  * Free 'ksi' back to ksiginfo_zone unless it is flagged KSI_EXT.
  * Returns 1 if the item was freed and 0 if it was left alone.
  */
 static __inline int
 ksiginfo_tryfree(ksiginfo_t *ksi)
 {
 
 	if ((ksi->ksi_flags & KSI_EXT) != 0)
 		return (0);
 
 	uma_zfree(ksiginfo_zone, ksi);
 	return (1);
 }
 
 /*
  * Put 'list' into its empty state: no pending signal bits, an empty
  * ksiginfo list, owner process 'p', and the SQ_INIT flag set.
  */
 void
 sigqueue_init(sigqueue_t *list, struct proc *p)
 {
 
 	list->sq_proc = p;
 	TAILQ_INIT(&list->sq_list);
 	SIGEMPTYSET(list->sq_signals);
 	SIGEMPTYSET(list->sq_kill);
 	/* Mark the queue usable only once every field has been set up. */
 	list->sq_flags = SQ_INIT;
 }
 
 /*
  * Dequeue one pending instance of 'signo' from 'sq' into 'si'.
  *
  * An instance recorded only in the sq_kill bitmap is consumed first;
  * otherwise the first queued ksiginfo for the signal is unlinked, copied
  * into 'si' and released.  The signal stays marked pending in sq_signals
  * only if a further instance remains.
  *
  * Return:
  *	0	-	signal not found
  *	others	-	signal number
  */
 static int
 sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *owner;
 	struct ksiginfo *cur, *cur_next;
 	int instances;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	if (!SIGISMEMBER(sq->sq_signals, signo))
 		return (0);
 
 	owner = sq->sq_proc;
 	instances = 0;
 	if (SIGISMEMBER(sq->sq_kill, signo)) {
 		SIGDELSET(sq->sq_kill, signo);
 		instances = 1;
 	}
 
 	TAILQ_FOREACH_SAFE(cur, &sq->sq_list, ksi_link, cur_next) {
 		if (cur->ksi_signo != signo)
 			continue;
 		if (instances == 0) {
 			/* First instance: this is the one handed back. */
 			TAILQ_REMOVE(&sq->sq_list, cur, ksi_link);
 			cur->ksi_sigq = NULL;
 			ksiginfo_copy(cur, si);
 			if (ksiginfo_tryfree(cur) && owner != NULL)
 				owner->p_pendingcnt--;
 		}
 		instances++;
 		/* A second instance is all we need to know about. */
 		if (instances > 1)
 			break;
 	}
 
 	if (instances <= 1)
 		SIGDELSET(sq->sq_signals, signo);
 	si->ksi_signo = signo;
 	return (signo);
 }
 
 /*
  * Unlink 'ksi' from the signal queue it is on, if any.
  *
  * The owning process's pending count is decremented unless the item is
  * flagged KSI_EXT.  If no other queued item and no sq_kill bit remains for
  * the same signal number, the signal is cleared from sq_signals.  A NULL
  * 'ksi', or one that is not on a queue, is ignored.
  */
 void
 sigqueue_take(ksiginfo_t *ksi)
 {
 	struct ksiginfo *other;
 	struct proc *p;
 	sigqueue_t *sq;
 
 	if (ksi == NULL)
 		return;
 	sq = ksi->ksi_sigq;
 	if (sq == NULL)
 		return;
 
 	p = sq->sq_proc;
 	TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 	ksi->ksi_sigq = NULL;
 	if ((ksi->ksi_flags & KSI_EXT) == 0 && p != NULL)
 		p->p_pendingcnt--;
 
 	/* Look for another queued instance of the same signal. */
 	TAILQ_FOREACH(other, &sq->sq_list, ksi_link) {
 		if (other->ksi_signo == ksi->ksi_signo)
 			break;
 	}
 	if (other == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo))
 		SIGDELSET(sq->sq_signals, ksi->ksi_signo);
 }
 
 /*
  * Record signal 'signo' as pending on queue 'sq', optionally with the
  * information in 'si'.
  *
  * Returns 0 on success, or EAGAIN when a ksiginfo could not be queued
  * (per-process limit reached or allocation failure) and 'si' has KSI_SIGQ
  * set without KSI_TRAP.  In that EAGAIN case the signal is NOT marked
  * pending.
  */
 static int
 sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *p = sq->sq_proc;
 	struct ksiginfo *ksi;
 	int ret = 0;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	/*
 	 * SIGKILL, SIGSTOP and signals without info are only recorded in
 	 * the sq_kill bitmap; no ksiginfo is queued for them.
 	 */
 	if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 
 	/* directly insert the ksi, don't copy it */
 	if (si->ksi_flags & KSI_INS) {
 		if (si->ksi_flags & KSI_HEAD)
 			TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link);
 		else
 			TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
 		si->ksi_sigq = sq;
 		goto out_set_bit;
 	}
 
 	/* No zone to allocate from yet: fall back to the bitmap. */
 	if (__predict_false(ksiginfo_zone == NULL)) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 
 	/*
 	 * Queue a private copy of 'si', subject to the per-process limit.
 	 * Both failure modes bump a statistics counter and yield EAGAIN.
 	 */
 	if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
 		signal_overflow++;
 		ret = EAGAIN;
 	} else if ((ksi = ksiginfo_alloc(0)) == NULL) {
 		signal_alloc_fail++;
 		ret = EAGAIN;
 	} else {
 		if (p != NULL)
 			p->p_pendingcnt++;
 		ksiginfo_copy(si, ksi);
 		ksi->ksi_signo = signo;
 		if (si->ksi_flags & KSI_HEAD)
 			TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link);
 		else
 			TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
 		ksi->ksi_sigq = sq;
 	}
 
 	/*
 	 * For KSI_TRAP signals, and for signals without KSI_SIGQ, a queueing
 	 * failure is not reported: the signal is recorded in the sq_kill
 	 * bitmap instead and 0 is returned.
 	 */
 	if ((si->ksi_flags & KSI_TRAP) != 0 ||
 	    (si->ksi_flags & KSI_SIGQ) == 0) {
 		if (ret != 0)
 			SIGADDSET(sq->sq_kill, signo);
 		ret = 0;
 		goto out_set_bit;
 	}
 
 	/* Otherwise a failure is returned without marking the signal. */
 	if (ret != 0)
 		return (ret);
 
 out_set_bit:
 	SIGADDSET(sq->sq_signals, signo);
 	return (ret);
 }
 
 /*
  * Discard everything pending on sq: every queued ksiginfo is unlinked and
  * handed to ksiginfo_tryfree(), and both pending-signal sets are emptied.
  * The owning process's pending count is dropped for each entry for which
  * ksiginfo_tryfree() returns nonzero.  When the queue belongs to a
  * process, the process lock is asserted to be held.
  */
 void
 sigqueue_flush(sigqueue_t *sq)
 {
 	struct proc *owner;
 	ksiginfo_t *entry;
 
 	owner = sq->sq_proc;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	if (owner != NULL)
 		PROC_LOCK_ASSERT(owner, MA_OWNED);
 
 	for (;;) {
 		entry = TAILQ_FIRST(&sq->sq_list);
 		if (entry == NULL)
 			break;
 		TAILQ_REMOVE(&sq->sq_list, entry, ksi_link);
 		entry->ksi_sigq = NULL;
 		if (ksiginfo_tryfree(entry) && owner != NULL)
 			owner->p_pendingcnt--;
 	}
 
 	SIGEMPTYSET(sq->sq_signals);
 	SIGEMPTYSET(sq->sq_kill);
 }
 
 /*
  * Transfer every pending signal that is a member of *set from src to dst.
  * Queued ksiginfo entries are relinked at the tail of dst's list, the
  * pending counts of the two owning processes (when present) are adjusted,
  * and the matching bits are moved between the sq_kill and sq_signals sets.
  */
 static void
 sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set)
 {
 	sigset_t tmp;
 	struct proc *p1, *p2;
 	ksiginfo_t *ksi, *next;
 
 	KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 	KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
 	p1 = src->sq_proc;
 	p2 = dst->sq_proc;
 	/* Move siginfo to target list */
 	TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) {
 		if (SIGISMEMBER(*set, ksi->ksi_signo)) {
 			TAILQ_REMOVE(&src->sq_list, ksi, ksi_link);
 			if (p1 != NULL)
 				p1->p_pendingcnt--;
 			TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link);
 			ksi->ksi_sigq = dst;
 			if (p2 != NULL)
 				p2->p_pendingcnt++;
 		}
 	}
 
 	/* Move pending bits to target list */
 	/* tmp = bits present in both src and *set; add to dst, clear in src. */
 	tmp = src->sq_kill;
 	SIGSETAND(tmp, *set);
 	SIGSETOR(dst->sq_kill, tmp);
 	SIGSETNAND(src->sq_kill, tmp);
 
 	tmp = src->sq_signals;
 	SIGSETAND(tmp, *set);
 	SIGSETOR(dst->sq_signals, tmp);
 	SIGSETNAND(src->sq_signals, tmp);
 }
 
 #if 0
 /*
  * Single-signal wrapper around sigqueue_move_set().  Currently compiled
  * out by the enclosing #if 0.
  */
 static void
 sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, signo);
 	sigqueue_move_set(src, dst, &set);
 }
 #endif
 
 /*
  * Drop every pending signal in *set from sq: matching ksiginfo entries are
  * unlinked and passed to ksiginfo_tryfree(), then the corresponding bits
  * are cleared from both pending sets.
  */
 static void
 sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set)
 {
 	struct proc *owner;
 	ksiginfo_t *entry, *following;
 
 	owner = sq->sq_proc;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 
 	/* Unlink and release the queued siginfo for the selected signals. */
 	TAILQ_FOREACH_SAFE(entry, &sq->sq_list, ksi_link, following) {
 		if (!SIGISMEMBER(*set, entry->ksi_signo))
 			continue;
 		TAILQ_REMOVE(&sq->sq_list, entry, ksi_link);
 		entry->ksi_sigq = NULL;
 		if (ksiginfo_tryfree(entry) && owner != NULL)
 			owner->p_pendingcnt--;
 	}
 
 	SIGSETNAND(sq->sq_kill, *set);
 	SIGSETNAND(sq->sq_signals, *set);
 }
 
 /*
  * Remove a single pending signal, signo, from sq by building a
  * one-element set and delegating to sigqueue_delete_set().
  */
 void
 sigqueue_delete(sigqueue_t *sq, int signo)
 {
 	sigset_t single;
 
 	SIGEMPTYSET(single);
 	SIGADDSET(single, signo);
 
 	sigqueue_delete_set(sq, &single);
 }
 
 /*
  * Remove a set of pending signals from a process and from every thread in
  * it.  The matching entries are first gathered onto a temporary queue that
  * has no owning process and are then flushed in a single pass.  The
  * process lock must be held.
  */
 static void
 sigqueue_delete_set_proc(struct proc *p, const sigset_t *set)
 {
 	struct thread *thr;
 	sigqueue_t scratch;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	sigqueue_init(&scratch, NULL);
 
 	/* Process-wide queue first, then each thread's private queue. */
 	sigqueue_move_set(&p->p_sigqueue, &scratch, set);
 	FOREACH_THREAD_IN_PROC(p, thr)
 		sigqueue_move_set(&thr->td_sigqueue, &scratch, set);
 
 	sigqueue_flush(&scratch);
 }
 
 /*
  * Remove one pending signal, signo, from process p and all of its threads
  * by delegating to sigqueue_delete_set_proc() with a one-element set.
  */
 void
 sigqueue_delete_proc(struct proc *p, int signo)
 {
 	sigset_t single;
 
 	SIGEMPTYSET(single);
 	SIGADDSET(single, signo);
 
 	sigqueue_delete_set_proc(p, &single);
 }
 
 /*
  * Remove any pending SIGSTOP, SIGTSTP, SIGTTIN and SIGTTOU from process p
  * and all of its threads.
  */
 static void
 sigqueue_delete_stopmask_proc(struct proc *p)
 {
 	sigset_t stopsigs;
 
 	SIGEMPTYSET(stopsigs);
 	SIGADDSET(stopsigs, SIGSTOP);
 	SIGADDSET(stopsigs, SIGTSTP);
 	SIGADDSET(stopsigs, SIGTTIN);
 	SIGADDSET(stopsigs, SIGTTOU);
 
 	sigqueue_delete_set_proc(p, &stopsigs);
 }
 
 /*
  * Return the signal that should be delivered to thread td (the current
  * thread), or 0 when there is none.  A pending stop signal with default
  * action causes the process to stop inside issignal().
  *
  * The caller must hold the process lock and the sigacts mutex, and must
  * not hold the thread lock.
  */
 int
 cursig(struct thread *td)
 {
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
 
 	if (!SIGPENDING(td))
 		return (0);
 	return (issignal(td));
 }
 
 /*
  * Make ast() look at unmasked pending signals when td next returns to
  * user mode.  Call this whenever a signal has been added to td_sigqueue or
  * has been unmasked in td_sigmask.  The process lock must be held.
  */
 void
 signotify(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* Nothing deliverable: leave the thread flags untouched. */
 	if (!SIGPENDING(td))
 		return;
 
 	thread_lock(td);
 	td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
 	thread_unlock(td);
 }
 
 /*
  * Report whether stack pointer value sp lies on the current thread's
  * alternate signal stack.
  *
  * Returns 0 when the thread has TDP_ALTSTACK clear.  Otherwise the result
  * is nonzero when the unsigned distance of sp from ss_sp is smaller than
  * ss_size.  With COMPAT_43, a stack whose ss_size is 0 is instead reported
  * through the SS_ONSTACK bit of ss_flags.
  */
 int
 sigonstack(size_t sp)
 {
 	struct thread *td = curthread;
 
 	return ((td->td_pflags & TDP_ALTSTACK) ?
 #if defined(COMPAT_43)
 	    ((td->td_sigstk.ss_size == 0) ?
 		(td->td_sigstk.ss_flags & SS_ONSTACK) :
 		((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size))
 #else
 	    ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)
 #endif
 	    : 0);
 }
 
 /*
  * Look up the property flags for signal sig in sigproptbl.  Signal numbers
  * outside the range 1 .. NSIG - 1 have no properties (0).
  */
 static __inline int
 sigprop(int sig)
 {
 
 	if (sig <= 0 || sig >= NSIG)
 		return (0);
 	return (sigproptbl[_SIG_IDX(sig)]);
 }
 
 /*
  * Return the 1-based position of the lowest set bit in *set, scanning its
  * words in order and counting 32 positions per word, or 0 when the set is
  * empty.
  */
 int
 sig_ffs(sigset_t *set)
 {
 	int word;
 
 	for (word = 0; word < _SIG_WORDS; word++) {
 		if (set->__bits[word] == 0)
 			continue;
 		return (ffs(set->__bits[word]) + (word * 32));
 	}
 	return (0);
 }
 
 /*
  * kern_sigaction
  * sigaction
  * freebsd4_sigaction
  * osigaction
  *
  * kern_sigaction() is the common implementation behind the entry points
  * listed above.  It reports the current disposition of signal sig through
  * oact (when non-NULL) and then installs the disposition described by act
  * (when non-NULL), all under the process lock and the sigacts mutex.
  *
  * flags carries KSA_FREEBSD4 / KSA_OSIGSET to record which ABI installed
  * the handler (only consulted under the matching COMPAT_* option).
  *
  * Returns EINVAL for an invalid signal number, or when act asks for
  * anything other than SIG_DFL for SIGKILL or SIGSTOP; otherwise 0.
  */
 int
 kern_sigaction(struct thread *td, int sig, struct sigaction *act,
     struct sigaction *oact, int flags)
 {
 	struct sigacts *ps;
 	struct proc *p = td->td_proc;
 
 	if (!_SIG_VALID(sig))
 		return (EINVAL);
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	if (oact) {
 		/* Rebuild the sigaction flags from the per-signal sets. */
 		oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
 		oact->sa_flags = 0;
 		if (SIGISMEMBER(ps->ps_sigonstack, sig))
 			oact->sa_flags |= SA_ONSTACK;
 		if (!SIGISMEMBER(ps->ps_sigintr, sig))
 			oact->sa_flags |= SA_RESTART;
 		if (SIGISMEMBER(ps->ps_sigreset, sig))
 			oact->sa_flags |= SA_RESETHAND;
 		if (SIGISMEMBER(ps->ps_signodefer, sig))
 			oact->sa_flags |= SA_NODEFER;
 		if (SIGISMEMBER(ps->ps_siginfo, sig)) {
 			oact->sa_flags |= SA_SIGINFO;
 			oact->sa_sigaction =
 			    (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)];
 		} else
 			oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP)
 			oact->sa_flags |= SA_NOCLDSTOP;
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT)
 			oact->sa_flags |= SA_NOCLDWAIT;
 	}
 	if (act) {
 		/* SIGKILL and SIGSTOP may only keep the default action. */
 		if ((sig == SIGKILL || sig == SIGSTOP) &&
 		    act->sa_handler != SIG_DFL) {
 			mtx_unlock(&ps->ps_mtx);
 			PROC_UNLOCK(p);
 			return (EINVAL);
 		}
 
 		/*
 		 * Change setting atomically.
 		 */
 
 		ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask;
 		SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (act->sa_flags & SA_SIGINFO) {
 			ps->ps_sigact[_SIG_IDX(sig)] =
 			    (__sighandler_t *)act->sa_sigaction;
 			SIGADDSET(ps->ps_siginfo, sig);
 		} else {
 			ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler;
 			SIGDELSET(ps->ps_siginfo, sig);
 		}
 		/* ps_sigintr holds the signals that do NOT restart syscalls. */
 		if (!(act->sa_flags & SA_RESTART))
 			SIGADDSET(ps->ps_sigintr, sig);
 		else
 			SIGDELSET(ps->ps_sigintr, sig);
 		if (act->sa_flags & SA_ONSTACK)
 			SIGADDSET(ps->ps_sigonstack, sig);
 		else
 			SIGDELSET(ps->ps_sigonstack, sig);
 		if (act->sa_flags & SA_RESETHAND)
 			SIGADDSET(ps->ps_sigreset, sig);
 		else
 			SIGDELSET(ps->ps_sigreset, sig);
 		if (act->sa_flags & SA_NODEFER)
 			SIGADDSET(ps->ps_signodefer, sig);
 		else
 			SIGDELSET(ps->ps_signodefer, sig);
 		if (sig == SIGCHLD) {
 			if (act->sa_flags & SA_NOCLDSTOP)
 				ps->ps_flag |= PS_NOCLDSTOP;
 			else
 				ps->ps_flag &= ~PS_NOCLDSTOP;
 			if (act->sa_flags & SA_NOCLDWAIT) {
 				/*
 				 * Paranoia: since SA_NOCLDWAIT is implemented
 				 * by reparenting the dying child to PID 1 (and
 				 * trust it to reap the zombie), PID 1 itself
 				 * is forbidden to set SA_NOCLDWAIT.
 				 */
 				if (p->p_pid == 1)
 					ps->ps_flag &= ~PS_NOCLDWAIT;
 				else
 					ps->ps_flag |= PS_NOCLDWAIT;
 			} else
 				ps->ps_flag &= ~PS_NOCLDWAIT;
 			if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 				ps->ps_flag |= PS_CLDSIGIGN;
 			else
 				ps->ps_flag &= ~PS_CLDSIGIGN;
 		}
 		/*
 		 * Set bit in ps_sigignore for signals that are set to SIG_IGN,
 		 * and for signals set to SIG_DFL where the default is to
 		 * ignore. However, don't put SIGCONT in ps_sigignore, as we
 		 * have to restart the process.
 		 */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    (sigprop(sig) & SA_IGNORE &&
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
 			/* never to be seen again */
 			sigqueue_delete_proc(p, sig);
 			if (sig != SIGCONT)
 				/* easier in psignal */
 				SIGADDSET(ps->ps_sigignore, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 		} else {
 			SIGDELSET(ps->ps_sigignore, sig);
 			if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)
 				SIGDELSET(ps->ps_sigcatch, sig);
 			else
 				SIGADDSET(ps->ps_sigcatch, sig);
 		}
 #ifdef COMPAT_FREEBSD4
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_FREEBSD4) == 0)
 			SIGDELSET(ps->ps_freebsd4, sig);
 		else
 			SIGADDSET(ps->ps_freebsd4, sig);
 #endif
 #ifdef COMPAT_43
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_OSIGSET) == 0)
 			SIGDELSET(ps->ps_osigset, sig);
 		else
 			SIGADDSET(ps->ps_osigset, sig);
 #endif
 	}
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 /*
  * sigaction system call: copy the new action in from user space (when
  * supplied), apply it through kern_sigaction(), and copy the previous
  * action back out (when requested and the update succeeded).
  */
 int
 sys_sigaction(struct thread *td, struct sigaction_args *uap)
 {
 	struct sigaction act, oact;
 	struct sigaction *actp, *oactp;
 	int error;
 
 	/* A NULL user pointer means "do not set" / "do not report". */
 	actp = (uap->act != NULL) ? &act : NULL;
 	oactp = (uap->oact != NULL) ? &oact : NULL;
 	if (actp) {
 		error = copyin(uap->act, actp, sizeof(act));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaction(td, uap->sig, actp, oactp, 0);
 	if (oactp && !error)
 		error = copyout(oactp, uap->oact, sizeof(oact));
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 /*
  * FreeBSD 4 compatible sigaction: identical to sys_sigaction() except that
  * KSA_FREEBSD4 is passed to kern_sigaction().
  */
 int
 freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap)
 {
 	struct sigaction act, oact;
 	struct sigaction *actp, *oactp;
 	int error;
 
 	/* A NULL user pointer means "do not set" / "do not report". */
 	actp = (uap->act != NULL) ? &act : NULL;
 	oactp = (uap->oact != NULL) ? &oact : NULL;
 	if (actp) {
 		error = copyin(uap->act, actp, sizeof(act));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4);
 	if (oactp && !error)
 		error = copyout(oactp, uap->oact, sizeof(oact));
 	return (error);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigaction_args {
 	int	signum;
 	struct	osigaction *nsa;
 	struct	osigaction *osa;
 };
 #endif
 /*
  * Old-style sigaction using struct osigaction.  Signal numbers outside
  * 1 .. ONSIG - 1 are rejected with EINVAL.  The old structure is converted
  * to and from struct sigaction around a call to kern_sigaction() made with
  * KSA_OSIGSET.
  */
 int
 osigaction(struct thread *td, struct osigaction_args *uap)
 {
 	struct osigaction sa;
 	struct sigaction nsa, osa;
 	struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	nsap = (uap->nsa != NULL) ? &nsa : NULL;
 	osap = (uap->osa != NULL) ? &osa : NULL;
 
 	if (nsap) {
 		error = copyin(uap->nsa, &sa, sizeof(sa));
 		if (error)
 			return (error);
 		nsap->sa_handler = sa.sa_handler;
 		nsap->sa_flags = sa.sa_flags;
 		OSIG2SIG(sa.sa_mask, nsap->sa_mask);
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		sa.sa_handler = osap->sa_handler;
 		sa.sa_flags = osap->sa_flags;
 		SIG2OSIG(osap->sa_mask, sa.sa_mask);
 		error = copyout(&sa, uap->osa, sizeof(sa));
 	}
 	return (error);
 }
 
 #if !defined(__i386__)
 /*
  * Avoid replicating the same stub everywhere: on every platform other
  * than i386 osigreturn simply forwards to nosys().
  */
 int
 osigreturn(struct thread *td, struct osigreturn_args *uap)
 {
 
 	return (nosys(td, (struct nosys_args *)uap));
 }
 #endif
 #endif /* COMPAT_43 */
 
 /*
  * Initialize signal state for process 0;
  * set to ignore signals that are ignored by default.
  *
  * Every signal whose properties include SA_IGNORE, except SIGCONT, is
  * added to ps_sigignore under the process lock and the sigacts mutex.
  */
 void
 siginit(struct proc *p)
 {
 	int i;
 	struct sigacts *ps;
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	/* sigprop() yields 0 for i == NSIG, so the last pass is a no-op. */
 	for (i = 1; i <= NSIG; i++)
 		if (sigprop(i) & SA_IGNORE && i != SIGCONT)
 			SIGADDSET(ps->ps_sigignore, i);
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 }
 
 /*
  * Reset signals for an exec of the specified process.
  *
  * Every caught signal reverts to SIG_DFL, the first thread's alternate
  * signal stack is disabled, and the PS_NOCLDWAIT / PS_CLDSIGIGN flags are
  * cleared.  The caller must hold the process lock; the sigacts mutex is
  * taken and released here.
  */
 void
 execsigs(struct proc *p)
 {
 	struct sigacts *ps;
 	int sig;
 	struct thread *td;
 
 	/*
 	 * Reset caught signals.  Held signals remain held
 	 * through td_sigmask (unless they were caught,
 	 * and are now ignored by default).
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	td = FIRST_THREAD_IN_PROC(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	/* Peel caught signals off ps_sigcatch one at a time, lowest first. */
 	while (SIGNOTEMPTY(ps->ps_sigcatch)) {
 		sig = sig_ffs(&ps->ps_sigcatch);
 		SIGDELSET(ps->ps_sigcatch, sig);
 		if (sigprop(sig) & SA_IGNORE) {
 			/* Default action is to ignore: drop pending instances. */
 			if (sig != SIGCONT)
 				SIGADDSET(ps->ps_sigignore, sig);
 			sigqueue_delete_proc(p, sig);
 		}
 		ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 	}
 	/*
 	 * Reset stack state to the user stack.
 	 * Clear set of signals caught on the signal stack.
 	 */
 	td->td_sigstk.ss_flags = SS_DISABLE;
 	td->td_sigstk.ss_size = 0;
 	td->td_sigstk.ss_sp = 0;
 	td->td_pflags &= ~TDP_ALTSTACK;
 	/*
 	 * Reset no zombies if child dies flag as Solaris does.
 	 */
 	ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN);
 	if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 		ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL;
 	mtx_unlock(&ps->ps_mtx);
 }
 
 /*
  * kern_sigprocmask()
  *
  *	Manipulate signal mask.
  *
  *	The previous mask of td is stored in *oset when oset is non-NULL
  *	(this happens before `how' is validated).  When set is non-NULL it
  *	is applied according to how (SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK);
  *	any other value yields EINVAL.  Note that *set itself is modified by
  *	SIG_CANTMASK() for SIG_BLOCK and SIG_SETMASK.
  *
  *	The process lock is acquired and released here unless flags contains
  *	SIGPROCMASK_PROC_LOCKED.  With SIGPROCMASK_OLD, SIG_SETMASK replaces
  *	the mask through SIGSETLO() instead of a full assignment.
  */
 int
 kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset,
     int flags)
 {
 	sigset_t new_block, oset1;
 	struct proc *p;
 	int error;
 
 	p = td->td_proc;
 	if (!(flags & SIGPROCMASK_PROC_LOCKED))
 		PROC_LOCK(p);
 	if (oset != NULL)
 		*oset = td->td_sigmask;
 
 	error = 0;
 	if (set != NULL) {
 		switch (how) {
 		case SIG_BLOCK:
 			SIG_CANTMASK(*set);
 			oset1 = td->td_sigmask;
 			SIGSETOR(td->td_sigmask, *set);
 			/* new_block = now blocked and not blocked before. */
 			new_block = td->td_sigmask;
 			SIGSETNAND(new_block, oset1);
 			break;
 		case SIG_UNBLOCK:
 			SIGSETNAND(td->td_sigmask, *set);
 			signotify(td);
 			/* Nothing newly blocked, so no rescheduling needed. */
 			goto out;
 		case SIG_SETMASK:
 			SIG_CANTMASK(*set);
 			oset1 = td->td_sigmask;
 			if (flags & SIGPROCMASK_OLD)
 				SIGSETLO(td->td_sigmask, *set);
 			else
 				td->td_sigmask = *set;
 			new_block = td->td_sigmask;
 			SIGSETNAND(new_block, oset1);
 			signotify(td);
 			break;
 		default:
 			error = EINVAL;
 			goto out;
 		}
 
 		/*
 		 * The new_block set contains signals that were not previously
 		 * blocked, but are blocked now.
 		 *
 		 * In case we block any signal that was not previously blocked
 		 * for td, and process has the signal pending, try to schedule
 		 * signal delivery to some thread that does not block the
 		 * signal, possibly waking it up.
 		 */
 		if (p->p_numthreads != 1)
 			reschedule_signals(p, new_block, flags);
 	}
 
 out:
 	if (!(flags & SIGPROCMASK_PROC_LOCKED))
 		PROC_UNLOCK(p);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigprocmask_args {
 	int	how;
 	const sigset_t *set;
 	sigset_t *oset;
 };
 #endif
 /*
  * sigprocmask system call: copy the new set in from user space (when
  * supplied), apply it through kern_sigprocmask(), and copy the previous
  * mask back out (when requested and the update succeeded).
  */
 int
 sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap)
 {
 	sigset_t set, oset;
 	sigset_t *setp, *osetp;
 	int error;
 
 	/* A NULL user pointer means "do not set" / "do not report". */
 	setp = (uap->set != NULL) ? &set : NULL;
 	osetp = (uap->oset != NULL) ? &oset : NULL;
 	if (setp) {
 		error = copyin(uap->set, setp, sizeof(set));
 		if (error)
 			return (error);
 	}
 	error = kern_sigprocmask(td, uap->how, setp, osetp, 0);
 	if (osetp && !error) {
 		error = copyout(osetp, uap->oset, sizeof(oset));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigprocmask_args {
 	int	how;
 	osigset_t mask;
 };
 #endif
 int
 osigprocmask(td, uap)
 	register struct thread *td;
 	struct osigprocmask_args *uap;
 {
 	sigset_t set, oset;
 	int error;
 
 	OSIG2SIG(uap->mask, set);
 	error = kern_sigprocmask(td, uap->how, &set, &oset, 1);
 	SIG2OSIG(oset, td->td_retval[0]);
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 /*
  * sigwait system call.
  *
  * Unlike most system calls here, failures are reported as the value in
  * td_retval[0] with a return of 0 from this function; only ERESTART is
  * returned directly.  On success the signal number is copied out to
  * uap->sig and td_retval[0] holds the copyout() result.
  */
 int
 sys_sigwait(struct thread *td, struct sigwait_args *uap)
 {
 	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error) {
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error) {
 		/* Processes with p_osrel below P_OSREL_SIGWAIT restart on EINTR. */
 		if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT)
 			error = ERESTART;
 		if (error == ERESTART)
 			return (error);
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
 	td->td_retval[0] = error;
 	return (0);
 }
 
 /*
  * sigtimedwait system call: wait for one of the signals in the user
  * supplied set, optionally bounded by a user supplied timeout.  On success
  * the siginfo is copied out when uap->info is non-NULL and the signal
  * number is returned in td_retval[0].
  */
 int
 sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
 {
 	struct timespec ts;
 	struct timespec *timeout;
 	sigset_t set;
 	ksiginfo_t ksi;
 	int error;
 
 	/* The timeout is optional; NULL is passed on when absent. */
 	timeout = NULL;
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 		timeout = &ts;
 	}
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, timeout);
 	if (error)
 		return (error);
 
 	if (uap->info) {
 		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
 		if (error != 0)
 			return (error);
 	}
 
 	td->td_retval[0] = ksi.ksi_signo;
 	return (0);
 }
 
 /*
  * sigwaitinfo system call: wait without a timeout for one of the signals
  * in the user supplied set.  On success the siginfo is copied out when
  * uap->info is non-NULL and the signal number is returned in
  * td_retval[0].
  */
 int
 sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
 {
 	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error)
 		return (error);
 
 	if (uap->info) {
 		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
 		if (error != 0)
 			return (error);
 	}
 
 	td->td_retval[0] = ksi.ksi_signo;
 	return (0);
 }
 
 /*
  * Wait for one of the signals in waitset to become pending for td and
  * dequeue it into *ksi.
  *
  * timeout, when non-NULL, bounds the wait; a tv_nsec outside
  * [0, 1000000000) yields EINVAL, but only after pending signals have been
  * checked.  Expiry yields EAGAIN.  While waiting, the signals in waitset
  * are removed from td's mask; the original mask is restored before
  * returning.  Returns 0 with *ksi filled in on success.
  */
 int
 kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
 	struct timespec *timeout)
 {
 	struct sigacts *ps;
 	sigset_t saved_mask, new_block;
 	struct proc *p;
 	int error, sig, timo, timevalid = 0;
 	struct timespec rts, ets, ts;
 	struct timeval tv;
 
 	p = td->td_proc;
 	error = 0;
 	ets.tv_sec = 0;
 	ets.tv_nsec = 0;
 
 	if (timeout != NULL) {
 		if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) {
 			timevalid = 1;
 			/* ets = absolute uptime at which the wait expires. */
 			getnanouptime(&rts);
 			ets = rts;
 			timespecadd(&ets, timeout);
 		}
 	}
 	ksiginfo_init(ksi);
 	/* Some signals can not be waited for. */
 	SIG_CANTMASK(waitset);
 	ps = p->p_sigacts;
 	PROC_LOCK(p);
 	/* Temporarily unblock the awaited signals for this thread. */
 	saved_mask = td->td_sigmask;
 	SIGSETNAND(td->td_sigmask, waitset);
 	for (;;) {
 		mtx_lock(&ps->ps_mtx);
 		sig = cursig(td);
 		mtx_unlock(&ps->ps_mtx);
 		if (sig != 0 && SIGISMEMBER(waitset, sig)) {
 			/* Try the thread's queue first, then the process's. */
 			if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 ||
 			    sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) {
 				error = 0;
 				break;
 			}
 		}
 
 		/* An error left by the previous msleep() ends the wait. */
 		if (error != 0)
 			break;
 
 		/*
 		 * POSIX says this must be checked after looking for pending
 		 * signals.
 		 */
 		if (timeout != NULL) {
 			if (!timevalid) {
 				error = EINVAL;
 				break;
 			}
 			getnanouptime(&rts);
 			if (timespeccmp(&rts, &ets, >=)) {
 				error = EAGAIN;
 				break;
 			}
 			/* Convert the remaining time into ticks for msleep(). */
 			ts = ets;
 			timespecsub(&ts, &rts);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
 			timo = tvtohz(&tv);
 		} else {
 			timo = 0;
 		}
 
 		error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo);
 
 		if (timeout != NULL) {
 			if (error == ERESTART) {
 				/* Timeout can not be restarted. */
 				error = EINTR;
 			} else if (error == EAGAIN) {
 				/* We will calculate timeout by ourselves. */
 				error = 0;
 			}
 		}
 	}
 
 	/* new_block = signals blocked again by restoring the saved mask. */
 	new_block = saved_mask;
 	SIGSETNAND(new_block, td->td_sigmask);
 	td->td_sigmask = saved_mask;
 	/*
 	 * Fewer signals can be delivered to us, reschedule signal
 	 * notification.
 	 */
 	if (p->p_numthreads != 1)
 		reschedule_signals(p, new_block, 0);
 
 	if (error == 0) {
 		SDT_PROBE(proc, kernel, , signal_clear, sig, ksi, 0, 0, 0);
 
 		if (ksi->ksi_code == SI_TIMER)
 			itimer_accept(p, ksi->ksi_timerid, ksi);
 
 #ifdef KTRACE
 		if (KTRPOINT(td, KTR_PSIG)) {
 			sig_t action;
 
 			mtx_lock(&ps->ps_mtx);
 			action = ps->ps_sigact[_SIG_IDX(sig)];
 			mtx_unlock(&ps->ps_mtx);
 			ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code);
 		}
 #endif
 		if (sig == SIGKILL)
 			sigexit(td, sig);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigpending_args {
 	sigset_t	*set;
 };
 #endif
 /*
  * sigpending system call: copy out the union of the signals pending on
  * the process queue and on the calling thread's queue.  The snapshot is
  * taken under the process lock; the copyout happens after it is dropped.
  */
 int
 sys_sigpending(struct thread *td, struct sigpending_args *uap)
 {
 	struct proc *p = td->td_proc;
 	sigset_t pending;
 
 	PROC_LOCK(p);
 	pending = p->p_sigqueue.sq_signals;
 	SIGSETOR(pending, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 	return (copyout(&pending, uap->set, sizeof(sigset_t)));
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigpending_args {
 	int	dummy;
 };
 #endif
 /*
  * Old-style sigpending: the union of the process-wide and per-thread
  * pending sets is converted with SIG2OSIG() and returned in td_retval[0]
  * instead of being copied out through a pointer.
  */
 int
 osigpending(struct thread *td, struct osigpending_args *uap)
 {
 	struct proc *p = td->td_proc;
 	sigset_t pending;
 
 	PROC_LOCK(p);
 	pending = p->p_sigqueue.sq_signals;
 	SIGSETOR(pending, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 	SIG2OSIG(pending, td->td_retval[0]);
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 /*
  * Generalized interface signal handler, 4.3-compatible.
  *
  * struct sigvec is converted to and from struct sigaction around a call
  * to kern_sigaction() made with KSA_OSIGSET.  Signal numbers outside
  * 1 .. ONSIG - 1 are rejected with EINVAL.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigvec_args {
 	int	signum;
 	struct	sigvec *nsv;
 	struct	sigvec *osv;
 };
 #endif
 /* ARGSUSED */
 int
 osigvec(struct thread *td, struct osigvec_args *uap)
 {
 	struct sigvec vec;
 	struct sigaction nsa, osa;
 	struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 	nsap = (uap->nsv != NULL) ? &nsa : NULL;
 	osap = (uap->osv != NULL) ? &osa : NULL;
 	if (nsap) {
 		error = copyin(uap->nsv, &vec, sizeof(vec));
 		if (error)
 			return (error);
 		nsap->sa_handler = vec.sv_handler;
 		OSIG2SIG(vec.sv_mask, nsap->sa_mask);
 		nsap->sa_flags = vec.sv_flags;
 		nsap->sa_flags ^= SA_RESTART;	/* opposite of SV_INTERRUPT */
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		vec.sv_handler = osap->sa_handler;
 		SIG2OSIG(osap->sa_mask, vec.sv_mask);
 		vec.sv_flags = osap->sa_flags;
 		/* SA_NOCLDWAIT is not reported through the old interface. */
 		vec.sv_flags &= ~SA_NOCLDWAIT;
 		/* Flip the restart bit back, mirroring the input conversion. */
 		vec.sv_flags ^= SA_RESTART;
 		error = copyout(&vec, uap->osv, sizeof(vec));
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigblock_args {
 	int	mask;
 };
 #endif
 /*
  * Old-style sigblock: add the signals in uap->mask to the calling
  * thread's mask (SIG_BLOCK) and return the previous mask, converted with
  * SIG2OSIG(), in td_retval[0].  Always returns 0.
  */
 int
 osigblock(struct thread *td, struct osigblock_args *uap)
 {
 	sigset_t set, oset;
 
 	OSIG2SIG(uap->mask, set);
 	kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0);
 	SIG2OSIG(oset, td->td_retval[0]);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigsetmask_args {
 	int	mask;
 };
 #endif
 /*
  * Old-style sigsetmask: replace the calling thread's mask with uap->mask
  * (SIG_SETMASK) and return the previous mask, converted with SIG2OSIG(),
  * in td_retval[0].  Always returns 0.
  */
 int
 osigsetmask(struct thread *td, struct osigsetmask_args *uap)
 {
 	sigset_t set, oset;
 
 	OSIG2SIG(uap->mask, set);
 	kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0);
 	SIG2OSIG(oset, td->td_retval[0]);
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Suspend calling thread until signal, providing mask to be set in the
  * meantime.
  *
  * The mask is copied in from user space and the work is done by
  * kern_sigsuspend(); a copyin failure is returned directly.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sigsuspend_args {
 	const sigset_t *sigmask;
 };
 #endif
 /* ARGSUSED */
 int
 sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap)
 {
 	sigset_t mask;
 	int error;
 
 	error = copyin(uap->sigmask, &mask, sizeof(mask));
 	if (error)
 		return (error);
 	return (kern_sigsuspend(td, mask));
 }
 
 /*
  * Install mask as td's signal mask, remembering the previous mask in
  * td_oldsigmask (flagged by TDP_OLDMASK), and sleep until postsig()
  * reports that a signal has been delivered.  Returns EJUSTRETURN with
  * td_errno set to EINTR.
  */
 int
 kern_sigsuspend(struct thread *td, sigset_t mask)
 {
 	struct proc *p = td->td_proc;
 	int has_sig, sig;
 
 	/*
 	 * When returning from sigsuspend, we want
 	 * the old mask to be restored after the
 	 * signal handler has finished.  Thus, we
 	 * save it here and mark the sigacts structure
 	 * to indicate this.
 	 */
 	PROC_LOCK(p);
 	kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask,
 	    SIGPROCMASK_PROC_LOCKED);
 	td->td_pflags |= TDP_OLDMASK;
 
 	/*
 	 * Process signals now. Otherwise, we can get spurious wakeup
 	 * due to signal entered process queue, but delivered to other
 	 * thread. But sigsuspend should return only on signal
 	 * delivery.
 	 */
 	(p->p_sysent->sv_set_syscall_retval)(td, EINTR);
 	for (has_sig = 0; !has_sig;) {
 		/* Keep sleeping for as long as msleep() returns 0. */
 		while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause",
 			0) == 0)
 			/* void */;
 		thread_suspend_check(0);
 		mtx_lock(&p->p_sigacts->ps_mtx);
 		/* Handle every deliverable signal; postsig() results are summed. */
 		while ((sig = cursig(td)) != 0)
 			has_sig += postsig(sig);
 		mtx_unlock(&p->p_sigacts->ps_mtx);
 	}
 	PROC_UNLOCK(p);
 	td->td_errno = EINTR;
 	td->td_pflags |= TDP_NERRNO;
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 /*
  * Compatibility sigsuspend call for old binaries.  Note nonstandard calling
  * convention: libc stub passes mask, not pointer, to save a copyin.
  *
  * The old-format mask is widened with OSIG2SIG() and handed to
  * kern_sigsuspend().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigsuspend_args {
 	osigset_t mask;
 };
 #endif
 /* ARGSUSED */
 int
 osigsuspend(struct thread *td, struct osigsuspend_args *uap)
 {
 	sigset_t mask;
 
 	OSIG2SIG(uap->mask, mask);
 	return (kern_sigsuspend(td, mask));
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct osigstack_args {
 	struct	sigstack *nss;
 	struct	sigstack *oss;
 };
 #endif
 /*
  * Old-style sigstack: optionally install a new alternate signal stack
  * described by struct sigstack and optionally report the previous one.
  * A stack installed through this interface is recorded with ss_size 0.
  */
 /* ARGSUSED */
 int
 osigstack(struct thread *td, struct osigstack_args *uap)
 {
 	struct sigstack nss, oss;
 	int error = 0;
 
 	if (uap->nss != NULL) {
 		error = copyin(uap->nss, &nss, sizeof(nss));
 		if (error)
 			return (error);
 	}
 	/* Capture the old state before any new stack is installed. */
 	oss.ss_sp = td->td_sigstk.ss_sp;
 	oss.ss_onstack = sigonstack(cpu_getstack(td));
 	if (uap->nss != NULL) {
 		td->td_sigstk.ss_sp = nss.ss_sp;
 		td->td_sigstk.ss_size = 0;
 		td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK;
 		td->td_pflags |= TDP_ALTSTACK;
 	}
 	if (uap->oss != NULL)
 		error = copyout(&oss, uap->oss, sizeof(oss));
 
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaltstack_args {
 	stack_t	*ss;
 	stack_t	*oss;
 };
 #endif
 /*
  * sigaltstack system call: copy the new stack description in from user
  * space (when supplied), apply it through kern_sigaltstack(), and copy the
  * previous description back out (when requested).
  */
 /* ARGSUSED */
 int
 sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap)
 {
 	stack_t ss, oss;
 	int error;
 
 	if (uap->ss != NULL) {
 		error = copyin(uap->ss, &ss, sizeof(ss));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL,
 	    (uap->oss != NULL) ? &oss : NULL);
 	if (error)
 		return (error);
 	if (uap->oss != NULL)
 		error = copyout(&oss, uap->oss, sizeof(stack_t));
 	return (error);
 }
 
 /*
  * Query and/or change td's alternate signal stack.
  *
  * When oss is non-NULL it receives the current stack description, with
  * ss_flags reporting SS_DISABLE, SS_ONSTACK or 0.  When ss is non-NULL
  * the new setting is applied: EPERM while running on the alternate stack,
  * EINVAL for flags other than SS_DISABLE, ENOMEM for a stack smaller than
  * the ABI's sv_minsigstksz.  Returns 0 otherwise.
  */
 int
 kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss)
 {
 	struct proc *p = td->td_proc;
 	int onalt;
 
 	onalt = sigonstack(cpu_getstack(td));
 
 	if (oss != NULL) {
 		*oss = td->td_sigstk;
 		if ((td->td_pflags & TDP_ALTSTACK) == 0)
 			oss->ss_flags = SS_DISABLE;
 		else if (onalt)
 			oss->ss_flags = SS_ONSTACK;
 		else
 			oss->ss_flags = 0;
 	}
 
 	if (ss == NULL)
 		return (0);
 
 	if (onalt)
 		return (EPERM);
 	if ((ss->ss_flags & ~SS_DISABLE) != 0)
 		return (EINVAL);
 
 	if (ss->ss_flags & SS_DISABLE) {
 		td->td_pflags &= ~TDP_ALTSTACK;
 		return (0);
 	}
 
 	if (ss->ss_size < p->p_sysent->sv_minsigstksz)
 		return (ENOMEM);
 
 	td->td_sigstk = *ss;
 	td->td_pflags |= TDP_ALTSTACK;
 	return (0);
 }
 
 /*
  * Helper for killpg1(): attempt to signal the locked process p on behalf
  * of td and fold the outcome into *ret.  A permitted target sets *ret to
  * 0 (and receives the signal when sig is nonzero); a refused target only
  * replaces *ret while it still holds the initial ESRCH.
  */
 static void
 killpg1_sendsig(struct thread *td, struct proc *p, int sig, ksiginfo_t *ksi,
     int *ret)
 {
 	int err;
 
 	err = p_cansignal(td, p, sig);
 	if (err == 0) {
 		if (sig)
 			pksignal(p, sig, ksi);
 		*ret = err;
 	} else if (*ret == ESRCH)
 		*ret = err;
 }
 
 /*
  * Common code for kill process group/broadcast kill.
  * td is the calling thread.
  *
  * With all set, every process in the system is considered except those
  * with pid <= 1, system processes, the caller's own process and processes
  * still in state PRS_NEW.  Otherwise the members of process group pgid are
  * considered (pgid 0 selects the caller's own group), with the same
  * exclusions except that the caller itself is not skipped.
  *
  * Returns 0 when at least one process could be signalled, the
  * p_cansignal() error of the first refused process when none could, or
  * ESRCH when no candidate (or no such process group) was found.
  */
 static int
 killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi)
 {
 	struct proc *p;
 	struct pgrp *pgrp;
 	int ret;
 
 	ret = ESRCH;
 	if (all) {
 		/*
 		 * broadcast
 		 */
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 			    p == td->td_proc || p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			killpg1_sendsig(td, p, sig, ksi, &ret);
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 	} else {
 		sx_slock(&proctree_lock);
 		if (pgid == 0) {
 			/*
 			 * zero pgid means send to my process group.
 			 */
 			pgrp = td->td_proc->p_pgrp;
 			PGRP_LOCK(pgrp);
 		} else {
 			pgrp = pgfind(pgid);
 			if (pgrp == NULL) {
 				sx_sunlock(&proctree_lock);
 				return (ESRCH);
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 			    p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			killpg1_sendsig(td, p, sig, ksi, &ret);
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pgrp);
 	}
 	return (ret);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct kill_args {
 	int	pid;
 	int	signum;
 };
 #endif
 /*
  * kill system call.
  *
  * pid > 0 targets a single process (zombies included, via zpfind()),
  * pid == -1 broadcasts, pid == 0 targets the caller's process group and
  * any other negative pid targets process group -pid.  A signum of 0 only
  * performs the permission check.  Returns EINVAL for a signal number above
  * _SIG_MAXSIG, ECAPMODE for a capability-mode caller targeting any pid
  * other than its own, and ESRCH when the single target does not exist.
  */
 /* ARGSUSED */
 int
 sys_kill(struct thread *td, struct kill_args *uap)
 {
 	ksiginfo_t ksi;
 	struct proc *p;
 	int error;
 
 	/*
 	 * A process in capability mode can send signals only to itself.
 	 * The main rationale behind this is that abort(3) is implemented as
 	 * kill(getpid(), SIGABRT).
 	 */
 	if (IN_CAPABILITY_MODE(td) && uap->pid != td->td_proc->p_pid)
 		return (ECAPMODE);
 
 	AUDIT_ARG_SIGNUM(uap->signum);
 	AUDIT_ARG_PID(uap->pid);
 	/* The unsigned compare also rejects negative signal numbers. */
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	ksiginfo_init(&ksi);
 	ksi.ksi_signo = uap->signum;
 	ksi.ksi_code = SI_USER;
 	ksi.ksi_pid = td->td_proc->p_pid;
 	ksi.ksi_uid = td->td_ucred->cr_ruid;
 
 	if (uap->pid > 0) {
 		/* kill single process */
 		if ((p = pfind(uap->pid)) == NULL) {
 			if ((p = zpfind(uap->pid)) == NULL)
 				return (ESRCH);
 		}
 		AUDIT_ARG_PROCESS(p);
 		error = p_cansignal(td, p, uap->signum);
 		if (error == 0 && uap->signum)
 			pksignal(p, uap->signum, &ksi);
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	switch (uap->pid) {
 	case -1:		/* broadcast signal */
 		return (killpg1(td, uap->signum, 0, 1, &ksi));
 	case 0:			/* signal own process group */
 		return (killpg1(td, uap->signum, 0, 0, &ksi));
 	default:		/* negative explicit process group */
 		/*
 		 * NOTE(review): -uap->pid overflows when pid is INT_MIN;
 		 * confirm whether that value needs an explicit check.
 		 */
 		return (killpg1(td, uap->signum, -uap->pid, 0, &ksi));
 	}
 	/* NOTREACHED */
 }
 
 /*
  * pdkill system call: send signal uap->signum to the process referred to
  * by process descriptor uap->fd, which must carry the CAP_PDKILL right.
  * A signum of 0 only performs the permission check.  Returns ENOSYS when
  * the kernel is built without PROCDESC.
  */
 int
 sys_pdkill(struct thread *td, struct pdkill_args *uap)
 {
 #ifdef PROCDESC
 	struct proc *p;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_SIGNUM(uap->signum);
 	AUDIT_ARG_FD(uap->fd);
 	/* The unsigned compare also rejects negative signal numbers. */
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	error = procdesc_find(td, uap->fd,
 	    cap_rights_init(&rights, CAP_PDKILL), &p);
 	if (error)
 		return (error);
 	AUDIT_ARG_PROCESS(p);
 	error = p_cansignal(td, p, uap->signum);
 	if (error == 0 && uap->signum)
 		kern_psignal(p, uap->signum);
 	PROC_UNLOCK(p);
 	return (error);
 #else
 	return (ENOSYS);
 #endif
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct okillpg_args {
 	int	pgid;
 	int	signum;
 };
 #endif
 /*
  * Old-style killpg: send uap->signum to process group uap->pgid through
  * killpg1(), tagging the siginfo as SI_USER from the calling process.
  * Signal numbers above _SIG_MAXSIG yield EINVAL.
  */
 /* ARGSUSED */
 int
 okillpg(struct thread *td, struct okillpg_args *uap)
 {
 	ksiginfo_t ksi;
 
 	AUDIT_ARG_SIGNUM(uap->signum);
 	AUDIT_ARG_PID(uap->pgid);
 
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	/* Describe the sender, then the signal itself. */
 	ksiginfo_init(&ksi);
 	ksi.ksi_pid = td->td_proc->p_pid;
 	ksi.ksi_uid = td->td_ucred->cr_ruid;
 	ksi.ksi_code = SI_USER;
 	ksi.ksi_signo = uap->signum;
 
 	return (killpg1(td, uap->signum, uap->pgid, 0, &ksi));
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigqueue_args {
 	pid_t pid;
 	int signum;
 	/* union sigval */ void *value;
 };
 #endif
 int
 sys_sigqueue(struct thread *td, struct sigqueue_args *uap)
 {
 	ksiginfo_t ksi;
 	struct proc *p;
 	int error;
 
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	/*
 	 * Specification says sigqueue can only send signal to
 	 * single process.
 	 */
 	if (uap->pid <= 0)
 		return (EINVAL);
 
 	if ((p = pfind(uap->pid)) == NULL) {
 		if ((p = zpfind(uap->pid)) == NULL)
 			return (ESRCH);
 	}
 	error = p_cansignal(td, p, uap->signum);
 	if (error == 0 && uap->signum != 0) {
 		ksiginfo_init(&ksi);
 		ksi.ksi_flags = KSI_SIGQ;
 		ksi.ksi_signo = uap->signum;
 		ksi.ksi_code = SI_QUEUE;
 		ksi.ksi_pid = td->td_proc->p_pid;
 		ksi.ksi_uid = td->td_ucred->cr_ruid;
 		ksi.ksi_value.sival_ptr = uap->value;
 		error = pksignal(p, ksi.ksi_signo, &ksi);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Send a signal to the process group identified by pgid.  A pgid of 0 or
  * a pgid that pgfind() cannot resolve is silently ignored.
  */
 void
 gsignal(int pgid, int sig, ksiginfo_t *ksi)
 {
 	struct pgrp *group;
 
 	if (pgid == 0)
 		return;
 
 	/* The lookup is done under a shared hold of proctree_lock. */
 	sx_slock(&proctree_lock);
 	group = pgfind(pgid);
 	sx_sunlock(&proctree_lock);
 	if (group == NULL)
 		return;
 
 	pgsignal(group, sig, 0, ksi);
 	PGRP_UNLOCK(group);
 }
 
 /*
  * Send a signal to every member of a process group.  If checkctty is
  * nonzero, limit delivery to members which have a controlling terminal.
  * A NULL pgrp does nothing; otherwise the process group lock must be held
  * by the caller.
  */
 void
 pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi)
 {
 	struct proc *member;
 
 	if (pgrp == NULL)
 		return;
 
 	PGRP_LOCK_ASSERT(pgrp, MA_OWNED);
 	LIST_FOREACH(member, &pgrp->pg_members, p_pglist) {
 		PROC_LOCK(member);
 		/* Only members in PRS_NORMAL state are signalled. */
 		if (member->p_state == PRS_NORMAL &&
 		    (checkctty == 0 || (member->p_flag & P_CONTROLT) != 0))
 			pksignal(member, sig, ksi);
 		PROC_UNLOCK(member);
 	}
 }
 
 /*
  * Send a signal caused by a trap to the current thread.  If it will be
  * caught immediately, deliver it with correct code.  Otherwise, post it
  * normally.
  *
  * td is the thread the signal is for and ksi describes it: ksi_signo must
  * be a valid signal number (asserted) and ksi_code is passed to ktrace and
  * stored in p_code on the slow path.  The process lock is acquired and
  * released inside this function.
  */
 void
 trapsignal(struct thread *td, ksiginfo_t *ksi)
 {
 	struct sigacts *ps;
 	sigset_t mask;
 	struct proc *p;
 	int sig;
 	int code;
 
 	p = td->td_proc;
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	KASSERT(_SIG_VALID(sig), ("invalid signal"));
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	/*
 	 * Fast path: the process is not traced, the signal is caught and
 	 * the thread does not mask it, so the handler is set up directly
 	 * through the ABI's sv_sendsig hook without queueing.
 	 */
 	if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
 	    !SIGISMEMBER(td->td_sigmask, sig)) {
 		td->td_ru.ru_nsignals++;
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_PSIG))
 			ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
 			    &td->td_sigmask, code);
 #endif
 		(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
 				ksi, &td->td_sigmask);
 		/*
 		 * Block the handler's catch mask, plus the signal itself
 		 * unless it is listed in ps_signodefer.
 		 */
 		mask = ps->ps_catchmask[_SIG_IDX(sig)];
 		if (!SIGISMEMBER(ps->ps_signodefer, sig))
 			SIGADDSET(mask, sig);
 		kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
 		    SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
 		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
 			/*
 			 * See kern_sigaction() for origin of this code.
 			 */
 			SIGDELSET(ps->ps_sigcatch, sig);
 			if (sig != SIGCONT &&
 			    sigprop(sig) & SA_IGNORE)
 				SIGADDSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		mtx_unlock(&ps->ps_mtx);
 	} else {
 		/*
 		 * Avoid a possible infinite loop if the thread
 		 * masking the signal or process is ignoring the
 		 * signal.
 		 */
 		if (kern_forcesigexit &&
 		    (SIGISMEMBER(td->td_sigmask, sig) ||
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
 			/* Force the default action by undoing mask/ignore. */
 			SIGDELSET(td->td_sigmask, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 			SIGDELSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		mtx_unlock(&ps->ps_mtx);
 		p->p_code = code;	/* XXX for core dump/debugger */
 		p->p_sig = sig;		/* XXX to verify code */
 		/* Slow path: queue the signal on this specific thread. */
 		tdsendsignal(p, td, sig, ksi);
 	}
 	PROC_UNLOCK(p);
 }
 
 /*
  * Pick the thread of process p that should be notified about sig.
  * Preference order: the current thread when it belongs to p and does not
  * mask sig, then the first thread in p that does not mask sig, then the
  * first thread of p.  The prop argument is not used.  The process lock
  * must be held.
  */
 static struct thread *
 sigtd(struct proc *p, int sig, int prop)
 {
 	struct thread *cand;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * Using the current thread avoids having to involve another
 	 * thread at all.
 	 */
 	if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
 		return (curthread);
 
 	FOREACH_THREAD_IN_PROC(p, cand) {
 		if (!SIGISMEMBER(cand->td_sigmask, sig))
 			return (cand);
 	}
 
 	/* Every thread masks the signal; fall back to the first one. */
 	return (FIRST_THREAD_IN_PROC(p));
 }
 
 /*
  * Send the signal to the process.  If the signal has an action, the action
  * is usually performed by the target process rather than the caller; we add
  * the signal to the set of pending signals for the process.
  *
  * Exceptions:
  *   o When a stop signal is sent to a sleeping process that takes the
  *     default action, the process is stopped without awakening it.
  *   o SIGCONT restarts stopped processes (or puts them back to sleep)
  *     regardless of the signal action (eg, blocked or ignored).
  *
  * Other ignored signals are discarded immediately.
  *
  * NB: This function may be entered from the debugger via the "kill" DDB
  * command.  There is little that can be done to mitigate the possibly messy
  * side effects of this unwise possibility.
  */
 void
 kern_psignal(struct proc *p, int sig)
 {
 	ksiginfo_t info;
 
 	/* Kernel-originated signal with no particular target thread. */
 	ksiginfo_init(&info);
 	info.ksi_code = SI_KERNEL;
 	info.ksi_signo = sig;
 
 	/* The result of the send is deliberately discarded. */
 	(void) tdsendsignal(p, NULL, sig, &info);
 }
 
 /*
  * Post sig, described by ksi, to process p without naming a target
  * thread.  Returns the result of tdsendsignal().
  */
 int
 pksignal(struct proc *p, int sig, ksiginfo_t *ksi)
 {
 	int error;
 
 	error = tdsendsignal(p, NULL, sig, ksi);
 	return (error);
 }
 
 /*
  * Utility function for finding a thread to send signal event to.
  *
  * For SIGEV_THREAD_ID notifications the thread is looked up with tdfind()
  * and stored in *ttd; ESRCH is returned if it cannot be found.  For any
  * other notification type *ttd is set to NULL and the process lock of p is
  * taken here.  Returns 0 on success.
  */
 int
 sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **ttd)
 {
 	struct thread *target;
 
 	if (sigev->sigev_notify != SIGEV_THREAD_ID) {
 		/* Process-directed: no specific thread. */
 		*ttd = NULL;
 		PROC_LOCK(p);
 		return (0);
 	}
 
 	target = tdfind(sigev->sigev_notify_thread_id, p->p_pid);
 	if (target == NULL)
 		return (ESRCH);
 	*ttd = target;
 	return (0);
 }
 
 /*
  * Post kernel-originated signal sig to the specific thread td.
  */
 void
 tdsignal(struct thread *td, int sig)
 {
 	ksiginfo_t info;
 	struct proc *p;
 
 	p = td->td_proc;
 	ksiginfo_init(&info);
 	info.ksi_code = SI_KERNEL;
 	info.ksi_signo = sig;
 
 	/* The result of the send is deliberately discarded. */
 	(void) tdsendsignal(p, td, sig, &info);
 }
 
 /*
  * Post sig, described by the caller-supplied ksi, to the specific
  * thread td.  The result of the send is discarded.
  */
 void
 tdksignal(struct thread *td, int sig, ksiginfo_t *ksi)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	(void) tdsendsignal(p, td, sig, ksi);
 }
 
 /*
  * Post signal sig to process p, or to the specific thread td when td is
  * not NULL, and carry out the immediate wakeup, stop or continue handling
  * the signal calls for.
  *
  * The caller must hold the process lock, and td (if given) must belong to
  * p.  ksi, if not NULL, supplies the siginfo to queue and must not already
  * be on a queue.  Panics on an invalid signal number.
  *
  * Returns 0 when the signal was queued or deliberately discarded (zombie
  * target, ignored signal, tty stop with default action sent to a process
  * in an orphaned process group), otherwise the error from sigqueue_add().
  */
 int
 tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 {
 	sig_t action;
 	sigqueue_t *sigqueue;
 	int prop;
 	struct sigacts *ps;
 	int intrval;
 	int ret = 0;
 	int wakeup_swapper;
 
 	MPASS(td == NULL || p == td->td_proc);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	if (!_SIG_VALID(sig))
 		panic("%s(): invalid signal %d", __func__, sig);
 
 	KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__));
 
 	/*
 	 * IEEE Std 1003.1-2001: return success when killing a zombie.
 	 */
 	if (p->p_state == PRS_ZOMBIE) {
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 
 	ps = p->p_sigacts;
 	KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
 	prop = sigprop(sig);
 
 	/*
 	 * Process-directed signals go on the process queue and a thread is
 	 * chosen with sigtd(); thread-directed signals go on that thread's
 	 * own queue.
 	 */
 	if (td == NULL) {
 		td = sigtd(p, sig, prop);
 		sigqueue = &p->p_sigqueue;
 	} else
 		sigqueue = &td->td_sigqueue;
 
 	SDT_PROBE(proc, kernel, , signal_send, td, p, sig, 0, 0 );
 
 	/*
 	 * If the signal is being ignored,
 	 * then we forget about it immediately.
 	 * (Note: we don't set SIGCONT in ps_sigignore,
 	 * and if it is set to SIG_IGN,
 	 * action will be SIG_DFL here.)
 	 */
 	mtx_lock(&ps->ps_mtx);
 	if (SIGISMEMBER(ps->ps_sigignore, sig)) {
 		SDT_PROBE(proc, kernel, , signal_discard, td, p, sig, 0, 0 );
 
 		mtx_unlock(&ps->ps_mtx);
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 	/* Classify the disposition: held (masked), caught, or default. */
 	if (SIGISMEMBER(td->td_sigmask, sig))
 		action = SIG_HOLD;
 	else if (SIGISMEMBER(ps->ps_sigcatch, sig))
 		action = SIG_CATCH;
 	else
 		action = SIG_DFL;
 	/* Value an interrupted sleep is aborted with further below. */
 	if (SIGISMEMBER(ps->ps_sigintr, sig))
 		intrval = EINTR;
 	else
 		intrval = ERESTART;
 	mtx_unlock(&ps->ps_mtx);
 
 	/*
 	 * A continue signal cancels pending stop signals and a stop signal
 	 * cancels a pending SIGCONT.
 	 */
 	if (prop & SA_CONT)
 		sigqueue_delete_stopmask_proc(p);
 	else if (prop & SA_STOP) {
 		/*
 		 * If sending a tty stop signal to a member of an orphaned
 		 * process group, discard the signal here if the action
 		 * is default; don't stop the process below if sleeping,
 		 * and don't clear any pending SIGCONT.
 		 */
 		if ((prop & SA_TTYSTOP) &&
 		    (p->p_pgrp->pg_jobc == 0) &&
 		    (action == SIG_DFL)) {
 			if (ksi && (ksi->ksi_flags & KSI_INS))
 				ksiginfo_tryfree(ksi);
 			return (ret);
 		}
 		sigqueue_delete_proc(p, SIGCONT);
 		if (p->p_flag & P_CONTINUED) {
 			/*
 			 * Clear the "continued" state and pull p_ksi off
 			 * whatever queue it is on, under the parent's lock.
 			 */
 			p->p_flag &= ~P_CONTINUED;
 			PROC_LOCK(p->p_pptr);
 			sigqueue_take(p->p_ksi);
 			PROC_UNLOCK(p->p_pptr);
 		}
 	}
 
 	ret = sigqueue_add(sigqueue, sig, ksi);
 	if (ret != 0)
 		return (ret);
 	signotify(td);
 	/*
 	 * Defer further processing for signals which are held,
 	 * except that stopped processes must be continued by SIGCONT.
 	 */
 	if (action == SIG_HOLD &&
 	    !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG)))
 		return (ret);
 	/*
 	 * SIGKILL: Remove procfs STOPEVENTs.
 	 */
 	if (sig == SIGKILL) {
 		/* from procfs_ioctl.c: PIOCBIC */
 		p->p_stops = 0;
 		/* from procfs_ioctl.c: PIOCCONT */
 		p->p_step = 0;
 		wakeup(&p->p_step);
 	}
 	/*
 	 * Some signals have a process-wide effect and a per-thread
 	 * component.  Most processing occurs when the process next
 	 * tries to cross the user boundary, however there are some
 	 * times when processing needs to be done immediately, such as
 	 * waking up threads so that they can cross the user boundary.
 	 * We try to do the per-process part here.
 	 */
 	if (P_SHOULDSTOP(p)) {
 		KASSERT(!(p->p_flag & P_WEXIT),
 		    ("signal to stopped but exiting process"));
 		if (sig == SIGKILL) {
 			/*
 			 * If traced process is already stopped,
 			 * then no further action is necessary.
 			 */
 			if (p->p_flag & P_TRACED)
 				goto out;
 			/*
 			 * SIGKILL sets process running.
 			 * It will die elsewhere.
 			 * All threads must be restarted.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			goto runfast;
 		}
 
 		if (prop & SA_CONT) {
 			/*
 			 * If traced process is already stopped,
 			 * then no further action is necessary.
 			 */
 			if (p->p_flag & P_TRACED)
 				goto out;
 			/*
 			 * If SIGCONT is default (or ignored), we continue the
 			 * process but don't leave the signal in sigqueue as
 			 * it has no further action.  If SIGCONT is held, we
 			 * continue the process and leave the signal in
 			 * sigqueue.  If the process catches SIGCONT, let it
 			 * handle the signal itself.  If it isn't waiting on
 			 * an event, it goes back to run state.
 			 * Otherwise, process goes back to sleep state.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			PROC_SLOCK(p);
 			if (p->p_numthreads == p->p_suspcount) {
 				/*
 				 * Every thread is suspended: report the
 				 * continue to the parent.  The spin lock is
 				 * dropped around the parent notification
 				 * and re-taken afterwards.
 				 */
 				PROC_SUNLOCK(p);
 				p->p_flag |= P_CONTINUED;
 				p->p_xstat = SIGCONT;
 				PROC_LOCK(p->p_pptr);
 				childproc_continued(p);
 				PROC_UNLOCK(p->p_pptr);
 				PROC_SLOCK(p);
 			}
 			if (action == SIG_DFL) {
 				thread_unsuspend(p);
 				PROC_SUNLOCK(p);
 				sigqueue_delete(sigqueue, sig);
 				goto out;
 			}
 			if (action == SIG_CATCH) {
 				/*
 				 * The process wants to catch it so it needs
 				 * to run at least one thread, but which one?
 				 */
 				PROC_SUNLOCK(p);
 				goto runfast;
 			}
 			/*
 			 * The signal is not ignored or caught.
 			 */
 			thread_unsuspend(p);
 			PROC_SUNLOCK(p);
 			goto out;
 		}
 
 		if (prop & SA_STOP) {
 			/*
 			 * If traced process is already stopped,
 			 * then no further action is necessary.
 			 */
 			if (p->p_flag & P_TRACED)
 				goto out;
 			/*
 			 * Already stopped, don't need to stop again
 			 * (If we did the shell could get confused).
 			 * Just make sure the signal STOP bit set.
 			 */
 			p->p_flag |= P_STOPPED_SIG;
 			sigqueue_delete(sigqueue, sig);
 			goto out;
 		}
 
 		/*
 		 * All other kinds of signals:
 		 * If a thread is sleeping interruptibly, simulate a
 		 * wakeup so that when it is continued it will be made
 		 * runnable and can look at the signal.  However, don't make
 		 * the PROCESS runnable, leave it stopped.
 		 * It may run a bit until it hits a thread_suspend_check().
 		 */
 		wakeup_swapper = 0;
 		PROC_SLOCK(p);
 		thread_lock(td);
 		if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
 			wakeup_swapper = sleepq_abort(td, intrval);
 		thread_unlock(td);
 		PROC_SUNLOCK(p);
 		/* kick_proc0() is called only after both locks are dropped. */
 		if (wakeup_swapper)
 			kick_proc0();
 		goto out;
 		/*
 		 * Mutexes are short lived. Threads waiting on them will
 		 * hit thread_suspend_check() soon.
 		 */
 	} else if (p->p_state == PRS_NORMAL) {
 		if (p->p_flag & P_TRACED || action == SIG_CATCH) {
 			tdsigwakeup(td, sig, action, intrval);
 			goto out;
 		}
 
 		/* SIG_HOLD returned earlier and SIG_CATCH was handled above. */
 		MPASS(action == SIG_DFL);
 
 		if (prop & SA_STOP) {
 			if (p->p_flag & (P_PPWAIT|P_WEXIT))
 				goto out;
 			p->p_flag |= P_STOPPED_SIG;
 			p->p_xstat = sig;
 			PROC_SLOCK(p);
 			sig_suspend_threads(td, p, 1);
 			if (p->p_numthreads == p->p_suspcount) {
 				/*
 				 * only thread sending signal to another
 				 * process can reach here, if thread is sending
 				 * signal to its process, because thread does
 				 * not suspend itself here, p_numthreads
 				 * should never be equal to p_suspcount.
 				 */
 				thread_stopped(p);
 				PROC_SUNLOCK(p);
 				sigqueue_delete_proc(p, p->p_xstat);
 			} else
 				PROC_SUNLOCK(p);
 			goto out;
 		}
 		/* Non-stop default-action signals fall through to runfast. */
 	} else {
 		/* Not in "NORMAL" state. discard the signal. */
 		sigqueue_delete(sigqueue, sig);
 		goto out;
 	}
 
 	/*
 	 * The process is not stopped so we need to apply the signal to all the
 	 * running threads.
 	 */
 runfast:
 	tdsigwakeup(td, sig, action, intrval);
 	PROC_SLOCK(p);
 	thread_unsuspend(p);
 	PROC_SUNLOCK(p);
 out:
 	/* If we jump here, proc slock should not be owned. */
 	PROC_SLOCK_ASSERT(p, MA_NOTOWNED);
 	return (ret);
 }
 
 /*
  * The force of a signal has been directed against a single
  * thread.  We need to see what we can do about knocking it
  * out of any sleep it may be in etc.
  *
  * td is the chosen thread, sig the signal, action its disposition as
  * classified by the caller (SIG_DFL, SIG_CATCH, ...) and intrval the value
  * an interrupted sleep is aborted with.  The process lock must be held;
  * the process spin lock and the thread lock are taken and released here.
  */
 static void
 tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval)
 {
 	struct proc *p = td->td_proc;
 	register int prop;
 	int wakeup_swapper;
 
 	wakeup_swapper = 0;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	prop = sigprop(sig);
 
 	PROC_SLOCK(p);
 	thread_lock(td);
 	/*
 	 * Bring the priority of a thread up if we want it to get
 	 * killed in this lifetime.
 	 */
 	if (action == SIG_DFL && (prop & SA_KILL) && td->td_priority > PUSER)
 		sched_prio(td, PUSER);
 	if (TD_ON_SLEEPQ(td)) {
 		/*
 		 * If thread is sleeping uninterruptibly
 		 * we can't interrupt the sleep... the signal will
 		 * be noticed when the process returns through
 		 * trap() or syscall().
 		 */
 		if ((td->td_flags & TDF_SINTR) == 0)
 			goto out;
 		/*
 		 * If SIGCONT is default (or ignored) and process is
 		 * asleep, we are finished; the process should not
 		 * be awakened.
 		 */
 		if ((prop & SA_CONT) && action == SIG_DFL) {
 			/* Both locks are dropped before the queues are edited. */
 			thread_unlock(td);
 			PROC_SUNLOCK(p);
 			sigqueue_delete(&p->p_sigqueue, sig);
 			/*
 			 * It may be on either list in this state.
 			 * Remove from both for now.
 			 */
 			sigqueue_delete(&td->td_sigqueue, sig);
 			return;
 		}
 
 		/*
 		 * Don't awaken a sleeping thread for SIGSTOP if the
 		 * STOP signal is deferred.
 		 */
 		if ((prop & SA_STOP) && (td->td_flags & TDF_SBDRY))
 			goto out;
 
 		/*
 		 * Give low priority threads a better chance to run.
 		 */
 		if (td->td_priority > PUSER)
 			sched_prio(td, PUSER);
 
 		wakeup_swapper = sleepq_abort(td, intrval);
 	} else {
 		/*
 		 * Other states do nothing with the signal immediately,
 		 * other than kicking ourselves if we are running.
 		 * It will either never be noticed, or noticed very soon.
 		 */
 #ifdef SMP
 		if (TD_IS_RUNNING(td) && td != curthread)
 			forward_signal(td);
 #endif
 	}
 out:
 	/* Common exit: release both locks, then kick proc0 if requested. */
 	PROC_SUNLOCK(p);
 	thread_unlock(td);
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Ask every thread of process p to suspend.
  *
  * Each thread gets TDF_ASTPENDING and TDF_NEEDSUSPCHK set.  Threads that
  * are in an interruptible sleep (or swapped) are suspended on the spot
  * unless they have deferred stops (TDF_SBDRY); other threads that are
  * running elsewhere are poked with forward_signal() on SMP kernels.
  *
  * td is the thread on whose behalf this is done and is never forwarded to;
  * sending is nonzero when called from the signal-sending path.  Both the
  * process lock and the process spin lock must be held.
  */
 static void
 sig_suspend_threads(struct thread *td, struct proc *p, int sending)
 {
 	struct thread *td2;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	FOREACH_THREAD_IN_PROC(p, td2) {
 		thread_lock(td2);
 		td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
 		if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) &&
 		    (td2->td_flags & TDF_SINTR)) {
 			if (td2->td_flags & TDF_SBDRY) {
 				/*
 				 * Once a thread is asleep with
 				 * TDF_SBDRY set, it should never
 				 * become suspended due to this check.
 				 */
 				KASSERT(!TD_IS_SUSPENDED(td2),
 				    ("thread with deferred stops suspended"));
 			} else if (!TD_IS_SUSPENDED(td2)) {
 				thread_suspend_one(td2);
 			}
 		} else if (!TD_IS_SUSPENDED(td2)) {
 			/*
 			 * NOTE(review): TDF_ASTPENDING was already set
 			 * unconditionally above, so this looks redundant;
 			 * confirm before removing.
 			 */
 			if (sending || td != td2)
 				td2->td_flags |= TDF_ASTPENDING;
 #ifdef SMP
 			if (TD_IS_RUNNING(td2) && td2 != td)
 				forward_signal(td2);
 #endif
 		}
 		thread_unlock(td2);
 	}
 }
 
 /*
  * Stop the thread td of a traced process for signal sig and wait until
  * it is resumed.
  *
  * The process lock must be held and the process must not be exiting.
  * Returns sig immediately if the process is single-threading for exit
  * (P_SINGLE_EXIT); otherwise returns td->td_xsig as it stands after the
  * stop, which may differ from sig.
  */
 int
 ptracestop(struct thread *td, int sig)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process"));
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 	    &p->p_mtx.lock_object, "Stopping for traced signal");
 
 	/* Record the signal; the loop runs while TDB_XSIG stays set. */
 	td->td_dbgflags |= TDB_XSIG;
 	td->td_xsig = sig;
 	PROC_SLOCK(p);
 	while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) {
 		if (p->p_flag & P_SINGLE_EXIT) {
 			td->td_dbgflags &= ~TDB_XSIG;
 			PROC_SUNLOCK(p);
 			return (sig);
 		}
 		/*
 		 * Just make wait() to work, the last stopped thread
 		 * will win.
 		 */
 		p->p_xstat = sig;
 		p->p_xthread = td;
 		p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE);
 		sig_suspend_threads(td, p, 0);
 		if ((td->td_dbgflags & TDB_STOPATFORK) != 0) {
 			td->td_dbgflags &= ~TDB_STOPATFORK;
 			cv_broadcast(&p->p_dbgwait);
 		}
 stopme:
 		thread_suspend_switch(td);
 		if (p->p_xthread == td)
 			p->p_xthread = NULL;
 		if (!(p->p_flag & P_TRACED))
 			break;
 		/*
 		 * With TDB_SUSPEND set the thread suspends again without
 		 * redoing the stop bookkeeping above, unless the process
 		 * is exiting.
 		 */
 		if (td->td_dbgflags & TDB_SUSPEND) {
 			if (p->p_flag & P_SINGLE_EXIT)
 				break;
 			goto stopme;
 		}
 	}
 	PROC_SUNLOCK(p);
 	return (td->td_xsig);
 }
 
 /*
  * For each signal in block that is also pending on process p, pick a
  * thread with sigtd(), notify it, and wake it if the process is traced or
  * catches the signal.
  *
  * The process lock must be held.  flags tells whether the caller already
  * holds ps_mtx (SIGPROCMASK_PS_LOCKED); if not, it is taken around the
  * sigacts checks for each signal.
  */
 static void
 reschedule_signals(struct proc *p, sigset_t block, int flags)
 {
 	struct sigacts *ps;
 	struct thread *target;
 	int intrval, ps_locked, sig;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (SIGISEMPTY(p->p_siglist))
 		return;
 
 	ps = p->p_sigacts;
 	ps_locked = (flags & SIGPROCMASK_PS_LOCKED) != 0;
 
 	/* Only signals that are actually pending on the process matter. */
 	SIGSETAND(block, p->p_siglist);
 	for (;;) {
 		sig = sig_ffs(&block);
 		if (sig == 0)
 			break;
 		SIGDELSET(block, sig);
 
 		target = sigtd(p, sig, 0);
 		signotify(target);
 
 		if (!ps_locked)
 			mtx_lock(&ps->ps_mtx);
 		if ((p->p_flag & P_TRACED) != 0 ||
 		    SIGISMEMBER(ps->ps_sigcatch, sig)) {
 			if (SIGISMEMBER(ps->ps_sigintr, sig))
 				intrval = EINTR;
 			else
 				intrval = ERESTART;
 			tdsigwakeup(target, sig, SIG_CATCH, intrval);
 		}
 		if (!ps_locked)
 			mtx_unlock(&ps->ps_mtx);
 	}
 }
 
 /*
  * Signal-state cleanup for thread td: flush its private signal queue,
  * and, when other threads remain in the process, block every signal in
  * td and hand the ones it used to accept over to the other threads.
  * The process lock must be held.
  */
 void
 tdsigcleanup(struct thread *td)
 {
 	struct proc *p;
 	sigset_t accepted;
 
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	sigqueue_flush(&td->td_sigqueue);
 
 	if (p->p_numthreads != 1) {
 		/*
 		 * This thread can no longer handle signals, so tell the
 		 * signal posting code by filling its mask.
 		 *
 		 * The set of signals it did not block is computed first;
 		 * threads that accept those may need a wakeup, because
 		 * this thread might have been the one selected for
 		 * delivery.
 		 */
 		SIGFILLSET(accepted);
 		SIGSETNAND(accepted, td->td_sigmask);
 		SIGFILLSET(td->td_sigmask);
 		reschedule_signals(p, accepted, 0);
 	}
 }
 
 /*
  * Defer the delivery of SIGSTOP for the current thread by setting
  * TDF_SBDRY.  Returns 1 if this call set the flag and 0 if stops were
  * already deferred.
  */
 int
 sigdeferstop(void)
 {
 	struct thread *self;
 	int changed;
 
 	self = curthread;
 	if ((self->td_flags & TDF_SBDRY) == 0) {
 		/* td_flags is modified under the thread lock. */
 		thread_lock(self);
 		self->td_flags |= TDF_SBDRY;
 		thread_unlock(self);
 		changed = 1;
 	} else
 		changed = 0;
 	return (changed);
 }
 
 /*
  * Permit the delivery of SIGSTOP for the current thread.  This does
  * not immediately suspend if a stop was posted.  Instead, the thread
  * will suspend either via ast() or a subsequent interruptible sleep.
  *
  * Clears TDF_SBDRY on curthread under the thread lock; the counterpart
  * of sigdeferstop().
  */
 void
 sigallowstop(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	thread_lock(td);
 	td->td_flags &= ~TDF_SBDRY;
 	thread_unlock(td);
 }
 
 /*
  * If the current process has received a signal (should be caught or cause
  * termination, should interrupt current syscall), return the signal number.
  * Stop signals with default action are processed immediately, then cleared;
  * they aren't returned.  This is checked after each entry to the system for
  * a syscall or trap (though this can usually be done without calling issignal
  * by checking the pending signal masks in cursig.) The normal call
  * sequence is
  *
  *	while (sig = cursig(curthread))
  *		postsig(sig);
  *
  * Must be called with both the process lock and ps_mtx held; ps_mtx is
  * dropped and re-taken around stopevent(), ptracestop() and the default
  * stop handling.  Returns 0 when no deliverable signal is pending.
  */
 static int
 issignal(struct thread *td)
 {
 	struct proc *p;
 	struct sigacts *ps;
 	struct sigqueue *queue;
 	sigset_t sigpending;
 	int sig, prop, newsig;
 
 	p = td->td_proc;
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	for (;;) {
 		int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
 
 		/* Pending = (thread queue | process queue) & ~thread mask. */
 		sigpending = td->td_sigqueue.sq_signals;
 		SIGSETOR(sigpending, p->p_sigqueue.sq_signals);
 		SIGSETNAND(sigpending, td->td_sigmask);
 
 		/* Stop signals are not considered during P_PPWAIT or SBDRY. */
 		if (p->p_flag & P_PPWAIT || td->td_flags & TDF_SBDRY)
 			SIG_STOPSIGMASK(sigpending);
 		if (SIGISEMPTY(sigpending))	/* no signal to send */
 			return (0);
 		sig = sig_ffs(&sigpending);
 
 		if (p->p_stops & S_SIG) {
 			mtx_unlock(&ps->ps_mtx);
 			stopevent(p, S_SIG, sig);
 			mtx_lock(&ps->ps_mtx);
 		}
 
 		/*
 		 * We should see pending but ignored signals
 		 * only if P_TRACED was on when they were posted.
 		 */
 		if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
 			sigqueue_delete(&td->td_sigqueue, sig);
 			sigqueue_delete(&p->p_sigqueue, sig);
 			continue;
 		}
 		if (p->p_flag & P_TRACED && (p->p_flag & P_PPTRACE) == 0) {
 			/*
 			 * If traced, always stop.
 			 * Remove old signal from queue before the stop.
 			 * XXX shrug off debugger, it causes siginfo to
 			 * be thrown away.
 			 */
 			queue = &td->td_sigqueue;
 			td->td_dbgksi.ksi_signo = 0;
 			/* Try the thread queue first, then the process queue. */
 			if (sigqueue_get(queue, sig, &td->td_dbgksi) == 0) {
 				queue = &p->p_sigqueue;
 				sigqueue_get(queue, sig, &td->td_dbgksi);
 			}
 
 			mtx_unlock(&ps->ps_mtx);
 			newsig = ptracestop(td, sig);
 			mtx_lock(&ps->ps_mtx);
 
 			if (sig != newsig) {
 
 				/*
 				 * If parent wants us to take the signal,
 				 * then it will leave it in p->p_xstat;
 				 * otherwise we just look for signals again.
 				*/
 				if (newsig == 0)
 					continue;
 				sig = newsig;
 
 				/*
 				 * Put the new signal into td_sigqueue. If the
 				 * signal is being masked, look for other
 				 * signals.
 				 */
 				sigqueue_add(queue, sig, NULL);
 				if (SIGISMEMBER(td->td_sigmask, sig))
 					continue;
 				signotify(td);
 			} else {
 				/*
 				 * Same signal: put it back on the thread
 				 * queue, with the saved siginfo at the head
 				 * when one was retrieved, falling back to a
 				 * bare entry if that add fails.
 				 */
 				if (td->td_dbgksi.ksi_signo != 0) {
 					td->td_dbgksi.ksi_flags |= KSI_HEAD;
 					if (sigqueue_add(&td->td_sigqueue, sig,
 					    &td->td_dbgksi) != 0)
 						td->td_dbgksi.ksi_signo = 0;
 				}
 				if (td->td_dbgksi.ksi_signo == 0)
 					sigqueue_add(&td->td_sigqueue, sig,
 					    NULL);
 			}
 
 			/*
 			 * If the traced bit got turned off, go back up
 			 * to the top to rescan signals.  This ensures
 			 * that p_sig* and p_sigact are consistent.
 			 */
 			if ((p->p_flag & P_TRACED) == 0)
 				continue;
 		}
 
 		prop = sigprop(sig);
 
 		/*
 		 * Decide whether the signal should be returned.
 		 * Return the signal's number, or fall through
 		 * to clear it from the pending mask.
 		 */
 		switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {
 
 		case (intptr_t)SIG_DFL:
 			/*
 			 * Don't take default actions on system processes.
 			 */
 			if (p->p_pid <= 1) {
 #ifdef DIAGNOSTIC
 				/*
 				 * Are you sure you want to ignore SIGSEGV
 				 * in init? XXX
 				 */
 				printf("Process (pid %lu) got signal %d\n",
 					(u_long)p->p_pid, sig);
 #endif
 				break;		/* == ignore */
 			}
 			/*
 			 * If there is a pending stop signal to process
 			 * with default action, stop here,
 			 * then clear the signal.  However,
 			 * if process is member of an orphaned
 			 * process group, ignore tty stop signals.
 			 */
 			if (prop & SA_STOP) {
 				if (p->p_flag & (P_TRACED|P_WEXIT) ||
 				    (p->p_pgrp->pg_jobc == 0 &&
 				     prop & SA_TTYSTOP))
 					break;	/* == ignore */
 				mtx_unlock(&ps->ps_mtx);
 				WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 				    &p->p_mtx.lock_object, "Catching SIGSTOP");
 				p->p_flag |= P_STOPPED_SIG;
 				p->p_xstat = sig;
 				PROC_SLOCK(p);
 				sig_suspend_threads(td, p, 0);
 				thread_suspend_switch(td);
 				PROC_SUNLOCK(p);
 				mtx_lock(&ps->ps_mtx);
 				break;
 			} else if (prop & SA_IGNORE) {
 				/*
 				 * Except for SIGCONT, shouldn't get here.
 				 * Default action is to ignore; drop it.
 				 */
 				break;		/* == ignore */
 			} else
 				return (sig);
 			/*NOTREACHED*/
 
 		case (intptr_t)SIG_IGN:
 			/*
 			 * Masking above should prevent us ever trying
 			 * to take action on an ignored signal other
 			 * than SIGCONT, unless process is traced.
 			 */
 			if ((prop & SA_CONT) == 0 &&
 			    (p->p_flag & P_TRACED) == 0)
 				printf("issignal\n");
 			break;		/* == ignore */
 
 		default:
 			/*
 			 * This signal has an action, let
 			 * postsig() process it.
 			 */
 			return (sig);
 		}
 		/* Reached by every "break" above: drop the signal and rescan. */
 		sigqueue_delete(&td->td_sigqueue, sig);	/* take the signal! */
 		sigqueue_delete(&p->p_sigqueue, sig);
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Tell the parent that process p has stopped, but only when p is marked
  * P_STOPPED_SIG and all of its threads are accounted for as suspended.
  * Both the process lock and the process spin lock must be held; the spin
  * lock is dropped while the parent is notified and re-taken before
  * returning.
  */
 void
 thread_stopped(struct proc *p)
 {
 	int nsusp, reason;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	/* A caller running inside p counts as one more stopped thread. */
 	nsusp = p->p_suspcount;
 	if (p == curproc)
 		nsusp++;
 	if ((p->p_flag & P_STOPPED_SIG) == 0 || nsusp != p->p_numthreads)
 		return;
 
 	PROC_SUNLOCK(p);
 	p->p_flag &= ~P_WAITED;
 	PROC_LOCK(p->p_pptr);
 	if ((p->p_flag & P_TRACED) != 0)
 		reason = CLD_TRAPPED;
 	else
 		reason = CLD_STOPPED;
 	childproc_stopped(p, reason);
 	PROC_UNLOCK(p->p_pptr);
 	PROC_SLOCK(p);
 }
 
 /*
  * Take the action for the specified signal
  * from the current set of pending signals.
  *
  * Operates on curthread.  The process lock and ps_mtx must be held.
  * Returns 0 if sig is on neither the thread's nor the process's queue,
  * and 1 after a handler has been set up through sv_sendsig.  When the
  * action is SIG_DFL the process is terminated via sigexit().
  */
 int
 postsig(int sig)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	struct sigacts *ps;
 	sig_t action;
 	ksiginfo_t ksi;
 	sigset_t returnmask, mask;
 
 	KASSERT(sig != 0, ("postsig"));
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	ksiginfo_init(&ksi);
 	/* Dequeue from the thread queue first, then the process queue. */
 	if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 &&
 	    sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0)
 		return (0);
 	ksi.ksi_signo = sig;
 	if (ksi.ksi_code == SI_TIMER)
 		itimer_accept(p, ksi.ksi_timerid, &ksi);
 	action = ps->ps_sigact[_SIG_IDX(sig)];
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_PSIG))
 		ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ?
 		    &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code);
 #endif
 	if (p->p_stops & S_SIG) {
 		mtx_unlock(&ps->ps_mtx);
 		stopevent(p, S_SIG, sig);
 		mtx_lock(&ps->ps_mtx);
 	}
 
 	if (action == SIG_DFL) {
 		/*
 		 * Default action, where the default is to kill
 		 * the process.  (Other cases were ignored above.)
 		 */
 		mtx_unlock(&ps->ps_mtx);
 		sigexit(td, sig);
 		/* NOTREACHED */
 	} else {
 		/*
 		 * If we get here, the signal must be caught.
 		 */
 		KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig),
 		    ("postsig action"));
 		/*
 		 * Set the new mask value and also defer further
 		 * occurrences of this signal.
 		 *
 		 * Special case: user has done a sigsuspend.  Here the
 		 * current mask is not of interest, but rather the
 		 * mask from before the sigsuspend is what we want
 		 * restored after the signal processing is completed.
 		 */
 		if (td->td_pflags & TDP_OLDMASK) {
 			returnmask = td->td_oldsigmask;
 			td->td_pflags &= ~TDP_OLDMASK;
 		} else
 			returnmask = td->td_sigmask;
 
 		mask = ps->ps_catchmask[_SIG_IDX(sig)];
 		if (!SIGISMEMBER(ps->ps_signodefer, sig))
 			SIGADDSET(mask, sig);
 		kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
 		    SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
 
 		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
 			/*
 			 * See kern_sigaction() for origin of this code.
 			 */
 			SIGDELSET(ps->ps_sigcatch, sig);
 			if (sig != SIGCONT &&
 			    sigprop(sig) & SA_IGNORE)
 				SIGADDSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		td->td_ru.ru_nsignals++;
 		/* Clear the trap bookkeeping if it refers to this signal. */
 		if (p->p_sig == sig) {
 			p->p_code = 0;
 			p->p_sig = 0;
 		}
 		(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
 	}
 	return (1);
 }
 
 /*
  * Kill the specified process for the stated reason.
  *
  * Logs the pid, command name, uid and the reason string "why" at LOG_ERR,
  * marks the process P_WKILLED and posts SIGKILL to it.  The process lock
  * of p must be held by the caller.
  */
 void
 killproc(struct proc *p, char *why)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid,
 	    p->p_comm);
 	/* A process without credentials is logged with uid -1. */
 	log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid,
 	    p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why);
 	p->p_flag |= P_WKILLED;
 	kern_psignal(p, SIGKILL);
 }
 
 /*
  * Force the current process to exit with the specified signal, dumping core
  * if appropriate.  We bypass the normal tests for masked and caught signals,
  * allowing unrecoverable failures to terminate the process without changing
  * signal state.  Mark the accounting record with the signal termination.
  * If dumping core, save the signal number for the debugger.  Calls exit and
  * does not return.
  *
  * The process lock must be held on entry; it is released here, either
  * explicitly or (per the note below) by coredump().
  */
 void
 sigexit(struct thread *td, int sig)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_acflag |= AXSIG;
 	/*
 	 * We must be single-threading to generate a core dump.  This
 	 * ensures that the registers in the core file are up-to-date.
 	 * Also, the ELF dump handler assumes that the thread list doesn't
 	 * change out from under it.
 	 *
 	 * XXX If another thread attempts to single-thread before us
 	 *     (e.g. via fork()), we won't get a dump at all.
 	 */
 	if ((sigprop(sig) & SA_CORE) && (thread_single(SINGLE_NO_EXIT) == 0)) {
 		p->p_sig = sig;
 		/*
 		 * Log signals which would cause core dumps
 		 * (Log as LOG_INFO to appease those who don't want
 		 * these messages.)
 		 * XXX : Todo, as well as euid, write out ruid too
 		 * Note that coredump() drops proc lock.
 		 */
 		if (coredump(td) == 0)
 			sig |= WCOREFLAG;
 		if (kern_logsigexit)
 			log(LOG_INFO,
 			    "pid %d (%s), uid %d: exited on signal %d%s\n",
 			    p->p_pid, p->p_comm,
 			    td->td_ucred ? td->td_ucred->cr_uid : -1,
 			    sig &~ WCOREFLAG,
 			    sig & WCOREFLAG ? " (core dumped)" : "");
 	} else
 		PROC_UNLOCK(p);
 	/* sig carries WCOREFLAG here if the core dump succeeded. */
 	exit1(td, W_EXITCODE(0, sig));
 	/* NOTREACHED */
 }
 
 /*
  * Send queued SIGCHLD to parent when child process's state
  * is changed.
  */
 static void
 sigparent(struct proc *p, int reason, int status)
 {
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	if (p->p_ksi != NULL) {
 		p->p_ksi->ksi_signo  = SIGCHLD;
 		p->p_ksi->ksi_code   = reason;
 		p->p_ksi->ksi_status = status;
 		p->p_ksi->ksi_pid    = p->p_pid;
 		p->p_ksi->ksi_uid    = p->p_ucred->cr_ruid;
 		if (KSI_ONQ(p->p_ksi))
 			return;
 	}
 	pksignal(p->p_pptr, SIGCHLD, p->p_ksi);
 }
 
 /*
  * Common job-control notification: mark the parent of p as having a child
  * with changed status, wake it, and send it SIGCHLD unless the parent has
  * PS_NOCLDSTOP set.  Both the child's and the parent's process locks must
  * be held.
  */
 static void
 childproc_jobstate(struct proc *p, int reason, int sig)
 {
 	struct proc *parent;
 	struct sigacts *ps;
 	int notify;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	/*
 	 * The wakeup serves a parent sleeping in kern_wait().  SIGCHLD
 	 * alone would not be enough for that, since the parent may have
 	 * the signal masked.
 	 */
 	parent = p->p_pptr;
 	parent->p_flag |= P_STATCHILD;
 	wakeup(parent);
 
 	/* ps_flag is sampled under ps_mtx, which is dropped before sending. */
 	ps = parent->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	notify = (ps->ps_flag & PS_NOCLDSTOP) == 0;
 	mtx_unlock(&ps->ps_mtx);
 	if (notify)
 		sigparent(p, reason, sig);
 }
 
 /*
  * Notify the parent that child p has stopped; reason is the CLD_* code
  * to report.
  */
 void
 childproc_stopped(struct proc *p, int reason)
 {
 	int sig;
 
 	/* p_xstat is a plain signal number, not a full wait() status here. */
 	sig = p->p_xstat;
 	childproc_jobstate(p, reason, sig);
 }
 
 /*
  * Notify the parent that child p has been continued (CLD_CONTINUED,
  * SIGCONT).
  */
 void
 childproc_continued(struct proc *p)
 {
 	const int reason = CLD_CONTINUED;
 
 	childproc_jobstate(p, reason, SIGCONT);
 }
 
 /*
  * Notify the parent that child p has exited.  The siginfo code and status
  * are derived from p_xstat: CLD_DUMPED or CLD_KILLED with the terminating
  * signal, or CLD_EXITED with the exit status.
  */
 void
 childproc_exited(struct proc *p)
 {
 	int code, value, xstat;
 
 	xstat = p->p_xstat;	/* convert to int */
 	if (WCOREDUMP(xstat)) {
 		code = CLD_DUMPED;
 		value = WTERMSIG(xstat);
 	} else if (WIFSIGNALED(xstat)) {
 		code = CLD_KILLED;
 		value = WTERMSIG(xstat);
 	} else {
 		code = CLD_EXITED;
 		value = WEXITSTATUS(xstat);
 	}
 
 	/*
 	 * XXX wakeup(p->p_pptr) is deliberately not called here; that
 	 * work is done in exit1().
 	 */
 	sigparent(p, code, value);
 }
 
 /*
  * We only have 1 character for the core count in the format
  * string, so the range will be 0-9
  */
 #define MAX_NUM_CORES 10
 static int num_cores = 5;
 
 /*
  * Handler for debug.ncores.  Reads report the current num_cores; writes
  * are clamped to the range [0, MAX_NUM_CORES] before being stored.
  */
 static int
 sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS)
 {
 	int requested, rc;
 
 	/* Work on a copy so a failed or read-only request changes nothing. */
 	requested = num_cores;
 	rc = sysctl_handle_int(oidp, &requested, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (requested < 0)
 		requested = 0;
 	else if (requested > MAX_NUM_CORES)
 		requested = MAX_NUM_CORES;
 	num_cores = requested;
 	return (0);
 }
 SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW,
 	    0, sizeof(int), sysctl_debug_num_cores_check, "I", "");
 
 #if defined(COMPRESS_USER_CORES)
 /*
  * Knobs for compressed user core files; only present when the kernel is
  * built with COMPRESS_USER_CORES.  Exported as kern.compress_user_cores
  * and kern.compress_user_cores_gzlevel.
  */
 int compress_user_cores = 1;
 SYSCTL_INT(_kern, OID_AUTO, compress_user_cores, CTLFLAG_RW,
     &compress_user_cores, 0, "Compression of user corefiles");
 
 int compress_user_cores_gzlevel = -1; /* default level */
 SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_gzlevel, CTLFLAG_RW,
     &compress_user_cores_gzlevel, -1, "Corefile gzip compression level");
 
 /* Suffix appended to the core file name by corefile_open() when compressing. */
 #define GZ_SUFFIX	".gz"
 #define GZ_SUFFIX_LEN	3
 #endif
 
 /*
  * Core file name template expanded by corefile_open().  It is registered
  * both as the kern.corefile tunable and as the kern.corefile sysctl.
  */
 static char corefilename[MAXPATHLEN] = {"%N.core"};
 TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
 SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
     sizeof(corefilename), "Process corefile name format string");
 
 /*
  * corefile_open(comm, uid, pid, td, compress, vpp, namep)
  * Expand the template in corefilename and open/create the core file.
  * corefilename is a printf-like string with these format specifiers:
  *	%%	a literal '%'
  *	%H	host name, from getcredhostname() on td's credential
  *	%I	single-digit index: the first of 0..num_cores-1 for which no
  *		file exists yet; if every candidate exists, the last one
  *		tried is opened without O_EXCL
  *	%N	name of process (comm)
  *	%P	process id (pid)
  *	%U	user id (uid)
  * An unknown specifier is logged and produces no output.  A '%' that is
  * the last character of the template is logged and otherwise ignored.
  * For example, "%N.core" is the default; they can be disabled completely
  * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
  * This is controlled by the sysctl variable kern.corefile (see above).
  * When compress is non-zero and the kernel has COMPRESS_USER_CORES,
  * GZ_SUFFIX is appended to the expanded name.
  *
  * On success 0 is returned, the opened vnode is stored in *vpp and the
  * expanded path, allocated from M_TEMP, in *namep; the caller frees it.
  * On failure nothing is stored: ENOMEM means the expanded name did not fit
  * in MAXPATHLEN, any other value is the error from vn_open_cred().
  */
 static int
 corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td,
     int compress, struct vnode **vpp, char **namep)
 {
 	struct nameidata nd;
 	struct sbuf sb;
 	const char *format;
 	char *hostname, *name;
 	int indexpos, i, error, cmode, flags, oflags;
 
 	hostname = NULL;
 	format = corefilename;
 	name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO);
 	indexpos = -1;
 	(void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN);
 	for (i = 0; format[i] != '\0'; i++) {
 		switch (format[i]) {
 		case '%':	/* Format character */
 			i++;
 			switch (format[i]) {
 			case '\0':	/* '%' was the last character */
 				log(LOG_ERR,
 				    "Trailing %% in corename `%s'\n", format);
 				/*
 				 * Step back onto the '%' so that the loop
 				 * increment lands on the terminator instead
 				 * of walking past it.
 				 */
 				i--;
 				break;
 			case '%':
 				sbuf_putc(&sb, '%');
 				break;
 			case 'H':	/* hostname */
 				if (hostname == NULL) {
 					hostname = malloc(MAXHOSTNAMELEN,
 					    M_TEMP, M_WAITOK);
 				}
 				getcredhostname(td->td_ucred, hostname,
 				    MAXHOSTNAMELEN);
 				sbuf_printf(&sb, "%s", hostname);
 				break;
 			case 'I':	/* autoincrementing index */
 				/* Placeholder digit; patched in below. */
 				sbuf_printf(&sb, "0");
 				indexpos = sbuf_len(&sb) - 1;
 				break;
 			case 'N':	/* process name */
 				sbuf_printf(&sb, "%s", comm);
 				break;
 			case 'P':	/* process id */
 				sbuf_printf(&sb, "%u", pid);
 				break;
 			case 'U':	/* user id */
 				sbuf_printf(&sb, "%u", uid);
 				break;
 			default:
 				log(LOG_ERR,
 				    "Unknown format character %c in "
 				    "corename `%s'\n", format[i], format);
 				break;
 			}
 			break;
 		default:
 			sbuf_putc(&sb, format[i]);
 			break;
 		}
 	}
 	free(hostname, M_TEMP);
 #ifdef COMPRESS_USER_CORES
 	if (compress)
 		sbuf_printf(&sb, GZ_SUFFIX);
 #endif
 	/* The sbuf is fixed-length, so an overlong name shows up as an error. */
 	if (sbuf_error(&sb) != 0) {
 		log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
 		    "long\n", (long)pid, comm, (u_long)uid);
 		sbuf_delete(&sb);
 		free(name, M_TEMP);
 		return (ENOMEM);
 	}
 	sbuf_finish(&sb);
 	sbuf_delete(&sb);
 
 	cmode = S_IRUSR | S_IWUSR;
 	oflags = VN_OPEN_NOAUDIT | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
 
 	/*
 	 * If the core format has a %I in it, then we need to check
 	 * for existing corefiles before returning a name.
 	 * To do this we iterate over 0..num_cores to find a
 	 * non-existing core file name to use.
 	 */
 	if (indexpos != -1) {
 		for (i = 0; i < num_cores; i++) {
 			flags = O_CREAT | O_EXCL | FWRITE | O_NOFOLLOW;
 			name[indexpos] = '0' + i;
 			NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td);
 			error = vn_open_cred(&nd, &flags, cmode, oflags,
 			    td->td_ucred, NULL);
 			if (error) {
 				if (error == EEXIST)
 					continue;
 				log(LOG_ERR,
 				    "pid %d (%s), uid (%u):  Path `%s' failed "
 				    "on initial open test, error = %d\n",
 				    pid, comm, uid, name, error);
 			}
 			/* Either opened exclusively or failed for good. */
 			goto out;
 		}
 	}
 
 	/* No %I, or every indexed name exists: open without O_EXCL. */
 	flags = O_CREAT | FWRITE | O_NOFOLLOW;
 	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td);
 	error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL);
 out:
 	if (error) {
 #ifdef AUDIT
 		audit_proc_coredump(td, name, error);
 #endif
 		free(name, M_TEMP);
 		return (error);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	*vpp = nd.ni_vp;
 	*namep = name;
 	return (0);
 }
 
 /*
  * Dump a process' core.  The main routine does some
  * policy checking, and creates the name of the coredump;
  * then it passes on a vnode and a size limit to the process-specific
  * coredump routine if there is one; if there _is not_ one, it returns
  * ENOSYS; otherwise it returns the error from the process-specific routine.
  */
 
 /*
  * Locking: entered with the proc lock held (asserted below); every return
  * path has released it.
  */
 static int
 coredump(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *cred = td->td_ucred;
 	struct vnode *vp;
 	struct flock lf;
 	struct vattr vattr;
 	int error, error1, locked;
 	struct mount *mp;
 	char *name;			/* name of corefile */
 	off_t limit;
 	int compress;
 
 #ifdef COMPRESS_USER_CORES
 	compress = compress_user_cores;
 #else
 	compress = 0;
 #endif
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
 	_STOPEVENT(p, S_CORE, 0);
 
 	/* Policy: dumps disabled globally, or process is P_SUGID and that is not allowed. */
 	if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0)) {
 		PROC_UNLOCK(p);
 		return (EFAULT);
 	}
 
 	/*
 	 * Note that the bulk of limit checking is done after
 	 * the corefile is created.  The exception is if the limit
 	 * for corefiles is 0, in which case we don't bother
 	 * creating the corefile at all.  This layout means that
 	 * a corefile is truncated instead of not being created,
 	 * if it is larger than the limit.
 	 */
 	limit = (off_t)lim_cur(p, RLIMIT_CORE);
 	if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
 		PROC_UNLOCK(p);
 		return (EFBIG);
 	}
 	PROC_UNLOCK(p);
 
 restart:
 	/* On success corefile_open() hands back the vnode and an M_TEMP name. */
 	error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress,
 	    &vp, &name);
 	if (error != 0)
 		return (error);
 
 	/* Don't dump to non-regular files or files with links. */
 	if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
 	    vattr.va_nlink != 1) {
 		VOP_UNLOCK(vp, 0);
 		error = EFAULT;
 		goto close;
 	}
 
 	VOP_UNLOCK(vp, 0);
 	/* Best-effort advisory write lock; "locked" remembers whether we got it. */
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	lf.l_type = F_WRLCK;
 	locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
 
 	/*
 	 * If writes cannot be started right now, undo everything, wait
 	 * via vn_start_write(NULL, ...) and start over from corefile_open().
 	 */
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 		lf.l_type = F_UNLCK;
 		if (locked)
 			VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 		if ((error = vn_close(vp, FWRITE, cred, td)) != 0)
 			goto out;
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto out;
 		free(name, M_TEMP);
 		goto restart;
 	}
 
 	/* Truncate the file and, if configured, mark it UF_NODUMP. */
 	VATTR_NULL(&vattr);
 	vattr.va_size = 0;
 	if (set_core_nodump_flag)
 		vattr.va_flags = UF_NODUMP;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VOP_SETATTR(vp, &vattr, cred);
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 	PROC_LOCK(p);
 	p->p_acflag |= ACORE;
 	PROC_UNLOCK(p);
 
 	/* Hand off to the ABI-specific dump routine, if the sysentvec has one. */
 	if (p->p_sysent->sv_coredump != NULL) {
 		error = p->p_sysent->sv_coredump(td, vp, limit,
 		    compress ? IMGACT_CORE_COMPRESS : 0);
 	} else {
 		error = ENOSYS;
 	}
 
 	if (locked) {
 		lf.l_type = F_UNLCK;
 		VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 	}
 close:
 	/* A close failure is only reported if nothing failed earlier. */
 	error1 = vn_close(vp, FWRITE, cred, td);
 	if (error == 0)
 		error = error1;
 out:
 #ifdef AUDIT
 	audit_proc_coredump(td, name, error);
 #endif
 	free(name, M_TEMP);
 	return (error);
 }
 
 /*
  * Nonexistent system call-- signal process (may want to handle it).  Flag
  * error in case process won't see signal immediately (blocked or ignored).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nosys_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 /*
  * Handler for a nonexistent system call: post SIGSYS to the calling
  * thread under the proc lock and return ENOSYS.  args is unused.
  */
 int
 nosys(struct thread *td, struct nosys_args *args)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK(p);
 	tdsignal(td, SIGSYS);
 	PROC_UNLOCK(p);
 	return (ENOSYS);
 }
 
 /*
  * Send a SIGIO or SIGURG signal to a process or process group using stored
  * credentials rather than those of the current process.
  */
 /*
  * Deliver sig to the target recorded in *sigiop, if any.
  *
  * A positive sio_pgid names a single process (sio_proc); a negative one
  * names a process group (sio_pgrp), in which case every PRS_NORMAL member
  * is considered and, when checkctty is non-zero, only members with
  * P_CONTROLT set are signalled.  In both cases CANSIGIO() is checked
  * against the credential stored in the sigio.  The whole operation runs
  * under SIGIO_LOCK.
  */
 void
 pgsigio(struct sigio **sigiop, int sig, int checkctty)
 {
 	struct sigio *sigio;
 
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	if (sigio->sio_pgid > 0) {
 		PROC_LOCK(sigio->sio_proc);
 		if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred))
 			kern_psignal(sigio->sio_proc, sig);
 		PROC_UNLOCK(sigio->sio_proc);
 	} else if (sigio->sio_pgid < 0) {
 		struct proc *p;
 
 		PGRP_LOCK(sigio->sio_pgrp);
 		LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NORMAL &&
 			    CANSIGIO(sigio->sio_ucred, p->p_ucred) &&
 			    (checkctty == 0 || (p->p_flag & P_CONTROLT)))
 				kern_psignal(p, sig);
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(sigio->sio_pgrp);
 	}
 	SIGIO_UNLOCK();
 }
 
 /*
  * Attach a signal knote to the current process: record the process in
  * the knote, force EV_CLEAR, and link the knote on the process' klist.
  * Always returns 0.
  */
 static int
 filt_sigattach(struct knote *kn)
 {
 	struct proc *self;
 
 	self = curproc;
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 	kn->kn_ptr.p_proc = self;
 
 	knlist_add(&self->p_klist, kn, 0);
 
 	return (0);
 }
 
 /*
  * Detach a signal knote: unlink it from the klist of the process that
  * was recorded in the knote.
  */
 static void
 filt_sigdetach(struct knote *kn)
 {
 
 	knlist_remove(&kn->kn_ptr.p_proc->p_klist, kn, 0);
 }
 
 /*
  * signal knotes are shared with proc knotes, so we apply a mask to
  * the hint in order to differentiate them from process hints.  This
  * could be avoided by using a signal-specific knote list, but probably
  * isn't worth the trouble.
  */
 /*
  * Event filter for signal knotes.  Hints without NOTE_SIGNAL are ignored;
  * otherwise the remaining bits are compared with kn_id and, on a match,
  * kn_data is incremented.  Returns non-zero while kn_data is non-zero.
  */
 static int
 filt_signal(struct knote *kn, long hint)
 {
 
 	if ((hint & NOTE_SIGNAL) != 0 &&
 	    kn->kn_id == (hint & ~NOTE_SIGNAL))
 		kn->kn_data++;
 	return (kn->kn_data != 0);
 }
 
 /*
  * Allocate a zeroed sigacts with one reference and an initialized mutex.
  * The allocation uses M_WAITOK.
  */
 struct sigacts *
 sigacts_alloc(void)
 {
 	struct sigacts *acts;
 
 	acts = malloc(sizeof(*acts), M_SUBPROC, M_WAITOK | M_ZERO);
 	mtx_init(&acts->ps_mtx, "sigacts", NULL, MTX_DEF);
 	acts->ps_refcnt = 1;
 	return (acts);
 }
 
 /*
  * Drop one reference on ps.  When the last reference goes away the mutex
  * is destroyed and the structure is freed.
  */
 void
 sigacts_free(struct sigacts *ps)
 {
 
 	mtx_lock(&ps->ps_mtx);
 	if (--ps->ps_refcnt != 0) {
 		/* Other holders remain. */
 		mtx_unlock(&ps->ps_mtx);
 		return;
 	}
 	mtx_destroy(&ps->ps_mtx);
 	free(ps, M_SUBPROC);
 }
 
 /*
  * Take an additional reference on ps under its mutex and return ps.
  */
 struct sigacts *
 sigacts_hold(struct sigacts *ps)
 {
 	struct mtx *lock = &ps->ps_mtx;
 
 	mtx_lock(lock);
 	ps->ps_refcnt += 1;
 	mtx_unlock(lock);
 	return (ps);
 }
 
 void
 sigacts_copy(struct sigacts *dest, struct sigacts *src)
 {
 
 	KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest"));
 	mtx_lock(&src->ps_mtx);
 	bcopy(src, dest, offsetof(struct sigacts, ps_refcnt));
 	mtx_unlock(&src->ps_mtx);
 }
 
 /*
  * Return non-zero if ps has more than one reference.  The count is read
  * under the sigacts mutex.
  */
 int
 sigacts_shared(struct sigacts *ps)
 {
 	int refs;
 
 	mtx_lock(&ps->ps_mtx);
 	refs = ps->ps_refcnt;
 	mtx_unlock(&ps->ps_mtx);
 	return (refs > 1);
 }
Index: head/sys/kern/kern_sx.c
===================================================================
--- head/sys/kern/kern_sx.c	(revision 258540)
+++ head/sys/kern/kern_sx.c	(revision 258541)
@@ -1,1214 +1,1209 @@
 /*-
  * Copyright (c) 2007 Attilio Rao <attilio@freebsd.org>
  * Copyright (c) 2001 Jason Evans <jasone@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified other than the possible
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  */
 
 /*
  * Shared/exclusive locks.  This implementation attempts to ensure
  * deterministic lock granting behavior, so that slocks and xlocks are
  * interleaved.
  *
  * Priority propagation will not generally raise the priority of lock holders,
  * so should not be relied upon in combination with sx locks.
  */
 
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 #include "opt_no_adaptive_sx.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sleepqueue.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #if defined(SMP) && !defined(NO_ADAPTIVE_SX)
 #include <machine/cpu.h>
 #endif
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #if defined(SMP) && !defined(NO_ADAPTIVE_SX)
 #define	ADAPTIVE_SX
 #endif
 
 CTASSERT((SX_NOADAPTIVE & LO_CLASSFLAGS) == SX_NOADAPTIVE);
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DECLARE( , , lock, failed);
 #endif
 
 /* Handy macros for sleep queues. */
 #define	SQ_EXCLUSIVE_QUEUE	0
 #define	SQ_SHARED_QUEUE		1
 
 /*
  * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file.  We
  * drop Giant anytime we have to sleep or if we adaptively spin.
  *
  * GIANT_DECLARE declares the per-function state: _giantcnt, the number of
  * times Giant was released, plus the WITNESS save area.
  * GIANT_SAVE() does nothing unless Giant is owned; otherwise it saves the
  * WITNESS state and unlocks Giant until it is no longer owned, counting
  * each unlock in _giantcnt.
  * GIANT_RESTORE() re-locks Giant _giantcnt times and restores the WITNESS
  * state; it does nothing if GIANT_SAVE() released nothing.
  */
 #define	GIANT_DECLARE							\
 	int _giantcnt = 0;						\
 	WITNESS_SAVE_DECL(Giant)					\
 
 #define	GIANT_SAVE() do {						\
 	if (mtx_owned(&Giant)) {					\
 		WITNESS_SAVE(&Giant.lock_object, Giant);		\
 		while (mtx_owned(&Giant)) {				\
 			_giantcnt++;					\
 			mtx_unlock(&Giant);				\
 		}							\
 	}								\
 } while (0)
 
 #define GIANT_RESTORE() do {						\
 	if (_giantcnt > 0) {						\
 		mtx_assert(&Giant, MA_NOTOWNED);			\
 		while (_giantcnt--)					\
 			mtx_lock(&Giant);				\
 		WITNESS_RESTORE(&Giant.lock_object, Giant);		\
 	}								\
 } while (0)
 
 /*
  * Returns true if an exclusive lock is recursed.  It assumes
  * curthread currently has an exclusive lock.
  */
-#define	sx_recurse		lock_object.lo_data
 #define	sx_recursed(sx)		((sx)->sx_recurse != 0)
 
 static void	assert_sx(const struct lock_object *lock, int what);
 #ifdef DDB
 static void	db_show_sx(const struct lock_object *lock);
 #endif
 static void	lock_sx(struct lock_object *lock, uintptr_t how);
 #ifdef KDTRACE_HOOKS
 static int	owner_sx(const struct lock_object *lock, struct thread **owner);
 #endif
 static uintptr_t unlock_sx(struct lock_object *lock);
 
 /*
  * Lock class descriptor for sx locks.  It binds the generic lock_object
  * operations (assert, lock, unlock and, when configured, the DDB show and
  * KDTRACE owner hooks) to the static helpers defined in this file.
  */
 struct lock_class lock_class_sx = {
 	.lc_name = "sx",
 	.lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE,
 	.lc_assert = assert_sx,
 #ifdef DDB
 	.lc_ddb_show = db_show_sx,
 #endif
 	.lc_lock = lock_sx,
 	.lc_unlock = unlock_sx,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_sx,
 #endif
 };
 
 #ifndef INVARIANTS
 #define	_sx_assert(sx, what, file, line)
 #endif
 
 #ifdef ADAPTIVE_SX
 /*
  * Adaptive spinning limits, exported as debug.sx.retries and
  * debug.sx.loops.  In _sx_xlock_hard(), asx_retries caps how many spin
  * rounds are attempted while the lock is share-held, and asx_loops caps
  * the iterations of a single round.
  */
 static u_int asx_retries = 10;
 static u_int asx_loops = 10000;
 static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging");
 SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, "");
 SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, "");
 #endif
 
 /*
  * lock_class assert hook: treat the lock_object as an sx lock and forward
  * to sx_assert() with the requested assertion flags.
  */
 void
 assert_sx(const struct lock_object *lock, int what)
 {
 
 	sx_assert((const struct sx *)lock, what);
 }
 
 /*
  * lock_class lock hook: a non-zero "how" takes the sx lock shared, zero
  * takes it exclusive.  This matches the value returned by unlock_sx().
  */
 void
 lock_sx(struct lock_object *lock, uintptr_t how)
 {
 	struct sx *sx = (struct sx *)lock;
 
 	if (how == 0) {
 		sx_xlock(sx);
 		return;
 	}
 	sx_slock(sx);
 }
 
 /*
  * lock_class unlock hook: release an unrecursed sx lock and report how it
  * was held, 0 for exclusive and 1 for shared, so that lock_sx() can later
  * reacquire it in the same mode.
  */
 uintptr_t
 unlock_sx(struct lock_object *lock)
 {
 	struct sx *sx = (struct sx *)lock;
 	uintptr_t how;
 
 	sx_assert(sx, SA_LOCKED | SA_NOTRECURSED);
 	if (sx_xlocked(sx)) {
 		sx_xunlock(sx);
 		how = 0;
 	} else {
 		sx_sunlock(sx);
 		how = 1;
 	}
 	return (how);
 }
 
 #ifdef KDTRACE_HOOKS
 /*
  * lock_class owner hook.  *owner is always set from SX_OWNER() of a
  * snapshot of the lock word.  The return value is non-zero when the lock
  * is held: for a word with SX_LOCK_SHARED set that means a non-zero
  * sharer count, otherwise a non-NULL owner.
  */
 int
 owner_sx(const struct lock_object *lock, struct thread **owner)
 {
 	const struct sx *sx;
 	uintptr_t word;
 
 	sx = (const struct sx *)lock;
 	word = sx->sx_lock;
 	*owner = (struct thread *)SX_OWNER(word);
 
 	if ((word & SX_LOCK_SHARED) != 0)
 		return (SX_SHARERS(word) != 0);
 	return (*owner != NULL);
 }
 #endif
 
 /*
  * SYSINIT-style wrapper: arg points to a struct sx_args describing the
  * lock, its description and its flags, which are passed to
  * sx_init_flags().
  */
 void
 sx_sysinit(void *arg)
 {
 	struct sx_args *sargs;
 
 	sargs = arg;
 	sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags);
 }
 
 /*
  * Initialize an sx lock.  The SX_* options in opts are translated into
  * the corresponding LO_* lock_object flags (SX_NOADAPTIVE is passed
  * through unchanged), the lock_object is registered with lock_init(), and
  * the lock is left unlocked and unrecursed.
  */
 void
 sx_init_flags(struct sx *sx, const char *description, int opts)
 {
 	int lo_flags;
 
 	MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK |
 	    SX_NOPROFILE | SX_NOADAPTIVE)) == 0);
 	ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock,
 	    ("%s: sx_lock not aligned for %s: %p", __func__, description,
 	    &sx->sx_lock));
 
 	/* Always-on flags plus the pass-through adaptive bit. */
 	lo_flags = LO_SLEEPABLE | LO_UPGRADABLE | (opts & SX_NOADAPTIVE);
 
 	/* WITNESS is on unless explicitly turned off. */
 	if ((opts & SX_NOWITNESS) == 0)
 		lo_flags |= LO_WITNESS;
 	if ((opts & SX_QUIET) != 0)
 		lo_flags |= LO_QUIET;
 	if ((opts & SX_RECURSE) != 0)
 		lo_flags |= LO_RECURSABLE;
 	if ((opts & SX_NOPROFILE) != 0)
 		lo_flags |= LO_NOPROFILE;
 	if ((opts & SX_DUPOK) != 0)
 		lo_flags |= LO_DUPOK;
 
 	lock_init(&sx->lock_object, &lock_class_sx, description, NULL,
 	    lo_flags);
 	sx->sx_lock = SX_LOCK_UNLOCKED;
 	sx->sx_recurse = 0;
 }
 
 /*
  * Tear down an sx lock.  The lock must be unlocked and unrecursed (both
  * asserted); the lock word is then set to SX_LOCK_DESTROYED so that later
  * use trips the KASSERTs in the lock and unlock paths.
  */
 void
 sx_destroy(struct sx *sx)
 {
 
 	KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held"));
 	KASSERT(sx->sx_recurse == 0, ("sx lock still recursed"));
 	sx->sx_lock = SX_LOCK_DESTROYED;
 	lock_destroy(&sx->lock_object);
 }
 
 /*
  * Acquire a shared lock, with debugging checks and bookkeeping around
  * __sx_slock().  Returns 0 on success (and immediately when the scheduler
  * is stopped), otherwise the error from __sx_slock().  On success the
  * acquisition is logged, reported to WITNESS and counted in td_locks.
  */
 int
 _sx_slock(struct sx *sx, int opts, const char *file, int line)
 {
 	int rc;
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("sx_slock() by idle thread %p on sx %s @ %s:%d",
 	    curthread, sx->lock_object.lo_name, file, line));
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_slock() of destroyed sx @ %s:%d", file, line));
 	WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL);
 
 	rc = __sx_slock(sx, opts, file, line);
 	if (rc != 0)
 		return (rc);
 
 	LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line);
 	WITNESS_LOCK(&sx->lock_object, 0, file, line);
 	curthread->td_locks++;
 	return (0);
 }
 
 /*
  * Try to acquire a shared lock without blocking.  Returns 1 on success
  * and 0 on failure; when the scheduler is stopped it returns 1 without
  * touching the lock.
  */
 int
 sx_try_slock_(struct sx *sx, const char *file, int line)
 {
 	uintptr_t x;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("sx_try_slock() by idle thread %p on sx %s @ %s:%d",
 	    curthread, sx->lock_object.lo_name, file, line));
 
 	/*
 	 * Retry the compare-and-set for as long as the lock word still has
 	 * SX_LOCK_SHARED set; give up as soon as it does not.
 	 */
 	for (;;) {
 		x = sx->sx_lock;
 		KASSERT(x != SX_LOCK_DESTROYED,
 		    ("sx_try_slock() of destroyed sx @ %s:%d", file, line));
 		if (!(x & SX_LOCK_SHARED))
 			break;
 		/* Add one sharer to the snapshot we read. */
 		if (atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) {
 			LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line);
 			WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line);
 			curthread->td_locks++;
 			return (1);
 		}
 	}
 
 	LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line);
 	return (0);
 }
 
 /*
  * Acquire an exclusive lock, with debugging checks and bookkeeping around
  * __sx_xlock().  Returns 0 on success (and immediately when the scheduler
  * is stopped), otherwise the error from __sx_xlock().  On success the
  * acquisition is logged, reported to WITNESS and counted in td_locks.
  */
 int
 _sx_xlock(struct sx *sx, int opts, const char *file, int line)
 {
 	int rc;
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("sx_xlock() by idle thread %p on sx %s @ %s:%d",
 	    curthread, sx->lock_object.lo_name, file, line));
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_xlock() of destroyed sx @ %s:%d", file, line));
 	WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
 	    line, NULL);
 
 	rc = __sx_xlock(sx, curthread, opts, file, line);
 	if (rc != 0)
 		return (rc);
 
 	LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse,
 	    file, line);
 	WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
 	curthread->td_locks++;
 	return (0);
 }
 
 /*
  * Try to acquire an exclusive lock without blocking.  Returns non-zero on
  * success and 0 on failure; when the scheduler is stopped it returns 1
  * without touching the lock.
  */
 int
 sx_try_xlock_(struct sx *sx, const char *file, int line)
 {
 	int rval;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("sx_try_xlock() by idle thread %p on sx %s @ %s:%d",
 	    curthread, sx->lock_object.lo_name, file, line));
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_try_xlock() of destroyed sx @ %s:%d", file, line));
 
 	/*
 	 * Already exclusively held and recursion is permitted: bump the
 	 * recursion count.  Otherwise attempt a single compare-and-set from
 	 * the unlocked state.
 	 */
 	if (sx_xlocked(sx) &&
 	    (sx->lock_object.lo_flags & LO_RECURSABLE) != 0) {
 		sx->sx_recurse++;
 		atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
 		rval = 1;
 	} else
 		rval = atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED,
 		    (uintptr_t)curthread);
 	LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line);
 	if (rval) {
 		WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
 		    file, line);
 		curthread->td_locks++;
 	}
 
 	return (rval);
 }
 
 /*
  * Release a shared lock: assert it is share-held, tell WITNESS and the
  * lock log, drop it through __sx_sunlock() and decrement td_locks.
  * Does nothing when the scheduler is stopped.
  */
 void
 _sx_sunlock(struct sx *sx, const char *file, int line)
 {
 
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_sunlock() of destroyed sx @ %s:%d", file, line));
 	_sx_assert(sx, SA_SLOCKED, file, line);
 	WITNESS_UNLOCK(&sx->lock_object, 0, file, line);
 	LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line);
 	__sx_sunlock(sx, file, line);
-	LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_SUNLOCK_RELEASE, sx);
 	curthread->td_locks--;
 }
 
 /*
  * Release an exclusive lock: assert it is exclusively held, tell WITNESS
  * and the lock log, drop it through __sx_xunlock() and decrement
  * td_locks.  Does nothing when the scheduler is stopped.
  */
 void
 _sx_xunlock(struct sx *sx, const char *file, int line)
 {
 
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_xunlock() of destroyed sx @ %s:%d", file, line));
 	_sx_assert(sx, SA_XLOCKED, file, line);
 	WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
 	LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file,
 	    line);
-	if (!sx_recursed(sx))
-		LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_XUNLOCK_RELEASE, sx);
 	__sx_xunlock(sx, curthread, file, line);
 	curthread->td_locks--;
 }
 
 /*
  * Try to do a non-blocking upgrade from a shared lock to an exclusive lock.
  * This will only succeed if this thread holds a single shared lock.
  * Return 1 if the upgrade succeeds, 0 otherwise.
  */
 int
 sx_try_upgrade_(struct sx *sx, const char *file, int line)
 {
 	uintptr_t x;
 	int success;
 
 	/* With the scheduler stopped, report success without touching the lock. */
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line));
 	_sx_assert(sx, SA_SLOCKED, file, line);
 
 	/*
 	 * Try to switch from one shared lock to an exclusive lock.  We need
 	 * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that
 	 * we will wake up the exclusive waiters when we drop the lock.
 	 */
 	x = sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS;
 	/*
 	 * The expected value is exactly "one sharer" plus the preserved
 	 * waiters bit, so the swap fails if any other bit is set or if
 	 * there is more than one sharer.
 	 */
 	success = atomic_cmpset_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | x,
 	    (uintptr_t)curthread | x);
 	LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line);
 	if (success) {
 		WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
 		    file, line);
 		LOCKSTAT_RECORD0(LS_SX_TRYUPGRADE_UPGRADE, sx);
 	}
 	return (success);
 }
 
 /*
  * Downgrade an unrecursed exclusive lock into a single shared lock.
  */
 void
 sx_downgrade_(struct sx *sx, const char *file, int line)
 {
 	uintptr_t x;
 	int wakeup_swapper;
 
 	/* Nothing to do when the scheduler is stopped. */
 	if (SCHEDULER_STOPPED())
 		return;
 
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_downgrade() of destroyed sx @ %s:%d", file, line));
 	_sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line);
 #ifndef INVARIANTS
 	/* Without INVARIANTS the assertion above is a no-op; check by hand. */
 	if (sx_recursed(sx))
 		panic("downgrade of a recursed lock");
 #endif
 
 	WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line);
 
 	/*
 	 * Try to switch from an exclusive lock with no shared waiters
 	 * to one sharer with no shared waiters.  If there are
 	 * exclusive waiters, we don't need to lock the sleep queue so
 	 * long as we preserve the flag.  We do one quick try and if
 	 * that fails we grab the sleepq lock to keep the flags from
 	 * changing and do it the slow way.
 	 *
 	 * We have to lock the sleep queue if there are shared waiters
 	 * so we can wake them up.
 	 */
 	x = sx->sx_lock;
 	if (!(x & SX_LOCK_SHARED_WAITERS) &&
 	    atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) |
 	    (x & SX_LOCK_EXCLUSIVE_WAITERS))) {
 		LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
 		/* Fire the lockstat probe here too, as the slow path does. */
 		LOCKSTAT_RECORD0(LS_SX_DOWNGRADE_DOWNGRADE, sx);
 		return;
 	}
 
 	/*
 	 * Lock the sleep queue so we can read the waiters bits
 	 * without any races and wakeup any shared waiters.
 	 */
 	sleepq_lock(&sx->lock_object);
 
 	/*
 	 * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single
 	 * shared lock.  If there are any shared waiters, wake them up.
 	 */
 	wakeup_swapper = 0;
 	x = sx->sx_lock;
 	atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) |
 	    (x & SX_LOCK_EXCLUSIVE_WAITERS));
 	if (x & SX_LOCK_SHARED_WAITERS)
 		wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX,
 		    0, SQ_SHARED_QUEUE);
 	sleepq_release(&sx->lock_object);
 
 	LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
 	LOCKSTAT_RECORD0(LS_SX_DOWNGRADE_DOWNGRADE, sx);
 
 	/* Deferred until the sleep queue lock has been released. */
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * This function represents the so-called 'hard case' for sx_xlock
  * operation.  All 'easy case' failures are redirected to this.  Note
  * that ideally this would be a static function, but it needs to be
  * accessible from at least sx.h.
  */
 /*
  * Returns 0 once the exclusive lock is held (or immediately when the
  * scheduler is stopped).  A non-zero return is the error from
  * sleepq_wait_sig(), which is only used when opts has SX_INTERRUPTIBLE.
  */
 int
 _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
     int line)
 {
 	GIANT_DECLARE;
 #ifdef ADAPTIVE_SX
 	volatile struct thread *owner;
 	u_int i, spintries = 0;
 #endif
 	uintptr_t x;
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 	int error = 0;
 #ifdef	KDTRACE_HOOKS
 	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 #endif
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 
 	/* If we already hold an exclusive lock, then recurse. */
 	if (sx_xlocked(sx)) {
 		KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0,
 	    ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n",
 		    sx->lock_object.lo_name, file, line));
 		sx->sx_recurse++;
 		atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx);
 		return (0);
 	}
 
 	if (LOCK_LOG_TEST(&sx->lock_object, 0))
 		CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
 		    sx->lock_object.lo_name, (void *)sx->sx_lock, file, line);
 
 	/* Each iteration first retries the plain unlocked -> tid swap. */
 	while (!atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) {
 #ifdef KDTRACE_HOOKS
 		spin_cnt++;
 #endif
 #ifdef HWPMC_HOOKS
 		PMC_SOFT_CALL( , , lock, failed);
 #endif
 		lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
 		    &waittime);
 #ifdef ADAPTIVE_SX
 		/*
 		 * If the lock is write locked and the owner is
 		 * running on another CPU, spin until the owner stops
 		 * running or the state of the lock changes.
 		 */
 		x = sx->sx_lock;
 		if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
 			if ((x & SX_LOCK_SHARED) == 0) {
 				x = SX_OWNER(x);
 				owner = (struct thread *)x;
 				if (TD_IS_RUNNING(owner)) {
 					if (LOCK_LOG_TEST(&sx->lock_object, 0))
 						CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 						    __func__, sx, owner);
 					GIANT_SAVE();
 					while (SX_OWNER(sx->sx_lock) == x &&
 					    TD_IS_RUNNING(owner)) {
 						cpu_spinwait();
 #ifdef KDTRACE_HOOKS
 						spin_cnt++;
 #endif
 					}
 					continue;
 				}
 			} else if (SX_SHARERS(x) && spintries < asx_retries) {
 				/*
 				 * Share-held: spin for at most asx_loops
 				 * iterations, and do so at most asx_retries
 				 * times per call.
 				 */
 				GIANT_SAVE();
 				spintries++;
 				for (i = 0; i < asx_loops; i++) {
 					if (LOCK_LOG_TEST(&sx->lock_object, 0))
 						CTR4(KTR_LOCK,
 				    "%s: shared spinning on %p with %u and %u",
 						    __func__, sx, spintries, i);
 					x = sx->sx_lock;
 					if ((x & SX_LOCK_SHARED) == 0 ||
 					    SX_SHARERS(x) == 0)
 						break;
 					cpu_spinwait();
 #ifdef KDTRACE_HOOKS
 					spin_cnt++;
 #endif
 				}
 				/* The state changed before the budget ran out. */
 				if (i != asx_loops)
 					continue;
 			}
 		}
 #endif
 
 		sleepq_lock(&sx->lock_object);
 		x = sx->sx_lock;
 
 		/*
 		 * If the lock was released while spinning on the
 		 * sleep queue chain lock, try again.
 		 */
 		if (x == SX_LOCK_UNLOCKED) {
 			sleepq_release(&sx->lock_object);
 			continue;
 		}
 
 #ifdef ADAPTIVE_SX
 		/*
 		 * The current lock owner might have started executing
 		 * on another CPU (or the lock could have changed
 		 * owners) while we were waiting on the sleep queue
 		 * chain lock.  If so, drop the sleep queue lock and try
 		 * again.
 		 */
 		if (!(x & SX_LOCK_SHARED) &&
 		    (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
 			owner = (struct thread *)SX_OWNER(x);
 			if (TD_IS_RUNNING(owner)) {
 				sleepq_release(&sx->lock_object);
 				continue;
 			}
 		}
 #endif
 
 		/*
 		 * If an exclusive lock was released with both shared
 		 * and exclusive waiters and a shared waiter hasn't
 		 * woken up and acquired the lock yet, sx_lock will be
 		 * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS.
 		 * If we see that value, try to acquire it once.  Note
 		 * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS
 		 * as there are other exclusive waiters still.  If we
 		 * fail, restart the loop.
 		 */
 		if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) {
 			if (atomic_cmpset_acq_ptr(&sx->sx_lock,
 			    SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS,
 			    tid | SX_LOCK_EXCLUSIVE_WAITERS)) {
 				sleepq_release(&sx->lock_object);
 				CTR2(KTR_LOCK, "%s: %p claimed by new writer",
 				    __func__, sx);
 				break;
 			}
 			sleepq_release(&sx->lock_object);
 			continue;
 		}
 
 		/*
 		 * Try to set the SX_LOCK_EXCLUSIVE_WAITERS.  If we fail,
 		 * then loop back and retry.
 		 */
 		if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
 			if (!atomic_cmpset_ptr(&sx->sx_lock, x,
 			    x | SX_LOCK_EXCLUSIVE_WAITERS)) {
 				sleepq_release(&sx->lock_object);
 				continue;
 			}
 			if (LOCK_LOG_TEST(&sx->lock_object, 0))
 				CTR2(KTR_LOCK, "%s: %p set excl waiters flag",
 				    __func__, sx);
 		}
 
 		/*
 		 * Since we have been unable to acquire the exclusive
 		 * lock and the exclusive waiters flag is set, we have
 		 * to sleep.
 		 */
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
 			    __func__, sx);
 
 #ifdef KDTRACE_HOOKS
 		sleep_time -= lockstat_nsecs();
 #endif
 		GIANT_SAVE();
 		sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
 		    SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
 		    SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE);
 		if (!(opts & SX_INTERRUPTIBLE))
 			sleepq_wait(&sx->lock_object, 0);
 		else
 			error = sleepq_wait_sig(&sx->lock_object, 0);
 #ifdef KDTRACE_HOOKS
 		sleep_time += lockstat_nsecs();
 		sleep_cnt++;
 #endif
 		/* An interrupted sleep ends the attempt without the lock. */
 		if (error) {
 			if (LOCK_LOG_TEST(&sx->lock_object, 0))
 				CTR2(KTR_LOCK,
 			"%s: interruptible sleep by %p suspended by signal",
 				    __func__, sx);
 			break;
 		}
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
 			    __func__, sx);
 	}
 
 	GIANT_RESTORE();
 	if (!error)
 		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_XLOCK_ACQUIRE, sx,
 		    contested, waittime, file, line);
 #ifdef KDTRACE_HOOKS
 	if (sleep_time)
 		LOCKSTAT_RECORD1(LS_SX_XLOCK_BLOCK, sx, sleep_time);
 	if (spin_cnt > sleep_cnt)
 		LOCKSTAT_RECORD1(LS_SX_XLOCK_SPIN, sx, (spin_cnt - sleep_cnt));
 #endif
 	return (error);
 }
 
 /*
  * This function represents the so-called 'hard case' for sx_xunlock
  * operation.  All 'easy case' failures are redirected to this.  Note
  * that ideally this would be a static function, but it needs to be
  * accessible from at least sx.h.
  *
  * The function either drops a single level of recursion, or fully
  * releases the exclusive lock and wakes up every thread sleeping on one
  * of the two sleep queues (shared or exclusive).  The tid, file and line
  * arguments are not referenced by this body.
  */
 void
 _sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int line)
 {
 	uintptr_t x;
 	int queue, wakeup_swapper;
 
 	/* Nothing is done to the lock once the scheduler has been stopped. */
 	if (SCHEDULER_STOPPED())
 		return;
 
 	MPASS(!(sx->sx_lock & SX_LOCK_SHARED));
 
 	/* If the lock is recursed, then unrecurse one level. */
 	if (sx_xlocked(sx) && sx_recursed(sx)) {
 		/* Dropping the last recursion level clears the flag too. */
 		if ((--sx->sx_recurse) == 0)
 			atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx);
 		return;
 	}
 	/* Not recursed: we only get here when at least one waiter flag is set. */
 	MPASS(sx->sx_lock & (SX_LOCK_SHARED_WAITERS |
 	    SX_LOCK_EXCLUSIVE_WAITERS));
 	if (LOCK_LOG_TEST(&sx->lock_object, 0))
 		CTR2(KTR_LOCK, "%s: %p contested", __func__, sx);
 
 	sleepq_lock(&sx->lock_object);
 	/* x accumulates the lock word that will be stored on release. */
 	x = SX_LOCK_UNLOCKED;
 
 	/*
 	 * The wake up algorithm here is quite simple and probably not
 	 * ideal.  It gives precedence to shared waiters if they are
 	 * present.  For this condition, we have to preserve the
 	 * state of the exclusive waiters flag.
 	 * If interruptible sleeps left the shared queue empty avoid a
 	 * starvation for the threads sleeping on the exclusive queue by giving
 	 * them precedence and cleaning up the shared waiters bit anyway.
 	 */
 	if ((sx->sx_lock & SX_LOCK_SHARED_WAITERS) != 0 &&
 	    sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) {
 		queue = SQ_SHARED_QUEUE;
 		x |= (sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS);
 	} else
 		queue = SQ_EXCLUSIVE_QUEUE;
 
 	/* Wake up all the waiters for the specific queue. */
 	if (LOCK_LOG_TEST(&sx->lock_object, 0))
 		CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue",
 		    __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" :
 		    "exclusive");
 	/* The new lock word is stored before the sleepers are woken up. */
 	atomic_store_rel_ptr(&sx->sx_lock, x);
 	wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0,
 	    queue);
 	sleepq_release(&sx->lock_object);
 	/* kick_proc0() is called only after the sleep queue lock is dropped. */
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * This function represents the so-called 'hard case' for sx_slock
  * operation.  All 'easy case' failures are redirected to this.  Note
  * that ideally this would be a static function, but it needs to be
  * accessible from at least sx.h.
  *
  * The function loops until it manages to add itself as a sharer, or
  * until an interruptible sleep is aborted.  Each iteration either
  * retries the acquisition, spins while the exclusive owner is running
  * (ADAPTIVE_SX only), or goes to sleep on the shared sleep queue.
  *
  * opts: SX_INTERRUPTIBLE makes the sleep interruptible.
  * file, line: only handed to the lock profiling hook on success.
  *
  * Returns 0 when the shared lock was obtained (or the scheduler is
  * stopped), otherwise the error reported by sleepq_wait_sig().
  */
 int
 _sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
 {
 	GIANT_DECLARE;
 #ifdef ADAPTIVE_SX
 	volatile struct thread *owner;
 #endif
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 	uintptr_t x;
 	int error = 0;
 #ifdef KDTRACE_HOOKS
 	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 #endif
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 
 	/*
 	 * As with rwlocks, we don't make any attempt to try to block
 	 * shared locks once there is an exclusive waiter.
 	 */
 	for (;;) {
 #ifdef KDTRACE_HOOKS
 		/* Counts every pass through the loop, sleeping or not. */
 		spin_cnt++;
 #endif
 		x = sx->sx_lock;
 
 		/*
 		 * If no other thread has an exclusive lock then try to bump up
 		 * the count of sharers.  Since we have to preserve the state
 		 * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the
 		 * shared lock loop back and retry.
 		 */
 		if (x & SX_LOCK_SHARED) {
 			MPASS(!(x & SX_LOCK_SHARED_WAITERS));
 			if (atomic_cmpset_acq_ptr(&sx->sx_lock, x,
 			    x + SX_ONE_SHARER)) {
 				if (LOCK_LOG_TEST(&sx->lock_object, 0))
 					CTR4(KTR_LOCK,
 					    "%s: %p succeed %p -> %p", __func__,
 					    sx, (void *)x,
 					    (void *)(x + SX_ONE_SHARER));
 				break;
 			}
 			continue;
 		}
 #ifdef HWPMC_HOOKS
 		PMC_SOFT_CALL( , , lock, failed);
 #endif
 		lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
 		    &waittime);
 
 #ifdef ADAPTIVE_SX
 		/*
 		 * If the owner is running on another CPU, spin until
 		 * the owner stops running or the state of the lock
 		 * changes.
 		 */
 		if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
 			x = SX_OWNER(x);
 			owner = (struct thread *)x;
 			if (TD_IS_RUNNING(owner)) {
 				if (LOCK_LOG_TEST(&sx->lock_object, 0))
 					CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 					    __func__, sx, owner);
 				GIANT_SAVE();
 				while (SX_OWNER(sx->sx_lock) == x &&
 				    TD_IS_RUNNING(owner)) {
 #ifdef KDTRACE_HOOKS
 					spin_cnt++;
 #endif
 					cpu_spinwait();
 				}
 				continue;
 			}
 		}
 #endif
 
 		/*
 		 * Some other thread already has an exclusive lock, so
 		 * start the process of blocking.
 		 */
 		sleepq_lock(&sx->lock_object);
 		x = sx->sx_lock;
 
 		/*
 		 * The lock could have been released while we spun.
 		 * In this case loop back and retry.
 		 */
 		if (x & SX_LOCK_SHARED) {
 			sleepq_release(&sx->lock_object);
 			continue;
 		}
 
 #ifdef ADAPTIVE_SX
 		/*
 		 * If the owner is running on another CPU, spin until
 		 * the owner stops running or the state of the lock
 		 * changes.
 		 */
 		if (!(x & SX_LOCK_SHARED) &&
 		    (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
 			owner = (struct thread *)SX_OWNER(x);
 			if (TD_IS_RUNNING(owner)) {
 				sleepq_release(&sx->lock_object);
 				continue;
 			}
 		}
 #endif
 
 		/*
 		 * Try to set the SX_LOCK_SHARED_WAITERS flag.  If we
 		 * fail to set it drop the sleep queue lock and loop
 		 * back.
 		 */
 		if (!(x & SX_LOCK_SHARED_WAITERS)) {
 			if (!atomic_cmpset_ptr(&sx->sx_lock, x,
 			    x | SX_LOCK_SHARED_WAITERS)) {
 				sleepq_release(&sx->lock_object);
 				continue;
 			}
 			if (LOCK_LOG_TEST(&sx->lock_object, 0))
 				CTR2(KTR_LOCK, "%s: %p set shared waiters flag",
 				    __func__, sx);
 		}
 
 		/*
 		 * Since we have been unable to acquire the shared lock,
 		 * we have to sleep.
 		 */
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
 			    __func__, sx);
 
 #ifdef KDTRACE_HOOKS
 		/* Subtract now, add after waking: the sum is the time slept. */
 		sleep_time -= lockstat_nsecs();
 #endif
 		GIANT_SAVE();
 		sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
 		    SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
 		    SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE);
 		if (!(opts & SX_INTERRUPTIBLE))
 			sleepq_wait(&sx->lock_object, 0);
 		else
 			error = sleepq_wait_sig(&sx->lock_object, 0);
 #ifdef KDTRACE_HOOKS
 		sleep_time += lockstat_nsecs();
 		sleep_cnt++;
 #endif
 		if (error) {
 			if (LOCK_LOG_TEST(&sx->lock_object, 0))
 				CTR2(KTR_LOCK,
 			"%s: interruptible sleep by %p suspended by signal",
 				    __func__, sx);
 			break;
 		}
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
 			    __func__, sx);
 	}
 	if (error == 0)
 		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_SLOCK_ACQUIRE, sx,
 		    contested, waittime, file, line);
 #ifdef KDTRACE_HOOKS
 	/*
 	 * Report blocking and spinning against the shared-lock probes, in
 	 * line with the LS_SX_SLOCK_ACQUIRE event recorded just above.
 	 */
 	if (sleep_time)
 		LOCKSTAT_RECORD1(LS_SX_SLOCK_BLOCK, sx, sleep_time);
 	if (spin_cnt > sleep_cnt)
 		LOCKSTAT_RECORD1(LS_SX_SLOCK_SPIN, sx, (spin_cnt - sleep_cnt));
 #endif
 	GIANT_RESTORE();
 	return (error);
 }
 
 /*
  * This function represents the so-called 'hard case' for sx_sunlock
  * operation.  All 'easy case' failures are redirected to this.  Note
  * that ideally this would be a static function, but it needs to be
  * accessible from at least sx.h.
  *
  * The function loops until one of three compare-and-set operations
  * succeeds: dropping one sharer out of several, dropping the last
  * sharer with no exclusive waiters, or dropping the last sharer and
  * waking up the exclusive waiters.  The file and line arguments are
  * not referenced by this body.
  */
 void
 _sx_sunlock_hard(struct sx *sx, const char *file, int line)
 {
 	uintptr_t x;
 	int wakeup_swapper;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	for (;;) {
 		x = sx->sx_lock;
 
 		/*
 		 * We should never have waiting sharers while at least one
 		 * thread holds a shared lock.
 		 */
 		KASSERT(!(x & SX_LOCK_SHARED_WAITERS),
 		    ("%s: waiting sharers", __func__));
 
 		/*
 		 * See if there is more than one shared lock held.  If
 		 * so, just drop one and return.
 		 */
 		if (SX_SHARERS(x) > 1) {
 			if (atomic_cmpset_rel_ptr(&sx->sx_lock, x,
 			    x - SX_ONE_SHARER)) {
 				if (LOCK_LOG_TEST(&sx->lock_object, 0))
 					CTR4(KTR_LOCK,
 					    "%s: %p succeeded %p -> %p",
 					    __func__, sx, (void *)x,
 					    (void *)(x - SX_ONE_SHARER));
 				break;
 			}
 			continue;
 		}
 
 		/*
 		 * If there aren't any waiters for an exclusive lock,
 		 * then try to drop it quickly.
 		 */
 		if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
 			MPASS(x == SX_SHARERS_LOCK(1));
 			if (atomic_cmpset_rel_ptr(&sx->sx_lock,
 			    SX_SHARERS_LOCK(1), SX_LOCK_UNLOCKED)) {
 				if (LOCK_LOG_TEST(&sx->lock_object, 0))
 					CTR2(KTR_LOCK, "%s: %p last succeeded",
 					    __func__, sx);
 				break;
 			}
 			continue;
 		}
 
 		/*
 		 * At this point, there should just be one sharer with
 		 * exclusive waiters.
 		 */
 		MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS));
 
 		sleepq_lock(&sx->lock_object);
 
 		/*
 		 * Wake up semantic here is quite simple:
 		 * Just wake up all the exclusive waiters.
 		 * Note that the state of the lock could have changed,
 		 * so if it fails loop back and retry.
 		 */
 		if (!atomic_cmpset_rel_ptr(&sx->sx_lock,
 		    SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS,
 		    SX_LOCK_UNLOCKED)) {
 			sleepq_release(&sx->lock_object);
 			continue;
 		}
 		/*
 		 * The two adjacent literals used to concatenate into
 		 * "onexclusive queue"; keep the separating space.
 		 */
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p waking up all threads on "
 			    "exclusive queue", __func__, sx);
 		wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX,
 		    0, SQ_EXCLUSIVE_QUEUE);
 		sleepq_release(&sx->lock_object);
 		if (wakeup_swapper)
 			kick_proc0();
 		break;
 	}
 }
 
 #ifdef INVARIANT_SUPPORT
 #ifndef INVARIANTS
 #undef	_sx_assert
 #endif
 
 /*
  * In the non-WITNESS case, sx_assert() can only detect that at least
  * *some* thread owns an slock, but it cannot guarantee that *this*
  * thread owns an slock.
  *
  * Check the state of the sx lock against the assertion in 'what'
  * (an SA_* value, optionally combined with SA_RECURSED or
  * SA_NOTRECURSED) and panic, reporting file and line, when it does not
  * hold.  Nothing is checked once panicstr has been set.
  */
 void
 _sx_assert(const struct sx *sx, int what, const char *file, int line)
 {
 #ifndef WITNESS
 	/* Set when the caller asserts a shared (not just any) lock. */
 	int slocked = 0;
 #endif
 
 	if (panicstr != NULL)
 		return;
 	switch (what) {
 	case SA_SLOCKED:
 	case SA_SLOCKED | SA_NOTRECURSED:
 	case SA_SLOCKED | SA_RECURSED:
 #ifndef WITNESS
 		slocked = 1;
 		/* FALLTHROUGH */
 #endif
 	case SA_LOCKED:
 	case SA_LOCKED | SA_NOTRECURSED:
 	case SA_LOCKED | SA_RECURSED:
 #ifdef WITNESS
 		witness_assert(&sx->lock_object, what, file, line);
 #else
 		/*
 		 * If some other thread has an exclusive lock or we
 		 * have one and are asserting a shared lock, fail.
 		 * Also, if no one has a lock at all, fail.
 		 */
 		if (sx->sx_lock == SX_LOCK_UNLOCKED ||
 		    (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked ||
 		    sx_xholder(sx) != curthread)))
 			panic("Lock %s not %slocked @ %s:%d\n",
 			    sx->lock_object.lo_name, slocked ? "share " : "",
 			    file, line);
 
 		/* The recursion checks only apply to an exclusively held lock. */
 		if (!(sx->sx_lock & SX_LOCK_SHARED)) {
 			if (sx_recursed(sx)) {
 				if (what & SA_NOTRECURSED)
 					panic("Lock %s recursed @ %s:%d\n",
 					    sx->lock_object.lo_name, file,
 					    line);
 			} else if (what & SA_RECURSED)
 				panic("Lock %s not recursed @ %s:%d\n",
 				    sx->lock_object.lo_name, file, line);
 		}
 #endif
 		break;
 	case SA_XLOCKED:
 	case SA_XLOCKED | SA_NOTRECURSED:
 	case SA_XLOCKED | SA_RECURSED:
 		/* This case is checked the same way with or without WITNESS. */
 		if (sx_xholder(sx) != curthread)
 			panic("Lock %s not exclusively locked @ %s:%d\n",
 			    sx->lock_object.lo_name, file, line);
 		if (sx_recursed(sx)) {
 			if (what & SA_NOTRECURSED)
 				panic("Lock %s recursed @ %s:%d\n",
 				    sx->lock_object.lo_name, file, line);
 		} else if (what & SA_RECURSED)
 			panic("Lock %s not recursed @ %s:%d\n",
 			    sx->lock_object.lo_name, file, line);
 		break;
 	case SA_UNLOCKED:
 #ifdef WITNESS
 		witness_assert(&sx->lock_object, what, file, line);
 #else
 		/*
 		 * If we hold an exclusive lock fail.  We can't
 		 * reliably check to see if we hold a shared lock or
 		 * not.
 		 */
 		if (sx_xholder(sx) == curthread)
 			panic("Lock %s exclusively locked @ %s:%d\n",
 			    sx->lock_object.lo_name, file, line);
 #endif
 		break;
 	default:
 		panic("Unknown sx lock assertion: %d @ %s:%d", what, file,
 		    line);
 	}
 }
 #endif	/* INVARIANT_SUPPORT */
 
 #ifdef DDB
 static void
 db_show_sx(const struct lock_object *lock)
 {
 	struct thread *td;
 	const struct sx *sx;
 
 	sx = (const struct sx *)lock;
 
 	db_printf(" state: ");
 	if (sx->sx_lock == SX_LOCK_UNLOCKED)
 		db_printf("UNLOCKED\n");
 	else if (sx->sx_lock == SX_LOCK_DESTROYED) {
 		db_printf("DESTROYED\n");
 		return;
 	} else if (sx->sx_lock & SX_LOCK_SHARED)
 		db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock));
 	else {
 		td = sx_xholder(sx);
 		db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
 		    td->td_tid, td->td_proc->p_pid, td->td_name);
 		if (sx_recursed(sx))
 			db_printf(" recursed: %d\n", sx->sx_recurse);
 	}
 
 	db_printf(" waiters: ");
 	switch(sx->sx_lock &
 	    (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) {
 	case SX_LOCK_SHARED_WAITERS:
 		db_printf("shared\n");
 		break;
 	case SX_LOCK_EXCLUSIVE_WAITERS:
 		db_printf("exclusive\n");
 		break;
 	case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS:
 		db_printf("exclusive and shared\n");
 		break;
 	default:
 		db_printf("none\n");
 	}
 }
 
 /*
  * Decide whether the wait channel of a thread blocked on a sleep queue
  * is an sx lock.  When it is, print a short description, store the
  * result of sx_xholder() in *ownerp and return 1; otherwise return 0
  * and leave *ownerp untouched.
  */
 int
 sx_chain(struct thread *td, struct thread **ownerp)
 {
 	struct sx *lk;
 
 	/*
 	 * The wait channel is treated as an sx lock only when both its
 	 * lock class and its name (compared by pointer against the wait
 	 * message) match.
 	 */
 	lk = td->td_wchan;
 	if (LOCK_CLASS(&lk->lock_object) != &lock_class_sx)
 		return (0);
 	if (lk->lock_object.lo_name != td->td_wmesg)
 		return (0);
 
 	/* It looks like an sx lock: report it and hand back the owner. */
 	db_printf("blocked on sx \"%s\" ", td->td_wmesg);
 	*ownerp = sx_xholder(lk);
 	if ((lk->sx_lock & SX_LOCK_SHARED) == 0)
 		db_printf("XLOCK\n");
 	else
 		db_printf("SLOCK (count %ju)\n",
 		    (uintmax_t)SX_SHARERS(lk->sx_lock));
 	return (1);
 }
 #endif
Index: head/sys/kern/kern_synch.c
===================================================================
--- head/sys/kern/kern_synch.c	(revision 258540)
+++ head/sys/kern/kern_synch.c	(revision 258541)
@@ -1,633 +1,632 @@
 /*-
  * Copyright (c) 1982, 1986, 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_ktrace.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/condvar.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vmmeter.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
 #endif
 
 #include <machine/cpu.h>
 
 #ifdef XEN
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #endif
 
 /*
  * Map the inhibitor bits of a thread to a short state name.  The bits
  * are tested in the order listed, so the first one found set wins, and
  * "yielding" is the result when none of them is set.
  */
 #define	KTDSTATE(td)							\
 	(((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep"  :		\
 	((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" :	\
 	((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" :		\
 	((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" :		\
 	((td)->td_inhibitors & TDI_IWAIT) != 0 ? "iwait" : "yielding")
 
 /* synch_setup() is registered to run at SI_SUB_KICK_SCHEDULER. */
 static void synch_setup(void *dummy);
 SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup,
     NULL);
 
 /* Threshold, in ticks, that should_yield() compares against. */
 int	hogticks;
 /* One wait channel per CPU; pause_sbt() sleeps on the current CPU's. */
 static uint8_t pause_wchan[MAXCPU];
 
 /* Callout that loadav() re-arms on every run. */
 static struct callout loadav_callout;
 
 struct loadavg averunnable =
 	{ {0, 0, 0}, FSCALE };	/* load average, of runnable procs */
 /*
  * Constants for averages over 1, 5, and 15 minutes
  * when sampling at 5 second intervals.
  */
 static fixpt_t cexp[3] = {
 	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
 	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
 	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
 };
 
 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
 static int      fscale __unused = FSCALE;
 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
 
 static void	loadav(void *arg);
 
 SDT_PROVIDER_DECLARE(sched);
 SDT_PROBE_DEFINE(sched, , , preempt, preempt);
 
 /*
  * These probes reference Solaris features that are not implemented in FreeBSD.
  * Create the probes anyway for compatibility with existing D scripts; they'll
  * just never fire.
  */
 SDT_PROBE_DEFINE(sched, , , cpucaps_sleep, cpucaps-sleep);
 SDT_PROBE_DEFINE(sched, , , cpucaps_wakeup, cpucaps-wakeup);
 SDT_PROBE_DEFINE(sched, , , schedctl_nopreempt, schedctl-nopreempt);
 SDT_PROBE_DEFINE(sched, , , schedctl_preempt, schedctl-preempt);
 SDT_PROBE_DEFINE(sched, , , schedctl_yield, schedctl-yield);
 
 /*
  * Set the default value of hogticks and initialize the sleep queues.
  * The argument is unused.
  */
 static void
 sleepinit(void *unused)
 {
 
 	hogticks = (hz / 10) * 2;	/* Default only. */
 	init_sleepqueues();
 }
 
 /*
  * vmem tries to lock the sleepq mutexes when free'ing kva, so make sure
  * it is available.
  */
 SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, 0);
 
 /*
  * General sleep call.  Suspends the current thread until a wakeup is
  * performed on the specified identifier.  The thread will then be made
  * runnable with the specified priority.  Sleeps at most sbt units of time
  * (0 means no timeout).  If pri includes the PCATCH flag, let signals
  * interrupt the sleep, otherwise ignore them while sleeping.  Returns 0 if
  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
  * signal becomes pending, ERESTART is returned if the current system
  * call should be restarted if possible, and EINTR is returned if the system
  * call should be interrupted by the signal (return EINTR).
  *
  * The lock argument is unlocked before the caller is suspended, and
  * re-locked before _sleep() returns.  If priority includes the PDROP
  * flag the lock is not re-locked before returning.
  *
  * While 'cold' is set or the scheduler is stopped no sleep takes place:
  * the lock is released if PDROP was requested and 0 is returned.
  */
 int
 _sleep(void *ident, struct lock_object *lock, int priority,
     const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
 {
 	struct thread *td;
 	struct proc *p;
 	struct lock_class *class;
 	uintptr_t lock_state;
 	int catch, pri, rval, sleepq_flags;
 	WITNESS_SAVE_DECL(lock_witness);
 
 	td = curthread;
 	p = td->td_proc;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_CSW))
 		ktrcsw(1, 0, wmesg);
 #endif
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
 	    "Sleeping on \"%s\"", wmesg);
 	KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL,
 	    ("sleeping without a lock"));
 	KASSERT(p != NULL, ("msleep1"));
 	KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
 	if (priority & PDROP)
 		KASSERT(lock != NULL && lock != &Giant.lock_object,
 		    ("PDROP requires a non-Giant lock"));
 	/* class stays NULL when the caller passed no lock. */
 	if (lock != NULL)
 		class = LOCK_CLASS(lock);
 	else
 		class = NULL;
 
 	if (cold || SCHEDULER_STOPPED()) {
 		/*
 		 * During autoconfiguration, just return;
 		 * don't run any other threads or panic below,
 		 * in case this is the idle thread and already asleep.
 		 * XXX: this used to do "s = splhigh(); splx(safepri);
 		 * splx(s);" to give interrupts a chance, but there is
 		 * no way to give interrupts a chance now.
 		 */
 		if (lock != NULL && priority & PDROP)
 			class->lc_unlock(lock);
 		return (0);
 	}
 	/* Split the priority argument into the PCATCH flag and the priority. */
 	catch = priority & PCATCH;
 	pri = priority & PRIMASK;
 
 	/*
 	 * If we are already on a sleep queue, then remove us from that
 	 * sleep queue first.  We have to do this to handle recursive
 	 * sleeps.
 	 */
 	if (TD_ON_SLEEPQ(td))
 		sleepq_remove(td, td->td_wchan);
 
 	/* A wait channel inside pause_wchan[] marks a pause-style sleep. */
 	if ((uint8_t *)ident >= &pause_wchan[0] &&
 	    (uint8_t *)ident <= &pause_wchan[MAXCPU - 1])
 		sleepq_flags = SLEEPQ_PAUSE;
 	else
 		sleepq_flags = SLEEPQ_SLEEP;
 	if (catch)
 		sleepq_flags |= SLEEPQ_INTERRUPTIBLE;
 
 	sleepq_lock(ident);
 	CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)",
 	    td->td_tid, p->p_pid, td->td_name, wmesg, ident);
 
 	if (lock == &Giant.lock_object)
 		mtx_assert(&Giant, MA_OWNED);
 	DROP_GIANT();
 	/*
 	 * A lock whose class is not LC_SLEEPABLE is released here, before
 	 * the thread is put on the sleep queue; LC_SLEEPABLE locks are
 	 * released further down, after sleepq_add().
 	 */
 	if (lock != NULL && lock != &Giant.lock_object &&
 	    !(class->lc_flags & LC_SLEEPABLE)) {
 		WITNESS_SAVE(lock, lock_witness);
 		lock_state = class->lc_unlock(lock);
 	} else
 		/* GCC needs to follow the Yellow Brick Road */
 		lock_state = -1;
 
 	/*
 	 * We put ourselves on the sleep queue and start our timeout
 	 * before calling thread_suspend_check, as we could stop there,
 	 * and a wakeup or a SIGCONT (or both) could occur while we were
 	 * stopped without resuming us.  Thus, we must be ready for sleep
 	 * when cursig() is called.  If the wakeup happens while we're
 	 * stopped, then td will no longer be on a sleep queue upon
 	 * return from cursig().
 	 */
 	sleepq_add(ident, lock, wmesg, sleepq_flags, 0);
 	if (sbt != 0)
 		sleepq_set_timeout_sbt(ident, sbt, pr, flags);
 	/* The sleep queue lock is dropped around the unlock of the lock. */
 	if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
 		sleepq_release(ident);
 		WITNESS_SAVE(lock, lock_witness);
 		lock_state = class->lc_unlock(lock);
 		sleepq_lock(ident);
 	}
 	/* Pick the wait primitive matching the timeout and PCATCH settings. */
 	if (sbt != 0 && catch)
 		rval = sleepq_timedwait_sig(ident, pri);
 	else if (sbt != 0)
 		rval = sleepq_timedwait(ident, pri);
 	else if (catch)
 		rval = sleepq_wait_sig(ident, pri);
 	else {
 		sleepq_wait(ident, pri);
 		rval = 0;
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_CSW))
 		ktrcsw(0, 0, wmesg);
 #endif
 	PICKUP_GIANT();
 	/* Re-acquire the lock in its saved state unless PDROP was given. */
 	if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) {
 		class->lc_lock(lock, lock_state);
 		WITNESS_RESTORE(lock, lock_witness);
 	}
 	return (rval);
 }
 
 /*
  * Sleep on ident while holding the spin mutex mtx.  The mutex is
  * released with mtx_unlock_spin() before the thread is put on the sleep
  * queue and taken again with mtx_lock_spin() before returning.  A
  * non-zero sbt arms a timeout (pr and flags are handed to
  * sleepq_set_timeout_sbt()).  Returns the value of sleepq_timedwait()
  * when a timeout was set, and 0 otherwise.  Returns 0 immediately,
  * without touching the mutex, while 'cold' is set or the scheduler is
  * stopped.
  */
 int
 msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg,
     sbintime_t sbt, sbintime_t pr, int flags)
 {
 	struct thread *td;
 	struct proc *p;
 	int rval;
 	WITNESS_SAVE_DECL(mtx);
 
 	td = curthread;
 	p = td->td_proc;
 	KASSERT(mtx != NULL, ("sleeping without a mutex"));
 	KASSERT(p != NULL, ("msleep1"));
 	KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
 
 	if (cold || SCHEDULER_STOPPED()) {
 		/*
 		 * During autoconfiguration, just return;
 		 * don't run any other threads or panic below,
 		 * in case this is the idle thread and already asleep.
 		 * XXX: this used to do "s = splhigh(); splx(safepri);
 		 * splx(s);" to give interrupts a chance, but there is
 		 * no way to give interrupts a chance now.
 		 */
 		return (0);
 	}
 
 	sleepq_lock(ident);
 	CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
 	    td->td_tid, p->p_pid, td->td_name, wmesg, ident);
 
 	DROP_GIANT();
 	/* The mutex must be held exactly once by the caller. */
 	mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
 	WITNESS_SAVE(&mtx->lock_object, mtx);
 	mtx_unlock_spin(mtx);
 
 	/*
 	 * We put ourselves on the sleep queue and start our timeout.
 	 */
 	sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
 	if (sbt != 0)
 		sleepq_set_timeout_sbt(ident, sbt, pr, flags);
 
 	/*
 	 * Can't call ktrace with any spin locks held so it can lock the
 	 * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold
 	 * any spin lock.  Thus, we have to drop the sleepq spin lock while
 	 * we handle those requests.  This is safe since we have placed our
 	 * thread on the sleep queue already.
 	 */
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_CSW)) {
 		sleepq_release(ident);
 		ktrcsw(1, 0, wmesg);
 		sleepq_lock(ident);
 	}
 #endif
 #ifdef WITNESS
 	sleepq_release(ident);
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"",
 	    wmesg);
 	sleepq_lock(ident);
 #endif
 	/* Signals never interrupt this sleep; only the timeout may end it. */
 	if (sbt != 0)
 		rval = sleepq_timedwait(ident, 0);
 	else {
 		sleepq_wait(ident, 0);
 		rval = 0;
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_CSW))
 		ktrcsw(0, 0, wmesg);
 #endif
 	PICKUP_GIANT();
 	mtx_lock_spin(mtx);
 	WITNESS_RESTORE(&mtx->lock_object, mtx);
 	return (rval);
 }
 
 /*
  * pause_sbt() delays the calling thread for the time given by "sbt".
  * The "sbt" argument must be greater than or equal to zero; a value of
  * zero is treated as a single tick (tick_sbt).  Once the system is no
  * longer cold the delay is a sleep on the per-CPU pause wait channel and
  * the result of _sleep() is returned.  While the system is cold the
  * delay is a busy wait built on DELAY() and the function returns 0.
  */
 int
 pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
 {
 	KASSERT(sbt >= 0, ("pause: timeout must be >= 0"));
 
 	/* A zero timeout is silently promoted to one tick. */
 	if (sbt == 0)
 		sbt = tick_sbt;
 
 	if (!cold)
 		return (_sleep(&pause_wchan[curcpu], NULL, 0, wmesg, sbt, pr,
 		    flags));
 
 	/*
 	 * Burn whole seconds one at a time so that the argument handed
 	 * to the system specific DELAY() function(s) cannot overflow.
 	 */
 	for (; sbt >= SBT_1S; sbt -= SBT_1S)
 		DELAY(1000000);
 
 	/* Round what is left up to microseconds and delay for that long. */
 	sbt = (sbt + SBT_1US - 1) / SBT_1US;
 	if (sbt > 0)
 		DELAY(sbt);
 	return (0);
 }
 
 /*
  * Wake up every thread sleeping on the given identifier.
  */
 void
 wakeup(void *ident)
 {
 	int need_kick;
 
 	sleepq_lock(ident);
 	need_kick = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
 	sleepq_release(ident);
 
 	/* Nothing more to do unless the broadcast asked for the swapper. */
 	if (!need_kick)
 		return;
 	KASSERT(ident != &proc0,
 	    ("wakeup and wakeup_swapper and proc0"));
 	kick_proc0();
 }
 
 /*
  * Wake up one thread sleeping on the given identifier.  More than one
  * thread may end up being woken if a target thread is currently
  * swapped out.
  */
 void
 wakeup_one(void *ident)
 {
 	int need_kick;
 
 	sleepq_lock(ident);
 	need_kick = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0);
 	sleepq_release(ident);
 
 	/* Nothing more to do unless the signal asked for the swapper. */
 	if (!need_kick)
 		return;
 	kick_proc0();
 }
 
 /*
  * Called by mi_switch() when a context switch is attempted while the
  * debugger is active: release the current thread's lock, print a
  * backtrace and re-enter the debugger.  Panics if kdb_reenter()
  * returns.
  */
 static void
 kdb_switch(void)
 {
 	thread_unlock(curthread);
 	kdb_backtrace();
 	kdb_reenter();
 	panic("%s: did not reenter debugger", __func__);
 }
 
 /*
  * The machine independent parts of context switching.
  *
  * flags must contain SW_VOL or SW_INVOL (asserted below); newtd is
  * handed to sched_switch() and must not be the current thread.  The
  * caller must hold the current thread's lock, unrecursed.  No switch is
  * performed while the debugger is active or the scheduler is stopped.
  */
 void
 mi_switch(int flags, struct thread *newtd)
 {
 	uint64_t runtime, new_switchtime;
 	struct thread *td;
 	struct proc *p;
 
 	td = curthread;			/* XXX */
 	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
 	p = td->td_proc;		/* XXX */
 	KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
 #ifdef INVARIANTS
 	if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
 		mtx_assert(&Giant, MA_NOTOWNED);
 #endif
 	KASSERT(td->td_critnest == 1 || panicstr,
 	    ("mi_switch: switch in a critical section"));
 	KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
 	    ("mi_switch: switch must be voluntary or involuntary"));
 	KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself"));
 
 	/*
 	 * Don't perform context switches from the debugger.
 	 */
 	if (kdb_active)
 		kdb_switch();
 	if (SCHEDULER_STOPPED())
 		return;
 	/* Account the switch as voluntary or involuntary. */
 	if (flags & SW_VOL) {
 		td->td_ru.ru_nvcsw++;
 		/* should_yield() measures elapsed ticks from this value. */
 		td->td_swvoltick = ticks;
 	} else
 		td->td_ru.ru_nivcsw++;
 #ifdef SCHED_STATS
 	SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
 #endif
 	/*
 	 * Compute the amount of time during which the current
 	 * thread was running, and add that to its total so far.
 	 */
 	new_switchtime = cpu_ticks();
 	runtime = new_switchtime - PCPU_GET(switchtime);
 	td->td_runtime += runtime;
 	td->td_incruntime += runtime;
 	PCPU_SET(switchtime, new_switchtime);
 	td->td_generation++;	/* bump preempt-detect counter */
 	PCPU_INC(cnt.v_swtch);
 	PCPU_SET(switchticks, ticks);
 	CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)",
 	    td->td_tid, td->td_sched, p->p_pid, td->td_name);
 #if (KTR_COMPILE & KTR_SCHED) != 0
 	if (TD_IS_IDLETHREAD(td))
 		KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
 		    "prio:%d", td->td_priority);
 	else
 		KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
 		    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
 		    "lockname:\"%s\"", td->td_lockname);
 #endif
 	SDT_PROBE0(sched, , , preempt);
 #ifdef XEN
 	PT_UPDATES_FLUSH();
 #endif
 	/* Hand control to the scheduler for the actual switch. */
 	sched_switch(td, newtd, flags);
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
 	    "prio:%d", td->td_priority);
 
 	CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)",
 	    td->td_tid, td->td_sched, p->p_pid, td->td_name);
 
 	/*
 	 * If the last thread was exiting, finish cleaning it up.
 	 */
 	if ((td = PCPU_GET(deadthread))) {
 		PCPU_SET(deadthread, NULL);
 		thread_stash(td);
 	}
 }
 
 /*
  * Make a thread runnable.  A thread that is in memory is handed to
  * sched_wakeup(); for a thread that is swapped out a swap-in is
  * requested instead, and 1 is returned the first time so that the
  * caller knows to awaken the swapper.  Returns 0 in every other case.
  * The caller must hold the thread lock.
  */
 int
 setrunnable(struct thread *td)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(td->td_proc->p_state != PRS_ZOMBIE,
 	    ("setrunnable: pid %d is a zombie", td->td_proc->p_pid));
 	switch (td->td_state) {
 	case TDS_RUNNING:
 	case TDS_RUNQ:
 		return (0);
 	case TDS_INHIBITED:
 		/*
 		 * Only a thread whose sole inhibitor is being swapped out
 		 * is carried on to the swap-in logic below; any other
 		 * inhibited thread is left alone.
 		 */
 		if (td->td_inhibitors == TDI_SWAPPED)
 			break;
 		return (0);
 	case TDS_CAN_RUN:
 		break;
 	default:
 		printf("state is 0x%x", td->td_state);
 		panic("setrunnable(2)");
 	}
 
 	/* In memory: let the scheduler put the thread on a run queue. */
 	if ((td->td_flags & TDF_INMEM) != 0) {
 		sched_wakeup(td);
 		return (0);
 	}
 
 	/* Swapped out: request the swap-in once and report it. */
 	if ((td->td_flags & TDF_SWAPINREQ) != 0)
 		return (0);
 	td->td_flags |= TDF_SWAPINREQ;
 	return (1);
 }
 
 /*
  * Compute a tenex style load average of a quantity on
  * 1, 5 and 15 minute intervals.
  */
 static void
 loadav(void *arg)
 {
 	int i, nrun;
 	struct loadavg *avg;
 
 	nrun = sched_load();
 	avg = &averunnable;
 
 	for (i = 0; i < 3; i++)
 		avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
 		    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
 
 	/*
 	 * Schedule the next update to occur after 5 seconds, but add a
 	 * random variation to avoid synchronisation with processes that
 	 * run at regular intervals.
 	 */
 	callout_reset_sbt(&loadav_callout,
 	    SBT_1US * (4000000 + (int)(random() % 2000001)), SBT_1US,
 	    loadav, NULL, C_DIRECT_EXEC | C_PREL(32));
 }
 
 /*
  * Initialize the load average callout and start the periodic load
  * average updates.  The argument is unused.
  */
 /* ARGSUSED */
 static void
 synch_setup(void *dummy)
 {
 	/* The callout must be initialized before loadav() re-arms it. */
 	callout_init(&loadav_callout, CALLOUT_MPSAFE);
 
 	/* Kick off timeout driven events by calling first time. */
 	loadav(NULL);
 }
 
 /*
  * Return non-zero when at least hogticks ticks have elapsed since the
  * td_swvoltick value recorded for the current thread.
  */
 int
 should_yield(void)
 {
 	unsigned int elapsed;
 
 	elapsed = (unsigned int)(ticks - curthread->td_swvoltick);
 	return (elapsed >= hogticks);
 }
 
 /*
  * Yield the CPU at user priority, but only when should_yield() says
  * so.
  */
 void
 maybe_yield(void)
 {
 
 	if (!should_yield())
 		return;
 	kern_yield(PRI_USER);
 }
 
 /*
  * Voluntarily give up the CPU.  PRI_USER selects the thread's user
  * priority (td_user_pri); the thread's priority is changed only when the
  * resulting value is not negative.  Giant is dropped for the duration
  * of the switch.
  */
 void
 kern_yield(int prio)
 {
 	struct thread *td;
 
 	td = curthread;
 	DROP_GIANT();
 	thread_lock(td);
 	if (prio == PRI_USER)
 		prio = td->td_user_pri;
 	/* A negative priority leaves the current priority untouched. */
 	if (prio >= 0)
 		sched_prio(td, prio);
 	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
 	thread_unlock(td);
 	PICKUP_GIANT();
 }
 
 /*
  * General purpose yield system call.
  *
  * A thread in the timeshare class has its priority set to
  * PRI_MAX_TIMESHARE before the voluntary switch.  The system call
  * return value is set to 0 and the function always returns 0; uap is
  * not referenced.
  */
 int
 sys_yield(struct thread *td, struct yield_args *uap)
 {
 
 	thread_lock(td);
 	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
 		sched_prio(td, PRI_MAX_TIMESHARE);
 	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
 	thread_unlock(td);
 	td->td_retval[0] = 0;
 	return (0);
 }
Index: head/sys/kern/kern_thread.c
===================================================================
--- head/sys/kern/kern_thread.c	(revision 258540)
+++ head/sys/kern/kern_thread.c	(revision 258541)
@@ -1,1054 +1,1053 @@
 /*-
  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
  *  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified other than the possible
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  */
 
 #include "opt_witness.h"
-#include "opt_kdtrace.h"
 #include "opt_hwpmc_hooks.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rangelock.h>
 #include <sys/resourcevar.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/selinfo.h>
 #include <sys/turnstile.h>
 #include <sys/ktr.h>
 #include <sys/rwlock.h>
 #include <sys/umtx.h>
 #include <sys/cpuset.h>
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #include <security/audit/audit.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <sys/eventhandler.h>
 
 /* SDT: declare the "proc" provider and define its lwp-exit probe. */
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE(proc, , , lwp_exit, lwp-exit);
 
 
 /*
  * thread related storage.
  */
 /* UMA zone that struct thread is allocated from; created in threadinit(). */
 static uma_zone_t thread_zone;
 
 /*
  * Exited threads waiting to be freed.  thread_zombie() adds to the list
  * and thread_reap() drains it; updates are made under zombie_lock.
  */
 TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
 static struct mtx zombie_lock;
 MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN);
 
 static void thread_zombie(struct thread *);
 
 /* Number of slots in the ring of recently freed thread IDs. */
 #define TID_BUFFER_SIZE	1024
 
 /*
  * Thread ID allocation state.  IDs come from tid_unrhdr; freed IDs are
  * parked in the tid_buffer ring (tid_head is the oldest entry, tid_tail
  * the next free slot) and only go back to tid_unrhdr when the ring
  * overflows.  tid_lock guards the ring and is also handed to
  * new_unrhdr() in threadinit().
  */
 struct mtx tid_lock;
 static struct unrhdr *tid_unrhdr;
 static lwpid_t tid_buffer[TID_BUFFER_SIZE];
 static int tid_head, tid_tail;
 static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash");
 
 /*
  * Hash table used by tdfind() to look threads up by ID.  The table and
  * tidhash are filled in by hashinit() in threadinit(); tidhash_lock
  * protects the chains.
  */
 struct	tidhashhead *tidhashtbl;
 u_long	tidhash;
 struct	rwlock tidhash_lock;
 
 /*
  * Hand out a thread ID.  The unit-number allocator is tried first; if
  * it has nothing left, the oldest ID parked in the tid_buffer ring is
  * reused.  Returns -1 when both sources are empty.
  */
 static lwpid_t
 tid_alloc(void)
 {
 	lwpid_t fresh, recycled;
 
 	fresh = alloc_unr(tid_unrhdr);
 	if (fresh != -1)
 		return (fresh);
 
 	recycled = -1;
 	mtx_lock(&tid_lock);
 	if (tid_head != tid_tail) {
 		/* Ring is not empty: pop its oldest entry. */
 		recycled = tid_buffer[tid_head];
 		tid_head = (tid_head + 1) % TID_BUFFER_SIZE;
 	}
 	mtx_unlock(&tid_lock);
 	return (recycled);
 }
 
 /*
  * Release a thread ID.  The ID is parked at the tail of the tid_buffer
  * ring; if the ring is already full, its oldest entry is evicted and
  * returned to the unit-number allocator once tid_lock has been dropped.
  */
 static void
 tid_free(lwpid_t tid)
 {
 	lwpid_t evicted;
 	int next_tail;
 
 	evicted = -1;
 	mtx_lock(&tid_lock);
 	next_tail = (tid_tail + 1) % TID_BUFFER_SIZE;
 	if (next_tail == tid_head) {
 		/* Full ring: make room by dropping the oldest entry. */
 		evicted = tid_buffer[tid_head];
 		tid_head = (tid_head + 1) % TID_BUFFER_SIZE;
 	}
 	tid_buffer[tid_tail] = tid;
 	tid_tail = next_tail;
 	mtx_unlock(&tid_lock);
 
 	if (evicted != -1)
 		free_unr(tid_unrhdr, evicted);
 }
 
 /*
  * Zone constructor: prepare a thread for use.  Always returns 0.
  */
 static int
 thread_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct thread *newtd = mem;
 
 	newtd->td_state = TDS_INACTIVE;
 	newtd->td_oncpu = NOCPU;
 	newtd->td_tid = tid_alloc();
 
 	/*
 	 * td_critnest starts out at 1: the thread is not running, so it is
 	 * implicitly waiting to be on the receiving end of a context switch.
 	 */
 	newtd->td_critnest = 1;
 	newtd->td_lend_user_pri = PRI_MAX;
 
 	EVENTHANDLER_INVOKE(thread_ctor, newtd);
 #ifdef AUDIT
 	audit_thread_alloc(newtd);
 #endif
 	umtx_thread_alloc(newtd);
 	return (0);
 }
 
 /*
  * Zone destructor: reclaim a thread after use.
  */
 static void
 thread_dtor(void *mem, int size, void *arg)
 {
 	struct thread *dead = mem;
 
 #ifdef INVARIANTS
 	/* Only an inactive thread is in a safe state to free. */
 	switch (dead->td_state) {
 	case TDS_INACTIVE:
 		break;
 	case TDS_INHIBITED:
 	case TDS_RUNNING:
 	case TDS_CAN_RUN:
 	case TDS_RUNQ:
 		/*
 		 * A thread in any of these states is currently active
 		 * and must never be unlinked.
 		 */
 		panic("bad state for thread unlinking");
 		/* NOTREACHED */
 	default:
 		panic("bad thread state");
 		/* NOTREACHED */
 	}
 #endif
 #ifdef AUDIT
 	audit_thread_free(dead);
 #endif
 	/* Release every OSD slot attached to this thread. */
 	osd_thread_exit(dead);
 
 	EVENTHANDLER_INVOKE(thread_dtor, dead);
 	tid_free(dead->td_tid);
 }
 
 /*
  * Zone init: set up the type-stable parts of a newly created thread.
  * Always returns 0.
  */
 static int
 thread_init(void *mem, int size, int flags)
 {
 	struct thread *newtd = mem;
 
 	newtd->td_sleepqueue = sleepq_alloc();
 	newtd->td_turnstile = turnstile_alloc();
 	newtd->td_rlqe = NULL;
 	EVENTHANDLER_INVOKE(thread_init, newtd);
 	/* The scheduler's per-thread data sits right behind struct thread. */
 	newtd->td_sched = (struct td_sched *)(newtd + 1);
 	umtx_thread_init(newtd);
 	newtd->td_kstack = 0;
 	return (0);
 }
 
 /*
  * Zone fini: tear down the type-stable parts of a thread just before
  * its memory is discarded.
  */
 static void
 thread_fini(void *mem, int size)
 {
 	struct thread *gone = mem;
 
 	EVENTHANDLER_INVOKE(thread_fini, gone);
 	rlqentry_free(gone->td_rlqe);
 	turnstile_free(gone->td_turnstile);
 	sleepq_free(gone->td_sleepqueue);
 	umtx_thread_fini(gone);
 	seltdfini(gone);
 }
 
 /*
  * Link up the structures of a newly created process and its initial
  * thread, initializing the process' thread list first.
  * Called from:
  * {arch}/{arch}/machdep.c   ia64_init(), init386() etc.
  * proc_dtor() (should go away)
  * proc_init()
  */
 void
 proc_linkup0(struct proc *p, struct thread *td)
 {
 
 	/* The list of all threads in the process starts out empty. */
 	TAILQ_INIT(&p->p_threads);
 	proc_linkup(p, td);
 }
 
 /*
  * Initialize a process' signal queue, ksiginfo and message queue
  * notifier list, then attach td as its first thread.
  */
 void
 proc_linkup(struct proc *p, struct thread *td)
 {
 
 	sigqueue_init(&p->p_sigqueue, p);
 
 	/* XXX p_ksi may be null if ksiginfo zone is not ready */
 	p->p_ksi = ksiginfo_alloc(1);
 	if (p->p_ksi != NULL)
 		p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
 
 	LIST_INIT(&p->p_mqnotifier);
 	p->p_numthreads = 0;
 	thread_link(td, p);
 }
 
 /*
  * Set up the global thread allocation resources: the TID lock and
  * allocator, the thread zone, and the TID hash table with its lock.
  */
 void
 threadinit(void)
 {
 
 	mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF);
 
 	/*
 	 * Thread IDs run from PID_MAX + 2 up to INT_MAX: pid_max cannot
 	 * be greater than PID_MAX, and one number is left for thread0.
 	 */
 	tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock);
 
 	/* NOTE(review): 16 - 1 looks like an alignment mask; confirm. */
 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
 	    thread_ctor, thread_dtor, thread_init, thread_fini,
 	    16 - 1, 0);
 
 	tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash);
 	rw_init(&tidhash_lock, "tidhash");
 }
 
 /*
  * Queue an unused thread on the zombie list.  The td_slpq linkage is
  * borrowed for this, as it must be unused by now.
  */
 void
 thread_zombie(struct thread *td)
 {
 
 	mtx_lock_spin(&zombie_lock);
 	TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq);
 	mtx_unlock_spin(&zombie_lock);
 }
 
 /*
  * Release a thread that has exited after cpu_throw(): drop its count
  * from the process' p_exitthreads and queue it as a zombie.
  */
 void
 thread_stash(struct thread *td)
 {
 
 	atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
 	thread_zombie(td);
 }
 
 /*
  * Reap zombie resources: detach the whole zombie list and free every
  * thread on it, together with any credential it still references.
  */
 void
 thread_reap(void)
 {
 	struct thread *cur, *next;
 
 	/*
 	 * Don't even bother to lock if there are none at this instant;
 	 * we really don't care about the next instant.
 	 */
 	if (TAILQ_EMPTY(&zombie_threads))
 		return;
 
 	/* Take the whole chain in one go and leave an empty list behind. */
 	mtx_lock_spin(&zombie_lock);
 	cur = TAILQ_FIRST(&zombie_threads);
 	if (cur)
 		TAILQ_INIT(&zombie_threads);
 	mtx_unlock_spin(&zombie_lock);
 
 	for (; cur; cur = next) {
 		next = TAILQ_NEXT(cur, td_slpq);
 		if (cur->td_ucred)
 			crfree(cur->td_ucred);
 		thread_free(cur);
 	}
 }
 
 /*
  * Allocate a thread with a kernel stack of the given number of pages.
  * Returns NULL if the stack cannot be set up.
  */
 struct thread *
 thread_alloc(int pages)
 {
 	struct thread *newtd;
 
 	/* Check whether there are any zombies to get rid of first. */
 	thread_reap();
 
 	newtd = (struct thread *)uma_zalloc(thread_zone, M_WAITOK);
 	KASSERT(newtd->td_kstack == 0, ("thread_alloc got thread with kstack"));
 	if (vm_thread_new(newtd, pages)) {
 		cpu_thread_alloc(newtd);
 		return (newtd);
 	}
 
 	/* No stack: hand the thread straight back to the zone. */
 	uma_zfree(thread_zone, newtd);
 	return (NULL);
 }
 
 /*
  * Give a thread that has no kernel stack a new one of the given number
  * of pages.  Returns 1 on success and 0 if vm_thread_new() fails.
  */
 int
 thread_alloc_stack(struct thread *td, int pages)
 {
 
 	KASSERT(td->td_kstack == 0,
 	    ("thread_alloc_stack called on a thread with kstack"));
 	if (vm_thread_new(td, pages)) {
 		cpu_thread_alloc(td);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Deallocate a thread: drop its cpuset reference, release its machine
  * dependent state and kernel stack, and return it to the thread zone.
  */
 void
 thread_free(struct thread *td)
 {
 
 	lock_profile_thread_exit(td);
 	if (td->td_cpuset != NULL) {
 		cpuset_rel(td->td_cpuset);
 		td->td_cpuset = NULL;
 	}
 	cpu_thread_free(td);
 	/* A thread may reach here without a kernel stack. */
 	if (td->td_kstack != 0)
 		vm_thread_dispose(td);
 	uma_zfree(thread_zone, td);
 }
 
 /*
  * Discard the current thread and exit from its context.
  * Does not return: control leaves through sched_throw().
  *
  * Locking, as asserted on entry: the process lock and the process spin
  * lock are both held and Giant is not.  The process lock is dropped part
  * way through; the process spin lock is released once the thread lock
  * has been taken, just before sched_throw().
  *
  * Because we can't free a thread while we're operating under its context,
  * push the current thread into our CPU's deadthread holder. This means
  * we needn't worry about someone else grabbing our context before we
  * do a cpu_throw().
  */
 void
 thread_exit(void)
 {
 	uint64_t runtime, new_switchtime;
 	struct thread *td;
 	struct thread *td2;
 	struct proc *p;
 	int wakeup_swapper;
 
 	td = curthread;
 	p = td->td_proc;
 
 	/*
 	 * Check the process pointer before the lock assertions below
 	 * dereference it.
 	 */
 	KASSERT(p != NULL, ("thread exiting without a process"));
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
 	    (long)p->p_pid, td->td_name);
 	KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));
 
 #ifdef AUDIT
 	AUDIT_SYSCALL_EXIT(0, td);
 #endif
 	umtx_thread_exit(td);
 	/*
 	 * drop FPU & debug register state storage, or any other
 	 * architecture specific resources that
 	 * would not be on a new untouched process.
 	 */
 	cpu_thread_exit(td);	/* XXXSMP */
 
 	/*
 	 * The last thread is left attached to the process
 	 * So that the whole bundle gets recycled. Skip
 	 * all this stuff if we never had threads.
 	 * EXIT clears all sign of other threads when
 	 * it goes to single threading, so the last thread always
 	 * takes the short path.
 	 */
 	if (p->p_flag & P_HADTHREADS) {
 		if (p->p_numthreads > 1) {
 			thread_unlink(td);
 			td2 = FIRST_THREAD_IN_PROC(p);
 			sched_exit_thread(td2, td);
 
 			/*
 			 * The test below is NOT true if we are the
 			 * sole exiting thread. P_STOPPED_SINGLE is unset
 			 * in exit1() after it is the only survivor.
 			 */
 			if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 				/*
 				 * Every remaining thread is suspended, so
 				 * let the single-threading thread run.
 				 */
 				if (p->p_numthreads == p->p_suspcount) {
 					thread_lock(p->p_singlethread);
 					wakeup_swapper = thread_unsuspend_one(
 						p->p_singlethread);
 					thread_unlock(p->p_singlethread);
 					if (wakeup_swapper)
 						kick_proc0();
 				}
 			}
 
 			/*
 			 * Count this thread as exiting (thread_stash()
 			 * undoes the count) and park it in the per-CPU
 			 * deadthread slot.
 			 */
 			atomic_add_int(&td->td_proc->p_exitthreads, 1);
 			PCPU_SET(deadthread, td);
 		} else {
 			/*
 			 * The last thread is exiting.. but not through exit()
 			 */
 			panic ("thread_exit: Last thread exiting on its own");
 		}
 	}
 #ifdef	HWPMC_HOOKS
 	/*
 	 * If this thread is part of a process that is being tracked by hwpmc(4),
 	 * inform the module of the thread's impending exit.
 	 */
 	if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 		PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
 	PROC_UNLOCK(p);
 
 	/* Do the same timestamp bookkeeping that mi_switch() would do. */
 	new_switchtime = cpu_ticks();
 	runtime = new_switchtime - PCPU_GET(switchtime);
 	td->td_runtime += runtime;
 	td->td_incruntime += runtime;
 	PCPU_SET(switchtime, new_switchtime);
 	PCPU_SET(switchticks, ticks);
 	PCPU_INC(cnt.v_swtch);
 
 	/* Save our resource usage in our process. */
 	td->td_ru.ru_nvcsw++;
 	ruxagg(p, td);
 	rucollect(&p->p_ru, &td->td_ru);
 
 	/* Trade the process spin lock for the thread lock before leaving. */
 	thread_lock(td);
 	PROC_SUNLOCK(p);
 	td->td_state = TDS_INACTIVE;
 #ifdef WITNESS
 	witness_thread_exit(td);
 #endif
 	CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
 	sched_throw(td);
 	panic("I'm a teapot!");
 	/* NOTREACHED */
 }
 
 /*
  * Do any thread specific cleanups that may be needed in wait().
  * Called with Giant, proc and schedlock not held, on a process that is
  * down to its last thread.
  */
 void
 thread_wait(struct proc *p)
 {
 	struct thread *last;
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
 	last = FIRST_THREAD_IN_PROC(p);
 
 	/*
 	 * Taking and dropping the last thread's lock makes us spin until
 	 * that thread has left cpu_throw().
 	 */
 	thread_lock(last);
 	thread_unlock(last);
 
 	/* Any other threads still on their way out get the same courtesy. */
 	while (p->p_exitthreads != 0)
 		sched_relinquish(curthread);
 
 	lock_profile_thread_exit(last);
 	cpuset_rel(last->td_cpuset);
 	last->td_cpuset = NULL;
 	cpu_thread_clean(last);
 	crfree(last->td_ucred);
 
 	/* Check for zombie threads etc. */
 	thread_reap();
 }
 
 /*
  * Link a thread to a process and initialize everything the thread needs
  * before the process can use it.
  */
 void
 thread_link(struct thread *td, struct proc *p)
 {
 
 	/*
 	 * XXX PROC_LOCK_ASSERT(p, MA_OWNED) can't be enabled here: this
 	 * is also called for proc0 before its lock has been created.
 	 */
 	td->td_state = TDS_INACTIVE;
 	td->td_proc = p;
 	td->td_flags = TDF_INMEM;
 
 	/* Per-thread lists, signal queue and sleep callout start out empty. */
 	LIST_INIT(&td->td_contested);
 	LIST_INIT(&td->td_lprof[0]);
 	LIST_INIT(&td->td_lprof[1]);
 	sigqueue_init(&td->td_sigqueue, p);
 	callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
 
 	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
 	p->p_numthreads++;
 }
 
 /*
  * Convert a process with one thread to an unthreaded process by
  * clearing its P_HADTHREADS flag.
  */
 void
 thread_unthread(struct thread *td)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	KASSERT((p->p_numthreads == 1), ("Unthreading with >1 threads"));
 	p->p_flag &= ~P_HADTHREADS;
 }
 
 /*
  * Take a thread off its process' thread list.  The process lock must
  * be held.
  * Called from:
  *  thread_exit()
  */
 void
 thread_unlink(struct thread *td)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	TAILQ_REMOVE(&p->p_threads, td, td_plist);
 	p->p_numthreads--;
 	/*
 	 * A few other things could be cleared here, but the thread's
 	 * links to the proc must NOT be.
 	 */
 }
 
 /*
  * Work out how many threads a single-threading request of the given
  * mode is still waiting on.  The count includes the requesting thread,
  * which is why thread_single() loops until it reaches 1.  Both the
  * process lock and the process spin lock must be held.
  */
 static int
 calc_remaining(struct proc *p, int mode)
 {
 	int left;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	switch (mode) {
 	case SINGLE_EXIT:
 		left = p->p_numthreads;
 		break;
 	case SINGLE_BOUNDARY:
 		left = p->p_numthreads - p->p_boundary_count;
 		break;
 	case SINGLE_NO_EXIT:
 		left = p->p_numthreads - p->p_suspcount;
 		break;
 	default:
 		panic("calc_remaining: wrong mode %d", mode);
 	}
 	return (left);
 }
 
 /*
  * Enforce single-threading.
  *
  * mode is one of SINGLE_EXIT, SINGLE_BOUNDARY or SINGLE_NO_EXIT and
  * selects both how the other threads are prodded and how "done" is
  * measured (see calc_remaining()).
  *
  * Returns 1 if the caller must abort (another thread is waiting to
  * exit the process or similar). Process is locked!
  * Returns 0 when you are successfully the only thread running.
  * A process has successfully single threaded in the suspend mode when
  * There are no threads in user mode. Threads in the kernel must be
  * allowed to continue until they get to the user boundary. They may even
  * copy out their return values and data before suspending. They may however be
  * accelerated in reaching the user boundary as we will wake up
  * any sleeping threads that are interruptible. (PCATCH).
  */
 int
 thread_single(int mode)
 {
 	struct thread *td;
 	struct thread *td2;
 	struct proc *p;
 	int remaining, wakeup_swapper;
 
 	td = curthread;
 	p = td->td_proc;
 	mtx_assert(&Giant, MA_NOTOWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* Nothing to do for a process without P_HADTHREADS set. */
 	if ((p->p_flag & P_HADTHREADS) == 0)
 		return (0);
 
 	/* Is someone already single threading? */
 	if (p->p_singlethread != NULL && p->p_singlethread != td)
 		return (1);
 
 	/* Record the requested mode in the process flags. */
 	if (mode == SINGLE_EXIT) {
 		p->p_flag |= P_SINGLE_EXIT;
 		p->p_flag &= ~P_SINGLE_BOUNDARY;
 	} else {
 		p->p_flag &= ~P_SINGLE_EXIT;
 		if (mode == SINGLE_BOUNDARY)
 			p->p_flag |= P_SINGLE_BOUNDARY;
 		else
 			p->p_flag &= ~P_SINGLE_BOUNDARY;
 	}
 	p->p_flag |= P_STOPPED_SINGLE;
 	PROC_SLOCK(p);
 	p->p_singlethread = td;
 	remaining = calc_remaining(p, mode);
 	while (remaining != 1) {
 		/*
 		 * If some other kind of stop is also in effect, skip the
 		 * prodding pass and go straight to suspending ourselves.
 		 */
 		if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
 			goto stopme;
 		wakeup_swapper = 0;
 		/*
 		 * Ask every other thread to check for suspension, and nudge
 		 * the ones that are currently inhibited so they notice.
 		 */
 		FOREACH_THREAD_IN_PROC(p, td2) {
 			if (td2 == td)
 				continue;
 			thread_lock(td2);
 			td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
 			if (TD_IS_INHIBITED(td2)) {
 				switch (mode) {
 				case SINGLE_EXIT:
 					if (TD_IS_SUSPENDED(td2))
 						wakeup_swapper |=
 						    thread_unsuspend_one(td2);
 					if (TD_ON_SLEEPQ(td2) &&
 					    (td2->td_flags & TDF_SINTR))
 						wakeup_swapper |=
 						    sleepq_abort(td2, EINTR);
 					break;
 				case SINGLE_BOUNDARY:
 					if (TD_IS_SUSPENDED(td2) &&
 					    !(td2->td_flags & TDF_BOUNDARY))
 						wakeup_swapper |=
 						    thread_unsuspend_one(td2);
 					if (TD_ON_SLEEPQ(td2) &&
 					    (td2->td_flags & TDF_SINTR))
 						wakeup_swapper |=
 						    sleepq_abort(td2, ERESTART);
 					break;
 				case SINGLE_NO_EXIT:
 					if (TD_IS_SUSPENDED(td2) &&
 					    !(td2->td_flags & TDF_BOUNDARY))
 						wakeup_swapper |=
 						    thread_unsuspend_one(td2);
 					if (TD_ON_SLEEPQ(td2) &&
 					    (td2->td_flags & TDF_SINTR))
 						wakeup_swapper |=
 						    sleepq_abort(td2, ERESTART);
 					break;
 				default:
 					break;
 				}
 			}
 #ifdef SMP
 			else if (TD_IS_RUNNING(td2) && td != td2) {
 				forward_signal(td2);
 			}
 #endif
 			thread_unlock(td2);
 		}
 		if (wakeup_swapper)
 			kick_proc0();
 		remaining = calc_remaining(p, mode);
 
 		/*
 		 * Maybe we suspended some threads.. was it enough?
 		 */
 		if (remaining == 1)
 			break;
 
 stopme:
 		/*
 		 * Wake us up when everyone else has suspended.
 		 * In the mean time we suspend as well.
 		 */
 		thread_suspend_switch(td);
 		remaining = calc_remaining(p, mode);
 	}
 	if (mode == SINGLE_EXIT) {
 		/*
 		 * We have gotten rid of all the other threads and we
 		 * are about to either exit or exec. In either case,
 		 * we try our utmost to revert to being a non-threaded
 		 * process.
 		 */
 		p->p_singlethread = NULL;
 		p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT);
 		thread_unthread(td);
 	}
 	PROC_SUNLOCK(p);
 	return (0);
 }
 
 /*
  * Called in from locations that can safely check to see
  * whether we have to suspend or at least throttle for a
  * single-thread event (e.g. fork).
  *
  * Such locations include userret().
  * If the "return_instead" argument is non zero, the thread must be able to
  * accept 0 (caller may continue), or a non-zero errno value (EINTR or
  * ERESTART; caller must abort) as a result.
  *
  * The 'return_instead' argument tells the function if it may do a
  * thread_exit() or suspend, or whether the caller must abort and back
  * out instead.
  *
  * If the thread that set the single_threading request has set the
  * P_SINGLE_EXIT bit in the process flags then this call will never return
  * if 'return_instead' is false, but will exit.
  *
  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
  *---------------+--------------------+---------------------
  *       0       | returns 0          | returns ERESTART at
  *               | when ST ends       | once for a boundary
  *               |                    | request; otherwise
  *               |                    | suspends, returns 0
  *               |                    | when the stop ends
  *---------------+--------------------+---------------------
  *       1       | thread exits       | returns EINTR
  *               |                    | immediately
  * 0 = thread_exit() or suspension ok,
  * other = return error instead of stopping the thread.
  *
  * Two cases return 0 straight away whatever return_instead is: the
  * caller is itself the single-threading thread, or only a stop signal
  * is pending and the thread has TDF_SBDRY set.
  *
  * While a full suspension is under effect, even a single threading
  * thread would be suspended if it made this call (but it shouldn't).
  * This call should only be made from places where
  * thread_exit() would be safe as that may be the outcome unless
  * return_instead is set.
  *
  * Called and returns with the process lock held; the lock is dropped
  * while the thread is suspended.
  */
 int
 thread_suspend_check(int return_instead)
 {
 	struct thread *td;
 	struct proc *p;
 	int wakeup_swapper;
 
 	td = curthread;
 	p = td->td_proc;
 	mtx_assert(&Giant, MA_NOTOWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/* Loop for as long as some reason to stop is still in force. */
 	while (P_SHOULDSTOP(p) ||
 	      ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 			KASSERT(p->p_singlethread != NULL,
 			    ("singlethread not set"));
 			/*
 			 * The only suspension in action is a
 			 * single-threading. Single threader need not stop.
 			 * XXX Should be safe to access unlocked
 			 * as it can only be set to be true by us.
 			 */
 			if (p->p_singlethread == td)
 				return (0);	/* Exempt from stopping. */
 		}
 		/* The process is exiting and the caller asked to back out. */
 		if ((p->p_flag & P_SINGLE_EXIT) && return_instead)
 			return (EINTR);
 
 		/* Should we goto user boundary if we didn't come from there? */
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
 		    (p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
 			return (ERESTART);
 
 		/*
 		 * Ignore suspend requests for stop signals if they
 		 * are deferred.
 		 */
 		if (P_SHOULDSTOP(p) == P_STOPPED_SIG &&
 		    td->td_flags & TDF_SBDRY) {
 			KASSERT(return_instead,
 			    ("TDF_SBDRY set for unsafe thread_suspend_check"));
 			return (0);
 		}
 
 		/*
 		 * If the process is waiting for us to exit,
 		 * this thread should just suicide.
 		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
 		 */
 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
 			/* The process lock is dropped around tidhash_remove(). */
 			PROC_UNLOCK(p);
 			tidhash_remove(td);
 			PROC_LOCK(p);
 			tdsigcleanup(td);
 			PROC_SLOCK(p);
 			thread_stopped(p);
 			thread_exit();
 		}
 
 		PROC_SLOCK(p);
 		thread_stopped(p);
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 			/*
 			 * We are the last thread left to suspend, so wake
 			 * the single-threading thread.
 			 */
 			if (p->p_numthreads == p->p_suspcount + 1) {
 				thread_lock(p->p_singlethread);
 				wakeup_swapper =
 				    thread_unsuspend_one(p->p_singlethread);
 				thread_unlock(p->p_singlethread);
 				if (wakeup_swapper)
 					kick_proc0();
 			}
 		}
 		PROC_UNLOCK(p);
 		thread_lock(td);
 		/*
 		 * When a thread suspends, it just
 		 * gets taken off all queues.
 		 */
 		thread_suspend_one(td);
 		/* Only a caller that did not ask to back out counts as at the boundary. */
 		if (return_instead == 0) {
 			p->p_boundary_count++;
 			td->td_flags |= TDF_BOUNDARY;
 		}
 		PROC_SUNLOCK(p);
 		mi_switch(SW_INVOL | SWT_SUSPEND, NULL);
 		if (return_instead == 0)
 			td->td_flags &= ~TDF_BOUNDARY;
 		thread_unlock(td);
 		PROC_LOCK(p);
 		if (return_instead == 0) {
 			PROC_SLOCK(p);
 			p->p_boundary_count--;
 			PROC_SUNLOCK(p);
 		}
 	}
 	return (0);
 }
 
 /*
  * Suspend the given thread and switch away from it.
  *
  * Entered with the process lock and the process spin lock held.  Both
  * are released before the switch and taken again before returning.
  * thread_single() calls this on the current thread.
  */
 void
 thread_suspend_switch(struct thread *td)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * We implement thread_suspend_one in stages here to avoid
 	 * dropping the proc lock while the thread lock is owned.
 	 */
 	thread_stopped(p);
 	p->p_suspcount++;
 	PROC_UNLOCK(p);
 	thread_lock(td);
 	td->td_flags &= ~TDF_NEEDSUSPCHK;
 	TD_SET_SUSPENDED(td);
 	sched_sleep(td, 0);
 	/* The spin lock is only dropped once the thread lock is held. */
 	PROC_SUNLOCK(p);
 	DROP_GIANT();
 	mi_switch(SW_VOL | SWT_SUSPEND, NULL);
 	thread_unlock(td);
 	PICKUP_GIANT();
 	/* Give the caller back both locks it entered with. */
 	PROC_LOCK(p);
 	PROC_SLOCK(p);
 }
 
 /*
  * Mark a thread as suspended and count it in its process' p_suspcount.
  * Both the process spin lock and the thread lock must be held, and the
  * thread must not already be suspended.
  */
 void
 thread_suspend_one(struct thread *td)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
 
 	p->p_suspcount++;
 	td->td_flags &= ~TDF_NEEDSUSPCHK;
 	TD_SET_SUSPENDED(td);
 	sched_sleep(td, 0);
 }
 
 /*
  * Clear a thread's suspended state, drop it from its process'
  * p_suspcount and make it runnable.  Both the process spin lock and the
  * thread lock must be held.
  *
  * Returns whatever setrunnable() returns; callers treat a non-zero
  * value as a request to call kick_proc0().
  */
 int
 thread_unsuspend_one(struct thread *td)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
 
 	TD_CLR_SUSPENDED(td);
 	p->p_suspcount--;
 	return (setrunnable(td));
 }
 
 /*
  * Allow all threads blocked by single threading to continue running.
  * The process lock and the process spin lock must both be held.
  */
 void
 thread_unsuspend(struct proc *p)
 {
 	struct thread *cur;
 	int kick;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	kick = 0;
 	if (!P_SHOULDSTOP(p)) {
 		/* No stop is in force any more: release every suspended thread. */
 		FOREACH_THREAD_IN_PROC(p, cur) {
 			thread_lock(cur);
 			if (TD_IS_SUSPENDED(cur))
 				kick |= thread_unsuspend_one(cur);
 			thread_unlock(cur);
 		}
 	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
 	    (p->p_numthreads == p->p_suspcount)) {
 		/*
 		 * Stopping everything also did the job for the single
 		 * threading request. Now that we have downgraded to
 		 * single-threaded, let the single threader continue.
 		 */
 		thread_lock(p->p_singlethread);
 		kick = thread_unsuspend_one(p->p_singlethread);
 		thread_unlock(p->p_singlethread);
 	}
 	if (kick)
 		kick_proc0();
 }
 
 /*
  * End the single threading mode: clear the single-threading flags and
  * owner, then let the other threads run again.  The process lock must
  * be held.
  */
 void
 thread_single_end(void)
 {
 	struct thread *other;
 	struct proc *p;
 	int kick;
 
 	p = curthread->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY);
 	PROC_SLOCK(p);
 	p->p_singlethread = NULL;
 	kick = 0;
 	/*
 	 * If there are other threads they may now run, unless of course
 	 * there is a blanket 'stop order' on the process.  The single
 	 * threader must be allowed to continue however, as this is a bad
 	 * place to stop.
 	 */
 	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
 		FOREACH_THREAD_IN_PROC(p, other) {
 			thread_lock(other);
 			if (TD_IS_SUSPENDED(other))
 				kick |= thread_unsuspend_one(other);
 			thread_unlock(other);
 		}
 	}
 	PROC_SUNLOCK(p);
 	if (kick)
 		kick_proc0();
 }
 
 /*
  * Look up the thread with the given ID among the threads of a process.
  * The process lock must be held.  Returns NULL when no thread matches.
  */
 struct thread *
 thread_find(struct proc *p, lwpid_t tid)
 {
 	struct thread *cand;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	FOREACH_THREAD_IN_PROC(p, cand) {
 		if (cand->td_tid == tid)
 			return (cand);
 	}
 	/* Assumes the walk ends with a NULL cursor, as the original relied on. */
 	return (NULL);
 }
 
 /*
  * Locate a thread by number; return with proc lock held.
  *
  * tid is looked up in the TID hash.  If pid is not -1 the thread must
  * also belong to the process with that pid.  Returns NULL, with no
  * process lock held, when there is no such thread, when the pid does
  * not match, or when the owning process is still in state PRS_NEW.
  */
 struct thread *
 tdfind(lwpid_t tid, pid_t pid)
 {
 /* Chain positions beyond which a hit is moved to the front of its chain. */
 #define RUN_THRESH	16
 	struct thread *td;
 	int run = 0;	/* entries walked before the match */
 
 	rw_rlock(&tidhash_lock);
 	LIST_FOREACH(td, TIDHASH(tid), td_hash) {
 		if (td->td_tid == tid) {
 			if (pid != -1 && td->td_proc->p_pid != pid) {
 				td = NULL;
 				break;
 			}
 			PROC_LOCK(td->td_proc);
 			if (td->td_proc->p_state == PRS_NEW) {
 				PROC_UNLOCK(td->td_proc);
 				td = NULL;
 				break;
 			}
 			if (run > RUN_THRESH) {
 				/*
 				 * The entry sat deep in its chain.  If the
 				 * read lock can be upgraded, move the entry
 				 * to the head; this path drops the write
 				 * lock itself and returns directly.
 				 */
 				if (rw_try_upgrade(&tidhash_lock)) {
 					LIST_REMOVE(td, td_hash);
 					LIST_INSERT_HEAD(TIDHASH(td->td_tid),
 						td, td_hash);
 					rw_wunlock(&tidhash_lock);
 					return (td);
 				}
 			}
 			break;
 		}
 		run++;
 	}
 	/* Every other exit still holds the read lock; td is NULL on a miss. */
 	rw_runlock(&tidhash_lock);
 	return (td);
 }
 
 /*
  * Enter a thread into the TID hash, at the head of the chain selected
  * by its td_tid.
  */
 void
 tidhash_add(struct thread *td)
 {
 
 	rw_wlock(&tidhash_lock);
 	LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
 	rw_wunlock(&tidhash_lock);
 }
 
 /*
  * Take a thread out of the TID hash.
  */
 void
 tidhash_remove(struct thread *td)
 {
 
 	rw_wlock(&tidhash_lock);
 	LIST_REMOVE(td, td_hash);
 	rw_wunlock(&tidhash_lock);
 }
Index: head/sys/kern/kern_timeout.c
===================================================================
--- head/sys/kern/kern_timeout.c	(revision 258540)
+++ head/sys/kern/kern_timeout.c	(revision 258541)
@@ -1,1446 +1,1445 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)kern_clock.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_callout_profiling.h"
-#include "opt_kdtrace.h"
 #if defined(__arm__)
 #include "opt_timer.h"
 #endif
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/file.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 
 #ifdef SMP
 #include <machine/cpu.h>
 #endif
 
 #ifndef NO_EVENTTIMERS
 /*
  * Per-CPU variable read by callout_reset_sbt_on() as the time of the last
  * hardclock() call; it is only declared here, not defined.
  */
 DPCPU_DECLARE(sbintime_t, hardclocktime);
 #endif
 
 /*
  * Statically defined tracing probes fired by softclock_call_cc()
  * immediately before (callout-start) and after (callout-end) a callout
  * handler is invoked.  Each probe takes the callout as its one argument.
  */
 SDT_PROVIDER_DEFINE(callout_execute);
 SDT_PROBE_DEFINE1(callout_execute, kernel, , callout_start, callout-start,
     "struct callout *");
 SDT_PROBE_DEFINE1(callout_execute, kernel, , callout_end, callout-end,
     "struct callout *");
 
 #ifdef CALLOUT_PROFILING
 /*
  * Profiling counters, exported read-only under the debug sysctl tree.
  * The plain ones are updated at the end of softclock(), the "_dir" ones
  * at the end of callout_process(); each is a running average scaled by
  * 1000 (see the update expressions in those functions).
  */
 static int avg_depth;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
     "Average number of items examined per softclock call. Units = 1/1000");
 static int avg_gcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
     "Average number of Giant callouts made per softclock call. Units = 1/1000");
 static int avg_lockcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0,
     "Average number of lock callouts made per softclock call. Units = 1/1000");
 static int avg_mpcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0,
     "Average number of MP callouts made per softclock call. Units = 1/1000");
 static int avg_depth_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0,
     "Average number of direct callouts examined per callout_process call. "
     "Units = 1/1000");
 static int avg_lockcalls_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD,
     &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per "
     "callout_process call. Units = 1/1000");
 static int avg_mpcalls_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir,
     0, "Average number of MP direct callouts made per callout_process call. "
     "Units = 1/1000");
 #endif
 
 /*
  * Number of callout structures preallocated for timeout(9); also the
  * value the wheel size is derived from.  Computed (and optionally
  * overridden by the kern.ncallout tunable) in callout_callwheel_init().
  */
 static int ncallout;
 SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN, &ncallout, 0,
     "Number of entries in callwheel and size of timeout() preallocation");
 
 /*
  * TODO:
  *	allocate more timeout table slots when table overflows.
  */
 /*
  * Number of buckets in each per-CPU callout wheel and the matching index
  * mask (callwheelsize - 1); both are set in callout_callwheel_init().
  */
 u_int callwheelsize, callwheelmask;
 
 /*
  * The callout cpu exec entities hold the information necessary for
  * describing the state of the callouts currently running on the CPU and
  * the information necessary for migrating callouts to a new callout cpu.
  * In particular, the first entry of the array cc_exec_entity holds the
  * information for a callout running in SWI thread context, while the
  * second one holds the information for a callout running directly from
  * hardware interrupt context.
  * The cached information is very important for deferring migration when
  * the migrating callout is already running.
  */
 struct cc_exec {
 	/*
 	 * Saved list cursor: callout_process() stores the element following
 	 * the direct callout it is about to run here.
 	 */
 	struct callout		*cc_next;
 	/* Callout currently being executed, or NULL when idle. */
 	struct callout		*cc_curr;
 #ifdef SMP
 	/*
 	 * Parameters of a migration that callout_reset_sbt_on() deferred
 	 * because the callout was running.  ce_migration_cpu == CPUBLOCK
 	 * means no migration is pending (see cc_cce_migrating()).
 	 */
 	void			(*ce_migration_func)(void *);
 	void			*ce_migration_arg;
 	int			ce_migration_cpu;
 	sbintime_t		ce_migration_time;
 	sbintime_t		ce_migration_prec;
 #endif
 	/* Set to cancel, or to mark as no longer cancellable, cc_curr. */
 	bool			cc_cancel;
 	/* Someone is sleeping until cc_curr completes. */
 	bool			cc_waiting;
 };
 
 /*
  * There is one struct callout_cpu per cpu, holding all relevant
  * state for the callout processing thread on the individual CPU.
  */
 struct callout_cpu {
 	/* Spin mutex (see callout_cpu_init()) protecting this structure. */
 	struct mtx_padalign	cc_lock;
 	/* [0]: SWI (softclock) context, [1]: direct execution context. */
 	struct cc_exec 		cc_exec_entity[2];
 	/* Array preallocated for timeout(9); NULL except on timeout_cpu. */
 	struct callout		*cc_callout;
 	/* The callout wheel: an array of callwheelsize list heads. */
 	struct callout_list	*cc_callwheel;
 	/* Expired callouts queued by callout_process() for softclock(). */
 	struct callout_tailq	cc_expireq;
 	/* Free list of the preallocated timeout(9) callouts. */
 	struct callout_slist	cc_callfree;
 	/* Time of the earliest event reported to cpu_new_callout(). */
 	sbintime_t		cc_firstevent;
 	/* 'now' value of the most recent callout_process() run. */
 	sbintime_t		cc_lastscan;
 	/* Cookie returned by swi_add() and handed to swi_sched(). */
 	void			*cc_cookie;
 	/* Wheel bucket in which a direct callout was last run. */
 	u_int			cc_bucket;
 };
 
 /*
  * Field shorthands for the two execution entities of a callout_cpu:
  * the plain names address entity 0 (SWI context), the "_dir" names
  * address entity 1 (direct execution from callout_process()).
  */
 #define	cc_exec_curr		cc_exec_entity[0].cc_curr
 #define	cc_exec_next		cc_exec_entity[0].cc_next
 #define	cc_exec_cancel		cc_exec_entity[0].cc_cancel
 #define	cc_exec_waiting		cc_exec_entity[0].cc_waiting
 #define	cc_exec_curr_dir	cc_exec_entity[1].cc_curr
 #define	cc_exec_next_dir	cc_exec_entity[1].cc_next
 #define	cc_exec_cancel_dir	cc_exec_entity[1].cc_cancel
 #define	cc_exec_waiting_dir	cc_exec_entity[1].cc_waiting
 
 #ifdef SMP
 /* Shorthands for the deferred-migration state of each execution entity. */
 #define	cc_migration_func	cc_exec_entity[0].ce_migration_func
 #define	cc_migration_arg	cc_exec_entity[0].ce_migration_arg
 #define	cc_migration_cpu	cc_exec_entity[0].ce_migration_cpu
 #define	cc_migration_time	cc_exec_entity[0].ce_migration_time
 #define	cc_migration_prec	cc_exec_entity[0].ce_migration_prec
 #define	cc_migration_func_dir	cc_exec_entity[1].ce_migration_func
 #define	cc_migration_arg_dir	cc_exec_entity[1].ce_migration_arg
 #define	cc_migration_cpu_dir	cc_exec_entity[1].ce_migration_cpu
 #define	cc_migration_time_dir	cc_exec_entity[1].ce_migration_time
 #define	cc_migration_prec_dir	cc_exec_entity[1].ce_migration_prec
 
 struct callout_cpu cc_cpu[MAXCPU];
 /*
  * Sentinel cpu id: stored in c_cpu while a callout is being switched
  * between CPUs, and in ce_migration_cpu to mean "no migration pending".
  */
 #define	CPUBLOCK	MAXCPU
 #define	CC_CPU(cpu)	(&cc_cpu[(cpu)])
 #define	CC_SELF()	CC_CPU(PCPU_GET(cpuid))
 #else
 struct callout_cpu cc_cpu;
 /*
  * Parenthesized like the SMP variants so that the expansion remains a
  * single operand in any expression, e.g. CC_CPU(cpu)->cc_lock.
  */
 #define	CC_CPU(cpu)	(&cc_cpu)
 #define	CC_SELF()	(&cc_cpu)
 #endif
 /* Acquire, release and assert ownership of a callout_cpu's spin mutex. */
 #define	CC_LOCK(cc)	mtx_lock_spin(&(cc)->cc_lock)
 #define	CC_UNLOCK(cc)	mtx_unlock_spin(&(cc)->cc_lock)
 #define	CC_LOCK_ASSERT(cc)	mtx_assert(&(cc)->cc_lock, MA_OWNED)
 
 /*
  * Id of the CPU that serves timeout(9); recorded in
  * callout_callwheel_init().
  */
 static int timeout_cpu;
 
 /* Forward declarations of helpers defined later in this file. */
 static void	callout_cpu_init(struct callout_cpu *cc);
 static void	softclock_call_cc(struct callout *c, struct callout_cpu *cc,
 #ifdef CALLOUT_PROFILING
 		    int *mpcalls, int *lockcalls, int *gcalls,
 #endif
 		    int direct);
 
 /* malloc(9) type used for the wheels and the preallocated callouts. */
 static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
 
 /**
  * Locked by cc_lock:
  *   cc_curr         - If a callout is in progress, it is cc_curr.
  *                     If cc_curr is non-NULL, threads waiting in
  *                     callout_drain() will be woken up as soon as the
  *                     relevant callout completes.
  *   cc_cancel       - Changing to true with both c_lock and cc_lock held
  *                     guarantees that the current callout will not run.
  *                     softclock_call_cc() sets this to false before it
  *                     drops cc_lock to acquire c_lock, and it calls
  *                     the handler only if cc_cancel is still false after
  *                     c_lock is successfully acquired.
  *   cc_waiting      - If a thread is waiting in callout_drain(), then
  *                     cc_waiting is true.  Set only when
  *                     cc_curr is non-NULL.
  */
 
 /*
  * Resets the execution entity tied to a specific callout cpu.
  */
 static void
 cc_cce_cleanup(struct callout_cpu *cc, int direct)
 {
 
 	cc->cc_exec_entity[direct].cc_curr = NULL;
 	cc->cc_exec_entity[direct].cc_next = NULL;
 	cc->cc_exec_entity[direct].cc_cancel = false;
 	cc->cc_exec_entity[direct].cc_waiting = false;
 #ifdef SMP
 	cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK;
 	cc->cc_exec_entity[direct].ce_migration_time = 0;
 	cc->cc_exec_entity[direct].ce_migration_prec = 0;
 	cc->cc_exec_entity[direct].ce_migration_func = NULL;
 	cc->cc_exec_entity[direct].ce_migration_arg = NULL;
 #endif
 }
 
 /*
  * Tell whether a deferred migration is recorded in the execution entity
  * selected by 'direct'.  Returns 1 if so and 0 otherwise; kernels built
  * without SMP never migrate and always report 0.
  */
 static int
 cc_cce_migrating(struct callout_cpu *cc, int direct)
 {
 
 #ifdef SMP
 	if (cc->cc_exec_entity[direct].ce_migration_cpu == CPUBLOCK)
 		return (0);
 	return (1);
 #else
 	return (0);
 #endif
 }
 
 /*
  * Kernel low level callwheel initialization
  * called on cpu0 during kernel startup.
  *
  * Computes ncallout and the wheel geometry (callwheelsize and
  * callwheelmask), records the calling CPU as timeout_cpu, preallocates
  * that CPU's timeout(9) callout array and initializes its callout_cpu.
  * 'dummy' is the unused SYSINIT argument.
  */
 static void
 callout_callwheel_init(void *dummy)
 {
 	struct callout_cpu *cc;
 
 	/*
 	 * Calculate the size of the callout wheel and the preallocated
 	 * timeout() structures.
 	 * XXX: Clip callout to result of previous function of maxusers
 	 * maximum 384.  This is still huge, but acceptable.
 	 */
 	ncallout = imin(16 + maxproc + maxfiles, 18508);
 	/* The computed default may be overridden by a tunable. */
 	TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
 
 	/*
 	 * Calculate callout wheel size, should be next power of two higher
 	 * than 'ncallout'.
 	 */
 	callwheelsize = 1 << fls(ncallout);
 	callwheelmask = callwheelsize - 1;
 
 	/*
 	 * Only cpu0 handles timeout(9) and receives a preallocation.
 	 *
 	 * XXX: Once all timeout(9) consumers are converted this can
 	 * be removed.
 	 */
 	timeout_cpu = PCPU_GET(cpuid);
 	cc = CC_CPU(timeout_cpu);
 	cc->cc_callout = malloc(ncallout * sizeof(struct callout),
 	    M_CALLOUT, M_WAITOK);
 	/* Must follow the allocation: it threads cc_callout onto the freelist. */
 	callout_cpu_init(cc);
 }
 SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL);
 
 /*
  * Set up one per-cpu callout structure: its spin lock, an empty wheel of
  * callwheelsize buckets, an empty expire queue and two idle execution
  * entities.  When the structure carries a preallocated callout array
  * (cc_callout != NULL), every element of that array is initialized and
  * pushed onto the free list used by timeout(9).
  */
 static void
 callout_cpu_init(struct callout_cpu *cc)
 {
 	struct callout *co;
 	int idx;
 
 	mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
 	SLIST_INIT(&cc->cc_callfree);
 	cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize,
 	    M_CALLOUT, M_WAITOK);
 	for (idx = 0; idx < callwheelsize; idx++)
 		LIST_INIT(&cc->cc_callwheel[idx]);
 	TAILQ_INIT(&cc->cc_expireq);
 	cc->cc_firstevent = INT64_MAX;
 	/* Reset both the SWI (0) and the direct (1) execution entity. */
 	cc_cce_cleanup(cc, 0);
 	cc_cce_cleanup(cc, 1);
 	/* Only cpu0 handles timeout(9) and owns a preallocated array. */
 	if (cc->cc_callout != NULL) {
 		for (idx = 0; idx < ncallout; idx++) {
 			co = &cc->cc_callout[idx];
 			callout_init(co, 0);
 			co->c_flags = CALLOUT_LOCAL_ALLOC;
 			SLIST_INSERT_HEAD(&cc->cc_callfree, co, c_links.sle);
 		}
 	}
 }
 
 #ifdef SMP
 /*
  * Switches the cpu tied to a specific callout.
  * The function expects the current callout cpu 'cc' to be locked on entry
  * and returns the callout cpu of 'new_cpu', locked; 'cc' is unlocked on
  * the way.  While the switch is in flight c->c_cpu holds CPUBLOCK, which
  * makes callout_lock() spin until the new cpu id has been stored.
  */
 static struct callout_cpu *
 callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu)
 {
 	struct callout_cpu *new_cc;
 
 	MPASS(c != NULL && cc != NULL);
 	CC_LOCK_ASSERT(cc);
 
 	/*
 	 * Avoid interrupts and preemption firing after the callout cpu
 	 * is blocked in order to avoid deadlocks as the new thread
 	 * may be willing to acquire the callout cpu lock.
 	 */
 	c->c_cpu = CPUBLOCK;
 	spinlock_enter();
 	CC_UNLOCK(cc);
 	new_cc = CC_CPU(new_cpu);
 	CC_LOCK(new_cc);
 	spinlock_exit();
 	/* Publish the new owner only once its lock is held. */
 	c->c_cpu = new_cpu;
 	return (new_cc);
 }
 #endif
 
 /*
  * Start standard softclock thread.
  *
  * Registers softclock() as a SWI_CLOCK software interrupt handler for
  * timeout_cpu.  On SMP kernels every other CPU additionally gets its
  * callout_cpu initialized (without a timeout(9) preallocation) and its
  * own handler registered.  Panics if any registration fails.
  * 'dummy' is the unused SYSINIT argument.
  */
 static void
 start_softclock(void *dummy)
 {
 	struct callout_cpu *cc;
 #ifdef SMP
 	int cpu;
 #endif
 
 	cc = CC_CPU(timeout_cpu);
 	if (swi_add(&clk_intr_event, "clock", softclock, cc, SWI_CLOCK,
 	    INTR_MPSAFE, &cc->cc_cookie))
 		panic("died while creating standard software ithreads");
 #ifdef SMP
 	CPU_FOREACH(cpu) {
 		/* timeout_cpu was initialized by callout_callwheel_init(). */
 		if (cpu == timeout_cpu)
 			continue;
 		cc = CC_CPU(cpu);
 		cc->cc_callout = NULL;	/* Only cpu0 handles timeout(9). */
 		callout_cpu_init(cc);
 		if (swi_add(NULL, "clock", softclock, cc, SWI_CLOCK,
 		    INTR_MPSAFE, &cc->cc_cookie))
 			panic("died while creating standard software ithreads");
 	}
 #endif
 }
 SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL);
 
 /* The wheel hash discards the low (32 - CC_HASH_SHIFT) bits of a time. */
 #define	CC_HASH_SHIFT	8
 
 /*
  * Map a time to its (unmasked) wheel hash: the time shifted right by
  * (32 - CC_HASH_SHIFT) bits and truncated to u_int.
  */
 static inline u_int
 callout_hash(sbintime_t sbt)
 {
 	u_int hash;
 
 	hash = sbt >> (32 - CC_HASH_SHIFT);
 	return (hash);
 }
 
 /*
  * Return the index of the wheel bucket a time falls into: its wheel hash
  * reduced with callwheelmask.
  */
 static inline u_int
 callout_get_bucket(sbintime_t sbt)
 {
 	u_int hash;
 
 	hash = callout_hash(sbt);
 	return (hash & callwheelmask);
 }
 
 /*
  * Scan the calling CPU's callout wheel for callouts whose time has come.
  *
  * Callouts with c_time <= now are handled at once: those flagged
  * CALLOUT_DIRECT are run in place through softclock_call_cc(), all
  * others are moved to cc_expireq for softclock().  Callouts that are not
  * yet due are used to narrow the [first, last] window of the next event,
  * which is stored in cc_firstevent and, unless NO_EVENTTIMERS is defined,
  * passed to cpu_new_callout().  The softclock SWI is scheduled at the end
  * if cc_expireq is not empty.
  *
  * now:	the present time.
  */
 void
 callout_process(sbintime_t now)
 {
 	struct callout *tmp, *tmpn;
 	struct callout_cpu *cc;
 	struct callout_list *sc;
 	sbintime_t first, last, max, tmp_max;
 	uint32_t lookahead;
 	u_int firstb, lastb, nowb;
 #ifdef CALLOUT_PROFILING
 	int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
 #endif
 
 	cc = CC_SELF();
 	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
 
 	/* Compute the buckets of the last scan and present times. */
 	firstb = callout_hash(cc->cc_lastscan);
 	cc->cc_lastscan = now;
 	nowb = callout_hash(now);
 
 	/* Compute the last bucket and minimum time of the bucket after it. */
 	if (nowb == firstb)
 		lookahead = (SBT_1S / 16);
 	else if (nowb - firstb == 1)
 		lookahead = (SBT_1S / 8);
 	else
 		lookahead = (SBT_1S / 2);
 	first = last = now;
 	first += (lookahead / 2);
 	last += lookahead;
 	/* Round 'last' down to the start of its bucket. */
 	last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT));
 	lastb = callout_hash(last) - 1;
 	max = last;
 
 	/*
 	 * Check if we wrapped around the entire wheel from the last scan.
 	 * In case, we need to scan entirely the wheel for pending callouts.
 	 */
 	if (lastb - firstb >= callwheelsize) {
 		lastb = firstb + callwheelsize - 1;
 		if (nowb - firstb >= callwheelsize)
 			nowb = lastb;
 	}
 
 	/* Iterate callwheel from firstb to nowb and then up to lastb. */
 	do {
 		sc = &cc->cc_callwheel[firstb & callwheelmask];
 		tmp = LIST_FIRST(sc);
 		while (tmp != NULL) {
 			/* Run the callout if present time within allowed. */
 			if (tmp->c_time <= now) {
 				/*
 				 * Consumer told us the callout may be run
 				 * directly from hardware interrupt context.
 				 */
 				if (tmp->c_flags & CALLOUT_DIRECT) {
 #ifdef CALLOUT_PROFILING
 					++depth_dir;
 #endif
 					/*
 					 * Save the cursor in the callout cpu:
 					 * softclock_call_cc() drops cc_lock,
 					 * and callout_reset_sbt_on() and
 					 * callout_cc_add() update this field
 					 * when they alter the list.
 					 */
 					cc->cc_exec_next_dir =
 					    LIST_NEXT(tmp, c_links.le);
 					cc->cc_bucket = firstb & callwheelmask;
 					LIST_REMOVE(tmp, c_links.le);
 					softclock_call_cc(tmp, cc,
 #ifdef CALLOUT_PROFILING
 					    &mpcalls_dir, &lockcalls_dir, NULL,
 #endif
 					    1);
 					tmp = cc->cc_exec_next_dir;
 				} else {
 					/* Defer to softclock() via cc_expireq. */
 					tmpn = LIST_NEXT(tmp, c_links.le);
 					LIST_REMOVE(tmp, c_links.le);
 					TAILQ_INSERT_TAIL(&cc->cc_expireq,
 					    tmp, c_links.tqe);
 					tmp->c_flags |= CALLOUT_PROCESSED;
 					tmp = tmpn;
 				}
 				continue;
 			}
 			/* Skip events from distant future. */
 			if (tmp->c_time >= max)
 				goto next;
 			/*
 			 * Event minimal time is bigger than present maximal
 			 * time, so it cannot be aggregated.
 			 */
 			if (tmp->c_time > last) {
 				lastb = nowb;
 				goto next;
 			}
 			/* Update first and last time, respecting this event. */
 			if (tmp->c_time < first)
 				first = tmp->c_time;
 			tmp_max = tmp->c_time + tmp->c_precision;
 			if (tmp_max < last)
 				last = tmp_max;
 next:
 			tmp = LIST_NEXT(tmp, c_links.le);
 		}
 		/* Proceed with the next bucket. */
 		firstb++;
 		/*
 		 * Stop if we looked after present time and found
 		 * some event we can't execute at now.
 		 * Stop if we looked far enough into the future.
 		 */
 	} while (((int)(firstb - lastb)) <= 0);
 	cc->cc_firstevent = last;
 #ifndef NO_EVENTTIMERS
 	cpu_new_callout(curcpu, last, first);
 #endif
 #ifdef CALLOUT_PROFILING
 	avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8;
 	avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
 	avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
 #endif
 	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
 	/*
 	 * swi_sched acquires the thread lock, so we don't want to call it
 	 * with cc_lock held; incorrect locking order.
 	 */
 	if (!TAILQ_EMPTY(&cc->cc_expireq))
 		swi_sched(cc->cc_cookie, 0);
 }
 
 /*
  * Lock the callout cpu that callout 'c' is currently tied to and return
  * it.  c->c_cpu is sampled without any lock held, so it is checked again
  * once the lock is owned and the whole sequence is retried if the callout
  * moved in the meantime.  On SMP a c_cpu of CPUBLOCK means that
  * callout_cpu_switch() is in progress; we spin until it has finished.
  */
 static struct callout_cpu *
 callout_lock(struct callout *c)
 {
 	struct callout_cpu *cc;
 	int cpu;
 
 	for (;;) {
 		cpu = c->c_cpu;
 #ifdef SMP
 		if (cpu == CPUBLOCK) {
 			while (c->c_cpu == CPUBLOCK)
 				cpu_spinwait();
 			continue;
 		}
 #endif
 		cc = CC_CPU(cpu);
 		CC_LOCK(cc);
 		/* Still tied to the cpu we locked?  Then we are done. */
 		if (cpu == c->c_cpu)
 			break;
 		CC_UNLOCK(cc);
 	}
 	return (cc);
 }
 
 /*
  * Arm callout 'c' on callout cpu 'cc', which the caller must have locked.
  *
  * c:		the callout to insert.
  * cc:		the callout cpu whose wheel receives it.
  * sbt:	expiration time; raised to cc_lastscan if it lies before it.
  * precision:	tolerated lateness, stored in c_precision.
  * func, arg:	handler and argument recorded in the callout.
  * cpu:	cpu id passed on to cpu_new_callout().
  * flags:	C_DIRECT_EXEC marks the callout CALLOUT_DIRECT.
  */
 static void
 callout_cc_add(struct callout *c, struct callout_cpu *cc,
     sbintime_t sbt, sbintime_t precision, void (*func)(void *),
     void *arg, int cpu, int flags)
 {
 	int bucket;
 
 	CC_LOCK_ASSERT(cc);
 	if (sbt < cc->cc_lastscan)
 		sbt = cc->cc_lastscan;
 	c->c_arg = arg;
 	c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
 	if (flags & C_DIRECT_EXEC)
 		c->c_flags |= CALLOUT_DIRECT;
 	c->c_flags &= ~CALLOUT_PROCESSED;
 	c->c_func = func;
 	c->c_time = sbt;
 	c->c_precision = precision;
 	bucket = callout_get_bucket(c->c_time);
 	CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x",
 	    c, (int)(c->c_precision >> 32),
 	    (u_int)(c->c_precision & 0xffffffff));
 	LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
 	/*
 	 * NOTE(review): cc_bucket is the bucket callout_process() recorded
 	 * for direct execution; presumably this makes a scan of that bucket
 	 * which is in progress continue from the new head -- confirm.
 	 */
 	if (cc->cc_bucket == bucket)
 		cc->cc_exec_next_dir = c;
 #ifndef NO_EVENTTIMERS
 	/*
 	 * Inform the eventtimers(4) subsystem there's a new callout
 	 * that has been inserted, but only if really required.
 	 */
 	/* Clamp the precision so that c_time + c_precision cannot overflow. */
 	if (INT64_MAX - c->c_time < c->c_precision)
 		c->c_precision = INT64_MAX - c->c_time;
 	sbt = c->c_time + c->c_precision;
 	if (sbt < cc->cc_firstevent) {
 		cc->cc_firstevent = sbt;
 		cpu_new_callout(cpu, sbt, c->c_time);
 	}
 #endif
 }
 
 /*
  * Hand a callout back to the free list of callout cpu 'cc'.  This only
  * applies to callouts flagged CALLOUT_LOCAL_ALLOC; any other callout is
  * left untouched.
  */
 static void
 callout_cc_del(struct callout *c, struct callout_cpu *cc)
 {
 
 	if ((c->c_flags & CALLOUT_LOCAL_ALLOC) != 0) {
 		c->c_func = NULL;
 		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
 	}
 }
 
 /*
  * Run the handler of one callout.
  *
  * Called with cc_lock held; the lock is dropped while the handler runs and
  * is held again on return.  If the callout has a lock (c_lock), it is
  * acquired before the handler is called, and the call is skipped when
  * cc_cancel was set in the meantime.  Afterwards any waiter is woken up, a
  * migration deferred by callout_reset_sbt_on() is carried out, and a
  * CALLOUT_LOCAL_ALLOC callout is returned to the free list.
  *
  * c:		the callout to run; must be both pending and active.
  * cc:		the callout cpu that 'c' is running on.
  * mpcalls, lockcalls, gcalls:
  *		(CALLOUT_PROFILING only) counters, incremented according to
  *		the kind of lock the callout uses.
  * direct:	index of the execution entity to use: 0 when called from
  *		softclock(), 1 when called from callout_process().
  */
 static void
 softclock_call_cc(struct callout *c, struct callout_cpu *cc,
 #ifdef CALLOUT_PROFILING
     int *mpcalls, int *lockcalls, int *gcalls,
 #endif
     int direct)
 {
 	struct rm_priotracker tracker;
 	void (*c_func)(void *);
 	void *c_arg;
 	struct lock_class *class;
 	struct lock_object *c_lock;
 	uintptr_t lock_status;
 	int c_flags;
 #ifdef SMP
 	struct callout_cpu *new_cc;
 	void (*new_func)(void *);
 	void *new_arg;
 	int flags, new_cpu;
 	sbintime_t new_prec, new_time;
 #endif
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) 
 	sbintime_t sbt1, sbt2;
 	struct timespec ts2;
 	static sbintime_t maxdt = 2 * SBT_1MS;	/* 2 msec */
 	static timeout_t *lastfunc;
 #endif
 
 	KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) ==
 	    (CALLOUT_PENDING | CALLOUT_ACTIVE),
 	    ("softclock_call_cc: pend|act %p %x", c, c->c_flags));
 	class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
 	/*
 	 * lock_status is the second argument handed to lc_lock(): 0 by
 	 * default and, for CALLOUT_SHAREDLOCK callouts, either the address
 	 * of an rm tracker (rm locks) or 1.
 	 */
 	lock_status = 0;
 	if (c->c_flags & CALLOUT_SHAREDLOCK) {
 		if (class == &lock_class_rm)
 			lock_status = (uintptr_t)&tracker;
 		else
 			lock_status = 1;
 	}
 	/* Cache the fields needed after cc_lock has been dropped. */
 	c_lock = c->c_lock;
 	c_func = c->c_func;
 	c_arg = c->c_arg;
 	c_flags = c->c_flags;
 	if (c->c_flags & CALLOUT_LOCAL_ALLOC)
 		c->c_flags = CALLOUT_LOCAL_ALLOC;
 	else
 		c->c_flags &= ~CALLOUT_PENDING;
 	cc->cc_exec_entity[direct].cc_curr = c;
 	cc->cc_exec_entity[direct].cc_cancel = false;
 	CC_UNLOCK(cc);
 	if (c_lock != NULL) {
 		class->lc_lock(c_lock, lock_status);
 		/*
 		 * The callout may have been cancelled
 		 * while we switched locks.
 		 */
 		if (cc->cc_exec_entity[direct].cc_cancel) {
 			class->lc_unlock(c_lock);
 			goto skip;
 		}
 		/* The callout cannot be stopped now. */
 		cc->cc_exec_entity[direct].cc_cancel = true;
 		if (c_lock == &Giant.lock_object) {
 #ifdef CALLOUT_PROFILING
 			(*gcalls)++;
 #endif
 			CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p",
 			    c, c_func, c_arg);
 		} else {
 #ifdef CALLOUT_PROFILING
 			(*lockcalls)++;
 #endif
 			CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
 			    c, c_func, c_arg);
 		}
 	} else {
 #ifdef CALLOUT_PROFILING
 		(*mpcalls)++;
 #endif
 		CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
 		    c, c_func, c_arg);
 	}
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
 	sbt1 = sbinuptime();
 #endif
 	THREAD_NO_SLEEPING();
 	SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0);
 	c_func(c_arg);
 	SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
 	THREAD_SLEEPING_OK();
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
 	/*
 	 * Report handlers that ran longer than the current maximum, which
 	 * starts at 2 msec and is raised to each new record.
 	 */
 	sbt2 = sbinuptime();
 	sbt2 -= sbt1;
 	if (sbt2 > maxdt) {
 		if (lastfunc != c_func || sbt2 > maxdt * 2) {
 			ts2 = sbttots(sbt2);
 			printf(
 		"Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
 			    c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
 		}
 		maxdt = sbt2;
 		lastfunc = c_func;
 	}
 #endif
 	CTR1(KTR_CALLOUT, "callout %p finished", c);
 	/*
 	 * NOTE(review): 'class' is NULL when the callout has no lock; this
 	 * presumably relies on lockless callouts always carrying
 	 * CALLOUT_RETURNUNLOCKED -- confirm against callout_init().
 	 */
 	if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
 		class->lc_unlock(c_lock);
 skip:
 	CC_LOCK(cc);
 	KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr"));
 	cc->cc_exec_entity[direct].cc_curr = NULL;
 	if (cc->cc_exec_entity[direct].cc_waiting) {
 		/*
 		 * There is someone waiting for the
 		 * callout to complete.
 		 * If the callout was scheduled for
 		 * migration just cancel it.
 		 */
 		if (cc_cce_migrating(cc, direct)) {
 			cc_cce_cleanup(cc, direct);
 
 			/*
 			 * It should be assert here that the callout is not
 			 * destroyed but that is not easy.
 			 */
 			c->c_flags &= ~CALLOUT_DFRMIGRATION;
 		}
 		cc->cc_exec_entity[direct].cc_waiting = false;
 		/* cc_lock is dropped around the wakeup() call. */
 		CC_UNLOCK(cc);
 		wakeup(&cc->cc_exec_entity[direct].cc_waiting);
 		CC_LOCK(cc);
 	} else if (cc_cce_migrating(cc, direct)) {
 		KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0,
 		    ("Migrating legacy callout %p", c));
 #ifdef SMP
 		/*
 		 * If the callout was scheduled for
 		 * migration just perform it now.
 		 */
 		new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu;
 		new_time = cc->cc_exec_entity[direct].ce_migration_time;
 		new_prec = cc->cc_exec_entity[direct].ce_migration_prec;
 		new_func = cc->cc_exec_entity[direct].ce_migration_func;
 		new_arg = cc->cc_exec_entity[direct].ce_migration_arg;
 		cc_cce_cleanup(cc, direct);
 
 		/*
 		 * It should be assert here that the callout is not destroyed
 		 * but that is not easy.
 		 *
 		 * As first thing, handle deferred callout stops.
 		 */
 		if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) {
 			CTR3(KTR_CALLOUT,
 			     "deferred cancelled %p func %p arg %p",
 			     c, new_func, new_arg);
 			callout_cc_del(c, cc);
 			return;
 		}
 		c->c_flags &= ~CALLOUT_DFRMIGRATION;
 
 		/*
 		 * Re-arm the callout on the new cpu, then take the lock of
 		 * the original cpu again, which the caller expects held.
 		 */
 		new_cc = callout_cpu_switch(c, cc, new_cpu);
 		flags = (direct) ? C_DIRECT_EXEC : 0;
 		callout_cc_add(c, new_cc, new_time, new_prec, new_func,
 		    new_arg, new_cpu, flags);
 		CC_UNLOCK(new_cc);
 		CC_LOCK(cc);
 #else
 		panic("migration should not happen");
 #endif
 	}
 	/*
 	 * If the current callout is locally allocated (from
 	 * timeout(9)) then put it on the freelist.
 	 *
 	 * Note: we need to check the cached copy of c_flags because
 	 * if it was not local, then it's not safe to deref the
 	 * callout pointer.
 	 */
 	KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 ||
 	    c->c_flags == CALLOUT_LOCAL_ALLOC,
 	    ("corrupted callout"));
 	if (c_flags & CALLOUT_LOCAL_ALLOC)
 		callout_cc_del(c, cc);
 }
 
 /*
  * The callout mechanism is based on the work of Adam M. Costello and
  * George Varghese, published in a technical report entitled "Redesigning
  * the BSD Callout and Timer Facilities" and modified slightly for inclusion
  * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
  * used in this implementation was published by G. Varghese and T. Lauck in
  * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
  * the Efficient Implementation of a Timer Facility" in the Proceedings of
  * the 11th ACM Annual Symposium on Operating Systems Principles,
  * Austin, Texas Nov 1987.
  */
 
 /*
  * Software (low priority) clock interrupt.
  * Run periodic events from timeout queue.
  *
  * Drains cc_expireq, the queue filled by callout_process(), running each
  * callout through softclock_call_cc() with the SWI execution entity (0).
  *
  * arg:	the struct callout_cpu this handler was registered for.
  */
 void
 softclock(void *arg)
 {
 	struct callout_cpu *cc;
 	struct callout *c;
 #ifdef CALLOUT_PROFILING
 	int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0;
 #endif
 
 	cc = (struct callout_cpu *)arg;
 	CC_LOCK(cc);
 	/*
 	 * softclock_call_cc() drops and retakes cc_lock, so the head of the
 	 * queue is looked up afresh on every iteration.
 	 */
 	while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) {
 		TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
 		softclock_call_cc(c, cc,
 #ifdef CALLOUT_PROFILING
 		    &mpcalls, &lockcalls, &gcalls,
 #endif
 		    0);
 #ifdef CALLOUT_PROFILING
 		++depth;
 #endif
 	}
 #ifdef CALLOUT_PROFILING
 	avg_depth += (depth * 1000 - avg_depth) >> 8;
 	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
 	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
 	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
 #endif
 	CC_UNLOCK(cc);
 }
 
 /*
  * timeout --
  *	Execute a function after a specified length of time.
  *
  * untimeout --
  *	Cancel previous timeout function call.
  *
  * callout_handle_init --
  *	Initialize a handle so that using it with untimeout is benign.
  *
  *	See AT&T BCI Driver Reference Manual for specification.  This
  *	implementation differs from that one in that although an
  *	identification value is returned from timeout, the original
  *	arguments to timeout as well as the identifier are used to
  *	identify entries for untimeout.
  */
 /*
  * Take a callout from the free list of timeout_cpu, arm it to call
  * ftn(arg) after to_ticks ticks and return a handle referring to it.
  * Panics when the free list is empty.
  */
 struct callout_handle
 timeout(ftn, arg, to_ticks)
 	timeout_t *ftn;
 	void *arg;
 	int to_ticks;
 {
 	struct callout_cpu *cc;
 	struct callout *new;
 	struct callout_handle handle;
 
 	cc = CC_CPU(timeout_cpu);
 	CC_LOCK(cc);
 	/* Fill in the next free callout structure. */
 	new = SLIST_FIRST(&cc->cc_callfree);
 	if (new == NULL)
 		/* XXX Attempt to malloc first */
 		panic("timeout table full");
 	SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
 	/* Called with cc_lock still held. */
 	callout_reset(new, to_ticks, ftn, arg);
 	handle.callout = new;
 	CC_UNLOCK(cc);
 
 	return (handle);
 }
 
 /*
  * Cancel the timeout identified by 'handle', but only if the callout
  * behind the handle still carries the given function and argument.
  * A handle that was never used for a timeout is ignored.
  */
 void
 untimeout(ftn, arg, handle)
 	timeout_t *ftn;
 	void *arg;
 	struct callout_handle handle;
 {
 	struct callout_cpu *cc;
 
 	/*
 	 * Check for a handle that was initialized
 	 * by callout_handle_init, but never used
 	 * for a real timeout.
 	 */
 	if (handle.callout == NULL)
 		return;
 
 	cc = callout_lock(handle.callout);
 	/* The function/argument pair is compared under the cpu lock. */
 	if (handle.callout->c_func == ftn && handle.callout->c_arg == arg)
 		callout_stop(handle.callout);
 	CC_UNLOCK(cc);
 }
 
 /*
  * Initialize a handle so that it refers to no callout; untimeout()
  * returns immediately when given such a handle.
  */
 void
 callout_handle_init(struct callout_handle *handle)
 {
 	handle->callout = NULL;
 }
 
 /*
  * New interface; clients allocate their own callout structures.
  *
  * callout_reset() - establish or change a timeout
  * callout_stop() - disestablish a timeout
  * callout_init() - initialize a callout structure so that it can
  *	safely be passed to callout_reset() and callout_stop()
  *
  * <sys/callout.h> defines three convenience macros:
  *
  * callout_active() - returns truth if callout has not been stopped,
  *	drained, or deactivated since the last time the callout was
  *	reset.
  * callout_pending() - returns truth if callout is still waiting for timeout
  * callout_deactivate() - marks the callout as having been serviced
  */
 /*
  * Establish or change the timeout of callout 'c' so that ftn(arg) is
  * called on 'cpu'.
  *
  * c:		the callout to (re)schedule.
  * sbt:	expiration time: absolute when C_ABSOLUTE is set in
  *		'flags', otherwise relative to a base time chosen below.
  * precision:	tolerated lateness; for relative times it is raised to a
  *		fraction of 'sbt' derived from 'flags' or tc_precexp.
  * ftn, arg:	handler and its argument.
  * cpu:	target cpu; ignored for CALLOUT_LOCAL_ALLOC callouts, which
  *		stay on their current cpu.
  * flags:	C_* flags (C_ABSOLUTE, C_HARDCLOCK, C_DIRECT_EXEC, ...).
  *
  * Returns nonzero if a pending invocation was removed from the queues or
  * a running one was cancelled before its handler started, 0 otherwise.
  * If the callout is running and must change cpu, the migration is only
  * recorded here and carried out later by softclock_call_cc().
  */
 int
 callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
     void (*ftn)(void *), void *arg, int cpu, int flags)
 {
 	sbintime_t to_sbt, pr;
 	struct callout_cpu *cc;
 	int cancelled, direct;
 
 	cancelled = 0;
 	if (flags & C_ABSOLUTE) {
 		to_sbt = sbt;
 	} else {
 		if ((flags & C_HARDCLOCK) && (sbt < tick_sbt))
 			sbt = tick_sbt;
 		/*
 		 * Note: the condition and opening brace of the following
 		 * "if" are completed inside the preprocessor conditional.
 		 */
 		if ((flags & C_HARDCLOCK) ||
 #ifdef NO_EVENTTIMERS
 		    sbt >= sbt_timethreshold) {
 			to_sbt = getsbinuptime();
 
 			/* Add safety belt for the case of hz > 1000. */
 			to_sbt += tc_tick_sbt - tick_sbt;
 #else
 		    sbt >= sbt_tickthreshold) {
 			/*
 			 * Obtain the time of the last hardclock() call on
 			 * this CPU directly from the kern_clocksource.c.
 			 * This value is per-CPU, but it is equal for all
 			 * active ones.
 			 */
 #ifdef __LP64__
 			to_sbt = DPCPU_GET(hardclocktime);
 #else
 			spinlock_enter();
 			to_sbt = DPCPU_GET(hardclocktime);
 			spinlock_exit();
 #endif
 #endif
 			if ((flags & C_HARDCLOCK) == 0)
 				to_sbt += tick_sbt;
 		} else
 			to_sbt = sbinuptime();
 		/* Saturate instead of overflowing the absolute time. */
 		if (INT64_MAX - to_sbt < sbt)
 			to_sbt = INT64_MAX;
 		else
 			to_sbt += sbt;
 		pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
 		    sbt >> C_PRELGET(flags));
 		if (pr > precision)
 			precision = pr;
 	}
 	/*
 	 * Don't allow migration of pre-allocated callouts lest they
 	 * become unbalanced.
 	 */
 	if (c->c_flags & CALLOUT_LOCAL_ALLOC)
 		cpu = c->c_cpu;
 	direct = (c->c_flags & CALLOUT_DIRECT) != 0;
 	KASSERT(!direct || c->c_lock == NULL,
 	    ("%s: direct callout %p has lock", __func__, c));
 	cc = callout_lock(c);
 	if (cc->cc_exec_entity[direct].cc_curr == c) {
 		/*
 		 * We're being asked to reschedule a callout which is
 		 * currently in progress.  If there is a lock then we
 		 * can cancel the callout if it has not really started.
 		 */
 		if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel)
 			cancelled = cc->cc_exec_entity[direct].cc_cancel = true;
 		if (cc->cc_exec_entity[direct].cc_waiting) {
 			/*
 			 * Someone has called callout_drain to kill this
 			 * callout.  Don't reschedule.
 			 */
 			CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
 			    cancelled ? "cancelled" : "failed to cancel",
 			    c, c->c_func, c->c_arg);
 			CC_UNLOCK(cc);
 			return (cancelled);
 		}
 	}
 	if (c->c_flags & CALLOUT_PENDING) {
 		/*
 		 * Still queued: take it off the wheel, or off cc_expireq if
 		 * callout_process() has already moved it there.
 		 */
 		if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
 			if (cc->cc_exec_next_dir == c)
 				cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
 			LIST_REMOVE(c, c_links.le);
 		} else
 			TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
 		cancelled = 1;
 		c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
 	}
 
 #ifdef SMP
 	/*
 	 * If the callout must migrate try to perform it immediately.
 	 * If the callout is currently running, just defer the migration
 	 * to a more appropriate moment.
 	 */
 	if (c->c_cpu != cpu) {
 		if (cc->cc_exec_entity[direct].cc_curr == c) {
 			cc->cc_exec_entity[direct].ce_migration_cpu = cpu;
 			cc->cc_exec_entity[direct].ce_migration_time
 			    = to_sbt;
 			cc->cc_exec_entity[direct].ce_migration_prec 
 			    = precision;
 			cc->cc_exec_entity[direct].ce_migration_func = ftn;
 			cc->cc_exec_entity[direct].ce_migration_arg = arg;
 			c->c_flags |= CALLOUT_DFRMIGRATION;
 			CTR6(KTR_CALLOUT,
 		    "migration of %p func %p arg %p in %d.%08x to %u deferred",
 			    c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
 			    (u_int)(to_sbt & 0xffffffff), cpu);
 			CC_UNLOCK(cc);
 			return (cancelled);
 		}
 		cc = callout_cpu_switch(c, cc, cpu);
 	}
 #endif
 
 	callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags);
 	CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x",
 	    cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
 	    (u_int)(to_sbt & 0xffffffff));
 	CC_UNLOCK(cc);
 
 	return (cancelled);
 }
 
 /*
  * Common idioms that can be optimized in the future.
  */
 /*
  * Re-arm callout 'c' on 'cpu' to fire in 'to_ticks' ticks, keeping the
  * handler and argument it already carries.  Returns the result of
  * callout_reset_on().
  */
 int
 callout_schedule_on(struct callout *c, int to_ticks, int cpu)
 {
 	int cancelled;
 
 	cancelled = callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu);
 	return (cancelled);
 }
 
 /*
  * Re-arm callout 'c' to fire in 'to_ticks' ticks on the cpu it is
  * already tied to, keeping the handler and argument it already carries.
  * Returns the result of callout_reset_on().
  */
 int
 callout_schedule(struct callout *c, int to_ticks)
 {
 	int cancelled;
 
 	cancelled = callout_reset_on(c, to_ticks, c->c_func, c->c_arg,
 	    c->c_cpu);
 	return (cancelled);
 }
 
 /*
  * Try to stop callout c.
  *
  * Returns 1 when the callout was still pending and has been unlinked from
  * its queue, when a callout blocked on the lock held by the caller has
  * been marked cancelled, or when a deferred migration has been dropped.
  * Returns 0 when the callout could not be stopped.
  *
  * With a non-zero 'safe' the caller is allowed to block: if the callout is
  * currently executing, the function sleeps until that invocation is done
  * and then returns 0.
  */
 int
 _callout_stop_safe(c, safe)
 	struct	callout *c;
 	int	safe;
 {
 	struct callout_cpu *cc, *old_cc;
 	struct lock_class *class;
 	int direct, sq_locked, use_lock;
 
 	/*
 	 * Some old subsystems don't hold Giant while running a callout_stop(),
 	 * so just discard this check for the moment.
 	 */
 	if (!safe && c->c_lock != NULL) {
 		if (c->c_lock == &Giant.lock_object)
 			use_lock = mtx_owned(&Giant);
 		else {
 			use_lock = 1;
 			class = LOCK_CLASS(c->c_lock);
 			class->lc_assert(c->c_lock, LA_XLOCKED);
 		}
 	} else
 		use_lock = 0;
 	/* Selects the cc_exec_entity[] slot: 1 for CALLOUT_DIRECT callouts. */
 	direct = (c->c_flags & CALLOUT_DIRECT) != 0;
 	sq_locked = 0;
 	old_cc = NULL;
 again:
 	cc = callout_lock(c);
 
 	/*
 	 * If the callout was migrating while the callout cpu lock was
 	 * dropped,  just drop the sleepqueue lock and check the states
 	 * again.
 	 */
 	if (sq_locked != 0 && cc != old_cc) {
 #ifdef SMP
 		CC_UNLOCK(cc);
 		sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting);
 		sq_locked = 0;
 		old_cc = NULL;
 		goto again;
 #else
 		panic("migration should not happen");
 #endif
 	}
 
 	/*
 	 * If the callout isn't pending, it's not on the queue, so
 	 * don't attempt to remove it from the queue.  We can try to
 	 * stop it by other means however.
 	 */
 	if (!(c->c_flags & CALLOUT_PENDING)) {
 		c->c_flags &= ~CALLOUT_ACTIVE;
 
 		/*
 		 * If it wasn't on the queue and it isn't the current
 		 * callout, then we can't stop it, so just bail.
 		 */
 		if (cc->cc_exec_entity[direct].cc_curr != c) {
 			CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			CC_UNLOCK(cc);
 			if (sq_locked)
 				sleepq_release(
 				    &cc->cc_exec_entity[direct].cc_waiting);
 			return (0);
 		}
 
 		if (safe) {
 			/*
 			 * The current callout is running (or just
 			 * about to run) and blocking is allowed, so
 			 * just wait for the current invocation to
 			 * finish.
 			 */
 			while (cc->cc_exec_entity[direct].cc_curr == c) {
 				/*
 				 * Use direct calls to sleepqueue interface
 				 * instead of cv/msleep in order to avoid
 				 * a LOR between cc_lock and sleepqueue
 				 * chain spinlocks.  This piece of code
 				 * emulates a msleep_spin() call actually.
 				 *
 				 * If we already have the sleepqueue chain
 				 * locked, then we can safely block.  If we
 				 * don't already have it locked, however,
 				 * we have to drop the cc_lock to lock
 				 * it.  This opens several races, so we
 				 * restart at the beginning once we have
 				 * both locks.  If nothing has changed, then
 				 * we will end up back here with sq_locked
 				 * set.
 				 */
 				if (!sq_locked) {
 					CC_UNLOCK(cc);
 					sleepq_lock(
 					&cc->cc_exec_entity[direct].cc_waiting);
 					sq_locked = 1;
 					old_cc = cc;
 					goto again;
 				}
 
 				/*
 				 * Migration could be cancelled here, but
 				 * as long as it is still not sure when it
 				 * will be packed up, just let softclock()
 				 * take care of it.
 				 */
 				cc->cc_exec_entity[direct].cc_waiting = true;
 				DROP_GIANT();
 				CC_UNLOCK(cc);
 				sleepq_add(
 				    &cc->cc_exec_entity[direct].cc_waiting,
 				    &cc->cc_lock.lock_object, "codrain",
 				    SLEEPQ_SLEEP, 0);
 				sleepq_wait(
 				    &cc->cc_exec_entity[direct].cc_waiting,
 					     0);
 				sq_locked = 0;
 				old_cc = NULL;
 
 				/* Reacquire locks previously released. */
 				PICKUP_GIANT();
 				CC_LOCK(cc);
 			}
 		} else if (use_lock &&
 			    !cc->cc_exec_entity[direct].cc_cancel) {
 			/*
 			 * The current callout is waiting for its
 			 * lock which we hold.  Cancel the callout
 			 * and return.  After our caller drops the
 			 * lock, the callout will be skipped in
 			 * softclock().
 			 */
 			cc->cc_exec_entity[direct].cc_cancel = true;
 			CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			KASSERT(!cc_cce_migrating(cc, direct),
 			    ("callout wrongly scheduled for migration"));
 			CC_UNLOCK(cc);
 			KASSERT(!sq_locked, ("sleepqueue chain locked"));
 			return (1);
 		} else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) {
 			/*
 			 * A migration was deferred for this callout; clearing
 			 * the flag is reported to the caller as a successful
 			 * stop.
 			 */
 			c->c_flags &= ~CALLOUT_DFRMIGRATION;
 			CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			CC_UNLOCK(cc);
 			return (1);
 		}
 		/* Reached after waiting in the 'safe' case, or when nothing applied. */
 		CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 		    c, c->c_func, c->c_arg);
 		CC_UNLOCK(cc);
 		KASSERT(!sq_locked, ("sleepqueue chain still locked"));
 		return (0);
 	}
 	if (sq_locked)
 		sleepq_release(&cc->cc_exec_entity[direct].cc_waiting);
 
 	c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
 
 	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 	    c, c->c_func, c->c_arg);
 	/*
 	 * The callout is still pending: unlink it from whichever queue holds
 	 * it (its list when CALLOUT_PROCESSED is clear, cc_expireq otherwise).
 	 */
 	if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
 		if (cc->cc_exec_next_dir == c)
 			cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
 		LIST_REMOVE(c, c_links.le);
 	} else
 		TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
 	callout_cc_del(c, cc);
 
 	CC_UNLOCK(cc);
 	return (1);
 }
 
 /*
  * Initialise callout c.  The structure is zeroed first.  A non-zero mpsafe
  * leaves the callout without a lock and sets CALLOUT_RETURNUNLOCKED;
  * otherwise the callout is tied to Giant with no flags set.  The callout
  * is assigned to timeout_cpu in both cases.
  */
 void
 callout_init(c, mpsafe)
 	struct	callout *c;
 	int mpsafe;
 {
 	bzero(c, sizeof(*c));
 	c->c_lock = mpsafe ? NULL : &Giant.lock_object;
 	c->c_flags = mpsafe ? CALLOUT_RETURNUNLOCKED : 0;
 	c->c_cpu = timeout_cpu;
 }
 
 /*
  * Initialise callout c with an explicit lock (which may be NULL).
  *
  * Only CALLOUT_RETURNUNLOCKED and CALLOUT_SHAREDLOCK are accepted in
  * flags; CALLOUT_RETURNUNLOCKED requires a lock, and the lock class must
  * be neither a spin lock nor sleepable.  These rules are enforced with
  * KASSERT().  The callout is assigned to timeout_cpu.
  */
 void
 _callout_init_lock(c, lock, flags)
 	struct	callout *c;
 	struct	lock_object *lock;
 	int flags;
 {
 	const int allowed = CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK;
 
 	bzero(c, sizeof(*c));
 	c->c_lock = lock;
 	KASSERT((flags & ~allowed) == 0,
 	    ("callout_init_lock: bad flags %d", flags));
 	KASSERT(!(lock == NULL && (flags & CALLOUT_RETURNUNLOCKED) != 0),
 	    ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock"));
 	KASSERT(lock == NULL || (LOCK_CLASS(lock)->lc_flags &
 	    (LC_SPINLOCK | LC_SLEEPABLE)) == 0, ("%s: invalid lock class",
 	    __func__));
 	c->c_flags = flags & allowed;
 	c->c_cpu = timeout_cpu;
 }
 
 #ifdef APM_FIXUP_CALLTODO
 /* 
  * Adjust the kernel calltodo timeout list.  This routine is used after 
  * an APM resume to recalculate the calltodo timer list values with the 
  * number of hz's we have been sleeping.  The next hardclock() will detect 
  * that there are fired timers and run softclock() to execute them.
  *
  * Please note, I have not done an exhaustive analysis of what code this
  * might break.  I am motivated to have my select()'s and alarm()'s that
  * have expired during suspend firing upon resume so that the applications
  * which set the timer can do the maintenance the timer was for as close
  * as possible to the originally intended time.  Testing this code for a 
  * week showed that resuming from a suspend resulted in 22 to 25 timers 
  * firing, which seemed independent of whether the suspend was 2 hours or
  * 2 days.  Your mileage may vary.   - Ken Key <key@cs.utk.edu>
  */
 /*
  * time_change: the amount of time to account for, as a timeval.  A
  * negative tv_sec makes the function return without doing anything.
  *
  * NOTE(review): this block is only compiled when APM_FIXUP_CALLTODO is
  * defined.  It uses 'cc' and 'calltodo', neither of which is declared in
  * this function - confirm that it still builds before enabling it.
  */
 void
 adjust_timeout_calltodo(time_change)
     struct timeval *time_change;
 {
 	register struct callout *p;
 	unsigned long delta_ticks;
 
 	/* 
 	 * How many ticks were we asleep?
 	 * (stolen from tvtohz()).
 	 */
 
 	/* Don't do anything */
 	if (time_change->tv_sec < 0)
 		return;
 	else if (time_change->tv_sec <= LONG_MAX / 1000000)
 		/* Small enough to do the whole conversion in microseconds. */
 		delta_ticks = (time_change->tv_sec * 1000000 +
 			       time_change->tv_usec + (tick - 1)) / tick + 1;
 	else if (time_change->tv_sec <= LONG_MAX / hz)
 		/* Convert seconds and microseconds separately. */
 		delta_ticks = time_change->tv_sec * hz +
 			      (time_change->tv_usec + (tick - 1)) / tick + 1;
 	else
 		delta_ticks = LONG_MAX;
 
 	/* Clamp the result to INT_MAX ticks. */
 	if (delta_ticks > INT_MAX)
 		delta_ticks = INT_MAX;
 
 	/* 
 	 * Now rip through the timer calltodo list looking for timers
 	 * to expire.
 	 */
 
 	/* don't collide with softclock() */
 	CC_LOCK(cc);
 	for (p = calltodo.c_next; p != NULL; p = p->c_next) {
 		p->c_time -= delta_ticks;
 
 		/* Break if the timer had more time on it than delta_ticks */
 		if (p->c_time > 0)
 			break;
 
 		/* take back the ticks the timer didn't use (p->c_time <= 0) */
 		delta_ticks = -p->c_time;
 	}
 	CC_UNLOCK(cc);
 
 	return;
 }
 #endif /* APM_FIXUP_CALLTODO */
 
 /*
  * Return the flsl()-style index of the highest set bit of sbt after half
  * of its value has been added to it.  When long is narrower than
  * sbintime_t, values of at least SBT_1S are handled by looking at the
  * upper 32 bits and adding 32 to the result.
  */
 static int
 flssbt(sbintime_t sbt)
 {
 
 	sbt += (uint64_t)sbt >> 1;
 	if (sizeof(long) < sizeof(sbintime_t) && sbt >= SBT_1S)
 		return (flsl(((uint64_t)sbt) >> 32) + 32);
 	return (flsl(sbt));
 }
 
 /*
  * Dump immediate statistic snapshot of the scheduled callouts.
  *
  * The snapshot is only produced when a new value is written to the OID
  * (req->newptr != NULL); the written integer itself is not used.  All
  * output goes through printf().  Returns the sysctl_handle_int() result.
  */
 static int
 sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS)
 {
 	struct callout *tmp;
 	struct callout_cpu *cc;
 	struct callout_list *sc;
 	sbintime_t avgpr, avgt, maxpr, maxt, medpr, medt, now, spr, st, t;
 	int ct[64], cpr[64], ccpbk[32];
 	int error, val, i, count, tcum, pcum, maxc, c, medc;
 #ifdef SMP
 	int cpu;
 #endif
 
 	val = 0;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	count = maxc = 0;
 	st = spr = maxt = maxpr = 0;
 	bzero(ccpbk, sizeof(ccpbk));
 	bzero(ct, sizeof(ct));
 	bzero(cpr, sizeof(cpr));
 	now = sbinuptime();
 	/*
 	 * Walk every callwheel bucket (of every CPU on SMP) and collect
 	 * sums, maxima and histograms: ct[]/cpr[] are indexed by flssbt()
 	 * of the remaining time/precision, ccpbk[] by fls() of the scaled
 	 * bucket length.
 	 */
 #ifdef SMP
 	CPU_FOREACH(cpu) {
 		cc = CC_CPU(cpu);
 #else
 		cc = CC_CPU(timeout_cpu);
 #endif
 		CC_LOCK(cc);
 		for (i = 0; i < callwheelsize; i++) {
 			sc = &cc->cc_callwheel[i];
 			c = 0;
 			LIST_FOREACH(tmp, sc, c_links.le) {
 				c++;
 				t = tmp->c_time - now;
 				/* Callouts already due count as zero. */
 				if (t < 0)
 					t = 0;
 				st += t / SBT_1US;
 				spr += tmp->c_precision / SBT_1US;
 				if (t > maxt)
 					maxt = t;
 				if (tmp->c_precision > maxpr)
 					maxpr = tmp->c_precision;
 				ct[flssbt(t)]++;
 				cpr[flssbt(tmp->c_precision)]++;
 			}
 			if (c > maxc)
 				maxc = c;
 			ccpbk[fls(c + c / 2)]++;
 			count += c;
 		}
 		CC_UNLOCK(cc);
 #ifdef SMP
 	}
 #endif
 
 	/*
 	 * Medians: find the histogram slot at which the cumulative count
 	 * reaches half of the total.
 	 */
 	for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++)
 		tcum += ct[i];
 	medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
 	for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++)
 		pcum += cpr[i];
 	medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
 	for (i = 0, c = 0; i < 32 && c < count / 2; i++)
 		c += ccpbk[i];
 	medc = (i >= 2) ? (1 << (i - 2)) : 0;
 
 	/*
 	 * Averages of the SBT_1US-scaled sums.  With no callout scheduled
 	 * count is 0, so report 0 instead of dividing by zero.
 	 */
 	avgt = (count != 0) ? st / count : 0;
 	avgpr = (count != 0) ? spr / count : 0;
 
 	printf("Scheduled callouts statistic snapshot:\n");
 	printf("  Callouts: %6d  Buckets: %6d*%-3d  Bucket size: 0.%06ds\n",
 	    count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT);
 	printf("  C/Bk: med %5d         avg %6d.%06jd  max %6d\n",
 	    medc,
 	    count / callwheelsize / mp_ncpus,
 	    (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000,
 	    maxc);
 	printf("  Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
 	    medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32,
 	    avgt / 1000000, avgt % 1000000,
 	    maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32);
 	printf("  Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
 	    medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32,
 	    avgpr / 1000000, avgpr % 1000000,
 	    maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32);
 	printf("  Distribution:       \tbuckets\t   time\t   tcum\t"
 	    "   prec\t   pcum\n");
 	/* One line per non-empty histogram slot, with running totals. */
 	for (i = 0, tcum = pcum = 0; i < 64; i++) {
 		if (ct[i] == 0 && cpr[i] == 0)
 			continue;
 		t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0;
 		tcum += ct[i];
 		pcum += cpr[i];
 		printf("  %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n",
 		    t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
 		    i - 1 - (32 - CC_HASH_SHIFT),
 		    ct[i], tcum, cpr[i], pcum);
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_callout_stat, "I",
     "Dump immediate statistic snapshot of the scheduled callouts");
Index: head/sys/kern/sched_4bsd.c
===================================================================
--- head/sys/kern/sched_4bsd.c	(revision 258540)
+++ head/sys/kern/sched_4bsd.c	(revision 258541)
@@ -1,1784 +1,1783 @@
 /*-
  * Copyright (c) 1982, 1986, 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_sched.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/cpuset.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/kthread.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 #include <sys/turnstile.h>
 #include <sys/umtx.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 int				dtrace_vtime_active;
 dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
 #endif
 
 /*
  * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
  * the range 100-256 Hz (approximately).
  */
 #define	ESTCPULIM(e) \
     min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
     RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
 #ifdef SMP
 #define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
 #else
 #define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
 #endif
 #define	NICE_WEIGHT		1	/* Priorities per nice level. */
 
 #define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
 
 /*
  * The schedulable entity that runs a context.
  * This is  an extension to the thread structure and is tailored to
  * the requirements of this scheduler
  */
 struct td_sched {
 	fixpt_t		ts_pctcpu;	/* (j) %cpu during p_swtime. */
 	int		ts_cpticks;	/* (j) Ticks of cpu time. */
 	int		ts_slptime;	/* (j) Seconds !RUNNING. */
 	int		ts_slice;	/* Remaining part of time slice. */
 	int		ts_flags;	/* TSF_* flags defined below. */
 	struct runq	*ts_runq;	/* runq the thread is currently on */
 #ifdef KTR
 	/* Only present in KTR kernels; sized by TS_NAME_LEN. */
 	char		ts_name[TS_NAME_LEN];
 #endif
 };
 
 /* flags kept in td_flags */
 #define TDF_DIDRUN	TDF_SCHED0	/* thread actually ran. */
 #define TDF_BOUND	TDF_SCHED1	/* Bound to one CPU. */
 #define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */
 
 /* flags kept in ts_flags */
 #define	TSF_AFFINITY	0x0001		/* Has a non-"full" CPU set. */
 
 #define SKE_RUNQ_PCPU(ts)						\
     ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
 
 #define	THREAD_CAN_SCHED(td, cpu)	\
     CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
 
 static struct td_sched td_sched0;
 struct mtx sched_lock;
 
 static int	realstathz = 127; /* stathz is sometimes 0 and run off of hz. */
 static int	sched_tdcnt;	/* Total runnable threads in the system. */
 static int	sched_slice = 12; /* Thread run time before rescheduling. */
 
 static void	setup_runqs(void);
 static void	schedcpu(void);
 static void	schedcpu_thread(void);
 static void	sched_priority(struct thread *td, u_char prio);
 static void	sched_setup(void *dummy);
 static void	maybe_resched(struct thread *td);
 static void	updatepri(struct thread *td);
 static void	resetpriority(struct thread *td);
 static void	resetpriority_thread(struct thread *td);
 #ifdef SMP
 static int	sched_pickcpu(struct thread *td);
 static int	forward_wakeup(int cpunum);
 static void	kick_other_cpu(int pri, int cpuid);
 #endif
 
 static struct kproc_desc sched_kp = {
         "schedcpu",
         schedcpu_thread,
         NULL
 };
 SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start,
     &sched_kp);
 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
 
 static void sched_initticks(void *dummy);
 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
     NULL);
 
 /*
  * Global run queue.
  */
 static struct runq runq;
 
 #ifdef SMP
 /*
  * Per-CPU run queues
  */
 static struct runq runq_pcpu[MAXCPU];
 long runq_length[MAXCPU];
 
 static cpuset_t idle_cpus_mask;
 #endif
 
 struct pcpuidlestat {
 	u_int idlecalls;
 	u_int oldidlecalls;
 };
 static DPCPU_DEFINE(struct pcpuidlestat, idlestat);
 
 /*
  * Initialise the run queues: every per-CPU queue on SMP kernels, followed
  * by the global queue.
  */
 static void
 setup_runqs(void)
 {
 #ifdef SMP
 	struct runq *rq;
 
 	for (rq = runq_pcpu; rq < &runq_pcpu[MAXCPU]; rq++)
 		runq_init(rq);
 #endif
 
 	runq_init(&runq);
 }
 
 /*
  * Sysctl handler for the quantum expressed in microseconds.
  *
  * Reads report sched_slice multiplied by the length of one realstathz
  * period.  Writes must be positive (EINVAL otherwise); the value is
  * rounded to the nearest whole number of periods, with a minimum of one,
  * and hogticks is recomputed from the new slice.
  */
 static int
 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
 {
 	int period_us, quantum_us, rc;
 
 	period_us = 1000000 / realstathz;
 	quantum_us = period_us * sched_slice;
 	rc = sysctl_handle_int(oidp, &quantum_us, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (quantum_us <= 0)
 		return (EINVAL);
 
 	/* Round to the nearest period, never going below a single one. */
 	sched_slice = imax(1, (quantum_us + period_us / 2) / period_us);
 	hogticks = imax(1,
 	    (2 * hz * sched_slice + realstathz / 2) / realstathz);
 	return (0);
 }
 
 SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");
 
 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
     "Scheduler name");
 SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, sysctl_kern_quantum, "I",
     "Quantum for timeshare threads in microseconds");
 SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
     "Quantum for timeshare threads in stathz ticks");
 #ifdef SMP
 /* Enable forwarding of wakeups to all other cpus */
 static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL,
     "Kernel SMP");
 
 static int runq_fuzz = 1;
 SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
 
 static int forward_wakeup_enabled = 1;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
 	   &forward_wakeup_enabled, 0,
 	   "Forwarding of wakeup to idle CPUs");
 
 static int forward_wakeups_requested = 0;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
 	   &forward_wakeups_requested, 0,
 	   "Requests for Forwarding of wakeup to idle CPUs");
 
 static int forward_wakeups_delivered = 0;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
 	   &forward_wakeups_delivered, 0,
 	   "Completed Forwarding of wakeup to idle CPUs");
 
 static int forward_wakeup_use_mask = 1;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
 	   &forward_wakeup_use_mask, 0,
 	   "Use the mask of idle cpus");
 
 static int forward_wakeup_use_loop = 0;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
 	   &forward_wakeup_use_loop, 0,
 	   "Use a loop to find idle cpus");
 
 #endif
 #if 0
 static int sched_followon = 0;
 SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
 	   &sched_followon, 0,
 	   "allow threads to share a quantum");
 #endif
 
 SDT_PROVIDER_DEFINE(sched);
 
 SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *", 
     "struct proc *", "uint8_t");
 SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *", 
     "struct proc *", "void *");
 SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *", 
     "struct proc *", "void *", "int");
 SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *", 
     "struct proc *", "uint8_t", "struct thread *");
 SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
 SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *",
     "struct proc *");
 SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
 SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
 SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *",
     "struct proc *");
 
 /*
  * Count one more thread in sched_tdcnt and publish the new value through
  * the KTR "load" counter and the load_change probe.
  */
 static __inline void
 sched_load_add(void)
 {
 
 	sched_tdcnt += 1;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
 	SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
 }
 
 /*
  * Count one thread less in sched_tdcnt and publish the new value through
  * the KTR "load" counter and the load_change probe.
  */
 static __inline void
 sched_load_rem(void)
 {
 
 	sched_tdcnt -= 1;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
 	SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
 }
 /*
  * Arrange to reschedule if necessary, taking the priorities and
  * schedulers into account.
  */
 static void
 maybe_resched(struct thread *td)
 {
 	struct thread *self;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	self = curthread;
 
 	/* Nothing to do unless td has a numerically lower priority. */
 	if (self->td_priority <= td->td_priority)
 		return;
 	self->td_flags |= TDF_NEEDRESCHED;
 }
 
 /*
  * This function is called when a thread is about to be put on run queue
  * because it has been made runnable or its priority has been adjusted.  It
  * determines if the new thread should be immediately preempted to.  If so,
  * it switches to it and eventually returns true.  If not, it returns false
  * so that the caller may place the thread on an appropriate run queue.
  */
 int
 maybe_preempt(struct thread *td)
 {
 #ifdef PREEMPTION
 	struct thread *ctd;
 	int cpri, pri;
 
 	/*
 	 * The new thread should not preempt the current thread if any of the
 	 * following conditions are true:
 	 *
 	 *  - The kernel is in the throes of crashing (panicstr).
 	 *  - The current thread has a higher (numerically lower) or
 	 *    equivalent priority.  Note that this prevents curthread from
 	 *    trying to preempt to itself.
 	 *  - It is too early in the boot for context switches (cold is set).
 	 *  - The current thread has an inhibitor set or is in the process of
 	 *    exiting.  In this case, the current thread is about to switch
 	 *    out anyways, so there's no point in preempting.  If we did,
 	 *    the current thread would not be properly resumed as well, so
 	 *    just avoid that whole landmine.
 	 *  - If the new thread's priority is not a realtime priority and
 	 *    the current thread's priority is not an idle priority and
 	 *    FULL_PREEMPTION is disabled.
 	 *
 	 * If all of these conditions are false, but the current thread is in
 	 * a nested critical section, then we have to defer the preemption
 	 * until we exit the critical section.  Otherwise, switch immediately
 	 * to the new thread.
 	 */
 	ctd = curthread;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 			("maybe_preempt: trying to run inhibited thread"));
 	pri = td->td_priority;
 	cpri = ctd->td_priority;
 	/* The first four conditions from the list above. */
 	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
 	    TD_IS_INHIBITED(ctd))
 		return (0);
 #ifndef FULL_PREEMPTION
 	/* The last condition from the list above. */
 	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
 		return (0);
 #endif
 
 	/* Nested critical section: record the owed preemption and defer. */
 	if (ctd->td_critnest > 1) {
 		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
 		    ctd->td_critnest);
 		ctd->td_owepreempt = 1;
 		return (0);
 	}
 	/*
 	 * Thread is runnable but not yet put on system run queue.
 	 */
 	MPASS(ctd->td_lock == td->td_lock);
 	MPASS(TD_ON_RUNQ(td));
 	TD_SET_RUNNING(td);
 	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
 	    td->td_proc->p_pid, td->td_name);
 	mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, td);
 	/*
 	 * td's lock pointer may have changed.  We have to return with it
 	 * locked.
 	 */
 	spinlock_enter();
 	thread_unlock(ctd);
 	thread_lock(td);
 	spinlock_exit();
 	return (1);
 #else
 	/* Built without PREEMPTION: never preempt. */
 	return (0);
 #endif
 }
 
 /*
  * Constants for digital decay and forget:
  *	90% of (td_estcpu) usage in 5 * loadav time
  *	95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
  *          Note that, as ps(1) mentions, this can let percentages
  *          total over 100% (I've seen 137.9% for 3 processes).
  *
  * Note that schedclock() updates td_estcpu and p_cpticks asynchronously.
  *
  * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds.
  * That is, the system wants to compute a value of decay such
  * that the following for loop:
  * 	for (i = 0; i < (5 * loadavg); i++)
  * 		td_estcpu *= decay;
  * will compute
  * 	td_estcpu *= 0.1;
  * for all values of loadavg:
  *
  * Mathematically this loop can be expressed by saying:
  * 	decay ** (5 * loadavg) ~= .1
  *
  * The system computes decay as:
  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
  *
  * We wish to prove that the system's computation of decay
  * will always fulfill the equation:
  * 	decay ** (5 * loadavg) ~= .1
  *
  * If we compute b as:
  * 	b = 2 * loadavg
  * then
  * 	decay = b / (b + 1)
  *
  * We now need to prove two things:
  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
  *
  * Facts:
  *         For x close to zero, exp(x) =~ 1 + x, since
  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
  *         For x close to zero, ln(1+x) =~ x, since
  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
  *         ln(.1) =~ -2.30
  *
  * Proof of (1):
  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
  *	solving for factor,
  *      ln(factor) =~ (-2.30/5*loadav), or
  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
  *
  * Proof of (2):
  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
  *	solving for power,
  *      power*ln(b/(b+1)) =~ -2.30, or
  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
  *
  * Actual power values for the implemented algorithm are as follows:
  *      loadav: 1       2       3       4
  *      power:  5.68    10.32   14.94   19.55
  */
 
 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
 #define	loadfactor(loadav)	(2 * (loadav))
 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
 
 /* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
 SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 
 /*
  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
  *
  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
  *
  * If you don't want to bother with the faster/more-accurate formula, you
  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
  * (more general) method of calculating the %age of CPU used by a process.
  */
 #define	CCPU_SHIFT	11
 
 /*
  * Recompute process priorities, every hz ticks.
  * MP-safe, called without the Giant mutex.
  */
 /* ARGSUSED */
 static void
 schedcpu(void)
 {
 	/* Decay factor derived from the first load average slot. */
 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 	struct thread *td;
 	struct proc *p;
 	struct td_sched *ts;
 	int awake;
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		/* Processes still in the PRS_NEW state are skipped. */
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		FOREACH_THREAD_IN_PROC(p, td) {
 			awake = 0;
 			thread_lock(td);
 			ts = td->td_sched;
 			/*
 			 * Increment sleep time (if sleeping).  We
 			 * ignore overflow, as above.
 			 */
 			/*
 			 * The td_sched slptimes are not touched in wakeup
 			 * because the thread may not HAVE everything in
 			 * memory? XXX I think this is out of date.
 			 */
 			if (TD_ON_RUNQ(td)) {
 				awake = 1;
 				td->td_flags &= ~TDF_DIDRUN;
 			} else if (TD_IS_RUNNING(td)) {
 				awake = 1;
 				/* Do not clear TDF_DIDRUN */
 			} else if (td->td_flags & TDF_DIDRUN) {
 				awake = 1;
 				td->td_flags &= ~TDF_DIDRUN;
 			}
 
 			/*
 			 * ts_pctcpu is only for ps and ttyinfo().
 			 */
 			ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
 			/*
 			 * If the td_sched has been idle the entire second,
 			 * stop recalculating its priority until
 			 * it wakes up.
 			 */
 			if (ts->ts_cpticks != 0) {
 #if	(FSHIFT >= CCPU_SHIFT)
 				ts->ts_pctcpu += (realstathz == 100)
 				    ? ((fixpt_t) ts->ts_cpticks) <<
 				    (FSHIFT - CCPU_SHIFT) :
 				    100 * (((fixpt_t) ts->ts_cpticks)
 				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
 #else
 				ts->ts_pctcpu += ((FSCALE - ccpu) *
 				    (ts->ts_cpticks *
 				    FSCALE / realstathz)) >> FSHIFT;
 #endif
 				/* The ticks are consumed by this pass. */
 				ts->ts_cpticks = 0;
 			}
 			/*
 			 * If there are ANY running threads in this process,
 			 * then don't count it as sleeping.
 			 * XXX: this is broken.
 			 */
 			if (awake) {
 				if (ts->ts_slptime > 1) {
 					/*
 					 * In an ideal world, this should not
 					 * happen, because whoever woke us
 					 * up from the long sleep should have
 					 * unwound the slptime and reset our
 					 * priority before we run at the stale
 					 * priority.  Should KASSERT at some
 					 * point when all the cases are fixed.
 					 */
 					updatepri(td);
 				}
 				ts->ts_slptime = 0;
 			} else
 				ts->ts_slptime++;
 			/*
 			 * Asleep for more than one pass: leave td_estcpu and
 			 * the priority untouched.
 			 */
 			if (ts->ts_slptime > 1) {
 				thread_unlock(td);
 				continue;
 			}
 			td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
 		      	resetpriority(td);
 			resetpriority_thread(td);
 			thread_unlock(td);
 		}
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 }
 
 /*
  * Main loop for a kthread that executes schedcpu once a second.
  */
 static void
 schedcpu_thread(void)
 {
 
 	/* Never returns: one schedcpu() pass, then pause for hz ticks. */
 	while (1) {
 		schedcpu();
 		pause("-", hz);
 	}
 }
 
 /*
  * Recalculate the priority of a process after it has slept for a while.
  * For all load averages >= 1 and max td_estcpu of 255, sleeping for at
  * least six times the loadfactor will decay td_estcpu to zero.
  */
 static void
 updatepri(struct thread *td)
 {
 	struct td_sched *sched;
 	fixpt_t lf;
 	unsigned int est;
 
 	sched = td->td_sched;
 	lf = loadfactor(averunnable.ldavg[0]);
 
 	/* Slept long enough: the estimate is simply dropped to zero. */
 	if (sched->ts_slptime > 5 * lf) {
 		td->td_estcpu = 0;
 		return;
 	}
 
 	/*
 	 * Otherwise apply one decay step per remaining unit of sleep time,
 	 * stopping early once the estimate reaches zero.
 	 */
 	est = td->td_estcpu;
 	sched->ts_slptime--;		/* was incremented in schedcpu() */
 	while (est != 0 && --sched->ts_slptime != 0)
 		est = decay_cpu(lf, est);
 	td->td_estcpu = est;
 }
 
 /*
  * Compute the priority of a process when running in user mode.
  * Arrange to reschedule if the resulting priority is better
  * than that of the current process.
  */
 static void
 resetpriority(struct thread *td)
 {
 	unsigned int prio;
 
 	/* Only threads in the PRI_TIMESHARE class are recomputed. */
 	if (td->td_pri_class != PRI_TIMESHARE)
 		return;
 
 	prio = PUSER + td->td_estcpu / INVERSE_ESTCPU_WEIGHT +
 	    NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
 	/* Clamp into [PRI_MIN_TIMESHARE, PRI_MAX_TIMESHARE]. */
 	prio = max(prio, PRI_MIN_TIMESHARE);
 	prio = min(prio, PRI_MAX_TIMESHARE);
 	sched_user_prio(td, prio);
 }
 
 /*
  * Update the thread's priority when the associated process's user
  * priority changes.
  */
 static void
 resetpriority_thread(struct thread *td)
 {
 
 	/* Threads outside the time sharing priority range are left alone. */
 	if (td->td_priority >= PRI_MIN_TIMESHARE &&
 	    td->td_priority <= PRI_MAX_TIMESHARE) {
 		/* XXX the whole needresched thing is broken, but not silly. */
 		maybe_resched(td);
 
 		sched_prio(td, td->td_user_pri);
 	}
 }
 
 /*
  * Scheduler setup hook: initialise the run queues and count thread0 in
  * the scheduler load.  The dummy argument is not used.
  */
 /* ARGSUSED */
 static void
 sched_setup(void *dummy)
 {
 
 	setup_runqs();
 
 	/* Account for thread0. */
 	sched_load_add();
 }
 
 /*
  * This routine determines time constants after stathz and hz are setup.
  */
 static void
 sched_initticks(void *dummy)
 {
 
 	/* Fall back to hz when no separate statistics clock is set. */
 	if (stathz != 0)
 		realstathz = stathz;
 	else
 		realstathz = hz;
 
 	sched_slice = realstathz / 10;	/* ~100ms */
 
 	/* Two slices, converted from stathz to hz ticks (rounded, min 1). */
 	hogticks = imax(1,
 	    (2 * hz * sched_slice + realstathz / 2) / realstathz);
 }
 
 /* External interfaces start here */
 
 /*
  * Very early in the boot some setup of scheduler-specific
  * parts of proc0 and of some scheduler resources needs to be done.
  * Called from:
  *  proc0_init()
  */
 void
 schedinit(void)
 {
 	/*
 	 * Set up the scheduler specific parts of proc0.
 	 */
 	proc0.p_sched = NULL; /* XXX */
 	/* thread0 uses the statically allocated td_sched0 and sched_lock. */
 	thread0.td_sched = &td_sched0;
 	thread0.td_lock = &sched_lock;
 	/* Give thread0 an initial time slice. */
 	td_sched0.ts_slice = sched_slice;
 	/* sched_lock is a recursable spin mutex. */
 	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
 }
 
 int
 sched_runnable(void)
 {
 	int found;
 
 	/* Global run queue first; on SMP add this CPU's private queue. */
 	found = runq_check(&runq);
 #ifdef SMP
 	found += runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
 #endif
 	return (found);
 }
 
 int
 sched_rr_interval(void)
 {
 	int interval;
 
 	/* Convert sched_slice from stathz to hz, rounding to nearest. */
 	interval = (sched_slice * hz + realstathz / 2) / realstathz;
 
 	/* Never report less than one tick. */
 	return (imax(1, interval));
 }
 
 /*
  * We adjust the priority of the current process.  The priority of
  * a process gets worse as it accumulates CPU time.  The cpu usage
  * estimator (td_estcpu) is increased here.  resetpriority() will
  * compute a different priority each time td_estcpu increases by
  * INVERSE_ESTCPU_WEIGHT
  * (until MAXPRI is reached).  The cpu usage estimator ramps up
  * quite quickly when the process is running (linearly), and decays
  * away exponentially, at a rate which is proportionally slower when
  * the system is busy.  The basic principle is that the system will
  * 90% forget that the process used a lot of CPU time in 5 * loadav
  * seconds.  This causes the system to favor processes which haven't
  * run much recently, and to round-robin among other processes.
  */
 void
 sched_clock(struct thread *td)
 {
 	struct td_sched *sched;
 	struct pcpuidlestat *idle;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	sched = td->td_sched;
 
 	/* Charge this tick to the thread and bump its CPU estimate. */
 	sched->ts_cpticks++;
 	td->td_estcpu = ESTCPULIM(td->td_estcpu + 1);
 
 	/* Recompute the priority each INVERSE_ESTCPU_WEIGHT ticks. */
 	if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
 		resetpriority(td);
 		resetpriority_thread(td);
 	}
 
 	/*
 	 * Force a context switch if the current thread has used up a full
 	 * time slice (default is 100ms).  Idle threads have no slice.
 	 */
 	if (!TD_IS_IDLETHREAD(td)) {
 		sched->ts_slice--;
 		if (sched->ts_slice <= 0) {
 			sched->ts_slice = sched_slice;
 			td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
 		}
 	}
 
 	/* Roll this CPU's idle-call counter over into the old slot. */
 	idle = DPCPU_PTR(idlestat);
 	idle->oldidlecalls = idle->idlecalls;
 	idle->idlecalls = 0;
 }
 
 /*
  * Charge child's scheduling CPU usage to parent.
  */
 void
 sched_exit(struct proc *p, struct thread *td)
 {
 	struct thread *parent;
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit",
 	    "prio:%d", td->td_priority);
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* The first thread of p absorbs the exiting thread's usage. */
 	parent = FIRST_THREAD_IN_PROC(p);
 	sched_exit_thread(parent, td);
 }
 
 void
 sched_exit_thread(struct thread *td, struct thread *child)
 {
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit",
 	    "prio:%d", child->td_priority);
 
 	/* Fold the child's CPU estimate into td, capped by ESTCPULIM. */
 	thread_lock(td);
 	td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
 	thread_unlock(td);
 
 	/* Drop the child from the load count unless it is exempt. */
 	thread_lock(child);
 	if (!(child->td_flags & TDF_NOLOAD))
 		sched_load_rem();
 	thread_unlock(child);
 }
 
 /*
  * Scheduler hook for fork: delegates entirely to sched_fork_thread().
  */
 void
 sched_fork(struct thread *td, struct thread *childtd)
 {
 	sched_fork_thread(td, childtd);
 }
 
 void
 sched_fork_thread(struct thread *td, struct thread *childtd)
 {
 	struct td_sched *childts;
 
 	/* The child inherits the parent's CPU estimate and cpuset. */
 	childtd->td_estcpu = td->td_estcpu;
 	childtd->td_lock = &sched_lock;
 	childtd->td_cpuset = cpuset_ref(td->td_cpuset);
 	childtd->td_priority = childtd->td_base_pri;
 
 	/*
 	 * Start from zeroed scheduler state, carrying over only the
 	 * parent's affinity flag, and give the child a one-tick slice.
 	 */
 	childts = childtd->td_sched;
 	bzero(childts, sizeof(*childts));
 	childts->ts_flags = td->td_sched->ts_flags & TSF_AFFINITY;
 	childts->ts_slice = 1;
 }
 
 void
 sched_nice(struct proc *p, int nice)
 {
 	struct thread *thr;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_nice = nice;
 
 	/* Recompute the priority of every thread with the new nice value. */
 	FOREACH_THREAD_IN_PROC(p, thr) {
 		thread_lock(thr);
 		resetpriority(thr);
 		resetpriority_thread(thr);
 		thread_unlock(thr);
 	}
 }
 
 /*
  * Set the priority class of a thread.  The thread lock must be held.
  */
 void
 sched_class(struct thread *td, int class)
 {
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_pri_class = class;
 }
 
 /*
  * Adjust the priority of a thread.
  */
 static void
 sched_priority(struct thread *td, u_char prio)
 {
 
 	/* Trace and probe the requested change before applying it. */
 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
 	    "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
 	if (td != curthread && td->td_priority < prio) {
 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
 		SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio,
 		    curthread);
 	}
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	if (td->td_priority != prio) {
 		td->td_priority = prio;
 
 		/*
 		 * A queued thread whose new priority maps to a different
 		 * run-queue index is removed and re-added.
 		 */
 		if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
 			sched_rem(td);
 			sched_add(td, SRQ_BORING);
 		}
 	}
 }
 
 /*
  * Update a thread's priority when it is lent another thread's
  * priority.
  */
 void
 sched_lend_prio(struct thread *td, u_char prio)
 {
 
 	/* Flag the thread as borrowing, then apply the lent priority. */
 	td->td_flags |= TDF_BORROWING;
 	sched_priority(td, prio);
 }
 
 /*
  * Restore a thread's priority when priority propagation is
  * over.  The prio argument is the minimum priority the thread
  * needs to have to satisfy other possible priority lending
  * requests.  If the thread's regular priority is less
  * important than prio the thread will keep a priority boost
  * of prio.
  */
 void
 sched_unlend_prio(struct thread *td, u_char prio)
 {
 	u_char base_pri;
 
 	/*
 	 * Time-sharing threads fall back to their user priority; all
 	 * others fall back to their base priority.
 	 */
 	base_pri = (td->td_base_pri >= PRI_MIN_TIMESHARE &&
 	    td->td_base_pri <= PRI_MAX_TIMESHARE) ?
 	    td->td_user_pri : td->td_base_pri;
 
 	/* A remaining request numerically below base_pri stays lent. */
 	if (prio < base_pri) {
 		sched_lend_prio(td, prio);
 		return;
 	}
 
 	/* Nothing left to borrow: restore the thread's own priority. */
 	td->td_flags &= ~TDF_BORROWING;
 	sched_prio(td, base_pri);
 }
 
 void
 sched_prio(struct thread *td, u_char prio)
 {
 	u_char previous;
 
 	/* The base priority is always updated. */
 	td->td_base_pri = prio;
 
 	/*
 	 * While borrowing, never move the effective priority to a
 	 * numerically larger value than the one currently held.
 	 */
 	if ((td->td_flags & TDF_BORROWING) != 0 && td->td_priority < prio)
 		return;
 
 	/* Apply the new effective priority, remembering the old one. */
 	previous = td->td_priority;
 	sched_priority(td, prio);
 
 	/*
 	 * A thread blocked on a turnstile whose priority changed must let
 	 * the turnstile update its state.
 	 */
 	if (TD_ON_LOCK(td) && previous != prio)
 		turnstile_adjust(td, previous);
 }
 
 void
 sched_user_prio(struct thread *td, u_char prio)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_base_user_pri = prio;
 
 	/*
 	 * The effective user priority changes only when the lent user
 	 * priority is numerically larger than the new value.
 	 */
 	if (td->td_lend_user_pri > prio)
 		td->td_user_pri = prio;
 }
 
 void
 sched_lend_user_prio(struct thread *td, u_char prio)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/* Record the loan; the effective value is the smaller of the two. */
 	td->td_lend_user_pri = prio;
 	td->td_user_pri = min(prio, td->td_base_user_pri);
 
 	/* Numerically larger current priority: adopt the user priority. */
 	if (td->td_priority > td->td_user_pri) {
 		sched_prio(td, td->td_user_pri);
 		return;
 	}
 
 	/* Otherwise, if they differ, just request a reschedule. */
 	if (td->td_priority != td->td_user_pri)
 		td->td_flags |= TDF_NEEDRESCHED;
 }
 
 void
 sched_sleep(struct thread *td, int pri)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/* Record when the sleep began and restart the sleep-time count. */
 	td->td_slptick = ticks;
 	td->td_sched->ts_slptime = 0;
 
 	/* A non-zero pri is applied to time-sharing threads only. */
 	if (pri != 0) {
 		if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
 			sched_prio(td, pri);
 	}
 
 	/* Suspended threads and sleeps at PSOCK or above may be swapped. */
 	if (pri >= PSOCK || TD_IS_SUSPENDED(td))
 		td->td_flags |= TDF_CANSWAP;
 }
 
 /*
  * Switch away from td.  If newtd is non-NULL it is run next; otherwise
  * choosethread() picks the next thread.  Called with td's thread lock
  * held; on return td->td_lock is sched_lock (asserted at the end).
  * The flags argument is not referenced in this function.
  */
 void
 sched_switch(struct thread *td, struct thread *newtd, int flags)
 {
 	struct mtx *tmtx;
 	struct td_sched *ts;
 	struct proc *p;
 	int preempted;
 
 	tmtx = NULL;
 	ts = td->td_sched;
 	p = td->td_proc;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
 	 * Switch to the sched lock to fix things up and pick
 	 * a new thread.
 	 * Block the td_lock in order to avoid breaking the critical path.
 	 */
 	if (td->td_lock != &sched_lock) {
 		mtx_lock_spin(&sched_lock);
 		tmtx = thread_lock_block(td);
 	}
 
 	/* The outgoing thread stops counting toward the load. */
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_rem();
 
 	td->td_lastcpu = td->td_oncpu;
 	/* Treat the switch as a preemption unless the slice just ended. */
 	preempted = !(td->td_flags & TDF_SLICEEND);
 	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
 	td->td_owepreempt = 0;
 	td->td_oncpu = NOCPU;
 
 	/*
 	 * At the last moment, if this thread is still marked RUNNING,
 	 * then put it back on the run queue as it has not been suspended
 	 * or stopped or any thing else similar.  We never put the idle
 	 * threads on the run queue, however.
 	 */
 	if (td->td_flags & TDF_IDLETD) {
 		TD_SET_CAN_RUN(td);
 #ifdef SMP
 		/* This CPU is leaving its idle thread. */
 		CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask);
 #endif
 	} else {
 		if (TD_IS_RUNNING(td)) {
 			/* Put us back on the run queue. */
 			sched_add(td, preempted ?
 			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 			    SRQ_OURSELF|SRQ_YIELDING);
 		}
 	}
 	if (newtd) {
 		/*
 		 * The thread we are about to run needs to be counted
 		 * as if it had been added to the run queue and selected.
 		 * It came from:
 		 * * A preemption
 		 * * An upcall
 		 * * A followon
 		 */
 		KASSERT((newtd->td_inhibitors == 0),
 			("trying to run inhibited thread"));
 		newtd->td_flags |= TDF_DIDRUN;
         	TD_SET_RUNNING(newtd);
 		if ((newtd->td_flags & TDF_NOLOAD) == 0)
 			sched_load_add();
 	} else {
 		/* No explicit successor: let the scheduler choose one. */
 		newtd = choosethread();
 		MPASS(newtd->td_lock == &sched_lock);
 	}
 
 	if (td != newtd) {
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
 
 		SDT_PROBE2(sched, , , off_cpu, td, td->td_proc);
 
 		/* I feel sleepy */
 		lock_profile_release_lock(&sched_lock.lock_object);
 #ifdef KDTRACE_HOOKS
 		/*
 		 * If DTrace has set the active vtime enum to anything
 		 * other than INACTIVE (0), then it should have set the
 		 * function to call.
 		 */
 		if (dtrace_vtime_active)
 			(*dtrace_vtime_switch_func)(newtd);
 #endif
 
 		/* Hand cpu_switch() the blocked lock if there was one. */
 		cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock);
 		lock_profile_obtain_lock_success(&sched_lock.lock_object,
 		    0, 0, __FILE__, __LINE__);
 		/*
 		 * Where am I?  What year is it?
 		 * We are in the same thread that went to sleep above,
 		 * but any amount of time may have passed. All our context
 		 * will still be available as will local variables.
 		 * PCPU values however may have changed as we may have
 		 * changed CPU so don't trust cached values of them.
 		 * New threads will go to fork_exit() instead of here
 		 * so if you change things here you may need to change
 		 * things there too.
 		 *
 		 * If the thread above was exiting it will never wake
 		 * up again here, so either it has saved everything it
 		 * needed to, or the thread_wait() or wait() will
 		 * need to reap it.
 		 */
 
 		SDT_PROBE0(sched, , , on_cpu);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
 	} else
 		SDT_PROBE0(sched, , , remain_cpu);
 
 #ifdef SMP
 	/* Back in an idle thread: mark this CPU idle again. */
 	if (td->td_flags & TDF_IDLETD)
 		CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask);
 #endif
 	/* Record td as the owner of sched_lock and note its CPU. */
 	sched_lock.mtx_lock = (uintptr_t)td;
 	td->td_oncpu = PCPU_GET(cpuid);
 	MPASS(td->td_lock == &sched_lock);
 }
 
 void
 sched_wakeup(struct thread *td)
 {
 	struct td_sched *sched;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	sched = td->td_sched;
 	td->td_flags &= ~TDF_CANSWAP;
 
 	/* After more than one tick asleep, decay and recompute priority. */
 	if (sched->ts_slptime > 1) {
 		updatepri(td);
 		resetpriority(td);
 	}
 
 	/* Reset sleep bookkeeping and start the thread on a full slice. */
 	sched->ts_slptime = 0;
 	sched->ts_slice = sched_slice;
 	td->td_slptick = 0;
 	sched_add(td, SRQ_BORING);
 }
 
 #ifdef SMP
 /*
  * Try to wake idle CPUs so that one of them can pick up newly runnable
  * work.  If cpunum is not NOCPU, only that CPU is considered.
  * Must be called with sched_lock held.
  * Returns 1 if at least one CPU was targeted, 0 otherwise.
  */
 static int
 forward_wakeup(int cpunum)
 {
 	struct pcpu *pc;
 	cpuset_t dontuse, map, map2;
 	u_int id, me;
 	int iscpuset;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	CTR0(KTR_RUNQ, "forward_wakeup()");
 
 	/* Do nothing when forwarding is disabled or no method is selected. */
 	if ((!forward_wakeup_enabled) ||
 	     (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
 		return (0);
 	if (!smp_started || cold || panicstr)
 		return (0);
 
 	forward_wakeups_requested++;
 
 	/*
 	 * Check the idle mask we received against what we calculated
 	 * before in the old version.
 	 */
 	me = PCPU_GET(cpuid);
 
 	/* Don't bother if we should be doing it ourself. */
 	if (CPU_ISSET(me, &idle_cpus_mask) &&
 	    (cpunum == NOCPU || me == cpunum))
 		return (0);
 
 	/* Exclude ourselves plus any stopped or halted CPUs. */
 	CPU_SETOF(me, &dontuse);
 	CPU_OR(&dontuse, &stopped_cpus);
 	CPU_OR(&dontuse, &hlt_cpus_mask);
 	CPU_ZERO(&map2);
 	/* Loop method: collect CPUs currently running their idle thread. */
 	if (forward_wakeup_use_loop) {
 		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 			id = pc->pc_cpuid;
 			if (!CPU_ISSET(id, &dontuse) &&
 			    pc->pc_curthread == pc->pc_idlethread) {
 				CPU_SET(id, &map2);
 			}
 		}
 	}
 
 	/* Mask method: start from idle_cpus_mask minus excluded CPUs. */
 	if (forward_wakeup_use_mask) {
 		map = idle_cpus_mask;
 		CPU_NAND(&map, &dontuse);
 
 		/* If they are both on, compare and use loop if different. */
 		if (forward_wakeup_use_loop) {
 			if (CPU_CMP(&map, &map2)) {
 				printf("map != map2, loop method preferred\n");
 				map = map2;
 			}
 		}
 	} else {
 		map = map2;
 	}
 
 	/* If we only allow a specific CPU, then mask off all the others. */
 	if (cpunum != NOCPU) {
 		/*
 		 * NOTE(review): this bound accepts cpunum == mp_maxcpus;
 		 * confirm whether '<' was intended.
 		 */
 		KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
 		iscpuset = CPU_ISSET(cpunum, &map);
 		if (iscpuset == 0)
 			CPU_ZERO(&map);
 		else
 			CPU_SETOF(cpunum, &map);
 	}
 	if (!CPU_EMPTY(&map)) {
 		forward_wakeups_delivered++;
 		/*
 		 * CPUs for which cpu_idle_wakeup() returns non-zero are
 		 * dropped from the map; the remainder get an IPI_AST.
 		 */
 		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 			id = pc->pc_cpuid;
 			if (!CPU_ISSET(id, &map))
 				continue;
 			if (cpu_idle_wakeup(pc->pc_cpuid))
 				CPU_CLR(id, &map);
 		}
 		if (!CPU_EMPTY(&map))
 			ipi_selected(map, IPI_AST);
 		return (1);
 	}
 	/* Only complain when the caller did not ask for a specific CPU. */
 	if (cpunum == NOCPU)
 		printf("forward_wakeup: Idle processor not found\n");
 	return (0);
 }
 
 static void
 kick_other_cpu(int pri, int cpuid)
 {
 	struct pcpu *pc;
 	int curpri;
 
 	pc = pcpu_find(cpuid);
 
 	/* An idle target only needs waking, by idle hook or AST IPI. */
 	if (CPU_ISSET(cpuid, &idle_cpus_mask)) {
 		forward_wakeups_delivered++;
 		if (!cpu_idle_wakeup(cpuid))
 			ipi_cpu(cpuid, IPI_AST);
 		return;
 	}
 
 	/* Leave the target alone unless pri is numerically lower. */
 	curpri = pc->pc_curthread->td_priority;
 	if (pri >= curpri)
 		return;
 
 #if defined(IPI_PREEMPTION) && defined(PREEMPTION)
 #if !defined(FULL_PREEMPTION)
 	if (pri <= PRI_MAX_ITHD)
 #endif /* ! FULL_PREEMPTION */
 	{
 		ipi_cpu(cpuid, IPI_PREEMPT);
 		return;
 	}
 #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
 
 	/* Otherwise flag the running thread and send an AST IPI. */
 	pc->pc_curthread->td_flags |= TDF_NEEDRESCHED;
 	ipi_cpu(cpuid, IPI_AST);
 }
 #endif /* SMP */
 
 #ifdef SMP
 static int
 sched_pickcpu(struct thread *td)
 {
 	int candidate, best;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	/* Start from the last CPU the thread ran on, if still allowed. */
 	best = THREAD_CAN_SCHED(td, td->td_lastcpu) ? td->td_lastcpu : NOCPU;
 
 	/* Prefer any allowed CPU with a strictly shorter run queue. */
 	CPU_FOREACH(candidate) {
 		if (!THREAD_CAN_SCHED(td, candidate))
 			continue;
 		if (best == NOCPU ||
 		    runq_length[candidate] < runq_length[best])
 			best = candidate;
 	}
 	KASSERT(best != NOCPU, ("no valid CPUs"));
 
 	return (best);
 }
 #endif
 
 /*
  * Make td runnable by placing it on a run queue.  The function has two
  * bodies selected by the preprocessor: the SMP one chooses between the
  * global and a per-CPU run queue and may notify other CPUs; the other
  * always uses the global run queue.  In both, if maybe_preempt()
  * returns non-zero the function returns without queueing the thread.
  */
 void
 sched_add(struct thread *td, int flags)
 #ifdef SMP
 {
 	cpuset_t tidlemsk;
 	struct td_sched *ts;
 	u_int cpu, cpuid;
 	int forwarded = 0;
 	int single_cpu = 0;
 
 	ts = td->td_sched;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
 	    ("sched_add: bad thread state"));
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
 	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
 	    flags & SRQ_PREEMPTED);
 
 
 	/*
 	 * Now that the thread is moving to the run-queue, set the lock
 	 * to the scheduler's lock.
 	 */
 	if (td->td_lock != &sched_lock) {
 		mtx_lock_spin(&sched_lock);
 		thread_lock_set(td, &sched_lock);
 	}
 	TD_SET_RUNQ(td);
 
 	/*
 	 * If SMP is started and the thread is pinned or otherwise limited to
 	 * a specific set of CPUs, queue the thread to a per-CPU run queue.
 	 * Otherwise, queue the thread to the global run queue.
 	 *
 	 * If SMP has not yet been started we must use the global run queue
 	 * as per-CPU state may not be initialized yet and we may crash if we
 	 * try to access the per-CPU run queues.
 	 */
 	if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND ||
 	    ts->ts_flags & TSF_AFFINITY)) {
 		/* Pinned threads stay on the CPU they last ran on. */
 		if (td->td_pinned != 0)
 			cpu = td->td_lastcpu;
 		else if (td->td_flags & TDF_BOUND) {
 			/* Find CPU from bound runq. */
 			KASSERT(SKE_RUNQ_PCPU(ts),
 			    ("sched_add: bound td_sched not on cpu runq"));
 			cpu = ts->ts_runq - &runq_pcpu[0];
 		} else
 			/* Find a valid CPU for our cpuset */
 			cpu = sched_pickcpu(td);
 		ts->ts_runq = &runq_pcpu[cpu];
 		single_cpu = 1;
 		CTR3(KTR_RUNQ,
 		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
 		    cpu);
 	} else {
 		CTR2(KTR_RUNQ,
 		    "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts,
 		    td);
 		cpu = NOCPU;
 		ts->ts_runq = &runq;
 	}
 
 	cpuid = PCPU_GET(cpuid);
 	/* Thread targeted at a different CPU: notify that CPU. */
 	if (single_cpu && cpu != cpuid) {
 	        kick_other_cpu(td->td_priority, cpu);
 	} else {
 		if (!single_cpu) {
 			/* Idle CPUs other than this one, minus halted ones. */
 			tidlemsk = idle_cpus_mask;
 			CPU_NAND(&tidlemsk, &hlt_cpus_mask);
 			CPU_CLR(cpuid, &tidlemsk);
 
 			if (!CPU_ISSET(cpuid, &idle_cpus_mask) &&
 			    ((flags & SRQ_INTR) == 0) &&
 			    !CPU_EMPTY(&tidlemsk))
 				forwarded = forward_wakeup(cpu);
 		}
 
 		/* Nobody else was woken: consider preempting locally. */
 		if (!forwarded) {
 			if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
 				return;
 			else
 				maybe_resched(td);
 		}
 	}
 
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_add();
 	runq_add(ts->ts_runq, td, flags);
 	/* Track the length of the per-CPU run queue that was used. */
 	if (cpu != NOCPU)
 		runq_length[cpu]++;
 }
 #else /* SMP */
 {
 	struct td_sched *ts;
 
 	ts = td->td_sched;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
 	    ("sched_add: bad thread state"));
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
 	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
 	    flags & SRQ_PREEMPTED);
 
 	/*
 	 * Now that the thread is moving to the run-queue, set the lock
 	 * to the scheduler's lock.
 	 */
 	if (td->td_lock != &sched_lock) {
 		mtx_lock_spin(&sched_lock);
 		thread_lock_set(td, &sched_lock);
 	}
 	TD_SET_RUNQ(td);
 	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
 	ts->ts_runq = &runq;
 
 	/*
 	 * If we are yielding (on the way out anyhow) or the thread
 	 * being saved is US, then don't try to be smart about preemption
 	 * or kicking off another CPU as it won't help and may hinder.
 	 * In the YIELDING case, we are about to run whoever is being
 	 * put in the queue anyhow, and in the OURSELF case, we are
 	 * putting ourself on the run queue which also only happens
 	 * when we are about to yield.
 	 */
 	if ((flags & SRQ_YIELDING) == 0) {
 		if (maybe_preempt(td))
 			return;
 	}
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_add();
 	runq_add(ts->ts_runq, td, flags);
 	maybe_resched(td);
 }
 #endif /* SMP */
 
 void
 sched_rem(struct thread *td)
 {
 	struct td_sched *ts;
 
 	ts = td->td_sched;
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_rem: thread swapped out"));
 	KASSERT(TD_ON_RUNQ(td),
 	    ("sched_rem: thread not on run queue"));
 	mtx_assert(&sched_lock, MA_OWNED);
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
 
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_rem();
 #ifdef SMP
 	if (ts->ts_runq != &runq)
 		runq_length[ts->ts_runq - runq_pcpu]--;
 #endif
 	runq_remove(ts->ts_runq, td);
 	TD_SET_CAN_RUN(td);
 }
 
 /*
  * Select threads to run.  Note that running threads still consume a
  * slot.
  */
 struct thread *
 sched_choose(void)
 {
 	struct thread *td;
 	struct runq *rq;
 
 	mtx_assert(&sched_lock,  MA_OWNED);
 #ifdef SMP
 	struct thread *tdcpu;
 
 	rq = &runq;
 	td = runq_choose_fuzz(&runq, runq_fuzz);
 	tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
 
 	if (td == NULL ||
 	    (tdcpu != NULL &&
 	     tdcpu->td_priority < td->td_priority)) {
 		CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu,
 		     PCPU_GET(cpuid));
 		td = tdcpu;
 		rq = &runq_pcpu[PCPU_GET(cpuid)];
 	} else {
 		CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td);
 	}
 
 #else
 	rq = &runq;
 	td = runq_choose(&runq);
 #endif
 
 	if (td) {
 #ifdef SMP
 		if (td == tdcpu)
 			runq_length[PCPU_GET(cpuid)]--;
 #endif
 		runq_remove(rq, td);
 		td->td_flags |= TDF_DIDRUN;
 
 		KASSERT(td->td_flags & TDF_INMEM,
 		    ("sched_choose: thread swapped out"));
 		return (td);
 	}
 	return (PCPU_GET(idlethread));
 }
 
 void
 sched_preempt(struct thread *td)
 {
 
 	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
 	thread_lock(td);
 
 	/*
 	 * Switch right away unless td_critnest is above one, in which
 	 * case only record that a preemption is owed.
 	 */
 	if (td->td_critnest <= 1)
 		mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL);
 	else
 		td->td_owepreempt = 1;
 	thread_unlock(td);
 }
 
 void
 sched_userret(struct thread *td)
 {
 
 	/*
 	 * XXX the locking is deliberately loose here so that the common
 	 * case takes no lock.  Setting td_priority here is essentially an
 	 * incomplete workaround for not setting it properly elsewhere.
 	 * Now that some interrupt handlers are threads, not setting it
 	 * properly elsewhere can clobber it in the window between setting
 	 * it here and returning to user mode, so don't waste time setting
 	 * it perfectly here.
 	 */
 	KASSERT((td->td_flags & TDF_BORROWING) == 0,
 	    ("thread with borrowed priority returning to userland"));
 
 	/* Common case: already at the user priority. */
 	if (td->td_priority == td->td_user_pri)
 		return;
 
 	thread_lock(td);
 	td->td_priority = td->td_user_pri;
 	td->td_base_pri = td->td_user_pri;
 	thread_unlock(td);
 }
 
 void
 sched_bind(struct thread *td, int cpu)
 {
 	struct td_sched *sched;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
 	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
 
 	sched = td->td_sched;
 
 	td->td_flags |= TDF_BOUND;
 #ifdef SMP
 	/* Point the thread at the target CPU's run queue. */
 	sched->ts_runq = &runq_pcpu[cpu];
 
 	/* Not on that CPU yet: switch out voluntarily. */
 	if (PCPU_GET(cpuid) != cpu)
 		mi_switch(SW_VOL, NULL);
 #endif
 }
 
 /*
  * Clear the bound flag on td.  Only the current thread may be unbound,
  * and its thread lock must be held.
  */
 void
 sched_unbind(struct thread *td)
 {
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(td == curthread, ("sched_unbind: can only unbind curthread"));
 	td->td_flags &= ~TDF_BOUND;
 }
 
 /*
  * Report whether td has TDF_BOUND set.  The result is the masked flag
  * value, so it is non-zero (not necessarily 1) when bound.
  */
 int
 sched_is_bound(struct thread *td)
 {
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	return (td->td_flags & TDF_BOUND);
 }
 
 /*
  * Voluntarily give up the CPU: take td's thread lock, perform a
  * voluntary relinquish switch, then drop the lock.
  */
 void
 sched_relinquish(struct thread *td)
 {
 	thread_lock(td);
 	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
 	thread_unlock(td);
 }
 
 /*
  * Return the current value of the sched_tdcnt counter.
  */
 int
 sched_load(void)
 {
 	return (sched_tdcnt);
 }
 
 /*
  * Size of a process structure; no scheduler-private space is added.
  */
 int
 sched_sizeof_proc(void)
 {
 	return (sizeof(struct proc));
 }
 
 /*
  * Size of a thread structure plus its scheduler-private td_sched.
  */
 int
 sched_sizeof_thread(void)
 {
 	return (sizeof(struct thread) + sizeof(struct td_sched));
 }
 
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/* Return the value stored in the scheduler-private data. */
 	return (td->td_sched->ts_pctcpu);
 }
 
 #ifdef	RACCT
 /*
  * Calculates the contribution to the thread cpu usage for the latest
  * (unfinished) second.
  */
 fixpt_t
 sched_pctcpu_delta(struct thread *td)
 {
 	struct td_sched *ts;
 	fixpt_t delta;
 	int realstathz;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td->td_sched;
 	delta = 0;
 	realstathz = stathz ? stathz : hz;
 	if (ts->ts_cpticks != 0) {
 #if	(FSHIFT >= CCPU_SHIFT)
 		delta = (realstathz == 100)
 		    ? ((fixpt_t) ts->ts_cpticks) <<
 		    (FSHIFT - CCPU_SHIFT) :
 		    100 * (((fixpt_t) ts->ts_cpticks)
 		    << (FSHIFT - CCPU_SHIFT)) / realstathz;
 #else
 		delta = ((FSCALE - ccpu) *
 		    (ts->ts_cpticks *
 		    FSCALE / realstathz)) >> FSHIFT;
 #endif
 	}
 
 	return (delta);
 }
 #endif
 
 /*
  * Intentionally empty: this scheduler does nothing here and ignores cnt.
  */
 void
 sched_tick(int cnt)
 {
 }
 
 /*
  * The actual idle process.
  */
 void
 sched_idletd(void *dummy)
 {
 	struct pcpuidlestat *idle;
 
 	THREAD_NO_SLEEPING();
 	idle = DPCPU_PTR(idlestat);
 	while (1) {
 		mtx_assert(&Giant, MA_NOTOWNED);
 
 		/*
 		 * Idle until something is runnable, counting idle calls.
 		 * cpu_idle() gets a non-zero argument once the current and
 		 * previous idle-call counts together exceed 64.
 		 */
 		for (; sched_runnable() == 0; idle->idlecalls++)
 			cpu_idle(idle->idlecalls + idle->oldidlecalls > 64);
 
 		/* Work is available: switch away from the idle thread. */
 		mtx_lock_spin(&sched_lock);
 		mi_switch(SW_VOL | SWT_IDLE, NULL);
 		mtx_unlock_spin(&sched_lock);
 	}
 }
 
 /*
  * A CPU is entering for the first time or a thread is exiting.
  */
 void
 sched_throw(struct thread *td)
 {
 	/*
 	 * Correct spinlock nesting.  The idle thread context that we are
 	 * borrowing was created so that it would start out with a single
 	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
 	 * explicitly acquired locks in this function, the nesting count
 	 * is now 2 rather than 1.  Since we are nested, calling
 	 * spinlock_exit() will simply adjust the counts without allowing
 	 * spin lock using code to interrupt us.
 	 */
 	if (td == NULL) {
 		/* No outgoing thread: take sched_lock and fix nesting. */
 		mtx_lock_spin(&sched_lock);
 		spinlock_exit();
 		/* Seed this CPU's switch timestamps. */
 		PCPU_SET(switchtime, cpu_ticks());
 		PCPU_SET(switchticks, ticks);
 	} else {
 		/* An outgoing thread must already be using sched_lock. */
 		lock_profile_release_lock(&sched_lock.lock_object);
 		MPASS(td->td_lock == &sched_lock);
 	}
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
 	cpu_throw(td, choosethread());	/* doesn't return */
 }
 
 void
 sched_fork_exit(struct thread *td)
 {
 
 	/*
 	 * Finish setting up thread glue so that it begins execution in a
 	 * non-nested critical section with sched_lock held but not recursed.
 	 */
 	td->td_oncpu = PCPU_GET(cpuid);
 	/* Record td as the owner of sched_lock. */
 	sched_lock.mtx_lock = (uintptr_t)td;
 	lock_profile_obtain_lock_success(&sched_lock.lock_object,
 	    0, 0, __FILE__, __LINE__);
 	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
 }
 
 char *
 sched_tdname(struct thread *td)
 {
 #ifdef KTR
 	struct td_sched *sched;
 
 	sched = td->td_sched;
 
 	/* Build the "<name> tid <id>" string on first use and cache it. */
 	if (*sched->ts_name == '\0') {
 		snprintf(sched->ts_name, sizeof(sched->ts_name),
 		    "%s tid %d", td->td_name, td->td_tid);
 	}
 	return (sched->ts_name);
 #else
 	/* Without KTR the plain thread name is returned. */
 	return (td->td_name);
 #endif
 }
 
 #ifdef KTR
 void
 sched_clear_tdname(struct thread *td)
 {
 
 	/* Empty the cached name so sched_tdname() rebuilds it. */
 	td->td_sched->ts_name[0] = '\0';
 }
 #endif
 
 /*
  * Re-evaluate td after its CPU set has changed.  Sets or clears
  * TSF_AFFINITY and, if the thread is restricted, moves it off a run
  * queue or CPU that is no longer in its set.  The thread lock must be
  * held.  Does nothing when SMP is not configured.
  */
 void
 sched_affinity(struct thread *td)
 {
 #ifdef SMP
 	struct td_sched *ts;
 	int cpu;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
 	 * Set the TSF_AFFINITY flag if there is at least one CPU this
 	 * thread can't run on.
 	 */
 	ts = td->td_sched;
 	ts->ts_flags &= ~TSF_AFFINITY;
 	CPU_FOREACH(cpu) {
 		if (!THREAD_CAN_SCHED(td, cpu)) {
 			ts->ts_flags |= TSF_AFFINITY;
 			break;
 		}
 	}
 
 	/*
 	 * If this thread can run on all CPUs, nothing else to do.
 	 */
 	if (!(ts->ts_flags & TSF_AFFINITY))
 		return;
 
 	/* Pinned threads and bound threads should be left alone. */
 	if (td->td_pinned != 0 || td->td_flags & TDF_BOUND)
 		return;
 
 	switch (td->td_state) {
 	case TDS_RUNQ:
 		/*
 		 * If we are on a per-CPU runqueue that is in the set,
 		 * then nothing needs to be done.
 		 */
 		if (ts->ts_runq != &runq &&
 		    THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu))
 			return;
 
 		/* Put this thread on a valid per-CPU runqueue. */
 		sched_rem(td);
 		sched_add(td, SRQ_BORING);
 		break;
 	case TDS_RUNNING:
 		/*
 		 * See if our current CPU is in the set.  If not, force a
 		 * context switch.
 		 */
 		if (THREAD_CAN_SCHED(td, td->td_oncpu))
 			return;
 
 		td->td_flags |= TDF_NEEDRESCHED;
 		/*
 		 * Notify the CPU the thread is actually running on.  The
 		 * loop variable 'cpu' above only identifies the first CPU
 		 * outside the set, which need not be td_oncpu.
 		 */
 		if (td != curthread)
 			ipi_cpu(td->td_oncpu, IPI_AST);
 		break;
 	default:
 		break;
 	}
 #endif
 }
Index: head/sys/kern/sched_ule.c
===================================================================
--- head/sys/kern/sched_ule.c	(revision 258540)
+++ head/sys/kern/sched_ule.c	(revision 258541)
@@ -1,2911 +1,2910 @@
 /*-
  * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * This file implements the ULE scheduler.  ULE supports independent CPU
  * run queues and fine grain locking.  It has superior interactive
  * performance under load even on uni-processor systems.
  *
  * etymology:
  *   ULE is the last three letters in schedule.  It owes its name to a
  * generic user created for a scheduling system by Paul Mikesell at
  * Isilon Systems and a general lack of creativity on the part of the author.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/turnstile.h>
 #include <sys/umtx.h>
 #include <sys/vmmeter.h>
 #include <sys/cpuset.h>
 #include <sys/sbuf.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 int				dtrace_vtime_active;
 dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
 #endif
 
 #include <machine/cpu.h>
 #include <machine/smp.h>
 
 #if defined(__powerpc__) && defined(BOOKE_E500)
 #error "This architecture is not currently compatible with ULE"
 #endif
 
 #define	KTR_ULE	0
 
 /*
  * Buffer sizes for the generated names kept in struct td_sched (ts_name)
  * and struct tdq (tdq_name, tdq_loadname).  Every sizeof() of a string
  * literal already includes that literal's terminating NUL.
  */
 #define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
 #define	TDQ_NAME_LEN	(sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU)))
 #define	TDQ_LOADNAME_LEN	(sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load"))
 
 /*
  * Thread scheduler specific section.  All fields are protected
  * by the thread lock.  ts_name is only present in kernels built with
  * KTR.
  */
 struct td_sched {	
 	struct runq	*ts_runq;	/* Run-queue we're queued on. */
 	short		ts_flags;	/* TSF_* flags. */
 	u_char		ts_cpu;		/* CPU that we have affinity for. */
 	int		ts_rltick;	/* Real last tick, for affinity. */
 	int		ts_slice;	/* Ticks of slice remaining. */
 	u_int		ts_slptime;	/* Number of ticks we vol. slept */
 	u_int		ts_runtime;	/* Number of ticks we were running */
 	int		ts_ltick;	/* Last tick that we were running on */
 	int		ts_ftick;	/* First tick that we were running on */
 	int		ts_ticks;	/* Tick count */
 #ifdef KTR
 	char		ts_name[TS_NAME_LEN];	/* Name buffer, KTR only. */
 #endif
 };
 /* flags kept in ts_flags */
 #define	TSF_BOUND	0x0001		/* Thread can not migrate. */
 #define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
 
 /* NOTE(review): presumably the scheduler data for thread0; its use is outside this chunk. */
 static struct td_sched td_sched0;
 
 /* True while the thread is not pinned (td_pinned == 0). */
 #define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
 /* True if "cpu" is a member of the thread's cpuset mask. */
 #define	THREAD_CAN_SCHED(td, cpu)	\
     CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
 
 /*
  * Priority ranges used for interactive and non-interactive timeshare
  * threads.  The timeshare priorities are split up into four ranges.
  * The first range handles interactive threads.  The last three ranges
  * (NHALF, x, and NHALF) handle non-interactive threads with the outer
  * ranges supporting nice values.
  *
  * PRI_INTERACT_RANGE is half of what is left of the timeshare range
  * after SCHED_PRI_NRESV priorities are set aside; PRI_BATCH_RANGE is
  * everything in the timeshare range that is not interactive.
  */
 #define	PRI_TIMESHARE_RANGE	(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
 #define	PRI_INTERACT_RANGE	((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2)
 #define	PRI_BATCH_RANGE		(PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE)
 
 /* The interactive range comes first, immediately followed by batch. */
 #define	PRI_MIN_INTERACT	PRI_MIN_TIMESHARE
 #define	PRI_MAX_INTERACT	(PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1)
 #define	PRI_MIN_BATCH		(PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE)
 #define	PRI_MAX_BATCH		PRI_MAX_TIMESHARE
 
 /*
  * Cpu percentage computation macros and defines.
  *
  * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
  * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
  * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
  * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
  * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
  * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
  *			Never less than hz, because of the max().
  */
 #define	SCHED_TICK_SECS		10
 #define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
 #define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
 #define	SCHED_TICK_SHIFT	10
 #define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
 #define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
 
 /*
  * These macros determine priorities for non-interactive threads.  They are
  * assigned a priority based on their recent cpu utilization as expressed
  * by the ratio of ticks to the tick total.  NHALF priorities at the start
  * and end of the MIN to MAX timeshare range are only reachable with negative
  * or positive nice respectively.
  *
  * PRI_RANGE:	Priority range for utilization dependent priorities.
  * PRI_NRESV:	Number of nice values.
  * PRI_NHALF:	Half of PRI_NRESV, reserved at each end of the batch range.
  * PRI_MIN/MAX:	The batch range with PRI_NHALF trimmed from each end.
  * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
  * PRI_NICE:	Determines the part of the priority inherited from nice
  *		(currently the nice value itself).
  */
 #define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
 #define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
 #define	SCHED_PRI_MIN		(PRI_MIN_BATCH + SCHED_PRI_NHALF)
 #define	SCHED_PRI_MAX		(PRI_MAX_BATCH - SCHED_PRI_NHALF)
 #define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
 #define	SCHED_PRI_TICKS(ts)						\
     (SCHED_TICK_HZ((ts)) /						\
     (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
 #define	SCHED_PRI_NICE(nice)	(nice)
 
 /*
  * These determine the interactivity of a process.  Interactivity differs from
  * cpu utilization in that it expresses the voluntary time slept vs time ran
  * while cpu utilization includes all time not running.  This more accurately
  * models the intent of the thread.
  *
  * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
  *		before throttling back.
  * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
  * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
  * INTERACT_HALF:	Half of INTERACT_MAX.
  * INTERACT_THRESH:	Threshold for placement on the current runq.
  *
  * Both SLP_RUN values are scaled by SCHED_TICK_SHIFT.
  */
 #define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
 #define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
 #define	SCHED_INTERACT_MAX	(100)
 #define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
 #define	SCHED_INTERACT_THRESH	(30)
 
 /*
  * These parameters determine the slice behavior for batch work.
  * SCHED_SLICE_MIN_DIVISOR is also the load at which tdq_slice() stops
  * dividing the slice and returns sched_slice_min instead.
  */
 #define	SCHED_SLICE_DEFAULT_DIVISOR	10	/* ~94 ms, 12 stathz ticks. */
 #define	SCHED_SLICE_MIN_DIVISOR		6	/* DEFAULT/MIN = ~16 ms. */
 
 /* Flags kept in td_flags. */
 #define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */
 
 /*
  * tickincr:		Converts a stathz tick into a hz domain scaled by
  *			the shift factor.  Without the shift the error rate
  *			due to rounding would be unacceptably high.
  * realstathz:		stathz is sometimes 0 and run off of hz.
  * sched_slice:		Runtime of each thread before rescheduling.
  * sched_slice_min:	Slice returned by tdq_slice() once the load reaches
  *			SCHED_SLICE_MIN_DIVISOR.
  * preempt_thresh:	Priority threshold for preemption and remote IPIs.
  *			A value of 0 makes sched_shouldpreempt() refuse to
  *			preempt anything but idle-priority threads; that is
  *			the default when PREEMPTION is not configured.
  * sched_interact:	Starts out as SCHED_INTERACT_THRESH.
  */
 static int sched_interact = SCHED_INTERACT_THRESH;
 static int tickincr = 8 << SCHED_TICK_SHIFT;
 static int realstathz = 127;	/* reset during boot. */
 static int sched_slice = 10;	/* reset during boot. */
 static int sched_slice_min = 1;	/* reset during boot. */
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
 static int preempt_thresh = PRI_MAX_IDLE;
 #else
 static int preempt_thresh = PRI_MIN_KERN;
 #endif
 #else 
 static int preempt_thresh = 0;
 #endif
 /* NOTE(review): the users of the next three tunables are outside this chunk. */
 static int static_boost = PRI_MIN_BATCH;
 static int sched_idlespins = 10000;
 static int sched_idlespinthresh = -1;
 
 /*
  * tdq - per processor runqs and statistics.  All fields are protected by the
  * tdq_lock.  The load and lowpri may be accessed without the lock to avoid
  * excess locking in sched_pickcpu();
  */
 struct tdq {
 	/* 
 	 * Ordered to improve efficiency of cpu_search() and switch().
 	 * tdq_lock is padded to avoid false sharing with tdq_load and
 	 * tdq_cpu_idle.
 	 */
 	struct mtx_padalign tdq_lock;		/* run queue lock. */
 	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
 	volatile int	tdq_load;		/* Aggregate load. */
 	volatile int	tdq_cpu_idle;		/* cpu_idle() is active. */
 	int		tdq_sysload;		/* For loadavg, !ITHD load. */
 	int		tdq_transferable;	/* Transferable thread count. */
 	short		tdq_switchcnt;		/* Switches this tick. */
 	short		tdq_oldswitchcnt;	/* Switches last tick. */
 	u_char		tdq_lowpri;		/* Lowest priority thread. */
 	u_char		tdq_ipipending;		/* IPI pending. */
 	u_char		tdq_idx;		/* Current insert index. */
 	u_char		tdq_ridx;		/* Current removal index. */
 	struct runq	tdq_realtime;		/* real-time run queue. */
 	struct runq	tdq_timeshare;		/* timeshare run queue. */
 	struct runq	tdq_idle;		/* Queue of IDLE threads. */
 	char		tdq_name[TDQ_NAME_LEN];	/* Printed as the lock name. */
 #ifdef KTR
 	char		tdq_loadname[TDQ_LOADNAME_LEN];	/* KTR load counter name. */
 #endif
 } __aligned(64);
 
 /* Idle thread states and config. */
 #define	TDQ_RUNNING	1
 #define	TDQ_IDLE	2
 
 #ifdef SMP
 struct cpu_group *cpu_top;		/* CPU topology */
 
 #define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
 /* True while ts_rltick is less than (t * affinity) ticks in the past. */
 #define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))
 
 /*
  * Run-time tunables.
  */
 static int rebalance = 1;
 static int balance_interval = 128;	/* Default set in sched_initticks(). */
 static int affinity;
 static int steal_idle = 1;
 static int steal_thresh = 2;
 
 /*
  * One thread queue per processor.
  */
 static struct tdq	tdq_cpu[MAXCPU];
 static struct tdq	*balance_tdq;
 static int balance_ticks;
 static DPCPU_DEFINE(uint32_t, randomval);
 
 /* Map between a CPU id and its tdq; TDQ_ID is the array index of a tdq. */
 #define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
 #define	TDQ_CPU(x)	(&tdq_cpu[(x)])
 #define	TDQ_ID(x)	((int)((x) - tdq_cpu))
 #else	/* !SMP */
 /* A single queue; the macros below ignore their argument. */
 static struct tdq	tdq_cpu;
 
 #define	TDQ_ID(x)	(0)
 #define	TDQ_SELF()	(&tdq_cpu)
 #define	TDQ_CPU(x)	(&tdq_cpu)
 #endif
 
 /*
  * Wrappers around the spin mutex embedded in each tdq.  TDQ_LOCKPTR casts
  * the padded tdq_lock to a plain struct mtx pointer for the mtx_* calls.
  */
 #define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
 #define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
 #define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
 #define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
 #define	TDQ_LOCKPTR(t)		((struct mtx *)(&(t)->tdq_lock))
 
 /* Forward declarations. */
 static void sched_priority(struct thread *);
 static void sched_thread_priority(struct thread *, u_char);
 static int sched_interact_score(struct thread *);
 static void sched_interact_update(struct thread *);
 static void sched_interact_fork(struct thread *);
 static void sched_pctcpu_update(struct td_sched *, int);
 
 /* Operations on per processor queues */
 static struct thread *tdq_choose(struct tdq *);
 static void tdq_setup(struct tdq *);
 static void tdq_load_add(struct tdq *, struct thread *);
 static void tdq_load_rem(struct tdq *, struct thread *);
 static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
 static __inline void tdq_runq_rem(struct tdq *, struct thread *);
 static inline int sched_shouldpreempt(int, int, int);
 void tdq_print(int cpu);	/* Not static: has external linkage. */
 static void runq_print(struct runq *rq);
 static void tdq_add(struct tdq *, struct thread *, int);
 #ifdef SMP
 /* Helpers that only exist in SMP kernels. */
 static int tdq_move(struct tdq *, struct tdq *);
 static int tdq_idled(struct tdq *);
 static void tdq_notify(struct tdq *, struct thread *);
 static struct thread *tdq_steal(struct tdq *, int);
 static struct thread *runq_steal(struct runq *, int);
 static int sched_pickcpu(struct thread *, int);
 static void sched_balance(void);
 static int sched_balance_pair(struct tdq *, struct tdq *);
 static inline struct tdq *sched_setcpu(struct thread *, int, int);
 static inline void thread_unblock_switch(struct thread *, struct mtx *);
 static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
 static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, 
     struct cpu_group *cg, int indent);
 #endif
 
 /* Run sched_setup() first in the SI_SUB_RUN_QUEUE boot stage. */
 static void sched_setup(void *dummy);
 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
 
 /* Run sched_initticks() in the SI_SUB_CLOCKS boot stage (SI_ORDER_THIRD). */
 static void sched_initticks(void *dummy);
 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
     NULL);
 
 /*
  * Statically defined tracing probes for the "sched" provider.  The
  * trailing string arguments presumably name the C types of each probe's
  * arguments -- confirm against <sys/sdt.h>.
  */
 SDT_PROVIDER_DEFINE(sched);
 
 SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *", 
     "struct proc *", "uint8_t");
 SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *", 
     "struct proc *", "void *");
 SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *", 
     "struct proc *", "void *", "int");
 SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *", 
     "struct proc *", "uint8_t", "struct thread *");
 /* Fired by tdq_load_add()/tdq_load_rem() with the tdq id and new load. */
 SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
 SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *", 
     "struct proc *");
 SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
 SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
 SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *", 
     "struct proc *");
 
 /*
  * Dump a run-queue: each status word, followed by every thread queued
  * at a priority whose status bit is set.
  */
 static void
 runq_print(struct runq *rq)
 {
 	struct thread *td;
 	int word, bit, qidx;
 
 	for (word = 0; word < RQB_LEN; word++) {
 		printf("\t\trunq bits %d 0x%zx\n",
 		    word, rq->rq_status.rqb_bits[word]);
 		for (bit = 0; bit < RQB_BPW; bit++) {
 			/* Nothing is queued where the status bit is clear. */
 			if ((rq->rq_status.rqb_bits[word] & (1ul << bit)) == 0)
 				continue;
 			qidx = bit + (word << RQB_L2BPW);
 			TAILQ_FOREACH(td, &rq->rq_queues[qidx], td_runq) {
 				printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
 				    td, td->td_name, td->td_priority,
 				    td->td_rqindex, qidx);
 			}
 		}
 	}
 }
 
 /*
  * Dump the state of one CPU's thread queue: its counters first, then
  * the contents of its three run-queues.  Should be a ddb show cmd.
  */
 void
 tdq_print(int cpu)
 {
 	struct tdq *q = TDQ_CPU(cpu);
 
 	/* Scalar state. */
 	printf("tdq %d:\n", TDQ_ID(q));
 	printf("\tlock            %p\n", TDQ_LOCKPTR(q));
 	printf("\tLock name:      %s\n", q->tdq_name);
 	printf("\tload:           %d\n", q->tdq_load);
 	printf("\tswitch cnt:     %d\n", q->tdq_switchcnt);
 	printf("\told switch cnt: %d\n", q->tdq_oldswitchcnt);
 	printf("\ttimeshare idx:  %d\n", q->tdq_idx);
 	printf("\ttimeshare ridx: %d\n", q->tdq_ridx);
 	printf("\tload transferable: %d\n", q->tdq_transferable);
 	printf("\tlowest priority:   %d\n", q->tdq_lowpri);
 
 	/* Queue contents. */
 	printf("\trealtime runq:\n");
 	runq_print(&q->tdq_realtime);
 	printf("\ttimeshare runq:\n");
 	runq_print(&q->tdq_timeshare);
 	printf("\tidle runq:\n");
 	runq_print(&q->tdq_idle);
 }
 
 /*
  * Decide whether a thread of priority "pri" should preempt one running
  * at priority "cpri".  "remote" is non-zero when the running thread is
  * on another CPU.  Returns 1 to preempt and 0 otherwise.
  */
 static inline int
 sched_shouldpreempt(int pri, int cpri, int remote)
 {
 	/* An equal or worse priority never preempts. */
 	if (cpri <= pri)
 		return (0);
 	/* Idle-priority threads are always preempted. */
 	if (cpri >= PRI_MIN_IDLE)
 		return (1);
 	/* A zero threshold means preemption of others is disabled. */
 	if (preempt_thresh == 0)
 		return (0);
 	/*
 	 * Preempt when the new priority is within the threshold, or when
 	 * a remote processor is running non-interactive or worse work and
 	 * the new thread is interactive or better.
 	 */
 	return (pri <= preempt_thresh ||
 	    (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT));
 }
 
 /*
  * Add a thread to the actual run-queue.  Keeps transferable counts up to
  * date with what is actually on the run-queue.  Selects the correct
  * queue position for timeshare threads.
  *
  * Priorities below PRI_MIN_BATCH go on tdq_realtime, priorities up to
  * PRI_MAX_BATCH on tdq_timeshare, and everything else on tdq_idle.  Both
  * the tdq lock and the thread lock must be held.
  */
 static __inline void
 tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
 {
 	struct td_sched *ts;
 	u_char pri;
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	pri = td->td_priority;
 	ts = td->td_sched;
 	TD_SET_RUNQ(td);
 	/* Only unpinned threads are counted as transferable. */
 	if (THREAD_CAN_MIGRATE(td)) {
 		tdq->tdq_transferable++;
 		ts->ts_flags |= TSF_XFERABLE;
 	}
 	if (pri < PRI_MIN_BATCH) {
 		ts->ts_runq = &tdq->tdq_realtime;
 	} else if (pri <= PRI_MAX_BATCH) {
 		ts->ts_runq = &tdq->tdq_timeshare;
 		KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH,
 			("Invalid priority %d on timeshare runq", pri));
 		/*
 		 * This queue contains only priorities between MIN and MAX
 		 * batch.  Use the whole queue to represent these values.
 		 */
 		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
 			/* Scale into the queue, then rotate by the insert index. */
 			pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE;
 			pri = (pri + tdq->tdq_idx) % RQ_NQS;
 			/*
 			 * This effectively shortens the queue by one so we
 			 * can have a one slot difference between idx and
 			 * ridx while we wait for threads to drain.
 			 */
 			if (tdq->tdq_ridx != tdq->tdq_idx &&
 			    pri == tdq->tdq_ridx)
 				pri = (unsigned char)(pri - 1) % RQ_NQS;
 		} else
 			/* Borrowing or preempted threads go at the removal index. */
 			pri = tdq->tdq_ridx;
 		runq_add_pri(ts->ts_runq, td, pri, flags);
 		return;
 	} else
 		ts->ts_runq = &tdq->tdq_idle;
 	runq_add(ts->ts_runq, td, flags);
 }
 
 /* 
  * Remove a thread from a run-queue.  This typically happens when a thread
  * is selected to run.  Running threads are not on the queue and the
  * transferable count does not reflect them.
  */
 static __inline void
 tdq_runq_rem(struct tdq *tdq, struct thread *td)
 {
 	struct td_sched *ts;
 
 	ts = td->td_sched;
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT(ts->ts_runq != NULL,
 	    ("tdq_runq_remove: thread %p null ts_runq", td));
 	if (ts->ts_flags & TSF_XFERABLE) {
 		tdq->tdq_transferable--;
 		ts->ts_flags &= ~TSF_XFERABLE;
 	}
 	if (ts->ts_runq == &tdq->tdq_timeshare) {
 		if (tdq->tdq_idx != tdq->tdq_ridx)
 			runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
 		else
 			runq_remove_idx(ts->ts_runq, td, NULL);
 	} else
 		runq_remove(ts->ts_runq, td);
 }
 
 /*
  * Account a thread's load to the given thread queue.  Load covers all
  * threads that are RUNNING or ON_RUNQ; threads flagged TDF_NOLOAD are
  * left out of the system load.  Both locks must be held.
  */
 static void
 tdq_load_add(struct tdq *tdq, struct thread *td)
 {
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	tdq->tdq_load++;
 	if (!(td->td_flags & TDF_NOLOAD))
 		tdq->tdq_sysload++;
 
 	/* Publish the new load to KTR and DTrace. */
 	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
 	SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
 }
 
 /*
  * Drop a thread's load from the given thread queue, for a thread that
  * is going to sleep or exiting.  Both locks must be held and the queue
  * must currently have load.
  */
 static void
 tdq_load_rem(struct tdq *tdq, struct thread *td)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT(tdq->tdq_load != 0,
 	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
 
 	tdq->tdq_load--;
 	if (!(td->td_flags & TDF_NOLOAD))
 		tdq->tdq_sysload--;
 
 	/* Publish the new load to KTR and DTrace. */
 	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
 	SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
 }
 
 /*
  * Compute the time slice for this queue.  Timeshare latency is bounded
  * by shrinking the slice as load grows: the latency is taken to be the
  * sum of the threads waiting to run aside from curthread, targeting no
  * more than sched_slice of latency but never less than sched_slice_min
  * of runtime.
  */
 static inline int
 tdq_slice(struct tdq *tdq)
 {
 	int others;
 
 	/*
 	 * Using the system load is safe because this is called from
 	 * contexts where timeshare threads are running, so there cannot
 	 * be higher priority load in the system.
 	 */
 	others = tdq->tdq_sysload - 1;
 	if (others >= SCHED_SLICE_MIN_DIVISOR)
 		return (sched_slice_min);
 	return (others <= 1 ? sched_slice : sched_slice / others);
 }
 
 /*
  * Recompute tdq_lowpri exactly from the best queued thread and the
  * thread currently running on this queue's CPU.  The caller may pass
  * that running thread as ctd to save the lookup; NULL means look it up.
  */
 static void
 tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
 {
 	struct thread *best;
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	if (ctd == NULL)
 		ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
 	best = tdq_choose(tdq);
 	/* Take the numerically smaller priority of the two. */
 	if (best != NULL && best->td_priority <= ctd->td_priority)
 		tdq->tdq_lowpri = best->td_priority;
 	else
 		tdq->tdq_lowpri = ctd->td_priority;
 }
 
 #ifdef SMP
 /*
  * Parameters and result of one cpu_search() pass.  The caller fills in
  * the constraints; cpu_search() writes cs_cpu and cs_load.
  */
 struct cpu_search {
 	cpuset_t cs_mask;	/* CPUs that may be selected. */
 	u_int	cs_prefer;	/* CPU whose load is discounted (lowest search). */
 	int	cs_pri;		/* Min priority for low. */
 	int	cs_limit;	/* Max load for low, min load for high. */
 	int	cs_cpu;		/* Selected CPU, or -1 if none qualified. */
 	int	cs_load;	/* Randomized load of the selected CPU. */
 };
 
 #define	CPU_SEARCH_LOWEST	0x1
 #define	CPU_SEARCH_HIGHEST	0x2
 #define	CPU_SEARCH_BOTH		(CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
 
 #define	CPUSET_FOREACH(cpu, mask)				\
 	for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++)		\
 		if (CPU_ISSET(cpu, &mask))
 
 static __inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high, const int match);
 int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low);
 int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high);
 int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high);
 
 /*
  * Search the tree of cpu_groups for the lowest or highest loaded cpu
  * according to the match argument.  This routine actually compares the
  * load on all paths through the tree and finds the least loaded cpu on
  * the least loaded path, which may differ from the least loaded cpu in
  * the system.  This balances work among caches and busses.
  *
  * This inline is instantiated in three forms below using constants for the
  * match argument.  It is reduced to the minimum set for each case.  It is
  * also recursive to the depth of the tree.
  *
  * "low" and/or "high" carry the search constraints in and the chosen
  * cpu and its load out; only the one(s) selected by "match" are touched.
  * The return value is the summed load of everything visited under cg.
  */
 static __inline int
 cpu_search(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high, const int match)
 {
 	struct cpu_search lgroup;
 	struct cpu_search hgroup;
 	cpuset_t cpumask;
 	struct cpu_group *child;
 	struct tdq *tdq;
 	int cpu, i, hload, lload, load, total, rnd, *rndptr;
 
 	total = 0;
 	cpumask = cg->cg_mask;
 	/* Start each per-child scratch copy from the caller's constraints. */
 	if (match & CPU_SEARCH_LOWEST) {
 		lload = INT_MAX;
 		lgroup = *low;
 	}
 	if (match & CPU_SEARCH_HIGHEST) {
 		hload = INT_MIN;
 		hgroup = *high;
 	}
 
 	/* Iterate through the child CPU groups and then remaining CPUs. */
 	for (i = cg->cg_children, cpu = mp_maxid; ; ) {
 		if (i == 0) {
 			/* No child groups left: pick the next CPU still in cpumask. */
 #ifdef HAVE_INLINE_FFSL
 			cpu = CPU_FFS(&cpumask) - 1;
 #else
 			while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask))
 				cpu--;
 #endif
 			if (cpu < 0)
 				break;
 			child = NULL;
 		} else
 			child = &cg->cg_child[i - 1];
 
 		/* -1 means "nothing suitable found in this child". */
 		if (match & CPU_SEARCH_LOWEST)
 			lgroup.cs_cpu = -1;
 		if (match & CPU_SEARCH_HIGHEST)
 			hgroup.cs_cpu = -1;
 		if (child) {			/* Handle child CPU group. */
 			/* CPUs covered by the child are not visited individually. */
 			CPU_NAND(&cpumask, &child->cg_mask);
 			switch (match) {
 			case CPU_SEARCH_LOWEST:
 				load = cpu_search_lowest(child, &lgroup);
 				break;
 			case CPU_SEARCH_HIGHEST:
 				load = cpu_search_highest(child, &hgroup);
 				break;
 			case CPU_SEARCH_BOTH:
 				load = cpu_search_both(child, &lgroup, &hgroup);
 				break;
 			}
 		} else {			/* Handle child CPU. */
 			CPU_CLR(cpu, &cpumask);
 			tdq = TDQ_CPU(cpu);
 			load = tdq->tdq_load * 256;
 			/*
 			 * Small pseudo-random perturbation of the recorded
 			 * load, advanced on every visit.
 			 * NOTE(review): randomval is declared uint32_t but
 			 * is updated through an int pointer here, so the
 			 * multiply is signed and rnd may be negative.
 			 */
 			rndptr = DPCPU_PTR(randomval);
 			rnd = (*rndptr = *rndptr * 69069 + 5) >> 26;
 			if (match & CPU_SEARCH_LOWEST) {
 				/* The preferred CPU gets a small load discount. */
 				if (cpu == low->cs_prefer)
 					load -= 64;
 				/*
 				 * Record this CPU if it is in the mask and
 				 * meets the priority and load limits.
 				 */
 				if (tdq->tdq_lowpri > lgroup.cs_pri &&
 				    tdq->tdq_load <= lgroup.cs_limit &&
 				    CPU_ISSET(cpu, &lgroup.cs_mask)) {
 					lgroup.cs_cpu = cpu;
 					lgroup.cs_load = load - rnd;
 				}
 			}
 			/* For highest, the CPU must also have something to give. */
 			if (match & CPU_SEARCH_HIGHEST)
 				if (tdq->tdq_load >= hgroup.cs_limit &&
 				    tdq->tdq_transferable &&
 				    CPU_ISSET(cpu, &hgroup.cs_mask)) {
 					hgroup.cs_cpu = cpu;
 					hgroup.cs_load = load - rnd;
 				}
 		}
 		total += load;
 
 		/* We have info about child item. Compare it. */
 		if (match & CPU_SEARCH_LOWEST) {
 			/* Ties on child load are broken by the recorded cs_load. */
 			if (lgroup.cs_cpu >= 0 &&
 			    (load < lload ||
 			     (load == lload && lgroup.cs_load < low->cs_load))) {
 				lload = load;
 				low->cs_cpu = lgroup.cs_cpu;
 				low->cs_load = lgroup.cs_load;
 			}
 		}
 		if (match & CPU_SEARCH_HIGHEST)
 			if (hgroup.cs_cpu >= 0 &&
 			    (load > hload ||
 			     (load == hload && hgroup.cs_load > high->cs_load))) {
 				hload = load;
 				high->cs_cpu = hgroup.cs_cpu;
 				high->cs_load = hgroup.cs_load;
 			}
 		if (child) {
 			i--;
 			/* Done if the child groups covered every CPU. */
 			if (i == 0 && CPU_EMPTY(&cpumask))
 				break;
 		}
 #ifndef HAVE_INLINE_FFSL
 		else
 			cpu--;
 #endif
 	}
 	return (total);
 }
 
 /*
  * The cpu_search() instantiations below pass a constant match value so
  * that the inline optimization is maintained.
  *
  * Lowest-load search only.
  */
 int
 cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
 {
 
 	return (cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST));
 }
 
 /* Highest-load search only; see cpu_search(). */
 int
 cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
 {
 
 	return (cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST));
 }
 
 /* Lowest- and highest-load search in a single pass; see cpu_search(). */
 int
 cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high)
 {
 
 	return (cpu_search(cg, low, high, CPU_SEARCH_BOTH));
 }
 
 /*
  * Find the cpu with the least load via the least loaded path that has a
  * lowpri greater than pri.  A pri of -1 indicates any priority is
  * acceptable.  Only cpus in mask with a load of at most maxload are
  * considered, and "prefer" gets a small bias.  Returns the cpu id, or -1
  * if none qualifies.
  */
 static inline int
 sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
     int prefer)
 {
 	struct cpu_search search;
 
 	/* Constraints. */
 	search.cs_mask = mask;
 	search.cs_prefer = prefer;
 	search.cs_pri = pri;
 	search.cs_limit = maxload;
 	/* Result stays -1 unless the search finds a cpu. */
 	search.cs_cpu = -1;
 	cpu_search_lowest(cg, &search);
 	return (search.cs_cpu);
 }
 
 /*
  * Find the cpu with the highest load via the highest loaded path.  Only
  * cpus in mask with a load of at least minload are considered.  Returns
  * the cpu id, or -1 if none qualifies.
  */
 static inline int
 sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
 {
 	struct cpu_search search;
 
 	search.cs_mask = mask;
 	search.cs_limit = minload;
 	/* Result stays -1 unless the search finds a cpu. */
 	search.cs_cpu = -1;
 	cpu_search_highest(cg, &search);
 	return (search.cs_cpu);
 }
 
 /*
  * Find the highest and the lowest loaded cpu reachable via cg in one
  * pass.  Each result is stored through its pointer and is -1 when no
  * cpu in mask qualified.
  */
 static inline void
 sched_both(const struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
 {
 	struct cpu_search hi;
 	struct cpu_search lo;
 
 	/* Lowest: no preferred cpu, any priority, any load. */
 	lo.cs_cpu = -1;
 	lo.cs_prefer = -1;
 	lo.cs_pri = -1;
 	lo.cs_limit = INT_MAX;
 	lo.cs_mask = mask;
 
 	/* Highest: any load. */
 	hi.cs_cpu = -1;
 	hi.cs_limit = -1;
 	hi.cs_mask = mask;
 
 	cpu_search_both(cg, &lo, &hi);
 	*lowcpu = lo.cs_cpu;
 	*highcpu = hi.cs_cpu;
 }
 
 /*
  * Balance load within a cpu group.  Repeatedly picks the most loaded cpu
  * that still has transferable threads and tries to move one thread from
  * it to a less loaded cpu.  hmask holds the cpus still eligible as
  * donors; lmask holds the candidate receivers for the current donor.
  */
 static void
 sched_balance_group(struct cpu_group *cg)
 {
 	cpuset_t hmask, lmask;
 	int high, low, anylow;
 
 	CPU_FILL(&hmask);
 	for (;;) {
 		high = sched_highest(cg, hmask, 1);
 		/* Stop if there are no more CPUs with transferable threads. */
 		if (high == -1)
 			break;
 		CPU_CLR(high, &hmask);
 		CPU_COPY(&hmask, &lmask);
 		/* Stop if there is no more CPU left for low. */
 		if (CPU_EMPTY(&lmask))
 			break;
 		/* anylow stays 1 until a receiver has been tried and dropped. */
 		anylow = 1;
 nextlow:
 		low = sched_lowest(cg, lmask, -1,
 		    TDQ_CPU(high)->tdq_load - 1, high);
 		/* Stop if we looked well and found no less loaded CPU. */
 		if (anylow && low == -1)
 			break;
 		/* Go to next high if we found no less loaded CPU. */
 		if (low == -1)
 			continue;
 		/* Transfer thread from high to low. */
 		if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) {
 			/* CPU that got thread can no longer be a donor. */
 			CPU_CLR(low, &hmask);
 		} else {
 			/*
 			 * If that failed, then there are no threads on
 			 * high that can run on this low.  Drop low from
 			 * the low mask and look for a different one.
 			 */
 			CPU_CLR(low, &lmask);
 			anylow = 0;
 			goto nextlow;
 		}
 	}
 }
 
 /*
  * Schedule the next balancing pass and, if balancing is enabled and SMP
  * has started, run one over the whole cpu topology.  The current cpu's
  * tdq lock is dropped around the balancing pass and re-taken before
  * returning.
  */
 static void
 sched_balance(void)
 {
 	struct tdq *tdq;
 
 	/*
 	 * Select a random time between .5 * balance_interval and
 	 * 1.5 * balance_interval.  balance_interval is a run-time tunable,
 	 * so guard the modulo against a value of zero, which would
 	 * otherwise divide by zero.
 	 */
 	balance_ticks = max(balance_interval / 2, 1);
 	if (balance_interval != 0)
 		balance_ticks += random() % balance_interval;
 	if (smp_started == 0 || rebalance == 0)
 		return;
 	tdq = TDQ_SELF();
 	TDQ_UNLOCK(tdq);
 	sched_balance_group(cpu_top);
 	TDQ_LOCK(tdq);
 }
 
 /*
  * Lock two thread queues.  The one at the lower address is always taken
  * first so that the lock order is consistent.
  */
 static void
 tdq_lock_pair(struct tdq *one, struct tdq *two)
 {
 	struct tdq *first, *second;
 
 	if (one < two) {
 		first = one;
 		second = two;
 	} else {
 		first = two;
 		second = one;
 	}
 	TDQ_LOCK(first);
 	TDQ_LOCK_FLAGS(second, MTX_DUPOK);
 }
 
 /*
  * Release the locks of two thread queues.  Unlike locking, the order
  * does not matter here.
  */
 static void
 tdq_unlock_pair(struct tdq *one, struct tdq *two)
 {
 
 	TDQ_UNLOCK(one);
 	TDQ_UNLOCK(two);
 }
 
 /*
  * Transfer load between two imbalanced thread queues.
  */
 static int
 sched_balance_pair(struct tdq *high, struct tdq *low)
 {
 	int moved;
 	int cpu;
 
 	tdq_lock_pair(high, low);
 	moved = 0;
 	/*
 	 * Determine what the imbalance is and then adjust that to how many
 	 * threads we actually have to give up (transferable).
 	 */
 	if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
 	    (moved = tdq_move(high, low)) > 0) {
 		/*
 		 * In case the target isn't the current cpu IPI it to force a
 		 * reschedule with the new workload.
 		 */
 		cpu = TDQ_ID(low);
 		if (cpu != PCPU_GET(cpuid))
 			ipi_cpu(cpu, IPI_PREEMPT);
 	}
 	tdq_unlock_pair(high, low);
 	return (moved);
 }
 
 /*
  * Move a thread from one thread queue to another.  Both queue locks
  * must be held.  Returns 1 if a thread was moved and 0 if tdq_steal()
  * found nothing that may run on the destination cpu.
  */
 static int
 tdq_move(struct tdq *from, struct tdq *to)
 {
 	struct td_sched *ts;
 	struct thread *td;
 	struct tdq *tdq;
 	int cpu;
 
 	TDQ_LOCK_ASSERT(from, MA_OWNED);
 	TDQ_LOCK_ASSERT(to, MA_OWNED);
 
 	tdq = from;
 	cpu = TDQ_ID(to);
 	td = tdq_steal(tdq, cpu);
 	if (td == NULL)
 		return (0);
 	ts = td->td_sched;
 	/*
 	 * Although the run queue is locked the thread may be blocked.  Lock
 	 * it to clear this and acquire the run-queue lock.
 	 */
 	thread_lock(td);
 	/* Drop recursive lock on from acquired via thread_lock(). */
 	TDQ_UNLOCK(from);
 	sched_rem(td);
 	/* Re-home the thread: new cpu, new lock, then queue it on "to". */
 	ts->ts_cpu = cpu;
 	td->td_lock = TDQ_LOCKPTR(to);
 	tdq_add(to, td, SRQ_YIELDING);
 	return (1);
 }
 
 /*
  * This tdq has idled.  Try to steal a thread from another cpu and switch
  * to it.
  *
  * Returns 0 after switching (a thread was stolen, or one was added to
  * this queue in the meantime) and 1 if nothing could be stolen or
  * stealing is disabled.
  */
 static int
 tdq_idled(struct tdq *tdq)
 {
 	struct cpu_group *cg;
 	struct tdq *steal;
 	cpuset_t mask;
 	int thresh;
 	int cpu;
 
 	if (smp_started == 0 || steal_idle == 0)
 		return (1);
 	/* Every cpu but the current one is a candidate victim. */
 	CPU_FILL(&mask);
 	CPU_CLR(PCPU_GET(cpuid), &mask);
 	/* We don't want to be preempted while we're iterating. */
 	spinlock_enter();
 	/* Start at this cpu's own group and widen to the parent on failure. */
 	for (cg = tdq->tdq_cg; cg != NULL; ) {
 		/* Groups flagged CG_FLAG_THREAD use a threshold of 1. */
 		if ((cg->cg_flags & CG_FLAG_THREAD) == 0)
 			thresh = steal_thresh;
 		else
 			thresh = 1;
 		cpu = sched_highest(cg, mask, thresh);
 		if (cpu == -1) {
 			cg = cg->cg_parent;
 			continue;
 		}
 		steal = TDQ_CPU(cpu);
 		CPU_CLR(cpu, &mask);
 		tdq_lock_pair(tdq, steal);
 		/* Re-check under the lock; the unlocked search may be stale. */
 		if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
 			tdq_unlock_pair(tdq, steal);
 			continue;
 		}
 		/*
 		 * If a thread was added while interrupts were disabled don't
 		 * steal one here.  If we fail to acquire one due to affinity
 		 * restrictions loop again with this cpu removed from the
 		 * set.
 		 */
 		if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
 			tdq_unlock_pair(tdq, steal);
 			continue;
 		}
 		spinlock_exit();
 		TDQ_UNLOCK(steal);
 		mi_switch(SW_VOL | SWT_IDLE, NULL);
 		thread_unlock(curthread);
 
 		return (0);
 	}
 	spinlock_exit();
 	return (1);
 }
 
 /*
  * Tell a remote cpu that a thread has been queued for it.  An IPI_PREEMPT
  * is sent only when no IPI is already pending for the queue, the new
  * thread should preempt what the target cpu is currently running, and an
  * idle target could not be handled without an IPI.
  */
 static void
 tdq_notify(struct tdq *tdq, struct thread *td)
 {
 	struct thread *running;
 	int target;
 
 	/* A previous notification has not been consumed yet. */
 	if (tdq->tdq_ipipending)
 		return;
 	target = td->td_sched->ts_cpu;
 	running = pcpu_find(target)->pc_curthread;
 	if (!sched_shouldpreempt(td->td_priority, running->td_priority, 1))
 		return;
 	/*
 	 * When the target runs its idle thread no IPI is sent if the queue
 	 * is not marked tdq_cpu_idle, or if the MD idle wakeup routine
 	 * reports success.
 	 */
 	if (TD_IS_IDLETHREAD(running) &&
 	    (!tdq->tdq_cpu_idle || cpu_idle_wakeup(target)))
 		return;
 	tdq->tdq_ipipending = 1;
 	ipi_cpu(target, IPI_PREEMPT);
 }
 
 /*
  * Steals load from a timeshare queue.  Honors the rotating queue head
  * index.
  *
  * The status bitmap is scanned from priority index 'start' to the end
  * and, when 'start' is non-zero, once more from index 0.  The very first
  * thread encountered is set aside; any later thread that may migrate and
  * may be scheduled on 'cpu' is returned in preference to it.  The
  * set-aside thread is returned only when nothing else qualifies and it is
  * itself eligible.  Returns NULL when no thread can be moved to 'cpu'.
  */
 static struct thread *
 runq_steal_from(struct runq *rq, int cpu, u_char start)
 {
 	struct rqbits *rqb;
 	struct rqhead *rqh;
 	struct thread *td, *first;
 	int bit;
 	int i;
 
 	rqb = &rq->rq_status;
 	bit = start & (RQB_BPW -1);
 	first = NULL;
 again:
 	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
 		if (rqb->rqb_bits[i] == 0)
 			continue;
 		/*
 		 * Visit every non-empty queue in this word, not just the
 		 * first one at or after 'bit', so that migratable threads
 		 * in the remaining queues are not overlooked.
 		 */
 		for (; bit < RQB_BPW; bit++) {
 			if ((rqb->rqb_bits[i] & (1ul << bit)) == 0)
 				continue;
 			rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)];
 			TAILQ_FOREACH(td, rqh, td_runq) {
 				/*
 				 * Remember only the first thread seen.
 				 * Overwriting it on every iteration would
 				 * leave the fallback below testing a
 				 * thread that was already rejected.
 				 */
 				if (first == NULL) {
 					first = td;
 					continue;
 				}
 				if (THREAD_CAN_MIGRATE(td) &&
 				    THREAD_CAN_SCHED(td, cpu))
 					return (td);
 			}
 		}
 	}
 	/* Wrap around once to cover the indices below 'start'. */
 	if (start != 0) {
 		start = 0;
 		goto again;
 	}
 
 	/* Nothing better was found; fall back to the first thread seen. */
 	if (first && THREAD_CAN_MIGRATE(first) &&
 	    THREAD_CAN_SCHED(first, cpu))
 		return (first);
 	return (NULL);
 }
 
 /*
  * Steals load from a standard linear queue.
  */
 static struct thread *
 runq_steal(struct runq *rq, int cpu)
 {
 	struct rqhead *rqh;
 	struct rqbits *rqb;
 	struct thread *td;
 	int word;
 	int bit;
 
 	rqb = &rq->rq_status;
 	for (word = 0; word < RQB_LEN; word++) {
 		if (rqb->rqb_bits[word] == 0)
 			continue;
 		for (bit = 0; bit < RQB_BPW; bit++) {
 			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
 				continue;
 			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
 			TAILQ_FOREACH(td, rqh, td_runq)
 				if (THREAD_CAN_MIGRATE(td) &&
 				    THREAD_CAN_SCHED(td, cpu))
 					return (td);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Attempt to steal a thread in priority order from a thread queue: the
  * realtime queue is tried first, then the timeshare queue starting at
  * tdq_ridx, and finally the idle queue.  The queue lock must be held.
  */
 static struct thread *
 tdq_steal(struct tdq *tdq, int cpu)
 {
 	struct thread *victim;
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	victim = runq_steal(&tdq->tdq_realtime, cpu);
 	if (victim == NULL)
 		victim = runq_steal_from(&tdq->tdq_timeshare, cpu,
 		    tdq->tdq_ridx);
 	if (victim == NULL)
 		victim = runq_steal(&tdq->tdq_idle, cpu);
 	return (victim);
 }
 
 /*
  * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
  * current lock and returns with the assigned queue locked.
  *
  * The thread lock must be held on entry.  The flags argument is not used
  * by the code below.
  */
 static inline struct tdq *
 sched_setcpu(struct thread *td, int cpu, int flags)
 {
 
 	struct tdq *tdq;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	tdq = TDQ_CPU(cpu);
 	td->td_sched->ts_cpu = cpu;
 	/*
 	 * If the lock matches just return the queue.
 	 */
 	if (td->td_lock == TDQ_LOCKPTR(tdq))
 		return (tdq);
 #ifdef notyet
 	/*
 	 * If the thread isn't running its lockptr is a
 	 * turnstile or a sleepqueue.  We can just lock_set without
 	 * blocking.
 	 */
 	if (TD_CAN_RUN(td)) {
 		TDQ_LOCK(tdq);
 		thread_lock_set(td, TDQ_LOCKPTR(tdq));
 		return (tdq);
 	}
 #endif
 	/*
 	 * The hard case, migration, we need to block the thread first to
 	 * prevent order reversals with other cpus locks.
 	 */
 	spinlock_enter();
 	/* Block the thread lock before taking the destination queue lock. */
 	thread_lock_block(td);
 	TDQ_LOCK(tdq);
 	/* Unblock with the destination queue's lock as the thread lock. */
 	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
 	spinlock_exit();
 	return (tdq);
 }
 
 /*
  * Statistics describing which path sched_pickcpu() took; each one is
  * bumped with SCHED_STAT_INC() in the function below.
  */
 SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
 SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
 SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
 SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
 SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
 SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
 
 /*
  * Choose the cpu on which 'td' should be queued and return its id.
  *
  * In order, the function considers: the current cpu when SMP has not
  * started; the thread's existing ts_cpu when SRQ_OURSELF is set or the
  * thread cannot migrate; the interrupting cpu for interrupt-priority
  * threads; the last cpu while affinity holds and it is idle; and finally
  * a sched_lowest() search over progressively wider cpu groups.
  */
 static int
 sched_pickcpu(struct thread *td, int flags)
 {
 	struct cpu_group *cg, *ccg;
 	struct td_sched *ts;
 	struct tdq *tdq;
 	cpuset_t mask;
 	int cpu, pri, self;
 
 	self = PCPU_GET(cpuid);
 	ts = td->td_sched;
 	if (smp_started == 0)
 		return (self);
 	/*
 	 * Don't migrate a running thread from sched_switch().
 	 */
 	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
 		return (ts->ts_cpu);
 	/*
 	 * Prefer to run interrupt threads on the processors that generate
 	 * the interrupt.
 	 */
 	pri = td->td_priority;
 	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
 	    curthread->td_intr_nesting_level && ts->ts_cpu != self) {
 		SCHED_STAT_INC(pickcpu_intrbind);
 		ts->ts_cpu = self;
 		/* Stay here if nothing of equal or better priority runs. */
 		if (TDQ_CPU(self)->tdq_lowpri > pri) {
 			SCHED_STAT_INC(pickcpu_affinity);
 			return (ts->ts_cpu);
 		}
 	}
 	/*
 	 * If the thread can run on the last cpu and the affinity has not
 	 * expired or it is idle run it there.
 	 */
 	tdq = TDQ_CPU(ts->ts_cpu);
 	cg = tdq->tdq_cg;
 	if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
 	    tdq->tdq_lowpri >= PRI_MIN_IDLE &&
 	    SCHED_AFFINITY(ts, CG_SHARE_L2)) {
 		/*
 		 * For a CG_FLAG_THREAD group, look for any member whose
 		 * lowpri is better than idle.  NOTE(review): the test below
 		 * presumably relies on CPUSET_FOREACH leaving cpu above
 		 * mp_maxid when it completes without break -- confirm.
 		 */
 		if (cg->cg_flags & CG_FLAG_THREAD) {
 			CPUSET_FOREACH(cpu, cg->cg_mask) {
 				if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
 					break;
 			}
 		} else
 			cpu = INT_MAX;
 		if (cpu > mp_maxid) {
 			SCHED_STAT_INC(pickcpu_idle_affinity);
 			return (ts->ts_cpu);
 		}
 	}
 	/*
 	 * Search for the last level cache CPU group in the tree.
 	 * Skip caches with expired affinity time and SMT groups.
 	 * Affinity to higher level caches will be handled less aggressively.
 	 */
 	for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
 		if (cg->cg_flags & CG_FLAG_THREAD)
 			continue;
 		if (!SCHED_AFFINITY(ts, cg->cg_level))
 			continue;
 		ccg = cg;
 	}
 	/* cg is NULL after the loop; use the group found, if any. */
 	if (ccg != NULL)
 		cg = ccg;
 	cpu = -1;
 	/* Search the group for the less loaded idle CPU we can run now. */
 	mask = td->td_cpuset->cs_mask;
 	if (cg != NULL && cg != cpu_top &&
 	    CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0)
 		cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE),
 		    INT_MAX, ts->ts_cpu);
 	/* Search globally for the less loaded CPU we can run now. */
 	if (cpu == -1)
 		cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
 	/* Search globally for the less loaded CPU. */
 	if (cpu == -1)
 		cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
 	KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
 	/*
 	 * Compare the lowest loaded cpu to current cpu.
 	 */
 	if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
 	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
 	    TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) {
 		SCHED_STAT_INC(pickcpu_local);
 		cpu = self;
 	} else
 		SCHED_STAT_INC(pickcpu_lowest);
 	/* Record when the choice differs from the thread's previous cpu. */
 	if (cpu != ts->ts_cpu)
 		SCHED_STAT_INC(pickcpu_migration);
 	return (cpu);
 }
 #endif
 
 /*
  * Pick the highest priority task we have and return it.  The realtime
  * queue is consulted first, then the timeshare queue starting at
  * tdq_ridx, then the idle queue.  Returns NULL when all three are empty.
  * The queue lock must be held.
  */
 static struct thread *
 tdq_choose(struct tdq *tdq)
 {
 	struct thread *next;
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	if ((next = runq_choose(&tdq->tdq_realtime)) != NULL)
 		return (next);
 	next = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
 	if (next != NULL) {
 		KASSERT(next->td_priority >= PRI_MIN_BATCH,
 		    ("tdq_choose: Invalid priority on timeshare queue %d",
 		    next->td_priority));
 		return (next);
 	}
 	next = runq_choose(&tdq->tdq_idle);
 	if (next == NULL)
 		return (NULL);
 	KASSERT(next->td_priority >= PRI_MIN_IDLE,
 	    ("tdq_choose: Invalid priority on idle queue %d",
 	    next->td_priority));
 	return (next);
 }
 
 /*
  * Initialize a thread queue: its three run queues, its named spin lock
  * and, under KTR, the name used for load tracing.
  */
 static void
 tdq_setup(struct tdq *tdq)
 {
 	int id;
 
 	if (bootverbose)
 		printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
 	id = (int)TDQ_ID(tdq);
 
 	/* Realtime, timeshare and idle run queues. */
 	runq_init(&tdq->tdq_realtime);
 	runq_init(&tdq->tdq_timeshare);
 	runq_init(&tdq->tdq_idle);
 
 	/* The lock name must be formatted before the mutex is created. */
 	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), "sched lock %d", id);
 	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
 	    MTX_SPIN | MTX_RECURSE);
 #ifdef KTR
 	snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname),
 	    "CPU %d load", id);
 #endif
 }
 
 #ifdef SMP
 /*
  * SMP-only setup: fetch the cpu topology, initialize the queue of every
  * cpu and attach it to its cpu group, then make the current cpu's queue
  * the balancing queue and run the balancer once.
  */
 static void
 sched_setup_smp(void)
 {
 	struct tdq *queue;
 	int cpu;
 
 	cpu_top = smp_topo();
 	CPU_FOREACH(cpu) {
 		queue = TDQ_CPU(cpu);
 		tdq_setup(queue);
 		queue->tdq_cg = smp_topo_find(cpu_top, cpu);
 		/* Every cpu must appear somewhere in the topology. */
 		if (queue->tdq_cg == NULL)
 			panic("Can't find cpu group for %d\n", cpu);
 	}
 	balance_tdq = TDQ_SELF();
 	sched_balance();
 }
 #endif
 
 /*
  * Set up the thread queues (all cpus under SMP, otherwise just the local
  * one) and account for thread0, which is already running, on the current
  * cpu's queue.
  */
 static void
 sched_setup(void *dummy)
 {
 	struct tdq *self;
 
 	self = TDQ_SELF();
 #ifdef SMP
 	sched_setup_smp();
 #else
 	tdq_setup(self);
 #endif
 
 	/*
 	 * thread0 is running: give it this queue's lock, count it in the
 	 * load and publish its priority as the queue's lowest.
 	 */
 	TDQ_LOCK(self);
 	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
 	tdq_load_add(self, &thread0);
 	self->tdq_lowpri = thread0.td_priority;
 	TDQ_UNLOCK(self);
 }
 
 /*
  * Derive the scheduler's time constants once stathz and hz are known:
  * the slice lengths, hogticks, the per-tick increment and, under SMP, the
  * balance interval and affinity default.
  */
 /* ARGSUSED */
 static void
 sched_initticks(void *dummy)
 {
 	int scaled;
 
 	/* Fall back to hz when there is no separate statistics clock. */
 	if (stathz)
 		realstathz = stathz;
 	else
 		realstathz = hz;
 	sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR;
 	sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
 	hogticks = imax(1,
 	    (2 * hz * sched_slice + realstathz / 2) / realstathz);
 
 	/*
 	 * The increment is kept shifted by SCHED_TICK_SHIFT so that hz not
 	 * dividing evenly by stathz does not cause rounding errors.  For a
 	 * stathz larger than (1 << SCHED_TICK_SHIFT) * hz the quotient is
 	 * zero; clamp it to one.  In practice this does not happen.
 	 */
 	scaled = (hz << SCHED_TICK_SHIFT) / realstathz;
 	if (scaled == 0)
 		scaled = 1;
 	tickincr = scaled;
 #ifdef SMP
 	/* These defaults depend on realstathz, which is now known. */
 	balance_interval = realstathz;
 	affinity = SCHED_AFFINITY_DEFAULT;
 #endif
 	/* A negative threshold means it has not been set; compute it. */
 	if (sched_idlespinthresh < 0)
 		sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz;
 }
 
 
 /*
  * Core of the interactivity algorithm.  The score is derived from the
  * ratio of voluntary sleep time to run time: threads that mostly sleep
  * score below SCHED_INTERACT_HALF and threads that mostly run score
  * above it.  Sleep time here is voluntary sleep only, so the score is not
  * the same as cpu usage, which would also count time spent waiting on a
  * run-queue.  Integer math is used throughout.
  */
 static int
 sched_interact_score(struct thread *td)
 {
 	struct td_sched *ts;
 	int div;
 
 	ts = td->td_sched;
 	/*
 	 * When the interactive threshold is at or below the midpoint, a
 	 * thread that has run at least as long as it has slept cannot be
 	 * interactive, so skip the computation.
 	 */
 	if (sched_interact <= SCHED_INTERACT_HALF &&
 	    ts->ts_runtime >= ts->ts_slptime)
 		return (SCHED_INTERACT_HALF);
 
 	if (ts->ts_runtime > ts->ts_slptime) {
 		/* Mostly running: upper half of the range. */
 		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
 		return (SCHED_INTERACT_HALF +
 		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
 	} else if (ts->ts_slptime > ts->ts_runtime) {
 		/* Mostly sleeping: lower half of the range. */
 		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
 		return (ts->ts_runtime / div);
 	}
 	/*
 	 * runtime == slptime: the midpoint, unless both are zero in which
 	 * case the score is zero.
 	 */
 	return (ts->ts_runtime ? SCHED_INTERACT_HALF : 0);
 }
 
 /*
  * Compute the user priority of a timeshare thread from its
  * "interactivity" and hand it to sched_user_prio().  Threads of any other
  * priority class are left untouched.
  */
 static void
 sched_priority(struct thread *td)
 {
 	int pri;
 	int score;
 
 	if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
 		return;
 	/*
 	 * The nice value is added linearly to the interactivity score, so a
 	 * negative nice makes it easier to be considered interactive.
 	 */
 	score = imax(0, sched_interact_score(td) + td->td_proc->p_nice);
 	if (score >= sched_interact) {
 		/*
 		 * Batch: priority is decided partly by recent cpu
 		 * utilization and partly by the nice value.
 		 */
 		pri = SCHED_PRI_MIN;
 		if (td->td_sched->ts_ticks)
 			pri += min(SCHED_PRI_TICKS(td->td_sched),
 			    SCHED_PRI_RANGE);
 		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
 		KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH,
 		    ("sched_priority: invalid priority %d: nice %d, "
 		    "ticks %d ftick %d ltick %d tick pri %d",
 		    pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
 		    td->td_sched->ts_ftick, td->td_sched->ts_ltick,
 		    SCHED_PRI_TICKS(td->td_sched)));
 	} else {
 		/*
 		 * Interactive: the score is scaled into the interactive
 		 * priority range.  Nice restrictions do not apply here.
 		 */
 		pri = PRI_MIN_INTERACT +
 		    ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) /
 		    sched_interact) * score;
 		KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT,
 		    ("sched_priority: invalid interactive priority %d score %d",
 		    pri, score));
 	}
 	sched_user_prio(td, pri);
 }
 
 /*
  * Bound the amount of scheduling history kept for a thread.  Called
  * after either ts_slptime or ts_runtime has been adjusted; scales both
  * down once their sum reaches SCHED_SLP_RUN_MAX.
  */
 static void
 sched_interact_update(struct thread *td)
 {
 	struct td_sched *ts;
 	u_int sum;
 
 	ts = td->td_sched;
 	sum = ts->ts_runtime + ts->ts_slptime;
 	if (sum < SCHED_SLP_RUN_MAX)
 		return;
 	if (sum > SCHED_SLP_RUN_MAX * 2) {
 		/*
 		 * Far out of range, which only happens after an unusual
 		 * amount of run time was added from fork_exit or an
 		 * unusual amount of sleep time from sched_sleep().  Pin
 		 * the larger component to the maximum and the other to 1.
 		 */
 		if (ts->ts_runtime > ts->ts_slptime) {
 			ts->ts_runtime = SCHED_SLP_RUN_MAX;
 			ts->ts_slptime = 1;
 		} else {
 			ts->ts_slptime = SCHED_SLP_RUN_MAX;
 			ts->ts_runtime = 1;
 		}
 	} else if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
 		/*
 		 * More than 1/5th over: the 4/5 scaling below would not
 		 * bring the sum back into range, so halve instead.
 		 */
 		ts->ts_runtime /= 2;
 		ts->ts_slptime /= 2;
 	} else {
 		/* Slightly over: scale both components by 4/5. */
 		ts->ts_runtime = (ts->ts_runtime / 5) * 4;
 		ts->ts_slptime = (ts->ts_slptime / 5) * 4;
 	}
 }
 
 /*
  * Shrink the interactivity history a child inherited from its parent.
  * The child may behave very differently (a shell spawning a compiler,
  * for example), and a short history lets the scheduler learn its real
  * behavior quickly.
  */
 static void
 sched_interact_fork(struct thread *td)
 {
 	int ratio;
 	int sum;
 
 	sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
 	/* History is already short enough. */
 	if (sum <= SCHED_SLP_RUN_FORK)
 		return;
 	ratio = sum / SCHED_SLP_RUN_FORK;
 	td->td_sched->ts_runtime /= ratio;
 	td->td_sched->ts_slptime /= ratio;
 }
 
 /*
  * Called from proc0_init() to setup the scheduler fields of proc0 and
  * thread0.
  */
 void
 schedinit(void)
 {
 
 	/* Start thread0's tick history at the current tick, slice at 0. */
 	td_sched0.ts_ltick = ticks;
 	td_sched0.ts_ftick = ticks;
 	td_sched0.ts_slice = 0;
 
 	/* Attach the scheduler specific parts of proc0 and thread0. */
 	proc0.p_sched = NULL; /* XXX */
 	thread0.td_sched = &td_sched0;
 }
 
 /*
  * Report the round-robin interval in hz ticks.  This is only somewhat
  * accurate: many processes of the same priority switch when their slices
  * run out, which is at most sched_slice stathz ticks.
  */
 int
 sched_rr_interval(void)
 {
 	int hzticks;
 
 	/* Convert sched_slice from stathz to hz, rounding to nearest. */
 	hzticks = (sched_slice * hz + realstathz / 2) / realstathz;
 	return (imax(1, hzticks));
 }
 
 /*
  * Maintain the sliding tick history behind the percent cpu estimate.
  * Called when the value is requested or the history has grown past its
  * maximum.  When 'run' is non-zero the ticks since the last update are
  * credited to the thread.  This is less precise than the 4BSD mechanism
  * since it happens with less regular and frequent events.
  */
 static void
 sched_pctcpu_update(struct td_sched *ts, int run)
 {
 	int now = ticks;
 
 	if (now - ts->ts_ltick >= SCHED_TICK_TARG) {
 		/* Last update is older than the whole window: start over. */
 		ts->ts_ticks = 0;
 		ts->ts_ftick = now - SCHED_TICK_TARG;
 	} else if (now - ts->ts_ftick >= SCHED_TICK_MAX) {
 		/*
 		 * Window too long: scale the count to the part of the old
 		 * window that still lies within the target length.
 		 */
 		ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) *
 		    (ts->ts_ltick - (now - SCHED_TICK_TARG));
 		ts->ts_ftick = now - SCHED_TICK_TARG;
 	}
 	if (run) {
 		ts->ts_ticks += (now - ts->ts_ltick) << SCHED_TICK_SHIFT;
 	}
 	ts->ts_ltick = now;
 }
 
 /*
  * Adjust the priority of a thread.  Move it to the appropriate run-queue
  * if necessary.  This is the back-end for several priority related
  * functions.
  *
  * The thread lock must be held.  Nothing is changed when the thread
  * already has priority 'prio'.
  */
 static void
 sched_thread_priority(struct thread *td, u_char prio)
 {
 	struct td_sched *ts;
 	struct tdq *tdq;
 	int oldpri;
 
 	/* Trace the change; a raise of another thread is a "lend". */
 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
 	    "prio:%d", td->td_priority, "new prio:%d", prio,
 	    KTR_ATTR_LINKED, sched_tdname(curthread));
 	SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
 	if (td != curthread && prio < td->td_priority) {
 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
 		SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio, 
 		    curthread);
 	} 
 	ts = td->td_sched;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_priority == prio)
 		return;
 	/*
 	 * If the priority has been elevated due to priority
 	 * propagation, we may have to move ourselves to a new
 	 * queue.  This could be optimized to not re-add in some
 	 * cases.
 	 */
 	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
 		sched_rem(td);
 		td->td_priority = prio;
 		sched_add(td, SRQ_BORROWING);
 		return;
 	}
 	/*
 	 * If the thread is currently running we may have to adjust the lowpri
 	 * information so other cpus are aware of our current priority.
 	 */
 	if (TD_IS_RUNNING(td)) {
 		tdq = TDQ_CPU(ts->ts_cpu);
 		oldpri = td->td_priority;
 		td->td_priority = prio;
 		/*
 		 * A numerically lower value becomes the new lowpri directly.
 		 * Otherwise, if this thread's old priority was the recorded
 		 * lowpri, have tdq_setlowpri() recompute it.
 		 */
 		if (prio < tdq->tdq_lowpri)
 			tdq->tdq_lowpri = prio;
 		else if (tdq->tdq_lowpri == oldpri)
 			tdq_setlowpri(tdq, td);
 		return;
 	}
 	/* Neither on a run queue being raised nor running: just record it. */
 	td->td_priority = prio;
 }
 
 /*
  * Update a thread's priority when it is lent another thread's
  * priority.  Marks the thread with TDF_BORROWING and then applies 'prio'
  * through sched_thread_priority().
  */
 void
 sched_lend_prio(struct thread *td, u_char prio)
 {
 
 	td->td_flags |= TDF_BORROWING;
 	sched_thread_priority(td, prio);
 }
 
 /*
  * Undo priority lending once propagation is over.  'prio' is the minimum
  * priority the thread still needs in order to satisfy other outstanding
  * lending requests.  When the thread's regular priority is less
  * important than 'prio' it keeps a boost of 'prio'; otherwise it returns
  * to its regular priority and stops borrowing.
  */
 void
 sched_unlend_prio(struct thread *td, u_char prio)
 {
 	u_char regular;
 
 	/* Timeshare threads fall back to their user priority. */
 	regular = (td->td_base_pri >= PRI_MIN_TIMESHARE &&
 	    td->td_base_pri <= PRI_MAX_TIMESHARE) ?
 	    td->td_user_pri : td->td_base_pri;
 	if (prio < regular) {
 		/* Still needs a boost on behalf of another lender. */
 		sched_lend_prio(td, prio);
 		return;
 	}
 	td->td_flags &= ~TDF_BORROWING;
 	sched_thread_priority(td, regular);
 }
 
 /*
  * Standard entry for setting the priority to an absolute value.  The base
  * priority is always updated; the effective priority is changed unless
  * the thread is borrowing a priority that is more important than 'prio'.
  */
 void
 sched_prio(struct thread *td, u_char prio)
 {
 	u_char previous;
 
 	td->td_base_pri = prio;
 
 	/* Never lower the priority of a thread that is borrowing one. */
 	if ((td->td_flags & TDF_BORROWING) != 0 && td->td_priority < prio)
 		return;
 
 	previous = td->td_priority;
 	sched_thread_priority(td, prio);
 
 	/* A thread on a turnstile has the turnstile update its state. */
 	if (TD_ON_LOCK(td) && previous != prio)
 		turnstile_adjust(td, previous);
 }
 
 /*
  * Set the base user priority; this does not affect the current running
  * priority.  The effective user priority is updated only when 'prio' is
  * numerically lower than the lent user priority.
  */
 void
 sched_user_prio(struct thread *td, u_char prio)
 {
 
 	td->td_base_user_pri = prio;
 	if (td->td_lend_user_pri > prio)
 		td->td_user_pri = prio;
 }
 
 /*
  * Record a lent user priority.  The effective user priority becomes the
  * smaller of 'prio' and the base user priority.  If the thread's current
  * priority is numerically higher it is changed via sched_prio(); if it
  * merely differs, a reschedule is requested.  The thread lock must be
  * held.
  */
 void
 sched_lend_user_prio(struct thread *td, u_char prio)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_lend_user_pri = prio;
 	td->td_user_pri = min(prio, td->td_base_user_pri);
 	if (td->td_priority == td->td_user_pri)
 		return;
 	if (td->td_priority > td->td_user_pri)
 		sched_prio(td, td->td_user_pri);
 	else
 		td->td_flags |= TDF_NEEDRESCHED;
 }
 
 /*
  * Handle migration from sched_switch().  This happens only for
  * cpu binding.
  *
  * Returns the lock pointer of the queue selected by the thread's ts_cpu.
  * Without SMP nothing else is done.
  */
 static struct mtx *
 sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
 {
 	struct tdq *tdn;
 
 	tdn = TDQ_CPU(td->td_sched->ts_cpu);
 #ifdef SMP
 	/* The thread no longer counts toward the current queue's load. */
 	tdq_load_rem(tdq, td);
 	/*
 	 * Do the lock dance required to avoid LOR.  We grab an extra
 	 * spinlock nesting to prevent preemption while we're
 	 * not holding either run-queue lock.
 	 */
 	spinlock_enter();
 	thread_lock_block(td);	/* This releases the lock on tdq. */
 
 	/*
 	 * Acquire both run-queue locks before placing the thread on the new
 	 * run-queue to avoid deadlocks created by placing a thread with a
 	 * blocked lock on the run-queue of a remote processor.  The deadlock
 	 * occurs when a third processor attempts to lock the two queues in
 	 * question while the target processor is spinning with its own
 	 * run-queue lock held while waiting for the blocked lock to clear.
 	 */
 	tdq_lock_pair(tdn, tdq);
 	tdq_add(tdn, td, flags);
 	tdq_notify(tdn, td);
 	/* Only the destination lock is dropped; tdq remains locked. */
 	TDQ_UNLOCK(tdn);
 	spinlock_exit();
 #endif
 	return (TDQ_LOCKPTR(tdn));
 }
 
 /*
  * Variant of thread_lock_unblock() that does not assume td_lock
  * is blocked.  Stores 'mtx' into td->td_lock with a release-ordered
  * atomic store.
  */
 static inline void
 thread_unblock_switch(struct thread *td, struct mtx *mtx)
 {
 	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
 	    (uintptr_t)mtx);
 }
 
 /*
  * Switch threads.  This function has to handle threads coming in while
  * blocked for some reason, running, or idle.  It also must deal with
  * migrating a thread from one queue to another as running threads may
  * be assigned elsewhere via binding.
  *
  * The thread lock must be held and 'newtd' must be NULL; the next thread
  * is always obtained from choosethread().
  */
 void
 sched_switch(struct thread *td, struct thread *newtd, int flags)
 {
 	struct tdq *tdq;
 	struct td_sched *ts;
 	struct mtx *mtx;
 	int srqflag;
 	int cpuid, preempted;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument"));
 
 	cpuid = PCPU_GET(cpuid);
 	tdq = TDQ_CPU(cpuid);
 	ts = td->td_sched;
 	mtx = td->td_lock;
 	/* Bring the outgoing thread's accounting up to date. */
 	sched_pctcpu_update(ts, 1);
 	ts->ts_rltick = ticks;
 	td->td_lastcpu = td->td_oncpu;
 	td->td_oncpu = NOCPU;
 	/* A switch without TDF_SLICEEND set is treated as a preemption. */
 	preempted = !(td->td_flags & TDF_SLICEEND);
 	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
 	td->td_owepreempt = 0;
 	if (!TD_IS_IDLETHREAD(td))
 		tdq->tdq_switchcnt++;
 	/*
 	 * The lock pointer in an idle thread should never change.  Reset it
 	 * to CAN_RUN as well.
 	 */
 	if (TD_IS_IDLETHREAD(td)) {
 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 		TD_SET_CAN_RUN(td);
 	} else if (TD_IS_RUNNING(td)) {
 		/* Still runnable: requeue it locally or on another cpu. */
 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 		srqflag = preempted ?
 		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 		    SRQ_OURSELF|SRQ_YIELDING;
 #ifdef SMP
 		/* Pick a new cpu if the current one is no longer allowed. */
 		if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu))
 			ts->ts_cpu = sched_pickcpu(td, 0);
 #endif
 		if (ts->ts_cpu == cpuid)
 			tdq_runq_add(tdq, td, srqflag);
 		else {
 			KASSERT(THREAD_CAN_MIGRATE(td) ||
 			    (ts->ts_flags & TSF_BOUND) != 0,
 			    ("Thread %p shouldn't migrate", td));
 			mtx = sched_switch_migrate(tdq, td, srqflag);
 		}
 	} else {
 		/* This thread must be going to sleep. */
 		TDQ_LOCK(tdq);
 		mtx = thread_lock_block(td);
 		tdq_load_rem(tdq, td);
 	}
 	/*
 	 * We enter here with the thread blocked and assigned to the
 	 * appropriate cpu run-queue or sleep-queue and with the current
 	 * thread-queue locked.
 	 */
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
 	newtd = choosethread();
 	/*
 	 * Call the MD code to switch contexts if necessary.
 	 */
 	if (td != newtd) {
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
 		SDT_PROBE2(sched, , , off_cpu, newtd, newtd->td_proc);
 		/* Hand ownership of the run-queue lock to the new thread. */
 		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
 		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
 		sched_pctcpu_update(newtd->td_sched, 0);
 
 #ifdef KDTRACE_HOOKS
 		/*
 		 * If DTrace has set the active vtime enum to anything
 		 * other than INACTIVE (0), then it should have set the
 		 * function to call.
 		 */
 		if (dtrace_vtime_active)
 			(*dtrace_vtime_switch_func)(newtd);
 #endif
 
 		cpu_switch(td, newtd, mtx);
 		/*
 		 * We may return from cpu_switch on a different cpu.  However,
 		 * we always return with td_lock pointing to the current cpu's
 		 * run queue lock.
 		 */
 		cpuid = PCPU_GET(cpuid);
 		tdq = TDQ_CPU(cpuid);
 		lock_profile_obtain_lock_success(
 		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
 
 		SDT_PROBE0(sched, , , on_cpu);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
 	} else {
 		/* Same thread chosen again: just restore its lock pointer. */
 		thread_unblock_switch(td, mtx);
 		SDT_PROBE0(sched, , , remain_cpu);
 	}
 	/*
 	 * Assert that all went well and return.
 	 */
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 	td->td_oncpu = cpuid;
 }
 
 /*
  * Adjust thread priorities as a result of a nice request.  Stores the
  * new nice value in the process and then, for each of its threads,
  * recomputes the priority under the thread lock.  The process lock must
  * be held.
  */
 void
 sched_nice(struct proc *p, int nice)
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	p->p_nice = nice;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		/* Recompute the user priority, then apply it as the base. */
 		sched_priority(td);
 		sched_prio(td, td->td_base_user_pri);
 		thread_unlock(td);
 	}
 }
 
 /*
  * Record the time at which a thread goes to sleep for the interactivity
  * scorer and, for timeshare threads, apply the static sleep priority
  * boost when one is configured.  The thread lock must be held.
  */
 void
 sched_sleep(struct thread *td, int prio)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	td->td_slptick = ticks;
 	if (TD_IS_SUSPENDED(td) || prio >= PSOCK)
 		td->td_flags |= TDF_CANSWAP;
 	/* No boost for other classes or when boosting is switched off. */
 	if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE || static_boost == 0)
 		return;
 	/*
 	 * A static_boost of 1 uses the caller's sleep priority when one was
 	 * given; otherwise static_boost itself is the priority to raise to.
 	 */
 	if (static_boost == 1 && prio != 0)
 		sched_prio(td, prio);
 	else if (td->td_priority > static_boost)
 		sched_prio(td, static_boost);
 }
 
 /*
  * Make a sleeping thread runnable again.  The length of the voluntary
  * sleep is credited to its sleep time, the interactivity and pctcpu
  * history are refreshed, and the thread is handed to sched_add().  The
  * thread lock must be held.
  */
 void
 sched_wakeup(struct thread *td)
 {
 	struct td_sched *ts;
 	int slept_at;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td->td_sched;
 	td->td_flags &= ~TDF_CANSWAP;
 
 	/* Consume the recorded sleep start. */
 	slept_at = td->td_slptick;
 	td->td_slptick = 0;
 	/* Only a sleep that spanned a tick boundary changes the history. */
 	if (slept_at != 0 && slept_at != ticks) {
 		ts->ts_slptime += (ticks - slept_at) << SCHED_TICK_SHIFT;
 		sched_interact_update(td);
 		sched_pctcpu_update(ts, 0);
 	}
 
 	/* Having slept, the thread starts over with a fresh slice. */
 	ts->ts_slice = 0;
 	sched_add(td, SRQ_BORING);
 }
 
 /*
  * Penalize the parent for creating a new child and initialize the child's
  * priority.  The parent's thread lock must be held.
  */
 void
 sched_fork(struct thread *td, struct thread *child)
 {
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	/* Refresh the parent's tick history before the child copies it. */
 	sched_pctcpu_update(td->td_sched, 1);
 	sched_fork_thread(td, child);
 	/*
 	 * Penalize the parent and child for forking.
 	 */
 	sched_interact_fork(child);
 	sched_priority(child);
 	/* Charge the parent one tick increment of run time. */
 	td->td_sched->ts_runtime += tickincr;
 	sched_interact_update(td);
 	sched_priority(td);
 }
 
 /*
  * Initialize the scheduler state of a newly created thread, which may
  * belong to the same process as its creator.  The creator's thread lock
  * must be held.
  */
 void
 sched_fork_thread(struct thread *td, struct thread *child)
 {
 	struct td_sched *parent_ts, *child_ts;
 	struct tdq *self;
 
 	self = TDQ_SELF();
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	parent_ts = td->td_sched;
 	child_ts = child->td_sched;
 
 	/*
 	 * The child uses the current cpu's run-queue lock, takes a
 	 * reference on the parent's cpuset and starts on the parent's cpu
 	 * with no scheduler flags set.
 	 */
 	child->td_lock = TDQ_LOCKPTR(self);
 	child->td_cpuset = cpuset_ref(td->td_cpuset);
 	child_ts->ts_cpu = parent_ts->ts_cpu;
 	child_ts->ts_flags = 0;
 
 	/* Copy the parent's cpu estimation information. */
 	child_ts->ts_ticks = parent_ts->ts_ticks;
 	child_ts->ts_ltick = parent_ts->ts_ltick;
 	child_ts->ts_ftick = parent_ts->ts_ftick;
 
 	/* Use the child's own base priority, not a borrowed one. */
 	child->td_priority = child->td_base_pri;
 
 	/* Copy the parent's interactivity history. */
 	child_ts->ts_slptime = parent_ts->ts_slptime;
 	child_ts->ts_runtime = parent_ts->ts_runtime;
 
 	/* Attempt to quickly learn interactivity. */
 	child_ts->ts_slice = tdq_slice(self) - sched_slice_min;
 #ifdef KTR
 	bzero(child_ts->ts_name, sizeof(child_ts->ts_name));
 #endif
 }
 
 /*
  * Set the priority class of a thread.  The thread lock must be held; the
  * field is written only when the class actually changes.
  */
 void
 sched_class(struct thread *td, int class)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_pri_class != class)
 		td->td_pri_class = class;
 }
 
 /*
  * Process-exit hook: pass the exiting child's scheduling history to the
  * first thread of process 'p' through sched_exit_thread().  The process
  * lock of 'p' must be held.
  */
 void
 sched_exit(struct proc *p, struct thread *child)
 {
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit",
 	    "prio:%d", child->td_priority);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sched_exit_thread(FIRST_THREAD_IN_PROC(p), child);
 }
 
 /*
  * Penalize another thread for the time spent on this one.  This helps to
  * worsen the priority and interactivity of processes which schedule batch
  * jobs such as make.  This has little effect on the make process itself but
  * causes new processes spawned by it to receive worse scores immediately.
  *
  * 'td' is the thread that is charged and 'child' is the exiting thread.
  * The lock of 'td' is taken and released here.
  */
 void
 sched_exit_thread(struct thread *td, struct thread *child)
 {
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit",
 	    "prio:%d", child->td_priority);
 	/*
 	 * Give the child's runtime to the parent without returning the
 	 * sleep time as a penalty to the parent.  This causes shells that
 	 * launch expensive things to mark their children as expensive.
 	 */
 	thread_lock(td);
 	td->td_sched->ts_runtime += child->td_sched->ts_runtime;
 	/* Re-bound the history and recompute the priority from it. */
 	sched_interact_update(td);
 	sched_priority(td);
 	thread_unlock(td);
 }
 
 /*
  * Respond to a preemption request on the current cpu.  The pending-IPI
  * marker is cleared, and if a thread with a more important priority than
  * 'td' is recorded in tdq_lowpri, 'td' either switches away now or, when
  * inside a nested critical section, records that it owes a preemption.
  */
 void
 sched_preempt(struct thread *td)
 {
 	struct tdq *tdq;
 	int swtype;
 
 	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
 
 	thread_lock(td);
 	tdq = TDQ_SELF();
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	tdq->tdq_ipipending = 0;
 	if (td->td_priority > tdq->tdq_lowpri) {
 		if (td->td_critnest > 1) {
 			/* Defer the switch. */
 			td->td_owepreempt = 1;
 		} else {
 			swtype = TD_IS_IDLETHREAD(td) ?
 			    SWT_REMOTEWAKEIDLE : SWT_REMOTEPREEMPT;
 			mi_switch(SW_INVOL | SW_PREEMPT | swtype, NULL);
 		}
 	}
 	thread_unlock(td);
 }
 
 /*
  * Fix priorities on return to user-space.  Priorities may be elevated due
  * to static priorities in msleep() or similar.
  */
 void
 sched_userret(struct thread *td)
 {
 
 	KASSERT((td->td_flags & TDF_BORROWING) == 0,
 	    ("thread with borrowed priority returning to userland"));
 	/*
 	 * XXX the comparison is made without the thread lock so that the
 	 * usual case needs no locking.  Setting td_priority here is
 	 * essentially an incomplete workaround for not setting it properly
 	 * elsewhere.  Since some interrupt handlers are threads, it can be
 	 * clobbered between this point and the return to user mode, so no
 	 * effort is spent on setting it perfectly.
 	 */
 	if (td->td_priority == td->td_user_pri)
 		return;
 	thread_lock(td);
 	td->td_priority = td->td_user_pri;
 	td->td_base_pri = td->td_user_pri;
 	tdq_setlowpri(TDQ_SELF(), td);
 	thread_unlock(td);
 }
 
 /*
  * Handle a stathz tick.  This is really only relevant for timeshare
  * threads.
  *
  * The thread lock must be held.  Besides charging the tick to 'td' this
  * also drives the periodic balancer (SMP), refreshes the queue's switch
  * counters and advances the timeshare insert index.
  */
 void
 sched_clock(struct thread *td)
 {
 	struct tdq *tdq;
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	tdq = TDQ_SELF();
 #ifdef SMP
 	/*
 	 * We run the long term load balancer infrequently on the first cpu.
 	 */
 	if (balance_tdq == tdq) {
 		/* Count down; balance when the counter reaches zero. */
 		if (balance_ticks && --balance_ticks == 0)
 			sched_balance();
 	}
 #endif
 	/*
 	 * Save the old switch count so we have a record of the last ticks
 	 * activity.   Initialize the new switch count based on our load.
 	 * If there is some activity seed it to reflect that.
 	 */
 	tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
 	tdq->tdq_switchcnt = tdq->tdq_load;
 	/*
 	 * Advance the insert index once for each tick to ensure that all
 	 * threads get a chance to run.
 	 */
 	if (tdq->tdq_idx == tdq->tdq_ridx) {
 		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
 		/* The removal index follows only once its queue is empty. */
 		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
 			tdq->tdq_ridx = tdq->tdq_idx;
 	}
 	ts = td->td_sched;
 	sched_pctcpu_update(ts, 1);
 	/* Threads with PRI_FIFO_BIT set are not subject to slicing below. */
 	if (td->td_pri_class & PRI_FIFO_BIT)
 		return;
 	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) {
 		/*
 		 * We used a tick; charge it to the thread so
 		 * that we can compute our interactivity.
 		 */
 		td->td_sched->ts_runtime += tickincr;
 		sched_interact_update(td);
 		sched_priority(td);
 	}
 
 	/*
 	 * Force a context switch if the current thread has used up a full
 	 * time slice (default is 100ms).
 	 */
 	if (!TD_IS_IDLETHREAD(td) && ++ts->ts_slice >= tdq_slice(tdq)) {
 		ts->ts_slice = 0;
 		td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
 	}
 }
 
 /*
  * Hook invoked once per hz tick.  This scheduler has no work to do here.
  */
 void
 sched_tick(int cnt)
 {
 
 	/* Intentionally empty; the argument is not used. */
 	(void)cnt;
 }
 
 /*
  * Report whether the current CPU has runnable tasks.  Used for in-kernel
  * cooperative idle threads.  Returns 1 if there is something to run and
  * 0 otherwise.
  */
 int
 sched_runnable(void)
 {
 	struct tdq *tdq;
 
 	tdq = TDQ_SELF();
 
 	/* From the idle thread any load at all means there is work. */
 	if ((curthread->td_flags & TDF_IDLETD) != 0)
 		return (tdq->tdq_load > 0);
 
 	/* Otherwise one unit of load is discounted before testing. */
 	return (tdq->tdq_load - 1 > 0);
 }
 
 /*
  * Choose the highest priority thread to run.  The thread is removed from
  * the run-queue while running however the load remains.  For SMP we set
  * the tdq in the global idle bitmask if it idles here.
  */
 struct thread *
 sched_choose(void)
 {
 	struct thread *td;
 	struct tdq *tdq;
 
 	tdq = TDQ_SELF();
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	td = tdq_choose(tdq);
 	if (td) {
 		tdq_runq_rem(tdq, td);
 		tdq->tdq_lowpri = td->td_priority;
 		return (td);
 	}
 	tdq->tdq_lowpri = PRI_MAX_IDLE;
 	return (PCPU_GET(idlethread));
 }
 
 /*
  * Set owepreempt if necessary.  Preemption never happens directly in ULE,
  * we always request it once we exit a critical section.
  *
  * td is the thread that was just made runnable; its priority is compared
  * against curthread, whose thread lock must be held.
  */
 static inline void
 sched_setpreempt(struct thread *td)
 {
 	struct thread *ctd;
 	int cpri, pri;
 
 	THREAD_LOCK_ASSERT(curthread, MA_OWNED);
 
 	ctd = curthread;
 	pri = td->td_priority;
 	cpri = ctd->td_priority;
 
 	/* Only a numerically lower priority can disturb curthread. */
 	if (pri >= cpri)
 		return;
 	ctd->td_flags |= TDF_NEEDRESCHED;
 
 	/* Never owe a preemption while panicking, cold or inhibited. */
 	if (panicstr != NULL || cold || TD_IS_INHIBITED(ctd))
 		return;
 	if (sched_shouldpreempt(pri, cpri, 0))
 		ctd->td_owepreempt = 1;
 }
 
 /*
  * Queue a thread on an already selected thread queue: pick the matching
  * runq, insert the thread and account for its load.  This is the internal
  * worker used once the destination tdq is known; the tdq lock must be
  * held by the caller.
  */
 void
 tdq_add(struct tdq *tdq, struct thread *td, int flags)
 {
 
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 
 	/* Sanity checks on the state of the thread being queued. */
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
 	    ("sched_add: bad thread state"));
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 
 	/* Keep tdq_lowpri at the numerically lowest priority seen. */
 	if (tdq->tdq_lowpri > td->td_priority)
 		tdq->tdq_lowpri = td->td_priority;
 
 	tdq_runq_add(tdq, td, flags);
 	tdq_load_add(tdq, td);
 }
 
 /*
  * Select the target thread queue and add a thread to it.  Request
  * preemption or IPI a remote processor if required.
  *
  * The caller must hold td's thread lock.  flags are SRQ_* values; when
  * SRQ_YIELDING is set no local preemption is requested.
  */
 void
 sched_add(struct thread *td, int flags)
 {
 	struct tdq *tdq;
 #ifdef SMP
 	int cpu;
 #endif
 
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
 	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
 	    flags & SRQ_PREEMPTED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	/*
 	 * Recalculate the priority before we select the target cpu or
 	 * run-queue.
 	 */
 	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
 		sched_priority(td);
 #ifdef SMP
 	/*
 	 * Pick the destination cpu and if it isn't ours transfer to the
 	 * target cpu.
 	 */
 	cpu = sched_pickcpu(td, flags);
 	tdq = sched_setcpu(td, cpu, flags);
 	tdq_add(tdq, td, flags);
 	if (cpu != PCPU_GET(cpuid)) {
 		/*
 		 * Remote queue: notify that cpu and skip the local
 		 * preemption check below.
 		 */
 		tdq_notify(tdq, td);
 		return;
 	}
 #else
 	tdq = TDQ_SELF();
 	TDQ_LOCK(tdq);
 	/*
 	 * Now that the thread is moving to the run-queue, set the lock
 	 * to the scheduler's lock.
 	 */
 	thread_lock_set(td, TDQ_LOCKPTR(tdq));
 	tdq_add(tdq, td, flags);
 #endif
 	/* A yielding caller does not want to be preempted by td. */
 	if (!(flags & SRQ_YIELDING))
 		sched_setpreempt(td);
 }
 
 /*
  * Remove a thread from a run-queue without running it.  This is used
  * when we're stealing a thread from a remote queue.  Otherwise all threads
  * exit by calling sched_exit_thread() and sched_throw() themselves.
  *
  * The thread must be on a run queue and locked by the tdq of the cpu
  * recorded in its td_sched (both are asserted).
  */
 void
 sched_rem(struct thread *td)
 {
 	struct tdq *tdq;
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
 	    "prio:%d", td->td_priority);
 	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
 	tdq = TDQ_CPU(td->td_sched->ts_cpu);
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 	KASSERT(TD_ON_RUNQ(td),
 	    ("sched_rem: thread not on run queue"));
 	tdq_runq_rem(tdq, td);
 	tdq_load_rem(tdq, td);
 	TD_SET_CAN_RUN(td);
 	/*
 	 * The removed thread may have been the one that defined
 	 * tdq_lowpri; if so have it recomputed.
 	 */
 	if (td->td_priority == tdq->tdq_lowpri)
 		tdq_setlowpri(tdq, NULL);
 }
 
 /*
  * Return the cpu utilization of a thread as a fixed point value.  The
  * tick accounting is brought up to date on demand before it is used.
  * A thread without scheduler data, or without recorded ticks, yields 0.
  */
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
 	struct td_sched *ts;
 	int rtick;
 
 	ts = td->td_sched;
 	if (ts == NULL)
 		return (0);
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	sched_pctcpu_update(ts, TD_IS_RUNNING(td));
 	if (ts->ts_ticks == 0)
 		return (0);
 
 	/* How many rtick per second ? */
 	rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
 	return ((FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT);
 }
 
 /*
  * Enforce the affinity settings of a thread after its cpumask has been
  * changed.  Does nothing on non-SMP kernels.  The caller must hold the
  * thread lock.
  */
 void
 sched_affinity(struct thread *td)
 {
 #ifdef SMP
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td->td_sched;
 
 	/* The cpu the thread is assigned to is still allowed. */
 	if (THREAD_CAN_SCHED(td, ts->ts_cpu))
 		return;
 
 	if (TD_ON_RUNQ(td)) {
 		/* Requeue it; sched_add() picks the destination cpu again. */
 		sched_rem(td);
 		sched_add(td, SRQ_BORING);
 	} else if (TD_IS_RUNNING(td)) {
 		/*
 		 * Force a switch before returning to userspace.  If the
 		 * target thread is not running locally send an ipi to
 		 * force the issue.
 		 */
 		td->td_flags |= TDF_NEEDRESCHED;
 		if (td != curthread)
 			ipi_cpu(ts->ts_cpu, IPI_PREEMPT);
 	}
 #endif
 }
 
 /*
  * Bind the current thread to a target cpu.  An existing binding is
  * released first.  The thread lock must be held and not recursed.
  */
 void
 sched_bind(struct thread *td, int cpu)
 {
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
 	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
 
 	ts = td->td_sched;
 	if ((ts->ts_flags & TSF_BOUND) != 0)
 		sched_unbind(td);
 	KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td));
 
 	ts->ts_flags |= TSF_BOUND;
 	sched_pin();
 	if (PCPU_GET(cpuid) != cpu) {
 		ts->ts_cpu = cpu;
 		/* When we return from mi_switch we'll be on the correct cpu. */
 		mi_switch(SW_VOL, NULL);
 	}
 }
 
 /*
  * Undo sched_bind() for the current thread.  Calling this on a thread
  * that is not bound is a no-op.  The thread lock must be held.
  */
 void
 sched_unbind(struct thread *td)
 {
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
 
 	ts = td->td_sched;
 	if ((ts->ts_flags & TSF_BOUND) != 0) {
 		ts->ts_flags &= ~TSF_BOUND;
 		sched_unpin();
 	}
 }
 
 int
 sched_is_bound(struct thread *td)
 {
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	return (td->td_sched->ts_flags & TSF_BOUND);
 }
 
 /*
  * Basic yield call.
  *
  * Takes the thread lock, performs a voluntary context switch tagged
  * SWT_RELINQUISH and drops the lock again once the thread resumes.
  */
 void
 sched_relinquish(struct thread *td)
 {
 	thread_lock(td);
 	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
 	thread_unlock(td);
 }
 
 /*
  * Report the total system load.  On SMP kernels this is the sum of
  * tdq_sysload over every cpu, otherwise the value of the only tdq.
  */
 int
 sched_load(void)
 {
 #ifdef SMP
 	int cpu, sum;
 
 	sum = 0;
 	CPU_FOREACH(cpu)
 		sum += TDQ_CPU(cpu)->tdq_sysload;
 	return (sum);
 #else
 	return (TDQ_SELF()->tdq_sysload);
 #endif
 }
 
 /*
  * Size to use for a process structure: just struct proc, with no extra
  * space added for the scheduler.
  */
 int
 sched_sizeof_proc(void)
 {
 	return (sizeof(struct proc));
 }
 
 /*
  * Size to use for a thread structure: struct thread plus room for this
  * scheduler's per-thread struct td_sched.
  */
 int
 sched_sizeof_thread(void)
 {
 	return (sizeof(struct thread) + sizeof(struct td_sched));
 }
 
 /*
  * TDQ_IDLESPIN(tdq) tells the idle loop whether busy-waiting for work is
  * permitted on this queue.  On SMP it is true only when the tdq has a
  * cpu group and that group does not have CG_FLAG_THREAD set; without SMP
  * it is always true.
  */
 #ifdef SMP
 #define	TDQ_IDLESPIN(tdq)						\
     ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0)
 #else
 #define	TDQ_IDLESPIN(tdq)	1
 #endif
 
 /*
  * The actual idle process.
  *
  * Never returns.  Each pass switches away if the local tdq has load, on
  * SMP calls tdq_idled() when the switch count has moved, optionally spins
  * for a while waiting for load, and finally calls the machine dependent
  * cpu_idle().  The dummy argument is not used.
  */
 void
 sched_idletd(void *dummy)
 {
 	struct thread *td;
 	struct tdq *tdq;
 	int oldswitchcnt, switchcnt;
 	int i;
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	td = curthread;
 	tdq = TDQ_SELF();
 	THREAD_NO_SLEEPING();
 	/* -1 cannot match a real sum, so the first pass sees a change. */
 	oldswitchcnt = -1;
 	for (;;) {
 		if (tdq->tdq_load) {
 			thread_lock(td);
 			mi_switch(SW_VOL | SWT_IDLE, NULL);
 			thread_unlock(td);
 		}
 		/* Activity estimate: current plus previous tick's count. */
 		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 #ifdef SMP
 		if (switchcnt != oldswitchcnt) {
 			oldswitchcnt = switchcnt;
 			/* A zero return from tdq_idled() restarts the loop. */
 			if (tdq_idled(tdq) == 0)
 				continue;
 		}
 		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 #else
 		oldswitchcnt = switchcnt;
 #endif
 		/*
 		 * If we're switching very frequently, spin while checking
 		 * for load rather than entering a low power state that
 		 * may require an IPI.  However, don't do any busy
 		 * loops while on SMT machines as this simply steals
 		 * cycles from cores doing useful work.
 		 */
 		if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
 			for (i = 0; i < sched_idlespins; i++) {
 				if (tdq->tdq_load)
 					break;
 				cpu_spinwait();
 			}
 		}
 
 		/* If there was context switch during spin, restart it. */
 		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 		if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt)
 			continue;
 
 		/*
 		 * Run main MD idle handler.  tdq_cpu_idle is set only for
 		 * the duration of the call.
 		 * NOTE(review): the argument looks like a "busy" hint for
 		 * cpu_idle() -- confirm against its definition.
 		 */
 		tdq->tdq_cpu_idle = 1;
 		cpu_idle(switchcnt * 4 > sched_idlespinthresh);
 		tdq->tdq_cpu_idle = 0;
 
 		/*
 		 * Account thread-less hardware interrupts and
 		 * other wakeup reasons equal to context switches.
 		 */
 		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 		if (switchcnt != oldswitchcnt)
 			continue;
 		tdq->tdq_switchcnt++;
 		oldswitchcnt++;
 	}
 }
 
 /*
  * A CPU is entering for the first time or a thread is exiting.
  *
  * td is NULL in the first case and the exiting thread in the second.
  * Either way the next thread is chosen and switched to with cpu_throw(),
  * so this function does not return.
  */
 void
 sched_throw(struct thread *td)
 {
 	struct thread *newtd;
 	struct tdq *tdq;
 
 	tdq = TDQ_SELF();
 	if (td == NULL) {
 		/* Correct spinlock nesting and acquire the correct lock. */
 		TDQ_LOCK(tdq);
 		spinlock_exit();
 		PCPU_SET(switchtime, cpu_ticks());
 		PCPU_SET(switchticks, ticks);
 	} else {
 		/* The exiting thread must already be locked by this tdq. */
 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 		tdq_load_rem(tdq, td);
 		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
 	}
 	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
 	newtd = choosethread();
 	/*
 	 * Store the new thread into the lock word of the tdq lock.
 	 * NOTE(review): presumably this hands ownership of the lock to
 	 * newtd before the switch -- confirm against the mutex code.
 	 */
 	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
 	cpu_throw(td, newtd);		/* doesn't return */
 }
 
 /*
  * This is called from fork_exit().  Just acquire the correct locks and
  * let fork do the rest of the work.
  *
  * td is the thread that starts executing.  Its td_lock is expected to be
  * the lock of the current cpu's tdq; for an idle thread it is set to that
  * lock here.  The tdq lock must be held and not recursed.
  */
 void
 sched_fork_exit(struct thread *td)
 {
 	struct tdq *tdq;
 	int cpuid;
 
 	/*
 	 * Finish setting up thread glue so that it begins execution in a
 	 * non-nested critical section with the scheduler lock held.
 	 */
 	cpuid = PCPU_GET(cpuid);
 	tdq = TDQ_CPU(cpuid);
 	if (TD_IS_IDLETHREAD(td))
 		td->td_lock = TDQ_LOCKPTR(tdq);
 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 	td->td_oncpu = cpuid;
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
 	/* Record the lock acquisition for lock profiling. */
 	lock_profile_obtain_lock_success(
 	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
 }
 
 /*
  * Return the name used for a thread in scheduler traces.  With KTR the
  * name is built lazily on first use to catch odd startup conditions and
  * cached in the thread's td_sched; without KTR the plain td_name is
  * returned.
  */
 char *
 sched_tdname(struct thread *td)
 {
 #ifdef KTR
 	char *name;
 
 	name = td->td_sched->ts_name;
 	/* An empty cached name means it has not been generated yet. */
 	if (name[0] == '\0')
 		snprintf(name, sizeof(td->td_sched->ts_name),
 		    "%s tid %d", td->td_name, td->td_tid);
 	return (name);
 #else
 	return (td->td_name);
 #endif
 }
 
 #ifdef KTR
 /*
  * Invalidate the cached trace name of a thread so that sched_tdname()
  * builds it again on the next call.
  */
 void
 sched_clear_tdname(struct thread *td)
 {
 
 	td->td_sched->ts_name[0] = '\0';
 }
 #endif
 
 #ifdef SMP
 
 /*
  * Append the XML description of one cpu group, followed by all of its
  * children, to the sbuf.  The function calls itself for every child with
  * the indentation increased by two.  Always returns 0.
  */
 static int
 sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
     int indent)
 {
 	char cpusetbuf[CPUSETBUFSIZ];
 	int child, cpu, need_sep;
 
 	sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
 	    "", 1 + indent / 2, cg->cg_level);
 	sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
 	    cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask));
 
 	/* Comma separated list of the cpus in this group's mask. */
 	need_sep = FALSE;
 	for (cpu = 0; cpu < MAXCPU; cpu++) {
 		if (!CPU_ISSET(cpu, &cg->cg_mask))
 			continue;
 		if (need_sep)
 			sbuf_printf(sb, ", ");
 		need_sep = TRUE;
 		sbuf_printf(sb, "%d", cpu);
 	}
 	sbuf_printf(sb, "</cpu>\n");
 
 	/* The <flags> element is only emitted when a flag is set. */
 	if (cg->cg_flags != 0) {
 		sbuf_printf(sb, "%*s <flags>", indent, "");
 		if ((cg->cg_flags & CG_FLAG_HTT) != 0)
 			sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>");
 		if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
 			sbuf_printf(sb, "<flag name=\"THREAD\">THREAD group</flag>");
 		if ((cg->cg_flags & CG_FLAG_SMT) != 0)
 			sbuf_printf(sb, "<flag name=\"SMT\">SMT group</flag>");
 		sbuf_printf(sb, "</flags>\n");
 	}
 
 	if (cg->cg_children > 0) {
 		sbuf_printf(sb, "%*s <children>\n", indent, "");
 		for (child = 0; child < cg->cg_children; child++)
 			sysctl_kern_sched_topology_spec_internal(sb,
 			    &cg->cg_child[child], indent + 2);
 		sbuf_printf(sb, "%*s </children>\n", indent, "");
 	}
 
 	sbuf_printf(sb, "%*s</group>\n", indent, "");
 	return (0);
 }
 
 /*
  * Sysctl handler for retrieving topology dump. It's a wrapper for
  * the recursive sysctl_kern_sched_topology_spec_internal().
  *
  * Returns 0 on success, ENOMEM if the sbuf cannot be allocated, or the
  * error reported by sbuf_finish() or SYSCTL_OUT().
  */
 static int
 sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf *topo;
 	int err;
 
 	KASSERT(cpu_top != NULL, ("cpu_top isn't initialized"));
 
 	topo = sbuf_new(NULL, NULL, 500, SBUF_AUTOEXTEND);
 	if (topo == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(topo, "<groups>\n");
 	err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
 	sbuf_printf(topo, "</groups>\n");
 
 	/*
 	 * Only copy the dump out when the sbuf finished cleanly; an
 	 * overflowed sbuf has no valid length to hand to SYSCTL_OUT().
 	 */
 	if (err == 0)
 		err = sbuf_finish(topo);
 	if (err == 0)
 		err = SYSCTL_OUT(req, sbuf_data(topo), sbuf_len(topo));
 	sbuf_delete(topo);
 	return (err);
 }
 
 #endif
 
 static int
 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
 {
 	int error, new_val, period;
 
 	period = 1000000 / realstathz;
 	new_val = period * sched_slice;
 	error = sysctl_handle_int(oidp, &new_val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (new_val <= 0)
 		return (EINVAL);
 	sched_slice = imax(1, (new_val + period / 2) / period);
 	sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
 	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
 	    realstathz);
 	return (0);
 }
 
 /*
  * The kern.sched sysctl tree.  Apart from the read-only "name" and
  * "topology_spec" entries, each knob below is read-write and backed
  * directly by the named scheduler variable or handler.
  */
 SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
     "Scheduler name");
 SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, sysctl_kern_quantum, "I",
     "Quantum for timeshare threads in microseconds");
 SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
     "Quantum for timeshare threads in stathz ticks");
 SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
     "Interactivity score threshold");
 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW,
     &preempt_thresh, 0,
     "Maximal (lowest) priority for preemption");
 SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0,
     "Assign static kernel priorities to sleeping threads");
 SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0,
     "Number of times idle thread will spin waiting for new work");
 SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW,
     &sched_idlespinthresh, 0,
     "Threshold before we will permit idle thread spinning");
 #ifdef SMP
 /* Knobs that only exist on SMP kernels. */
 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
     "Number of hz ticks to keep thread affinity for");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
     "Enables the long-term load balancer");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
     &balance_interval, 0,
     "Average period in stathz ticks to run the long-term balancer");
 SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
     "Attempts to steal work from other cores before idling");
 SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
     "Minimum load on remote CPU before we'll steal");
 SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
     CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A",
     "XML dump of detected CPU topology");
 #endif
 
 /* ps compat.  All cpu percentages from ULE are weighted. */
 static int ccpu = 0;
 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
Index: head/sys/kern/subr_devstat.c
===================================================================
--- head/sys/kern/subr_devstat.c	(revision 258540)
+++ head/sys/kern/subr_devstat.c	(revision 258541)
@@ -1,578 +1,576 @@
 /*-
  * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
-
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/devicestat.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/conf.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/atomic.h>
 
 /* DTrace "io" provider and its probes; each takes a bio and a devstat. */
 SDT_PROVIDER_DEFINE(io);
 
 SDT_PROBE_DEFINE2(io, , , start, start, "struct bio *", "struct devstat *");
 SDT_PROBE_DEFINE2(io, , , done, done, "struct bio *", "struct devstat *");
 SDT_PROBE_DEFINE2(io, , , wait_start, wait-start, "struct bio *",
     "struct devstat *");
 SDT_PROBE_DEFINE2(io, , , wait_done, wait-done, "struct bio *",
     "struct devstat *");
 
 /*
  * Shorthands for firing the probes.  They expand to references to a local
  * variable named "ds" and, for the _BIO_ variants, one named "bp", so
  * they can only be used in functions that have those names in scope.
  * The non-BIO variants pass NULL for the bio argument.
  */
 #define	DTRACE_DEVSTAT_START()		SDT_PROBE2(io, , , start, NULL, ds)
 #define	DTRACE_DEVSTAT_BIO_START()	SDT_PROBE2(io, , , start, bp, ds)
 #define	DTRACE_DEVSTAT_DONE()		SDT_PROBE2(io, , , done, NULL, ds)
 #define	DTRACE_DEVSTAT_BIO_DONE()	SDT_PROBE2(io, , , done, bp, ds)
 #define	DTRACE_DEVSTAT_WAIT_START()	SDT_PROBE2(io, , , wait_start, NULL, ds)
 #define	DTRACE_DEVSTAT_WAIT_DONE()	SDT_PROBE2(io, , , wait_done, NULL, ds)
 
 /* Number of entries currently on device_statq. */
 static int devstat_num_devs;
 /* Bumped whenever an entry is created or removed. */
 static long devstat_generation = 1;
 static int devstat_version = DEVSTAT_VERSION;
 /* Source of device_number values; only ever incremented. */
 static int devstat_current_devnumber;
 /* Protects the device list, the counters above and the page allocator. */
 static struct mtx devstat_mutex;
 MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF);
 
 /* Priority-sorted list of registered devices. */
 static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq);
 static struct devstat *devstat_alloc(void);
 static void devstat_free(struct devstat *);
 static void devstat_add_entry(struct devstat *ds, const void *dev_name, 
 		       int unit_number, uint32_t block_size,
 		       devstat_support_flags flags,
 		       devstat_type_flags device_type,
 		       devstat_priority priority);
 
 /*
  * Allocate a devstat structure and initialize it.
  *
  * With a unit_number of -1 the entry is only stamped (unit number, id
  * and creation time) and is not put on the device list; any other unit
  * number goes through devstat_add_entry().  Must be called without
  * devstat_mutex held.  Returns the new entry.
  */
 struct devstat *
 devstat_new_entry(const void *dev_name,
 		  int unit_number, uint32_t block_size,
 		  devstat_support_flags flags,
 		  devstat_type_flags device_type,
 		  devstat_priority priority)
 {
 	struct devstat *ds;
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 
 	ds = devstat_alloc();
 
 	mtx_lock(&devstat_mutex);
 	if (unit_number != -1) {
 		devstat_add_entry(ds, dev_name, unit_number, block_size,
 				  flags, device_type, priority);
 	} else {
 		/* Unlisted entry: dev_name is kept as an opaque id. */
 		ds->unit_number = unit_number;
 		ds->id = dev_name;
 		binuptime(&ds->creation_time);
 		devstat_generation++;
 	}
 	mtx_unlock(&devstat_mutex);
 
 	return (ds);
 }
 
 /*
  * Take a malloced and zeroed devstat structure given to us, fill it in
  * and add it to the queue of devices.
  *
  * Must be called with devstat_mutex held.  dev_name is copied into the
  * entry as a string of at most DEVSTAT_NAME_LEN bytes.  The device count
  * and the generation number are both incremented.
  */
 static void
 devstat_add_entry(struct devstat *ds, const void *dev_name, 
 		  int unit_number, uint32_t block_size,
 		  devstat_support_flags flags,
 		  devstat_type_flags device_type,
 		  devstat_priority priority)
 {
 	struct devstatlist *devstat_head;
 	struct devstat *ds_tmp;
 
 	mtx_assert(&devstat_mutex, MA_OWNED);
 	devstat_num_devs++;
 
 	devstat_head = &device_statq;
 
 	/*
 	 * Priority sort.  Each driver passes in its priority when it adds
 	 * its devstat entry.  Drivers are sorted first by priority, and
 	 * then by probe order.
 	 *
 	 * For the first device, we just insert it, since the priority
 	 * doesn't really matter yet.  Subsequent devices are inserted into
 	 * the list using the order outlined above.
 	 */
 	if (devstat_num_devs == 1)
 		STAILQ_INSERT_TAIL(devstat_head, ds, dev_links);
 	else {
 		STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) {
 			struct devstat *ds_next;
 
 			ds_next = STAILQ_NEXT(ds_tmp, dev_links);
 
 			/*
 			 * If we find a break between higher and lower
 			 * priority items, and if this item fits in the
 			 * break, insert it.  This also applies if the
 			 * "lower priority item" is the end of the list.
 			 */
 			if ((priority <= ds_tmp->priority)
 			 && ((ds_next == NULL)
 			   || (priority > ds_next->priority))) {
 				STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds,
 						    dev_links);
 				break;
 			} else if (priority > ds_tmp->priority) {
 				/*
 				 * If this is the case, we should be able
 				 * to insert ourselves at the head of the
 				 * list.  If we can't, something is wrong.
 				 */
 				if (ds_tmp == STAILQ_FIRST(devstat_head)) {
 					STAILQ_INSERT_HEAD(devstat_head,
 							   ds, dev_links);
 					break;
 				} else {
 					/*
 					 * Unexpected ordering: append at
 					 * the tail and complain.
 					 */
 					STAILQ_INSERT_TAIL(devstat_head,
 							   ds, dev_links);
 					printf("devstat_add_entry: HELP! "
 					       "sorting problem detected "
 					       "for name %p unit %d\n",
 					       dev_name, unit_number);
 					break;
 				}
 			}
 		}
 	}
 
 	/* Fill in the identity of the new entry. */
 	ds->device_number = devstat_current_devnumber++;
 	ds->unit_number = unit_number;
 	strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
 	ds->block_size = block_size;
 	ds->flags = flags;
 	ds->device_type = device_type;
 	ds->priority = priority;
 	binuptime(&ds->creation_time);
 	devstat_generation++;
 }
 
 /*
  * Remove a devstat structure from the list of devices.
  *
  * A NULL ds is ignored.  Must be called without devstat_mutex held; the
  * entry is unlinked (if it was listed) and released under the mutex and
  * the generation number is incremented.
  */
 void
 devstat_remove_entry(struct devstat *ds)
 {
 	struct devstatlist *devstat_head;
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 	if (ds == NULL)
 		return;
 
 	mtx_lock(&devstat_mutex);
 
 	devstat_head = &device_statq;
 
 	/* Remove this entry from the devstat queue */
 	atomic_add_acq_int(&ds->sequence1, 1);
 	/*
 	 * Entries with unit number -1 are never put on the list (see
 	 * devstat_new_entry()), so there is nothing to unlink for them.
 	 */
 	if (ds->unit_number != -1) {
 		devstat_num_devs--;
 		STAILQ_REMOVE(devstat_head, ds, devstat, dev_links);
 	}
 	devstat_free(ds);
 	devstat_generation++;
 	mtx_unlock(&devstat_mutex);
 }
 
 /*
  * Record a transaction start.
  *
  * See comments for devstat_end_transaction().  Ordering is very important
  * here.
  *
  * A NULL ds is ignored.  now, when not NULL, is used as the start of a
  * new busy period; otherwise the current uptime is sampled.
  */
 void
 devstat_start_transaction(struct devstat *ds, struct bintime *now)
 {
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 
 	/* sanity check */
 	if (ds == NULL)
 		return;
 
 	/* sequence1 is bumped before and sequence0 after the update. */
 	atomic_add_acq_int(&ds->sequence1, 1);
 	/*
 	 * We only want to set the start time when we are going from idle
 	 * to busy.  The start time is really the start of the latest busy
 	 * period.
 	 */
 	if (ds->start_count == ds->end_count) {
 		if (now != NULL)
 			ds->busy_from = *now;
 		else
 			binuptime(&ds->busy_from);
 	}
 	ds->start_count++;
 	atomic_add_rel_int(&ds->sequence0, 1);
 	DTRACE_DEVSTAT_START();
 }
 
 /*
  * Record the start of a transaction for a bio.  The current uptime is
  * stored in bp->bio_t0 and used as the start time.  A NULL ds is ignored,
  * in which case bio_t0 is left untouched.
  */
 void
 devstat_start_transaction_bio(struct devstat *ds, struct bio *bp)
 {
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 
 	/* sanity check */
 	if (ds == NULL)
 		return;
 
 	binuptime(&bp->bio_t0);
 	devstat_start_transaction(ds, &bp->bio_t0);
 	DTRACE_DEVSTAT_BIO_START();
 }
 
 /*
  * Record the ending of a transaction, and increment the various counters.
  *
  * Ordering in this function, and in devstat_start_transaction() is VERY
  * important.  The idea here is to run without locks, so we are very
  * careful to only modify some fields on the way "down" (i.e. at
  * transaction start) and some fields on the way "up" (i.e. at transaction
  * completion).  One exception is busy_from, which we only modify in
  * devstat_start_transaction() when there are no outstanding transactions,
  * and thus it can't be modified in devstat_end_transaction()
  * simultaneously.
  *
  * The sequence0 and sequence1 fields are provided to enable an application
  * spying on the structures with mmap(2) to tell when a structure is in a
  * consistent state or not.
  *
  * For this to work 100% reliably, it is important that the two fields
  * are at opposite ends of the structure and that they are incremented
  * in the opposite order of how a memcpy(3) in userland would copy them.
  * We assume that the copying happens front to back, but there is actually
  * no way short of writing your own memcpy(3) replacement to guarantee
  * this will be the case.
  *
  * In addition to this, being a kind of locks, they must be updated with
  * atomic instructions using appropriate memory barriers.
  *
  * A NULL ds is ignored.  bytes is added to the byte counter selected by
  * flags.  now may be NULL, in which case the current uptime is sampled.
  * then may be NULL, in which case no duration is accumulated.
  */
 void
 devstat_end_transaction(struct devstat *ds, uint32_t bytes, 
 			devstat_tag_type tag_type, devstat_trans_flags flags,
 			struct bintime *now, struct bintime *then)
 {
 	struct bintime dt, lnow;
 
 	/* sanity check */
 	if (ds == NULL)
 		return;
 
 	if (now == NULL) {
 		now = &lnow;
 		binuptime(now);
 	}
 
 	atomic_add_acq_int(&ds->sequence1, 1);
 	/* Update byte and operations counts */
 	ds->bytes[flags] += bytes;
 	ds->operations[flags]++;
 
 	/*
 	 * Keep a count of the various tag types sent.
 	 */
 	if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 &&
 	    tag_type != DEVSTAT_TAG_NONE)
 		ds->tag_types[tag_type]++;
 
 	if (then != NULL) {
 		/* Update duration of operations */
 		dt = *now;
 		bintime_sub(&dt, then);
 		bintime_add(&ds->duration[flags], &dt);
 	}
 
 	/*
 	 * Accumulate busy time: add the span since busy_from and restart
 	 * the measurement from now.
 	 */
 	dt = *now;
 	bintime_sub(&dt, &ds->busy_from);
 	bintime_add(&ds->busy_time, &dt);
 	ds->busy_from = *now;
 
 	ds->end_count++;
 	atomic_add_rel_int(&ds->sequence0, 1);
 	DTRACE_DEVSTAT_DONE();
 }
 
 /*
  * Record the end of a transaction for a bio, using the current time as
  * the completion time (a NULL "now" is passed on).
  */
 void
 devstat_end_transaction_bio(struct devstat *ds, struct bio *bp)
 {
 
 	devstat_end_transaction_bio_bt(ds, bp, NULL);
 }
 
 /*
  * Record the end of a transaction for a bio with an optional completion
  * time.  The bio command selects the statistics class, the transferred
  * size is bio_bcount minus bio_resid and the duration is measured from
  * bp->bio_t0.  A NULL ds is ignored.
  */
 void
 devstat_end_transaction_bio_bt(struct devstat *ds, struct bio *bp,
     struct bintime *now)
 {
 	devstat_trans_flags flg;
 
 	/* sanity check */
 	if (ds == NULL)
 		return;
 
 	/* Map the bio command onto a devstat transaction class. */
 	switch (bp->bio_cmd) {
 	case BIO_DELETE:
 		flg = DEVSTAT_FREE;
 		break;
 	case BIO_READ:
 		flg = DEVSTAT_READ;
 		break;
 	case BIO_WRITE:
 		flg = DEVSTAT_WRITE;
 		break;
 	default:
 		flg = DEVSTAT_NO_DATA;
 		break;
 	}
 
 	devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
 				DEVSTAT_TAG_SIMPLE, flg, now, &bp->bio_t0);
 	DTRACE_DEVSTAT_BIO_DONE();
 }
 
 /*
  * This is the sysctl handler for the devstat package.  The data pushed out
  * on the kern.devstat.all sysctl variable consists of the current devstat
  * generation number, and then an array of devstat structures, one for each
  * device in the system.
  *
  * This is more cryptic than obvious, but basically we neither can nor
  * want to hold the devstat_mutex for any amount of time, so we grab it
  * only when we need to and keep an eye on devstat_generation all the time.
  *
  * Returns 0 on success, the error from SYSCTL_OUT() if a copy-out fails,
  * or EBUSY if the generation number changed while the list was walked.
  */
 static int
 sysctl_devstat(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	long mygen;
 	struct devstat *nds;
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 
 	/*
 	 * XXX devstat_generation should really be "volatile" but that
 	 * XXX freaks out the sysctl macro below.  The places where we
 	 * XXX change it and inspect it are bracketed in the mutex which
 	 * XXX guarantees us proper write barriers.  I don't believe the
 	 * XXX compiler is allowed to optimize mygen away across calls
 	 * XXX to other functions, so the following is believed to be safe.
 	 */
 	mygen = devstat_generation;
 
 	/*
 	 * Check the copy-out result before taking the no-devices shortcut
 	 * so that a failure is reported in that case as well.
 	 */
 	error = SYSCTL_OUT(req, &mygen, sizeof(mygen));
 	if (error != 0)
 		return (error);
 
 	if (devstat_num_devs == 0)
 		return (0);
 
 	/* Fetch the list head and make sure nothing changed meanwhile. */
 	mtx_lock(&devstat_mutex);
 	nds = STAILQ_FIRST(&device_statq);
 	if (mygen != devstat_generation)
 		error = EBUSY;
 	mtx_unlock(&devstat_mutex);
 
 	if (error != 0)
 		return (error);
 
 	while (nds != NULL) {
 		/* The copy-out itself is done without the mutex. */
 		error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
 		if (error != 0)
 			return (error);
 		/*
 		 * Only follow the link if the list is still the one we
 		 * started with.
 		 */
 		mtx_lock(&devstat_mutex);
 		if (mygen != devstat_generation)
 			error = EBUSY;
 		else
 			nds = STAILQ_NEXT(nds, dev_links);
 		mtx_unlock(&devstat_mutex);
 		if (error != 0)
 			return (error);
 	}
 	return (error);
 }
 
 /*
  * Sysctl entries for devstat.  The first one is a node that all the rest
  * hang off of.  All entries are read-only.
  */
 static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, NULL,
     "Device Statistics");
 
 /* kern.devstat.all: generation number followed by every devstat entry. */
 SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE,
     NULL, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list");
 /*
  * Export the number of devices in the system so that userland utilities
  * can determine how much memory to allocate to hold all the devices.
  */
 SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, 
     &devstat_num_devs, 0, "Number of devices in the devstat list");
 SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
     &devstat_generation, 0, "Devstat list generation");
 SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, 
     &devstat_version, 0, "Devstat list version number");
 
 /*
  * Allocator for struct devstat structures.  We sub-allocate these from pages
  * which we get from malloc.  These pages are exported for mmap(2)'ing through
  * a miniature device driver
  */
 
 /* Number of struct devstat that fit into one page. */
 #define statsperpage (PAGE_SIZE / sizeof(struct devstat))
 
 static d_mmap_t devstat_mmap;
 
 /* Character device through which the statistics pages can be mapped. */
 static struct cdevsw devstat_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	D_NEEDGIANT,
 	.d_mmap =	devstat_mmap,
 	.d_name =	"devstat",
 };
 
 /* Bookkeeping for one page of devstat structures. */
 struct statspage {
 	TAILQ_ENTRY(statspage)	list;	/* link on pagelist */
 	struct devstat		*stat;	/* the page of devstat slots */
 	u_int			nfree;	/* unallocated slots in the page */
 };
 
 /* All statistics pages, in the order in which they are mapped. */
 static TAILQ_HEAD(, statspage)	pagelist = TAILQ_HEAD_INITIALIZER(pagelist);
 static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics");
 
 /*
  * mmap handler of the devstat device.  Offset 0 maps the first statistics
  * page, offset PAGE_SIZE the second, and so on in list order.  Only
  * read-only mappings are allowed.  Returns 0 and stores the physical
  * address in *paddr on success, -1 otherwise.
  */
 static int
 devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
     int nprot, vm_memattr_t *memattr)
 {
 	struct statspage *page;
 	vm_ooffset_t remaining;
 
 	if (nprot != VM_PROT_READ)
 		return (-1);
 
 	/* Walk the pages, consuming one page worth of offset per entry. */
 	remaining = offset;
 	for (page = TAILQ_FIRST(&pagelist); page != NULL;
 	    page = TAILQ_NEXT(page, list)) {
 		if (remaining != 0) {
 			remaining -= PAGE_SIZE;
 			continue;
 		}
 		*paddr = vtophys(page->stat);
 		return (0);
 	}
 
 	/* The offset does not correspond to the start of any page. */
 	return (-1);
 }
 
 /*
  * Hand out a free struct devstat slot, allocating a new statistics page
  * when no page on pagelist has a free slot.  Must be called without
  * devstat_mutex held; the allocations use M_WAITOK.  The returned slot is
  * marked allocated.
  */
 static struct devstat *
 devstat_alloc(void)
 {
 	struct devstat *dsp;
 	struct statspage *spp, *spp2;
 	u_int u;
 	static int once;
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 	/* Create the devstat device node on the first call. */
 	if (!once) {
 		make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME,
 		    &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
 		    DEVSTAT_DEVICE_NAME);
 		once = 1;
 	}
 	spp2 = NULL;
 	mtx_lock(&devstat_mutex);
 	for (;;) {
 		/* Look for a page that still has a free slot. */
 		TAILQ_FOREACH(spp, &pagelist, list) {
 			if (spp->nfree > 0)
 				break;
 		}
 		if (spp != NULL)
 			break;
 		/* None found: drop the mutex to allocate a new page. */
 		mtx_unlock(&devstat_mutex);
 		spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK);
 		spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK);
 		spp2->nfree = statsperpage;
 
 		/*
 		 * If free statspages were added while the lock was released
 		 * just reuse them.
 		 */
 		mtx_lock(&devstat_mutex);
 		TAILQ_FOREACH(spp, &pagelist, list)
 			if (spp->nfree > 0)
 				break;
 		if (spp == NULL) {
 			spp = spp2;
 
 			/*
 			 * It would make more sense to add the new page at the
 			 * head but the order on the list determine the
 			 * sequence of the mapping so we can't do that.
 			 */
 			TAILQ_INSERT_TAIL(&pagelist, spp, list);
 		} else
 			break;
 	}
 	/* Find the first unallocated slot within the chosen page. */
 	dsp = spp->stat;
 	for (u = 0; u < statsperpage; u++) {
 		if (dsp->allocated == 0)
 			break;
 		dsp++;
 	}
 	spp->nfree--;
 	dsp->allocated = 1;
 	mtx_unlock(&devstat_mutex);
 	/* Release a page that was allocated above but not needed after all. */
 	if (spp2 != NULL && spp2 != spp) {
 		free(spp2->stat, M_DEVSTAT);
 		free(spp2, M_DEVSTAT);
 	}
 	return (dsp);
 }
 
 /*
  * Give a devstat slot back to its statistics page.  The slot is zeroed
  * and the free count of the page containing it is incremented; a pointer
  * that lies in none of the pages is zeroed only.  devstat_mutex must be
  * held.
  */
 static void
 devstat_free(struct devstat *dsp)
 {
 	struct statspage *page;
 
 	mtx_assert(&devstat_mutex, MA_OWNED);
 	bzero(dsp, sizeof *dsp);
 
 	TAILQ_FOREACH(page, &pagelist, list) {
 		/* Skip pages whose slot range does not contain dsp. */
 		if (dsp < page->stat || dsp >= (page->stat + statsperpage))
 			continue;
 		page->nfree++;
 		return;
 	}
 }
 
 /* debug.sizeof.devstat: read-only constant reporting sizeof(struct devstat). */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD,
     NULL, sizeof(struct devstat), "sizeof(struct devstat)");
Index: head/sys/kern/subr_sleepqueue.c
===================================================================
--- head/sys/kern/subr_sleepqueue.c	(revision 258540)
+++ head/sys/kern/subr_sleepqueue.c	(revision 258541)
@@ -1,1236 +1,1235 @@
 /*-
  * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Implementation of sleep queues used to hold queue of threads blocked on
  * a wait channel.  Sleep queues differ from turnstiles in that wait
  * channels are not owned by anyone, so there is no priority propagation.
  * Sleep queues can also provide a timeout and can also be interrupted by
  * signals.  That said, there are several similarities between the turnstile
  * and sleep queue implementations.  (Note: turnstiles were implemented
  * first.)  For example, both use a hash table of the same size where each
  * bucket is referred to as a "chain" that contains both a spin lock and
  * a linked list of queues.  An individual queue is located by using a hash
  * to pick a chain, locking the chain, and then walking the chain searching
  * for the queue.  This means that a wait channel object does not need to
  * embed its queue head just as locks do not embed their turnstile queue
  * head.  Threads also carry around a sleep queue that they lend to the
  * wait channel when blocking.  Just as in turnstiles, the queue includes
  * a free list of the sleep queues of other threads blocked on the same
  * wait channel in the case of multiple waiters.
  *
  * Some additional functionality provided by sleep queues include the
  * ability to set a timeout.  The timeout is managed using a per-thread
  * callout that resumes a thread if it is asleep.  A thread may also
  * catch signals while it is asleep (aka an interruptible sleep).  The
  * signal code uses sleepq_abort() to interrupt a sleeping thread.  Finally,
  * sleep queues also provide some extra assertions.  One is not allowed to
  * mix the sleep/wakeup and cv APIs for a given wait channel.  Also, one
  * must consistently use the same lock to synchronize with a wait channel,
  * though this check is currently only a warning for sleep/wakeup due to
  * pre-existing abuse of that API.  The same lock must also be held when
  * awakening threads, though that is currently only enforced for condition
  * variables.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_sleepqueue_profiling.h"
 #include "opt_ddb.h"
-#include "opt_kdtrace.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 
 #include <vm/uma.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 /*
  * Constants for the hash table of sleep queue chains.
  * SC_TABLESIZE must be a power of two for SC_MASK to work properly.
  *
  * SC_HASH() XORs the wait channel address with itself shifted right by
  * SC_SHIFT bits and masks the result down to a table index.  SC_LOOKUP()
  * yields the address of the matching entry in sleepq_chains[]; note that
  * its expansion is not parenthesized, so assign the result to a variable
  * before applying '->' to it.
  */
 #define	SC_TABLESIZE	256			/* Must be power of 2. */
 #define	SC_MASK		(SC_TABLESIZE - 1)
 #define	SC_SHIFT	8
 #define	SC_HASH(wc)	((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \
 			    SC_MASK)
 #define	SC_LOOKUP(wc)	&sleepq_chains[SC_HASH(wc)]
 /* Number of blocked-thread lists (and counters) in each sleep queue. */
 #define NR_SLEEPQS      2
 /*
  * There are two different lists of sleep queues.  Both lists are connected
  * via the sq_hash entries.  The first list is the sleep queue chain list
  * that a sleep queue is on when it is attached to a wait channel.  The
  * second list is the free list hung off of a sleep queue that is attached
  * to a wait channel.
  *
  * Each sleep queue also contains the wait channel it is attached to, the
  * list of threads blocked on that wait channel, flags specific to the
  * wait channel, and the lock used to synchronize with a wait channel.
  * The flags are used to catch mismatches between the various consumers
  * of the sleep queue API (e.g. sleep/wakeup and condition variables).
  * The lock pointer is only used when invariants are enabled for various
  * debugging checks.
  *
  * Locking key:
  *  c - sleep queue chain lock
  */
 struct sleepqueue {
 	TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS];	/* (c) Blocked threads. */
 	u_int sq_blockedcnt[NR_SLEEPQS];	/* (c) N. of blocked threads. */
 	LIST_ENTRY(sleepqueue) sq_hash;		/* (c) Chain and free list. */
 	LIST_HEAD(, sleepqueue) sq_free;	/* (c) Free queues. */
 	void	*sq_wchan;			/* (c) Wait channel. */
 	int	sq_type;			/* (c) Queue type. */
 #ifdef INVARIANTS
 	struct lock_object *sq_lock;		/* (c) Associated lock. */
 #endif
 };
 
 /*
  * One bucket ("chain") of the sleep queue hash table: the list of sleep
  * queues whose wait channels hash to this bucket plus the spin lock that
  * protects them.  With SLEEPQUEUE_PROFILING the current and maximum
  * length of the list are tracked as well.
  */
 struct sleepqueue_chain {
 	LIST_HEAD(, sleepqueue) sc_queues;	/* List of sleep queues. */
 	struct mtx sc_lock;			/* Spin lock for this chain. */
 #ifdef SLEEPQUEUE_PROFILING
 	u_int	sc_depth;			/* Length of sc_queues. */
 	u_int	sc_max_depth;			/* Max length of sc_queues. */
 #endif
 };
 
 #ifdef SLEEPQUEUE_PROFILING
 /* Largest sc_max_depth seen on any chain; updated in sleepq_add(). */
 u_int sleepq_max_depth;
 static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD, 0, "sleepq profiling");
 static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD, 0,
     "sleepq chain stats");
 SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth,
     0, "maxmimum depth achieved of a single chain");
 
 static void	sleepq_profile(const char *wmesg);
 /* Non-zero while per-wmesg sleep profiling is switched on. */
 static int	prof_enabled;
 #endif
 /* Hash table of chains, indexed by SC_HASH() of the wait channel. */
 static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE];
 /* UMA zone from which struct sleepqueue items are allocated. */
 static uma_zone_t sleepq_zone;
 
 /*
  * Prototypes for non-exported routines.
  */
 static int	sleepq_catch_signals(void *wchan, int pri);
 static int	sleepq_check_signals(void);
 static int	sleepq_check_timeout(void);
 #ifdef INVARIANTS
 static void	sleepq_dtor(void *mem, int size, void *arg);
 #endif
 static int	sleepq_init(void *mem, int size, int flags);
 static int	sleepq_resume_thread(struct sleepqueue *sq, struct thread *td,
 		    int pri);
 static void	sleepq_switch(void *wchan, int pri);
 static void	sleepq_timeout(void *arg);
 
 SDT_PROBE_DECLARE(sched, , , sleep);
 SDT_PROBE_DECLARE(sched, , , wakeup);
 
 /*
  * Early initialization of sleep queues that is called from the sleepinit()
  * SYSINIT.  Sets up the list head and spin mutex of every hash chain,
  * creates the UMA zone that backs struct sleepqueue, and allocates the
  * sleep queue for thread0.
  */
 void
 init_sleepqueues(void)
 {
 #ifdef SLEEPQUEUE_PROFILING
 	struct sysctl_oid *chain_oid;
 	char chain_name[10];
 #endif
 	int i;
 
 	for (i = 0; i < SC_TABLESIZE; i++) {
 		LIST_INIT(&sleepq_chains[i].sc_queues);
 		mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL,
 		    MTX_SPIN | MTX_RECURSE);
 #ifdef SLEEPQUEUE_PROFILING
 		/*
 		 * Publish this chain's current and maximum depth under a
 		 * sysctl node named after the chain's decimal index.
 		 */
 		snprintf(chain_name, sizeof(chain_name), "%d", i);
 		chain_oid = SYSCTL_ADD_NODE(NULL, 
 		    SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO,
 		    chain_name, CTLFLAG_RD, NULL, "sleepq chain stats");
 		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
 		    "depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL);
 		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
 		    "max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0,
 		    NULL);
 #endif
 	}
 	/* The sleepq_dtor sanity checker is only installed under INVARIANTS. */
 	sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue),
 #ifdef INVARIANTS
 	    NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
 #else
 	    NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
 #endif
 	
 	thread0.td_sleepqueue = sleepq_alloc();
 }
 
 /*
  * Allocate a sleep queue for a new thread from the sleep queue UMA zone.
  * The request is made with M_WAITOK.
  */
 struct sleepqueue *
 sleepq_alloc(void)
 {
 	struct sleepqueue *sq;
 
 	sq = uma_zalloc(sleepq_zone, M_WAITOK);
 	return (sq);
 }
 
 /*
  * Free a sleep queue when a thread is destroyed.  The queue is handed
  * back to the sleepq_zone UMA zone.
  */
 void
 sleepq_free(struct sleepqueue *sq)
 {
 
 	uma_zfree(sleepq_zone, sq);
 }
 
 /*
  * Acquire the spin lock of the hash chain that the given wait channel
  * maps to.
  */
 void
 sleepq_lock(void *wchan)
 {
 	struct sleepqueue_chain *chain;
 
 	chain = SC_LOOKUP(wchan);
 	mtx_lock_spin(&chain->sc_lock);
 }
 
 /*
  * Look up the sleep queue associated with a given wait channel in the hash
  * table locking the associated sleep queue chain.  If no queue is found in
  * the table, NULL is returned.
  */
 struct sleepqueue *
 sleepq_lookup(void *wchan)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
 		if (sq->sq_wchan == wchan)
 			return (sq);
 	return (NULL);
 }
 
 /*
  * Drop the spin lock of the hash chain that the given wait channel maps
  * to.
  */
 void
 sleepq_release(void *wchan)
 {
 	struct sleepqueue_chain *chain;
 
 	chain = SC_LOOKUP(wchan);
 	mtx_unlock_spin(&chain->sc_lock);
 }
 
 /*
  * Places the current thread on the sleep queue for the specified wait
  * channel.  If INVARIANTS is enabled, then it associates the passed in
  * lock with the sleepq to make sure it is held when that sleep queue is
  * woken up.
  *
  * The caller must hold the chain lock for 'wchan'.  'wmesg' is stored in
  * the thread as td_wmesg, 'flags' supplies the queue type (its SLEEPQ_TYPE
  * bits) and optionally SLEEPQ_INTERRUPTIBLE, and 'queue' selects which of
  * the NR_SLEEPQS blocked lists the thread is appended to.
  */
 void
 sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
     int queue)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 
 	td = curthread;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	MPASS(td->td_sleepqueue != NULL);
 	MPASS(wchan != NULL);
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 
 	/* If this thread is not allowed to sleep, die a horrible death. */
 	KASSERT(td->td_no_sleeping == 0,
 	    ("%s: td %p to sleep on wchan %p with sleeping prohibited",
 	    __func__, td, wchan));
 
 	/* Look up the sleep queue associated with the wait channel 'wchan'. */
 	sq = sleepq_lookup(wchan);
 
 	/*
 	 * If the wait channel does not already have a sleep queue, use
 	 * this thread's sleep queue.  Otherwise, insert the current thread
 	 * into the sleep queue already in use by this wait channel.
 	 */
 	if (sq == NULL) {
 #ifdef INVARIANTS
 		int i;
 
 		/* The queue being lent must be completely idle. */
 		sq = td->td_sleepqueue;
 		for (i = 0; i < NR_SLEEPQS; i++) {
 			KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]),
 			    ("thread's sleep queue %d is not empty", i));
 			KASSERT(sq->sq_blockedcnt[i] == 0,
 			    ("thread's sleep queue %d count mismatches", i));
 		}
 		KASSERT(LIST_EMPTY(&sq->sq_free),
 		    ("thread's sleep queue has a non-empty free list"));
 		KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
 		sq->sq_lock = lock;
 #endif
 #ifdef SLEEPQUEUE_PROFILING
 		/* Track the chain depth and the per-chain and global maxima. */
 		sc->sc_depth++;
 		if (sc->sc_depth > sc->sc_max_depth) {
 			sc->sc_max_depth = sc->sc_depth;
 			if (sc->sc_max_depth > sleepq_max_depth)
 				sleepq_max_depth = sc->sc_max_depth;
 		}
 #endif
 		/* Attach this thread's queue to the chain for 'wchan'. */
 		sq = td->td_sleepqueue;
 		LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
 		sq->sq_wchan = wchan;
 		sq->sq_type = flags & SLEEPQ_TYPE;
 	} else {
 		MPASS(wchan == sq->sq_wchan);
 		MPASS(lock == sq->sq_lock);
 		MPASS((flags & SLEEPQ_TYPE) == sq->sq_type);
 		/* Park this thread's own queue on the active queue's free list. */
 		LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash);
 	}
 	thread_lock(td);
 	TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq);
 	sq->sq_blockedcnt[queue]++;
 	/* The thread no longer holds a queue of its own while it is blocked. */
 	td->td_sleepqueue = NULL;
 	td->td_sqqueue = queue;
 	td->td_wchan = wchan;
 	td->td_wmesg = wmesg;
 	if (flags & SLEEPQ_INTERRUPTIBLE) {
 		td->td_flags |= TDF_SINTR;
 		td->td_flags &= ~TDF_SLEEPABORT;
 	}
 	thread_unlock(td);
 }
 
 /*
  * Sets a timeout that will remove the current thread from the specified
  * sleep queue if the thread has not already been awakened by then.
  * 'sbt' and 'pr' are handed to callout_reset_sbt_on() unchanged; 'flags'
  * is passed on with C_DIRECT_EXEC added.  The caller must hold the chain
  * lock for 'wchan' and the thread must already be on the sleep queue.
  */
 void
 sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
     int flags)
 {
 	struct sleepqueue_chain *sc;
 	struct thread *td;
 
 	td = curthread;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	MPASS(TD_ON_SLEEPQ(td));
 	MPASS(td->td_sleepqueue == NULL);
 	MPASS(wchan != NULL);
 	/* Arm the per-thread callout on this CPU; it runs sleepq_timeout(td). */
 	callout_reset_sbt_on(&td->td_slpcallout, sbt, pr,
 	    sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC);
 }
 
 /*
  * Report how many threads are blocked on sub-queue 'queue' of the sleep
  * queue attached to 'wchan'; 0 if the channel has no sleep queue.
  */
 u_int
 sleepq_sleepcnt(void *wchan, int queue)
 {
 	struct sleepqueue *sq;
 
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 	sq = sleepq_lookup(wchan);
 	return (sq != NULL ? sq->sq_blockedcnt[queue] : 0);
 }
 
 /*
  * Marks the pending sleep of the current thread as interruptible and
  * makes an initial check for pending signals before putting a thread
  * to sleep. Enters and exits with the thread lock held.  Thread lock
  * may have transitioned from the sleepq lock to a run lock.
  *
  * Returns 0 if the thread went through sleepq_switch(), otherwise EINTR
  * or ERESTART after taking the thread back off the sleep queue.  The
  * chain lock for 'wchan' must be held on entry and is released on every
  * path.
  */
 static int
 sleepq_catch_signals(void *wchan, int pri)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 	struct proc *p;
 	struct sigacts *ps;
 	int sig, ret;
 
 	td = curthread;
 	p = curproc;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	MPASS(wchan != NULL);
 	/* A pending TDP_WAKEUP is consumed and reported as EINTR. */
 	if ((td->td_pflags & TDP_WAKEUP) != 0) {
 		td->td_pflags &= ~TDP_WAKEUP;
 		ret = EINTR;
 		thread_lock(td);
 		goto out;
 	}
 
 	/*
 	 * See if there are any pending signals for this thread.  If not
 	 * we can switch immediately.  Otherwise do the signal processing
 	 * directly.
 	 */
 	thread_lock(td);
 	if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) == 0) {
 		sleepq_switch(wchan, pri);
 		return (0);
 	}
 	/* Drop the thread and chain locks before taking the proc lock. */
 	thread_unlock(td);
 	mtx_unlock_spin(&sc->sc_lock);
 	CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)",
 		(void *)td, (long)p->p_pid, td->td_name);
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	sig = cursig(td);
 	if (sig == 0) {
 		/* No signal: the outcome is whatever thread_suspend_check() says. */
 		mtx_unlock(&ps->ps_mtx);
 		ret = thread_suspend_check(1);
 		MPASS(ret == 0 || ret == EINTR || ret == ERESTART);
 	} else {
 		/* Signals in ps_sigintr yield EINTR; all others ERESTART. */
 		if (SIGISMEMBER(ps->ps_sigintr, sig))
 			ret = EINTR;
 		else
 			ret = ERESTART;
 		mtx_unlock(&ps->ps_mtx);
 	}
 	/*
 	 * Lock the per-process spinlock prior to dropping the PROC_LOCK
 	 * to avoid a signal delivery race.  PROC_LOCK, PROC_SLOCK, and
 	 * thread_lock() are currently held in tdsendsignal().
 	 */
 	PROC_SLOCK(p);
 	mtx_lock_spin(&sc->sc_lock);
 	PROC_UNLOCK(p);
 	thread_lock(td);
 	PROC_SUNLOCK(p);
 	if (ret == 0) {
 		sleepq_switch(wchan, pri);
 		return (0);
 	}
 out:
 	/*
 	 * There were pending signals and this thread is still
 	 * on the sleep queue, remove it from the sleep queue.
 	 */
 	if (TD_ON_SLEEPQ(td)) {
 		sq = sleepq_lookup(wchan);
 		if (sleepq_resume_thread(sq, td, 0)) {
 #ifdef INVARIANTS
 			/*
 			 * This thread hasn't gone to sleep yet, so it
 			 * should not be swapped out.
 			 */
 			panic("not waking up swapper");
 #endif
 		}
 	}
 	mtx_unlock_spin(&sc->sc_lock);
 	MPASS(td->td_lock != &sc->sc_lock);
 	return (ret);
 }
 
 /*
  * Switches to another thread if we are still asleep on a sleep queue.
  * Returns with thread lock.
  *
  * Both the chain lock for 'wchan' and the thread lock must be held on
  * entry.  On the two early-return paths the chain lock is released here;
  * on the sleeping path the thread's lock is switched to the chain lock
  * before mi_switch() is called.  'pri' is passed to sched_sleep().
  */
 static void
 sleepq_switch(void *wchan, int pri)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 
 	td = curthread;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/* 
 	 * If we have a sleep queue, then we've already been woken up, so
 	 * just return.
 	 */
 	if (td->td_sleepqueue != NULL) {
 		mtx_unlock_spin(&sc->sc_lock);
 		return;
 	}
 
 	/*
 	 * If TDF_TIMEOUT is set, then our sleep has been timed out
 	 * already but we are still on the sleep queue, so dequeue the
 	 * thread and return.
 	 */
 	if (td->td_flags & TDF_TIMEOUT) {
 		MPASS(TD_ON_SLEEPQ(td));
 		sq = sleepq_lookup(wchan);
 		if (sleepq_resume_thread(sq, td, 0)) {
 #ifdef INVARIANTS
 			/*
 			 * This thread hasn't gone to sleep yet, so it
 			 * should not be swapped out.
 			 */
 			panic("not waking up swapper");
 #endif
 		}
 		mtx_unlock_spin(&sc->sc_lock);
 		return;		
 	}
 #ifdef SLEEPQUEUE_PROFILING
 	if (prof_enabled)
 		sleepq_profile(td->td_wmesg);
 #endif
 	MPASS(td->td_sleepqueue == NULL);
 	sched_sleep(td, pri);
 	/*
 	 * From here on the thread is locked by the chain lock;
 	 * sleepq_timeout() asserts exactly that for sleeping threads.
 	 */
 	thread_lock_set(td, &sc->sc_lock);
 	SDT_PROBE0(sched, , , sleep);
 	TD_SET_SLEEPING(td);
 	mi_switch(SW_VOL | SWT_SLEEPQ, NULL);
 	KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
 	CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
 }
 
 /*
  * Check to see if we timed out.  Must be called with the thread lock
  * held.  Returns EWOULDBLOCK (and clears TDF_TIMEOUT) if the sleep timed
  * out, otherwise 0 after making sure the timeout callout can no longer
  * affect this thread.
  */
 static int
 sleepq_check_timeout(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
 	 * If TDF_TIMEOUT is set, we timed out.
 	 */
 	if (td->td_flags & TDF_TIMEOUT) {
 		td->td_flags &= ~TDF_TIMEOUT;
 		return (EWOULDBLOCK);
 	}
 
 	/*
 	 * If TDF_TIMOFAIL is set, the timeout ran after we had
 	 * already been woken up.
 	 */
 	if (td->td_flags & TDF_TIMOFAIL)
 		td->td_flags &= ~TDF_TIMOFAIL;
 
 	/*
 	 * If callout_stop() fails, then the timeout is running on
 	 * another CPU, so synchronize with it to avoid having it
 	 * accidentally wake up a subsequent sleep.
 	 */
 	else if (callout_stop(&td->td_slpcallout) == 0) {
 		/*
 		 * Yield with TDF_TIMEOUT set; sleepq_timeout() clears the
 		 * flag and makes this thread runnable again.
 		 */
 		td->td_flags |= TDF_TIMEOUT;
 		TD_SET_SLEEPING(td);
 		mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL);
 	}
 	return (0);
 }
 
 /*
  * Post-sleep signal check.  Clears the interruptible-sleep flag and, if
  * the sleep was aborted, consumes TDF_SLEEPABORT and returns the saved
  * td_intrval; otherwise returns 0.  The thread lock must be held.
  */
 static int
 sleepq_check_signals(void)
 {
 	struct thread *self;
 
 	self = curthread;
 	THREAD_LOCK_ASSERT(self, MA_OWNED);
 
 	/* Any interruptible sleep is over by now. */
 	if ((self->td_flags & TDF_SINTR) != 0)
 		self->td_flags &= ~TDF_SINTR;
 
 	if ((self->td_flags & TDF_SLEEPABORT) == 0)
 		return (0);
 
 	self->td_flags &= ~TDF_SLEEPABORT;
 	return (self->td_intrval);
 }
 
 /*
  * Block the current thread until it is awakened from its sleep queue.
  * The sleep must not be interruptible (TDF_SINTR clear).  The thread lock
  * is taken here, sleepq_switch() does the actual blocking, and the thread
  * lock is dropped again before returning.
  */
 void
 sleepq_wait(void *wchan, int pri)
 {
 	struct thread *td;
 
 	td = curthread;
 	MPASS(!(td->td_flags & TDF_SINTR));
 	thread_lock(td);
 	sleepq_switch(wchan, pri);
 	thread_unlock(td);
 }
 
 /*
  * Interruptible sleep without a timeout.  A non-zero result from the
  * pre-sleep signal check wins; otherwise the result of the post-sleep
  * signal check is returned.
  */
 int
 sleepq_wait_sig(void *wchan, int pri)
 {
 	int rcatch, rval;
 
 	rcatch = sleepq_catch_signals(wchan, pri);
 	rval = sleepq_check_signals();
 	thread_unlock(curthread);
 	return (rcatch != 0 ? rcatch : rval);
 }
 
 /*
  * Block the current thread until it is awakened from its sleep queue
  * or it times out while waiting.  The sleep must not be interruptible.
  * Returns the result of sleepq_check_timeout(): EWOULDBLOCK on timeout,
  * 0 otherwise.
  */
 int
 sleepq_timedwait(void *wchan, int pri)
 {
 	struct thread *td;
 	int rval;
 
 	td = curthread;
 	MPASS(!(td->td_flags & TDF_SINTR));
 	thread_lock(td);
 	sleepq_switch(wchan, pri);
 	/* Still under the thread lock, as sleepq_check_timeout() requires. */
 	rval = sleepq_check_timeout();
 	thread_unlock(td);
 
 	return (rval);
 }
 
 /*
  * Interruptible sleep with a timeout.  The result is chosen in priority
  * order: the pre-sleep signal check, then the post-sleep signal check,
  * then the timeout check.
  */
 int
 sleepq_timedwait_sig(void *wchan, int pri)
 {
 	int rcatch, rvalt, rvals;
 
 	rcatch = sleepq_catch_signals(wchan, pri);
 	rvalt = sleepq_check_timeout();
 	rvals = sleepq_check_signals();
 	thread_unlock(curthread);
 
 	if (rcatch != 0)
 		return (rcatch);
 	return (rvals != 0 ? rvals : rvalt);
 }
 
 /*
  * Report the type of the sleep queue attached to a wait channel, or -1
  * if the channel currently has none.  Takes and drops the chain lock
  * itself.
  */
 int
 sleepq_type(void *wchan)
 {
 	struct sleepqueue *sq;
 	int type;
 
 	MPASS(wchan != NULL);
 
 	sleepq_lock(wchan);
 	sq = sleepq_lookup(wchan);
 	type = (sq != NULL) ? sq->sq_type : -1;
 	sleepq_release(wchan);
 	return (type);
 }
 
 /*
  * Removes a thread from a sleep queue and makes it
  * runnable.
  *
  * The caller must hold both the thread lock and the chain lock of the
  * queue's wait channel.  'pri', when non-zero, is a priority to raise a
  * timeshare thread to.  Returns the value of setrunnable() when the
  * thread was actually sleeping and 0 otherwise; callers treat a non-zero
  * result as a request to kick proc0 (the swapper).
  */
 static int
 sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri)
 {
 	struct sleepqueue_chain *sc;
 
 	MPASS(td != NULL);
 	MPASS(sq->sq_wchan != NULL);
 	MPASS(td->td_wchan == sq->sq_wchan);
 	MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	sc = SC_LOOKUP(sq->sq_wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 
 	SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
 
 	/* Remove the thread from the queue. */
 	sq->sq_blockedcnt[td->td_sqqueue]--;
 	TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);
 
 	/*
 	 * Get a sleep queue for this thread.  If this is the last waiter,
 	 * use the queue itself and take it out of the chain, otherwise,
 	 * remove a queue from the free list.
 	 */
 	if (LIST_EMPTY(&sq->sq_free)) {
 		td->td_sleepqueue = sq;
 #ifdef INVARIANTS
 		sq->sq_wchan = NULL;
 #endif
 #ifdef SLEEPQUEUE_PROFILING
 		sc->sc_depth--;
 #endif
 	} else
 		td->td_sleepqueue = LIST_FIRST(&sq->sq_free);
 	/*
 	 * sq_hash links a queue either into the chain or into a free list,
 	 * so this one removal covers both branches above.
 	 */
 	LIST_REMOVE(td->td_sleepqueue, sq_hash);
 
 	td->td_wmesg = NULL;
 	td->td_wchan = NULL;
 	td->td_flags &= ~TDF_SINTR;
 
 	CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, td->td_name);
 
 	/* Adjust priority if requested. */
 	MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX));
 	if (pri != 0 && td->td_priority > pri &&
 	    PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
 		sched_prio(td, pri);
 
 	/*
 	 * Note that thread td might not be sleeping if it is running
 	 * sleepq_catch_signals() on another CPU or is blocked on its
 	 * proc lock to check signals.  There's no need to mark the
 	 * thread runnable in that case.
 	 */
 	if (TD_IS_SLEEPING(td)) {
 		TD_CLR_SLEEPING(td);
 		return (setrunnable(td));
 	}
 	return (0);
 }
 
 #ifdef INVARIANTS
 /*
  * UMA zone item deallocator.  Only built under INVARIANTS; it performs no
  * teardown and merely asserts that every blocked list of the queue being
  * freed is empty and its counter is zero.  'size' and 'arg' are unused.
  */
 static void
 sleepq_dtor(void *mem, int size, void *arg)
 {
 	struct sleepqueue *sq;
 	int i;
 
 	sq = mem;
 	for (i = 0; i < NR_SLEEPQS; i++) {
 		MPASS(TAILQ_EMPTY(&sq->sq_blocked[i]));
 		MPASS(sq->sq_blockedcnt[i] == 0);
 	}
 }
 #endif
 
 /*
  * UMA zone item initializer: zero the item, then set up the heads of its
  * blocked lists and of its free list.  Always returns 0; 'flags' is
  * unused.
  */
 static int
 sleepq_init(void *mem, int size, int flags)
 {
 	struct sleepqueue *sq = mem;
 	int q;
 
 	bzero(sq, size);
 	LIST_INIT(&sq->sq_free);
 	for (q = 0; q < NR_SLEEPQS; q++) {
 		TAILQ_INIT(&sq->sq_blocked[q]);
 		sq->sq_blockedcnt[q] = 0;
 	}
 	return (0);
 }
 
 /*
  * Wake the best candidate sleeping on sub-queue 'queue' of a wait
  * channel.  Returns 0 if the channel has no sleep queue, otherwise the
  * result of sleepq_resume_thread() for the chosen thread.
  */
 int
 sleepq_signal(void *wchan, int flags, int pri, int queue)
 {
 	struct sleepqueue *sq;
 	struct thread *td, *besttd;
 	int wakeup_swapper;
 
 	CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags);
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 	sq = sleepq_lookup(wchan);
 	if (sq == NULL)
 		return (0);
 	KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
 	    ("%s: mismatch between sleep/wakeup and cv_*", __func__));
 
 	/*
 	 * Pick the thread with the numerically lowest td_priority.  Only a
 	 * strictly lower value replaces the current pick, so among equals
 	 * the one nearest the head (the longest sleeper, since threads are
 	 * appended at the tail) is kept.
 	 */
 	besttd = NULL;
 	TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) {
 		if (besttd != NULL && td->td_priority >= besttd->td_priority)
 			continue;
 		besttd = td;
 	}
 	MPASS(besttd != NULL);
 
 	thread_lock(besttd);
 	wakeup_swapper = sleepq_resume_thread(sq, besttd, pri);
 	thread_unlock(besttd);
 	return (wakeup_swapper);
 }
 
 /*
  * Wake every thread sleeping on sub-queue 'queue' of a wait channel.
  * Returns 1 if any of the resumed threads reported that the swapper
  * needs a kick, 0 otherwise (including when the channel has no queue).
  */
 int
 sleepq_broadcast(void *wchan, int flags, int pri, int queue)
 {
 	struct sleepqueue *sq;
 	struct thread *td, *next;
 	int wakeup_swapper;
 
 	CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags);
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 	sq = sleepq_lookup(wchan);
 	if (sq == NULL)
 		return (0);
 	KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
 	    ("%s: mismatch between sleep/wakeup and cv_*", __func__));
 
 	/*
 	 * The _SAFE iterator is needed because resuming a thread unlinks
 	 * it from the list being walked.
 	 */
 	wakeup_swapper = 0;
 	TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, next) {
 		thread_lock(td);
 		if (sleepq_resume_thread(sq, td, pri) != 0)
 			wakeup_swapper = 1;
 		thread_unlock(td);
 	}
 	return (wakeup_swapper);
 }
 
 /*
  * Time sleeping threads out.  When the timeout expires, the thread is
  * removed from the sleep queue and made runnable if it is still asleep.
  *
  * This is the callout handler armed by sleepq_set_timeout_sbt(); 'arg' is
  * the thread whose sleep is being timed out.
  */
 static void
 sleepq_timeout(void *arg)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 	void *wchan;
 	int wakeup_swapper;
 
 	td = arg;
 	wakeup_swapper = 0;
 	CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
 
 	/*
 	 * First, see if the thread is asleep and get the wait channel if
 	 * it is.
 	 */
 	thread_lock(td);
 	if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
 		wchan = td->td_wchan;
 		sc = SC_LOOKUP(wchan);
 		/* A sleeping thread's lock is its chain lock, so it is held. */
 		THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
 		sq = sleepq_lookup(wchan);
 		MPASS(sq != NULL);
 		td->td_flags |= TDF_TIMEOUT;
 		wakeup_swapper = sleepq_resume_thread(sq, td, 0);
 		thread_unlock(td);
 		if (wakeup_swapper)
 			kick_proc0();
 		return;
 	}
 
 	/*
 	 * If the thread is on the SLEEPQ but isn't sleeping yet, it
 	 * can either be on another CPU in between sleepq_add() and
 	 * one of the sleepq_*wait*() routines or it can be in
 	 * sleepq_catch_signals().
 	 */
 	if (TD_ON_SLEEPQ(td)) {
 		td->td_flags |= TDF_TIMEOUT;
 		thread_unlock(td);
 		return;
 	}
 
 	/*
 	 * Now check for the edge cases.  First, if TDF_TIMEOUT is set,
 	 * then the other thread has already yielded to us, so clear
 	 * the flag and resume it.  If TDF_TIMEOUT is not set, then we
 	 * know that the other thread is not on a sleep queue, but it
 	 * hasn't resumed execution yet.  In that case, set TDF_TIMOFAIL
 	 * to let it know that the timeout has already run and doesn't
 	 * need to be canceled.
 	 */
 	if (td->td_flags & TDF_TIMEOUT) {
 		MPASS(TD_IS_SLEEPING(td));
 		td->td_flags &= ~TDF_TIMEOUT;
 		TD_CLR_SLEEPING(td);
 		wakeup_swapper = setrunnable(td);
 	} else
 		td->td_flags |= TDF_TIMOFAIL;
 	thread_unlock(td);
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Resumes a specific thread from the sleep queue associated with a specific
  * wait channel if it is on that queue.  Takes and releases the chain lock
  * for 'wchan' itself and kicks proc0 when the resume asks for it.
  */
 void
 sleepq_remove(struct thread *td, void *wchan)
 {
 	struct sleepqueue *sq;
 	int wakeup_swapper;
 
 	/*
 	 * Look up the sleep queue for this wait channel, then re-check
 	 * that the thread is asleep on that channel, if it is not, then
 	 * bail.
 	 */
 	MPASS(wchan != NULL);
 	sleepq_lock(wchan);
 	sq = sleepq_lookup(wchan);
 	/*
 	 * We can not lock the thread here as it may be sleeping on a
 	 * different sleepq.  However, holding the sleepq lock for this
 	 * wchan can guarantee that we do not miss a wakeup for this
 	 * channel.  The asserts below will catch any false positives.
 	 */
 	if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) {
 		sleepq_release(wchan);
 		return;
 	}
 	/* Thread is asleep on sleep queue sq, so wake it up. */
 	thread_lock(td);
 	MPASS(sq != NULL);
 	MPASS(td->td_wchan == wchan);
 	wakeup_swapper = sleepq_resume_thread(sq, td, 0);
 	thread_unlock(td);
 	sleepq_release(wchan);
 	/* Kick proc0 only after both locks have been dropped. */
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Abort a thread as if an interrupt had occurred.  Only abort
  * interruptible waits (unfortunately it isn't safe to abort others).
  *
  * The caller must hold the thread lock, and the thread must be on a sleep
  * queue in an interruptible sleep.  'intrval' (EINTR or ERESTART) is
  * saved in td_intrval for sleepq_check_signals() to return.  Returns the
  * result of sleepq_resume_thread() when the thread was actually asleep
  * and 0 otherwise.
  */
 int
 sleepq_abort(struct thread *td, int intrval)
 {
 	struct sleepqueue *sq;
 	void *wchan;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	MPASS(TD_ON_SLEEPQ(td));
 	MPASS(td->td_flags & TDF_SINTR);
 	MPASS(intrval == EINTR || intrval == ERESTART);
 
 	/*
 	 * If the TDF_TIMEOUT flag is set, just leave. A
 	 * timeout is scheduled anyhow.
 	 */
 	if (td->td_flags & TDF_TIMEOUT)
 		return (0);
 
 	CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
 	td->td_intrval = intrval;
 	td->td_flags |= TDF_SLEEPABORT;
 	/*
 	 * If the thread has not slept yet it will find the signal in
 	 * sleepq_catch_signals() and call sleepq_resume_thread.  Otherwise
 	 * we have to do it here.
 	 */
 	if (!TD_IS_SLEEPING(td))
 		return (0);
 	wchan = td->td_wchan;
 	MPASS(wchan != NULL);
 	sq = sleepq_lookup(wchan);
 	MPASS(sq != NULL);
 
 	/* Thread is asleep on sleep queue sq, so wake it up. */
 	return (sleepq_resume_thread(sq, td, 0));
 }
 
 #ifdef SLEEPQUEUE_PROFILING
 /* Size of the static pool of profiling records (sleepq_profent[]). */
 #define	SLEEPQ_PROF_LOCATIONS	1024
 /* Length handed to sbuf_new_for_sysctl() when dumping the statistics. */
 #define	SLEEPQ_SBUFSIZE		512
 /*
  * One profiling record: a wait message pointer and the number of times
  * sleepq_profile() has been called with it.  Records are matched by
  * pointer value, not by string contents.
  */
 struct sleepq_prof {
 	LIST_ENTRY(sleepq_prof) sp_link;
 	const char	*sp_wmesg;
 	long		sp_count;
 };
 
 LIST_HEAD(sqphead, sleepq_prof);
 
 /* Records not yet assigned to a wait message. */
 struct sqphead sleepq_prof_free;
 /* In-use records, bucketed by SC_HASH() of the wait message pointer. */
 struct sqphead sleepq_hash[SC_TABLESIZE];
 static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS];
 /* Spin lock taken around updates to the lists above and prof_enabled. */
 static struct mtx sleepq_prof_lock;
 MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN);
 
 static void
 sleepq_profile(const char *wmesg)
 {
 	struct sleepq_prof *sp;
 
 	mtx_lock_spin(&sleepq_prof_lock);
 	if (prof_enabled == 0)
 		goto unlock;
 	LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link)
 		if (sp->sp_wmesg == wmesg)
 			goto done;
 	sp = LIST_FIRST(&sleepq_prof_free);
 	if (sp == NULL)
 		goto unlock;
 	sp->sp_wmesg = wmesg;
 	LIST_REMOVE(sp, sp_link);
 	LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link);
 done:
 	sp->sp_count++;
 unlock:
 	mtx_unlock_spin(&sleepq_prof_lock);
 	return;
 }
 
 static void
 sleepq_prof_reset(void)
 {
 	struct sleepq_prof *sp;
 	int enabled;
 	int i;
 
 	mtx_lock_spin(&sleepq_prof_lock);
 	enabled = prof_enabled;
 	prof_enabled = 0;
 	for (i = 0; i < SC_TABLESIZE; i++)
 		LIST_INIT(&sleepq_hash[i]);
 	LIST_INIT(&sleepq_prof_free);
 	for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) {
 		sp = &sleepq_profent[i];
 		sp->sp_wmesg = NULL;
 		sp->sp_count = 0;
 		LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link);
 	}
 	prof_enabled = enabled;
 	mtx_unlock_spin(&sleepq_prof_lock);
 }
 
 static int
 enable_sleepq_prof(SYSCTL_HANDLER_ARGS)
 {
 	int error, v;
 
 	v = prof_enabled;
 	error = sysctl_handle_int(oidp, &v, v, req);
 	if (error)
 		return (error);
 	if (req->newptr == NULL)
 		return (error);
 	if (v == prof_enabled)
 		return (0);
 	if (v == 1)
 		sleepq_prof_reset();
 	mtx_lock_spin(&sleepq_prof_lock);
 	prof_enabled = !!v;
 	mtx_unlock_spin(&sleepq_prof_lock);
 
 	return (0);
 }
 
 static int
 reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
 {
 	int error, v;
 
 	v = 0;
 	error = sysctl_handle_int(oidp, &v, 0, req);
 	if (error)
 		return (error);
 	if (req->newptr == NULL)
 		return (error);
 	if (v == 0)
 		return (0);
 	sleepq_prof_reset();
 
 	return (0);
 }
 
 /*
  * Sysctl handler that formats the profiling table as text: a header line
  * followed by one "wmesg<TAB>count" line per in-use record.  Profiling is
  * switched off while the table is walked and restored afterwards.
  * Returns the error from sysctl_wire_old_buffer() or sbuf_finish().
  */
 static int
 dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct sleepq_prof *sp;
 	struct sbuf *sb;
 	int enabled;
 	int error;
 	int i;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req);
 	sbuf_printf(sb, "\nwmesg\tcount\n");
 	/*
 	 * NOTE(review): prof_enabled is sampled here before the lock is
 	 * taken, and the walk below runs with the lock dropped, so a
 	 * concurrent enable or reset may interleave -- confirm whether
 	 * that is acceptable.
 	 */
 	enabled = prof_enabled;
 	mtx_lock_spin(&sleepq_prof_lock);
 	prof_enabled = 0;
 	mtx_unlock_spin(&sleepq_prof_lock);
 	for (i = 0; i < SC_TABLESIZE; i++) {
 		LIST_FOREACH(sp, &sleepq_hash[i], sp_link) {
 			sbuf_printf(sb, "%s\t%ld\n",
 			    sp->sp_wmesg, sp->sp_count);
 		}
 	}
 	mtx_lock_spin(&sleepq_prof_lock);
 	prof_enabled = enabled;
 	mtx_unlock_spin(&sleepq_prof_lock);
 
 	error = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (error);
 }
 
 /*
  * Profiling controls under the _debug_sleepq node: "stats" (read-only
  * text dump), "reset" (write non-zero to clear the statistics) and
  * "enable" (profiling on/off switch).
  */
 SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics");
 SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, reset_sleepq_prof_stats, "I",
     "Reset sleepqueue profiling statistics");
 SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling");
 #endif
 
 #ifdef DDB
 /*
  * DDB "show sleepq <addr>" command.  <addr> may be either a wait channel
  * or the address of a struct sleepqueue that is currently on a chain.
  * Prints the queue's wait channel and type, its interlock (INVARIANTS
  * only) and, for each of the NR_SLEEPQS sub-queues, the blocked threads
  * followed by the recorded thread count.  Does nothing without an
  * address.
  */
 DB_SHOW_COMMAND(sleepq, db_show_sleepqueue)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 #ifdef INVARIANTS
 	struct lock_object *lock;
 #endif
 	struct thread *td;
 	void *wchan;
 	int i;
 
 	if (!have_addr)
 		return;
 
 	/*
 	 * First, see if there is an active sleep queue for the wait channel
 	 * indicated by the address.
 	 */
 	wchan = (void *)addr;
 	sc = SC_LOOKUP(wchan);
 	LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
 		if (sq->sq_wchan == wchan)
 			goto found;
 
 	/*
 	 * Second, see if there is an active sleep queue at the address
 	 * indicated.
 	 */
 	for (i = 0; i < SC_TABLESIZE; i++)
 		LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) {
 			if (sq == (struct sleepqueue *)addr)
 				goto found;
 		}
 
 	db_printf("Unable to locate a sleep queue via %p\n", (void *)addr);
 	return;
 found:
 	db_printf("Wait channel: %p\n", sq->sq_wchan);
 	db_printf("Queue type: %d\n", sq->sq_type);
 #ifdef INVARIANTS
 	if (sq->sq_lock) {
 		lock = sq->sq_lock;
 		db_printf("Associated Interlock: %p - (%s) %s\n", lock,
 		    LOCK_CLASS(lock)->lc_name, lock->lo_name);
 	}
 #endif
 	db_printf("Blocked threads:\n");
 	for (i = 0; i < NR_SLEEPQS; i++) {
 		db_printf("\nQueue[%d]:\n", i);
 		if (TAILQ_EMPTY(&sq->sq_blocked[i])) {
 			db_printf("\tempty\n");
 		} else {
 			/*
 			 * Walk sub-queue i itself.  Walking sub-queue 0 for
 			 * every i would list the wrong threads under
 			 * Queue[1].
 			 */
 			TAILQ_FOREACH(td, &sq->sq_blocked[i], td_slpq) {
 				db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td,
 				    td->td_tid, td->td_proc->p_pid,
 				    td->td_name);
 			}
 		}
 		db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]);
 	}
 }
 
 /* Alias 'show sleepqueue' to 'show sleepq'. */
 DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue);
 #endif
Index: head/sys/kern/subr_syscall.c
===================================================================
--- head/sys/kern/subr_syscall.c	(revision 258540)
+++ head/sys/kern/subr_syscall.c	(revision 258541)
@@ -1,235 +1,234 @@
 /*-
  * Copyright (C) 1994, David Greenman
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (C) 2010 Konstantin Belousov <kib@freebsd.org>
  *
  * This code is derived from software contributed to Berkeley by
  * the University of Utah, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
  */
 
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
-#include "opt_kdtrace.h"
 
 __FBSDID("$FreeBSD$");
 
 #include <sys/capability.h>
 #include <sys/ktr.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
 #endif
 #include <security/audit/audit.h>
 
 /*
  * Common entry half of system call handling.  Fetches the syscall number
  * and arguments through the process' sysentvec, runs the ktrace, KTR,
  * ptrace/STOPEVENT, capability-mode, DTrace and audit hooks that are
  * compiled in, calls the handler (sa->callp->sy_call) and finally hands the
  * result to sv_set_syscall_retval().
  *
  * Returns the first non-zero error produced by argument fetching, the
  * capability-mode check, syscall_thread_enter() or the handler; 0 otherwise.
  */
 static inline int
 syscallenter(struct thread *td, struct syscall_args *sa)
 {
 	struct proc *p;
 	int error, traced;
 
 	PCPU_INC(cnt.v_syscall);
 	p = td->td_proc;
 
 	td->td_pticks = 0;
 	/* Refresh the thread's credential if it differs from the process'. */
 	if (td->td_ucred != p->p_ucred)
 		cred_update_thread(td);
 	/*
 	 * Remember whether the process was traced on entry: TDB_SCE is set
 	 * here and cleared at retval only when 'traced' is non-zero.
 	 */
 	if (p->p_flag & P_TRACED) {
 		traced = 1;
 		PROC_LOCK(p);
 		td->td_dbgflags &= ~TDB_USERWR;
 		td->td_dbgflags |= TDB_SCE;
 		PROC_UNLOCK(p);
 	} else
 		traced = 0;
 	error = (p->p_sysent->sv_fetch_syscall_args)(td, sa);
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSCALL))
 		ktrsyscall(sa->code, sa->narg, sa->args);
 #endif
 	KTR_START4(KTR_SYSC, "syscall", syscallname(p, sa->code),
 	    (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "arg0:%p", sa->args[0],
 	    "arg1:%p", sa->args[1], "arg2:%p", sa->args[2]);
 
 	if (error == 0) {
 
 		STOPEVENT(p, S_SCE, sa->narg);
 		if (p->p_flag & P_TRACED && p->p_stops & S_PT_SCE) {
 			PROC_LOCK(p);
 			ptracestop((td), SIGTRAP);
 			PROC_UNLOCK(p);
 		}
 		if (td->td_dbgflags & TDB_USERWR) {
 			/*
 			 * Reread syscall number and arguments if
 			 * debugger modified registers or memory.
 			 */
 			error = (p->p_sysent->sv_fetch_syscall_args)(td, sa);
 #ifdef KTRACE
 			if (KTRPOINT(td, KTR_SYSCALL))
 				ktrsyscall(sa->code, sa->narg, sa->args);
 #endif
 			if (error != 0)
 				goto retval;
 		}
 
 #ifdef CAPABILITY_MODE
 		/*
 		 * In capability mode, we only allow access to system calls
 		 * flagged with SYF_CAPENABLED.
 		 */
 		if (IN_CAPABILITY_MODE(td) &&
 		    !(sa->callp->sy_flags & SYF_CAPENABLED)) {
 			error = ECAPMODE;
 			goto retval;
 		}
 #endif
 
 		/*
 		 * Paired with syscall_thread_exit() below; a failure here
 		 * skips the handler and the exit call.
 		 */
 		error = syscall_thread_enter(td, sa->callp);
 		if (error != 0)
 			goto retval;
 
 #ifdef KDTRACE_HOOKS
 		/*
 		 * If the systrace module has registered its probe
 		 * callback and if there is a probe active for the
 		 * syscall 'entry', process the probe.
 		 */
 		if (systrace_probe_func != NULL && sa->callp->sy_entry != 0)
 			(*systrace_probe_func)(sa->callp->sy_entry, sa->code,
 			    sa->callp, sa->args, 0);
 #endif
 
 		AUDIT_SYSCALL_ENTER(sa->code, td);
 		error = (sa->callp->sy_call)(td, sa->args);
 		AUDIT_SYSCALL_EXIT(error, td);
 
 		/* Save the latest error return value. */
 		if ((td->td_pflags & TDP_NERRNO) == 0)
 			td->td_errno = error;
 
 #ifdef KDTRACE_HOOKS
 		/*
 		 * If the systrace module has registered its probe
 		 * callback and if there is a probe active for the
 		 * syscall 'return', process the probe.
 		 */
 		if (systrace_probe_func != NULL && sa->callp->sy_return != 0)
 			(*systrace_probe_func)(sa->callp->sy_return, sa->code,
 			    sa->callp, NULL, (error) ? -1 : td->td_retval[0]);
 #endif
 		syscall_thread_exit(td, sa->callp);
 	}
  retval:
 	/* Reached on success and on every early failure above. */
 	KTR_STOP4(KTR_SYSC, "syscall", syscallname(p, sa->code),
 	    (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "error:%d", error,
 	    "retval0:%#lx", td->td_retval[0], "retval1:%#lx",
 	    td->td_retval[1]);
 	if (traced) {
 		PROC_LOCK(p);
 		td->td_dbgflags &= ~TDB_SCE;
 		PROC_UNLOCK(p);
 	}
 	(p->p_sysent->sv_set_syscall_retval)(td, error);
 	return (error);
 }
 
 /*
  * Common return half of system call handling.  Runs userret(), emits the
  * ktrace return record, stops for the debugger when required, and finally
  * performs the deferred vfork wait if TDP_RFPPWAIT is set on the thread.
  *
  * 'error' is the value reported to ktrace unless TDP_NERRNO is set, in which
  * case td->td_errno is reported instead.
  *
  * NOTE(review): 'sa' is annotated __unused although sa->code is referenced
  * below (ktrsysret under KTRACE, and STOPEVENT); presumably the annotation
  * covers configurations where those uses compile away - confirm.
  */
 static inline void
 syscallret(struct thread *td, int error, struct syscall_args *sa __unused)
 {
 	struct proc *p, *p2;
 	int traced;
 
 	p = td->td_proc;
 
 	/*
 	 * Handle reschedule and other end-of-syscall issues
 	 */
 	userret(td, td->td_frame);
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSRET)) {
 		ktrsysret(sa->code, (td->td_pflags & TDP_NERRNO) == 0 ?
 		    error : td->td_errno, td->td_retval[0]);
 	}
 #endif
 	/* TDP_NERRNO only applies to a single return; always clear it. */
 	td->td_pflags &= ~TDP_NERRNO;
 
 	/* Sample the traced state once; TDB_SCX is set only in that case. */
 	if (p->p_flag & P_TRACED) {
 		traced = 1;
 		PROC_LOCK(p);
 		td->td_dbgflags |= TDB_SCX;
 		PROC_UNLOCK(p);
 	} else
 		traced = 0;
 	/*
 	 * This works because errno is findable through the
 	 * register set.  If we ever support an emulation where this
 	 * is not the case, this code will need to be revisited.
 	 */
 	STOPEVENT(p, S_SCX, sa->code);
 	if (traced || (td->td_dbgflags & (TDB_EXEC | TDB_FORK)) != 0) {
 		PROC_LOCK(p);
 		/*
 		 * If tracing the execed process, trap to the debugger
 		 * so that breakpoints can be set before the program
 		 * executes.  If debugger requested tracing of syscall
 		 * returns, do it now too.
 		 */
 		if (traced &&
 		    ((td->td_dbgflags & (TDB_FORK | TDB_EXEC)) != 0 ||
 		    (p->p_stops & S_PT_SCX) != 0))
 			ptracestop(td, SIGTRAP);
 		/* The flags are cleared even when no stop was taken. */
 		td->td_dbgflags &= ~(TDB_SCX | TDB_EXEC | TDB_FORK);
 		PROC_UNLOCK(p);
 	}
 
 	if (td->td_pflags & TDP_RFPPWAIT) {
 		/*
 		 * Preserve synchronization semantics of vfork.  If
 		 * waiting for child to exec or exit, fork set
 		 * P_PPWAIT on child, and there we sleep on our proc
 		 * (in case of exit).
 		 *
 		 * Do it after the ptracestop() above is finished, to
 		 * not block our debugger until child execs or exits
 		 * to finish vfork wait.
 		 */
 		td->td_pflags &= ~TDP_RFPPWAIT;
 		p2 = td->td_rfppwait_p;
 		PROC_LOCK(p2);
 		/* Sleep on the child's p_pwait until P_PPWAIT is cleared. */
 		while (p2->p_flag & P_PPWAIT)
 			cv_wait(&p2->p_pwait, &p2->p_mtx);
 		PROC_UNLOCK(p2);
 	}
 }
Index: head/sys/kern/subr_trap.c
===================================================================
--- head/sys/kern/subr_trap.c	(revision 258540)
+++ head/sys/kern/subr_trap.c	(revision 258541)
@@ -1,303 +1,302 @@
 /*-
  * Copyright (C) 1994, David Greenman
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2007 The FreeBSD Foundation
  *
  * This code is derived from software contributed to Berkeley by
  * the University of Utah, and William Jolitz.
  *
  * Portions of this software were developed by A. Joseph Koshy under
  * sponsorship from the FreeBSD Foundation and Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
-#include "opt_kdtrace.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/capability.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/pmckern.h>
 #include <sys/proc.h>
 #include <sys/ktr.h>
 #include <sys/pioctl.h>
 #include <sys/ptrace.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vmmeter.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
 #endif
 #include <security/audit/audit.h>
 
 #include <machine/cpu.h>
 
 #ifdef VIMAGE
 #include <net/vnet.h>
 #endif
 
 #ifdef XEN
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #endif
 
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Define the code needed before returning to user mode, for trap and
  * syscall.
  *
  * td is the returning thread and frame its trap frame; frame is only used
  * to obtain the PC charged when the process is being profiled.  Besides the
  * GEOM, profiling and scheduler hooks, the function consists of sanity
  * assertions about state that must not leak into user mode and, under
  * RACCT, a sleep while the process is marked throttled.
  */
 void
 userret(struct thread *td, struct trapframe *frame)
 {
 	struct proc *p = td->td_proc;
 
 	CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid,
             td->td_name);
 	KASSERT((p->p_flag & P_WEXIT) == 0,
 	    ("Exiting process returns to usermode"));
 	/* The DIAGNOSTIC signal-flag check below is compiled out by "#if 0". */
 #if 0
 #ifdef DIAGNOSTIC
 	/* Check that we called signotify() enough. */
 	PROC_LOCK(p);
 	thread_lock(td);
 	if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 ||
 	    (td->td_flags & TDF_ASTPENDING) == 0))
 		printf("failed to set signal flags properly for ast()\n");
 	thread_unlock(td);
 	PROC_UNLOCK(p);
 #endif
 #endif
 #ifdef KTRACE
 	KTRUSERRET(td);
 #endif
 	/*
 	 * If this thread tickled GEOM, we need to wait for the giggling to
 	 * stop before we return to userland
 	 */
 	if (td->td_pflags & TDP_GEOM)
 		g_waitidle();
 
 	/*
 	 * Charge system time if profiling.
 	 */
 	if (p->p_flag & P_PROFIL)
 		addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio);
 	/*
 	 * Let the scheduler adjust our priority etc.
 	 */
 	sched_userret(td);
 #ifdef XEN
 	PT_UPDATES_FLUSH();
 #endif
 
 	/*
 	 * Check for misbehavior.
 	 *
 	 * In case there is a callchain tracing ongoing because of
 	 * hwpmc(4), skip the scheduler pinning check.
 	 * hwpmc(4) subsystem, infact, will collect callchain informations
 	 * at ast() checkpoint, which is past userret().
 	 */
 	WITNESS_WARN(WARN_PANIC, NULL, "userret: returning");
 	KASSERT(td->td_critnest == 0,
 	    ("userret: Returning in a critical section"));
 	KASSERT(td->td_locks == 0,
 	    ("userret: Returning with %d locks held", td->td_locks));
 	KASSERT((td->td_pflags & TDP_NOFAULTING) == 0,
 	    ("userret: Returning with pagefaults disabled"));
 	KASSERT(td->td_no_sleeping == 0,
 	    ("userret: Returning with sleep disabled"));
 	/* Pinning is tolerated only while TDP_CALLCHAIN is set (see above). */
 	KASSERT(td->td_pinned == 0 || (td->td_pflags & TDP_CALLCHAIN) != 0,
 	    ("userret: Returning with with pinned thread"));
 	KASSERT(td->td_vp_reserv == 0,
 	    ("userret: Returning while holding vnode reservation"));
 	KASSERT((td->td_flags & TDF_SBDRY) == 0,
 	    ("userret: Returning with stop signals deferred"));
 #ifdef VIMAGE
 	/* Unfortunately td_vnet_lpush needs VNET_DEBUG. */
 	VNET_ASSERT(curvnet == NULL,
 	    ("%s: Returning on td %p (pid %d, %s) with vnet %p set in %s",
 	    __func__, td, p->p_pid, td->td_name, curvnet,
 	    (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
 #endif
 #ifdef	RACCT
 	/* Hold the thread in the kernel while the process is throttled. */
 	PROC_LOCK(p);
 	while (p->p_throttled == 1)
 		msleep(p->p_racct, &p->p_mtx, 0, "racct", 0);
 	PROC_UNLOCK(p);
 #endif
 }
 
 /*
  * Process an asynchronous software trap.
  * This is relatively easy.
  * This function will return with preemption disabled.
  *
  * framep is the user-mode trap frame of curthread (asserted below).  The
  * pending-work bits are snapshotted from td_flags and cleared under the
  * thread lock, then each kind of deferred work is handled in turn before
  * userret() is called.
  *
  * NOTE(review): nothing in this function visibly disables preemption; the
  * sentence above about preemption may be stale - confirm.
  */
 void
 ast(struct trapframe *framep)
 {
 	struct thread *td;
 	struct proc *p;
 	int flags;
 	int sig;
 
 	td = curthread;
 	p = td->td_proc;
 
 	CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid,
             p->p_comm);
 	KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));
 	WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode");
 	mtx_assert(&Giant, MA_NOTOWNED);
 	THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
 	td->td_frame = framep;
 	td->td_pticks = 0;
 
 	/*
 	 * This updates the td_flag's for the checks below in one
 	 * "atomic" operation with turning off the astpending flag.
 	 * If another AST is triggered while we are handling the
 	 * AST's saved in flags, the astpending flag will be set and
 	 * ast() will be called again.
 	 */
 	thread_lock(td);
 	flags = td->td_flags;
 	td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK |
 	    TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND);
 	thread_unlock(td);
 	PCPU_INC(cnt.v_trap);
 
 	if (td->td_ucred != p->p_ucred) 
 		cred_update_thread(td);
 	/* Deliver profiling ticks that were deferred with TDP_OWEUPC. */
 	if (td->td_pflags & TDP_OWEUPC && p->p_flag & P_PROFIL) {
 		addupc_task(td, td->td_profil_addr, td->td_profil_ticks);
 		td->td_profil_ticks = 0;
 		td->td_pflags &= ~TDP_OWEUPC;
 	}
 #ifdef HWPMC_HOOKS
 	/* Handle Software PMC callchain capture. */
 	if (PMC_IS_PENDING_CALLCHAIN(td))
 		PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_USER_CALLCHAIN_SOFT, (void *) framep);
 #endif
 	if (flags & TDF_ALRMPEND) {
 		PROC_LOCK(p);
 		kern_psignal(p, SIGVTALRM);
 		PROC_UNLOCK(p);
 	}
 	if (flags & TDF_PROFPEND) {
 		PROC_LOCK(p);
 		kern_psignal(p, SIGPROF);
 		PROC_UNLOCK(p);
 	}
 #ifdef MAC
 	if (flags & TDF_MACPEND)
 		mac_thread_userret(td);
 #endif
 	if (flags & TDF_NEEDRESCHED) {
 #ifdef KTRACE
 		if (KTRPOINT(td, KTR_CSW))
 			ktrcsw(1, 1, __func__);
 #endif
 		/* Drop to the user priority and switch out involuntarily. */
 		thread_lock(td);
 		sched_prio(td, td->td_user_pri);
 		mi_switch(SW_INVOL | SWT_NEEDRESCHED, NULL);
 		thread_unlock(td);
 #ifdef KTRACE
 		if (KTRPOINT(td, KTR_CSW))
 			ktrcsw(0, 1, __func__);
 #endif
 	}
 
 	/*
 	 * Check for signals. Unlocked reads of p_pendingcnt or
 	 * p_siglist might cause process-directed signal to be handled
 	 * later.
 	 */
 	if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 ||
 	    !SIGISEMPTY(p->p_siglist)) {
 		PROC_LOCK(p);
 		mtx_lock(&p->p_sigacts->ps_mtx);
 		/* Post every signal cursig() reports until none remain. */
 		while ((sig = cursig(td)) != 0)
 			postsig(sig);
 		mtx_unlock(&p->p_sigacts->ps_mtx);
 		PROC_UNLOCK(p);
 	}
 	/*
 	 * We need to check to see if we have to exit or wait due to a
 	 * single threading requirement or some other STOP condition.
 	 */
 	if (flags & TDF_NEEDSUSPCHK) {
 		PROC_LOCK(p);
 		thread_suspend_check(0);
 		PROC_UNLOCK(p);
 	}
 
 	/* Reinstall the signal mask saved in td_oldsigmask, once. */
 	if (td->td_pflags & TDP_OLDMASK) {
 		td->td_pflags &= ~TDP_OLDMASK;
 		kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0);
 	}
 
 	userret(td, framep);
 }
 
 /*
  * Translate a syscall number into its name for process p.
  *
  * The name comes from the sv_syscallnames table of the process' sysentvec.
  * The static string "unknown" is returned when that table is absent or when
  * code is not below sv_size.
  */
 const char *
 syscallname(struct proc *p, u_int code)
 {
 	static const char unknown[] = "unknown";
 	struct sysentvec *abi = p->p_sysent;
 
 	/* Only index the table when it exists and code is in range. */
 	if (abi->sv_syscallnames != NULL && code < abi->sv_size)
 		return (abi->sv_syscallnames[code]);
 	return (unknown);
 }
Index: head/sys/kern/subr_turnstile.c
===================================================================
--- head/sys/kern/subr_turnstile.c	(revision 258540)
+++ head/sys/kern/subr_turnstile.c	(revision 258541)
@@ -1,1308 +1,1307 @@
 /*-
  * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  *    promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
  *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
  */
 
 /*
  * Implementation of turnstiles used to hold queue of threads blocked on
  * non-sleepable locks.  Sleepable locks use condition variables to
  * implement their queues.  Turnstiles differ from sleep queues in that
  * turnstile queues are assigned to a lock held by an owning thread.  Thus,
  * when one thread is enqueued onto a turnstile, it can lend its priority
  * to the owning thread.
  *
  * We wish to avoid bloating locks with an embedded turnstile and we do not
  * want to use back-pointers in the locks for the same reason.  Thus, we
  * use a similar approach to that of Solaris 7 as described in Solaris
  * Internals by Jim Mauro and Richard McDougall.  Turnstiles are looked up
  * in a hash table based on the address of the lock.  Each entry in the
  * hash table is a linked list of turnstiles and is called a turnstile
  * chain.  Each chain contains a spin mutex that protects all of the
  * turnstiles in the chain.
  *
  * Each time a thread is created, a turnstile is allocated from a UMA zone
  * and attached to that thread.  When a thread blocks on a lock, if it is the
  * first thread to block, it lends its turnstile to the lock.  If the lock
  * already has a turnstile, then it gives its turnstile to the lock's
  * turnstile's free list.  When a thread is woken up, it takes a turnstile from
  * the free list if there are any other waiters.  If it is the only thread
  * blocked on the lock, then it reclaims the turnstile associated with the lock
  * and removes it from the hash table.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
-#include "opt_kdtrace.h"
 #include "opt_turnstile_profiling.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/turnstile.h>
 
 #include <vm/uma.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <sys/lockmgr.h>
 #include <sys/sx.h>
 #endif
 
 /*
  * Constants for the hash table of turnstile chains.  TC_SHIFT is a magic
  * number chosen because the sleep queues use the same value for the
  * shift.  Basically, we ignore the lower 8 bits of the address.
  * TC_TABLESIZE must be a power of two for TC_MASK to work properly.
  *
  * TC_HASH(lock) yields the chain index for a lock address and
  * TC_LOOKUP(lock) a pointer to that chain's entry in turnstile_chains[].
  * The TC_LOOKUP expansion is fully parenthesized so that it behaves as a
  * single pointer expression wherever it is used.
  */
 #define	TC_TABLESIZE	128			/* Must be power of 2. */
 #define	TC_MASK		(TC_TABLESIZE - 1)
 #define	TC_SHIFT	8
 #define	TC_HASH(lock)	(((uintptr_t)(lock) >> TC_SHIFT) & TC_MASK)
 #define	TC_LOOKUP(lock)	(&turnstile_chains[TC_HASH(lock)])
 
 /*
  * There are three different lists of turnstiles as follows.  The list
  * connected by ts_link entries is a per-thread list of all the turnstiles
  * attached to locks that we own.  This is used to fixup our priority when
  * a lock is released.  The other two lists use the ts_hash entries.  The
  * first of these two is the turnstile chain list that a turnstile is on
  * when it is attached to a lock.  The second list to use ts_hash is the
  * free list hung off of a turnstile that is attached to a lock.
  *
  * Each turnstile contains three lists of threads.  The two ts_blocked lists
  * are linked lists of threads blocked on the turnstile's lock.  One list is
  * for exclusive waiters, and the other is for shared waiters (indexed by
  * TS_EXCLUSIVE_QUEUE and TS_SHARED_QUEUE).  The
  * ts_pending list is a linked list of threads previously awakened by
  * turnstile_signal() or turnstile_wait() that are waiting to be put on
  * the run queue.
  *
  * Locking key:
  *  c - turnstile chain lock
  *  q - td_contested lock
  */
 struct turnstile {
 	struct mtx ts_lock;			/* Spin lock for self. */
 	struct threadqueue ts_blocked[2];	/* (c + q) Blocked threads. */
 	struct threadqueue ts_pending;		/* (c) Pending threads. */
 	LIST_ENTRY(turnstile) ts_hash;		/* (c) Chain and free list. */
 	LIST_ENTRY(turnstile) ts_link;		/* (q) Contested locks. */
 	LIST_HEAD(, turnstile) ts_free;		/* (c) Free turnstiles. */
 	struct lock_object *ts_lockobj;		/* (c) Lock we reference. */
 	struct thread *ts_owner;		/* (c + q) Who owns the lock. */
 };
 
 /*
  * One bucket of the turnstile hash table: the list of turnstiles hashing to
  * this bucket and the spin mutex for the chain.  The depth counters exist
  * only when TURNSTILE_PROFILING is defined.
  */
 struct turnstile_chain {
 	LIST_HEAD(, turnstile) tc_turnstiles;	/* List of turnstiles. */
 	struct mtx tc_lock;			/* Spin lock for this chain. */
 #ifdef TURNSTILE_PROFILING
 	u_int	tc_depth;			/* Length of tc_turnstiles. */
 	u_int	tc_max_depth;			/* Max length of tc_turnstiles. */
 #endif
 };
 
 #ifdef TURNSTILE_PROFILING
 /*
  * Profiling state: turnstile_max_depth is exported read-only as "max_depth"
  * under the debug "turnstile" node; the "chains" node is the parent for the
  * per-chain nodes added by init_turnstile_profiling().
  */
 u_int turnstile_max_depth;
 static SYSCTL_NODE(_debug, OID_AUTO, turnstile, CTLFLAG_RD, 0,
     "turnstile profiling");
 static SYSCTL_NODE(_debug_turnstile, OID_AUTO, chains, CTLFLAG_RD, 0,
     "turnstile chain stats");
 SYSCTL_UINT(_debug_turnstile, OID_AUTO, max_depth, CTLFLAG_RD,
     &turnstile_max_depth, 0, "maximum depth achieved of a single chain");
 #endif
 /* Spin mutex referred to as "q" in the locking key of struct turnstile. */
 static struct mtx td_contested_lock;
 /* Hash table of chains, indexed by TC_HASH(). */
 static struct turnstile_chain turnstile_chains[TC_TABLESIZE];
 /* UMA zone for struct turnstile, created in init_turnstile0(). */
 static uma_zone_t turnstile_zone;
 
 /*
  * Prototypes for non-exported routines.
  */
 static void	init_turnstile0(void *dummy);
 #ifdef TURNSTILE_PROFILING
 static void	init_turnstile_profiling(void *arg);
 #endif
 static void	propagate_priority(struct thread *td);
 static int	turnstile_adjust_thread(struct turnstile *ts,
 		    struct thread *td);
 static struct thread *turnstile_first_waiter(struct turnstile *ts);
 static void	turnstile_setowner(struct turnstile *ts, struct thread *owner);
 #ifdef INVARIANTS
 static void	turnstile_dtor(void *mem, int size, void *arg);
 #endif
 static int	turnstile_init(void *mem, int size, int flags);
 static void	turnstile_fini(void *mem, int size);
 
 /*
  * Probes of the "sched" provider defined by this file: "sleep", which takes
  * no arguments, and "wakeup", whose two arguments are declared as a
  * struct thread pointer and a struct proc pointer.
  */
 SDT_PROVIDER_DECLARE(sched);
 SDT_PROBE_DEFINE(sched, , , sleep, sleep);
 SDT_PROBE_DEFINE2(sched, , , wakeup, wakeup, "struct thread *", 
     "struct proc *");
 
 /*
  * Walks the chain of turnstiles and their owners to propagate the priority
  * of the thread being blocked to all the threads holding locks that have to
  * release their locks before this thread can run again.
  *
  * On entry td must be locked, and its thread lock must be the lock of the
  * turnstile it is blocked on (both are asserted).  The walk stops at the
  * first turnstile without an owner, at the first owner whose td_priority
  * value is already <= the propagated one, at an owner that is running or on
  * a run queue, or when turnstile_adjust_thread() returns 0.
  */
 static void
 propagate_priority(struct thread *td)
 {
 	struct turnstile *ts;
 	int pri;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	/* Sampled once: this is the value lent to every owner visited. */
 	pri = td->td_priority;
 	ts = td->td_blocked;
 	THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
 	/*
 	 * Grab a recursive lock on this turnstile chain so it stays locked
 	 * for the whole operation.  The caller expects us to return with
 	 * the original lock held.  We only ever lock down the chain so
 	 * the lock order is constant.
 	 */
 	mtx_lock_spin(&ts->ts_lock);
 	for (;;) {
 		/* From here on td names the owner of the current turnstile. */
 		td = ts->ts_owner;
 
 		if (td == NULL) {
 			/*
 			 * This might be a read lock with no owner.  There's
 			 * not much we can do, so just bail.
 			 */
 			mtx_unlock_spin(&ts->ts_lock);
 			return;
 		}
 
 		/* Lock the owner before letting go of the turnstile. */
 		thread_lock_flags(td, MTX_DUPOK);
 		mtx_unlock_spin(&ts->ts_lock);
 		MPASS(td->td_proc != NULL);
 		MPASS(td->td_proc->p_magic == P_MAGIC);
 
 		/*
 		 * If the thread is asleep, then we are probably about
 		 * to deadlock.  To make debugging this easier, show
 		 * backtrace of misbehaving thread and panic to not
 		 * leave the kernel deadlocked.
 		 */
 		if (TD_IS_SLEEPING(td)) {
 			printf(
 		"Sleeping thread (tid %d, pid %d) owns a non-sleepable lock\n",
 			    td->td_tid, td->td_proc->p_pid);
 			kdb_backtrace_thread(td);
 			panic("sleeping thread");
 		}
 
 		/*
 		 * If this thread already has higher priority than the
 		 * thread that is being blocked, we are finished.
 		 */
 		if (td->td_priority <= pri) {
 			thread_unlock(td);
 			return;
 		}
 
 		/*
 		 * Bump this thread's priority.
 		 */
 		sched_lend_prio(td, pri);
 
 		/*
 		 * If lock holder is actually running or on the run queue
 		 * then we are done.
 		 */
 		if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) {
 			MPASS(td->td_blocked == NULL);
 			thread_unlock(td);
 			return;
 		}
 
 #ifndef SMP
 		/*
 		 * For UP, we check to see if td is curthread (this shouldn't
 		 * ever happen however as it would mean we are in a deadlock.)
 		 */
 		KASSERT(td != curthread, ("Deadlock detected"));
 #endif
 
 		/*
 		 * If we aren't blocked on a lock, we should be.
 		 */
 		KASSERT(TD_ON_LOCK(td), (
 		    "thread %d(%s):%d holds %s but isn't blocked on a lock\n",
 		    td->td_tid, td->td_name, td->td_state,
 		    ts->ts_lockobj->lo_name));
 
 		/*
 		 * Pick up the lock that td is blocked on.
 		 */
 		ts = td->td_blocked;
 		MPASS(ts != NULL);
 		THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
 		/* Resort td on the list if needed. */
 		if (!turnstile_adjust_thread(ts, td)) {
 			mtx_unlock_spin(&ts->ts_lock);
 			return;
 		}
 		/* The thread lock is released as ts lock above. */
 	}
 }
 
 /*
  * Adjust the thread's position on a turnstile after its priority has been
  * changed.
  *
  * td must be locked and blocked on a lock (asserted).  Returns 0 when td
  * already holds a turnstile of its own (td_turnstile != NULL), in which
  * case nothing is touched; returns 1 otherwise, whether or not td had to be
  * moved within its ts_blocked queue.
  */
 static int
 turnstile_adjust_thread(struct turnstile *ts, struct thread *td)
 {
 	struct thread *td1, *td2;
 	int queue;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	MPASS(TD_ON_LOCK(td));
 
 	/*
 	 * This thread may not be blocked on this turnstile anymore
 	 * but instead might already be woken up on another CPU
 	 * that is waiting on the thread lock in turnstile_unpend() to
 	 * finish waking this thread up.  We can detect this case
 	 * by checking to see if this thread has been given a
 	 * turnstile by either turnstile_signal() or
 	 * turnstile_broadcast().  In this case, treat the thread as
 	 * if it was already running.
 	 */
 	if (td->td_turnstile != NULL)
 		return (0);
 
 	/*
 	 * Check if the thread needs to be moved on the blocked chain.
 	 * It needs to be moved if either its priority is lower than
 	 * the previous thread or higher than the next thread.
 	 * (The comparisons below are on the numeric td_priority values.)
 	 */
 	THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
 	td1 = TAILQ_PREV(td, threadqueue, td_lockq);
 	td2 = TAILQ_NEXT(td, td_lockq);
 	if ((td1 != NULL && td->td_priority < td1->td_priority) ||
 	    (td2 != NULL && td->td_priority > td2->td_priority)) {
 
 		/*
 		 * Remove thread from blocked chain and determine where
 		 * it should be moved to.
 		 */
 		queue = td->td_tsqueue;
 		MPASS(queue == TS_EXCLUSIVE_QUEUE || queue == TS_SHARED_QUEUE);
 		mtx_lock_spin(&td_contested_lock);
 		TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
 		/* Find the first waiter with a larger td_priority value. */
 		TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) {
 			MPASS(td1->td_proc->p_magic == P_MAGIC);
 			if (td1->td_priority > td->td_priority)
 				break;
 		}
 
 		/* td1 is NULL when the loop ran off the end of the queue. */
 		if (td1 == NULL)
 			TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
 		else
 			TAILQ_INSERT_BEFORE(td1, td, td_lockq);
 		mtx_unlock_spin(&td_contested_lock);
 		if (td1 == NULL)
 			CTR3(KTR_LOCK,
 		    "turnstile_adjust_thread: td %d put at tail on [%p] %s",
 			    td->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name);
 		else
 			CTR4(KTR_LOCK,
 		    "turnstile_adjust_thread: td %d moved before %d on [%p] %s",
 			    td->td_tid, td1->td_tid, ts->ts_lockobj,
 			    ts->ts_lockobj->lo_name);
 	}
 	return (1);
 }
 
 /*
  * Early initialization of turnstiles.  This is not done via a SYSINIT()
  * since this needs to be initialized very early when mutexes are first
  * initialized.
  */
 void
 init_turnstiles(void)
 {
 	int i;
 
 	for (i = 0; i < TC_TABLESIZE; i++) {
 		LIST_INIT(&turnstile_chains[i].tc_turnstiles);
 		mtx_init(&turnstile_chains[i].tc_lock, "turnstile chain",
 		    NULL, MTX_SPIN);
 	}
 	mtx_init(&td_contested_lock, "td_contested", NULL, MTX_SPIN);
 	LIST_INIT(&thread0.td_contested);
 	thread0.td_turnstile = NULL;
 }
 
 #ifdef TURNSTILE_PROFILING
 /*
  * SYSINIT hook: for each turnstile chain, add a sysctl node named after the
  * chain index below the "chains" node, with read-only "depth" and
  * "max_depth" leaves backed by that chain's counters.  arg is unused.
  */
 static void
 init_turnstile_profiling(void *arg)
 {
 	struct turnstile_chain *tc;
 	struct sysctl_oid *node;
 	char name[10];
 	int idx;
 
 	for (idx = 0; idx < TC_TABLESIZE; idx++) {
 		tc = &turnstile_chains[idx];
 		snprintf(name, sizeof(name), "%d", idx);
 		node = SYSCTL_ADD_NODE(NULL,
 		    SYSCTL_STATIC_CHILDREN(_debug_turnstile_chains), OID_AUTO,
 		    name, CTLFLAG_RD, NULL, "turnstile chain stats");
 		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(node), OID_AUTO,
 		    "depth", CTLFLAG_RD, &tc->tc_depth, 0, NULL);
 		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(node), OID_AUTO,
 		    "max_depth", CTLFLAG_RD, &tc->tc_max_depth, 0, NULL);
 	}
 }
 SYSINIT(turnstile_profiling, SI_SUB_LOCK, SI_ORDER_ANY,
     init_turnstile_profiling, NULL);
 #endif
 
 /*
  * SYSINIT hook: create the UMA zone that turnstiles are allocated from and
  * give thread0 its turnstile (init_turnstiles() left it NULL).  The zone's
  * destructor is turnstile_dtor only under INVARIANTS, NULL otherwise.
  * dummy is unused.
  */
 static void
 init_turnstile0(void *dummy)
 {
 
 	turnstile_zone = uma_zcreate("TURNSTILE", sizeof(struct turnstile),
 	    NULL,
 #ifdef INVARIANTS
 	    turnstile_dtor,
 #else
 	    NULL,
 #endif
 	    turnstile_init, turnstile_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
 	thread0.td_turnstile = turnstile_alloc();
 }
 SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile0, NULL);
 
 /*
  * Re-sort a blocked thread on its turnstile after its priority has changed.
  * oldpri is the priority value td had before the change.  td must be blocked
  * on a lock and the lock of that turnstile must be held (asserted).
  */
 void
 turnstile_adjust(struct thread *td, u_char oldpri)
 {
 	struct turnstile *ts;
 
 	MPASS(TD_ON_LOCK(td));
 
 	/* The turnstile td is currently blocked on. */
 	ts = td->td_blocked;
 	MPASS(ts != NULL);
 	THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
 	mtx_assert(&ts->ts_lock, MA_OWNED);
 
 	/* Nothing more to do unless the thread was actually re-sorted. */
 	if (turnstile_adjust_thread(ts, td)) {
 		/*
 		 * Propagate only when td's priority value dropped below
 		 * oldpri and td now heads its queue.  Lent priorities are
 		 * not revoked when the value goes up.
 		 */
 		MPASS(td->td_tsqueue == TS_EXCLUSIVE_QUEUE ||
 		    td->td_tsqueue == TS_SHARED_QUEUE);
 		if (td->td_priority < oldpri &&
 		    td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]))
 			propagate_priority(td);
 	}
 }
 
 /*
  * Set the owner of the lock this turnstile is attached to.
  */
 static void
 turnstile_setowner(struct turnstile *ts, struct thread *owner)
 {
 
 	/* The caller must hold td_contested_lock. */
 	mtx_assert(&td_contested_lock, MA_OWNED);
 	/* The turnstile must not have an owner recorded yet. */
 	MPASS(ts->ts_owner == NULL);
 
 	/*
 	 * A shared lock might not have an owner; in that case nothing
 	 * is recorded.  Otherwise remember the owner and link the
 	 * turnstile at the head of the owner's contested list.
 	 */
 	if (owner != NULL) {
 		MPASS(owner->td_proc->p_magic == P_MAGIC);
 		ts->ts_owner = owner;
 		LIST_INSERT_HEAD(&owner->td_contested, ts, ts_link);
 	}
 }
 
 #ifdef INVARIANTS
 /*
  * UMA zone item destructor, built only with INVARIANTS.  It asserts that
  * the turnstile being released has no blocked or pending threads left.
  */
 static void
 turnstile_dtor(void *mem, int size, void *arg)
 {
 	struct turnstile *item = mem;
 
 	MPASS(TAILQ_EMPTY(&item->ts_blocked[TS_EXCLUSIVE_QUEUE]));
 	MPASS(TAILQ_EMPTY(&item->ts_blocked[TS_SHARED_QUEUE]));
 	MPASS(TAILQ_EMPTY(&item->ts_pending));
 }
 #endif
 
 /*
  * UMA zone item initializer.  Zeroes the item, initializes the blocked
  * (exclusive and shared) and pending thread queues and the free list,
  * and sets up the per-turnstile spin mutex.  Always returns 0; the
  * 'flags' argument is not used.
  */
 static int
 turnstile_init(void *mem, int size, int flags)
 {
 	struct turnstile *ts;
 
 	/* Start from an all-zero item before setting up lists and lock. */
 	bzero(mem, size);
 	ts = mem;
 	TAILQ_INIT(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
 	TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]);
 	TAILQ_INIT(&ts->ts_pending);
 	LIST_INIT(&ts->ts_free);
 	mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN | MTX_RECURSE);
 	return (0);
 }
 
 /*
  * UMA zone item finalizer: destroy the mutex set up by turnstile_init().
  */
 static void
 turnstile_fini(void *mem, int size)
 {
 	struct turnstile *item = mem;
 
 	mtx_destroy(&item->ts_lock);
 }
 
 /*
  * Get a turnstile for a new thread.  The item comes from turnstile_zone
  * and is requested with M_WAITOK.
  */
 struct turnstile *
 turnstile_alloc(void)
 {
 
 	return (uma_zalloc(turnstile_zone, M_WAITOK));
 }
 
 /*
  * Free a turnstile when a thread is destroyed.  The item is handed back
  * to turnstile_zone.
  */
 void
 turnstile_free(struct turnstile *ts)
 {
 
 	uma_zfree(turnstile_zone, ts);
 }
 
 /*
  * Lock the turnstile chain associated with the specified lock.  The chain
  * is found with TC_LOOKUP() and its tc_lock spin mutex is acquired.
  */
 void
 turnstile_chain_lock(struct lock_object *lock)
 {
 	struct turnstile_chain *tc;
 
 	tc = TC_LOOKUP(lock);
 	mtx_lock_spin(&tc->tc_lock);
 }
 
 /*
  * Lock the turnstile chain for 'lock' and return a locked turnstile for it.
  * If the chain already holds a turnstile for 'lock', that one is returned;
  * otherwise curthread's own turnstile is tagged with 'lock' and returned.
  * In both cases the chain lock and the returned turnstile's ts_lock are
  * held on return; this function does not release either.
  */
 struct turnstile *
 turnstile_trywait(struct lock_object *lock)
 {
 	struct turnstile_chain *tc;
 	struct turnstile *ts;
 
 	tc = TC_LOOKUP(lock);
 	mtx_lock_spin(&tc->tc_lock);
 	/* Prefer a turnstile that is already active for this lock. */
 	LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
 		if (ts->ts_lockobj == lock) {
 			mtx_lock_spin(&ts->ts_lock);
 			return (ts);
 		}
 
 	/* None found: provisionally use the current thread's turnstile. */
 	ts = curthread->td_turnstile;
 	MPASS(ts != NULL);
 	mtx_lock_spin(&ts->ts_lock);
 	KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer"));
 	ts->ts_lockobj = lock;
 
 	return (ts);
 }
 
 /*
  * Release a turnstile whose ts_lock is held (asserted below) without
  * blocking on it.  Drops ts_lock, clears ts_lockobj again if 'ts' is
  * curthread's own turnstile, and unlocks the chain for the lock the
  * turnstile was tagged with.
  */
 void
 turnstile_cancel(struct turnstile *ts)
 {
 	struct turnstile_chain *tc;
 	struct lock_object *lock;
 
 	mtx_assert(&ts->ts_lock, MA_OWNED);
 
 	mtx_unlock_spin(&ts->ts_lock);
 	/* Remember the lock before ts_lockobj is possibly cleared. */
 	lock = ts->ts_lockobj;
 	if (ts == curthread->td_turnstile)
 		ts->ts_lockobj = NULL;
 	tc = TC_LOOKUP(lock);
 	mtx_unlock_spin(&tc->tc_lock);
 }
 
 /*
  * Look up the turnstile for a lock in the hash table.  The caller must
  * already hold the associated turnstile chain lock (this is asserted, not
  * acquired, here).  If a turnstile is found it is returned with its
  * ts_lock held.  If no turnstile is found in the hash table, NULL is
  * returned.
  */
 struct turnstile *
 turnstile_lookup(struct lock_object *lock)
 {
 	struct turnstile_chain *tc;
 	struct turnstile *ts;
 
 	tc = TC_LOOKUP(lock);
 	mtx_assert(&tc->tc_lock, MA_OWNED);
 	LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
 		if (ts->ts_lockobj == lock) {
 			mtx_lock_spin(&ts->ts_lock);
 			return (ts);
 		}
 	return (NULL);
 }
 
 /*
  * Unlock the turnstile chain associated with a given lock.  The chain is
  * found with TC_LOOKUP() and its tc_lock spin mutex is released.
  */
 void
 turnstile_chain_unlock(struct lock_object *lock)
 {
 	struct turnstile_chain *tc;
 
 	tc = TC_LOOKUP(lock);
 	mtx_unlock_spin(&tc->tc_lock);
 }
 
 /*
  * Return the waiter with the most important priority among the heads of
  * the shared and exclusive queues, or NULL if the turnstile has no
  * waiters.  A numerically smaller td_priority wins; on a tie the
  * exclusive waiter is returned.
  */
 static struct thread *
 turnstile_first_waiter(struct turnstile *ts)
 {
 	struct thread *shared, *excl;
 
 	shared = TAILQ_FIRST(&ts->ts_blocked[TS_SHARED_QUEUE]);
 	excl = TAILQ_FIRST(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
 
 	/* Without an exclusive waiter the shared head (maybe NULL) wins. */
 	if (excl == NULL)
 		return (shared);
 	/* The shared head wins only when strictly more important. */
 	if (shared != NULL && shared->td_priority < excl->td_priority)
 		return (shared);
 	return (excl);
 }
 
 /*
  * Take ownership of a turnstile and adjust the priority of the new
  * owner appropriately.
  */
 void
 turnstile_claim(struct turnstile *ts)
 {
 	struct thread *td, *owner;
 	struct turnstile_chain *tc;
 
 	/* Called with ts_lock held; ts must not be curthread's own turnstile. */
 	mtx_assert(&ts->ts_lock, MA_OWNED);
 	MPASS(ts != curthread->td_turnstile);
 
 	/* Record curthread as the owner under td_contested_lock. */
 	owner = curthread;
 	mtx_lock_spin(&td_contested_lock);
 	turnstile_setowner(ts, owner);
 	mtx_unlock_spin(&td_contested_lock);
 
 	/* There must be at least one waiter on a claimed turnstile. */
 	td = turnstile_first_waiter(ts);
 	MPASS(td != NULL);
 	MPASS(td->td_proc->p_magic == P_MAGIC);
 	THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
 
 	/*
 	 * Update the priority of the new owner if needed.
 	 */
 	thread_lock(owner);
 	if (td->td_priority < owner->td_priority)
 		sched_lend_prio(owner, td->td_priority);
 	thread_unlock(owner);
 	/* Returns with both the turnstile lock and the chain lock released. */
 	tc = TC_LOOKUP(ts->ts_lockobj);
 	mtx_unlock_spin(&ts->ts_lock);
 	mtx_unlock_spin(&tc->tc_lock);
 }
 
 /*
  * Block the current thread on the turnstile associated with 'lock'.  This
  * function will context switch and not return until this thread has been
  * woken back up.  This function must be called with the appropriate
  * turnstile chain locked and will return with it unlocked.
  */
 void
 turnstile_wait(struct turnstile *ts, struct thread *owner, int queue)
 {
 	struct turnstile_chain *tc;
 	struct thread *td, *td1;
 	struct lock_object *lock;
 
 	td = curthread;
 	mtx_assert(&ts->ts_lock, MA_OWNED);
 	if (owner)
 		MPASS(owner->td_proc->p_magic == P_MAGIC);
 	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
 
 	/*
 	 * If the lock does not already have a turnstile, use this thread's
 	 * turnstile.  Otherwise insert the current thread into the
 	 * turnstile already in use by this lock.
 	 */
 	tc = TC_LOOKUP(ts->ts_lockobj);
 	mtx_assert(&tc->tc_lock, MA_OWNED);
 	if (ts == td->td_turnstile) {
 #ifdef TURNSTILE_PROFILING
 		/* Track the current and maximum number of active turnstiles. */
 		tc->tc_depth++;
 		if (tc->tc_depth > tc->tc_max_depth) {
 			tc->tc_max_depth = tc->tc_depth;
 			if (tc->tc_max_depth > turnstile_max_depth)
 				turnstile_max_depth = tc->tc_max_depth;
 		}
 #endif
 		/* Publish our turnstile on the chain; it must be pristine. */
 		LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash);
 		KASSERT(TAILQ_EMPTY(&ts->ts_pending),
 		    ("thread's turnstile has pending threads"));
 		KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]),
 		    ("thread's turnstile has exclusive waiters"));
 		KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]),
 		    ("thread's turnstile has shared waiters"));
 		KASSERT(LIST_EMPTY(&ts->ts_free),
 		    ("thread's turnstile has a non-empty free list"));
 		MPASS(ts->ts_lockobj != NULL);
 		mtx_lock_spin(&td_contested_lock);
 		TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
 		turnstile_setowner(ts, owner);
 		mtx_unlock_spin(&td_contested_lock);
 	} else {
 		/*
 		 * Find the first waiter with a numerically larger priority
 		 * so that we are queued in front of it.
 		 */
 		TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq)
 			if (td1->td_priority > td->td_priority)
 				break;
 		mtx_lock_spin(&td_contested_lock);
 		if (td1 != NULL)
 			TAILQ_INSERT_BEFORE(td1, td, td_lockq);
 		else
 			TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
 		MPASS(owner == ts->ts_owner);
 		mtx_unlock_spin(&td_contested_lock);
 		MPASS(td->td_turnstile != NULL);
 		/* Donate our own unused turnstile to the active one's free list. */
 		LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash);
 	}
 	/* Switch td's thread lock over to the turnstile lock. */
 	thread_lock(td);
 	thread_lock_set(td, &ts->ts_lock);
 	td->td_turnstile = NULL;
 
 	/* Save who we are blocked on and switch. */
 	lock = ts->ts_lockobj;
 	td->td_tsqueue = queue;
 	td->td_blocked = ts;
 	td->td_lockname = lock->lo_name;
 	td->td_blktick = ticks;
 	TD_SET_LOCK(td);
 	mtx_unlock_spin(&tc->tc_lock);
 	propagate_priority(td);
 
 	if (LOCK_LOG_TEST(lock, 0))
 		CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__,
 		    td->td_tid, lock, lock->lo_name);
 
 	SDT_PROBE0(sched, , , sleep);
 
 	THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
 	mi_switch(SW_VOL | SWT_TURNSTILE, NULL);
 
 	/* Execution resumes here once the thread has been woken up. */
 	if (LOCK_LOG_TEST(lock, 0))
 		CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s",
 		    __func__, td->td_tid, lock, lock->lo_name);
 	thread_unlock(td);
 }
 
 /*
  * Pick the highest priority thread on this turnstile and put it on the
  * pending list.  This must be called with the turnstile chain locked.
  */
 int
 turnstile_signal(struct turnstile *ts, int queue)
 {
 	struct turnstile_chain *tc;
 	struct thread *td;
 	int empty;
 
 	MPASS(ts != NULL);
 	mtx_assert(&ts->ts_lock, MA_OWNED);
 	MPASS(curthread->td_proc->p_magic == P_MAGIC);
 	MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL);
 	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
 
 	/*
 	 * Pick the highest priority thread blocked on this lock and
 	 * move it to the pending list.
 	 */
 	/* The head is dereferenced unchecked: the queue must be non-empty. */
 	td = TAILQ_FIRST(&ts->ts_blocked[queue]);
 	MPASS(td->td_proc->p_magic == P_MAGIC);
 	mtx_lock_spin(&td_contested_lock);
 	TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
 	mtx_unlock_spin(&td_contested_lock);
 	TAILQ_INSERT_TAIL(&ts->ts_pending, td, td_lockq);
 
 	/*
 	 * If the turnstile is now empty, remove it from its chain and
 	 * give it to the about-to-be-woken thread.  Otherwise take a
 	 * turnstile from the free list and give it to the thread.
 	 */
 	empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
 	    TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]);
 	if (empty) {
 		tc = TC_LOOKUP(ts->ts_lockobj);
 		mtx_assert(&tc->tc_lock, MA_OWNED);
 		MPASS(LIST_EMPTY(&ts->ts_free));
 #ifdef TURNSTILE_PROFILING
 		tc->tc_depth--;
 #endif
 	} else
 		ts = LIST_FIRST(&ts->ts_free);
 	MPASS(ts != NULL);
 	/* ts_hash links the turnstile on either the chain or a free list. */
 	LIST_REMOVE(ts, ts_hash);
 	td->td_turnstile = ts;
 
 	/* Nonzero when no blocked threads remain on either queue. */
 	return (empty);
 }
 	
 /*
  * Put all blocked threads on the pending list.  This must be called with
  * the turnstile chain locked.
  */
 void
 turnstile_broadcast(struct turnstile *ts, int queue)
 {
 	struct turnstile_chain *tc;
 	struct turnstile *ts1;
 	struct thread *td;
 
 	MPASS(ts != NULL);
 	mtx_assert(&ts->ts_lock, MA_OWNED);
 	MPASS(curthread->td_proc->p_magic == P_MAGIC);
 	MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL);
 	/*
 	 * We must have the chain locked so that we can remove the empty
 	 * turnstile from the hash queue.
 	 */
 	tc = TC_LOOKUP(ts->ts_lockobj);
 	mtx_assert(&tc->tc_lock, MA_OWNED);
 	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
 
 	/*
 	 * Transfer the blocked list to the pending list.
 	 */
 	mtx_lock_spin(&td_contested_lock);
 	TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked[queue], td_lockq);
 	mtx_unlock_spin(&td_contested_lock);
 
 	/*
 	 * Give a turnstile to each thread.  The last thread gets
 	 * this turnstile if the turnstile is empty.
 	 */
 	TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) {
 		if (LIST_EMPTY(&ts->ts_free)) {
 			/* Free list exhausted: this must be the last thread. */
 			MPASS(TAILQ_NEXT(td, td_lockq) == NULL);
 			ts1 = ts;
 #ifdef TURNSTILE_PROFILING
 			tc->tc_depth--;
 #endif
 		} else
 			ts1 = LIST_FIRST(&ts->ts_free);
 		MPASS(ts1 != NULL);
 		/* Unlink ts1 from the chain (ts itself) or from the free list. */
 		LIST_REMOVE(ts1, ts_hash);
 		td->td_turnstile = ts1;
 	}
 }
 
 /*
  * Wakeup all threads on the pending list and adjust the priority of the
  * current thread appropriately.  This must be called with the turnstile
  * chain locked.
  */
 /* Note: the owner_type argument is not referenced in this function body. */
 void
 turnstile_unpend(struct turnstile *ts, int owner_type)
 {
 	TAILQ_HEAD( ,thread) pending_threads;
 	struct turnstile *nts;
 	struct thread *td;
 	u_char cp, pri;
 
 	MPASS(ts != NULL);
 	mtx_assert(&ts->ts_lock, MA_OWNED);
 	MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL);
 	MPASS(!TAILQ_EMPTY(&ts->ts_pending));
 
 	/*
 	 * Move the list of pending threads out of the turnstile and
 	 * into a local variable.
 	 */
 	TAILQ_INIT(&pending_threads);
 	TAILQ_CONCAT(&pending_threads, &ts->ts_pending, td_lockq);
 #ifdef INVARIANTS
 	/* Clear ts_lockobj so turnstile_trywait()'s staleness check passes. */
 	if (TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
 	    TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]))
 		ts->ts_lockobj = NULL;
 #endif
 	/*
 	 * Adjust the priority of curthread based on other contested
 	 * locks it owns.  Don't lower the priority below the base
 	 * priority however.
 	 */
 	td = curthread;
 	pri = PRI_MAX;
 	thread_lock(td);
 	mtx_lock_spin(&td_contested_lock);
 	/*
 	 * Remove the turnstile from this thread's list of contested locks
 	 * since this thread doesn't own it anymore.  New threads will
 	 * not be blocking on the turnstile until it is claimed by a new
 	 * owner.  There might not be a current owner if this is a shared
 	 * lock.
 	 */
 	if (ts->ts_owner != NULL) {
 		ts->ts_owner = NULL;
 		LIST_REMOVE(ts, ts_link);
 	}
 	/* Find the most important waiter across the remaining contested locks. */
 	LIST_FOREACH(nts, &td->td_contested, ts_link) {
 		cp = turnstile_first_waiter(nts)->td_priority;
 		if (cp < pri)
 			pri = cp;
 	}
 	mtx_unlock_spin(&td_contested_lock);
 	sched_unlend_prio(td, pri);
 	thread_unlock(td);
 	/*
 	 * Wake up all the pending threads.  If a thread is not blocked
 	 * on a lock, then it is currently executing on another CPU in
 	 * turnstile_wait() or sitting on a run queue waiting to resume
 	 * in turnstile_wait().  Set a flag to force it to try to acquire
 	 * the lock again instead of blocking.
 	 */
 	while (!TAILQ_EMPTY(&pending_threads)) {
 		td = TAILQ_FIRST(&pending_threads);
 		TAILQ_REMOVE(&pending_threads, td, td_lockq);
 		SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
 		thread_lock(td);
 		THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
 		MPASS(td->td_proc->p_magic == P_MAGIC);
 		MPASS(TD_ON_LOCK(td));
 		TD_CLR_LOCK(td);
 		MPASS(TD_CAN_RUN(td));
 		/* Reset the blocked-on state recorded by turnstile_wait(). */
 		td->td_blocked = NULL;
 		td->td_lockname = NULL;
 		td->td_blktick = 0;
 #ifdef INVARIANTS
 		/* Poison the queue index; it is not a valid queue value. */
 		td->td_tsqueue = 0xff;
 #endif
 		sched_add(td, SRQ_BORING);
 		thread_unlock(td);
 	}
 	mtx_unlock_spin(&ts->ts_lock);
 }
 
 /*
  * Give up ownership of a turnstile.  This must be called with the
  * turnstile chain locked.
  */
 void
 turnstile_disown(struct turnstile *ts)
 {
 	struct thread *td;
 	u_char cp, pri;
 
 	/*
 	 * The turnstile must be locked, owned by curthread, have no
 	 * pending threads and still have at least one blocked thread.
 	 */
 	MPASS(ts != NULL);
 	mtx_assert(&ts->ts_lock, MA_OWNED);
 	MPASS(ts->ts_owner == curthread);
 	MPASS(TAILQ_EMPTY(&ts->ts_pending));
 	MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) ||
 	    !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]));
 
 	/*
 	 * Remove the turnstile from this thread's list of contested locks
 	 * since this thread doesn't own it anymore.  New threads will
 	 * not be blocking on the turnstile until it is claimed by a new
 	 * owner.
 	 */
 	mtx_lock_spin(&td_contested_lock);
 	ts->ts_owner = NULL;
 	LIST_REMOVE(ts, ts_link);
 	mtx_unlock_spin(&td_contested_lock);
 
 	/*
 	 * Adjust the priority of curthread based on other contested
 	 * locks it owns.  Don't lower the priority below the base
 	 * priority however.
 	 */
 	td = curthread;
 	pri = PRI_MAX;
 	thread_lock(td);
 	mtx_unlock_spin(&ts->ts_lock);
 	mtx_lock_spin(&td_contested_lock);
 	/* From here on 'ts' is reused as the iterator over td_contested. */
 	LIST_FOREACH(ts, &td->td_contested, ts_link) {
 		cp = turnstile_first_waiter(ts)->td_priority;
 		if (cp < pri)
 			pri = cp;
 	}
 	mtx_unlock_spin(&td_contested_lock);
 	sched_unlend_prio(td, pri);
 	thread_unlock(td);
 }
 
 /*
  * Return the thread at the head of the requested blocked queue of a
  * turnstile (NULL if that queue is empty).  With INVARIANTS the
  * arguments and ownership of the turnstile lock are checked first.
  */
 struct thread *
 turnstile_head(struct turnstile *ts, int queue)
 {
 	struct thread *first;
 
 #ifdef INVARIANTS
 	MPASS(ts != NULL);
 	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
 	mtx_assert(&ts->ts_lock, MA_OWNED);
 #endif
 	first = TAILQ_FIRST(&ts->ts_blocked[queue]);
 	return (first);
 }
 
 /*
  * Report whether the requested blocked queue of a turnstile is empty.
  * With INVARIANTS the arguments and ownership of the turnstile lock are
  * checked first.
  */
 int
 turnstile_empty(struct turnstile *ts, int queue)
 {
 	int is_empty;
 
 #ifdef INVARIANTS
 	MPASS(ts != NULL);
 	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
 	mtx_assert(&ts->ts_lock, MA_OWNED);
 #endif
 	is_empty = TAILQ_EMPTY(&ts->ts_blocked[queue]);
 	return (is_empty);
 }
 
 #ifdef DDB
 /*
  * Print a one-line description of a thread to the debugger console:
  * the caller-supplied prefix, the thread pointer, its tid, the pid of
  * its process and the thread name.
  */
 static void
 print_thread(struct thread *td, const char *prefix)
 {
 
 	db_printf("%s%p (tid %d, pid %d, \"%s\")\n", prefix, td, td->td_tid,
 	    td->td_proc->p_pid, td->td_name);
 }
 
 static void
 print_queue(struct threadqueue *queue, const char *header, const char *prefix)
 {
 	struct thread *td;
 
 	db_printf("%s:\n", header);
 	if (TAILQ_EMPTY(queue)) {
 		db_printf("%sempty\n", prefix);
 		return;
 	}
 	TAILQ_FOREACH(td, queue, td_lockq) {
 		print_thread(td, prefix);
 	}
 }
 
 /*
  * DDB "show turnstile" command.  The address may be either a lock that
  * has an active turnstile or an active turnstile itself.  Prints the
  * lock, its owner (if any) and the shared, exclusive and pending queues.
  * Does nothing when no address is given.
  */
 DB_SHOW_COMMAND(turnstile, db_show_turnstile)
 {
 	struct turnstile_chain *tc;
 	struct turnstile *ts;
 	struct lock_object *lock;
 	int i;
 
 	if (!have_addr)
 		return;
 
 	/*
 	 * First, see if there is an active turnstile for the lock indicated
 	 * by the address.
 	 */
 	lock = (struct lock_object *)addr;
 	tc = TC_LOOKUP(lock);
 	LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
 		if (ts->ts_lockobj == lock)
 			goto found;
 
 	/*
 	 * Second, see if there is an active turnstile at the address
 	 * indicated.
 	 */
 	for (i = 0; i < TC_TABLESIZE; i++)
 		LIST_FOREACH(ts, &turnstile_chains[i].tc_turnstiles, ts_hash) {
 			if (ts == (struct turnstile *)addr)
 				goto found;
 		}
 
 	db_printf("Unable to locate a turnstile via %p\n", (void *)addr);
 	return;
 found:
 	/* Re-read the lock from the turnstile: addr may have named the turnstile. */
 	lock = ts->ts_lockobj;
 	db_printf("Lock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name,
 	    lock->lo_name);
 	if (ts->ts_owner)
 		print_thread(ts->ts_owner, "Lock Owner: ");
 	else
 		db_printf("Lock Owner: none\n");
 	print_queue(&ts->ts_blocked[TS_SHARED_QUEUE], "Shared Waiters", "\t");
 	print_queue(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE], "Exclusive Waiters",
 	    "\t");
 	print_queue(&ts->ts_pending, "Pending Threads", "\t");
 	
 }
 
 /*
  * Show all the threads a particular thread is waiting on based on
  * non-sleepable and non-spin locks.
  */
 static void
 print_lockchain(struct thread *td, const char *prefix)
 {
 	struct lock_object *lock;
 	struct lock_class *class;
 	struct turnstile *ts;
 
 	/*
 	 * Follow the chain.  We keep walking as long as the thread is
 	 * blocked on a turnstile that has an owner.  The walk also stops
 	 * as soon as the pager has been quit.
 	 */
 	while (!db_pager_quit) {
 		db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid,
 		    td->td_proc->p_pid, td->td_name);
 		switch (td->td_state) {
 		case TDS_INACTIVE:
 			db_printf("is inactive\n");
 			return;
 		case TDS_CAN_RUN:
 			db_printf("can run\n");
 			return;
 		case TDS_RUNQ:
 			db_printf("is on a run queue\n");
 			return;
 		case TDS_RUNNING:
 			db_printf("running on CPU %d\n", td->td_oncpu);
 			return;
 		case TDS_INHIBITED:
 			if (TD_ON_LOCK(td)) {
 				ts = td->td_blocked;
 				lock = ts->ts_lockobj;
 				class = LOCK_CLASS(lock);
 				db_printf("blocked on lock %p (%s) \"%s\"\n",
 				    lock, class->lc_name, lock->lo_name);
 				/* An unowned turnstile ends the chain. */
 				if (ts->ts_owner == NULL)
 					return;
 				/* Continue the walk with the lock's owner. */
 				td = ts->ts_owner;
 				break;
 			}
 			db_printf("inhibited\n");
 			return;
 		default:
 			db_printf("??? (%#x)\n", td->td_state);
 			return;
 		}
 	}
 }
 
 /*
  * DDB "show lockchain" command: print the lock chain starting at the
  * thread named by the address, or at kdb_thread when no address is given.
  */
 DB_SHOW_COMMAND(lockchain, db_show_lockchain)
 {
 	struct thread *start;
 
 	/* Figure out which thread to start with. */
 	start = have_addr ? db_lookup_thread(addr, TRUE) : kdb_thread;
 	print_lockchain(start, "");
 }
 
 /*
  * DDB "show all chains" command (also reachable as "show allchains").
  * For every thread in the system that is blocked on a turnstile and does
  * not itself own any contested lock, print a numbered lock chain starting
  * at that thread.  Stops early once the pager has been quit.
  */
 DB_SHOW_ALL_COMMAND(chains, db_show_allchains)
 {
 	struct thread *td;
 	struct proc *p;
 	int i;
 
 	/* Chains are numbered starting from 1. */
 	i = 1;
 	FOREACH_PROC_IN_SYSTEM(p) {
 		FOREACH_THREAD_IN_PROC(p, td) {
 			if (TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested)) {
 				db_printf("chain %d:\n", i++);
 				print_lockchain(td, " ");
 			}
 			if (db_pager_quit)
 				return;
 		}
 	}
 }
 DB_SHOW_ALIAS(allchains, db_show_allchains)
 
 /*
  * Show all the threads a particular thread is waiting on based on
  * sleepable locks.
  */
 static void
 print_sleepchain(struct thread *td, const char *prefix)
 {
 	struct thread *owner;
 
 	/*
 	 * Follow the chain.  We keep walking as long as the thread is
 	 * blocked on a sleep lock that has an owner.  The walk also stops
 	 * as soon as the pager has been quit.
 	 */
 	while (!db_pager_quit) {
 		db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid,
 		    td->td_proc->p_pid, td->td_name);
 		switch (td->td_state) {
 		case TDS_INACTIVE:
 			db_printf("is inactive\n");
 			return;
 		case TDS_CAN_RUN:
 			db_printf("can run\n");
 			return;
 		case TDS_RUNQ:
 			db_printf("is on a run queue\n");
 			return;
 		case TDS_RUNNING:
 			db_printf("running on CPU %d\n", td->td_oncpu);
 			return;
 		case TDS_INHIBITED:
 			if (TD_ON_SLEEPQ(td)) {
 				/*
 				 * Ask the lockmgr and sx code in turn whether
 				 * td sleeps on one of their locks; 'owner' is
 				 * filled in by whichever call succeeds.
 				 */
 				if (lockmgr_chain(td, &owner) ||
 				    sx_chain(td, &owner)) {
 					if (owner == NULL)
 						return;
 					/* Continue the walk with the owner. */
 					td = owner;
 					break;
 				}
 				db_printf("sleeping on %p \"%s\"\n",
 				    td->td_wchan, td->td_wmesg);
 				return;
 			}
 			db_printf("inhibited\n");
 			return;
 		default:
 			db_printf("??? (%#x)\n", td->td_state);
 			return;
 		}
 	}
 }
 
 /*
  * DDB "show sleepchain" command: print the sleep chain starting at the
  * thread named by the address, or at kdb_thread when no address is given.
  */
 DB_SHOW_COMMAND(sleepchain, db_show_sleepchain)
 {
 	struct thread *start;
 
 	/* Figure out which thread to start with. */
 	start = have_addr ? db_lookup_thread(addr, TRUE) : kdb_thread;
 	print_sleepchain(start, "");
 }
 
 static void	print_waiters(struct turnstile *ts, int indent);
 
 /*
  * Print one waiting thread, indented by 'indent' spaces, followed by the
  * waiter tree of every contested lock that thread owns (one level
  * deeper).  Does nothing once the pager has been quit.
  */
 static void
 print_waiter(struct thread *td, int indent)
 {
 	struct turnstile *owned;
 	int pad;
 
 	if (db_pager_quit)
 		return;
 	for (pad = indent; pad > 0; pad--)
 		db_printf(" ");
 	print_thread(td, "thread ");
 	LIST_FOREACH(owned, &td->td_contested, ts_link)
 		print_waiters(owned, indent + 1);
 }
 
 /*
  * Print the lock a turnstile belongs to, indented by 'indent' spaces, and
  * then every thread on its exclusive, shared and pending queues (in that
  * order) one level deeper via print_waiter().  Does nothing once the
  * pager has been quit.
  */
 static void
 print_waiters(struct turnstile *ts, int indent)
 {
 	struct lock_object *lock;
 	struct lock_class *class;
 	struct thread *td;
 	int i;
 
 	if (db_pager_quit)
 		return;
 	lock = ts->ts_lockobj;
 	class = LOCK_CLASS(lock);
 	for (i = 0; i < indent; i++)
 		db_printf(" ");
 	db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name);
 	TAILQ_FOREACH(td, &ts->ts_blocked[TS_EXCLUSIVE_QUEUE], td_lockq)
 		print_waiter(td, indent + 1);
 	TAILQ_FOREACH(td, &ts->ts_blocked[TS_SHARED_QUEUE], td_lockq)
 		print_waiter(td, indent + 1);
 	TAILQ_FOREACH(td, &ts->ts_pending, td_lockq)
 		print_waiter(td, indent + 1);
 }
 
 /*
  * DDB "show locktree" command.  Treats the address as a lock and prints
  * the tree of threads waiting on it.  If the lock has no active
  * turnstile, only the lock itself is printed.  Does nothing when no
  * address is given.
  */
 DB_SHOW_COMMAND(locktree, db_show_locktree)
 {
 	struct lock_object *lock;
 	struct lock_class *class;
 	struct turnstile_chain *tc;
 	struct turnstile *ts;
 
 	if (!have_addr)
 		return;
 	lock = (struct lock_object *)addr;
 	tc = TC_LOOKUP(lock);
 	LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
 		if (ts->ts_lockobj == lock)
 			break;
 	/* ts is NULL here when the loop ran off the end of the chain. */
 	if (ts == NULL) {
 		class = LOCK_CLASS(lock);
 		db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name,
 		    lock->lo_name);
 	} else
 		print_waiters(ts, 0);
 }
 #endif
 #endif
Index: head/sys/kern/vfs_cache.c
===================================================================
--- head/sys/kern/vfs_cache.c	(revision 258540)
+++ head/sys/kern/vfs_cache.c	(revision 258541)
@@ -1,1486 +1,1485 @@
 /*-
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Poul-Henning Kamp of the FreeBSD Project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/fcntl.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/uma.h>
 
 /*
  * Statically defined tracing probes for the namecache module of the vfs
  * provider.  The trailing string arguments give the C type of each probe
  * argument.
  */
 SDT_PROVIDER_DECLARE(vfs);
 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, done, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, done, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, entry, "struct vnode *");
 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, hit, "struct vnode *",
     "char *", "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, miss, "struct vnode *");
 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, return, "int",
     "struct vnode *", "char *");
 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, hit, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit_negative, hit-negative,
     "struct vnode *", "char *");
 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, miss, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, done, "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, done, "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, done, "struct mount *");
 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, done, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, done, "struct vnode *",
     "char *");
 
 /*
  * This structure describes the elements in the cache of recent
  * names looked up by namei.
  */
 
 /*
  * The name bytes are stored inline after the fixed fields (nc_name is a
  * zero-length trailing array), so entries are variable-sized.
  */
 struct	namecache {
 	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
 	struct	vnode *nc_vp;		/* vnode the name refers to */
 	u_char	nc_flag;		/* flag bits */
 	u_char	nc_nlen;		/* length of name */
 	char	nc_name[0];		/* segment name + nul */
 };
 
 /*
  * struct namecache_ts repeats struct namecache layout up to the
  * nc_nlen member.
  * struct namecache_ts is used in place of struct namecache when time(s) need
  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
  * both a non-dotdot directory name plus dotdot for the directory's
  * parent.
  */
 /*
  * Keep the leading members identical to struct namecache: code casts
  * between the two types and tells them apart with the NCF_TS flag.
  */
 struct	namecache_ts {
 	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
 	struct	vnode *nc_vp;		/* vnode the name refers to */
 	u_char	nc_flag;		/* flag bits */
 	u_char	nc_nlen;		/* length of name */
 	struct	timespec nc_time;	/* timespec provided by fs */
 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
 	int	nc_ticks;		/* ticks value when entry was added */
 	char	nc_name[0];		/* segment name + nul */
 };
 
 /*
  * Flags in namecache.nc_flag
  *
  * NCF_TS marks an entry that is really a struct namecache_ts (see
  * nc_get_name() and cache_free()).  The meanings of the other flags are
  * not established by the code visible here; NOTE(review): presumably
  * NCF_WHITE = whiteout entry, NCF_ISDOTDOT = ".." entry, NCF_DTS =
  * nc_dotdottime is valid -- confirm against their users.
  */
 #define NCF_WHITE	0x01
 #define NCF_ISDOTDOT	0x02
 #define	NCF_TS		0x04
 #define	NCF_DTS		0x08
 
 /*
  * Name caching works as follows:
  *
  * Names found by directory scans are retained in a cache
  * for future reference.  It is managed LRU, so frequently
  * used names will hang around.  Cache is indexed by hash value
  * obtained from (vp, name) where vp refers to the directory
  * containing name.
  *
  * If it is a "negative" entry, (i.e. for a name that is known NOT to
  * exist) the vnode pointer will be NULL.
  *
  * Upon reaching the last segment of a path, if the reference
  * is for DELETE, or NOCACHE is set (rewrite), and the
  * name is located in the cache, it will be dropped.
  */
 
 /*
  * Structures associated with name caching.
  */
 /* Map a hash value to its bucket; nchash is used as the bucket mask. */
 #define NCHHASH(hash) \
 	(&nchashtbl[(hash) & nchash])
 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
 /* NOTE(review): presumably the list of negative entries -- confirm */
 static TAILQ_HEAD(, namecache) ncneg;
 static u_long	nchash;			/* size of hash table */
 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
     "Size of namecache hash table");
 static u_long	ncnegfactor = 16;	/* ratio of negative entries */
 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
     "Ratio of negative namecache entries");
 static u_long	numneg;			/* number of negative entries allocated */
 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
     "Number of negative entries in namecache");
 static u_long	numcache;		/* number of cache entries allocated */
 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
     "Number of namecache entries");
 static u_long	numcachehv;		/* number of cache entries with vnodes held */
 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
     "Number of namecache entries with vnodes held");
 static u_int	ncsizefactor = 2;	/* exported as vfs.ncsizefactor */
 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
     "Size factor for namecache");
 
 struct nchstats	nchstats;		/* cache effectiveness statistics */
 
 /* Reader/writer lock for the name cache, initialized via RW_SYSINIT. */
 static struct rwlock cache_lock;
 RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
 
 /* Shorthands for the rwlock operations on cache_lock. */
 #define	CACHE_UPGRADE_LOCK()	rw_try_upgrade(&cache_lock)
 #define	CACHE_RLOCK()		rw_rlock(&cache_lock)
 #define	CACHE_RUNLOCK()		rw_runlock(&cache_lock)
 #define	CACHE_WLOCK()		rw_wlock(&cache_lock)
 #define	CACHE_WUNLOCK()		rw_wunlock(&cache_lock)
 
 /*
  * UMA zones for the VFS cache.
  *
  * The small cache is used for entries with short names, which are the
  * most common.  The large cache is used for entries which are too big to
  * fit in the small cache.
  */
 /* The _ts variants hold entries allocated with timestamps (struct namecache_ts). */
 static uma_zone_t cache_zone_small;
 static uma_zone_t cache_zone_small_ts;
 static uma_zone_t cache_zone_large;
 static uma_zone_t cache_zone_large_ts;
 
 /* Names longer than this use the large zones; see cache_alloc(). */
 #define	CACHE_PATH_CUTOFF	35
 
 static struct namecache *
 cache_alloc(int len, int ts)
 {
 
 	if (len > CACHE_PATH_CUTOFF) {
 		if (ts)
 			return (uma_zalloc(cache_zone_large_ts, M_WAITOK));
 		else
 			return (uma_zalloc(cache_zone_large, M_WAITOK));
 	}
 	if (ts)
 		return (uma_zalloc(cache_zone_small_ts, M_WAITOK));
 	else
 		return (uma_zalloc(cache_zone_small, M_WAITOK));
 }
 
 static void
 cache_free(struct namecache *ncp)
 {
 	int ts;
 
 	if (ncp == NULL)
 		return;
 	ts = ncp->nc_flag & NCF_TS;
 	if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) {
 		if (ts)
 			uma_zfree(cache_zone_small_ts, ncp);
 		else
 			uma_zfree(cache_zone_small, ncp);
 	} else if (ts)
 		uma_zfree(cache_zone_large_ts, ncp);
 	else
 		uma_zfree(cache_zone_large, ncp);
 }
 
 /*
  * Return a pointer to the name stored in a cache entry.  Entries flagged
  * NCF_TS are laid out as struct namecache_ts, where the name sits at a
  * different offset, so the entry is reinterpreted accordingly.
  */
 static char *
 nc_get_name(struct namecache *ncp)
 {
 
 	if ((ncp->nc_flag & NCF_TS) != 0)
 		return (((struct namecache_ts *)ncp)->nc_name);
 	return (ncp->nc_name);
 }
 
 /*
  * Copy the timestamp data of a cache entry out to the caller.  Each of
  * 'tsp' and 'ticksp' may be NULL, in which case that value is not
  * returned.  Non-NULL outputs are only legal for entries flagged NCF_TS;
  * this is asserted.
  */
 static void
 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 {
 
 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 	    (tsp == NULL && ticksp == NULL),
 	    ("No NCF_TS"));
 
 	if (tsp != NULL)
 		*tsp = ((struct namecache_ts *)ncp)->nc_time;
 	if (ticksp != NULL)
 		*ticksp = ((struct namecache_ts *)ncp)->nc_ticks;
 }
 
 /* Writable at run time through the debug.vfscache sysctl. */
 static int	doingcache = 1;		/* 1 => enable the cache */
 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
     "VFS namecache enabled");
 
 /* Export size information to userland */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
     sizeof(struct namecache), "sizeof(struct namecache)");
 
 /*
  * The new name cache statistics
  *
  * STATNODE() exports an unsigned long counter under the vfs.cache sysctl
  * node.  Most counters are declared on the same line as the STATNODE()
  * that exports them; numneg and numcache are declared elsewhere.
  */
 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
     "Name cache statistics");
 #define STATNODE(mode, name, var, descr) \
 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, descr);
 STATNODE(CTLFLAG_RD, numneg, &numneg, "Number of negative cache entries");
 STATNODE(CTLFLAG_RD, numcache, &numcache, "Number of cache entries");
 static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls,
     "Number of cache lookups");
 static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits,
     "Number of '.' hits");
 static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits,
     "Number of '..' hits");
 static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks,
     "Number of checks in lookup");
 static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss,
     "Number of cache misses");
 static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap,
     "Number of cache misses we do not want to cache");
 static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps, 
     "Number of cache hits (positive) we do not want to cache");
 static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits,
     "Number of cache hits (positive)");
 static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps,
     "Number of cache hits (negative) we do not want to cache");
 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits,
     "Number of cache hits (negative)");
 static u_long numupgrades; STATNODE(CTLFLAG_RD, numupgrades, &numupgrades,
     "Number of updates of the cache after lookup (write lock + retry)");
 
 /* The whole nchstats structure is also exported as one opaque blob. */
 SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE,
     &nchstats, sizeof(nchstats), "LU",
     "VFS cache effectiveness statistics");
 
 
 
 /* Forward declarations for helpers defined later in this file. */
 static void cache_zap(struct namecache *ncp);
 static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
     u_int *buflen);
 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen);
 
 /* Malloc type handed to hashinit() for the name cache hash table. */
 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
 #ifdef DIAGNOSTIC
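 /*
  * Grab a snapshot of the name cache hash chain lengths.  The cache lock
  * is taken and released once per bucket, so each chain is counted
  * consistently but the table as a whole is not sampled atomically.
  */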
 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
     "hash table stats");
 
 static int
 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int n_nchash;
 	int count;
 
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	if (!req->oldptr)
 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		CACHE_RLOCK();
 		count = 0;
 		LIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		CACHE_RUNLOCK();
 		error = SYSCTL_OUT(req, &count, sizeof(count));
 		if (error)
 			return (error);
 	}
 	return (0);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
     "nchash chain lengths");
 
 /*
  * Handler for debug.hashstat.nchash: emits four ints describing the
  * name cache hash table, in this order:
  *   - total number of buckets,
  *   - number of buckets with at least one entry,
  *   - length of the longest chain,
  *   - bucket usage in hundredths of a percent.
  * With no output buffer supplied it only reports the space needed.
  */
 static int
 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int n_nchash;
 	int count, maxlength, used, pct;
 
 	if (!req->oldptr)
 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	used = 0;
 	maxlength = 0;
 
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		count = 0;
 		/* The lock is held per bucket, not across the whole scan. */
 		CACHE_RLOCK();
 		LIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		CACHE_RUNLOCK();
 		if (count)
 			used++;
 		if (maxlength < count)
 			maxlength = count;
 	}
 	/* The loop above counted n_nchash down to zero; recompute it. */
 	n_nchash = nchash + 1;
 	/*
 	 * Dividing the bucket count by 100 first keeps the product small
 	 * for big tables, but yields a zero divisor when there are fewer
 	 * than 100 buckets.  For such small tables the direct formula
 	 * cannot overflow, so use it instead of dividing by zero.
 	 */
 	if (n_nchash >= 100)
 		pct = (used * 100) / (n_nchash / 100);
 	else if (n_nchash > 0)
 		pct = (used * 100 * 100) / n_nchash;
 	else
 		pct = 0;
 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &used, sizeof(used));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 	if (error)
 		return (error);
 	return (SYSCTL_OUT(req, &pct, sizeof(pct)));
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 #endif
 
 /*
  * cache_zap():
  *
  *   Removes a namecache entry from cache, whether it contains an actual
  *   pointer to a vnode or if it is just a negative cache entry.
  *
  *   The caller must hold the cache lock exclusively.  The entry is
  *   unlinked from every list it is on, the global counters are adjusted
  *   and the entry is freed.  If it was the last entry on its directory's
  *   v_cache_src list, the hold on that directory is dropped as well.
  */
 static void
 cache_zap(ncp)
 	struct namecache *ncp;
 {
 	struct vnode *vp;
 
 	rw_assert(&cache_lock, RA_WLOCKED);
 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
 #ifdef KDTRACE_HOOKS
 	if (ncp->nc_vp != NULL) {
 		SDT_PROBE(vfs, namecache, zap, done, ncp->nc_dvp,
 		    nc_get_name(ncp), ncp->nc_vp, 0, 0);
 	} else {
 		SDT_PROBE(vfs, namecache, zap_negative, done, ncp->nc_dvp,
 		    nc_get_name(ncp), 0, 0, 0);
 	}
 #endif
 	/* vp stays NULL unless the directory's hold has to be released. */
 	vp = NULL;
 	LIST_REMOVE(ncp, nc_hash);
 	if (ncp->nc_flag & NCF_ISDOTDOT) {
 		/* ".." entries are not on v_cache_src; clear the back pointer. */
 		if (ncp == ncp->nc_dvp->v_cache_dd)
 			ncp->nc_dvp->v_cache_dd = NULL;
 	} else {
 		LIST_REMOVE(ncp, nc_src);
 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
 			/* Last entry sourced from this directory. */
 			vp = ncp->nc_dvp;
 			numcachehv--;
 		}
 	}
 	if (ncp->nc_vp) {
 		/* Positive entry: detach from the target vnode. */
 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
 		if (ncp == ncp->nc_vp->v_cache_dd)
 			ncp->nc_vp->v_cache_dd = NULL;
 	} else {
 		/* Negative entry: it lives on the global ncneg queue. */
 		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 		numneg--;
 	}
 	numcache--;
 	cache_free(ncp);
 	if (vp)
 		vdrop(vp);
 }
 
 /*
  * Lookup an entry in the cache
  *
  * Lookup is called with dvp pointing to the directory to search,
  * cnp pointing to the name of the entry being sought. If the lookup
  * succeeds, the vnode is returned in *vpp, and a status of -1 is
  * returned. If the lookup determines that the name does not exist
  * (negative caching), a status of ENOENT is returned. If the lookup
  * fails, a status of zero is returned.  If the directory vnode is
  * recycled out from under us due to a forced unmount, a status of
  * ENOENT is returned.
  *
  * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
  * unlocked.  If we're looking up . an extra ref is taken, but the lock is
  * not recursively acquired.
  *
  * tsp and ticksp are optional outputs (either may be NULL) that receive
  * the timestamp and tick count recorded with the matching entry.
  */
 
 int
 cache_lookup(dvp, vpp, cnp, tsp, ticksp)
 	struct vnode *dvp;
 	struct vnode **vpp;
 	struct componentname *cnp;
 	struct timespec *tsp;
 	int *ticksp;
 {
 	struct namecache *ncp;
 	uint32_t hash;
 	int error, ltype, wlocked;
 
 	/* Cache disabled: ask the caller not to make an entry, report a miss. */
 	if (!doingcache) {
 		cnp->cn_flags &= ~MAKEENTRY;
 		return (0);
 	}
 	/* Start, or restart after a failed vget(), with the read lock only. */
 retry:
 	CACHE_RLOCK();
 	wlocked = 0;
 	numcalls++;
 	error = 0;
 
 	/* Re-entry point once the wlock path has taken the write lock. */
 retry_wlocked:
 	if (cnp->cn_nameptr[0] == '.') {
 		if (cnp->cn_namelen == 1) {
 			/* "." resolves to the directory itself. */
 			*vpp = dvp;
 			CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
 			    dvp, cnp->cn_nameptr);
 			dothits++;
 			SDT_PROBE(vfs, namecache, lookup, hit, dvp, ".",
 			    *vpp, 0, 0);
 			if (tsp != NULL)
 				timespecclear(tsp);
 			if (ticksp != NULL)
 				*ticksp = ticks;
 			goto success;
 		}
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			/* ".." is answered from dvp->v_cache_dd, not the hash. */
 			dotdothits++;
 			if (dvp->v_cache_dd == NULL) {
 				SDT_PROBE(vfs, namecache, lookup, miss, dvp,
 				    "..", NULL, 0, 0);
 				goto unlock;
 			}
 			if ((cnp->cn_flags & MAKEENTRY) == 0) {
 				/*
 				 * The caller does not want the entry kept.
 				 * Dropping it needs the write lock; if the
 				 * upgrade fails, redo the lookup write-locked.
 				 */
 				if (!wlocked && !CACHE_UPGRADE_LOCK())
 					goto wlock;
 				if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
 					cache_zap(dvp->v_cache_dd);
 				dvp->v_cache_dd = NULL;
 				CACHE_WUNLOCK();
 				return (0);
 			}
 			/*
 			 * A dedicated ".." entry names the parent in nc_vp;
 			 * any other entry here names it in nc_dvp.
 			 */
 			ncp = dvp->v_cache_dd;
 			if (ncp->nc_flag & NCF_ISDOTDOT)
 				*vpp = ncp->nc_vp;
 			else
 				*vpp = ncp->nc_dvp;
 			/* Return failure if negative entry was found. */
 			if (*vpp == NULL)
 				goto negative_success;
 			CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
 			    dvp, cnp->cn_nameptr, *vpp);
 			SDT_PROBE(vfs, namecache, lookup, hit, dvp, "..",
 			    *vpp, 0, 0);
 			cache_out_ts(ncp, tsp, ticksp);
 			/*
 			 * A non-".." entry that carries NCF_DTS reports its
 			 * separate dotdot timestamp instead of nc_time.
 			 */
 			if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
 			    NCF_DTS && tsp != NULL)
 				*tsp = ((struct namecache_ts *)ncp)->
 				    nc_dotdottime;
 			goto success;
 		}
 	}
 
 	/* The hash key covers the name bytes and the dvp pointer value. */
 	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		numchecks++;
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	/* We failed to find an entry */
 	if (ncp == NULL) {
 		SDT_PROBE(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
 		    NULL, 0, 0);
 		if ((cnp->cn_flags & MAKEENTRY) == 0) {
 			nummisszap++;
 		} else {
 			nummiss++;
 		}
 		nchstats.ncs_miss++;
 		goto unlock;
 	}
 
 	/* We don't want to have an entry, so dump it */
 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
 		numposzaps++;
 		nchstats.ncs_badhits++;
 		if (!wlocked && !CACHE_UPGRADE_LOCK())
 			goto wlock;
 		cache_zap(ncp);
 		CACHE_WUNLOCK();
 		return (0);
 	}
 
 	/* We found a "positive" match, return the vnode */
 	if (ncp->nc_vp) {
 		numposhits++;
 		nchstats.ncs_goodhits++;
 		*vpp = ncp->nc_vp;
 		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
 		    dvp, cnp->cn_nameptr, *vpp, ncp);
 		SDT_PROBE(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
 		    *vpp, 0, 0);
 		cache_out_ts(ncp, tsp, ticksp);
 		goto success;
 	}
 
 negative_success:
 	/* We found a negative match, and want to create it, so purge */
 	if (cnp->cn_nameiop == CREATE) {
 		numnegzaps++;
 		nchstats.ncs_badhits++;
 		if (!wlocked && !CACHE_UPGRADE_LOCK())
 			goto wlock;
 		cache_zap(ncp);
 		CACHE_WUNLOCK();
 		return (0);
 	}
 
 	/* Requeueing on ncneg below modifies the cache: write lock needed. */
 	if (!wlocked && !CACHE_UPGRADE_LOCK())
 		goto wlock;
 	numneghits++;
 	/*
 	 * We found a "negative" match, so we shift it to the end of
 	 * the "negative" cache entries queue to satisfy LRU.  Also,
 	 * check to see if the entry is a whiteout; indicate this to
 	 * the componentname, if so.
 	 */
 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 	nchstats.ncs_neghits++;
 	if (ncp->nc_flag & NCF_WHITE)
 		cnp->cn_flags |= ISWHITEOUT;
 	SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, nc_get_name(ncp),
 	    0, 0, 0);
 	cache_out_ts(ncp, tsp, ticksp);
 	CACHE_WUNLOCK();
 	return (ENOENT);
 
 wlock:
 	/*
 	 * We need to update the cache after our lookup, so upgrade to
 	 * a write lock and retry the operation.
 	 */
 	CACHE_RUNLOCK();
 	CACHE_WLOCK();
 	numupgrades++;
 	wlocked = 1;
 	goto retry_wlocked;
 
 success:
 	/*
 	 * On success we return a locked and ref'd vnode as per the lookup
 	 * protocol.
 	 */
 	if (dvp == *vpp) {   /* lookup on "." */
 		VREF(*vpp);
 		if (wlocked)
 			CACHE_WUNLOCK();
 		else
 			CACHE_RUNLOCK();
 		/*
 		 * When we lookup "." we still can be asked to lock it
 		 * differently...
 		 */
 		ltype = cnp->cn_lkflags & LK_TYPE_MASK;
 		if (ltype != VOP_ISLOCKED(*vpp)) {
 			if (ltype == LK_EXCLUSIVE) {
 				vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
 				if ((*vpp)->v_iflag & VI_DOOMED) {
 					/* forced unmount */
 					vrele(*vpp);
 					*vpp = NULL;
 					return (ENOENT);
 				}
 			} else
 				vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
 		}
 		return (-1);
 	}
 	ltype = 0;	/* silence gcc warning */
 	if (cnp->cn_flags & ISDOTDOT) {
 		/* Remember how dvp was locked so it can be relocked below. */
 		ltype = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp, 0);
 	}
 	/* Take the vnode interlock before the cache lock is dropped. */
 	VI_LOCK(*vpp);
 	if (wlocked)
 		CACHE_WUNLOCK();
 	else
 		CACHE_RUNLOCK();
 	error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
 	if (cnp->cn_flags & ISDOTDOT) {
 		vn_lock(dvp, ltype | LK_RETRY);
 		if (dvp->v_iflag & VI_DOOMED) {
 			/* Undo a successful vget() before failing. */
 			if (error == 0)
 				vput(*vpp);
 			*vpp = NULL;
 			return (ENOENT);
 		}
 	}
 	if (error) {
 		/* vget() failed: redo the whole lookup from scratch. */
 		*vpp = NULL;
 		goto retry;
 	}
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
 		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
 	}
 	return (-1);
 
 	/* Miss: release whichever lock mode is held and return 0. */
 unlock:
 	if (wlocked)
 		CACHE_WUNLOCK();
 	else
 		CACHE_RUNLOCK();
 	return (0);
 }
 
 /*
  * Add an entry to the cache.
  *
  * dvp is the directory and cnp the name within it.  vp is the vnode the
  * name resolves to, or NULL to record a negative entry.  tsp, when not
  * NULL, supplies a timestamp and makes the entry a struct namecache_ts;
  * dtsp optionally supplies the additional dotdot timestamp for it.
  *
  * Nothing is added when the cache is disabled, when the cache already
  * holds desiredvnodes * ncsizefactor entries, for the name ".", or when
  * an entry for (dvp, name) already exists (its timestamps are refreshed
  * instead).
  */
 void
 cache_enter_time(dvp, vp, cnp, tsp, dtsp)
 	struct vnode *dvp;
 	struct vnode *vp;
 	struct componentname *cnp;
 	struct timespec *tsp;
 	struct timespec *dtsp;
 {
 	struct namecache *ncp, *n2;
 	struct namecache_ts *n3;
 	struct nchashhead *ncpp;
 	uint32_t hash;
 	int flag;
 	int hold;
 	int zap;
 	int len;
 
 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
 	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
 	    ("cache_enter: Adding a doomed vnode"));
 	VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
 	    ("cache_enter: Doomed vnode used as src"));
 
 	if (!doingcache)
 		return;
 
 	/*
 	 * Avoid blowout in namecache entries.
 	 */
 	if (numcache >= desiredvnodes * ncsizefactor)
 		return;
 
 	flag = 0;
 	if (cnp->cn_nameptr[0] == '.') {
 		/* "." is never cached. */
 		if (cnp->cn_namelen == 1)
 			return;
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			CACHE_WLOCK();
 			/*
 			 * If dotdot entry already exists, just retarget it
 			 * to new parent vnode, otherwise continue with new
 			 * namecache entry allocation.
 			 */
 			if ((ncp = dvp->v_cache_dd) != NULL &&
 			    ncp->nc_flag & NCF_ISDOTDOT) {
 				KASSERT(ncp->nc_dvp == dvp,
 				    ("wrong isdotdot parent"));
 				/* Move it from its old target queue to the new one. */
 				if (ncp->nc_vp != NULL)
 					TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
 					    ncp, nc_dst);
 				else
 					TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 				if (vp != NULL)
 					TAILQ_INSERT_HEAD(&vp->v_cache_dst,
 					    ncp, nc_dst);
 				else
 					TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 				ncp->nc_vp = vp;
 				CACHE_WUNLOCK();
 				return;
 			}
 			dvp->v_cache_dd = NULL;
 			SDT_PROBE(vfs, namecache, enter, done, dvp, "..", vp,
 			    0, 0);
 			/* The lock is dropped here and retaken after allocation. */
 			CACHE_WUNLOCK();
 			flag = NCF_ISDOTDOT;
 		}
 	}
 
 	hold = 0;
 	zap = 0;
 
 	/*
 	 * Calculate the hash key and setup as much of the new
 	 * namecache entry as possible before acquiring the lock.
 	 */
 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
 	ncp->nc_vp = vp;
 	ncp->nc_dvp = dvp;
 	ncp->nc_flag = flag;
 	if (tsp != NULL) {
 		n3 = (struct namecache_ts *)ncp;
 		n3->nc_time = *tsp;
 		n3->nc_ticks = ticks;
 		n3->nc_flag |= NCF_TS;
 		if (dtsp != NULL) {
 			n3->nc_dotdottime = *dtsp;
 			n3->nc_flag |= NCF_DTS;
 		}
 	}
 	len = ncp->nc_nlen = cnp->cn_namelen;
 	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
 	/* NCF_TS is already set, so nc_get_name() picks the right layout. */
 	strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1);
 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 	CACHE_WLOCK();
 
 	/*
 	 * See if this vnode or negative entry is already in the cache
 	 * with this name.  This can happen with concurrent lookups of
 	 * the same path name.
 	 */
 	ncpp = NCHHASH(hash);
 	LIST_FOREACH(n2, ncpp, nc_hash) {
 		if (n2->nc_dvp == dvp &&
 		    n2->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) {
 			if (tsp != NULL) {
 				/* Refresh the existing entry's timestamps. */
 				KASSERT((n2->nc_flag & NCF_TS) != 0,
 				    ("no NCF_TS"));
 				n3 = (struct namecache_ts *)n2;
 				n3->nc_time =
 				    ((struct namecache_ts *)ncp)->nc_time;
 				n3->nc_ticks =
 				    ((struct namecache_ts *)ncp)->nc_ticks;
 				if (dtsp != NULL) {
 					n3->nc_dotdottime =
 					    ((struct namecache_ts *)ncp)->
 					    nc_dotdottime;
 					n3->nc_flag |= NCF_DTS;
 				}
 			}
 			/* The freshly allocated entry is not needed. */
 			CACHE_WUNLOCK();
 			cache_free(ncp);
 			return;
 		}
 	}
 
 	if (flag == NCF_ISDOTDOT) {
 		/*
 		 * See if we are trying to add .. entry, but some other lookup
 		 * has populated v_cache_dd pointer already.
 		 */
 		if (dvp->v_cache_dd != NULL) {
 		    CACHE_WUNLOCK();
 		    cache_free(ncp);
 		    return;
 		}
 		KASSERT(vp == NULL || vp->v_type == VDIR,
 		    ("wrong vnode type %p", vp));
 		dvp->v_cache_dd = ncp;
 	}
 
 	numcache++;
 	if (!vp) {
 		/* Negative entry; remember whether it is a whiteout. */
 		numneg++;
 		if (cnp->cn_flags & ISWHITEOUT)
 			ncp->nc_flag |= NCF_WHITE;
 	} else if (vp->v_type == VDIR) {
 		if (flag != NCF_ISDOTDOT) {
 			/*
 			 * For this case, the cache entry maps both the
 			 * directory name in it and the name ".." for the
 			 * directory's parent.
 			 */
 			if ((n2 = vp->v_cache_dd) != NULL &&
 			    (n2->nc_flag & NCF_ISDOTDOT) != 0)
 				cache_zap(n2);
 			vp->v_cache_dd = ncp;
 		}
 	} else {
 		vp->v_cache_dd = NULL;
 	}
 
 	/*
 	 * Insert the new namecache entry into the appropriate chain
 	 * within the cache entries table.
 	 */
 	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 	if (flag != NCF_ISDOTDOT) {
 		/*
 		 * The first entry sourced from dvp takes a hold on it;
 		 * cache_zap() drops the hold when the list empties again.
 		 */
 		if (LIST_EMPTY(&dvp->v_cache_src)) {
 			hold = 1;
 			numcachehv++;
 		}
 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 	}
 
 	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
 	 * destination vnode's cache entries queue.
 	 */
 	if (vp) {
 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 		SDT_PROBE(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
 		    vp, 0, 0);
 	} else {
 		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 		SDT_PROBE(vfs, namecache, enter_negative, done, dvp,
 		    nc_get_name(ncp), 0, 0, 0);
 	}
 	/*
 	 * Too many negative entries relative to the cache size: evict the
 	 * one at the head of ncneg.  Note that ncp is reused for the victim.
 	 */
 	if (numneg * ncnegfactor > numcache) {
 		ncp = TAILQ_FIRST(&ncneg);
 		zap = 1;
 	}
 	if (hold)
 		vhold(dvp);
 	if (zap)
 		cache_zap(ncp);
 	CACHE_WUNLOCK();
 }
 
 /*
  * One-time name cache setup, run through SYSINIT at SI_SUB_VFS.
  * Initializes the negative entry queue, creates the four UMA zones
  * (small/large name, with and without timestamps) and allocates the
  * hash table.
  */
 static void
 nchinit(void *dummy __unused)
 {
 	size_t large_name, small_name;
 
 	/* Room reserved for the name, including the terminating NUL. */
 	small_name = CACHE_PATH_CUTOFF + 1;
 	large_name = NAME_MAX + 1;
 
 	TAILQ_INIT(&ncneg);
 
 	cache_zone_small = uma_zcreate("S VFS Cache",
 	    sizeof(struct namecache) + small_name,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
 	    sizeof(struct namecache_ts) + small_name,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cache_zone_large = uma_zcreate("L VFS Cache",
 	    sizeof(struct namecache) + large_name,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
 	    sizeof(struct namecache_ts) + large_name,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 
 
 /*
  * Invalidate all entries to a particular vnode.
  */
 void
 cache_purge(vp)
 	struct vnode *vp;
 {
 
 	CTR1(KTR_VFS, "cache_purge(%p)", vp);
 	SDT_PROBE(vfs, namecache, purge, done, vp, 0, 0, 0, 0);
 	CACHE_WLOCK();
 	while (!LIST_EMPTY(&vp->v_cache_src))
 		cache_zap(LIST_FIRST(&vp->v_cache_src));
 	while (!TAILQ_EMPTY(&vp->v_cache_dst))
 		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
 	if (vp->v_cache_dd != NULL) {
 		KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT,
 		   ("lost dotdot link"));
 		cache_zap(vp->v_cache_dd);
 	}
 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
 	CACHE_WUNLOCK();
 }
 
 /*
  * Invalidate all negative entries for a particular directory vnode.
  */
 void
 cache_purge_negative(vp)
 	struct vnode *vp;
 {
 	struct namecache *cp, *ncp;
 
 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
 	SDT_PROBE(vfs, namecache, purge_negative, done, vp, 0, 0, 0, 0);
 	CACHE_WLOCK();
 	LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
 		if (cp->nc_vp == NULL)
 			cache_zap(cp);
 	}
 	CACHE_WUNLOCK();
 }
 
 /*
  * Flush all entries referencing a particular filesystem.
  */
 void
 cache_purgevfs(mp)
 	struct mount *mp;
 {
 	struct nchashhead *ncpp;
 	struct namecache *ncp, *nnp;
 
 	/* Scan hash tables for applicable entries */
 	SDT_PROBE(vfs, namecache, purgevfs, done, mp, 0, 0, 0, 0);
 	CACHE_WLOCK();
 	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
 		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
 			if (ncp->nc_dvp->v_mount == mp)
 				cache_zap(ncp);
 		}
 	}
 	CACHE_WUNLOCK();
 }
 
 /*
  * Perform canonical checks and cache lookup and pass on to filesystem
  * through the vop_cachedlookup only if needed.
  */
 
 int
 vfs_cache_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *dvp;
 	int error;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int flags = cnp->cn_flags;
 	struct thread *td = cnp->cn_thread;
 
 	*vpp = NULL;
 	dvp = ap->a_dvp;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 
 	error = VOP_ACCESS(dvp, VEXEC, cred, td);
 	if (error)
 		return (error);
 
 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 	if (error == 0)
 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 	if (error == -1)
 		return (0);
 	return (error);
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 /* Arguments of the __getcwd syscall, as consumed by sys___getcwd(). */
 struct  __getcwd_args {
 	u_char	*buf;		/* where the path is copied out to */
 	u_int	buflen;		/* size of buf */
 };
 #endif
 
 /*
  * XXX All of these sysctls would probably be more productive dead.
  */
 /* When set to non-zero, kern___getcwd() fails with ENODEV. */
 static int disablecwd;
 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");
 
 /*
  * Implementation of the getcwd syscall.
  *
  * Thin wrapper: forwards the user buffer and its length to
  * kern___getcwd(), marking the buffer as living in user space, and
  * returns that function's result.
  */
 int
 sys___getcwd(td, uap)
 	struct thread *td;
 	struct __getcwd_args *uap;
 {
 
 	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
 }
 
 /*
  * Build the path of the calling process's current directory, relative
  * to its root directory, and store it (NUL included) in buf.  bufseg
  * tells whether buf is a kernel or a user address.
  *
  * Returns ENODEV when debug.disablecwd is set, EINVAL when buflen is
  * below 2, or the error from vn_fullpath1() or copyout().  buflen is
  * silently capped at MAXPATHLEN.
  */
 int
 kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
 {
 	struct filedesc *fdp;
 	struct vnode *cdir, *rdir;
 	char *path, *scratch;
 	size_t pathsz;
 	int error;
 
 	if (disablecwd)
 		return (ENODEV);
 	if (buflen < 2)
 		return (EINVAL);
 	if (buflen > MAXPATHLEN)
 		buflen = MAXPATHLEN;
 
 	scratch = malloc(buflen, M_TEMP, M_WAITOK);
 
 	/* Reference both directories while the descriptor table is locked. */
 	fdp = td->td_proc->p_fd;
 	FILEDESC_SLOCK(fdp);
 	cdir = fdp->fd_cdir;
 	VREF(cdir);
 	rdir = fdp->fd_rdir;
 	VREF(rdir);
 	FILEDESC_SUNLOCK(fdp);
 
 	error = vn_fullpath1(td, cdir, rdir, scratch, &path, buflen);
 	vrele(rdir);
 	vrele(cdir);
 
 	if (error == 0) {
 		/* path points into scratch; copy it out with its NUL. */
 		pathsz = strlen(path) + 1;
 		if (bufseg == UIO_SYSSPACE)
 			bcopy(path, buf, pathsz);
 		else
 			error = copyout(path, buf, pathsz);
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_NAMEI))
 			ktrnamei(path);
 #endif
 	}
 	free(scratch, M_TEMP);
 	return (error);
 }
 
 /*
  * Thus begins the fullpath magic.
  */
 
 /*
  * From here on STATNODE() both defines a u_int counter and exports it
  * read-only under the vfs.cache sysctl node.
  */
 #undef STATNODE
 #define STATNODE(name, descr)						\
 	static u_int name;						\
 	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr)
 
 /* When set to non-zero, the vn_fullpath*() entry points fail with ENODEV. */
 static int disablefullpath;
 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
     "Disable the vn_fullpath function");
 
 /* These count for kern___getcwd(), too. */
 STATNODE(numfullpathcalls, "Number of fullpath search calls");
 STATNODE(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 STATNODE(numfullpathfail2,
     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 STATNODE(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 STATNODE(numfullpathfound, "Number of successful fullpath calls");
 
 /*
  * Retrieve the full filesystem path that correspond to a vnode from the name
  * cache (if available)
  */
 int
 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
 {
 	char *buf;
 	struct filedesc *fdp;
 	struct vnode *rdir;
 	int error;
 
 	if (disablefullpath)
 		return (ENODEV);
 	if (vn == NULL)
 		return (EINVAL);
 
 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_SLOCK(fdp);
 	rdir = fdp->fd_rdir;
 	VREF(rdir);
 	FILEDESC_SUNLOCK(fdp);
 	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
 	vrele(rdir);
 
 	if (!error)
 		*freebuf = buf;
 	else
 		free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * This function is similar to vn_fullpath, but it attempts to lookup the
  * pathname relative to the global root mount point.  This is required for the
  * auditing sub-system, as audited pathnames must be absolute, relative to the
  * global root mount point.
  */
 int
 vn_fullpath_global(struct thread *td, struct vnode *vn,
     char **retbuf, char **freebuf)
 {
 	char *buf;
 	int error;
 
 	if (disablefullpath)
 		return (ENODEV);
 	if (vn == NULL)
 		return (EINVAL);
 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
 	if (!error)
 		*freebuf = buf;
 	else
 		free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * Unlocked entry point for vn_vptocnp_locked(): takes the cache read
  * lock around the call.  The locked variant has already released the
  * lock on every error return, so it is released here only on success.
  */
 int
 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
 {
 	int error;
 
 	CACHE_RLOCK();
 	error = vn_vptocnp_locked(vp, cred, buf, buflen);
 	if (error != 0)
 		return (error);
 	CACHE_RUNLOCK();
 	return (0);
 }
 
 /*
  * Resolve one path component: find a name for *vp in its parent
  * directory and step *vp up to that parent.
  *
  * Called with the cache read lock held and a reference on *vp.  The
  * name is stored just below offset *buflen in buf (the buffer is filled
  * from the end towards the start) and *buflen is lowered accordingly.
  *
  * On success 0 is returned with the cache read lock held, the reference
  * on the old *vp released and *vp replaced by the referenced parent.
  * On failure the lock has been dropped and the vnode reference released
  * before the error is returned.
  */
 static int
 vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
     u_int *buflen)
 {
 	struct vnode *dvp;
 	struct namecache *ncp;
 	int error;
 
 	/* Pick the first entry resolving to *vp that is not a ".." entry. */
 	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 	}
 	if (ncp != NULL) {
 		if (*buflen < ncp->nc_nlen) {
 			/* The name does not fit in what is left of buf. */
 			CACHE_RUNLOCK();
 			vrele(*vp);
 			numfullpathfail4++;
 			error = ENOMEM;
 			SDT_PROBE(vfs, namecache, fullpath, return, error,
 			    vp, NULL, 0, 0);
 			return (error);
 		}
 		*buflen -= ncp->nc_nlen;
 		memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen);
 		SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
 		    nc_get_name(ncp), vp, 0, 0);
 		dvp = *vp;
 		*vp = ncp->nc_dvp;
 		vref(*vp);
 		/* The old vnode is released with the cache lock dropped. */
 		CACHE_RUNLOCK();
 		vrele(dvp);
 		CACHE_RLOCK();
 		return (0);
 	}
 	SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0, 0, 0);
 
 	/* Not in the cache: ask the filesystem, without the cache lock. */
 	CACHE_RUNLOCK();
 	vn_lock(*vp, LK_SHARED | LK_RETRY);
 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
 	/* Unlock and release the old vnode whether or not that worked. */
 	vput(*vp);
 	if (error) {
 		numfullpathfail2++;
 		SDT_PROBE(vfs, namecache, fullpath, return,  error, vp,
 		    NULL, 0, 0);
 		return (error);
 	}
 
 	*vp = dvp;
 	CACHE_RLOCK();
 	if (dvp->v_iflag & VI_DOOMED) {
 		/* forced unmount */
 		CACHE_RUNLOCK();
 		vrele(dvp);
 		error = ENOENT;
 		SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
 		    NULL, 0, 0);
 		return (error);
 	}
 	/*
 	 * *vp has its use count incremented still.
 	 */
 
 	return (0);
 }
 
 /*
  * The magic behind kern___getcwd() and vn_fullpath().
  *
  * Builds the path of vp relative to rdir by walking up one component at
  * a time with vn_vptocnp_locked(), stopping at rdir or at the global
  * root vnode.  The path is assembled backwards from the end of buf
  * (buflen bytes long); on success *retbuf points at its first character
  * inside buf and 0 is returned.
  *
  * The function takes its own reference on vp and releases it before
  * returning, so the caller's reference is untouched.
  */
 static int
 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen)
 {
 	int error, slash_prefixed;
 #ifdef KDTRACE_HOOKS
 	struct vnode *startvp = vp;
 #endif
 	struct vnode *vp1;
 
 	/* Terminate the string; buflen now indexes the first used byte. */
 	buflen--;
 	buf[buflen] = '\0';
 	error = 0;
 	slash_prefixed = 0;
 
 	SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0);
 	numfullpathcalls++;
 	vref(vp);
 	CACHE_RLOCK();
 	if (vp->v_type != VDIR) {
 		/* Resolve the non-directory leaf first, then walk directories. */
 		error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
 		/* On error the callee already dropped the lock and the ref. */
 		if (error)
 			return (error);
 		if (buflen == 0) {
 			CACHE_RUNLOCK();
 			vrele(vp);
 			return (ENOMEM);
 		}
 		buf[--buflen] = '/';
 		slash_prefixed = 1;
 	}
 	while (vp != rdir && vp != rootvnode) {
 		if (vp->v_vflag & VV_ROOT) {
 			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
 				CACHE_RUNLOCK();
 				vrele(vp);
 				error = ENOENT;
 				SDT_PROBE(vfs, namecache, fullpath, return,
 				    error, vp, NULL, 0, 0);
 				break;
 			}
 			/* Root of a mount: continue from the vnode it covers. */
 			vp1 = vp->v_mount->mnt_vnodecovered;
 			vref(vp1);
 			CACHE_RUNLOCK();
 			vrele(vp);
 			vp = vp1;
 			CACHE_RLOCK();
 			continue;
 		}
 		if (vp->v_type != VDIR) {
 			CACHE_RUNLOCK();
 			vrele(vp);
 			numfullpathfail1++;
 			error = ENOTDIR;
 			SDT_PROBE(vfs, namecache, fullpath, return,
 			    error, vp, NULL, 0, 0);
 			break;
 		}
 		error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
 		/* On error the callee already dropped the lock and the ref. */
 		if (error)
 			break;
 		if (buflen == 0) {
 			CACHE_RUNLOCK();
 			vrele(vp);
 			error = ENOMEM;
 			SDT_PROBE(vfs, namecache, fullpath, return, error,
 			    startvp, NULL, 0, 0);
 			break;
 		}
 		buf[--buflen] = '/';
 		slash_prefixed = 1;
 	}
 	/* Every error path above has released the lock and the reference. */
 	if (error)
 		return (error);
 	if (!slash_prefixed) {
 		/* No component was added: the result is just "/". */
 		if (buflen == 0) {
 			CACHE_RUNLOCK();
 			vrele(vp);
 			numfullpathfail4++;
 			SDT_PROBE(vfs, namecache, fullpath, return, ENOMEM,
 			    startvp, NULL, 0, 0);
 			return (ENOMEM);
 		}
 		buf[--buflen] = '/';
 	}
 	numfullpathfound++;
 	CACHE_RUNLOCK();
 	vrele(vp);
 
 	SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, buf + buflen,
 	    0, 0);
 	*retbuf = buf + buflen;
 	return (0);
 }
 
 struct vnode *
 vn_dir_dd_ino(struct vnode *vp)
 {
 	struct namecache *ncp;
 	struct vnode *ddvp;
 
 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
 	CACHE_RLOCK();
 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 			continue;
 		ddvp = ncp->nc_dvp;
 		VI_LOCK(ddvp);
 		CACHE_RUNLOCK();
 		if (vget(ddvp, LK_INTERLOCK | LK_SHARED | LK_NOWAIT, curthread))
 			return (NULL);
 		return (ddvp);
 	}
 	CACHE_RUNLOCK();
 	return (NULL);
 }
 
 /*
  * Copy a cached name of vp into buf as a NUL-terminated string.
  *
  * The name comes from the first cache entry resolving to vp that is not
  * a ".." entry.  It is truncated to fit when longer than buflen - 1.
  *
  * Returns 0 on success, ENOENT when the cache holds no such entry, and
  * EINVAL when buflen is zero (there is no room even for the NUL).
  */
 int
 vn_commname(struct vnode *vp, char *buf, u_int buflen)
 {
 	struct namecache *ncp;
 	int l;
 
 	/*
 	 * buflen is unsigned, so "buflen - 1" below would wrap to a huge
 	 * value for an empty buffer and the copy would overrun it.
 	 */
 	if (buflen == 0)
 		return (EINVAL);
 
 	CACHE_RLOCK();
 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 	if (ncp == NULL) {
 		CACHE_RUNLOCK();
 		return (ENOENT);
 	}
 	l = min(ncp->nc_nlen, buflen - 1);
 	/* The entry may go away once the lock is dropped; copy first. */
 	memcpy(buf, nc_get_name(ncp), l);
 	CACHE_RUNLOCK();
 	buf[l] = '\0';
 	return (0);
 }
 
 /* ABI compat shims for old kernel modules. */
 /* Remove any macro named cache_enter so a real function can be defined. */
 #undef cache_enter
 
 void	cache_enter(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp);
 
 /*
  * Add an entry without timestamps: identical to cache_enter_time()
  * called with NULL for both timestamp arguments.
  */
 void
 cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 
 	cache_enter_time(dvp, vp, cnp, NULL, NULL);
 }
 
 /*
  * This function updates path string to vnode's full global path
  * and checks the size of the new path string against the pathlen argument.
  *
  * Requires a locked, referenced vnode and GIANT lock held.
  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
  *
  * If sysctl debug.disablefullpath is set, ENODEV is returned,
  * vnode is left locked and path remains untouched.
  *
  * If vp is a directory, the call to vn_fullpath_global() always succeeds
  * because it falls back to the ".." lookup if the namecache lookup fails.
  *
  * On every failure other than ENODEV the caller's reference on vp has
  * been released.  ENAMETOOLONG is returned when the global path does
  * not fit in pathlen bytes, and ENOENT when looking the original path
  * up again yields a different vnode.
  */
 int
 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
     u_int pathlen)
 {
 	struct nameidata nd;
 	struct vnode *vp1;
 	char *rpath, *fbuf;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	/* Return ENODEV if sysctl debug.disablefullpath==1 */
 	if (disablefullpath)
 		return (ENODEV);
 
 	/* Construct global filesystem path from vp. */
 	VOP_UNLOCK(vp, 0);
 	error = vn_fullpath_global(td, vp, &rpath, &fbuf);
 
 	if (error != 0) {
 		/* No buffer was handed back, so there is nothing to free. */
 		vrele(vp);
 		return (error);
 	}
 
 	/* The result, NUL included, must fit in the caller's buffer. */
 	if (strlen(rpath) >= pathlen) {
 		vrele(vp);
 		error = ENAMETOOLONG;
 		goto out;
 	}
 
 	/*
 	 * Re-lookup the vnode by path to detect a possible rename.
 	 * As a side effect, the vnode is relocked.
 	 * If vnode was renamed, return ENOENT.
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, path, td);
 	error = namei(&nd);
 	if (error != 0) {
 		vrele(vp);
 		goto out;
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp1 = nd.ni_vp;
 	/* Drop the original reference; namei() returned its own on vp1. */
 	vrele(vp);
 	if (vp1 == vp)
 		strcpy(path, rpath);
 	else {
 		vput(vp1);
 		error = ENOENT;
 	}
 
 out:
 	free(fbuf, M_TEMP);
 	return (error);
 }
Index: head/sys/kern/vfs_lookup.c
===================================================================
--- head/sys/kern/vfs_lookup.c	(revision 258540)
+++ head/sys/kern/vfs_lookup.c	(revision 258541)
@@ -1,1249 +1,1248 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
-#include "opt_kdtrace.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/capability.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/filedesc.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/uma.h>
 
 #define	NAMEI_DIAGNOSTIC 1
 #undef NAMEI_DIAGNOSTIC
 
 /*
  * Static tracing probes fired by namei().  The "entry" probe is passed the
  * starting directory vnode, the pathname buffer and the lookup flags; the
  * "return" probe is passed the error code and the resulting vnode (NULL on
  * failure).
  */
 SDT_PROVIDER_DECLARE(vfs);
 SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, entry, "struct vnode *", "char *",
     "unsigned long");
 SDT_PROBE_DEFINE2(vfs, namei, lookup, return, return, "int", "struct vnode *");
 
 /*
  * Allocation zone for namei pathname buffers; items are MAXPATHLEN bytes
  * (see nameiinit()).
  */
 uma_zone_t namei_zone;
 /*
  * Placeholder vnode for mp traversal: lookup() installs it as ni_dvp while
  * stepping from a covered directory onto the root of the mounted filesystem.
  */
 static struct vnode *vp_crossmp;
 
 /*
  * One-time initialization of the name lookup code, registered through
  * SYSINIT below: creates the pathname buffer zone and the "crossmp"
  * placeholder vnode used by lookup() when crossing mount points.
  */
 static void
 nameiinit(void *dummy __unused)
 {
 
 	/* Each zone item is one MAXPATHLEN-sized pathname buffer. */
 	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 	/* The placeholder belongs to no mount and uses dead_vnodeops. */
 	getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp);
 	/*
 	 * NOTE(review): VN_LOCK_ASHARE presumably permits shared locking of
 	 * the placeholder; lookup() later takes it with LK_SHARED -- confirm.
 	 */
 	vn_lock(vp_crossmp, LK_EXCLUSIVE);
 	VN_LOCK_ASHARE(vp_crossmp);
 	VOP_UNLOCK(vp_crossmp, 0);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
 
 /*
  * vfs.lookup_shared: when zero, namei() strips LOCKSHARED from the request
  * and lookup() starts out with LK_EXCLUSIVE instead of LK_SHARED.  Exported
  * as a read-write sysctl and also registered as a tunable of the same name.
  */
 static int lookup_shared = 1;
 SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
     "Enables/Disables shared locks for path name translation");
 TUNABLE_INT("vfs.lookup_shared", &lookup_shared);
 
 /*
  * Convert a pathname into a pointer to a locked vnode.
  *
  * The FOLLOW flag is set when symbolic links are to be followed
  * when they occur at the end of the name translation process.
  * Symbolic links are always followed for all other pathname
  * components other than the last.
  *
  * The segflg defines whether the name is to be copied from user
  * space or kernel space.
  *
  * Overall outline of namei:
  *
  *	copy in name
  *	get starting directory
  *	while (!done && !error) {
  *		call lookup to search path.
  *		if symbolic link, massage name in buffer and continue
  *	}
  */
 int
 namei(struct nameidata *ndp)
 {
 	struct filedesc *fdp;	/* pointer to file descriptor state */
 	char *cp;		/* pointer into pathname argument */
 	struct vnode *dp;	/* the directory we are searching */
 	struct iovec aiov;		/* uio for reading symbolic links */
 	struct uio auio;
 	int error, linklen;
 	struct componentname *cnp = &ndp->ni_cnd;
 	struct thread *td = cnp->cn_thread;
 	struct proc *p = td->td_proc;
 
 	/* The lookup always runs with the requesting thread's credentials. */
 	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
 	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
 	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
 	    ("namei: nameiop contaminated with flags"));
 	KASSERT((cnp->cn_flags & OPMASK) == 0,
 	    ("namei: flags contaminated with nameiops"));
 	if (!lookup_shared)
 		cnp->cn_flags &= ~LOCKSHARED;
 	fdp = p->p_fd;
 
 	/* We will set this ourselves if we need it. */
 	cnp->cn_flags &= ~TRAILINGSLASH;
 
 	/*
 	 * Get a buffer for the name to be translated, and copy the
 	 * name into the buffer.
 	 */
 	if ((cnp->cn_flags & HASBUF) == 0)
 		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
 	if (ndp->ni_segflg == UIO_SYSSPACE)
 		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
 			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
 	else
 		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
 			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
 
 	/*
 	 * Don't allow empty pathnames.
 	 */
 	if (!error && *cnp->cn_pnbuf == '\0')
 		error = ENOENT;
 
 #ifdef CAPABILITY_MODE
 	/*
 	 * In capability mode, lookups must be "strictly relative" (i.e.
 	 * not an absolute path, and not containing '..' components) to
 	 * a real file descriptor, not the pseudo-descriptor AT_FDCWD.
 	 */
 	if (error == 0 && IN_CAPABILITY_MODE(td) &&
 	    (cnp->cn_flags & NOCAPCHECK) == 0) {
 		ndp->ni_strictrelative = 1;
 		if (ndp->ni_dirfd == AT_FDCWD) {
 #ifdef KTRACE
 			if (KTRPOINT(td, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 			error = ECAPMODE;
 		}
 	}
 #endif
 	if (error) {
 		uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 		cnp->cn_pnbuf = NULL;
 		cnp->cn_nameptr = NULL;
 #endif
 		ndp->ni_vp = NULL;
 		return (error);
 	}
 	ndp->ni_loopcnt = 0;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_NAMEI)) {
 		KASSERT(cnp->cn_thread == curthread,
 		    ("namei not using curthread"));
 		ktrnamei(cnp->cn_pnbuf);
 	}
 #endif
 	/*
 	 * Get starting point for the translation.
 	 */
 	FILEDESC_SLOCK(fdp);
 	ndp->ni_rootdir = fdp->fd_rdir;
 	ndp->ni_topdir = fdp->fd_jdir;
 
 	/*
 	 * If we are auditing the kernel pathname, save the user pathname.
 	 */
 	if (cnp->cn_flags & AUDITVNODE1)
 		AUDIT_ARG_UPATH1(td, ndp->ni_dirfd, cnp->cn_pnbuf);
 	if (cnp->cn_flags & AUDITVNODE2)
 		AUDIT_ARG_UPATH2(td, ndp->ni_dirfd, cnp->cn_pnbuf);
 
 	/*
 	 * Relative paths start at the caller-supplied ni_startdir, else at
 	 * the directory behind ni_dirfd, else (dp still NULL below) at the
 	 * current working directory.
 	 */
 	dp = NULL;
 	if (cnp->cn_pnbuf[0] != '/') {
 		if (ndp->ni_startdir != NULL) {
 			dp = ndp->ni_startdir;
 			error = 0;
 		} else if (ndp->ni_dirfd != AT_FDCWD) {
 			cap_rights_t rights;
 
 			rights = ndp->ni_rightsneeded;
 			cap_rights_set(&rights, CAP_LOOKUP);
 
 			if (cnp->cn_flags & AUDITVNODE1)
 				AUDIT_ARG_ATFD1(ndp->ni_dirfd);
 			if (cnp->cn_flags & AUDITVNODE2)
 				AUDIT_ARG_ATFD2(ndp->ni_dirfd);
 			error = fgetvp_rights(td, ndp->ni_dirfd,
 			    &rights, &ndp->ni_filecaps, &dp);
 #ifdef CAPABILITIES
 			/*
 			 * If file descriptor doesn't have all rights,
 			 * all lookups relative to it must also be
 			 * strictly relative.
 			 */
 			CAP_ALL(&rights);
 			if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
 			    &rights) ||
 			    ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
 			    ndp->ni_filecaps.fc_nioctls != -1) {
 				ndp->ni_strictrelative = 1;
 			}
 #endif
 		}
 		if (error != 0 || dp != NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			if (error == 0 && dp->v_type != VDIR) {
 				vrele(dp);
 				error = ENOTDIR;
 			}
 		}
 		if (error) {
 			uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 			cnp->cn_pnbuf = NULL;
 			cnp->cn_nameptr = NULL;
 #endif
 			return (error);
 		}
 	}
 	if (dp == NULL) {
 		/* The filedesc lock is still held on this path. */
 		dp = fdp->fd_cdir;
 		VREF(dp);
 		FILEDESC_SUNLOCK(fdp);
 		if (ndp->ni_startdir != NULL)
 			vrele(ndp->ni_startdir);
 	}
 	SDT_PROBE(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
 	    cnp->cn_flags, 0, 0);
 	for (;;) {
 		/*
 		 * Check if root directory should replace current directory.
 		 * Done at start of translation and after symbolic link.
 		 */
 		cnp->cn_nameptr = cnp->cn_pnbuf;
 		if (*(cnp->cn_nameptr) == '/') {
 			vrele(dp);
 			if (ndp->ni_strictrelative != 0) {
 #ifdef KTRACE
 				if (KTRPOINT(curthread, KTR_CAPFAIL))
 					ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 				/*
 				 * Release the pathname buffer as every other
 				 * error exit does; returning without doing
 				 * so leaked the namei_zone item.
 				 */
 				uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 				cnp->cn_pnbuf = NULL;
 				cnp->cn_nameptr = NULL;
 #endif
 				return (ENOTCAPABLE);
 			}
 			while (*(cnp->cn_nameptr) == '/') {
 				cnp->cn_nameptr++;
 				ndp->ni_pathlen--;
 			}
 			dp = ndp->ni_rootdir;
 			VREF(dp);
 		}
 		ndp->ni_startdir = dp;
 		error = lookup(ndp);
 		if (error) {
 			uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 			cnp->cn_pnbuf = NULL;
 			cnp->cn_nameptr = NULL;
 #endif
 			SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0,
 			    0, 0);
 			return (error);
 		}
 		/*
 		 * If not a symbolic link, we're done.
 		 */
 		if ((cnp->cn_flags & ISSYMLINK) == 0) {
 			/*
 			 * The buffer is handed to the caller (HASBUF) only
 			 * when SAVENAME or SAVESTART was requested.
 			 */
 			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
 				uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 				cnp->cn_pnbuf = NULL;
 				cnp->cn_nameptr = NULL;
 #endif
 			} else
 				cnp->cn_flags |= HASBUF;
 
 			SDT_PROBE(vfs, namei, lookup, return, 0, ndp->ni_vp,
 			    0, 0, 0);
 			return (0);
 		}
 		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
 			error = ELOOP;
 			break;
 		}
 #ifdef MAC
 		if ((cnp->cn_flags & NOMACCHECK) == 0) {
 			error = mac_vnode_check_readlink(td->td_ucred,
 			    ndp->ni_vp);
 			if (error)
 				break;
 		}
 #endif
 		/*
 		 * With path left after the link a second buffer is needed so
 		 * the remainder can be appended to the link text; otherwise
 		 * the link is read straight into the existing buffer.
 		 */
 		if (ndp->ni_pathlen > 1)
 			cp = uma_zalloc(namei_zone, M_WAITOK);
 		else
 			cp = cnp->cn_pnbuf;
 		aiov.iov_base = cp;
 		aiov.iov_len = MAXPATHLEN;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_td = td;
 		auio.uio_resid = MAXPATHLEN;
 		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
 		if (error) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			break;
 		}
 		linklen = MAXPATHLEN - auio.uio_resid;
 		if (linklen == 0) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			error = ENOENT;
 			break;
 		}
 		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			error = ENAMETOOLONG;
 			break;
 		}
 		if (ndp->ni_pathlen > 1) {
 			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
 			uma_zfree(namei_zone, cnp->cn_pnbuf);
 			cnp->cn_pnbuf = cp;
 		} else
 			cnp->cn_pnbuf[linklen] = '\0';
 		ndp->ni_pathlen += linklen;
 		vput(ndp->ni_vp);
 		/* Continue from the directory that contained the link. */
 		dp = ndp->ni_dvp;
 	}
 	/* Only reached via "break": symlink expansion failed. */
 	uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 	cnp->cn_pnbuf = NULL;
 	cnp->cn_nameptr = NULL;
 #endif
 	vput(ndp->ni_vp);
 	ndp->ni_vp = NULL;
 	vrele(ndp->ni_dvp);
 	SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0, 0, 0);
 	return (error);
 }
 
 /*
  * Adjust the lock flags for a lookup step on mount mp.  A shared request is
  * turned into an exclusive one when the mount does not allow shared lookups
  * (MNTK_LOOKUP_SHARED clear) or when it demands exclusive ".." lookups
  * (MNTK_LOOKUP_EXCL_DOTDOT) and cnflags has ISDOTDOT set.  With no mount at
  * all LK_EXCLUSIVE is always set.  Returns the adjusted flags.
  */
 static int
 compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags)
 {
 	int force_excl;
 
 	if (mp == NULL)
 		force_excl = 1;
 	else if ((lkflags & LK_SHARED) == 0)
 		force_excl = 0;
 	else if ((mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) == 0)
 		force_excl = 1;
 	else
 		force_excl = ((cnflags & ISDOTDOT) != 0 &&
 		    (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT) != 0);
 
 	if (force_excl)
 		lkflags = (lkflags & ~LK_SHARED) | LK_EXCLUSIVE;
 	return (lkflags);
 }
 
 /*
  * Decide whether the vnode of the component being looked up must be locked
  * exclusively.  Returns 1 if so, 0 if a shared lock is sufficient.
  */
 static __inline int
 needs_exclusive_leaf(struct mount *mp, int flags)
 {
 	const int leafbits = ISLASTCN | LOCKLEAF;
 
 	/*
 	 * Only a last component that is returned locked can require an
 	 * exclusive lock; intermediate nodes may stay shared.
 	 */
 	if ((flags & leafbits) != leafbits)
 		return (0);
 
 	/* Without LOCKSHARED the caller expects an exclusive lock. */
 	if ((flags & LOCKSHARED) == 0)
 		return (1);
 
 	/*
 	 * LOCKSHARED was given.  Outside of open() a shared lock on the
 	 * leaf is enough; for open() it is enough only if the mount
 	 * supports extended shared operations.
 	 */
 	return ((flags & ISOPEN) != 0 && !MNT_EXTENDED_SHARED(mp));
 }
 
 /*
  * Search a pathname.
  * This is a very central and rather complicated routine.
  *
  * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
  * The starting directory is taken from ni_startdir. The pathname is
  * descended until done, or a symbolic link is encountered. The variable
  * ni_more is clear if the path is completed; it is set to one if a
  * symbolic link needing interpretation is encountered.
  *
  * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
  * whether the name is to be looked up, created, renamed, or deleted.
  * When CREATE, RENAME, or DELETE is specified, information usable in
  * creating, renaming, or deleting a directory entry may be calculated.
  * If flag has LOCKPARENT or'ed into it, the parent directory is returned
  * locked. If flag has WANTPARENT or'ed into it, the parent directory is
  * returned unlocked. Otherwise the parent directory is not returned. If
  * the target of the pathname exists and LOCKLEAF is or'ed into the flag
  * the target is returned locked, otherwise it is returned unlocked.
  * When creating or renaming and LOCKPARENT is specified, the target may not
  * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
  *
  * Overall outline of lookup:
  *
  * dirloop:
  *	identify next component of name at ndp->ni_ptr
  *	handle degenerate case where name is null string
  *	if .. and crossing mount points and on mounted filesys, find parent
  *	call VOP_LOOKUP routine for next component name
  *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
  *	    component vnode returned in ni_vp (if it exists), locked.
  *	if result vnode is mounted on and crossing mount points,
  *	    find mounted on vnode
  *	if more components of name, do next level at dirloop
  *	return the answer in ni_vp, locked if LOCKLEAF set
  *	    if LOCKPARENT set, return locked parent in ni_dvp
  *	    if WANTPARENT set, return unlocked parent in ni_dvp
  */
 int
 lookup(struct nameidata *ndp)
 {
 	char *cp;		/* pointer into pathname argument */
 	struct vnode *dp = 0;	/* the directory we are searching */
 	struct vnode *tdp;		/* saved dp */
 	struct mount *mp;		/* mount table entry */
 	struct prison *pr;
 	int docache;			/* == 0 do not cache last component */
 	int wantparent;			/* 1 => wantparent or lockparent flag */
 	int rdonly;			/* lookup read-only flag bit */
 	int error = 0;
 	int dpunlocked = 0;		/* dp has already been unlocked */
 	struct componentname *cnp = &ndp->ni_cnd;
 	int lkflags_save;		/* cn_lkflags before per-mount adjust */
 	int ni_dvp_unlocked;		/* 0 locked, 1 unlocked, 2 released */
 	
 	/*
 	 * Setup: break out flag bits into variables.
 	 */
 	ni_dvp_unlocked = 0;
 	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
 	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
 	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
 	/* docache is non-zero exactly when the caller did not set NOCACHE. */
 	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
 	if (cnp->cn_nameiop == DELETE ||
 	    (wantparent && cnp->cn_nameiop != CREATE &&
 	     cnp->cn_nameiop != LOOKUP))
 		docache = 0;
 	rdonly = cnp->cn_flags & RDONLY;
 	cnp->cn_flags &= ~ISSYMLINK;
 	ndp->ni_dvp = NULL;
 	/*
 	 * We use shared locks until we hit the parent of the last cn then
 	 * we adjust based on the requesting flags.
 	 */
 	if (lookup_shared)
 		cnp->cn_lkflags = LK_SHARED;
 	else
 		cnp->cn_lkflags = LK_EXCLUSIVE;
 	/*
 	 * From here on dp is held locked; the error exits at "bad" release
 	 * it with vput() unless dpunlocked says that already happened.
 	 */
 	dp = ndp->ni_startdir;
 	ndp->ni_startdir = NULLVP;
 	vn_lock(dp,
 	    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY,
 	    cnp->cn_flags));
 
 dirloop:
 	/*
 	 * Search a new directory.
 	 *
 	 * The last component of the filename is left accessible via
 	 * cnp->cn_nameptr for callers that need the name. Callers needing
 	 * the name set the SAVENAME flag. When done, they assume
 	 * responsibility for freeing the pathname buffer.
 	 */
 	cnp->cn_consume = 0;
 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
 		continue;
 	cnp->cn_namelen = cp - cnp->cn_nameptr;
 	if (cnp->cn_namelen > NAME_MAX) {
 		error = ENAMETOOLONG;
 		goto bad;
 	}
 #ifdef NAMEI_DIAGNOSTIC
 	{ char c = *cp;
 	*cp = '\0';
 	printf("{%s}: ", cnp->cn_nameptr);
 	*cp = c; }
 #endif
 	ndp->ni_pathlen -= cnp->cn_namelen;
 	ndp->ni_next = cp;
 
 	/*
 	 * Replace multiple slashes by a single slash and trailing slashes
 	 * by a null.  This must be done before VOP_LOOKUP() because some
 	 * fs's don't know about trailing slashes.  Remember if there were
 	 * trailing slashes to handle symlinks, existing non-directories
 	 * and non-existing files that won't be directories specially later.
 	 */
 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
 		cp++;
 		ndp->ni_pathlen--;
 		if (*cp == '\0') {
 			*ndp->ni_next = '\0';
 			cnp->cn_flags |= TRAILINGSLASH;
 		}
 	}
 	ndp->ni_next = cp;
 
 	/*
 	 * Per-component flags: MAKEENTRY is set except for a last component
 	 * with caching disabled; ISDOTDOT and ISLASTCN describe the current
 	 * component only and are recomputed on every pass.
 	 */
 	cnp->cn_flags |= MAKEENTRY;
 	if (*cp == '\0' && docache == 0)
 		cnp->cn_flags &= ~MAKEENTRY;
 	if (cnp->cn_namelen == 2 &&
 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
 		cnp->cn_flags |= ISDOTDOT;
 	else
 		cnp->cn_flags &= ~ISDOTDOT;
 	if (*ndp->ni_next == 0)
 		cnp->cn_flags |= ISLASTCN;
 	else
 		cnp->cn_flags &= ~ISLASTCN;
 
 	/* "." may not be the final target of a DELETE or RENAME. */
 	if ((cnp->cn_flags & ISLASTCN) != 0 &&
 	    cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EINVAL;
 		goto bad;
 	}
 
 	/*
 	 * Check for degenerate name (e.g. / or "")
 	 * which is a way of talking about a directory,
 	 * e.g. like "/." or ".".
 	 */
 	if (cnp->cn_nameptr[0] == '\0') {
 		if (dp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto bad;
 		}
 		if (cnp->cn_nameiop != LOOKUP) {
 			error = EISDIR;
 			goto bad;
 		}
 		if (wantparent) {
 			ndp->ni_dvp = dp;
 			VREF(dp);
 		}
 		ndp->ni_vp = dp;
 
 		if (cnp->cn_flags & AUDITVNODE1)
 			AUDIT_ARG_VNODE1(dp);
 		else if (cnp->cn_flags & AUDITVNODE2)
 			AUDIT_ARG_VNODE2(dp);
 
 		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
 			VOP_UNLOCK(dp, 0);
 		/* XXX This should probably move to the top of function. */
 		if (cnp->cn_flags & SAVESTART)
 			panic("lookup: SAVESTART");
 		goto success;
 	}
 
 	/*
 	 * Handle "..": five special cases.
 	 * 0. If doing a capability lookup, return ENOTCAPABLE (this is a
 	 *    fairly conservative design choice, but it's the only one that we
 	 *    are satisfied guarantees the property we're looking for).
 	 * 1. Return an error if this is the last component of
 	 *    the name and the operation is DELETE or RENAME.
 	 * 2. If at root directory (e.g. after chroot)
 	 *    or at absolute root directory
 	 *    then ignore it so can't get out.
 	 * 3. If this vnode is the root of a mounted
 	 *    filesystem, then replace it with the
 	 *    vnode which was mounted on so we take the
 	 *    .. in the other filesystem.
 	 * 4. If the vnode is the top directory of
 	 *    the jail or chroot, don't let them out.
 	 */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if (ndp->ni_strictrelative != 0) {
 #ifdef KTRACE
 			if (KTRPOINT(curthread, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 			error = ENOTCAPABLE;
 			goto bad;
 		}
 		if ((cnp->cn_flags & ISLASTCN) != 0 &&
 		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 			error = EINVAL;
 			goto bad;
 		}
 		for (;;) {
 			/* pr ends up non-NULL iff dp is some prison's root. */
 			for (pr = cnp->cn_cred->cr_prison; pr != NULL;
 			     pr = pr->pr_parent)
 				if (dp == pr->pr_root)
 					break;
 			if (dp == ndp->ni_rootdir || 
 			    dp == ndp->ni_topdir || 
 			    dp == rootvnode ||
 			    pr != NULL ||
 			    ((dp->v_vflag & VV_ROOT) != 0 &&
 			     (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
 				/*
 				 * ".." may not climb above dp: resolve it
 				 * to dp itself without calling VOP_LOOKUP().
 				 */
 				ndp->ni_dvp = dp;
 				ndp->ni_vp = dp;
 				VREF(dp);
 				goto nextname;
 			}
 			if ((dp->v_vflag & VV_ROOT) == 0)
 				break;
 			if (dp->v_iflag & VI_DOOMED) {	/* forced unmount */
 				error = ENOENT;
 				goto bad;
 			}
 			/*
 			 * dp is the root of a mounted filesystem: step down
 			 * to the covered vnode and test again.
 			 */
 			tdp = dp;
 			dp = dp->v_mount->mnt_vnodecovered;
 			VREF(dp);
 			vput(tdp);
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, ISDOTDOT));
 		}
 	}
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 */
 unionlookup:
 #ifdef MAC
 	if ((cnp->cn_flags & NOMACCHECK) == 0) {
 		error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp,
 		    cnp);
 		if (error)
 			goto bad;
 	}
 #endif
 	ndp->ni_dvp = dp;
 	ndp->ni_vp = NULL;
 	ASSERT_VOP_LOCKED(dp, "lookup");
 	/*
 	 * If we have a shared lock we may need to upgrade the lock for the
 	 * last operation.
 	 */
 	if (dp != vp_crossmp &&
 	    VOP_ISLOCKED(dp) == LK_SHARED &&
 	    (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
 		vn_lock(dp, LK_UPGRADE|LK_RETRY);
 	if ((dp->v_iflag & VI_DOOMED) != 0) {
 		error = ENOENT;
 		goto bad;
 	}
 	/*
 	 * If we're looking up the last component and we need an exclusive
 	 * lock, adjust our lkflags.
 	 */
 	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
 		cnp->cn_lkflags = LK_EXCLUSIVE;
 #ifdef NAMEI_DIAGNOSTIC
 	vprint("lookup in", dp);
 #endif
 	/*
 	 * The per-mount adjustment applies to this VOP_LOOKUP() call only;
 	 * cn_lkflags is restored from lkflags_save on both outcomes.
 	 */
 	lkflags_save = cnp->cn_lkflags;
 	cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags,
 	    cnp->cn_flags);
 	if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
 		cnp->cn_lkflags = lkflags_save;
 		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
 #ifdef NAMEI_DIAGNOSTIC
 		printf("not found\n");
 #endif
 		/*
 		 * Not found in the root directory of a union mount: retry
 		 * the same component in the covered directory.
 		 */
 		if ((error == ENOENT) &&
 		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
 		    (dp->v_mount->mnt_flag & MNT_UNION)) {
 			tdp = dp;
 			dp = dp->v_mount->mnt_vnodecovered;
 			VREF(dp);
 			vput(tdp);
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, cnp->cn_flags));
 			goto unionlookup;
 		}
 
 		if (error != EJUSTRETURN)
 			goto bad;
 		/*
 		 * At this point, we know we're at the end of the
 		 * pathname.  If creating / renaming, we can consider
 		 * allowing the file or directory to be created / renamed,
 		 * provided we're not on a read-only filesystem.
 		 */
 		if (rdonly) {
 			error = EROFS;
 			goto bad;
 		}
 		/* trailing slash only allowed for directories */
 		if ((cnp->cn_flags & TRAILINGSLASH) &&
 		    !(cnp->cn_flags & WILLBEDIR)) {
 			error = ENOENT;
 			goto bad;
 		}
 		if ((cnp->cn_flags & LOCKPARENT) == 0)
 			VOP_UNLOCK(dp, 0);
 		/*
 		 * We return with ni_vp NULL to indicate that the entry
 		 * doesn't currently exist, leaving a pointer to the
 		 * (possibly locked) directory vnode in ndp->ni_dvp.
 		 */
 		if (cnp->cn_flags & SAVESTART) {
 			ndp->ni_startdir = ndp->ni_dvp;
 			VREF(ndp->ni_startdir);
 		}
 		goto success;
 	} else
 		cnp->cn_lkflags = lkflags_save;
 #ifdef NAMEI_DIAGNOSTIC
 	printf("found\n");
 #endif
 	/*
 	 * Take into account any additional components consumed by
 	 * the underlying filesystem.
 	 */
 	if (cnp->cn_consume > 0) {
 		cnp->cn_nameptr += cnp->cn_consume;
 		ndp->ni_next += cnp->cn_consume;
 		ndp->ni_pathlen -= cnp->cn_consume;
 		cnp->cn_consume = 0;
 	}
 
 	dp = ndp->ni_vp;
 
 	/*
 	 * Check to see if the vnode has been mounted on;
 	 * if so find the root of the mounted filesystem.
 	 */
 	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
 	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
 		/* On failure re-evaluate v_mountedhere and try again. */
 		if (vfs_busy(mp, 0))
 			continue;
 		vput(dp);
 		if (dp != ndp->ni_dvp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 		/*
 		 * Both the covered vnode and its parent are released now;
 		 * the placeholder stands in as ni_dvp for the new root.
 		 */
 		vref(vp_crossmp);
 		ndp->ni_dvp = vp_crossmp;
 		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags,
 		    cnp->cn_flags), &tdp);
 		vfs_unbusy(mp);
 		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
 			panic("vp_crossmp exclusively locked or reclaimed");
 		if (error) {
 			/* dp was vput() above; "bad" must not do it again. */
 			dpunlocked = 1;
 			goto bad2;
 		}
 		ndp->ni_vp = dp = tdp;
 	}
 
 	/*
 	 * Check for symbolic link
 	 */
 	if ((dp->v_type == VLNK) &&
 	    ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) ||
 	     *ndp->ni_next == '/')) {
 		cnp->cn_flags |= ISSYMLINK;
 		if (dp->v_iflag & VI_DOOMED) {
 			/*
 			 * We can't know whether the directory was mounted with
 			 * NOSYMFOLLOW, so we can't follow safely.
 			 */
 			error = ENOENT;
 			goto bad2;
 		}
 		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
 			error = EACCES;
 			goto bad2;
 		}
 		/*
 		 * Symlink code always expects an unlocked dvp.
 		 */
 		if (ndp->ni_dvp != ndp->ni_vp) {
 			VOP_UNLOCK(ndp->ni_dvp, 0);
 			ni_dvp_unlocked = 1;
 		}
 		goto success;
 	}
 
 nextname:
 	/*
 	 * Not a symbolic link that we will follow.  Continue with the
 	 * next component if there is any; otherwise, we're done.
 	 */
 	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
 	    ("lookup: invalid path state."));
 	if (*ndp->ni_next == '/') {
 		cnp->cn_nameptr = ndp->ni_next;
 		while (*cnp->cn_nameptr == '/') {
 			cnp->cn_nameptr++;
 			ndp->ni_pathlen--;
 		}
 		/* The parent is done with; dp is the next directory. */
 		if (ndp->ni_dvp != dp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 		goto dirloop;
 	}
 	/*
 	 * If we're processing a path with a trailing slash,
 	 * check that the end result is a directory.
 	 */
 	if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto bad2;
 	}
 	/*
 	 * Disallow directory write attempts on read-only filesystems.
 	 */
 	if (rdonly &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EROFS;
 		goto bad2;
 	}
 	if (cnp->cn_flags & SAVESTART) {
 		ndp->ni_startdir = ndp->ni_dvp;
 		VREF(ndp->ni_startdir);
 	}
 	/*
 	 * Bring the parent into the state the caller asked for: released
 	 * (neither flag), referenced but unlocked (WANTPARENT only), or
 	 * left locked (LOCKPARENT).
 	 */
 	if (!wantparent) {
 		ni_dvp_unlocked = 2;
 		if (ndp->ni_dvp != dp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
 		VOP_UNLOCK(ndp->ni_dvp, 0);
 		ni_dvp_unlocked = 1;
 	}
 
 	if (cnp->cn_flags & AUDITVNODE1)
 		AUDIT_ARG_VNODE1(dp);
 	else if (cnp->cn_flags & AUDITVNODE2)
 		AUDIT_ARG_VNODE2(dp);
 
 	if ((cnp->cn_flags & LOCKLEAF) == 0)
 		VOP_UNLOCK(dp, 0);
 success:
 	/*
 	 * Because of lookup_shared we may have the vnode shared locked, but
 	 * the caller may want it to be exclusively locked.
 	 */
 	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
 	    VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
 		vn_lock(dp, LK_UPGRADE | LK_RETRY);
 		if (dp->v_iflag & VI_DOOMED) {
 			error = ENOENT;
 			goto bad2;
 		}
 	}
 	return (0);
 
 bad2:
 	/*
 	 * Drop the parent: nothing if it was already released (2), vrele()
 	 * if it is unlocked or is the same vnode as dp, vput() otherwise.
 	 */
 	if (ni_dvp_unlocked != 2) {
 		if (dp != ndp->ni_dvp && !ni_dvp_unlocked)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 	}
 bad:
 	if (!dpunlocked)
 		vput(dp);
 	ndp->ni_vp = NULL;
 	return (error);
 }
 
 /*
  * relookup - lookup a path name component
  *    Used by lookup to re-acquire things.
  */
 int
 relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
 {
 	struct vnode *dp = 0;		/* the directory we are searching */
 	int wantparent;			/* 1 => wantparent or lockparent flag */
 	int rdonly;			/* lookup read-only flag bit */
 	int error = 0;
 
 	KASSERT(cnp->cn_flags & ISLASTCN,
 	    ("relookup: Not given last component."));
 	/*
 	 * Setup: break out flag bits into variables.
 	 */
 	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
 	KASSERT(wantparent, ("relookup: parent not wanted."));
 	rdonly = cnp->cn_flags & RDONLY;
 	cnp->cn_flags &= ~ISSYMLINK;
 	/* Unlike lookup(), the directory is always locked exclusively. */
 	dp = dvp;
 	cnp->cn_lkflags = LK_EXCLUSIVE;
 	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
 
 	/*
 	 * Search a new directory.
 	 *
 	 * The last component of the filename is left accessible via
 	 * cnp->cn_nameptr for callers that need the name. Callers needing
 	 * the name set the SAVENAME flag. When done, they assume
 	 * responsibility for freeing the pathname buffer.
 	 */
 #ifdef NAMEI_DIAGNOSTIC
 	printf("{%s}: ", cnp->cn_nameptr);
 #endif
 
 	/*
 	 * Check for "" which represents the root directory after slash
 	 * removal.
 	 */
 	if (cnp->cn_nameptr[0] == '\0') {
 		/*
 		 * Support only LOOKUP for "/" because lookup()
 		 * can't succeed for CREATE, DELETE and RENAME.
 		 */
 		KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP"));
 		KASSERT(dp->v_type == VDIR, ("dp is not a directory"));
 
 		if (!(cnp->cn_flags & LOCKLEAF))
 			VOP_UNLOCK(dp, 0);
 		*vpp = dp;
 		/* XXX This should probably move to the top of function. */
 		if (cnp->cn_flags & SAVESTART)
 			panic("lookup: SAVESTART");
 		return (0);
 	}
 
 	if (cnp->cn_flags & ISDOTDOT)
 		panic ("relookup: lookup on dot-dot");
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 */
 #ifdef NAMEI_DIAGNOSTIC
 	vprint("search in:", dp);
 #endif
 	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
 		KASSERT(*vpp == NULL, ("leaf should be empty"));
 		/* dp is still dvp here, so "bad" releases the directory. */
 		if (error != EJUSTRETURN)
 			goto bad;
 		/*
 		 * If creating and at end of pathname, then can consider
 		 * allowing file to be created.
 		 */
 		if (rdonly) {
 			error = EROFS;
 			goto bad;
 		}
 		/* ASSERT(dvp == ndp->ni_startdir) */
 		if (cnp->cn_flags & SAVESTART)
 			VREF(dvp);
 		if ((cnp->cn_flags & LOCKPARENT) == 0)
 			VOP_UNLOCK(dp, 0);
 		/*
 		 * We return with ni_vp NULL to indicate that the entry
 		 * doesn't currently exist, leaving a pointer to the
 		 * (possibly locked) directory vnode in ndp->ni_dvp.
 		 */
 		return (0);
 	}
 
 	/* From here on dp is the vnode that was found, not the directory. */
 	dp = *vpp;
 
 	/*
 	 * Disallow directory write attempts on read-only filesystems.
 	 */
 	if (rdonly &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		/*
 		 * "bad" only releases dp, so the parent has to be dropped
 		 * here: a plain reference if it is the same vnode as dp,
 		 * lock and reference otherwise.
 		 */
 		if (dvp == dp)
 			vrele(dvp);
 		else
 			vput(dvp);
 		error = EROFS;
 		goto bad;
 	}
 	/*
 	 * Set the parent lock/ref state to the requested state.
 	 */
 	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
 		if (wantparent)
 			VOP_UNLOCK(dvp, 0);
 		else
 			vput(dvp);
 	} else if (!wantparent)
 		vrele(dvp);
 	/*
 	 * Check for symbolic link
 	 */
 	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
 	    ("relookup: symlink found.\n"));
 
 	/* ASSERT(dvp == ndp->ni_startdir) */
 	if (cnp->cn_flags & SAVESTART)
 		VREF(dvp);
 	
 	if ((cnp->cn_flags & LOCKLEAF) == 0)
 		VOP_UNLOCK(dp, 0);
 	return (0);
 bad:
 	/* Release whatever dp currently names and report no result. */
 	vput(dp);
 	*vpp = NULL;
 	return (error);
 }
 
 /*
  * Fill in a nameidata structure ahead of a namei() call.  When rightsp is
  * NULL the set of needed capability rights is initialized empty, otherwise
  * it is copied from *rightsp.
  */
 void
 NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg,
     const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp,
     struct thread *td)
 {
 	struct componentname *cnp = &ndp->ni_cnd;
 
 	/* Operation, flags and requesting thread go into the componentname. */
 	cnp->cn_nameiop = op;
 	cnp->cn_flags = flags;
 	cnp->cn_thread = td;
 
 	/* Where the pathname comes from and where resolution starts. */
 	ndp->ni_dirp = namep;
 	ndp->ni_segflg = segflg;
 	ndp->ni_dirfd = dirfd;
 	ndp->ni_startdir = startdir;
 
 	/* Capability state: not strictly relative until namei() says so. */
 	ndp->ni_strictrelative = 0;
 	if (rightsp == NULL)
 		cap_rights_init(&ndp->ni_rightsneeded);
 	else
 		ndp->ni_rightsneeded = *rightsp;
 	filecaps_init(&ndp->ni_filecaps);
 }
 
 /*
  * Free data allocated by namei(); see namei(9) for details.
  */
 void
 NDFREE(struct nameidata *ndp, const u_int flags)
 {
 	int unlock_dvp;
 	int unlock_vp;
 
 	unlock_dvp = 0;
 	unlock_vp = 0;
 
 	if (!(flags & NDF_NO_FREE_PNBUF) &&
 	    (ndp->ni_cnd.cn_flags & HASBUF)) {
 		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
 		ndp->ni_cnd.cn_flags &= ~HASBUF;
 	}
 	if (!(flags & NDF_NO_VP_UNLOCK) &&
 	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
 		unlock_vp = 1;
 	if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
 		if (unlock_vp) {
 			vput(ndp->ni_vp);
 			unlock_vp = 0;
 		} else
 			vrele(ndp->ni_vp);
 		ndp->ni_vp = NULL;
 	}
 	if (unlock_vp)
 		VOP_UNLOCK(ndp->ni_vp, 0);
 	if (!(flags & NDF_NO_DVP_UNLOCK) &&
 	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
 	    ndp->ni_dvp != ndp->ni_vp)
 		unlock_dvp = 1;
 	if (!(flags & NDF_NO_DVP_RELE) &&
 	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
 		if (unlock_dvp) {
 			vput(ndp->ni_dvp);
 			unlock_dvp = 0;
 		} else
 			vrele(ndp->ni_dvp);
 		ndp->ni_dvp = NULL;
 	}
 	if (unlock_dvp)
 		VOP_UNLOCK(ndp->ni_dvp, 0);
 	if (!(flags & NDF_NO_STARTDIR_RELE) &&
 	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
 		vrele(ndp->ni_startdir);
 		ndp->ni_startdir = NULL;
 	}
 }
 
 /*
  * Determine if there is a suitable alternate filename under the specified
  * prefix for the specified path.  If the create flag is set, then the
  * alternate prefix will be used so long as the parent directory exists.
  * This is used by the various compatibility ABIs so that Linux binaries prefer
  * files under /compat/linux for example.  The chosen path (whether under
  * the prefix or under /) is returned in a kernel malloc'd buffer pointed
  * to by pathbuf.  The caller is responsible for free'ing the buffer from
  * the M_TEMP bucket if one is returned.
  */
 int
 kern_alternate_path(struct thread *td, const char *prefix, const char *path,
     enum uio_seg pathseg, char **pathbuf, int create, int dirfd)
 {
 	struct nameidata nd, ndroot;
 	char *ptr, *buf, *cp;
 	size_t len, sz;
 	int error;
 
 	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	*pathbuf = buf;
 
 	/* Copy the prefix into the new pathname as a starting point. */
 	len = strlcpy(buf, prefix, MAXPATHLEN);
 	if (len >= MAXPATHLEN) {
 		*pathbuf = NULL;
 		free(buf, M_TEMP);
 		return (EINVAL);
 	}
 	sz = MAXPATHLEN - len;
 	ptr = buf + len;
 
 	/* Append the filename to the prefix. */
 	if (pathseg == UIO_SYSSPACE)
 		error = copystr(path, ptr, sz, &len);
 	else
 		error = copyinstr(path, ptr, sz, &len);
 
 	if (error) {
 		*pathbuf = NULL;
 		free(buf, M_TEMP);
 		return (error);
 	}
 
 	/* Only use a prefix with absolute pathnames. */
 	if (*ptr != '/') {
 		error = EINVAL;
 		goto keeporig;
 	}
 
 	if (dirfd != AT_FDCWD) {
 		/*
 		 * We want the original because the "prefix" is
 		 * included in the already opened dirfd.
 		 */
 		bcopy(ptr, buf, len);
 		return (0);
 	}
 
 	/*
 	 * We know that there is a / somewhere in this pathname.
 	 * Search backwards for it, to find the file's parent dir
 	 * to see if it exists in the alternate tree. If it does,
 	 * and we want to create a file (cflag is set). We don't
 	 * need to worry about the root comparison in this case.
 	 */
 
 	if (create) {
 		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
 		*cp = '\0';
 
 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
 		error = namei(&nd);
 		*cp = '/';
 		if (error != 0)
 			goto keeporig;
 	} else {
 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
 
 		error = namei(&nd);
 		if (error != 0)
 			goto keeporig;
 
 		/*
 		 * We now compare the vnode of the prefix to the one
 		 * vnode asked. If they resolve to be the same, then we
 		 * ignore the match so that the real root gets used.
 		 * This avoids the problem of traversing "../.." to find the
 		 * root directory and never finding it, because "/" resolves
 		 * to the emulation root directory. This is expensive :-(
 		 */
 		NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix,
 		    td);
 
 		/* We shouldn't ever get an error from this namei(). */
 		error = namei(&ndroot);
 		if (error == 0) {
 			if (nd.ni_vp == ndroot.ni_vp)
 				error = ENOENT;
 
 			NDFREE(&ndroot, NDF_ONLY_PNBUF);
 			vrele(ndroot.ni_vp);
 		}
 	}
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vrele(nd.ni_vp);
 
 keeporig:
 	/* If there was an error, use the original path name. */
 	if (error)
 		bcopy(ptr, buf, len);
 	return (error);
 }
Index: head/sys/kern/vfs_syscalls.c
===================================================================
--- head/sys/kern/vfs_syscalls.c	(revision 258540)
+++ head/sys/kern/vfs_syscalls.c	(revision 258541)
@@ -1,4729 +1,4728 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
-#include "opt_kdtrace.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/capability.h>
 #include <sys/disk.h>
 #include <sys/sysent.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/namei.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/dirent.h>
 #include <sys/jail.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <machine/stdarg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 #include <ufs/ufs/quota.h>
 
 /* Malloc type used for posix_fadvise(2) bookkeeping allocations. */
 MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
 
 /*
  * Statically defined tracing for this file: a "vfs" provider and two
  * probes under "stat" ("mode" and "reg"), each declared with a "char *"
  * and an "int" argument.
  */
 SDT_PROVIDER_DEFINE(vfs);
 SDT_PROBE_DEFINE2(vfs, , stat, mode, mode, "char *", "int");
 SDT_PROBE_DEFINE2(vfs, , stat, reg, reg, "char *", "int");
 
 /* Forward declarations of helpers that are private to this file. */
 static int chroot_refuse_vdir_fds(struct filedesc *fdp);
 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
 static int kern_chflags(struct thread *td, const char *path,
     enum uio_seg pathseg, u_long flags);
 static int kern_chflagsat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, u_long flags, int atflag);
 static int setfflags(struct thread *td, struct vnode *, u_long);
 static int setutimes(struct thread *td, struct vnode *,
     const struct timespec *, int, int);
 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
     struct thread *td);
 
 /*
  * The module initialization routine for POSIX asynchronous I/O will
  * set this to the version of AIO that it implements.  (Zero means
  * that it is not implemented.)  This value is used here by pathconf()
  * and in kern_descrip.c by fpathconf().
  */
 int async_io_version;
 
 /*
  * Debug-only knob, exported read-write as the sysctl debug.syncprt and
  * compiled in only when DEBUG is defined.
  * NOTE(review): its consumer is not visible in this part of the file.
  */
 #ifdef DEBUG
 static int syncprt = 0;
 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
 #endif
 
 /*
  * sync(2): walk the mount list and start a non-blocking sync of every
  * mounted filesystem that is not read-only.  Mounts that cannot be busied
  * without waiting, or for which a write cannot be started immediately,
  * are skipped.  Always returns 0.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sync_args {
 	int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_sync(struct thread *td, struct sync_args *uap)
 {
 	struct mount *cur, *next;
 	int saved_pflags;
 
 	mtx_lock(&mountlist_mtx);
 	for (cur = TAILQ_FIRST(&mountlist); cur != NULL; cur = next) {
 		if (vfs_busy(cur, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
 			/* Not available right now; move on to the next one. */
 			next = TAILQ_NEXT(cur, mnt_list);
 			continue;
 		}
 		if ((cur->mnt_flag & MNT_RDONLY) == 0 &&
 		    vn_start_write(NULL, &cur, V_NOWAIT) == 0) {
 			/* Flag the thread as doing sync I/O for the flush. */
 			saved_pflags = curthread_pflags_set(TDP_SYNCIO);
 			vfs_msync(cur, MNT_NOWAIT);
 			VFS_SYNC(cur, MNT_NOWAIT);
 			curthread_pflags_restore(saved_pflags);
 			vn_finished_write(cur);
 		}
 		/*
 		 * Retake the list mutex before reading the successor, and
 		 * only then let go of the busy reference on this mount.
 		 */
 		mtx_lock(&mountlist_mtx);
 		next = TAILQ_NEXT(cur, mnt_list);
 		vfs_unbusy(cur);
 	}
 	mtx_unlock(&mountlist_mtx);
 	return (0);
 }
 
 /*
  * Change filesystem quotas.
  *
  * Resolves uap->path, busies the mount that holds the resulting vnode and
  * hands cmd/uid/arg to VFS_QUOTACTL().  Returns EPERM when prison_allow()
  * denies PR_ALLOW_QUOTAS; otherwise returns the error from namei(),
  * vfs_busy() or VFS_QUOTACTL().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct quotactl_args {
 	char *path;
 	int cmd;
 	int uid;
 	caddr_t arg;
 };
 #endif
 int
 sys_quotactl(td, uap)
 	struct thread *td;
 	register struct quotactl_args /* {
 		char *path;
 		int cmd;
 		int uid;
 		caddr_t arg;
 	} */ *uap;
 {
 	struct mount *mp;
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_CMD(uap->cmd);
 	AUDIT_ARG_UID(uap->uid);
 	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
 		return (EPERM);
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	/*
 	 * Reference the mount before the vnode is released; the reference
 	 * is dropped again as soon as vfs_busy() has been attempted.
 	 */
 	mp = nd.ni_vp->v_mount;
 	vfs_ref(mp);
 	vput(nd.ni_vp);
 	error = vfs_busy(mp, 0);
 	vfs_rel(mp);
 	if (error != 0)
 		return (error);
 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
 
 	/*
 	 * Since the quota-on operation typically needs to open the quota
 	 * file, the Q_QUOTAON handler needs to unbusy the mount point
 	 * before calling into namei.  Otherwise, unmount might be
 	 * started between two vfs_busy() invocations (first is ours,
 	 * second is from mount point cross-walk code in lookup()),
 	 * causing deadlock.
 	 *
 	 * Require that Q_QUOTAON handles the vfs_busy() reference on
 	 * its own, always returning with an unbusied mount point.
 	 */
 	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
 		vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * Helper for the statfs conversion routines: if any block count in 'sf'
  * exceeds 'max_size', grow f_bsize and shrink f_blocks, f_bfree and
  * f_bavail by the same power of two so that the largest count fits.
  * 'max_size' must be of the form 2^n - 1 (asserted below).  'sf' is left
  * untouched when every count already fits.
  */
 void
 statfs_scale_blocks(struct statfs *sf, long max_size)
 {
 	uint64_t largest;
 	int nbits;
 
 	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
 
 	/*
 	 * Find the biggest of the three block counts, using the magnitude
 	 * of f_bavail since that field may be negative.
 	 */
 	largest = (sf->f_bavail < 0) ? -sf->f_bavail : sf->f_bavail;
 	largest = MAX(sf->f_blocks, MAX(sf->f_bfree, largest));
 	if (largest <= max_size)
 		return;
 
 	/*
 	 * Count how many bits of the largest value lie above the bits
 	 * that max_size can represent; that is the required shift.
 	 */
 	nbits = 0;
 	for (largest >>= flsl(max_size); largest > 0; largest >>= 1)
 		nbits++;
 
 	/* Trade block-count range for a proportionally larger block size. */
 	sf->f_bsize <<= nbits;
 	sf->f_blocks >>= nbits;
 	sf->f_bfree >>= nbits;
 	sf->f_bavail >>= nbits;
 }
 
 /*
  * statfs(2): fetch statistics for the filesystem containing uap->path
  * and copy them out to the user buffer uap->buf.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct statfs_args {
 	char *path;
 	struct statfs *buf;
 };
 #endif
 int
 sys_statfs(struct thread *td, struct statfs_args *uap)
 {
 	struct statfs sf;
 	int error;
 
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 	if (error != 0)
 		return (error);
 	return (copyout(&sf, uap->buf, sizeof(sf)));
 }
 
 /*
  * In-kernel statfs: look up 'path' (interpreted according to 'pathseg'),
  * busy the mount that holds the resulting vnode, refresh its statistics
  * with VFS_STATFS() and copy them into '*buf'.
  *
  * Returns 0 on success, or the error from namei(), vfs_busy(), the MAC
  * check (when MAC is configured) or VFS_STATFS().  '*buf' is written only
  * on success.
  */
 int
 kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
     struct statfs *buf)
 {
 	struct mount *mp;
 	struct statfs *sp, sb;
 	struct nameidata nd;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 	    pathseg, path, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	/*
 	 * Reference the mount before the vnode is released; the reference
 	 * is dropped again once vfs_busy() has been attempted.
 	 */
 	mp = nd.ni_vp->v_mount;
 	vfs_ref(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_vp);
 	error = vfs_busy(mp, 0);
 	vfs_rel(mp);
 	if (error != 0)
 		return (error);
 #ifdef MAC
 	error = mac_mount_check_stat(td->td_ucred, mp);
 	if (error != 0)
 		goto out;
 #endif
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
 	 */
 	sp = &mp->mnt_stat;
 	sp->f_version = STATFS_VERSION;
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mp, sp);
 	if (error != 0)
 		goto out;
 	/*
 	 * A caller that fails the PRIV_VFS_GENERATION check gets a private
 	 * copy with the fsid zeroed and prison_enforce_statfs() applied,
 	 * instead of the mount's own statistics.
 	 */
 	if (priv_check(td, PRIV_VFS_GENERATION)) {
 		bcopy(sp, &sb, sizeof(sb));
 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 		prison_enforce_statfs(td->td_ucred, mp, &sb);
 		sp = &sb;
 	}
 	*buf = *sp;
 out:
 	vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * fstatfs(2): fetch statistics for the filesystem behind the descriptor
  * uap->fd and copy them out to the user buffer uap->buf.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstatfs_args {
 	int fd;
 	struct statfs *buf;
 };
 #endif
 int
 sys_fstatfs(struct thread *td, struct fstatfs_args *uap)
 {
 	struct statfs sf;
 	int error;
 
 	error = kern_fstatfs(td, uap->fd, &sf);
 	if (error != 0)
 		return (error);
 	return (copyout(&sf, uap->buf, sizeof(sf)));
 }
 
 /*
  * In-kernel fstatfs: resolve descriptor 'fd' to a vnode (requiring the
  * CAP_FSTATFS right), busy the mount that holds it, refresh the mount's
  * statistics with VFS_STATFS() and copy them into '*buf'.
  *
  * Returns 0 on success; EBADF when the vnode has no mount; otherwise the
  * error from getvnode(), vfs_busy(), the MAC check (when MAC is
  * configured) or VFS_STATFS().  '*buf' is written only on success.
  */
 int
 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
 {
 	struct file *fp;
 	struct mount *mp;
 	struct statfs *sp, sb;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = getvnode(td->td_proc->p_fd, fd,
 	    cap_rights_init(&rights, CAP_FSTATFS), &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 #ifdef AUDIT
 	AUDIT_ARG_VNODE1(vp);
 #endif
 	/*
 	 * Read v_mount while the vnode is locked and reference the mount
 	 * (if any) before the vnode lock and the file are released.
 	 */
 	mp = vp->v_mount;
 	if (mp)
 		vfs_ref(mp);
 	VOP_UNLOCK(vp, 0);
 	fdrop(fp, td);
 	if (mp == NULL) {
 		error = EBADF;
 		goto out;
 	}
 	error = vfs_busy(mp, 0);
 	vfs_rel(mp);
 	if (error != 0)
 		return (error);
 #ifdef MAC
 	error = mac_mount_check_stat(td->td_ucred, mp);
 	if (error != 0)
 		goto out;
 #endif
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
 	 */
 	sp = &mp->mnt_stat;
 	sp->f_version = STATFS_VERSION;
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mp, sp);
 	if (error != 0)
 		goto out;
 	/*
 	 * A caller that fails the PRIV_VFS_GENERATION check gets a private
 	 * copy with the fsid zeroed and prison_enforce_statfs() applied.
 	 */
 	if (priv_check(td, PRIV_VFS_GENERATION)) {
 		bcopy(sp, &sb, sizeof(sb));
 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 		prison_enforce_statfs(td->td_ucred, mp, &sb);
 		sp = &sb;
 	}
 	*buf = *sp;
 out:
 	/* mp is NULL only on the EBADF path, where nothing was busied. */
 	if (mp)
 		vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * Get statistics on all filesystems.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getfsstat_args {
 	struct statfs *buf;
 	long bufsize;
 	int flags;
 };
 #endif
 int
 sys_getfsstat(td, uap)
 	struct thread *td;
 	register struct getfsstat_args /* {
 		struct statfs *buf;
 		long bufsize;
 		int flags;
 	} */ *uap;
 {
 
 	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
 	    uap->flags));
 }
 
 /*
  * Collect statistics for the mounted filesystems visible to 'td'.
  *
  * With bufsize == 0 nothing is copied and only the number of visible
  * mounts is reported.  With bufseg == UIO_USERSPACE entries are copied
  * out to '*buf'.  Otherwise a kernel buffer is allocated:
  *
  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
  *	The caller is responsible for freeing memory which will be allocated
  *	in '*buf'.
  *
  * The count is returned in td->td_retval[0]; when a buffer was supplied
  * it is capped at the number of entries that fit.  The function returns
  * 0, or the copyout() error for a user-space buffer.  A mount whose
  * VFS_STATFS() fails is skipped rather than reported as an error.
  */
 int
 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
     enum uio_seg bufseg, int flags)
 {
 	struct mount *mp, *nmp;
 	struct statfs *sfsp, *sp, sb;
 	size_t count, maxcount;
 	int error;
 
 	maxcount = bufsize / sizeof(struct statfs);
 	if (bufsize == 0)
 		sfsp = NULL;
 	else if (bufseg == UIO_USERSPACE)
 		sfsp = *buf;
 	else /* if (bufseg == UIO_SYSSPACE) */ {
 		/*
 		 * Size the kernel buffer by the current number of mounts,
 		 * but never larger than the caller asked for.  The list may
 		 * change afterwards; the fill loop below re-checks maxcount.
 		 */
 		count = 0;
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 			count++;
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (maxcount > count)
 			maxcount = count;
 		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
 		    M_WAITOK);
 	}
 	count = 0;
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		/* Skip mounts this credential is not allowed to see. */
 		if (prison_canseemount(td->td_ucred, mp) != 0) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 #ifdef MAC
 		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 #endif
 		/* Skip mounts that cannot be busied without waiting. */
 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 		if (sfsp && count < maxcount) {
 			sp = &mp->mnt_stat;
 			/*
 			 * Set these in case the underlying filesystem
 			 * fails to do so.
 			 */
 			sp->f_version = STATFS_VERSION;
 			sp->f_namemax = NAME_MAX;
 			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 			/*
 			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
 			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
 			 * overrides MNT_WAIT.
 			 */
 			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
 			    (flags & MNT_WAIT)) &&
 			    (error = VFS_STATFS(mp, sp))) {
 				/*
 				 * Refresh failed: retake the list mutex,
 				 * advance and drop this mount uncounted.
 				 */
 				mtx_lock(&mountlist_mtx);
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				vfs_unbusy(mp);
 				continue;
 			}
 			/*
 			 * A caller that fails the PRIV_VFS_GENERATION check
 			 * gets a copy with the fsid zeroed and
 			 * prison_enforce_statfs() applied.
 			 */
 			if (priv_check(td, PRIV_VFS_GENERATION)) {
 				bcopy(sp, &sb, sizeof(sb));
 				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 				prison_enforce_statfs(td->td_ucred, mp, &sb);
 				sp = &sb;
 			}
 			if (bufseg == UIO_SYSSPACE)
 				bcopy(sp, sfsp, sizeof(*sp));
 			else /* if (bufseg == UIO_USERSPACE) */ {
 				error = copyout(sp, sfsp, sizeof(*sp));
 				if (error != 0) {
 					/*
 					 * The list mutex is not retaken on
 					 * this path, so only the busy
 					 * reference needs dropping.
 					 */
 					vfs_unbusy(mp);
 					return (error);
 				}
 			}
 			sfsp++;
 		}
 		count++;
 		/* Retake the list mutex before reading the successor. */
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp);
 	}
 	mtx_unlock(&mountlist_mtx);
 	if (sfsp && count > maxcount)
 		td->td_retval[0] = maxcount;
 	else
 		td->td_retval[0] = count;
 	return (0);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Get old format filesystem statistics.
  */
 static void cvtstatfs(struct statfs *, struct ostatfs *);
 
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_statfs_args {
 	char *path;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_statfs(td, uap)
 	struct thread *td;
 	struct freebsd4_statfs_args /* {
 		char *path;
 		struct ostatfs *buf;
 	} */ *uap;
 {
 	struct ostatfs osb;
 	struct statfs sf;
 	int error;
 
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 	if (error != 0)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
 }
 
 /*
  * Get filesystem statistics.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_fstatfs_args {
 	int fd;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_fstatfs(td, uap)
 	struct thread *td;
 	struct freebsd4_fstatfs_args /* {
 		int fd;
 		struct ostatfs *buf;
 	} */ *uap;
 {
 	struct ostatfs osb;
 	struct statfs sf;
 	int error;
 
 	error = kern_fstatfs(td, uap->fd, &sf);
 	if (error != 0)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
 }
 
 /*
  * Get statistics on all filesystems.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_getfsstat_args {
 	struct ostatfs *buf;
 	long bufsize;
 	int flags;
 };
 #endif
 int
 freebsd4_getfsstat(td, uap)
 	struct thread *td;
 	register struct freebsd4_getfsstat_args /* {
 		struct ostatfs *buf;
 		long bufsize;
 		int flags;
 	} */ *uap;
 {
 	struct statfs *buf, *sp;
 	struct ostatfs osb;
 	size_t count, size;
 	int error;
 
 	count = uap->bufsize / sizeof(struct ostatfs);
 	size = count * sizeof(struct statfs);
 	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
 	if (size > 0) {
 		count = td->td_retval[0];
 		sp = buf;
 		while (count > 0 && error == 0) {
 			cvtstatfs(sp, &osb);
 			error = copyout(&osb, uap->buf, sizeof(osb));
 			sp++;
 			uap->buf++;
 			count--;
 		}
 		free(buf, M_TEMP);
 	}
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_fhstatfs_args {
 	struct fhandle *u_fhp;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_fhstatfs(td, uap)
 	struct thread *td;
 	struct freebsd4_fhstatfs_args /* {
 		struct fhandle *u_fhp;
 		struct ostatfs *buf;
 	} */ *uap;
 {
 	struct ostatfs osb;
 	struct statfs sf;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error != 0)
 		return (error);
 	error = kern_fhstatfs(td, fh, &sf);
 	if (error != 0)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
 }
 
 /*
  * Convert a new format statfs structure to an old format statfs structure.
  */
 static void
 cvtstatfs(nsp, osp)
 	struct statfs *nsp;
 	struct ostatfs *osp;
 {
 
 	statfs_scale_blocks(nsp, LONG_MAX);
 	bzero(osp, sizeof(*osp));
 	osp->f_bsize = nsp->f_bsize;
 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
 	osp->f_blocks = nsp->f_blocks;
 	osp->f_bfree = nsp->f_bfree;
 	osp->f_bavail = nsp->f_bavail;
 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
 	osp->f_owner = nsp->f_owner;
 	osp->f_type = nsp->f_type;
 	osp->f_flags = nsp->f_flags;
 	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
 	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
 	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
 	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
 	    MIN(MFSNAMELEN, OMFSNAMELEN));
 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
 	    MIN(MNAMELEN, OMNAMELEN));
 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
 	    MIN(MNAMELEN, OMNAMELEN));
 	osp->f_fsid = nsp->f_fsid;
 }
 #endif /* COMPAT_FREEBSD4 */
 
 /*
  * Change current working directory to a given file descriptor.
  *
  * The descriptor must carry the CAP_FCHDIR right and must pass
  * change_dir().  If a filesystem is mounted on the directory, the root
  * of that filesystem (repeatedly, for stacked mounts) becomes the new
  * working directory instead.  Returns 0 or the error from getvnode(),
  * change_dir() or VFS_ROOT().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchdir_args {
 	int	fd;
 };
 #endif
 int
 sys_fchdir(td, uap)
 	struct thread *td;
 	struct fchdir_args /* {
 		int fd;
 	} */ *uap;
 {
 	register struct filedesc *fdp = td->td_proc->p_fd;
 	struct vnode *vp, *tdp, *vpold;
 	struct mount *mp;
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
 	    &fp);
 	if (error != 0)
 		return (error);
 	/* Take our own vnode reference before the file is dropped. */
 	vp = fp->f_vnode;
 	VREF(vp);
 	fdrop(fp, td);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	error = change_dir(vp, td);
 	/*
 	 * Step onto the root of whatever is mounted here.  A failed
 	 * vfs_busy() re-reads v_mountedhere and tries again.
 	 */
 	while (!error && (mp = vp->v_mountedhere) != NULL) {
 		if (vfs_busy(mp, 0))
 			continue;
 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
 		vfs_unbusy(mp);
 		if (error != 0)
 			break;
 		vput(vp);
 		vp = tdp;
 	}
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	/* Keep the reference (now owned by fd_cdir) but drop the lock. */
 	VOP_UNLOCK(vp, 0);
 	FILEDESC_XLOCK(fdp);
 	vpold = fdp->fd_cdir;
 	fdp->fd_cdir = vp;
 	FILEDESC_XUNLOCK(fdp);
 	/* Release the previous working directory outside the fd lock. */
 	vrele(vpold);
 	return (0);
 }
 
 /*
  * Change current working directory (``.'').
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chdir_args {
 	char	*path;
 };
 #endif
 int
 sys_chdir(td, uap)
 	struct thread *td;
 	struct chdir_args /* {
 		char *path;
 	} */ *uap;
 {
 
 	return (kern_chdir(td, uap->path, UIO_USERSPACE));
 }
 
 int
 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
 {
 	register struct filedesc *fdp = td->td_proc->p_fd;
 	struct nameidata nd;
 	struct vnode *vp;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
 		vput(nd.ni_vp);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		return (error);
 	}
 	VOP_UNLOCK(nd.ni_vp, 0);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	FILEDESC_XLOCK(fdp);
 	vp = fdp->fd_cdir;
 	fdp->fd_cdir = nd.ni_vp;
 	FILEDESC_XUNLOCK(fdp);
 	vrele(vp);
 	return (0);
 }
 
 /*
  * Helper for the stricter chroot(2) policy: scan the descriptor table
  * and return EPERM if any open descriptor refers to a directory vnode,
  * 0 otherwise.  The caller must hold the filedesc lock.
  */
 static int
 chroot_refuse_vdir_fds(struct filedesc *fdp)
 {
 	struct file *fp;
 	int fd;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	for (fd = 0; fd < fdp->fd_nfiles; fd++) {
 		fp = fget_locked(fdp, fd);
 		/* Only vnode-backed descriptors can be directories. */
 		if (fp == NULL || fp->f_type != DTYPE_VNODE)
 			continue;
 		if (fp->f_vnode->v_type == VDIR)
 			return (EPERM);
 	}
 	return (0);
 }
 
 /*
  * This sysctl determines if we will allow a process to chroot(2) if it
  * has a directory open:
  *	0: disallowed for all processes.
  *	1: allowed for processes that were not already chroot(2)'ed.
  *	2: allowed for all processes.
  *
  * The policy is applied in change_root(), which treats a process whose
  * fd_rdir differs from rootvnode as already chroot(2)'ed.  Exported
  * read-write as kern.chroot_allow_open_directories; the default is 1.
  */
 
 static int chroot_allow_open_directories = 1;
 
 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
      &chroot_allow_open_directories, 0,
      "Allow a process to chroot(2) if it has a directory open");
 
 /*
  * chroot(2): change the process' notion of the root (``/'') directory
  * to uap->path.  Requires PRIV_VFS_CHROOT; the target must pass
  * change_dir() and, when MAC is configured, mac_vnode_check_chroot().
  * The switch itself is performed by change_root().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chroot_args {
 	char	*path;
 };
 #endif
 int
 sys_chroot(struct thread *td, struct chroot_args *uap)
 {
 	struct nameidata nd;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_CHROOT);
 	if (error != 0)
 		return (error);
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error != 0)
 		goto free_path;
 	error = change_dir(nd.ni_vp, td);
 #ifdef MAC
 	if (error == 0)
 		error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
 #endif
 	if (error != 0) {
 		/* The vnode is still locked here: unlock and release it. */
 		vput(nd.ni_vp);
 		goto free_path;
 	}
 	/* change_root() is called with the vnode unlocked. */
 	VOP_UNLOCK(nd.ni_vp, 0);
 	error = change_root(nd.ni_vp, td);
 	vrele(nd.ni_vp);
 free_path:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error);
 }
 
 /*
  * Shared permission check for chdir and chroot.  'vp' must be locked by
  * the caller.  Returns ENOTDIR if 'vp' is not a directory, the MAC
  * verdict when MAC is configured and denies the operation, and otherwise
  * the result of the VEXEC access check.
  */
 int
 change_dir(struct vnode *vp, struct thread *td)
 {
 #ifdef MAC
 	int mac_error;
 #endif
 
 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 #ifdef MAC
 	mac_error = mac_vnode_check_chdir(td->td_ucred, vp);
 	if (mac_error != 0)
 		return (mac_error);
 #endif
 	/* Search (execute) permission is what entering a directory needs. */
 	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
 }
 
 /*
  * Common routine for kern_chroot() and jail_attach().  The caller is
  * responsible for invoking priv_check() and mac_vnode_check_chroot() to
  * authorize this operation.
  */
 int
 change_root(vp, td)
 	struct vnode *vp;
 	struct thread *td;
 {
 	struct filedesc *fdp;
 	struct vnode *oldvp;
 	int error;
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	if (chroot_allow_open_directories == 0 ||
 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
 		error = chroot_refuse_vdir_fds(fdp);
 		if (error != 0) {
 			FILEDESC_XUNLOCK(fdp);
 			return (error);
 		}
 	}
 	oldvp = fdp->fd_rdir;
 	fdp->fd_rdir = vp;
 	VREF(fdp->fd_rdir);
 	if (!fdp->fd_jdir) {
 		fdp->fd_jdir = vp;
 		VREF(fdp->fd_jdir);
 	}
 	FILEDESC_XUNLOCK(fdp);
 	vrele(oldvp);
 	return (0);
 }
 
 /*
  * Add to '*rightsp' the capability rights implied by the open(2) flags
  * in 'flags'.  Rights already present in '*rightsp' are kept.
  */
 static __inline void
 flags_to_rights(int flags, cap_rights_t *rightsp)
 {
 	int accmode;
 
 	accmode = flags & O_ACCMODE;
 	if ((flags & O_EXEC) != 0) {
 		/* O_EXEC takes the place of the access mode. */
 		cap_rights_set(rightsp, CAP_FEXECVE);
 	} else {
 		if (accmode == O_RDONLY || accmode == O_RDWR)
 			cap_rights_set(rightsp, CAP_READ);
 		if (accmode == O_WRONLY || accmode == O_RDWR) {
 			cap_rights_set(rightsp, CAP_WRITE);
 			/* CAP_SEEK is added unless appending or truncating. */
 			if ((flags & (O_APPEND | O_TRUNC)) == 0)
 				cap_rights_set(rightsp, CAP_SEEK);
 		}
 	}
 
 	if ((flags & O_CREAT) != 0)
 		cap_rights_set(rightsp, CAP_CREATE);
 
 	if ((flags & O_TRUNC) != 0)
 		cap_rights_set(rightsp, CAP_FTRUNCATE);
 
 	if ((flags & (O_SYNC | O_FSYNC)) != 0)
 		cap_rights_set(rightsp, CAP_FSYNC);
 
 	if ((flags & (O_EXLOCK | O_SHLOCK)) != 0)
 		cap_rights_set(rightsp, CAP_FLOCK);
 }
 
 /*
  * Check permissions, allocate an open file structure, and call the device
  * open routine if any.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct open_args {
 	char	*path;
 	int	flags;
 	int	mode;
 };
 #endif
 int
 sys_open(td, uap)
 	struct thread *td;
 	register struct open_args /* {
 		char *path;
 		int flags;
 		int mode;
 	} */ *uap;
 {
 
 	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct openat_args {
 	int	fd;
 	char	*path;
 	int	flag;
 	int	mode;
 };
 #endif
 int
 sys_openat(struct thread *td, struct openat_args *uap)
 {
 
 	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
 	    uap->mode));
 }
 
 int
 kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
     int mode)
 {
 
 	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
 }
 
 /*
  * In-kernel openat: open 'path' (interpreted according to 'pathseg')
  * relative to descriptor 'fd', and install the resulting file in the
  * process' descriptor table.
  *
  * On success returns 0 with the new descriptor number in
  * td->td_retval[0].  Returns EINVAL for an invalid combination of access
  * mode flags, otherwise the error from falloc_noinstall(), vn_open(),
  * dupfdopen(), fo_truncate() or finstall().
  */
 int
 kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
     int flags, int mode)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	struct file *fp;
 	struct vnode *vp;
 	struct nameidata nd;
 	cap_rights_t rights;
 	int cmode, error, indx;
 
 	/* -1 means "no descriptor installed yet"; see success/bad below. */
 	indx = -1;
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 	/* XXX: audit dirfd */
 	cap_rights_init(&rights, CAP_LOOKUP);
 	flags_to_rights(flags, &rights);
 	/*
 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
 	 * may be specified.
 	 */
 	if (flags & O_EXEC) {
 		if (flags & O_ACCMODE)
 			return (EINVAL);
 	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
 		return (EINVAL);
 	} else {
 		flags = FFLAGS(flags);
 	}
 
 	/*
 	 * Allocate the file descriptor, but don't install a descriptor yet.
 	 */
 	error = falloc_noinstall(td, &fp);
 	if (error != 0)
 		return (error);
 	/*
 	 * An extra reference on `fp' has been held for us by
 	 * falloc_noinstall().
 	 */
 	/* Set the flags early so the finit in devfs can pick them up. */
 	fp->f_flag = flags & FMASK;
 	/* Creation mode: apply the umask, keep permission bits, no S_ISTXT. */
 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
 	    &rights, td);
 	td->td_dupfd = -1;		/* XXX check for fdopen */
 	error = vn_open(&nd, &flags, cmode, fp);
 	if (error != 0) {
 		/*
 		 * If the vn_open replaced the method vector, something
 		 * wondrous happened deep below and we just pass it up
 		 * pretending we know what we do.
 		 */
 		if (error == ENXIO && fp->f_ops != &badfileops)
 			goto success;
 
 		/*
 		 * Handle special fdopen() case. bleh.
 		 *
 		 * Don't do this for relative (capability) lookups; we don't
 		 * understand exactly what would happen, and we don't think
 		 * that it ever should.
 		 */
 		if (nd.ni_strictrelative == 0 &&
 		    (error == ENODEV || error == ENXIO) &&
 		    td->td_dupfd >= 0) {
 			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
 			    &indx);
 			if (error == 0)
 				goto success;
 		}
 
 		goto bad;
 	}
 	td->td_dupfd = 0;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/*
 	 * Store the vnode, for any f_type. Typically, the vnode use
 	 * count is decremented by direct call to vn_closefile() for
 	 * files that switched type in the cdevsw fdopen() method.
 	 */
 	fp->f_vnode = vp;
 	/*
 	 * If the file wasn't claimed by devfs bind it to the normal
 	 * vnode operations here.
 	 */
 	if (fp->f_ops == &badfileops) {
 		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
 		fp->f_seqcount = 1;
 		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
 		    DTYPE_VNODE, vp, &vnops);
 	}
 
 	/* Unlock the vnode before the optional truncate through fp. */
 	VOP_UNLOCK(vp, 0);
 	if (flags & O_TRUNC) {
 		error = fo_truncate(fp, 0, td->td_ucred, td);
 		if (error != 0)
 			goto bad;
 	}
 success:
 	/*
 	 * If we haven't already installed the FD (for dupfdopen), do so now.
 	 */
 	if (indx == -1) {
 		struct filecaps *fcaps;
 
 #ifdef CAPABILITIES
 		if (nd.ni_strictrelative == 1)
 			fcaps = &nd.ni_filecaps;
 		else
 #endif
 			fcaps = NULL;
 		error = finstall(td, fp, &indx, flags, fcaps);
 		/* On success finstall() consumes fcaps. */
 		if (error != 0) {
 			filecaps_free(&nd.ni_filecaps);
 			goto bad;
 		}
 	} else {
 		filecaps_free(&nd.ni_filecaps);
 	}
 
 	/*
 	 * Release our private reference, leaving the one associated with
 	 * the descriptor table intact.
 	 */
 	fdrop(fp, td);
 	td->td_retval[0] = indx;
 	return (0);
 bad:
 	/* No descriptor was installed, so dropping fp is all that is left. */
 	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifdef COMPAT_43
 /*
  * Create a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ocreat_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 ocreat(td, uap)
 	struct thread *td;
 	register struct ocreat_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 
 	return (kern_open(td, uap->path, UIO_USERSPACE,
 	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
 }
 #endif /* COMPAT_43 */
 
 /*
  * Create a special file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mknod_args {
 	char	*path;
 	int	mode;
 	int	dev;
 };
 #endif
 int
 sys_mknod(td, uap)
 	struct thread *td;
 	register struct mknod_args /* {
 		char *path;
 		int mode;
 		int dev;
 	} */ *uap;
 {
 
 	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mknodat_args {
 	int	fd;
 	char	*path;
 	mode_t	mode;
 	dev_t	dev;
 };
 #endif
 int
 sys_mknodat(struct thread *td, struct mknodat_args *uap)
 {
 
 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
 	    uap->dev));
 }
 
 int
 kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
     int dev)
 {
 
 	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
 }
 
 /*
  * In-kernel mknodat: create a character/block device node, a "bad
  * sector" node (S_IFMT) or a whiteout at 'path' relative to 'fd'.
  * A FIFO request with dev == 0 is handed to kern_mkfifoat(); any other
  * file type yields EINVAL.
  *
  * Returns EEXIST if the name already exists, otherwise the error from
  * the privilege check, namei(), vn_start_write(), the MAC check (when
  * MAC is configured), VOP_WHITEOUT() or VOP_MKNOD().
  */
 int
 kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
     int mode, int dev)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct vattr vattr;
 	struct nameidata nd;
 	cap_rights_t rights;
 	int error, whiteout = 0;
 
 	AUDIT_ARG_MODE(mode);
 	AUDIT_ARG_DEV(dev);
 	/* Each creatable node type has its own privilege. */
 	switch (mode & S_IFMT) {
 	case S_IFCHR:
 	case S_IFBLK:
 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
 		break;
 	case S_IFMT:
 		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
 		break;
 	case S_IFWHT:
 		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
 		break;
 	case S_IFIFO:
 		if (dev == 0)
 			return (kern_mkfifoat(td, fd, path, pathseg, mode));
 		/* FALLTHROUGH */
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (error != 0)
 		return (error);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
 	    pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT), td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	if (vp != NULL) {
 		/* The name already exists: release everything and fail. */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(vp);
 		return (EEXIST);
 	} else {
 		VATTR_NULL(&vattr);
 		vattr.va_mode = (mode & ALLPERMS) &
 		    ~td->td_proc->p_fd->fd_cmask;
 		vattr.va_rdev = dev;
 		whiteout = 0;
 
 		switch (mode & S_IFMT) {
 		case S_IFMT:	/* used by badsect to flag bad sectors */
 			vattr.va_type = VBAD;
 			break;
 		case S_IFCHR:
 			vattr.va_type = VCHR;
 			break;
 		case S_IFBLK:
 			vattr.va_type = VBLK;
 			break;
 		case S_IFWHT:
 			whiteout = 1;
 			break;
 		default:
 			/* Other types were rejected by the switch above. */
 			panic("kern_mknod: invalid mode");
 		}
 	}
 	/*
 	 * If a write cannot be started right away, release the parent,
 	 * wait with vn_start_write(NULL, ..., V_XSLEEP | PCATCH) and redo
 	 * the lookup from scratch.
 	 */
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 #ifdef MAC
 	if (error == 0 && !whiteout)
 		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
 		    &nd.ni_cnd, &vattr);
 #endif
 	if (error == 0) {
 		if (whiteout)
 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
 		else {
 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
 						&nd.ni_cnd, &vattr);
 			/* The new vnode is not needed here; release it. */
 			if (error == 0)
 				vput(nd.ni_vp);
 		}
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Create a named pipe.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mkfifo_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_mkfifo(td, uap)
 	struct thread *td;
 	register struct mkfifo_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 
 	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mkfifoat_args {
 	int	fd;
 	char	*path;
 	mode_t	mode;
 };
 #endif
 int
 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
 {
 
 	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->mode));
 }
 
 int
 kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
 {
 
 	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
 }
 
 /*
  * Create a FIFO node named by path, looked up relative to fd.
  *
  * The name is looked up for CREATE with the parent directory locked
  * (LOCKPARENT).  If the name already resolves to a vnode, EEXIST is
  * returned.  Otherwise a VFIFO node is created through VOP_MKNOD() with
  * the mode masked by ALLPERMS and by the process's fd_cmask.
  *
  * Returns 0 on success or an errno value from namei(), vn_start_write(),
  * the MAC check (when compiled in) or VOP_MKNOD().
  */
 int
 kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
     int mode)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	struct nameidata nd;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_MODE(mode);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
 	    pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT), td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if (nd.ni_vp != NULL) {
 		/*
 		 * The name already exists.  Drop the path buffer and both
 		 * vnode references; when the leaf is the parent itself only
 		 * vrele() is used on the parent instead of vput().
 		 */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		/*
 		 * Could not start the write without waiting: release the
 		 * lookup results, wait with V_XSLEEP | PCATCH, then redo
 		 * the whole lookup from "restart".
 		 */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VFIFO;
 	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 	/* On success the new vnode is released right away with vput(). */
 	if (error == 0)
 		vput(nd.ni_vp);
 #ifdef MAC
 out:
 #endif
 	/* Common exit: release the parent, end the write, free the path. */
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error);
 }
 
 /*
  * Make a hard file link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct link_args {
 	char	*path;
 	char	*link;
 };
 #endif
 int
 sys_link(td, uap)
 	struct thread *td;
 	register struct link_args /* {
 		char *path;
 		char *link;
 	} */ *uap;
 {
 
 	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct linkat_args {
 	int	fd1;
 	char	*path1;
 	int	fd2;
 	char	*path2;
 	int	flag;
 };
 #endif
 /*
  * linkat system call entry point.  AT_SYMLINK_FOLLOW is the only flag
  * bit accepted; any other bit yields EINVAL.  The flag selects FOLLOW
  * or NOFOLLOW for the lookup performed by kern_linkat().
  */
 int
 sys_linkat(struct thread *td, struct linkat_args *uap)
 {
 	int follow;
 
 	if ((uap->flag & ~AT_SYMLINK_FOLLOW) != 0)
 		return (EINVAL);
 
 	if ((uap->flag & AT_SYMLINK_FOLLOW) != 0)
 		follow = FOLLOW;
 	else
 		follow = NOFOLLOW;
 	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
 	    UIO_USERSPACE, follow));
 }
 
 /*
  * Hard link policy knobs consulted by can_hardlink().  Both are
  * initialized to 0 and registered with SYSCTL_INT under _security_bsd
  * as read-write (CTLFLAG_RW) integers.  Note that hardlink_check_uid
  * has external linkage while hardlink_check_gid is static.
  */
 int hardlink_check_uid = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
     &hardlink_check_uid, 0,
     "Unprivileged processes cannot create hard links to files owned by other "
     "users");
 static int hardlink_check_gid = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
     &hardlink_check_gid, 0,
     "Unprivileged processes cannot create hard links to files owned by other "
     "groups");
 
 static int
 can_hardlink(struct vnode *vp, struct ucred *cred)
 {
 	struct vattr va;
 	int error;
 
 	if (!hardlink_check_uid && !hardlink_check_gid)
 		return (0);
 
 	error = VOP_GETATTR(vp, &va, cred);
 	if (error != 0)
 		return (error);
 
 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
 		if (error != 0)
 			return (error);
 	}
 
 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 int
 kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
 {
 
 	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
 }
 
 /*
  * Create a hard link path2 (relative to fd2) referring to the file
  * path1 (relative to fd1).  "follow" is OR-ed into the lookup flags for
  * path1.
  *
  * Returns EPERM if path1 names a directory, EEXIST if path2 already
  * exists, or otherwise the error from namei(), vn_start_write(),
  * vn_lock(), can_hardlink(), the MAC check (when compiled in) or
  * VOP_LINK().
  */
 int
 kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
     enum uio_seg segflg, int follow)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct nameidata nd;
 	cap_rights_t rights;
 	int error;
 
 	bwillwrite();
 	/* First lookup: the existing file that is to be linked. */
 	NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	if (vp->v_type == VDIR) {
 		vrele(vp);
 		return (EPERM);		/* POSIX */
 	}
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
 		vrele(vp);
 		return (error);
 	}
 	/* Second lookup: the new name, with its parent directory locked. */
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2,
 	    segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT), td);
 	if ((error = namei(&nd)) == 0) {
 		if (nd.ni_vp != NULL) {
 			/* Target name already exists. */
 			if (nd.ni_dvp == nd.ni_vp)
 				vrele(nd.ni_dvp);
 			else
 				vput(nd.ni_dvp);
 			vrele(nd.ni_vp);
 			error = EEXIST;
 		} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
 		    == 0) {
 			/*
 			 * NOTE(review): nd.ni_dvp is only released inside
 			 * this branch; if vn_lock() could fail here the
 			 * parent would stay locked and referenced.  Confirm
 			 * that LK_RETRY rules that out.
 			 */
 			error = can_hardlink(vp, td->td_ucred);
 			/*
 			 * With MAC the two "if (error == 0)" lines chain the
 			 * MAC check and then VOP_LINK(); without MAC the
 			 * first "if" guards VOP_LINK() directly.
 			 */
 			if (error == 0)
 #ifdef MAC
 				error = mac_vnode_check_link(td->td_ucred,
 				    nd.ni_dvp, vp, &nd.ni_cnd);
 			if (error == 0)
 #endif
 				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
 			VOP_UNLOCK(vp, 0);
 			vput(nd.ni_dvp);
 		}
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 	}
 	/* Drop the reference on the source and end the write started above. */
 	vrele(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Make a symbolic link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct symlink_args {
 	char	*path;
 	char	*link;
 };
 #endif
 int
 sys_symlink(td, uap)
 	struct thread *td;
 	register struct symlink_args /* {
 		char *path;
 		char *link;
 	} */ *uap;
 {
 
 	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct symlinkat_args {
 	char	*path1;
 	int	fd;
 	char	*path2;
 };
 #endif
 /*
  * symlinkat system call entry point: forwards path1 (the link contents),
  * fd and path2 (the name of the new link) to kern_symlinkat(), with the
  * strings marked as UIO_USERSPACE.
  *
  * The documented argument structure above names its first member
  * "path1" so that it agrees with the uap->path1 access below.
  */
 int
 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
 {
 
 	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
 	    UIO_USERSPACE));
 }
 
 int
 kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
 {
 
 	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
 }
 
 /*
  * Create a symbolic link named path2 (looked up relative to fd) whose
  * contents are the string path1.
  *
  * When segflg is not UIO_SYSSPACE, path1 is copied in with copyinstr()
  * into a buffer taken from namei_zone; that buffer is freed at "out".
  *
  * Returns EEXIST if path2 already exists, or otherwise the error from
  * copyinstr(), namei(), vn_start_write(), the MAC check (when compiled
  * in) or VOP_SYMLINK().
  */
 int
 kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
     enum uio_seg segflg)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	char *syspath;
 	struct nameidata nd;
 	int error;
 	cap_rights_t rights;
 
 	if (segflg == UIO_SYSSPACE) {
 		/* Use the caller's string directly. */
 		syspath = path1;
 	} else {
 		syspath = uma_zalloc(namei_zone, M_WAITOK);
 		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
 			goto out;
 	}
 	AUDIT_ARG_TEXT(syspath);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
 	    segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT), td);
 	if ((error = namei(&nd)) != 0)
 		goto out;
 	if (nd.ni_vp) {
 		/* The link name already exists: release everything. */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
 		error = EEXIST;
 		goto out;
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		/*
 		 * Could not start the write without waiting: release the
 		 * lookup results, wait, and redo the lookup.  syspath is
 		 * kept across the restart.
 		 */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto out;
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
 #ifdef MAC
 	/* va_type is only filled in here, for the benefit of the MAC check. */
 	vattr.va_type = VLNK;
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error != 0)
 		goto out2;
 #endif
 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
 	if (error == 0)
 		vput(nd.ni_vp);
 #ifdef MAC
 out2:
 #endif
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 out:
 	/* Free the copied-in link contents, if a copy was made. */
 	if (segflg != UIO_SYSSPACE)
 		uma_zfree(namei_zone, syspath);
 	return (error);
 }
 
 /*
  * Delete a whiteout from the filesystem.
  *
  * The path is looked up for DELETE with DOWHITEOUT and the parent
  * locked.  The call only proceeds when the lookup found no vnode and
  * the ISWHITEOUT flag is set in the component name; in every other
  * case EEXIST is returned.  The whiteout is removed with
  * VOP_WHITEOUT(..., DELETE).
  */
 int
 sys_undelete(td, uap)
 	struct thread *td;
 	register struct undelete_args /* {
 		char *path;
 	} */ *uap;
 {
 	struct mount *mp;
 	struct nameidata nd;
 	int error;
 
 restart:
 	bwillwrite();
 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 
 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
 		/* Not a whiteout: release the lookup results and fail. */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		if (nd.ni_vp)
 			vrele(nd.ni_vp);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		/*
 		 * Could not start the write without waiting: release the
 		 * parent, wait, and redo the lookup.
 		 */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Delete a name from the filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct unlink_args {
 	char	*path;
 };
 #endif
 int
 sys_unlink(td, uap)
 	struct thread *td;
 	struct unlink_args /* {
 		char *path;
 	} */ *uap;
 {
 
 	return (kern_unlink(td, uap->path, UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct unlinkat_args {
 	int	fd;
 	char	*path;
 	int	flag;
 };
 #endif
 /*
  * unlinkat system call entry point.  AT_REMOVEDIR is the only flag bit
  * accepted; any other bit yields EINVAL.  With AT_REMOVEDIR the request
  * is handled by kern_rmdirat(), otherwise by kern_unlinkat() with an
  * oldinum of 0.
  */
 int
 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
 {
 
 	if ((uap->flag & ~AT_REMOVEDIR) != 0)
 		return (EINVAL);
 
 	if ((uap->flag & AT_REMOVEDIR) == 0)
 		return (kern_unlinkat(td, uap->fd, uap->path,
 		    UIO_USERSPACE, 0));
 	return (kern_rmdirat(td, uap->fd, uap->path, UIO_USERSPACE));
 }
 
 int
 kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
 {
 
 	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
 }
 
 /*
  * Remove the name "path", looked up relative to fd.
  *
  * When oldinum is non-zero the file is first checked with vn_stat() and
  * the removal fails with EIDRM if its inode number differs from
  * oldinum.  When oldinum is zero, directories are refused with EPERM.
  * A vnode flagged VV_ROOT is refused with EBUSY.
  *
  * An EINVAL result from namei() is reported to the caller as EPERM.
  */
 int
 kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
     ino_t oldinum)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct nameidata nd;
 	struct stat sb;
 	cap_rights_t rights;
 	int error;
 
 restart:
 	bwillwrite();
 	/* Look up for DELETE with both the parent and the leaf locked. */
 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
 	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
 	if ((error = namei(&nd)) != 0)
 		return (error == EINVAL ? EPERM : error);
 	vp = nd.ni_vp;
 	if (vp->v_type == VDIR && oldinum == 0) {
 		error = EPERM;		/* POSIX */
 	} else if (oldinum != 0 &&
 		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
 		  sb.st_ino != oldinum) {
 			error = EIDRM;	/* Identifier removed */
 	} else {
 		/*
 		 * The root of a mounted filesystem cannot be deleted.
 		 *
 		 * XXX: can this only be a VDIR case?
 		 *
 		 * This branch is also reached when vn_stat() failed above;
 		 * its error is then kept unless overwritten by EBUSY.
 		 */
 		if (vp->v_vflag & VV_ROOT)
 			error = EBUSY;
 	}
 	if (error == 0) {
 		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 			/*
 			 * Could not start the write without waiting:
 			 * release both vnodes, wait, and redo the lookup.
 			 */
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			vput(nd.ni_dvp);
 			if (vp == nd.ni_dvp)
 				vrele(vp);
 			else
 				vput(vp);
 			if ((error = vn_start_write(NULL, &mp,
 			    V_XSLEEP | PCATCH)) != 0)
 				return (error);
 			goto restart;
 		}
 #ifdef MAC
 		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
 		    &nd.ni_cnd);
 		if (error != 0)
 			goto out;
 #endif
 		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
 		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 #ifdef MAC
 out:
 #endif
 		vn_finished_write(mp);
 	}
 	/*
 	 * Common exit: free the path buffer and release the parent and the
 	 * leaf; the leaf only needs vrele() when it is the parent itself.
 	 */
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	if (vp == nd.ni_dvp)
 		vrele(vp);
 	else
 		vput(vp);
 	return (error);
 }
 
 /*
  * Reposition read/write file offset.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lseek_args {
 	int	fd;
 	int	pad;
 	off_t	offset;
 	int	whence;
 };
 #endif
 int
 sys_lseek(td, uap)
 	struct thread *td;
 	register struct lseek_args /* {
 		int fd;
 		int pad;
 		off_t offset;
 		int whence;
 	} */ *uap;
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
 	if (error != 0)
 		return (error);
 	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
 	    fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
 	fdrop(fp, td);
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * COMPAT_43 variant of lseek taking a "long" offset: the arguments are
  * repacked into a struct lseek_args and passed to sys_lseek().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct olseek_args {
 	int	fd;
 	long	offset;
 	int	whence;
 };
 #endif
 int
 olseek(struct thread *td, struct olseek_args *uap)
 {
 	struct lseek_args args;
 
 	/* The pad member of the new-style arguments is left unset. */
 	args.whence = uap->whence;
 	args.offset = uap->offset;
 	args.fd = uap->fd;
 	return (sys_lseek(td, &args));
 }
 #endif /* COMPAT_43 */
 
 /* Version with the 'pad' argument */
 int
 freebsd6_lseek(td, uap)
 	struct thread *td;
 	register struct freebsd6_lseek_args *uap;
 {
 	struct lseek_args ouap;
 
 	ouap.fd = uap->fd;
 	ouap.offset = uap->offset;
 	ouap.whence = uap->whence;
 	return (sys_lseek(td, &ouap));
 }
 
 /*
  * Check access permissions using passed credentials.
  */
 static int
 vn_access(vp, user_flags, cred, td)
 	struct vnode	*vp;
 	int		user_flags;
 	struct ucred	*cred;
 	struct thread	*td;
 {
 	accmode_t accmode;
 	int error;
 
 	/* Flags == 0 means only check for existence. */
 	error = 0;
 	if (user_flags) {
 		accmode = 0;
 		if (user_flags & R_OK)
 			accmode |= VREAD;
 		if (user_flags & W_OK)
 			accmode |= VWRITE;
 		if (user_flags & X_OK)
 			accmode |= VEXEC;
 #ifdef MAC
 		error = mac_vnode_check_access(cred, vp, accmode);
 		if (error != 0)
 			return (error);
 #endif
 		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
 			error = VOP_ACCESS(vp, accmode, cred, td);
 	}
 	return (error);
 }
 
 /*
  * Check access permissions using "real" credentials.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct access_args {
 	char	*path;
 	int	amode;
 };
 #endif
 int
 sys_access(td, uap)
 	struct thread *td;
 	register struct access_args /* {
 		char *path;
 		int amode;
 	} */ *uap;
 {
 
 	return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct faccessat_args {
 	int	fd;
 	char	*path;
 	int	amode;
 	int	flag;
 };
 #endif
 /*
  * faccessat system call entry point.  AT_EACCESS is the only flag bit
  * accepted; any other bit yields EINVAL.  The request is forwarded to
  * kern_accessat() with the path marked as UIO_USERSPACE.
  *
  * The documented argument structure above names its descriptor member
  * "fd" (matching the uap->fd access below) and is terminated with a
  * semicolon so that it is a valid declaration.
  */
 int
 sys_faccessat(struct thread *td, struct faccessat_args *uap)
 {
 
 	if (uap->flag & ~AT_EACCESS)
 		return (EINVAL);
 	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
 	    uap->amode));
 }
 
 int
 kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
 {
 
 	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
 }
 
 /*
  * Check whether the file named by path (looked up relative to fd) is
  * accessible with the access mode amode.
  *
  * Unless AT_EACCESS is set in flag, the check is made with a temporary
  * copy of the thread's credential in which cr_uid and cr_groups[0] are
  * replaced by cr_ruid and cr_rgid.  With AT_EACCESS the thread's
  * credential is used unchanged.
  *
  * Returns 0 if access is granted, otherwise the error from namei() or
  * vn_access().
  */
 int
 kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
     int flag, int amode)
 {
 	struct ucred *cred, *tmpcred;
 	struct vnode *vp;
 	struct nameidata nd;
 	cap_rights_t rights;
 	int error;
 
 	/*
 	 * Create and modify a temporary credential instead of one that
 	 * is potentially shared.
 	 */
 	if (!(flag & AT_EACCESS)) {
 		cred = td->td_ucred;
 		tmpcred = crdup(cred);
 		tmpcred->cr_uid = cred->cr_ruid;
 		tmpcred->cr_groups[0] = cred->cr_rgid;
 		/* Installed on the thread so the lookup below uses it too. */
 		td->td_ucred = tmpcred;
 	} else
 		cred = tmpcred = td->td_ucred;
 	AUDIT_ARG_VALUE(amode);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
 	    AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
 	    td);
 	if ((error = namei(&nd)) != 0)
 		goto out1;
 	vp = nd.ni_vp;
 
 	error = vn_access(vp, amode, tmpcred, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 out1:
 	/* Restore the original credential and free the temporary copy. */
 	if (!(flag & AT_EACCESS)) {
 		td->td_ucred = cred;
 		crfree(tmpcred);
 	}
 	return (error);
 }
 
 /*
  * Check access permissions using "effective" credentials.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct eaccess_args {
 	char	*path;
 	int	amode;
 };
 #endif
 int
 sys_eaccess(td, uap)
 	struct thread *td;
 	register struct eaccess_args /* {
 		char *path;
 		int amode;
 	} */ *uap;
 {
 
 	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
 }
 
 int
 kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
 {
 
 	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
 }
 
 #if defined(COMPAT_43)
 /*
  * Get file status; this version follows links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ostat_args {
 	char	*path;
 	struct ostat *ub;
 };
 #endif
 int
 ostat(td, uap)
 	struct thread *td;
 	register struct ostat_args /* {
 		char *path;
 		struct ostat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct ostat osb;
 	int error;
 
 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 	cvtstat(&sb, &osb);
 	return (copyout(&osb, uap->ub, sizeof (osb)));
 }
 
 /*
  * Get file status; this version does not follow links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct olstat_args {
 	char	*path;
 	struct ostat *ub;
 };
 #endif
 int
 olstat(td, uap)
 	struct thread *td;
 	register struct olstat_args /* {
 		char *path;
 		struct ostat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct ostat osb;
 	int error;
 
 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 	cvtstat(&sb, &osb);
 	return (copyout(&osb, uap->ub, sizeof (osb)));
 }
 
 /*
  * Fill in the old-format structure *ost from the current-format
  * structure *st, member by member.  A size that does not fit below
  * 1 << 32 is reported as -2.
  */
 void
 cvtstat(struct stat *st, struct ostat *ost)
 {
 
 	ost->st_dev = st->st_dev;
 	ost->st_ino = st->st_ino;
 	ost->st_mode = st->st_mode;
 	ost->st_nlink = st->st_nlink;
 	ost->st_uid = st->st_uid;
 	ost->st_gid = st->st_gid;
 	ost->st_rdev = st->st_rdev;
 	if (st->st_size >= ((quad_t)1 << 32))
 		ost->st_size = -2;
 	else
 		ost->st_size = st->st_size;
 	ost->st_atim = st->st_atim;
 	ost->st_mtim = st->st_mtim;
 	ost->st_ctim = st->st_ctim;
 	ost->st_blksize = st->st_blksize;
 	ost->st_blocks = st->st_blocks;
 	ost->st_flags = st->st_flags;
 	ost->st_gen = st->st_gen;
 }
 #endif /* COMPAT_43 */
 
 /*
  * stat system call entry point: get file status; this version follows
  * links.  The result of kern_stat() is copied out to uap->ub.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct stat_args {
 	char	*path;
 	struct stat *ub;
 };
 #endif
 int
 sys_stat(struct thread *td, struct stat_args *uap)
 {
 	struct stat sb;
 	int error;
 
 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 	return (copyout(&sb, uap->ub, sizeof(sb)));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fstatat_args {
 	int	fd;
 	char	*path;
 	struct stat	*buf;
 	int	flag;
 };
 #endif
 /*
  * fstatat system call entry point: obtains the status with
  * kern_statat() and, on success, copies it out to uap->buf.
  *
  * The documented argument structure above is terminated with a
  * semicolon so that it is a valid declaration.
  */
 int
 sys_fstatat(struct thread *td, struct fstatat_args *uap)
 {
 	struct stat sb;
 	int error;
 
 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
 	    UIO_USERSPACE, &sb);
 	if (error == 0)
 		error = copyout(&sb, uap->buf, sizeof (sb));
 	return (error);
 }
 
 int
 kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
 {
 
 	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
 }
 
 int
 kern_statat(struct thread *td, int flag, int fd, char *path,
     enum uio_seg pathseg, struct stat *sbp)
 {
 
 	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
 }
 
 /*
  * Get the status of the file named by path, looked up relative to fd,
  * and store it in *sbp.
  *
  * AT_SYMLINK_NOFOLLOW is the only bit accepted in flag (EINVAL
  * otherwise); it selects NOFOLLOW instead of FOLLOW for the lookup.
  * If hook is not NULL it is called with the still-locked vnode and the
  * freshly filled stat buffer after a successful vn_stat().
  *
  * *sbp is only written on success.
  */
 int
 kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
     enum uio_seg pathseg, struct stat *sbp,
     void (*hook)(struct vnode *vp, struct stat *sbp))
 {
 	struct nameidata nd;
 	struct stat sb;
 	cap_rights_t rights;
 	int error;
 
 	if (flag & ~AT_SYMLINK_NOFOLLOW)
 		return (EINVAL);
 
 	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
 	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
 	    cap_rights_init(&rights, CAP_FSTAT), td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
 	if (error == 0) {
 		/* Tracing probes; a second one fires for regular files. */
 		SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
 		if (S_ISREG(sb.st_mode))
 			SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
 		if (__predict_false(hook != NULL))
 			hook(nd.ni_vp, &sb);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_vp);
 	if (error != 0)
 		return (error);
 	/* Publish the result only after the vnode has been released. */
 	*sbp = sb;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrstat(&sb);
 #endif
 	return (0);
 }
 
 /*
  * lstat system call entry point: get file status; this version does
  * not follow links.  The result of kern_lstat() is copied out to
  * uap->ub.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lstat_args {
 	char	*path;
 	struct stat *ub;
 };
 #endif
 int
 sys_lstat(struct thread *td, struct lstat_args *uap)
 {
 	struct stat sb;
 	int error;
 
 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 	return (copyout(&sb, uap->ub, sizeof(sb)));
 }
 
 int
 kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
 {
 
 	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
 	    sbp));
 }
 
 /*
  * Implementation of the NetBSD [l]stat() functions.
  *
  * Zero *nsb and then fill it in from *sb, member by member.
  */
 void
 cvtnstat(struct stat *sb, struct nstat *nsb)
 {
 
 	bzero(nsb, sizeof(*nsb));
 	nsb->st_dev = sb->st_dev;
 	nsb->st_ino = sb->st_ino;
 	nsb->st_mode = sb->st_mode;
 	nsb->st_nlink = sb->st_nlink;
 	nsb->st_uid = sb->st_uid;
 	nsb->st_gid = sb->st_gid;
 	nsb->st_rdev = sb->st_rdev;
 	nsb->st_atim = sb->st_atim;
 	nsb->st_mtim = sb->st_mtim;
 	nsb->st_ctim = sb->st_ctim;
 	nsb->st_birthtim = sb->st_birthtim;
 	nsb->st_size = sb->st_size;
 	nsb->st_blocks = sb->st_blocks;
 	nsb->st_blksize = sb->st_blksize;
 	nsb->st_flags = sb->st_flags;
 	nsb->st_gen = sb->st_gen;
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct nstat_args {
 	char	*path;
 	struct nstat *ub;
 };
 #endif
 int
 sys_nstat(td, uap)
 	struct thread *td;
 	register struct nstat_args /* {
 		char *path;
 		struct nstat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct nstat nsb;
 	int error;
 
 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 	cvtnstat(&sb, &nsb);
 	return (copyout(&nsb, uap->ub, sizeof (nsb)));
 }
 
 /*
  * NetBSD lstat.  Get file status; this version does not follow links.
  *
  * The status from kern_lstat() is converted with cvtnstat() and copied
  * out to uap->ub.
  *
  * The documented argument structure is named nlstat_args and carries a
  * struct nstat pointer, matching the type of uap and the copyout of a
  * struct nstat below.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nlstat_args {
 	char	*path;
 	struct nstat *ub;
 };
 #endif
 int
 sys_nlstat(td, uap)
 	struct thread *td;
 	register struct nlstat_args /* {
 		char *path;
 		struct nstat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct nstat nsb;
 	int error;
 
 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 	cvtnstat(&sb, &nsb);
 	return (copyout(&nsb, uap->ub, sizeof (nsb)));
 }
 
 /*
  * Get configurable pathname variables.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pathconf_args {
 	char	*path;
 	int	name;
 };
 #endif
 int
 sys_pathconf(td, uap)
 	struct thread *td;
 	register struct pathconf_args /* {
 		char *path;
 		int name;
 	} */ *uap;
 {
 
 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct lpathconf_args {
 	char	*path;
 	int	name;
 };
 #endif
 int
 sys_lpathconf(td, uap)
 	struct thread *td;
 	register struct lpathconf_args /* {
 		char *path;
 		int name;
 	} */ *uap;
 {
 
 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
 	    NOFOLLOW));
 }
 
 /*
  * Look up path and return the value of the configurable variable
  * "name" in td->td_retval.  "flags" is OR-ed into the lookup flags.
  *
  * _PC_ASYNC_IO is answered here with async_io_version; every other
  * name is passed to VOP_PATHCONF().
  */
 int
 kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
     u_long flags)
 {
 	struct nameidata nd;
 	struct vnode *vp;
 	int error;
 
 	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
 	    pathseg, path, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/* If asynchronous I/O is available, it works for all files. */
 	if (name != _PC_ASYNC_IO)
 		error = VOP_PATHCONF(vp, name, td->td_retval);
 	else
 		td->td_retval[0] = async_io_version;
 	vput(vp);
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct readlink_args {
 	char	*path;
 	char	*buf;
 	size_t	count;
 };
 #endif
 int
 sys_readlink(td, uap)
 	struct thread *td;
 	register struct readlink_args /* {
 		char *path;
 		char *buf;
 		size_t count;
 	} */ *uap;
 {
 
 	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
 	    UIO_USERSPACE, uap->count));
 }
 #ifndef _SYS_SYSPROTO_H_
 struct readlinkat_args {
 	int	fd;
 	char	*path;
 	char	*buf;
 	size_t	bufsize;
 };
 #endif
 int
 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
 {
 
 	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->buf, UIO_USERSPACE, uap->bufsize));
 }
 
 /*
  * Read a symbolic link; implemented as kern_readlinkat() with AT_FDCWD
  * as the directory descriptor.
  */
 int
 kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
     enum uio_seg bufseg, size_t count)
 {
 	int error;
 
 	error = kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
 	    count);
 	return (error);
 }
 
 /*
  * Read the contents of the symbolic link named by path (looked up
  * relative to fd, without following the final link) into buf.
  *
  * At most count bytes are read; the number of bytes actually placed in
  * buf is stored in td->td_retval[0].  bufseg gives the address space
  * of buf.
  *
  * Returns EINVAL if count exceeds IOSIZE_MAX or if the vnode is not a
  * symbolic link, or otherwise the error from namei(), the MAC check
  * (when compiled in) or VOP_READLINK().
  */
 int
 kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
     char *buf, enum uio_seg bufseg, size_t count)
 {
 	struct vnode *vp;
 	struct iovec aiov;
 	struct uio auio;
 	struct nameidata nd;
 	int error;
 
 	if (count > IOSIZE_MAX)
 		return (EINVAL);
 
 	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 	    pathseg, path, fd, td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 #ifdef MAC
 	error = mac_vnode_check_readlink(td->td_ucred, vp);
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 #endif
 	if (vp->v_type != VLNK)
 		error = EINVAL;
 	else {
 		aiov.iov_base = buf;
 		aiov.iov_len = count;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = bufseg;
 		auio.uio_td = td;
 		auio.uio_resid = count;
 		error = VOP_READLINK(vp, &auio, td->td_ucred);
 		/*
 		 * Compute the byte count only here, where auio has been
 		 * set up.  On the non-VLNK path auio is uninitialized and
 		 * must not be read.
 		 */
 		td->td_retval[0] = count - auio.uio_resid;
 	}
 	vput(vp);
 	return (error);
 }
 
 /*
  * Common implementation code for chflags() and fchflags().
  *
  * Sets the file flags of vp to "flags" through VOP_SETATTR() with the
  * vnode exclusively locked.  Returns EOPNOTSUPP when flags equals
  * VNOVAL, or otherwise the error from priv_check(), vn_start_write(),
  * the MAC check (when compiled in) or VOP_SETATTR().
  */
 static int
 setfflags(struct thread *td, struct vnode *vp, u_long flags)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 
 	/* We can't support the value matching VNOVAL. */
 	if (flags == VNOVAL)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Prevent non-root users from setting flags on devices.  When
 	 * a device is reused, users can retain ownership of the device
 	 * if they are allowed to set flags and programs assume that
 	 * chown can't fail when done as root.
 	 */
 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
 		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
 		if (error != 0)
 			return (error);
 	}
 
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error != 0)
 		return (error);
 	VATTR_NULL(&vattr);
 	vattr.va_flags = flags;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef MAC
 	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 #ifdef MAC
 out:
 #endif
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Change flags of a file given a path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chflags_args {
 	const char *path;
 	u_long	flags;
 };
 #endif
 int
 sys_chflags(td, uap)
 	struct thread *td;
 	register struct chflags_args /* {
 		const char *path;
 		u_long flags;
 	} */ *uap;
 {
 
 	return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct chflagsat_args {
 	int	fd;
 	const char *path;
 	u_long	flags;
 	int	atflag;
 };
 #endif
 /*
  * chflagsat system call entry point.  AT_SYMLINK_NOFOLLOW is the only
  * bit accepted in atflag; any other bit yields EINVAL.  The request is
  * forwarded to kern_chflagsat() with the path marked as UIO_USERSPACE.
  *
  * The documented argument structure above is terminated with a
  * semicolon so that it is a valid declaration.
  */
 int
 sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
 {
 	int fd = uap->fd;
 	const char *path = uap->path;
 	u_long flags = uap->flags;
 	int atflag = uap->atflag;
 
 	if (atflag & ~AT_SYMLINK_NOFOLLOW)
 		return (EINVAL);
 
 	return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
 }
 
 /*
  * Change file flags by path; implemented as kern_chflagsat() with
  * AT_FDCWD as the directory descriptor and no at-flags.
  */
 static int
 kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
     u_long flags)
 {
 	int error;
 
 	error = kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0);
 	return (error);
 }
 
 /*
  * Same as chflags() but doesn't follow symlinks.
  */
 int
 sys_lchflags(td, uap)
 	struct thread *td;
 	register struct lchflags_args /* {
 		const char *path;
 		u_long flags;
 	} */ *uap;
 {
 
 	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->flags, AT_SYMLINK_NOFOLLOW));
 }
 
 /*
  * Change the flags of the file named by path, looked up relative to
  * fd.  AT_SYMLINK_NOFOLLOW in atflag selects NOFOLLOW for the lookup,
  * otherwise FOLLOW is used.  The actual change is made by setfflags().
  */
 static int
 kern_chflagsat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, u_long flags, int atflag)
 {
 	struct nameidata nd;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error, follow;
 
 	AUDIT_ARG_FFLAGS(flags);
 	if ((atflag & AT_SYMLINK_NOFOLLOW) != 0)
 		follow = NOFOLLOW;
 	else
 		follow = FOLLOW;
 	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
 	    cap_rights_init(&rights, CAP_FCHFLAGS), td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	error = setfflags(td, vp, flags);
 	vrele(vp);
 	return (error);
 }
 
 /*
  * Change flags of a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchflags_args {
 	int	fd;
 	u_long	flags;
 };
 #endif
 int
 sys_fchflags(td, uap)
 	struct thread *td;
 	register struct fchflags_args /* {
 		int fd;
 		u_long flags;
 	} */ *uap;
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_FFLAGS(uap->flags);
 	error = getvnode(td->td_proc->p_fd, uap->fd,
 	    cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
 	if (error != 0)
 		return (error);
 #ifdef AUDIT
 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(fp->f_vnode);
 	VOP_UNLOCK(fp->f_vnode, 0);
 #endif
 	error = setfflags(td, fp->f_vnode, uap->flags);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation code for chmod(), lchmod() and fchmod().
  */
 int
 setfmode(td, cred, vp, mode)
 	struct thread *td;
 	struct ucred *cred;
 	struct vnode *vp;
 	int mode;
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VATTR_NULL(&vattr);
 	vattr.va_mode = mode & ALLPERMS;
 #ifdef MAC
 	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, cred);
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Change mode of a file given path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chmod_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_chmod(td, uap)
 	struct thread *td;
 	register struct chmod_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 
 	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fchmodat_args {
 	int	fd;
 	char	*path;
 	mode_t	mode;
 	int	flag;
 };
 #endif
 /*
  * fchmodat system call entry point.  AT_SYMLINK_NOFOLLOW is the only
  * flag bit accepted; any other bit yields EINVAL.  The request is
  * forwarded to kern_fchmodat() with the path marked as UIO_USERSPACE.
  *
  * The documented argument structure above names its descriptor member
  * "fd" (matching the uap->fd access below) and is terminated with a
  * semicolon so that it is a valid declaration.
  */
 int
 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
 {
 	int flag = uap->flag;
 	int fd = uap->fd;
 	char *path = uap->path;
 	mode_t mode = uap->mode;
 
 	if (flag & ~AT_SYMLINK_NOFOLLOW)
 		return (EINVAL);
 
 	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
 }
 
 int
 kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
 {
 
 	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
 }
 
 /*
  * Change mode of a file given path name (don't follow links.)
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchmod_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_lchmod(td, uap)
 	struct thread *td;
 	register struct lchmod_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 
 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->mode, AT_SYMLINK_NOFOLLOW));
 }
 
 /*
  * Change the mode of the file named by path, looked up relative to fd.
  * AT_SYMLINK_NOFOLLOW in flag selects NOFOLLOW for the lookup,
  * otherwise FOLLOW is used.  The actual change is made by setfmode()
  * with the thread's credential.
  */
 int
 kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
     mode_t mode, int flag)
 {
 	struct nameidata nd;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error, follow;
 
 	AUDIT_ARG_MODE(mode);
 	if ((flag & AT_SYMLINK_NOFOLLOW) != 0)
 		follow = NOFOLLOW;
 	else
 		follow = FOLLOW;
 	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
 	    cap_rights_init(&rights, CAP_FCHMOD), td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	error = setfmode(td, td->td_ucred, vp, mode);
 	vrele(vp);
 	return (error);
 }
 
 /*
  * fchmod system call entry point: change mode of a file given a file
  * descriptor.
  *
  * The descriptor is fetched with the CAP_FCHMOD right and the change
  * is delegated to fo_chmod() with the thread's credential.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchmod_args {
 	int	fd;
 	int	mode;
 };
 #endif
 int
 sys_fchmod(struct thread *td, struct fchmod_args *uap)
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_MODE(uap->mode);
 
 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
 	if (error == 0) {
 		error = fo_chmod(fp, uap->mode, td->td_ucred, td);
 		fdrop(fp, td);
 	}
 	return (error);
 }
 
 /*
  * Common implementation for chown(), lchown(), and fchown()
  *
  * Sets the owner and group of vp to uid and gid through VOP_SETATTR()
  * with the vnode exclusively locked, using cred for the MAC check
  * (when compiled in) and for the attribute change.
  */
 int
 setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
     gid_t gid)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error != 0)
 		return (error);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VATTR_NULL(&vattr);
 	vattr.va_uid = uid;
 	vattr.va_gid = gid;
 #ifdef MAC
 	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
 	    vattr.va_gid);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_SETATTR(vp, &vattr, cred);
 #ifdef MAC
 out:
 #endif
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Set ownership given a path name.  The path is taken from user space and
  * the work is done by kern_chown().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chown_args {
 	char	*path;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 sys_chown(struct thread *td, struct chown_args *uap)
 {
 
 	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fchownat_args {
 	int fd;
 	const char * path;
 	uid_t uid;
 	gid_t gid;
 	int flag;
 };
 #endif
 /*
  * Set ownership given a path name resolved relative to a descriptor.
  * AT_SYMLINK_NOFOLLOW is the only flag bit accepted; any other bit set in
  * uap->flag yields EINVAL.  The work is done by kern_fchownat().
  */
 int
 sys_fchownat(struct thread *td, struct fchownat_args *uap)
 {
 
 	if ((uap->flag & ~AT_SYMLINK_NOFOLLOW) != 0)
 		return (EINVAL);
 
 	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
 	    uap->gid, uap->flag));
 }
 
 int
 kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
     int gid)
 {
 
 	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
 }
 
 int
 kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
     int uid, int gid, int flag)
 {
 	struct nameidata nd;
 	cap_rights_t rights;
 	int error, follow;
 
 	AUDIT_ARG_OWNER(uid, gid);
 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
 	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
 	    cap_rights_init(&rights, CAP_FCHOWN), td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Set ownership given a path name, do not cross symlinks.  The path is
  * taken from user space and the work is done by kern_lchown().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchown_args {
 	char	*path;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 sys_lchown(struct thread *td, struct lchown_args *uap)
 {
 
 	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
 }
 
 int
 kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
     int gid)
 {
 
 	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
 	    AT_SYMLINK_NOFOLLOW));
 }
 
 /*
  * Set ownership given a file descriptor.
  *
  * The descriptor is looked up with the CAP_FCHOWN right and the change is
  * delegated to the file's fo_chown() method.  The file reference taken by
  * fget() is dropped before returning.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchown_args {
 	int	fd;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 sys_fchown(struct thread *td, struct fchown_args *uap)
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_OWNER(uap->uid, uap->gid);
 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
 	if (error != 0)
 		return (error);
 	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation code for utimes(), lutimes(), and futimes().
  */
 static int
 getutimes(usrtvp, tvpseg, tsp)
 	const struct timeval *usrtvp;
 	enum uio_seg tvpseg;
 	struct timespec *tsp;
 {
 	struct timeval tv[2];
 	const struct timeval *tvp;
 	int error;
 
 	if (usrtvp == NULL) {
 		vfs_timestamp(&tsp[0]);
 		tsp[1] = tsp[0];
 	} else {
 		if (tvpseg == UIO_SYSSPACE) {
 			tvp = usrtvp;
 		} else {
 			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
 				return (error);
 			tvp = tv;
 		}
 
 		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
 		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
 			return (EINVAL);
 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
 	}
 	return (0);
 }
 
 /*
  * Common implementation code for utimes(), lutimes(), and futimes():
  * apply the timestamps in 'ts' to 'vp' through VOP_SETATTR().
  *
  * ts[0] becomes the access time and ts[1] the modification time.  When
  * 'numtimes' is greater than 2, ts[2] becomes the birth time.  When it is
  * less than 3 and the vnode's current birth time (if VOP_GETATTR()
  * succeeds) is later than ts[1], the birth time is set to ts[1] as well.
  * A non-zero 'nullflag' sets VA_UTIMES_NULL in the attribute flags.
  * The vnode is locked exclusively for the update and unlocked on return.
  */
 static int
 setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts,
     int numtimes, int nullflag)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error, setbirthtime;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	setbirthtime = 0;
 	/* vattr is only a scratch buffer here; it is reset just below. */
 	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
 	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
 		setbirthtime = 1;
 	VATTR_NULL(&vattr);
 	vattr.va_atime = ts[0];
 	vattr.va_mtime = ts[1];
 	if (setbirthtime)
 		vattr.va_birthtime = ts[1];
 	if (numtimes > 2)
 		vattr.va_birthtime = ts[2];
 	if (nullflag)
 		vattr.va_vaflags |= VA_UTIMES_NULL;
 	/* 'error' is 0 at this point: vn_start_write() succeeded above. */
 #ifdef MAC
 	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
 	    vattr.va_mtime);
 #endif
 	if (error == 0)
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file.  Both the path and the
  * timeval array are taken from user space; the work is done by
  * kern_utimes().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct utimes_args {
 	char	*path;
 	struct	timeval *tptr;
 };
 #endif
 int
 sys_utimes(struct thread *td, struct utimes_args *uap)
 {
 
 	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
 	    UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct futimesat_args {
 	int fd;
 	const char * path;
 	const struct timeval * times;
 };
 #endif
 int
 sys_futimesat(struct thread *td, struct futimesat_args *uap)
 {
 
 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->times, UIO_USERSPACE));
 }
 
 int
 kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
     struct timeval *tptr, enum uio_seg tptrseg)
 {
 
 	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
 }
 
 int
 kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
     struct timeval *tptr, enum uio_seg tptrseg)
 {
 	struct nameidata nd;
 	struct timespec ts[2];
 	cap_rights_t rights;
 	int error;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
 	    cap_rights_init(&rights, CAP_FUTIMES), td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file named by path; the
  * work is done by kern_lutimes(), which looks the path up with NOFOLLOW.
  * Both the path and the timeval array are taken from user space.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lutimes_args {
 	char	*path;
 	struct	timeval *tptr;
 };
 #endif
 int
 sys_lutimes(struct thread *td, struct lutimes_args *uap)
 {
 
 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
 	    UIO_USERSPACE));
 }
 
 int
 kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
     struct timeval *tptr, enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	struct nameidata nd;
 	int error;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file given a file
  * descriptor.  The timeval array is taken from user space; the work is
  * done by kern_futimes().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct futimes_args {
 	int	fd;
 	struct	timeval *tptr;
 };
 #endif
 int
 sys_futimes(struct thread *td, struct futimes_args *uap)
 {
 
 	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
 }
 
 int
 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
     enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = getutimes(tptr, tptrseg, ts);
 	if (error != 0)
 		return (error);
 	error = getvnode(td->td_proc->p_fd, fd,
 	    cap_rights_init(&rights, CAP_FUTIMES), &fp);
 	if (error != 0)
 		return (error);
 #ifdef AUDIT
 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(fp->f_vnode);
 	VOP_UNLOCK(fp->f_vnode, 0);
 #endif
 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Truncate a file given its path name.  The path is taken from user space
  * and the work is done by kern_truncate(); the 'pad' member is not used
  * here.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct truncate_args {
 	char	*path;
 	int	pad;
 	off_t	length;
 };
 #endif
 int
 sys_truncate(struct thread *td, struct truncate_args *uap)
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 }
 
 /*
  * Set the size of the file named by 'path' to 'length'.
  *
  * Returns EINVAL for a negative length, EISDIR when the path names a
  * directory, and otherwise the first error from the lookup, the write
  * suspension check, the MAC check (when compiled in), vn_writechk(),
  * VOP_ACCESS(VWRITE) or VOP_SETATTR().
  *
  * Acquisition order is: range lock over [0, OFF_MAX), vn_start_write(),
  * exclusive vnode lock.  They are released in the reverse order on the
  * way out.
  */
 int
 kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	void *rl_cookie;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error;
 
 	if (length < 0)
 		return(EINVAL);
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
 		/* Undo the range lock and the lookup reference. */
 		vn_rangelock_unlock(vp, rl_cookie);
 		vrele(vp);
 		return (error);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VDIR)
 		error = EISDIR;
 #ifdef MAC
 	/* Empty body: a MAC failure just leaves 'error' set and falls out. */
 	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
 	}
 #endif
 	else if ((error = vn_writechk(vp)) == 0 &&
 	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
 		/* Only va_size is set; every other attribute stays "null". */
 		VATTR_NULL(&vattr);
 		vattr.va_size = length;
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 	}
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 	vn_rangelock_unlock(vp, rl_cookie);
 	vrele(vp);
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Truncate a file given its path name (old argument layout with a 'long'
  * length and no pad).  The arguments are repackaged into a truncate_args
  * and passed to sys_truncate(); the 'pad' member is left unset because
  * sys_truncate() only reads 'path' and 'length'.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct otruncate_args {
 	char	*path;
 	long	length;
 };
 #endif
 int
 otruncate(struct thread *td, struct otruncate_args *uap)
 {
 	struct truncate_args nuap;
 
 	nuap.path = uap->path;
 	nuap.length = uap->length;
 	return (sys_truncate(td, &nuap));
 }
 #endif /* COMPAT_43 */
 
 /* Versions with the pad argument */
 int
 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
 {
 	struct truncate_args ouap;
 
 	ouap.path = uap->path;
 	ouap.length = uap->length;
 	return (sys_truncate(td, &ouap));
 }
 
 int
 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
 {
 	struct ftruncate_args ouap;
 
 	ouap.fd = uap->fd;
 	ouap.length = uap->length;
 	return (sys_ftruncate(td, &ouap));
 }
 
 /*
  * Sync an open file.
  *
  * The descriptor is resolved by getvnode() with the CAP_FSYNC right.  The
  * vnode is locked shared when the mount reported by vn_start_write() (or,
  * if that is NULL, the vnode's own mount) allows shared writes, and
  * exclusively otherwise.  If the vnode has a VM object, its pages are
  * cleaned with vm_object_page_clean() before VOP_FSYNC(MNT_WAIT) is
  * called.  The file reference is dropped on every path.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fsync_args {
 	int	fd;
 };
 #endif
 int
 sys_fsync(struct thread *td, struct fsync_args *uap)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct file *fp;
 	cap_rights_t rights;
 	int error, lock_flags;
 
 	AUDIT_ARG_FD(uap->fd);
 	error = getvnode(td->td_proc->p_fd, uap->fd,
 	    cap_rights_init(&rights, CAP_FSYNC), &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error != 0)
 		goto drop;
 	if (MNT_SHARED_WRITES(mp) ||
 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
 		lock_flags = LK_SHARED;
 	} else {
 		lock_flags = LK_EXCLUSIVE;
 	}
 	vn_lock(vp, lock_flags | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	if (vp->v_object != NULL) {
 		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(vp->v_object);
 	}
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 drop:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Rename files.  Source and destination must either both be directories, or
  * both not be directories.  If target is a directory, it must be empty.
  *
  * Both paths are taken from user space; the work is done by kern_rename().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rename_args {
 	char	*from;
 	char	*to;
 };
 #endif
 int
 sys_rename(struct thread *td, struct rename_args *uap)
 {
 
 	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct renameat_args {
 	int	oldfd;
 	char	*old;
 	int	newfd;
 	char	*new;
 };
 #endif
 int
 sys_renameat(struct thread *td, struct renameat_args *uap)
 {
 
 	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
 	    UIO_USERSPACE));
 }
 
 int
 kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
 {
 
 	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
 }
 
 /*
  * Rename 'old' (resolved relative to 'oldfd') to 'new' (resolved relative
  * to 'newfd').
  *
  * The source is looked up first (DELETE), then the target (RENAME), and the
  * pair is handed to VOP_RENAME() when all checks pass.  Checks made here:
  *  - ENOTDIR / EISDIR when the source and an existing target disagree on
  *    being a directory;
  *  - with CAPABILITIES, CAP_UNLINKAT is required on 'newfd' when the target
  *    already exists and 'newfd' is not AT_FDCWD;
  *  - EINVAL when the source vnode is the target's parent directory;
  *  - MAC rename_from / rename_to checks when MAC is compiled in.
  *
  * Internally 'error' is set to -1 when source and target are the same
  * vnode; that skips VOP_RENAME(), runs the normal cleanup, and is turned
  * into a return value of 0 at the end.
  */
 int
 kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
     enum uio_seg pathseg)
 {
 	struct mount *mp = NULL;
 	struct vnode *tvp, *fvp, *tdvp;
 	struct nameidata fromnd, tond;
 	cap_rights_t rights;
 	int error;
 
 	bwillwrite();
 	/*
 	 * With MAC the source lookup returns parent and leaf locked; they
 	 * are unlocked again right after the rename_from check below.
 	 */
 #ifdef MAC
 	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
 	    AUDITVNODE1, pathseg, old, oldfd,
 	    cap_rights_init(&rights, CAP_RENAMEAT), td);
 #else
 	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
 	    pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
 #endif
 
 	if ((error = namei(&fromnd)) != 0)
 		return (error);
 #ifdef MAC
 	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
 	    fromnd.ni_vp, &fromnd.ni_cnd);
 	VOP_UNLOCK(fromnd.ni_dvp, 0);
 	if (fromnd.ni_dvp != fromnd.ni_vp)
 		VOP_UNLOCK(fromnd.ni_vp, 0);
 #endif
 	fvp = fromnd.ni_vp;
 	if (error == 0)
 		error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
 	if (error != 0) {
 		/* Only the source lookup has happened: release its vnodes. */
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 		goto out1;
 	}
 	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
 	    SAVESTART | AUDITVNODE2, pathseg, new, newfd,
 	    cap_rights_init(&rights, CAP_LINKAT), td);
 	if (fromnd.ni_vp->v_type == VDIR)
 		tond.ni_cnd.cn_flags |= WILLBEDIR;
 	if ((error = namei(&tond)) != 0) {
 		/* Translate error code for rename("dir1", "dir2/."). */
 		if (error == EISDIR && fvp->v_type == VDIR)
 			error = EINVAL;
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 		vn_finished_write(mp);
 		goto out1;
 	}
 	tdvp = tond.ni_dvp;
 	tvp = tond.ni_vp;
 	/* tvp is NULL when the target name does not exist yet. */
 	if (tvp != NULL) {
 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto out;
 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
 			error = EISDIR;
 			goto out;
 		}
 #ifdef CAPABILITIES
 		if (newfd != AT_FDCWD) {
 			/*
 			 * If the target already exists we require CAP_UNLINKAT
 			 * from 'newfd'.
 			 */
 			error = cap_check(&tond.ni_filecaps.fc_rights,
 			    cap_rights_init(&rights, CAP_UNLINKAT));
 			if (error != 0)
 				goto out;
 		}
 #endif
 	}
 	/* The source may not be the directory the target lives in. */
 	if (fvp == tdvp) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * If the source is the same as the destination (that is, if they
 	 * are links to the same vnode), then there is nothing to do.
 	 */
 	if (fvp == tvp)
 		error = -1;
 #ifdef MAC
 	else
 		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
 		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
 #endif
 out:
 	if (error == 0) {
 		/*
 		 * No vnode releases follow on this path; VOP_RENAME() is
 		 * presumably responsible for them -- confirm against the
 		 * VOP_RENAME(9) contract.
 		 */
 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
 		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 	} else {
 		/* Failure (or the -1 no-op case): release everything here. */
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 		if (tvp != NULL)
 			vput(tvp);
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 	}
 	vrele(tond.ni_startdir);
 	vn_finished_write(mp);
 out1:
 	if (fromnd.ni_startdir)
 		vrele(fromnd.ni_startdir);
 	/* -1 is the internal "same vnode, nothing to do" marker. */
 	if (error == -1)
 		return (0);
 	return (error);
 }
 
 /*
  * Make a directory file.  The path is taken from user space and the work
  * is done by kern_mkdir().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mkdir_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_mkdir(struct thread *td, struct mkdir_args *uap)
 {
 
 	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mkdirat_args {
 	int	fd;
 	char	*path;
 	mode_t	mode;
 };
 #endif
 int
 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
 {
 
 	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
 }
 
 int
 kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
 {
 
 	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
 }
 
 /*
  * Create the directory named by 'path', looked up relative to 'fd' with
  * the CAP_MKDIRAT right.
  *
  * Returns EEXIST when the name already exists.  The new directory's mode is
  * (mode & ACCESSPERMS) with the bits of the process's fd_cmask cleared.
  * When the non-blocking vn_start_write() fails, everything acquired so far
  * is released, the thread waits with V_XSLEEP, and the whole lookup is
  * restarted.
  */
 int
 kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
     int mode)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct vattr vattr;
 	struct nameidata nd;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_MODE(mode);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
 	    segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT), td);
 	nd.ni_cnd.cn_flags |= WILLBEDIR;
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	/* A non-NULL leaf means the name is already taken. */
 	if (vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		/*
 		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
 		 * the strange behaviour of leaving the vnode unlocked
 		 * if the target is the same vnode as the parent.
 		 */
 		if (vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(vp);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		/* Back out completely, wait, then redo the lookup. */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VDIR;
 	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 #ifdef MAC
 out:
 #endif
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	/* On success nd.ni_vp is the new directory; release it as well. */
 	if (error == 0)
 		vput(nd.ni_vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Remove a directory file.  The path is taken from user space and the work
  * is done by kern_rmdir().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rmdir_args {
 	char	*path;
 };
 #endif
 int
 sys_rmdir(struct thread *td, struct rmdir_args *uap)
 {
 
 	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
 }
 
 int
 kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
 {
 
 	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
 }
 
 /*
  * Remove the directory named by 'path', looked up relative to 'fd' with
  * the CAP_UNLINKAT right.
  *
  * Returns ENOTDIR when the path is not a directory, EINVAL when the leaf
  * and its parent are the same vnode, and EBUSY when the leaf is the root
  * of a mounted filesystem.  When the non-blocking vn_start_write() fails,
  * both vnodes are released, the thread waits with V_XSLEEP, and the whole
  * lookup is restarted.
  */
 int
 kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct nameidata nd;
 	cap_rights_t rights;
 	int error;
 
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
 	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	if (vp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto out;
 	}
 	/*
 	 * No rmdir "." please.
 	 */
 	if (nd.ni_dvp == vp) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The root of a mounted filesystem cannot be deleted.
 	 */
 	if (vp->v_vflag & VV_ROOT) {
 		error = EBUSY;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
 	    &nd.ni_cnd);
 	if (error != 0)
 		goto out;
 #endif
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		/* Back out completely, wait, then redo the lookup. */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(vp);
 		if (nd.ni_dvp == vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 	vn_finished_write(mp);
 out:
 	/* Common exit: release the leaf, then the parent. */
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	/* Parent == leaf: the vput() above already dropped the lock. */
 	if (nd.ni_dvp == vp)
 		vrele(nd.ni_dvp);
 	else
 		vput(nd.ni_dvp);
 	return (error);
 }
 
 #ifdef COMPAT_43
 /*
  * Read a block of directory entries in a filesystem independent format.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ogetdirentries_args {
 	int	fd;
 	char	*buf;
 	u_int	count;
 	long	*basep;
 };
 #endif
 int
 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
 {
 	long loff;
 	int error;
 
 	error = kern_ogetdirentries(td, uap, &loff);
 	if (error == 0)
 		error = copyout(&loff, uap->basep, sizeof(long));
 	return (error);
 }
 
 /*
  * Read directory entries from uap->fd into the user buffer uap->buf and
  * rewrite each entry's d_type/d_namlen bytes into the layout the old
  * interface expects (see the comments in the loop).
  *
  * Returns EINVAL when uap->count exceeds 64KB or the vnode is not a
  * directory, and EBADF when the file was not opened for reading.  On
  * success td->td_retval[0] is the number of bytes transferred and *ploff
  * is the file offset at which the read started.
  */
 int
 kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
     long *ploff)
 {
 	struct vnode *vp;
 	struct file *fp;
 	struct uio auio, kuio;
 	struct iovec aiov, kiov;
 	struct dirent *dp, *edp;
 	cap_rights_t rights;
 	caddr_t dirbuf;
 	int error, eofflag, readcnt;
 	long loff;
 	off_t foffset;
 
 	/* XXX arbitrary sanity limit on `count'. */
 	if (uap->count > 64 * 1024)
 		return (EINVAL);
 	error = getvnode(td->td_proc->p_fd, uap->fd,
 	    cap_rights_init(&rights, CAP_READ), &fp);
 	if (error != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 	foffset = foffset_lock(fp, 0);
 unionread:
 	if (vp->v_type != VDIR) {
 		foffset_unlock(fp, foffset, 0);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	/* auio describes the caller's user-space buffer. */
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = uap->count;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	loff = auio.uio_offset = foffset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error != 0) {
 		VOP_UNLOCK(vp, 0);
 		foffset_unlock(fp, foffset, FOF_NOUPDATE);
 		fdrop(fp, td);
 		return (error);
 	}
 #endif
 	/*
 	 * On non-little-endian hosts, a mount with mnt_maxsymlinklen <= 0 is
 	 * read straight into the user buffer with no conversion.
 	 */
 #	if (BYTE_ORDER != LITTLE_ENDIAN)
 		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
 			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
 			    NULL, NULL);
 			foffset = auio.uio_offset;
 		} else
 #	endif
 	{
 		/*
 		 * Read into a temporary kernel buffer (kuio/kiov), convert
 		 * the entries in place, then copy the result out via auio.
 		 */
 		kuio = auio;
 		kuio.uio_iov = &kiov;
 		kuio.uio_segflg = UIO_SYSSPACE;
 		kiov.iov_len = uap->count;
 		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
 		kiov.iov_base = dirbuf;
 		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
 			    NULL, NULL);
 		foffset = kuio.uio_offset;
 		if (error == 0) {
 			readcnt = uap->count - kuio.uio_resid;
 			edp = (struct dirent *)&dirbuf[readcnt];
 			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
 #				if (BYTE_ORDER == LITTLE_ENDIAN)
 					/*
 					 * The expected low byte of
 					 * dp->d_namlen is our dp->d_type.
 					 * The high MBZ byte of dp->d_namlen
 					 * is our dp->d_namlen.
 					 */
 					dp->d_type = dp->d_namlen;
 					dp->d_namlen = 0;
 #				else
 					/*
 					 * The dp->d_type is the high byte
 					 * of the expected dp->d_namlen,
 					 * so must be zero'ed.
 					 */
 					dp->d_type = 0;
 #				endif
 				/* A zero record length would loop forever. */
 				if (dp->d_reclen > 0) {
 					dp = (struct dirent *)
 					    ((char *)dp + dp->d_reclen);
 				} else {
 					error = EIO;
 					break;
 				}
 			}
 			/* Copy out only if the walk reached the end. */
 			if (dp >= edp)
 				error = uiomove(dirbuf, readcnt, &auio);
 		}
 		free(dirbuf, M_TEMP);
 	}
 	if (error != 0) {
 		VOP_UNLOCK(vp, 0);
 		foffset_unlock(fp, foffset, 0);
 		fdrop(fp, td);
 		return (error);
 	}
 	/*
 	 * Nothing was read from the root of a union mount: switch the open
 	 * file over to the covered vnode and read again from offset 0.
 	 */
 	if (uap->count == auio.uio_resid &&
 	    (vp->v_vflag & VV_ROOT) &&
 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
 		struct vnode *tvp = vp;
 		vp = vp->v_mount->mnt_vnodecovered;
 		VREF(vp);
 		fp->f_vnode = vp;
 		fp->f_data = vp;
 		foffset = 0;
 		vput(tvp);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp, 0);
 	foffset_unlock(fp, foffset, 0);
 	fdrop(fp, td);
 	td->td_retval[0] = uap->count - auio.uio_resid;
 	if (error == 0)
 		*ploff = loff;
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Read a block of directory entries in a filesystem independent format.
  *
  * The read is done by kern_getdirentries() into the user buffer.  On
  * success, the starting offset it reports is copied out to uap->basep
  * unless that pointer is NULL.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdirentries_args {
 	int	fd;
 	char	*buf;
 	u_int	count;
 	long	*basep;
 };
 #endif
 int
 sys_getdirentries(struct thread *td, struct getdirentries_args *uap)
 {
 	long base;
 	int error;
 
 	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
 	    NULL, UIO_USERSPACE);
 	if (error != 0)
 		return (error);
 	if (uap->basep != NULL)
 		error = copyout(&base, uap->basep, sizeof(long));
 	return (error);
 }
 
 /*
  * Read up to 'count' bytes of directory entries from 'fd' into 'buf',
  * which lives in the address space named by 'bufseg'.
  *
  * Returns EINVAL when 'count' exceeds IOSIZE_MAX or the vnode is not a
  * directory, and EBADF when the file was not opened for reading.  On
  * success *basep is the file offset at which the read started, *residp
  * (when not NULL) is the untransferred byte count, and td->td_retval[0] is
  * the number of bytes transferred.
  */
 int
 kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
     long *basep, ssize_t *residp, enum uio_seg bufseg)
 {
 	struct vnode *vp;
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	cap_rights_t rights;
 	long loff;
 	int error, eofflag;
 	off_t foffset;
 
 	AUDIT_ARG_FD(fd);
 	if (count > IOSIZE_MAX)
 		return (EINVAL);
 	/*
 	 * Set once, outside the unionread loop: the loop is only re-entered
 	 * when uio_resid still equals 'count'.
 	 */
 	auio.uio_resid = count;
 	error = getvnode(td->td_proc->p_fd, fd,
 	    cap_rights_init(&rights, CAP_READ), &fp);
 	if (error != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 	foffset = foffset_lock(fp, 0);
 unionread:
 	if (vp->v_type != VDIR) {
 		error = EINVAL;
 		goto fail;
 	}
 	aiov.iov_base = buf;
 	aiov.iov_len = count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = bufseg;
 	auio.uio_td = td;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	loff = auio.uio_offset = foffset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
 		    NULL);
 	foffset = auio.uio_offset;
 	if (error != 0) {
 		VOP_UNLOCK(vp, 0);
 		goto fail;
 	}
 	/*
 	 * Nothing was read from the root of a union mount: switch the open
 	 * file over to the covered vnode and read again from offset 0.
 	 */
 	if (count == auio.uio_resid &&
 	    (vp->v_vflag & VV_ROOT) &&
 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
 		struct vnode *tvp = vp;
 
 		vp = vp->v_mount->mnt_vnodecovered;
 		VREF(vp);
 		fp->f_vnode = vp;
 		fp->f_data = vp;
 		foffset = 0;
 		vput(tvp);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp, 0);
 	*basep = loff;
 	if (residp != NULL)
 		*residp = auio.uio_resid;
 	td->td_retval[0] = count - auio.uio_resid;
 fail:
 	/* Shared exit for success and failure: error is 0 on success. */
 	foffset_unlock(fp, foffset, 0);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getdents_args {
 	int fd;
 	char *buf;
 	size_t count;
 };
 #endif
 /*
  * Read directory entries without reporting the starting offset: the
  * arguments are repackaged into a getdirentries_args with a NULL basep and
  * passed to sys_getdirentries().
  *
  * NOTE(review): 'count' is a size_t here but a u_int in
  * getdirentries_args, so the assignment below converts it to the narrower
  * type -- confirm this is intended for very large counts.
  */
 int
 sys_getdents(struct thread *td, struct getdents_args *uap)
 {
 	struct getdirentries_args ap;
 
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.count = uap->count;
 	ap.basep = NULL;
 	return (sys_getdirentries(td, &ap));
 }
 
 /*
  * Set the mode mask for creation of filesystem nodes.
  *
  * The previous mask is returned through td->td_retval[0]; the new mask is
  * uap->newmask restricted to ALLPERMS.  Both steps are done while holding
  * the process's file descriptor table lock exclusively.  Always returns 0.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct umask_args {
 	int	newmask;
 };
 #endif
 int
 sys_umask(struct thread *td, struct umask_args *uap)
 {
 	struct filedesc *fdp;
 
 	FILEDESC_XLOCK(td->td_proc->p_fd);
 	fdp = td->td_proc->p_fd;
 	td->td_retval[0] = fdp->fd_cmask;
 	fdp->fd_cmask = uap->newmask & ALLPERMS;
 	FILEDESC_XUNLOCK(td->td_proc->p_fd);
 	return (0);
 }
 
 /*
  * Void all references to file by ripping underlying filesystem away from
  * vnode.
  *
  * Only character devices with a device attached are accepted (EINVAL
  * otherwise).  The caller must own the file (va_uid matches cr_uid) or
  * pass priv_check(PRIV_VFS_ADMIN).  VOP_REVOKE() is only issued when
  * vcount() reports more than one reference.  The vnode comes back locked
  * from the lookup and is released with vput() on every path.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct revoke_args {
 	char	*path;
 };
 #endif
 int
 sys_revoke(struct thread *td, struct revoke_args *uap)
 {
 	struct vnode *vp;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_vnode_check_revoke(td->td_ucred, vp);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 	if (error != 0)
 		goto out;
 	if (td->td_ucred->cr_uid != vattr.va_uid) {
 		error = priv_check(td, PRIV_VFS_ADMIN);
 		if (error != 0)
 			goto out;
 	}
 	if (vcount(vp) > 1)
 		VOP_REVOKE(vp, REVOKEALL);
 out:
 	vput(vp);
 	return (error);
 }
 
 /*
  * Convert a user file descriptor to a kernel file entry and check that, if it
  * is a capability, the correct rights are present. A reference on the file
  * entry is held upon returning.
  *
  * Returns the fget_unlocked() error if the descriptor cannot be resolved,
  * and EINVAL (after dropping the reference) when the file has no vnode or
  * still carries badfileops.  On success *fpp is the referenced file.
  */
 int
 getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 	struct file *filp;
 	int rv;
 
 	rv = fget_unlocked(fdp, fd, rightsp, 0, &filp, NULL);
 	if (rv != 0)
 		return (rv);
 
 	/*
 	 * The file could be not of the vnode type, or it may be not
 	 * yet fully initialized, in which case the f_vnode pointer
 	 * may be set, but f_ops is still badfileops.  E.g.,
 	 * devfs_open() transiently create such situation to
 	 * facilitate csw d_fdopen().
 	 *
 	 * Dupfdopen() handling in kern_openat() installs the
 	 * half-baked file into the process descriptor table, allowing
 	 * other thread to dereference it. Guard against the race by
 	 * checking f_ops.
 	 */
 	if (filp->f_vnode != NULL && filp->f_ops != &badfileops) {
 		*fpp = filp;
 		return (0);
 	}
 	fdrop(filp, curthread);
 	return (EINVAL);
 }
 
 
 /*
  * Get an (NFS) file handle, looking the path up with NOFOLLOW.
  *
  * Requires priv_check(PRIV_VFS_GETFH).  The handle is built from the
  * mount's f_fsid and the fid produced by VOP_VPTOFH(), in a zeroed local
  * fhandle_t, and copied out to uap->fhp.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lgetfh_args {
 	char	*fname;
 	fhandle_t *fhp;
 };
 #endif
 int
 sys_lgetfh(struct thread *td, struct lgetfh_args *uap)
 {
 	struct nameidata nd;
 	fhandle_t fh;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error != 0)
 		return (error);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 	    uap->fname, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	/* Zero the whole handle before filling it; it is copied to user space. */
 	bzero(&fh, sizeof(fh));
 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 	error = VOP_VPTOFH(vp, &fh.fh_fid);
 	vput(vp);
 	if (error == 0)
 		error = copyout(&fh, uap->fhp, sizeof (fh));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getfh_args {
 	char	*fname;
 	fhandle_t *fhp;
 };
 #endif
 /*
  * Get an (NFS) file handle, looking the path up with FOLLOW.
  *
  * Requires priv_check(PRIV_VFS_GETFH).  The handle is built from the
  * mount's f_fsid and the fid produced by VOP_VPTOFH(), in a zeroed local
  * fhandle_t, and copied out to uap->fhp.
  */
 int
 sys_getfh(struct thread *td, struct getfh_args *uap)
 {
 	struct nameidata nd;
 	fhandle_t fh;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error != 0)
 		return (error);
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 	    uap->fname, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	/* Zero the whole handle before filling it; it is copied to user space. */
 	bzero(&fh, sizeof(fh));
 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 	error = VOP_VPTOFH(vp, &fh.fh_fid);
 	vput(vp);
 	if (error == 0)
 		error = copyout(&fh, uap->fhp, sizeof (fh));
 	return (error);
 }
 
 /*
  * syscall for the rpc.lockd to use to translate a NFS file handle into an
  * open descriptor.
  *
  * warning: do not remove the priv_check() call or this becomes one giant
  * security hole.
  *
  * Returns EINVAL when the open mode has neither FREAD nor FWRITE or has
  * O_CREAT set, and ESTALE when no mount matches the handle's fsid.
  * td->td_retval[0] is set to the descriptor index; it stays -1 on the
  * failure paths that reach the 'bad' label.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhopen_args {
 	const struct fhandle *u_fhp;
 	int flags;
 };
 #endif
 int
 sys_fhopen(struct thread *td, struct fhopen_args *uap)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct fhandle fhp;
 	struct file *fp;
 	int fmode, error;
 	int indx;
 
 	error = priv_check(td, PRIV_VFS_FHOPEN);
 	if (error != 0)
 		return (error);
 	indx = -1;
 	fmode = FFLAGS(uap->flags);
 	/* why not allow a non-read/write open for our lockd? */
 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
 		return (EINVAL);
 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
 	if (error != 0)
 		return(error);
 	/* find the mount point */
 	mp = vfs_busyfs(&fhp.fh_fsid);
 	if (mp == NULL)
 		return (ESTALE);
 	/* now give me my vnode, it gets returned to me locked */
 	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
 	vfs_unbusy(mp);
 	if (error != 0)
 		return (error);
 
 	error = falloc_noinstall(td, &fp);
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	/*
 	 * An extra reference on `fp' has been held for us by
 	 * falloc_noinstall().
 	 */
 
 #ifdef INVARIANTS
 	td->td_dupfd = -1;
 #endif
 	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
 	if (error != 0) {
 		KASSERT(fp->f_ops == &badfileops,
 		    ("VOP_OPEN in fhopen() set f_ops"));
 		KASSERT(td->td_dupfd < 0,
 		    ("fhopen() encountered fdopen()"));
 
 		vput(vp);
 		goto bad;
 	}
 #ifdef INVARIANTS
 	td->td_dupfd = 0;
 #endif
 	fp->f_vnode = vp;
 	fp->f_seqcount = 1;
 	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
 	    &vnops);
 	VOP_UNLOCK(vp, 0);
 	if ((fmode & O_TRUNC) != 0) {
 		error = fo_truncate(fp, 0, td->td_ucred, td);
 		if (error != 0)
 			goto bad;
 	}
 
 	error = finstall(td, fp, &indx, fmode, NULL);
 bad:
 	/* Reached on success too: drop the extra reference noted above. */
 	fdrop(fp, td);
 	td->td_retval[0] = indx;
 	return (error);
 }
 
 /*
  * Stat an (NFS) file handle.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstat_args {
 	struct fhandle *u_fhp;
 	struct stat *sb;
 };
 #endif
 /*
  * sys_fhstat(): uap carries
  *	struct fhandle *u_fhp;	user address of the file handle
  *	struct stat *sb;	user buffer receiving the result
  *
  * Copies the handle in, lets kern_fhstat() do the work, and copies the
  * stat structure out on success.  Returns 0 or an errno.
  */
 int
 sys_fhstat(struct thread *td, struct fhstat_args *uap)
 {
 	struct fhandle fh;
 	struct stat sb;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fh));
 	if (error != 0)
 		return (error);
 	error = kern_fhstat(td, fh, &sb);
 	if (error != 0)
 		return (error);
 	return (copyout(&sb, uap->sb, sizeof(sb)));
 }
 
 int
 kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTAT);
 	if (error != 0)
 		return (error);
 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
 	vfs_unbusy(mp);
 	if (error != 0)
 		return (error);
 	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
 	vput(vp);
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstatfs_args {
 	struct fhandle *u_fhp;
 	struct statfs *buf;
 };
 #endif
 int
 sys_fhstatfs(td, uap)
 	struct thread *td;
 	struct fhstatfs_args /* {
 		struct fhandle *u_fhp;
 		struct statfs *buf;
 	} */ *uap;
 {
 	struct statfs sf;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error != 0)
 		return (error);
 	error = kern_fhstatfs(td, fh, &sf);
 	if (error != 0)
 		return (error);
 	return (copyout(&sf, uap->buf, sizeof(sf)));
 }
 
 /*
  * Kernel-side worker for fhstatfs: after the PRIV_VFS_FHSTATFS check,
  * find the filesystem named by `fh', confirm the handle translates to a
  * vnode, run the jail (and, with MAC, the MAC) visibility checks, and
  * copy the filesystem's statistics into `*buf'.  Returns 0 or an errno.
  */
 int
 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
 {
 	struct statfs *mstat;
 	struct mount *mntp;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTATFS);
 	if (error != 0)
 		return (error);
 
 	mntp = vfs_busyfs(&fh.fh_fsid);
 	if (mntp == NULL)
 		return (ESTALE);
 
 	/*
 	 * The vnode is released straight away; only the success of the
 	 * translation is used.  The mount stays busied until `out'.
 	 */
 	error = VFS_FHTOVP(mntp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
 	if (error != 0)
 		goto out;
 	vput(vp);
 
 	error = prison_canseemount(td->td_ucred, mntp);
 	if (error != 0)
 		goto out;
 #ifdef MAC
 	error = mac_mount_check_stat(td->td_ucred, mntp);
 	if (error != 0)
 		goto out;
 #endif
 	/*
 	 * Pre-load the fields a filesystem's VFS_STATFS might leave unset.
 	 */
 	mstat = &mntp->mnt_stat;
 	mstat->f_version = STATFS_VERSION;
 	mstat->f_namemax = NAME_MAX;
 	mstat->f_flags = mntp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mntp, mstat);
 	if (error == 0)
 		*buf = *mstat;
 out:
 	vfs_unbusy(mntp);
 	return (error);
 }
 
 /*
  * Kernel-side worker for posix_fallocate(2): allocate backing store for
  * the byte range [offset, offset + len) of the regular file open on
  * `fd'.  Returns 0 or an errno:
  *	ESPIPE	fd is a pipe or FIFO
  *	ENODEV	fd is not a vnode, or the vnode is not a regular file
  *	EBADF	fd is not open for writing
  *	EINVAL	offset < 0 or len <= 0
  *	EFBIG	offset + len would exceed OFF_MAX
  * or whatever fget(), vn_start_write(), vn_lock(), the MAC check or
  * VOP_ALLOCATE() report.
  */
 int
 kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
 {
 	struct file *fp;
 	struct mount *mp;
 	struct vnode *vp;
 	cap_rights_t rights;
 	off_t prev_len, prev_off;
 	int error;
 
 	fp = NULL;
 	error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
 	if (error != 0)
 		goto out;
 
 	/* Only vnode-backed descriptors can be preallocated. */
 	if (fp->f_type != DTYPE_VNODE) {
 		if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_FIFO)
 			error = ESPIPE;
 		else
 			error = ENODEV;
 		goto out;
 	}
 	if ((fp->f_flag & FWRITE) == 0) {
 		error = EBADF;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp->v_type != VREG) {
 		error = ENODEV;
 		goto out;
 	}
 	if (offset < 0 || len <= 0) {
 		error = EINVAL;
 		goto out;
 	}
 	/* Reject ranges whose end would wrap past OFF_MAX. */
 	if (offset > OFF_MAX - len) {
 		error = EFBIG;
 		goto out;
 	}
 
 	/*
 	 * VOP_ALLOCATE() may do only part of the work per call; it advances
 	 * offset and shrinks len, so keep calling until len reaches zero.
 	 */
 	for (;;) {
 		prev_len = len;
 		prev_off = offset;
 
 		bwillwrite();
 		mp = NULL;
 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 		if (error != 0)
 			break;
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0) {
 			vn_finished_write(mp);
 			break;
 		}
 #ifdef MAC
 		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
 		if (error == 0)
 #endif
 			error = VOP_ALLOCATE(vp, &offset, &len);
 		VOP_UNLOCK(vp, 0);
 		vn_finished_write(mp);
 
 		/* The end of the range must be invariant across calls. */
 		if (prev_len + prev_off != offset + len) {
 			panic("offset + len changed from %jx/%jx to %jx/%jx",
 			    prev_off, prev_len, offset, len);
 		}
 		if (error != 0 || len == 0)
 			break;
 		KASSERT(prev_len > len, ("Iteration did not make progress?"));
 		maybe_yield();
 	}
  out:
 	if (fp != NULL)
 		fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
 {
 
 	return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
 }
 
 /*
  * Unlike madvise(2), we do not make a best effort to remember every
  * possible caching hint.  Instead, we remember the last setting with
  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
  * region of any current setting.
  */
 /*
  * Arguments: `fd' names the file, `offset' and `len' the byte range
  * (len == 0 extends the range to OFF_MAX), `advice' one of the
  * POSIX_FADV_* values.  Returns 0 or an errno value: EINVAL for a bad
  * range or unknown advice, ESPIPE for pipes/FIFOs, ENODEV for anything
  * else that is not a regular-file vnode, or the error from fget() or
  * VOP_ADVISE().
  */
 int
 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
     int advice)
 {
 	struct fadvise_info *fa, *new;
 	struct file *fp;
 	struct vnode *vp;
 	cap_rights_t rights;
 	off_t end;
 	int error;
 
 	/* Reject negative values and ranges that would wrap past OFF_MAX. */
 	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
 		return (EINVAL);
 	switch (advice) {
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_RANDOM:
 	case POSIX_FADV_NOREUSE:
 		/*
 		 * These may need a fresh record; allocate it up front,
 		 * before the file's pool mutex is taken below.
 		 */
 		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
 		break;
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_WILLNEED:
 	case POSIX_FADV_DONTNEED:
 		new = NULL;
 		break;
 	default:
 		return (EINVAL);
 	}
 	/* XXX: CAP_POSIX_FADVISE? */
 	error = fget(td, fd, cap_rights_init(&rights), &fp);
 	if (error != 0)
 		goto out;
 
 	switch (fp->f_type) {
 	case DTYPE_VNODE:
 		break;
 	case DTYPE_PIPE:
 	case DTYPE_FIFO:
 		error = ESPIPE;
 		goto out;
 	default:
 		error = ENODEV;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp->v_type != VREG) {
 		error = ENODEV;
 		goto out;
 	}
 	/* `end' is the inclusive last byte; len == 0 means "to OFF_MAX". */
 	if (len == 0)
 		end = OFF_MAX;
 	else
 		end = offset + len - 1;
 	switch (advice) {
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_RANDOM:
 	case POSIX_FADV_NOREUSE:
 		/*
 		 * Try to merge any existing non-standard region with
 		 * this new region if possible, otherwise create a new
 		 * non-standard region for this request.
 		 */
 		mtx_pool_lock(mtxpool_sleep, fp);
 		fa = fp->f_advice;
 		/*
 		 * Merge when the advice matches and the two ranges overlap
 		 * or are directly adjacent; the OFF_MAX tests keep the
 		 * "+ 1" adjacency checks from overflowing.
 		 */
 		if (fa != NULL && fa->fa_advice == advice &&
 		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
 		    (end != OFF_MAX && fa->fa_start == end + 1) ||
 		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
 			if (offset < fa->fa_start)
 				fa->fa_start = offset;
 			if (end > fa->fa_end)
 				fa->fa_end = end;
 		} else {
 			new->fa_advice = advice;
 			new->fa_start = offset;
 			new->fa_end = end;
 			new->fa_prevstart = 0;
 			new->fa_prevend = 0;
 			fp->f_advice = new;
 			/*
 			 * Hand the displaced record (possibly NULL) to the
 			 * free() at `out'.
 			 */
 			new = fa;
 		}
 		mtx_pool_unlock(mtxpool_sleep, fp);
 		break;
 	case POSIX_FADV_NORMAL:
 		/*
 		 * If the "normal" region overlaps with an existing
 		 * non-standard region, trim or remove the
 		 * non-standard region.
 		 */
 		mtx_pool_lock(mtxpool_sleep, fp);
 		fa = fp->f_advice;
 		if (fa != NULL) {
 			if (offset <= fa->fa_start && end >= fa->fa_end) {
 				/* Fully covered: drop the record. */
 				new = fa;
 				fp->f_advice = NULL;
 			} else if (offset <= fa->fa_start &&
 			    end >= fa->fa_start)
 				/* Overlaps the front: trim the start. */
 				fa->fa_start = end + 1;
 			else if (offset <= fa->fa_end && end >= fa->fa_end)
 				/* Overlaps the tail: trim the end. */
 				fa->fa_end = offset - 1;
 			else if (offset >= fa->fa_start && end <= fa->fa_end) {
 				/*
 				 * If the "normal" region is a middle
 				 * portion of the existing
 				 * non-standard region, just remove
 				 * the whole thing rather than picking
 				 * one side or the other to
 				 * preserve.
 				 */
 				new = fa;
 				fp->f_advice = NULL;
 			}
 		}
 		mtx_pool_unlock(mtxpool_sleep, fp);
 		break;
 	case POSIX_FADV_WILLNEED:
 	case POSIX_FADV_DONTNEED:
 		/* Not remembered; passed straight to the filesystem. */
 		error = VOP_ADVISE(vp, offset, end, advice);
 		break;
 	}
 out:
 	/*
 	 * NOTE(review): fp is not initialised before fget(), so this test
 	 * relies on fget() storing NULL through its last argument when it
 	 * fails -- confirm (kern_posix_fallocate() sets fp = NULL itself).
 	 */
 	if (fp != NULL)
 		fdrop(fp, td);
 	/* Frees the unused or displaced record; `new' may be NULL here. */
 	free(new, M_FADVISE);
 	return (error);
 }
 
 int
 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
 {
 
 	return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
 	    uap->advice));
 }
Index: head/sys/mips/mips/exception.S
===================================================================
--- head/sys/mips/mips/exception.S	(revision 258540)
+++ head/sys/mips/mips/exception.S	(revision 258541)
@@ -1,1256 +1,1256 @@
 /*	$OpenBSD: locore.S,v 1.18 1998/09/15 10:58:53 pefo Exp $	*/
 /*-
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Digital Equipment Corporation and Ralph Campbell.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Copyright (C) 1989 Digital Equipment Corporation.
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby granted,
  * provided that the above copyright notice appears in all copies.
  * Digital Equipment Corporation makes no representations about the
  * suitability of this software for any purpose.  It is provided "as is"
  * without express or implied warranty.
  *
  * from: Header: /sprite/src/kernel/mach/ds3100.md/RCS/loMem.s,
  *	v 1.1 89/07/11 17:55:04 nelson Exp  SPRITE (DECWRL)
  * from: Header: /sprite/src/kernel/mach/ds3100.md/RCS/machAsm.s,
  *	v 9.2 90/01/29 18:00:39 shirriff Exp  SPRITE (DECWRL)
  * from: Header: /sprite/src/kernel/vm/ds3100.md/vmPmaxAsm.s,
  *	v 1.1 89/07/10 14:27:41 nelson Exp  SPRITE (DECWRL)
  *	from: @(#)locore.s	8.5 (Berkeley) 1/4/94
  *	JNPR: exception.S,v 1.5 2007/01/08 04:58:37 katta
  * $FreeBSD$
  */
 
 /*
  *	Contains code that is the first executed at boot time plus
  *	assembly language support routines.
  */
 
 #include "opt_ddb.h"
-#include "opt_kdtrace.h"
+
 #include <machine/asm.h>
 #include <machine/cpu.h>
 #include <machine/regnum.h>
 #include <machine/cpuregs.h>
 #include <machine/pte.h>
 #include <machine/pcb.h>
 
 #include "assym.s"
 
 	.set	noreorder		# Noreorder is default style!
 
 /*
  * With KDTRACE_HOOKS, export two global data objects,
  * dtrace_invop_jump_addr and dtrace_invop_calltrap_addr.  Each is
  * declared as 8 bytes and emitted as two zero words; nothing in this
  * part of the file stores to them.  Assembly switches back to .text
  * afterwards.
  */
 #ifdef KDTRACE_HOOKS
 	.data
 	.globl	dtrace_invop_jump_addr
 	.align	4
 	.type	dtrace_invop_jump_addr, @object
         .size	dtrace_invop_jump_addr, 8
 dtrace_invop_jump_addr:
 	.word	0
 	.word	0
 	.globl	dtrace_invop_calltrap_addr
 	.align	4
 	.type	dtrace_invop_calltrap_addr, @object
         .size	dtrace_invop_calltrap_addr, 8
 dtrace_invop_calltrap_addr:
 	.word	0
 	.word	0
 
 	.text
 #endif
 
 /*
  * Reasonable limit
  */
 #define	INTRCNT_COUNT	256
 
 
 /*
  *----------------------------------------------------------------------------
  *
  * MipsTLBMiss --
  *
  *	Vector code for the TLB-miss exception vector 0x80000000.
  *
  * This code is copied to the TLB exception vector address to
  * which the CPU jumps in response to an exception or a TLB miss.
  * NOTE: This code must be position independent!!!
  *
  *
  */
 /*
  * Vector stub: jump to MipsDoTLBMiss.  The file assembles with
  * `.set noreorder', so the MFC0 below sits in the jump's delay slot and
  * k0 already holds the faulting address when MipsDoTLBMiss starts.
  * Only k0 is touched.
  */
 VECTOR(MipsTLBMiss, unknown)
 	.set push
 	.set noat
 	j	MipsDoTLBMiss
 	MFC0	k0, MIPS_COP_0_BAD_VADDR	# get the fault address
 	.set pop
 VECTOR_END(MipsTLBMiss)
 
 /*
  *----------------------------------------------------------------------------
  *
  * MipsDoTLBMiss --
  *
  * This is the real TLB Miss Handler code.
  * 'segbase' points to the base of the segment table for user processes.
  *
  * Don't check for invalid pte's here. We load them as well and
  * let the processor trap to load the correct value after service.
  *----------------------------------------------------------------------------
  */
 /*
  * Entry: k0 = faulting virtual address (loaded by the MipsTLBMiss stub).
  * Uses only k0 and k1.  Exits by eret after writing a TLB entry, or by
  * jumping to MipsTLBMissException (k0 negative) or SlowFault (a null
  * table pointer was found on the way down).
  */
  	.set push
 	.set noat
 MipsDoTLBMiss:
 	bltz		k0, 1f				#02: k0<0 -> 1f (kernel fault)
 	PTR_SRL		k0, k0, SEGSHIFT - PTRSHIFT	#03: k0=seg offset (almost)
 
 	/* k1 = this CPU's segment table base. */
 	GET_CPU_PCPU(k1)
 	PTR_L		k1, PC_SEGBASE(k1)
 	beqz		k1, 2f				#05: make sure segbase is not null
 	andi		k0, k0, PDEPTRMASK		#06: k0=seg offset
 	PTR_ADDU	k1, k0, k1			#07: k1=seg entry address
 
 	PTR_L		k1, 0(k1)			#08: k1=seg entry
 	MFC0		k0, MIPS_COP_0_BAD_VADDR	#09: k0=bad address (again)
 	beq		k1, zero, 2f			#0a: ==0 -- no page table
 #ifdef __mips_n64
 	/* n64 only: walk one more table level before reaching the PTEs. */
 	PTR_SRL		k0, PDRSHIFT - PTRSHIFT		# k0=VPN
 	andi		k0, k0, PDEPTRMASK		# k0=pde offset
 	PTR_ADDU	k1, k0, k1			# k1=pde entry address
 	PTR_L		k1, 0(k1)			# k1=pde entry
 	MFC0		k0, MIPS_COP_0_BAD_VADDR	# k0=bad address (again)
 	beq		k1, zero, 2f			# ==0 -- no page table
 #endif
 	PTR_SRL		k0, PAGE_SHIFT - PTESHIFT	#0b: k0=VPN (aka va>>10)
 	andi		k0, k0, PTE2MASK		#0c: k0=page tab offset
 	PTR_ADDU	k1, k1, k0			#0d: k1=pte address
 	/* Load two adjacent PTEs; they go to TLB_LO0 and TLB_LO1 below. */
 	PTE_L		k0, 0(k1)			#0e: k0=lo0 pte
 	PTE_L		k1, PTESIZE(k1)			#0f: k1=lo1 pte
 	CLEAR_PTE_SWBITS(k0)
 	PTE_MTC0	k0, MIPS_COP_0_TLB_LO0		#12: lo0 is loaded
 	COP0_SYNC
 	CLEAR_PTE_SWBITS(k1)
 	PTE_MTC0	k1, MIPS_COP_0_TLB_LO1		#15: lo1 is loaded
 	COP0_SYNC
 	tlbwr						#1a: write to tlb
 	HAZARD_DELAY
 	eret						#1f: return from exception
 1:	j		MipsTLBMissException		#20: kernel exception
 	nop						#21: branch delay slot
 2:	j		SlowFault			#22: no page table present
 	nop						#23: branch delay slot
 	.set pop
 
 /*
  * This code is copied to the general exception vector address to
  * handle all exceptions except RESET and TLBMiss.
  * NOTE: This code must be position independent!!!
  *
  * Only k0 and k1 are used: the exception code from the cause register,
  * combined with the shifted user-mode bit from the status register,
  * indexes machExceptionTable, and control jumps to the entry found.
  */
 VECTOR(MipsException, unknown)
 /*
  * Find out what mode we came from and jump to the proper handler.
  */
 	.set	noat
 	mfc0	k0, MIPS_COP_0_STATUS		# Get the status register
 	mfc0	k1, MIPS_COP_0_CAUSE		# Get the cause register value.
 	and	k0, k0, MIPS_SR_KSU_USER	# test for user mode
 						# sneaky but the bits are
 						# with us........
 	sll	k0, k0, 3			# shift user bit for cause index
 	and	k1, k1, MIPS_CR_EXC_CODE	# Mask out the cause bits.
 	or	k1, k1, k0			# change index to user table
 #if defined(__mips_n64)
 	PTR_SLL	k1, k1, 1			# shift to get 8-byte offset
 #endif
 1:
 	PTR_LA	k0, _C_LABEL(machExceptionTable)  # get base of the jump table
 	PTR_ADDU k0, k0, k1			# Get the address of the
 						#  function entry.  Note that
 						#  the cause is already
 						#  shifted left by 2 bits so
 						#  we don't have to shift.
 	PTR_L	k0, 0(k0)			# Get the function address
 	nop
 	j	k0				# Jump to the function.
 	nop
 	.set	at
 VECTOR_END(MipsException)
 
 /*
  * We couldn't find a TLB entry.
  * Find out what mode we came from and call the appropriate handler.
  *
  * Only k0 is used.  A non-zero user-mode bit in the status register
  * branches to MipsUserGenException; otherwise execution continues into
  * the code that follows.
  */
 SlowFault:
 	.set	noat
 	mfc0	k0, MIPS_COP_0_STATUS
 	nop
 	and	k0, k0, MIPS_SR_KSU_USER
 	bne	k0, zero, _C_LABEL(MipsUserGenException)
 	nop
 	.set	at
 /*
  * Fall through into MipsKernGenException; only comments and macro
  * definitions lie between here and that entry point.
  */
 
 /*----------------------------------------------------------------------------
  *
  * MipsKernGenException --
  *
  *	Handle an exception from kernel mode.
  *
  * Results:
  *	None.
  *
  * Side effects:
  *	None.
  *
  *----------------------------------------------------------------------------
  */
 
 /*
  * Store `reg' into slot `offs' (in units of SZREG) of the register save
  * area that begins CALLFRAME_SIZ bytes above `base'.
  */
 #define	SAVE_REG(reg, offs, base) \
 	REG_S	reg, CALLFRAME_SIZ + (SZREG * offs) (base)
 
 /*
  * CLEAR_STATUS: rewrite the CP0 status register with MIPS_SR_INT_IE,
  * MIPS_SR_EXL and MIPS_SR_KSU_USER cleared.  The CPU_CNMIPS variant
  * first ORs in KX | SX | UX, the CPU_RMI/CPU_NLM variant KX | UX |
  * COP_2_BIT; the generic variant only clears.  Clobbers a0 and a2.
  */
 #if defined(CPU_CNMIPS)
 #define CLEAR_STATUS \
 	mfc0    a0, MIPS_COP_0_STATUS   ;\
 	li      a2, (MIPS_SR_KX | MIPS_SR_SX | MIPS_SR_UX) ; \
 	or      a0, a0, a2	        ; \
 	li      a2, ~(MIPS_SR_INT_IE | MIPS_SR_EXL | MIPS_SR_KSU_USER)   ; \
 	and     a0, a0, a2              ; \
         mtc0    a0, MIPS_COP_0_STATUS   ; \
 	ITLBNOPFIX
 #elif defined(CPU_RMI) || defined(CPU_NLM)
 #define CLEAR_STATUS \
 	mfc0    a0, MIPS_COP_0_STATUS   ;\
 	li      a2, (MIPS_SR_KX | MIPS_SR_UX | MIPS_SR_COP_2_BIT) ; \
 	or      a0, a0, a2	        ; \
 	li      a2, ~(MIPS_SR_INT_IE | MIPS_SR_EXL | MIPS_SR_KSU_USER)   ; \
 	and     a0, a0, a2              ; \
         mtc0    a0, MIPS_COP_0_STATUS   ; \
 	ITLBNOPFIX
 #else
 #define CLEAR_STATUS \
 	mfc0    a0, MIPS_COP_0_STATUS   ;\
 	li      a2, ~(MIPS_SR_INT_IE | MIPS_SR_EXL | MIPS_SR_KSU_USER)   ; \
 	and     a0, a0, a2              ; \
 	mtc0	a0, MIPS_COP_0_STATUS   ; \
 	ITLBNOPFIX
 #endif
 
 /*
  * Save CPU and CP0 register state.
  *
  * This is straightforward except for saving the exception program
  * counter. The ddb backtrace code looks for the first instruction
  * matching the form "sw ra, (off)sp" to figure out the address of the
  * calling function. So we must make sure that we save the exception
  * PC by staging it through 'ra' as opposed to any other register.
  *
  * The caller must already have lowered sp by KERN_EXC_FRAME_SIZE; the
  * SP slot is filled with sp + KERN_EXC_FRAME_SIZE, i.e. the value before
  * that adjustment.  On exit a0 = sp + CALLFRAME_SIZ (the start of the
  * saved frame) and a3 still holds the exception PC.  v0, v1, a1, a2
  * and t0 are overwritten after their original values have been stored,
  * and CLEAR_STATUS has been applied to the status register.
  */
 #define	SAVE_CPU \
 	SAVE_REG(AT, AST, sp)		;\
 	.set	at		        ; \
 	SAVE_REG(v0, V0, sp)		;\
 	SAVE_REG(v1, V1, sp)		;\
 	SAVE_REG(a0, A0, sp)		;\
 	SAVE_REG(a1, A1, sp)		;\
 	SAVE_REG(a2, A2, sp)		;\
 	SAVE_REG(a3, A3, sp)		;\
 	SAVE_REG(t0, T0, sp)		;\
 	SAVE_REG(t1, T1, sp)		;\
 	SAVE_REG(t2, T2, sp)		;\
 	SAVE_REG(t3, T3, sp)		;\
 	SAVE_REG(ta0, TA0, sp)		;\
 	SAVE_REG(ta1, TA1, sp)		;\
 	SAVE_REG(ta2, TA2, sp)		;\
 	SAVE_REG(ta3, TA3, sp)		;\
 	SAVE_REG(t8, T8, sp)		;\
 	SAVE_REG(t9, T9, sp)		;\
 	SAVE_REG(gp, GP, sp)		;\
 	SAVE_REG(s0, S0, sp)		;\
 	SAVE_REG(s1, S1, sp)		;\
 	SAVE_REG(s2, S2, sp)		;\
 	SAVE_REG(s3, S3, sp)		;\
 	SAVE_REG(s4, S4, sp)		;\
 	SAVE_REG(s5, S5, sp)		;\
 	SAVE_REG(s6, S6, sp)		;\
 	SAVE_REG(s7, S7, sp)		;\
 	SAVE_REG(s8, S8, sp)	        ;\
 	mflo	v0			;\
 	mfhi	v1			;\
 	mfc0	a0, MIPS_COP_0_STATUS	;\
 	mfc0	a1, MIPS_COP_0_CAUSE	;\
 	MFC0	a2, MIPS_COP_0_BAD_VADDR;\
 	MFC0	a3, MIPS_COP_0_EXC_PC	;\
 	SAVE_REG(v0, MULLO, sp)		;\
 	SAVE_REG(v1, MULHI, sp)		;\
 	SAVE_REG(a0, SR, sp)		;\
 	SAVE_REG(a1, CAUSE, sp)		;\
 	SAVE_REG(a2, BADVADDR, sp)	;\
 	move	t0, ra			;\
 	move	ra, a3			;\
 	SAVE_REG(ra, PC, sp)		;\
 	move	ra, t0			;\
 	SAVE_REG(ra, RA, sp)		;\
 	PTR_ADDU v0, sp, KERN_EXC_FRAME_SIZE ;\
 	SAVE_REG(v0, SP, sp)		;\
 	CLEAR_STATUS			;\
 	PTR_ADDU a0, sp, CALLFRAME_SIZ	;\
 	ITLBNOPFIX
 
 /*
  * Load `reg' from slot `offs' (in units of SZREG) of the register save
  * area that begins CALLFRAME_SIZ bytes above `base'; the inverse of
  * SAVE_REG.
  */
 #define	RESTORE_REG(reg, offs, base) \
 	REG_L	reg, CALLFRAME_SIZ + (SZREG * offs) (base)
 
 /*
  * RESTORE_CPU: undo SAVE_CPU.  On entry v0 must hold the PC to resume
  * at; it is written to the EXC_PC register before v0 itself is
  * reloaded.  CLEAR_STATUS runs first, HI/LO are restored through t0/t1,
  * then every general register is reloaded from the frame at sp.  sp is
  * popped by KERN_EXC_FRAME_SIZE rather than reloaded from the SP slot,
  * and the saved status value (staged in k0) is written back last.
  * Leaves `.set noat' in effect.
  */
 #define	RESTORE_CPU \
 	CLEAR_STATUS			;\
 	RESTORE_REG(k0, SR, sp)		;\
 	RESTORE_REG(t0, MULLO, sp)	;\
 	RESTORE_REG(t1, MULHI, sp)	;\
 	mtlo	t0			;\
 	mthi	t1			;\
 	MTC0	v0, MIPS_COP_0_EXC_PC	;\
 	.set noat		        ;\
 	RESTORE_REG(AT, AST, sp)	;\
 	RESTORE_REG(v0, V0, sp)		;\
 	RESTORE_REG(v1, V1, sp)		;\
 	RESTORE_REG(a0, A0, sp)		;\
 	RESTORE_REG(a1, A1, sp)		;\
 	RESTORE_REG(a2, A2, sp)		;\
 	RESTORE_REG(a3, A3, sp)		;\
 	RESTORE_REG(t0, T0, sp)		;\
 	RESTORE_REG(t1, T1, sp)		;\
 	RESTORE_REG(t2, T2, sp)		;\
 	RESTORE_REG(t3, T3, sp)		;\
 	RESTORE_REG(ta0, TA0, sp)	;\
 	RESTORE_REG(ta1, TA1, sp)	;\
 	RESTORE_REG(ta2, TA2, sp)	;\
 	RESTORE_REG(ta3, TA3, sp)	;\
 	RESTORE_REG(t8, T8, sp)		;\
 	RESTORE_REG(t9, T9, sp)		;\
 	RESTORE_REG(s0, S0, sp)		;\
 	RESTORE_REG(s1, S1, sp)		;\
 	RESTORE_REG(s2, S2, sp)		;\
 	RESTORE_REG(s3, S3, sp)		;\
 	RESTORE_REG(s4, S4, sp)		;\
 	RESTORE_REG(s5, S5, sp)		;\
 	RESTORE_REG(s6, S6, sp)		;\
 	RESTORE_REG(s7, S7, sp)		;\
 	RESTORE_REG(s8, S8, sp)	        ;\
 	RESTORE_REG(gp, GP, sp)		;\
 	RESTORE_REG(ra, RA, sp)		;\
 	PTR_ADDU sp, sp, KERN_EXC_FRAME_SIZE;\
 	mtc0	k0, MIPS_COP_0_STATUS
 
 
 /*
  * The kernel exception stack contains 18 saved general registers,
  * the status register and the multiply lo and high registers.
  * In addition, we set this up for linkage conventions.
  */
 #define	KERN_REG_SIZE		(NUMSAVEREGS * SZREG)
 #define	KERN_EXC_FRAME_SIZE	(CALLFRAME_SIZ + KERN_REG_SIZE + 16)
 
 /*
  * Flow: carve a KERN_EXC_FRAME_SIZE frame on the current stack, save
  * state with SAVE_CPU, call trap() with a0 pointing at the saved frame,
  * fold the live interrupt-mask and coprocessor-usability bits into the
  * saved status word, then RESTORE_CPU and eret.
  */
 NESTED_NOPROFILE(MipsKernGenException, KERN_EXC_FRAME_SIZE, ra)
 	.set	noat
 	PTR_SUBU	sp, sp, KERN_EXC_FRAME_SIZE
 	.mask	0x80000000, (CALLFRAME_RA - KERN_EXC_FRAME_SIZE)
 /*
  *  Save CPU state, building 'frame'.
  */
 	SAVE_CPU
 /*
  *  Call the exception handler. a0 points at the saved frame.
  */
 	PTR_LA	gp, _C_LABEL(_gp)
 	PTR_LA	k0, _C_LABEL(trap)
 	jalr	k0
 	/* Delay slot: a3 is the exception PC left behind by SAVE_CPU. */
 	REG_S	a3, CALLFRAME_RA + KERN_REG_SIZE(sp)		# for debugging
 
 	/*
 	 * Update interrupt and CPU mask in saved status register
 	 * Some of interrupts could be disabled by
 	 * intr filters if interrupts are enabled later
 	 * in trap handler
 	 */
 	mfc0	a0, MIPS_COP_0_STATUS
 	and	a0, a0, (MIPS_SR_INT_MASK|MIPS_SR_COP_USABILITY)
 	RESTORE_REG(a1, SR, sp)
 	and	a1, a1, ~(MIPS_SR_INT_MASK|MIPS_SR_COP_USABILITY)
 	or	a1, a1, a0
 	SAVE_REG(a1, SR, sp)
 	/* v0 is not reloaded here: it is whatever trap() left in it. */
 	RESTORE_CPU			# v0 contains the return address.
 	sync
 	eret
 	.set	at
 END(MipsKernGenException)
 
 
 /*----------------------------------------------------------------------------
  *
  * MipsUserGenException --
  *
  *	Handle an exception from user mode.
  *
  * Results:
  *	None.
  *
  * Side effects:
  *	None.
  *
  *----------------------------------------------------------------------------
  */
 /*
  * Flow: store the user register state through k1 (the current PCB
  * pointer taken from the per-CPU area), switch sp and gp to kernel
  * values, call trap() with a0 = k1 + U_PCB_REGS, run DO_AST, then
  * reload everything from the PCB and eret.
  */
 NESTED_NOPROFILE(MipsUserGenException, CALLFRAME_SIZ, ra)
 	.set	noat
 	.mask	0x80000000, (CALLFRAME_RA - CALLFRAME_SIZ)
 /*
  * Save all of the registers except for the kernel temporaries in u.u_pcb.
  * The mflo/mfhi/mfc0 reads are interleaved with the stores; their
  * results (v0, v1, a0-a3) are stored once the originals are saved.
  */
 	GET_CPU_PCPU(k1)
 	PTR_L	k1, PC_CURPCB(k1)
 	SAVE_U_PCB_REG(AT, AST, k1)
 	.set	at
 	SAVE_U_PCB_REG(v0, V0, k1)
 	SAVE_U_PCB_REG(v1, V1, k1)
 	SAVE_U_PCB_REG(a0, A0, k1)
 	mflo	v0
 	SAVE_U_PCB_REG(a1, A1, k1)
 	SAVE_U_PCB_REG(a2, A2, k1)
 	SAVE_U_PCB_REG(a3, A3, k1)
 	SAVE_U_PCB_REG(t0, T0, k1)
 	mfhi	v1
 	SAVE_U_PCB_REG(t1, T1, k1)
 	SAVE_U_PCB_REG(t2, T2, k1)
 	SAVE_U_PCB_REG(t3, T3, k1)
 	SAVE_U_PCB_REG(ta0, TA0, k1)
 	mfc0	a0, MIPS_COP_0_STATUS		# First arg is the status reg.
 	SAVE_U_PCB_REG(ta1, TA1, k1)
 	SAVE_U_PCB_REG(ta2, TA2, k1)
 	SAVE_U_PCB_REG(ta3, TA3, k1)
 	SAVE_U_PCB_REG(s0, S0, k1)
 	mfc0	a1, MIPS_COP_0_CAUSE		# Second arg is the cause reg.
 	SAVE_U_PCB_REG(s1, S1, k1)
 	SAVE_U_PCB_REG(s2, S2, k1)
 	SAVE_U_PCB_REG(s3, S3, k1)
 	SAVE_U_PCB_REG(s4, S4, k1)
 	MFC0	a2, MIPS_COP_0_BAD_VADDR	# Third arg is the fault addr
 	SAVE_U_PCB_REG(s5, S5, k1)
 	SAVE_U_PCB_REG(s6, S6, k1)
 	SAVE_U_PCB_REG(s7, S7, k1)
 	SAVE_U_PCB_REG(t8, T8, k1)
 	MFC0	a3, MIPS_COP_0_EXC_PC		# Fourth arg is the pc.
 	SAVE_U_PCB_REG(t9, T9, k1)
 	SAVE_U_PCB_REG(gp, GP, k1)
 	SAVE_U_PCB_REG(sp, SP, k1)
 	SAVE_U_PCB_REG(s8, S8, k1)
 	PTR_SUBU	sp, k1, CALLFRAME_SIZ	 # switch to kernel SP
 	SAVE_U_PCB_REG(ra, RA, k1)
 	SAVE_U_PCB_REG(v0, MULLO, k1)
 	SAVE_U_PCB_REG(v1, MULHI, k1)
 	SAVE_U_PCB_REG(a0, SR, k1)
 	SAVE_U_PCB_REG(a1, CAUSE, k1)
 	SAVE_U_PCB_REG(a2, BADVADDR, k1)
 	SAVE_U_PCB_REG(a3, PC, k1)
 	REG_S	a3, CALLFRAME_RA(sp)	# for debugging
 	PTR_LA	gp, _C_LABEL(_gp)	# switch to kernel GP
 # Turn off fpu and enter kernel mode
 	and	t0, a0, ~(MIPS_SR_COP_1_BIT | MIPS_SR_EXL | MIPS_SR_KSU_MASK | MIPS_SR_INT_IE)
 #if defined(CPU_CNMIPS)
 	and	t0, t0, ~(MIPS_SR_COP_2_BIT)
 	or      t0, t0, (MIPS_SR_KX | MIPS_SR_SX | MIPS_SR_UX | MIPS_SR_PX)
 #elif defined(CPU_RMI)	|| defined(CPU_NLM)
 	or      t0, t0, (MIPS_SR_KX | MIPS_SR_UX | MIPS_SR_COP_2_BIT)
 #endif	
 	mtc0	t0, MIPS_COP_0_STATUS
 	PTR_ADDU a0, k1, U_PCB_REGS
 	ITLBNOPFIX
 
 /*
  * Call the exception handler.
  */
 	PTR_LA	k0, _C_LABEL(trap)
 	jalr	k0
 	nop
 
 /*
  * Restore user registers and return.
  * First disable interrupts and set exception level.
  */
 	DO_AST
 
 	CLEAR_STATUS
 
 /*
  * The use of k1 for storing the PCB pointer must be done only
  * after interrupts are disabled.  Otherwise it will get overwritten
  * by the interrupt code.
  */
 	GET_CPU_PCPU(k1)
 	PTR_L	k1, PC_CURPCB(k1)
 
 	/*
 	 * Update interrupt mask in saved status register
 	 * Some of interrupts could be enabled by ithread
 	 * scheduled by ast()
 	 */
 	mfc0	a0, MIPS_COP_0_STATUS
 	and	a0, a0, MIPS_SR_INT_MASK
 	RESTORE_U_PCB_REG(a1, SR, k1)
 	and	a1, a1, ~MIPS_SR_INT_MASK
 	or	a1, a1, a0
 	SAVE_U_PCB_REG(a1, SR, k1)
 
 	RESTORE_U_PCB_REG(t0, MULLO, k1)
 	RESTORE_U_PCB_REG(t1, MULHI, k1)
 	mtlo	t0
 	mthi	t1
 	/* a0 is a scratch for the saved PC; its own value is reloaded below. */
 	RESTORE_U_PCB_REG(a0, PC, k1)
 	RESTORE_U_PCB_REG(v0, V0, k1)
         MTC0	a0, MIPS_COP_0_EXC_PC	# set return address
 	RESTORE_U_PCB_REG(v1, V1, k1)
 	RESTORE_U_PCB_REG(a0, A0, k1)
 	RESTORE_U_PCB_REG(a1, A1, k1)
 	RESTORE_U_PCB_REG(a2, A2, k1)
 	RESTORE_U_PCB_REG(a3, A3, k1)
 	RESTORE_U_PCB_REG(t0, T0, k1)
 	RESTORE_U_PCB_REG(t1, T1, k1)
 	RESTORE_U_PCB_REG(t2, T2, k1)
 	RESTORE_U_PCB_REG(t3, T3, k1)
 	RESTORE_U_PCB_REG(ta0, TA0, k1)
 	RESTORE_U_PCB_REG(ta1, TA1, k1)
 	RESTORE_U_PCB_REG(ta2, TA2, k1)
 	RESTORE_U_PCB_REG(ta3, TA3, k1)
 	RESTORE_U_PCB_REG(s0, S0, k1)
 	RESTORE_U_PCB_REG(s1, S1, k1)
 	RESTORE_U_PCB_REG(s2, S2, k1)
 	RESTORE_U_PCB_REG(s3, S3, k1)
 	RESTORE_U_PCB_REG(s4, S4, k1)
 	RESTORE_U_PCB_REG(s5, S5, k1)
 	RESTORE_U_PCB_REG(s6, S6, k1)
 	RESTORE_U_PCB_REG(s7, S7, k1)
 	RESTORE_U_PCB_REG(t8, T8, k1)
 	RESTORE_U_PCB_REG(t9, T9, k1)
 	RESTORE_U_PCB_REG(gp, GP, k1)
 	RESTORE_U_PCB_REG(sp, SP, k1)
 	/* k0 stages the saved status word for the mtc0 just before eret. */
 	RESTORE_U_PCB_REG(k0, SR, k1)
 	RESTORE_U_PCB_REG(s8, S8, k1)
 	RESTORE_U_PCB_REG(ra, RA, k1)
 	.set noat
 	RESTORE_U_PCB_REG(AT, AST, k1)
 
 	mtc0	k0, MIPS_COP_0_STATUS	# still exception level
 	ITLBNOPFIX
 	sync
 	eret
 	.set	at
 END(MipsUserGenException)
 
 /*
  * mips_wait: call sched_runnable() with the IE bit flipped, then set IE
  * and execute `wait' unless sched_runnable() returned non-zero in v0.
  * The four instructions from MipsWaitStart up to MipsWaitEnd occupy one
  * aligned 16-byte window; MipsKernIntr tests the interrupted PC against
  * that window.
  */
 	.set	push
 	.set	noat
 NESTED(mips_wait, CALLFRAME_SIZ, ra)
 	PTR_SUBU        sp, sp, CALLFRAME_SIZ
 	.mask   0x80000000, (CALLFRAME_RA - CALLFRAME_SIZ)
 	REG_S   ra, CALLFRAME_RA(sp)		# save RA
 	mfc0	t0, MIPS_COP_0_STATUS
 	/*
 	 * NOTE(review): xori toggles IE, so this disables interrupts only
 	 * if they were enabled on entry -- presumably always the case for
 	 * callers; confirm.
 	 */
 	xori	t1, t0, MIPS_SR_INT_IE
 	mtc0	t1, MIPS_COP_0_STATUS
 	COP0_SYNC
 	jal	sched_runnable
 	nop
 	REG_L   ra, CALLFRAME_RA(sp)
 	mfc0	t0, MIPS_COP_0_STATUS
 	ori	t1, t0, MIPS_SR_INT_IE
 	.align 4
 GLOBAL(MipsWaitStart)			# this is 16 byte aligned
 	mtc0	t1, MIPS_COP_0_STATUS
 	bnez	v0, MipsWaitEnd
 	nop
 	wait
 GLOBAL(MipsWaitEnd)			# MipsWaitStart + 16
 	jr	ra
 	/* Delay slot of jr: pop the frame. */
 	PTR_ADDU        sp, sp, CALLFRAME_SIZ
 END(mips_wait)
 	.set	pop
 
 /*----------------------------------------------------------------------------
  *
  * MipsKernIntr --
  *
  *	Handle an interrupt from kernel mode.
  *	Interrupts use the standard kernel stack.
  *	switch_exit sets up a kernel stack after exit so interrupts won't fail.
  *
  * Results:
  *	None.
  *
  * Side effects:
  *	None.
  *
  *----------------------------------------------------------------------------
  */
 
 /*
  * Flow: carve a KERN_EXC_FRAME_SIZE frame, redirect the exception PC if
  * the interrupt landed inside mips_wait's MipsWaitStart window, save
  * state with SAVE_CPU, call cpu_intr() with a0 pointing at the saved
  * frame, merge the live interrupt-mask/coprocessor bits into the saved
  * status word, then RESTORE_CPU and eret.
  */
 NESTED_NOPROFILE(MipsKernIntr, KERN_EXC_FRAME_SIZE, ra)
 	.set	noat
 	PTR_SUBU	sp, sp, KERN_EXC_FRAME_SIZE
 	.mask	0x80000000, (CALLFRAME_RA - KERN_EXC_FRAME_SIZE)
 
 /*
  * Check for getting interrupts just before wait
  *
  * The ori/xori pair clears the low four bits of the exception PC.  If
  * the result equals MipsWaitStart, the PC is moved to MipsWaitStart + 16
  * (MipsWaitEnd) so the `wait' is not executed on return.
  */
 	MFC0	k0, MIPS_COP_0_EXC_PC
 	ori	k0, 0xf
 	xori	k0, 0xf			# 16 byte align
 	PTR_LA	k1, MipsWaitStart
 	bne	k0, k1, 1f
 	nop
 	PTR_ADDU k1, 16			# skip over wait
 	MTC0	k1, MIPS_COP_0_EXC_PC
 1:
 /*
  *  Save CPU state, building 'frame'.
  */
 	SAVE_CPU
 /*
  *  Call the interrupt handler. a0 points at the saved frame.
  */
 	PTR_LA	gp, _C_LABEL(_gp)
 	PTR_LA	k0, _C_LABEL(cpu_intr)
 	jalr	k0
 	/* Delay slot: stash the exception PC (a3 from SAVE_CPU). */
 	REG_S	a3, CALLFRAME_RA + KERN_REG_SIZE(sp)		# for debugging
 
 	/*
 	 * Update interrupt and CPU mask in saved status register
 	 * Some of interrupts could be disabled by
 	 * intr filters if interrupts are enabled later
 	 * in trap handler
 	 */
 	mfc0	a0, MIPS_COP_0_STATUS
 	and	a0, a0, (MIPS_SR_INT_MASK|MIPS_SR_COP_USABILITY)
 	RESTORE_REG(a1, SR, sp)
 	and	a1, a1, ~(MIPS_SR_INT_MASK|MIPS_SR_COP_USABILITY)
 	or	a1, a1, a0
 	SAVE_REG(a1, SR, sp)
 	/* Reload the PC stashed in the jalr delay slot above. */
 	REG_L	v0, CALLFRAME_RA + KERN_REG_SIZE(sp)
 	RESTORE_CPU			# v0 contains the return address.
 	sync
 	eret
 	.set	at
 END(MipsKernIntr)
 
 /*----------------------------------------------------------------------------
  *
  * MipsUserIntr --
  *
  *	Handle an interrupt from user mode.
  *	Note: we save minimal state in the u.u_pcb struct and use the standard
  *	kernel stack since there has to be a u page if we came from user mode.
  *	If there is a pending software interrupt, then save the remaining state
  *	and call softintr(). This is all because if we call switch() inside
  *	interrupt(), not all the user registers have been saved in u.u_pcb.
  *
  * Results:
  *	None.
  *
  * Side effects:
  *	None.
  *
  *----------------------------------------------------------------------------
  */
 /*
  * Flow: store the user register state through k1 (the current PCB
  * pointer taken from the per-CPU area), switch sp and gp to kernel
  * values, call cpu_intr() with a0 = k1 + U_PCB_REGS, re-enable
  * interrupts, run DO_AST, then reload everything from the PCB and eret.
  */
 NESTED_NOPROFILE(MipsUserIntr, CALLFRAME_SIZ, ra)
 	.set	noat
 	.mask	0x80000000, (CALLFRAME_RA - CALLFRAME_SIZ)
 /*
  * Save the relevant user registers into the u.u_pcb struct.
  * We don't need to save s0 - s8 because the compiler does it for us.
  */
 	GET_CPU_PCPU(k1)
 	PTR_L	k1, PC_CURPCB(k1)
 	SAVE_U_PCB_REG(AT, AST, k1)
 	.set	at
 	SAVE_U_PCB_REG(v0, V0, k1)
 	SAVE_U_PCB_REG(v1, V1, k1)
 	SAVE_U_PCB_REG(a0, A0, k1)
 	SAVE_U_PCB_REG(a1, A1, k1)
 	SAVE_U_PCB_REG(a2, A2, k1)
 	SAVE_U_PCB_REG(a3, A3, k1)
 	SAVE_U_PCB_REG(t0, T0, k1)
 	SAVE_U_PCB_REG(t1, T1, k1)
 	SAVE_U_PCB_REG(t2, T2, k1)
 	SAVE_U_PCB_REG(t3, T3, k1)
 	SAVE_U_PCB_REG(ta0, TA0, k1)
 	SAVE_U_PCB_REG(ta1, TA1, k1)
 	SAVE_U_PCB_REG(ta2, TA2, k1)
 	SAVE_U_PCB_REG(ta3, TA3, k1)
 	SAVE_U_PCB_REG(t8, T8, k1)
 	SAVE_U_PCB_REG(t9, T9, k1)
 	SAVE_U_PCB_REG(gp, GP, k1)
 	SAVE_U_PCB_REG(sp, SP, k1)
 	SAVE_U_PCB_REG(ra, RA, k1)
 /*
  *  save remaining user state in u.u_pcb.
  *  (s0 - s8 are stored here unconditionally, despite the note above.)
  */
 	SAVE_U_PCB_REG(s0, S0, k1)
 	SAVE_U_PCB_REG(s1, S1, k1)
 	SAVE_U_PCB_REG(s2, S2, k1)
 	SAVE_U_PCB_REG(s3, S3, k1)
 	SAVE_U_PCB_REG(s4, S4, k1)
 	SAVE_U_PCB_REG(s5, S5, k1)
 	SAVE_U_PCB_REG(s6, S6, k1)
 	SAVE_U_PCB_REG(s7, S7, k1)
 	SAVE_U_PCB_REG(s8, S8, k1)
 
 	mflo	v0			# get lo/hi late to avoid stall
 	mfhi	v1
 	mfc0	a0, MIPS_COP_0_STATUS
 	mfc0	a1, MIPS_COP_0_CAUSE
 	MFC0	a3, MIPS_COP_0_EXC_PC
 	SAVE_U_PCB_REG(v0, MULLO, k1)
 	SAVE_U_PCB_REG(v1, MULHI, k1)
 	SAVE_U_PCB_REG(a0, SR, k1)
 	SAVE_U_PCB_REG(a1, CAUSE, k1)
 	SAVE_U_PCB_REG(a3, PC, k1)	# PC in a3, note used later!
 	PTR_SUBU	sp, k1, CALLFRAME_SIZ  # switch to kernel SP
 	PTR_LA	gp, _C_LABEL(_gp)	# switch to kernel GP
 
 # Turn off fpu, disable interrupts, set kernel mode, clear exception level.
 	and	t0, a0, ~(MIPS_SR_COP_1_BIT | MIPS_SR_EXL | MIPS_SR_INT_IE | MIPS_SR_KSU_MASK)
 #ifdef CPU_CNMIPS
 	and	t0, t0, ~(MIPS_SR_COP_2_BIT)
 	or      t0, t0, (MIPS_SR_KX | MIPS_SR_SX | MIPS_SR_UX | MIPS_SR_PX)
 #elif defined(CPU_RMI)	|| defined(CPU_NLM)
 	or      t0, t0, (MIPS_SR_KX | MIPS_SR_UX | MIPS_SR_COP_2_BIT)
 #endif	
 	mtc0	t0, MIPS_COP_0_STATUS
 	ITLBNOPFIX
 	PTR_ADDU a0, k1, U_PCB_REGS
 /*
  * Call the interrupt handler.
  */
 	PTR_LA	k0, _C_LABEL(cpu_intr)
 	jalr	k0
 	/* Delay slot: a3 still holds the exception PC read above. */
 	REG_S	a3, CALLFRAME_RA(sp)	# for debugging
 
 /*
  * Enable interrupts before doing ast().
  *
  * On SMP kernels the AST processing might trigger IPI to other processors.
  * If that processor is also doing AST processing with interrupts disabled
  * then we may deadlock.
  */
 	mfc0	a0, MIPS_COP_0_STATUS
 	or	a0, a0, MIPS_SR_INT_IE
 	mtc0	a0, MIPS_COP_0_STATUS
 	ITLBNOPFIX
 
 /*
  * DO_AST enabled interrupts
  */
 	DO_AST
 	
 /*
  * Restore user registers and return. 
  */
  	CLEAR_STATUS
 
 	GET_CPU_PCPU(k1)
 	PTR_L	k1, PC_CURPCB(k1)
 
 	/*
 	 * Update interrupt mask in saved status register
 	 * Some of interrupts could be disabled by
 	 * intr filters
 	 */
 	mfc0	a0, MIPS_COP_0_STATUS
 	and	a0, a0, MIPS_SR_INT_MASK
 	RESTORE_U_PCB_REG(a1, SR, k1)
 	and	a1, a1, ~MIPS_SR_INT_MASK
 	or	a1, a1, a0
 	SAVE_U_PCB_REG(a1, SR, k1)
 
 	RESTORE_U_PCB_REG(s0, S0, k1)
 	RESTORE_U_PCB_REG(s1, S1, k1)
 	RESTORE_U_PCB_REG(s2, S2, k1)
 	RESTORE_U_PCB_REG(s3, S3, k1)
 	RESTORE_U_PCB_REG(s4, S4, k1)
 	RESTORE_U_PCB_REG(s5, S5, k1)
 	RESTORE_U_PCB_REG(s6, S6, k1)
 	RESTORE_U_PCB_REG(s7, S7, k1)
 	RESTORE_U_PCB_REG(s8, S8, k1)
 	/* t0-t2 are scratch for HI/LO/PC; their own values are reloaded below. */
 	RESTORE_U_PCB_REG(t0, MULLO, k1)
 	RESTORE_U_PCB_REG(t1, MULHI, k1)
 	RESTORE_U_PCB_REG(t2, PC, k1)
 	mtlo	t0
 	mthi	t1
 	MTC0	t2, MIPS_COP_0_EXC_PC	# set return address
 	RESTORE_U_PCB_REG(v0, V0, k1)
 	RESTORE_U_PCB_REG(v1, V1, k1)
 	RESTORE_U_PCB_REG(a0, A0, k1)
 	RESTORE_U_PCB_REG(a1, A1, k1)
 	RESTORE_U_PCB_REG(a2, A2, k1)
 	RESTORE_U_PCB_REG(a3, A3, k1)
 	RESTORE_U_PCB_REG(t0, T0, k1)
 	RESTORE_U_PCB_REG(t1, T1, k1)
 	RESTORE_U_PCB_REG(t2, T2, k1)
 	RESTORE_U_PCB_REG(t3, T3, k1)
 	RESTORE_U_PCB_REG(ta0, TA0, k1)
 	RESTORE_U_PCB_REG(ta1, TA1, k1)
 	RESTORE_U_PCB_REG(ta2, TA2, k1)
 	RESTORE_U_PCB_REG(ta3, TA3, k1)
 	RESTORE_U_PCB_REG(t8, T8, k1)
 	RESTORE_U_PCB_REG(t9, T9, k1)
 	RESTORE_U_PCB_REG(gp, GP, k1)
 	/* k0 stages the saved status word for the mtc0 just before eret. */
 	RESTORE_U_PCB_REG(k0, SR, k1)
 	RESTORE_U_PCB_REG(sp, SP, k1)
 	RESTORE_U_PCB_REG(ra, RA, k1)
 	.set	noat
 	RESTORE_U_PCB_REG(AT, AST, k1)
 
 	mtc0	k0, MIPS_COP_0_STATUS	# SR with EXL set. 
 	ITLBNOPFIX
 	sync
 	eret
 	.set	at
 END(MipsUserIntr)
 
 /*
  * MipsTLBInvalidException --
  *
  *	Walk the page tables for the address in BadVAddr and, if a valid
  *	PTE is found, load it into the TLB.
  *
  *	Addresses below VM_MAXUSER_ADDRESS are looked up through the per-CPU
  *	segment base (PC_SEGBASE); all other addresses use kernel_segmap.
  *	When every level of the walk is non-NULL and the PTE has PTE_V set,
  *	the even/odd PTE pair is written to TLB_LO0/TLB_LO1 and then stored
  *	with tlbwi (if tlbp found a matching entry) or tlbwr (if it did not).
  *
  *	Otherwise control reaches label 3:
  *	  - status has MIPS_SR_KSU_USER set -> MipsUserGenException
  *	  - sp is above the thread's td_kstack -> MipsKernGenException
  *	  - else the kernel stack has overflowed: switch to this CPU's slot
  *	    in pcpu_space, save the CPU state and panic.
  *
  *	Only k0 and k1 are used until the overflow path saves state.
  */
 LEAF_NOPROFILE(MipsTLBInvalidException)
 	.set push
 	.set noat
 	.set noreorder
 
 	MFC0		k0, MIPS_COP_0_BAD_VADDR
 	PTR_LI		k1, VM_MAXUSER_ADDRESS
 	sltu		k1, k0, k1
 	bnez		k1, 1f
 	nop
 
 	/* Kernel address.  */
 	lui		k1, %hi(kernel_segmap)		# k1=hi of segbase
 	b		2f
 	PTR_L		k1, %lo(kernel_segmap)(k1)	# k1=segment tab base
 
 1:	/* User address.  */
 	GET_CPU_PCPU(k1)
 	PTR_L		k1, PC_SEGBASE(k1)
 
 2:	/* Validate page directory pointer.  */
 	beqz		k1, 3f
 	nop
 
 	/*
 	 * k1 was already checked against zero just above, so the beq below
 	 * is never taken; the andi sits in its delay slot and always runs.
 	 */
 	PTR_SRL		k0, SEGSHIFT - PTRSHIFT		# k0=seg offset (almost)
 	beq		k1, zero, MipsKernGenException	# ==0 -- no seg tab
 	andi		k0, k0, PDEPTRMASK		#06: k0=seg offset
 	PTR_ADDU	k1, k0, k1			# k1=seg entry address
 	PTR_L		k1, 0(k1)			# k1=seg entry
 
 	/* Validate page table pointer.  */
 	beqz		k1, 3f
 	nop
 
 #ifdef __mips_n64
 	/* n64 has one more table level; same never-taken beq pattern. */
 	MFC0		k0, MIPS_COP_0_BAD_VADDR
 	PTR_SRL		k0, PDRSHIFT - PTRSHIFT		# k0=pde offset (almost)
 	beq		k1, zero, MipsKernGenException	# ==0 -- no pde tab
 	andi		k0, k0, PDEPTRMASK		# k0=pde offset
 	PTR_ADDU	k1, k0, k1			# k1=pde entry address
 	PTR_L		k1, 0(k1)			# k1=pde entry
 
 	/* Validate pde table pointer.  */
 	beqz		k1, 3f
 	nop
 #endif
 	MFC0		k0, MIPS_COP_0_BAD_VADDR	# k0=bad address (again)
 	PTR_SRL		k0, PAGE_SHIFT - PTESHIFT	# k0=VPN
 	andi		k0, k0, PTEMASK			# k0=page tab offset
 	PTR_ADDU	k1, k1, k0			# k1=pte address
 	PTE_L		k0, 0(k1)			# k0=this PTE
 
 	/* Validate page table entry.  */
 	andi		k0, PTE_V
 	beqz		k0, 3f
 	nop
 
 	/* Check whether this is an even or odd entry.  */
 	andi		k0, k1, PTESIZE
 	bnez		k0, odd_page
 	nop
 
 	/* Even entry: k1 points at the LO0 PTE, its neighbour is LO1. */
 	PTE_L		k0, 0(k1)
 	PTE_L		k1, PTESIZE(k1)
 	CLEAR_PTE_SWBITS(k0)
 	PTE_MTC0	k0, MIPS_COP_0_TLB_LO0
 	COP0_SYNC
 	CLEAR_PTE_SWBITS(k1)
 	PTE_MTC0	k1, MIPS_COP_0_TLB_LO1
 	COP0_SYNC
 
 	b		tlb_insert_entry
 	nop
 
 odd_page:
 	/* Odd entry: the preceding PTE is LO0, k1 points at the LO1 PTE. */
 	PTE_L		k0, -PTESIZE(k1)
 	PTE_L		k1, 0(k1)
 	CLEAR_PTE_SWBITS(k0)
 	PTE_MTC0	k0, MIPS_COP_0_TLB_LO0
 	COP0_SYNC
 	CLEAR_PTE_SWBITS(k1)
 	PTE_MTC0	k1, MIPS_COP_0_TLB_LO1
 	COP0_SYNC
 
 tlb_insert_entry:
 	/*
 	 * Probe the TLB.  A negative index means no matching entry, in
 	 * which case a random slot is written; otherwise the matching
 	 * entry is overwritten in place.
 	 */
 	tlbp
 	HAZARD_DELAY
 	mfc0		k0, MIPS_COP_0_TLB_INDEX
 	bltz		k0, tlb_insert_random
 	nop
 	tlbwi
 	eret
 	ssnop
 
 tlb_insert_random:
 	tlbwr
 	eret
 	ssnop
 
 3:
 	/*
 	 * Branch to the comprehensive exception processing.
 	 */
 	mfc0	k1, MIPS_COP_0_STATUS
 	andi	k1, k1, MIPS_SR_KSU_USER
 	bnez	k1, _C_LABEL(MipsUserGenException)
 	nop
 
 	/*
 	 * Check for kernel stack overflow.
 	 * k0 = (td_kstack < sp); non-zero means sp is still above the
 	 * stack base, so this is an ordinary kernel fault.
 	 */
 	GET_CPU_PCPU(k1)
 	PTR_L	k0, PC_CURTHREAD(k1)
 	PTR_L	k0, TD_KSTACK(k0)
 	sltu	k0, k0, sp
 	bnez	k0, _C_LABEL(MipsKernGenException)
 	nop
 
 	/*
 	 * Kernel stack overflow.
 	 *
 	 * Move to a valid stack before we call panic. We use the boot stack
 	 * for this purpose.
 	 */
 	GET_CPU_PCPU(k1)
 	lw	k1, PC_CPUID(k1)
 	sll	k1, k1, PAGE_SHIFT + 1
 
 	/* k0 = pcpu_space + PAGE_SIZE * 2 + (cpuid << (PAGE_SHIFT + 1)) */
 	PTR_LA	k0, _C_LABEL(pcpu_space)
 	PTR_ADDU	k0, PAGE_SIZE * 2
 	PTR_ADDU	k0, k0, k1
 
 	/*
 	 * Stash the original value of 'sp' so we can update trapframe later.
 	 * We assume that SAVE_CPU does not trash 'k1'.
 	 */
 	move	k1, sp
 
 	move	sp, k0
 	PTR_SUBU	sp, sp, KERN_EXC_FRAME_SIZE
 
 	/* Store a zero RA and SP in the call frame, preserving the real ra. */
 	move	k0, ra
 	move	ra, zero
 	REG_S	ra, CALLFRAME_RA(sp)	/* stop the ddb backtrace right here */
 	REG_S	zero, CALLFRAME_SP(sp)
 	move	ra, k0
 
 	SAVE_CPU
 
 	/*
 	 * Now restore the value of 'sp' at the time of the tlb exception in
 	 * the trapframe.
 	 */
 	SAVE_REG(k1, SP, sp)
 
 	/*
 	 * Squelch any more overflow checks by setting the stack base to 0.
 	 */
 	GET_CPU_PCPU(k1)
 	PTR_L	k0, PC_CURTHREAD(k1)
 	PTR_S	zero, TD_KSTACK(k0)
 
 	/*
 	 * a0 becomes the %p argument of the panic message.
 	 * NOTE(review): presumably SAVE_CPU leaves the trapframe pointer
 	 * in a0 -- confirm against the macro definition.
 	 */
 	move	a1, a0
 	PANIC("kernel stack overflow - trapframe at %p")
 
 	/*
 	 * This nop is necessary so that the 'ra' remains within the bounds
 	 * of this handler. Otherwise the ddb backtrace code will think that
 	 * the panic() was called from MipsTLBMissException.
 	 */
 	nop
 
 	.set pop
 END(MipsTLBInvalidException)
 
 /*----------------------------------------------------------------------------
  *
  * MipsTLBMissException --
  *
  *	Handle a TLB miss exception from kernel mode in kernel space.
  *	The BaddVAddr, Context, and EntryHi registers contain the failed
  *	virtual address.
  *
  * Results:
  *	None.
  *
  * Side effects:
  *	None.
  *
  *----------------------------------------------------------------------------
  */
 LEAF_NOPROFILE(MipsTLBMissException)
 	.set	noat
 	/* Only k0 and k1 are used; nothing is saved before the eret. */
 	MFC0		k0, MIPS_COP_0_BAD_VADDR	# k0=bad address
 	PTR_LI		k1, VM_MAX_KERNEL_ADDRESS	# check fault address against
 	sltu		k1, k1, k0			# upper bound of kernel_segmap
 	bnez		k1, MipsKernGenException	# out of bound
 	lui		k1, %hi(kernel_segmap)		# k1=hi of segbase
 	PTR_SRL		k0, SEGSHIFT - PTRSHIFT		# k0=seg offset (almost)
 	PTR_L		k1, %lo(kernel_segmap)(k1)	# k1=segment tab base
 	beq		k1, zero, MipsKernGenException	# ==0 -- no seg tab
 	andi		k0, k0, PDEPTRMASK		#06: k0=seg offset
 	PTR_ADDU	k1, k0, k1			# k1=seg entry address
 	PTR_L		k1, 0(k1)			# k1=seg entry
 	MFC0		k0, MIPS_COP_0_BAD_VADDR	# k0=bad address (again)
 	beq		k1, zero, MipsKernGenException	# ==0 -- no page table
 #ifdef __mips_n64
 	PTR_SRL		k0, PDRSHIFT - PTRSHIFT		# k0=pde offset (almost)
 	andi		k0, k0, PDEPTRMASK		# k0=pde offset
 	PTR_ADDU	k1, k0, k1			# k1=pde entry address
 	PTR_L		k1, 0(k1)			# k1=pde entry
 	MFC0		k0, MIPS_COP_0_BAD_VADDR	# k0=bad address (again)
   	beq		k1, zero, MipsKernGenException	# ==0 -- no page table
 #endif
 	PTR_SRL		k0, PAGE_SHIFT - PTESHIFT	# k0=VPN
 	andi		k0, k0, PTE2MASK		# k0=page tab offset
 	PTR_ADDU	k1, k1, k0			# k1=pte address
 	PTE_L		k0, 0(k1)			# k0=lo0 pte
 	PTE_L		k1, PTESIZE(k1)			# k1=lo1 pte
 	CLEAR_PTE_SWBITS(k0)
 	PTE_MTC0	k0, MIPS_COP_0_TLB_LO0		# lo0 is loaded
 	COP0_SYNC
 	CLEAR_PTE_SWBITS(k1)
 	PTE_MTC0	k1, MIPS_COP_0_TLB_LO1		# lo1 is loaded
 	COP0_SYNC
 	tlbwr					# write to tlb
 	HAZARD_DELAY
 	eret					# return from exception
 	.set	at
 END(MipsTLBMissException)
 
 /*----------------------------------------------------------------------------
  *
  * MipsFPTrap --
  *
  *	Handle a floating point Trap.
  *
  *	MipsFPTrap(statusReg, causeReg, pc)
  *		unsigned statusReg;
  *		unsigned causeReg;
  *		unsigned pc;
  *
  * Results:
  *	None.
  *
  * Side effects:
  *	None.
  *
  *----------------------------------------------------------------------------
  */
 NESTED(MipsFPTrap, CALLFRAME_SIZ, ra)
 	PTR_SUBU	sp, sp, CALLFRAME_SIZ
 	mfc0	t0, MIPS_COP_0_STATUS
 	REG_S	ra, CALLFRAME_RA(sp)
 	.mask	0x80000000, (CALLFRAME_RA - CALLFRAME_SIZ)
 
 	/* Enable coprocessor 1 so the FPU control register can be read. */
 	or	t1, t0, MIPS_SR_COP_1_BIT
 	mtc0	t1, MIPS_COP_0_STATUS
 	ITLBNOPFIX
 	cfc1	t1, MIPS_FPU_CSR		# stall til FP done
 	cfc1	t1, MIPS_FPU_CSR		# now get status
 	nop
 	/* Shift CSR bit 17 into the sign bit so bgez can test it. */
 	sll	t2, t1, (31 - 17)		# unimplemented operation?
 	bgez	t2, 3f				# no, normal trap
 	nop
 /*
  * We got an unimplemented operation trap so
  * fetch the instruction, compute the next PC and emulate the instruction.
  */
 	bgez	a1, 1f				# Check the branch delay bit.
 	nop
 /*
  * The instruction is in the branch delay slot so the branch will have to
  * be emulated to get the resulting PC.
  */
 	/* Save the exception PC; a2 is reused as an argument below. */
 	PTR_S	a2, CALLFRAME_SIZ + 8(sp)
 	GET_CPU_PCPU(a0)
 #mips64 unsafe?
 	PTR_L	a0, PC_CURPCB(a0)
 	PTR_ADDU a0, a0, U_PCB_REGS		# first arg is ptr to CPU registers
 	move	a1, a2				# second arg is instruction PC
 	move	a2, t1				# third arg is floating point CSR
 	PTR_LA	t3, _C_LABEL(MipsEmulateBranch)	# compute PC after branch
 	jalr	t3				# compute PC after branch
 	move	a3, zero			# fourth arg is FALSE
 /*
  * Now load the floating-point instruction in the branch delay slot
  * to be emulated.
  */
 	PTR_L	a2, CALLFRAME_SIZ + 8(sp)	# restore EXC pc
 	b	2f
 	lw	a0, 4(a2)			# a0 = coproc instruction
 /*
  * This is not in the branch delay slot so calculate the resulting
  * PC (epc + 4) into v0 and continue to MipsEmulateFP().
  */
 1:
 	lw	a0, 0(a2)			# a0 = coproc instruction
 #xxx mips64 unsafe?
 	PTR_ADDU	v0, a2, 4			# v0 = next pc
 2:
 	/* v0 holds the next PC on both paths; store it in the PCB. */
 	GET_CPU_PCPU(t2)
 	PTR_L	t2, PC_CURPCB(t2)
 	SAVE_U_PCB_REG(v0, PC, t2)		# save new pc
 /*
  * Check to see if the instruction to be emulated is a floating-point
  * instruction.
  */
 	srl	a3, a0, MIPS_OPCODE_SHIFT
 	beq	a3, MIPS_OPCODE_C1, 4f		# this should never fail
 	nop
 /*
  * Send a floating point exception signal to the current process.
  */
 3:
 	GET_CPU_PCPU(a0)
 	PTR_L	a0, PC_CURTHREAD(a0)		# get current thread
 	cfc1	a2, MIPS_FPU_CSR		# code = FP exceptions
 	ctc1	zero, MIPS_FPU_CSR		# Clear exceptions
 	PTR_LA	t3, _C_LABEL(trapsignal)
 	jalr	t3
 	li	a1, SIGFPE
 	b	FPReturn
 	nop
 
 /*
  * Finally, we can call MipsEmulateFP() where a0 is the instruction to emulate.
  */
 4:
 	PTR_LA	t3, _C_LABEL(MipsEmulateFP)
 	jalr	t3
 	nop
 
 /*
  * Turn off the floating point coprocessor and return.
  */
 FPReturn:
 	mfc0	t0, MIPS_COP_0_STATUS
 	PTR_L	ra, CALLFRAME_RA(sp)
 	and	t0, t0, ~MIPS_SR_COP_1_BIT
 	mtc0	t0, MIPS_COP_0_STATUS
 	ITLBNOPFIX
 	j	ra
 	PTR_ADDU sp, sp, CALLFRAME_SIZ
 END(MipsFPTrap)
 
 /*
  * Interrupt counters for vmstat.
  */
 	.data
 	.globl intrcnt
 	.globl sintrcnt
 	.globl intrnames
 	.globl sintrnames
 /* Storage for interrupt names: (MAXCOMLEN + 1) bytes per slot. */
 intrnames:
 	.space  INTRCNT_COUNT * (MAXCOMLEN + 1) * 2
 /* Size in bytes of the intrnames area; 64-bit on n64, 32-bit otherwise. */
 sintrnames:
 #ifdef __mips_n64
 	.quad  INTRCNT_COUNT * (MAXCOMLEN + 1) * 2
 #else
 	.int  INTRCNT_COUNT * (MAXCOMLEN + 1) * 2
 #endif
 
 	.align	(_MIPS_SZLONG / 8)
 /* Storage for interrupt counters: (_MIPS_SZLONG / 8) bytes per slot. */
 intrcnt:
 	.space  INTRCNT_COUNT * (_MIPS_SZLONG / 8) * 2
 /* Size in bytes of the intrcnt area; 64-bit on n64, 32-bit otherwise. */
 sintrcnt:
 #ifdef __mips_n64
 	.quad  INTRCNT_COUNT * (_MIPS_SZLONG / 8) * 2
 #else
 	.int  INTRCNT_COUNT * (_MIPS_SZLONG / 8) * 2
 #endif
 
 
 /*
  * Vector to real handler in KSEG1.
  */
 	.text
 VECTOR(MipsCache, unknown)
 	/*
 	 * Reduce the handler's address with MIPS_KSEG0_PHYS_MASK, then
 	 * OR in MIPS_KSEG1_START so the jump lands on the KSEG1 alias of
 	 * MipsCacheException.  Only k0 and k1 are used.
 	 */
 	PTR_LA	k0, _C_LABEL(MipsCacheException)
 	li	k1, MIPS_KSEG0_PHYS_MASK
 	and	k0, k1
 	PTR_LI	k1, MIPS_KSEG1_START
 	or	k0, k1
 	j	k0
 	nop
 VECTOR_END(MipsCache)
 
 	.set	at
 
 
 /*
  * Panic on cache errors.  A lot more could be done to recover
  * from some types of errors but it is tricky.
  */
 NESTED_NOPROFILE(MipsCacheException, KERN_EXC_FRAME_SIZE, ra)
 	.set	noat
 	.mask	0x80000000, -4
 	/*
 	 * Arrange for the eret below to "return" into panic() with
 	 * a0 = format string, a1 = original ErrorPC, a2 = CacheErr.
 	 * NOTE(review): label 9 is presumably defined by the MSG() macro
 	 * at the end of this function -- confirm.
 	 */
 	PTR_LA	k0, _C_LABEL(panic)		# return to panic
 	PTR_LA	a0, 9f				# panicstr
 	MFC0	a1, MIPS_COP_0_ERROR_PC
 	mfc0	a2, MIPS_COP_0_CACHE_ERR	# 3rd arg cache error
 
 	MTC0	k0, MIPS_COP_0_ERROR_PC		# set return address
 
 	mfc0	k0, MIPS_COP_0_STATUS		# read current status
 	li	k1, MIPS_SR_DIAG_PE		# ignore further errors
 	or	k0, k1
 	mtc0	k0, MIPS_COP_0_STATUS		# write status with PE set
 	COP0_SYNC
 
 	eret
 
 	MSG("cache error @ EPC 0x%x CachErr 0x%x");
 	.set	at
 END(MipsCacheException)
Index: head/sys/mips/mips/trap.c
===================================================================
--- head/sys/mips/mips/trap.c	(revision 258540)
+++ head/sys/mips/mips/trap.c	(revision 258541)
@@ -1,1735 +1,1734 @@
 /*	$OpenBSD: trap.c,v 1.19 1998/09/30 12:40:41 pefo Exp $	*/
 /* tracked to 1.23 */
 /*-
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and Ralph Campbell.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: Utah Hdr: trap.c 1.32 91/04/06
  *
  *	from: @(#)trap.c	8.5 (Berkeley) 1/11/94
  *	JNPR: trap.c,v 1.13.2.2 2007/08/29 10:03:49 girish
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_global.h"
 #include "opt_ktrace.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysent.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/lock.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_param.h>
 #include <sys/vmmeter.h>
 #include <sys/ptrace.h>
 #include <sys/user.h>
 #include <sys/buf.h>
 #include <sys/vnode.h>
 #include <sys/pioctl.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/bus.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #include <net/netisr.h>
 
 #include <machine/trap.h>
 #include <machine/cpu.h>
 #include <machine/pte.h>
 #include <machine/pmap.h>
 #include <machine/md_var.h>
 #include <machine/mips_opcode.h>
 #include <machine/frame.h>
 #include <machine/regnum.h>
 #include <machine/tls.h>
 
 #ifdef DDB
 #include <machine/db_machdep.h>
 #include <ddb/db_sym.h>
 #include <ddb/ddb.h>
 #include <sys/kdb.h>
 #endif
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 /*
  * This is a hook which is initialised by the dtrace module
  * to handle traps which might occur during DTrace probe
  * execution.
  */
 dtrace_trap_func_t	dtrace_trap_func;
 
 dtrace_doubletrap_func_t	dtrace_doubletrap_func;
 
 /*
  * This is a hook which is initialised by the systrace module
  * when it is loaded. This keeps the DTrace syscall provider
  * implementation opaque. 
  */
 systrace_probe_func_t	systrace_probe_func;
 
 /*
  * These hooks are necessary for the pid and usdt providers.
  */
 dtrace_pid_probe_ptr_t		dtrace_pid_probe_ptr;
 dtrace_return_probe_ptr_t	dtrace_return_probe_ptr;
 #endif
 
 #ifdef TRAP_DEBUG
 int trap_debug = 0;
 SYSCTL_INT(_machdep, OID_AUTO, trap_debug, CTLFLAG_RW,
     &trap_debug, 0, "Debug information on all traps");
 #endif
 
 /*
  * Single-instruction inline-assembly wrappers for byte accesses and for the
  * left/right partial word (lwl/lwr, swl/swr) and doubleword (ldl/ldr,
  * sdl/sdr) loads and stores.
  *
  * Each load macro passes 'data' as a write-only ("=r") output, so the
  * compiler is not told about any previous value of 'data'.
  *
  * Every macro already ends in ';'.  Invoking one as "foo_macro(d, a);"
  * yields an extra empty statement, so do not use these as the unbraced
  * body of an if that has an else.
  */
 #define	lbu_macro(data, addr)						\
 	__asm __volatile ("lbu %0, 0x0(%1)"				\
 			: "=r" (data)	/* outputs */			\
 			: "r" (addr));	/* inputs */
 
 #define	lb_macro(data, addr)						\
 	__asm __volatile ("lb %0, 0x0(%1)"				\
 			: "=r" (data)	/* outputs */			\
 			: "r" (addr));	/* inputs */
 
 #define	lwl_macro(data, addr)						\
 	__asm __volatile ("lwl %0, 0x0(%1)"				\
 			: "=r" (data)	/* outputs */			\
 			: "r" (addr));	/* inputs */
 
 #define	lwr_macro(data, addr)						\
 	__asm __volatile ("lwr %0, 0x0(%1)"				\
 			: "=r" (data)	/* outputs */			\
 			: "r" (addr));	/* inputs */
 
 #define	ldl_macro(data, addr)						\
 	__asm __volatile ("ldl %0, 0x0(%1)"				\
 			: "=r" (data)	/* outputs */			\
 			: "r" (addr));	/* inputs */
 
 #define	ldr_macro(data, addr)						\
 	__asm __volatile ("ldr %0, 0x0(%1)"				\
 			: "=r" (data)	/* outputs */			\
 			: "r" (addr));	/* inputs */
 
 #define	sb_macro(data, addr)						\
 	__asm __volatile ("sb %0, 0x0(%1)"				\
 			:				/* outputs */	\
 			: "r" (data), "r" (addr));	/* inputs */
 
 #define	swl_macro(data, addr)						\
 	__asm __volatile ("swl %0, 0x0(%1)"				\
 			: 				/* outputs */	\
 			: "r" (data), "r" (addr));	/* inputs */
 
 #define	swr_macro(data, addr)						\
 	__asm __volatile ("swr %0, 0x0(%1)"				\
 			: 				/* outputs */	\
 			: "r" (data), "r" (addr));	/* inputs */
 
 #define	sdl_macro(data, addr)						\
 	__asm __volatile ("sdl %0, 0x0(%1)"				\
 			: 				/* outputs */	\
 			: "r" (data), "r" (addr));	/* inputs */
 
 #define	sdr_macro(data, addr)						\
 	__asm __volatile ("sdr %0, 0x0(%1)"				\
 			:				/* outputs */	\
 			: "r" (data), "r" (addr));	/* inputs */
 
 static void log_illegal_instruction(const char *, struct trapframe *);
 static void log_bad_page_fault(char *, struct trapframe *, int);
 static void log_frame_dump(struct trapframe *frame);
 static void get_mapping_info(vm_offset_t, pd_entry_t **, pt_entry_t **);
 
 #ifdef TRAP_DEBUG
 static void trap_frame_dump(struct trapframe *frame);
 #endif
 
 /*
  * Exception handler table: 32 kernel-mode entries followed by 32 user-mode
  * entries, each group listed in exception-code order (see the per-entry
  * comments).
  * NOTE(review): presumably indexed by exception code, plus 32 when the
  * exception came from user mode -- confirm against the dispatch code.
  */
 void (*machExceptionTable[]) (void)= {
 /*
  * The kernel exception handlers.
  */
 	MipsKernIntr,		/* external interrupt */
 	MipsKernGenException,	/* TLB modification */
 	MipsTLBInvalidException,/* TLB miss (load or instr. fetch) */
 	MipsTLBInvalidException,/* TLB miss (store) */
 	MipsKernGenException,	/* address error (load or I-fetch) */
 	MipsKernGenException,	/* address error (store) */
 	MipsKernGenException,	/* bus error (I-fetch) */
 	MipsKernGenException,	/* bus error (load or store) */
 	MipsKernGenException,	/* system call */
 	MipsKernGenException,	/* breakpoint */
 	MipsKernGenException,	/* reserved instruction */
 	MipsKernGenException,	/* coprocessor unusable */
 	MipsKernGenException,	/* arithmetic overflow */
 	MipsKernGenException,	/* trap exception */
 	MipsKernGenException,	/* virtual coherence exception inst */
 	MipsKernGenException,	/* floating point exception */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* watch exception */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* reserved */
 	MipsKernGenException,	/* virtual coherence exception data */
 /*
  * The user exception handlers.
  */
 	MipsUserIntr,		/* 0 */
 	MipsUserGenException,	/* 1 */
 	MipsTLBInvalidException,/* 2 */
 	MipsTLBInvalidException,/* 3 */
 	MipsUserGenException,	/* 4 */
 	MipsUserGenException,	/* 5 */
 	MipsUserGenException,	/* 6 */
 	MipsUserGenException,	/* 7 */
 	MipsUserGenException,	/* 8 */
 	MipsUserGenException,	/* 9 */
 	MipsUserGenException,	/* 10 */
 	MipsUserGenException,	/* 11 */
 	MipsUserGenException,	/* 12 */
 	MipsUserGenException,	/* 13 */
 	MipsUserGenException,	/* 14 */
 	MipsUserGenException,	/* 15 */
 	MipsUserGenException,	/* 16 */
 	MipsUserGenException,	/* 17 */
 	MipsUserGenException,	/* 18 */
 	MipsUserGenException,	/* 19 */
 	MipsUserGenException,	/* 20 */
 	MipsUserGenException,	/* 21 */
 	MipsUserGenException,	/* 22 */
 	MipsUserGenException,	/* 23 */
 	MipsUserGenException,	/* 24 */
 	MipsUserGenException,	/* 25 */
 	MipsUserGenException,	/* 26 */
 	MipsUserGenException,	/* 27 */
 	MipsUserGenException,	/* 28 */
 	MipsUserGenException,	/* 29 */
 	MipsUserGenException,	/* 30 */
 	MipsUserGenException,	/* 31 */
 };
 
 /*
  * Human-readable names for the 32 exception codes, in code order.  trap()
  * indexes this with the trap type after masking off T_USER.
  */
 char *trap_type[] = {
 	"external interrupt",
 	"TLB modification",
 	"TLB miss (load or instr. fetch)",
 	"TLB miss (store)",
 	"address error (load or I-fetch)",
 	"address error (store)",
 	"bus error (I-fetch)",
 	"bus error (load or store)",
 	"system call",
 	"breakpoint",
 	"reserved instruction",
 	"coprocessor unusable",
 	"arithmetic overflow",
 	"trap",
 	"virtual coherency instruction",
 	"floating point",
 	"reserved 16",
 	"reserved 17",
 	"reserved 18",
 	"reserved 19",
 	"reserved 20",
 	"reserved 21",
 	"reserved 22",
 	"watch",
 	"reserved 24",
 	"reserved 25",
 	"reserved 26",
 	"reserved 27",
 	"reserved 28",
 	"reserved 29",
 	"reserved 30",
 	"virtual coherency data",
 };
 
 #if !defined(SMP) && (defined(DDB) || defined(DEBUG))
 struct trapdebug trapdebug[TRAPSIZE], *trp = trapdebug;
 #endif
 
 #if defined(DDB) || defined(DEBUG)
 void stacktrace(struct trapframe *);
 void logstacktrace(struct trapframe *);
 #endif
 
 /* True when x lies in [VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS). */
 #define	KERNLAND(x)	((vm_offset_t)(x) >= VM_MIN_KERNEL_ADDRESS && (vm_offset_t)(x) < VM_MAX_KERNEL_ADDRESS)
 /*
  * True when bit 31 of x is set.  Applied to the cause register, this tests
  * the branch-delay (BD) bit.
  */
 #define	DELAYBRANCH(x)	((int)(x) < 0)
 
 /*
  * MIPS load/store access type
  */
 /* Access type codes; numbering starts at 1. */
 enum {
 	MIPS_LHU_ACCESS = 1,
 	MIPS_LH_ACCESS,
 	MIPS_LWU_ACCESS,
 	MIPS_LW_ACCESS,
 	MIPS_LD_ACCESS,
 	MIPS_SH_ACCESS,
 	MIPS_SW_ACCESS,
 	MIPS_SD_ACCESS
 };
 
 /*
  * Printable names, listed in the same order as the enum above.  The array
  * is zero-based while the enum starts at 1.
  * NOTE(review): users presumably index with (access type - 1) -- confirm.
  */
 char *access_name[] = {
 	"Load Halfword Unsigned",
 	"Load Halfword",
 	"Load Word Unsigned",
 	"Load Word",
 	"Load Doubleword",
 	"Store Halfword",
 	"Store Word",
 	"Store Doubleword"
 };
 
 #ifdef	CPU_CNMIPS
 #include <machine/octeon_cop2.h>
 #endif
 
 static int allow_unaligned_acc = 1;
 
 SYSCTL_INT(_vm, OID_AUTO, allow_unaligned_acc, CTLFLAG_RW,
     &allow_unaligned_acc, 0, "Allow unaligned accesses");
 
 /*
  * FP emulation is assumed to work on O32, but the code is outdated and crufty
  * enough that it's a more sensible default to have it disabled when using
  * other ABIs.  At the very least, it needs a lot of help in using
  * type-semantic ABI-oblivious macros for everything it does.
  */
 #if defined(__mips_o32)
 static int emulate_fp = 1;
 #else
 static int emulate_fp = 0;
 #endif
 SYSCTL_INT(_machdep, OID_AUTO, emulate_fp, CTLFLAG_RW,
     &emulate_fp, 0, "Emulate unimplemented FPU instructions");
 
 static int emulate_unaligned_access(struct trapframe *frame, int mode);
 
 extern void fswintrberr(void); /* XXX */
 
 /*
  * Decode the system call described by the thread's trapframe into *sa.
  *
  * The saved PC is advanced past the syscall instruction (emulating the
  * branch when the BD bit is set in the cause register), the syscall number
  * is read from v0, and the arguments are gathered from a0-a3 and, on
  * n32/n64 kernels for non-ILP32 processes, a4-a7.  For SYS_syscall and
  * SYS___syscall the real syscall number is taken from the leading argument
  * register(s) and the remaining registers are shifted down.  Arguments that
  * were not passed in registers are copied in from the user stack.
  *
  * On success td_retval[0] is cleared and td_retval[1] is set to v1.  If the
  * copyin fails, v0 in the trapframe is set to the error and a3 to 1.
  *
  * Returns 0 on success or the copyin() error.
  */
 int
 cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
 {
 	struct trapframe *locr0 = td->td_frame;
 	struct sysentvec *se;
 	int error, nsaved;
 
 	bzero(sa->args, sizeof(sa->args));
 
 	/* compute next PC after syscall instruction */
 	td->td_pcb->pcb_tpc = sa->trapframe->pc; /* Remember if restart */
 	if (DELAYBRANCH(sa->trapframe->cause))	 /* Check BD bit */
 		locr0->pc = MipsEmulateBranch(locr0, sa->trapframe->pc, 0, 0);
 	else
 		locr0->pc += sizeof(int);
 	sa->code = locr0->v0;
 
 	/* nsaved counts the arguments taken from registers. */
 	switch (sa->code) {
 	case SYS___syscall:
 	case SYS_syscall:
 		/*
 		 * This is an indirect syscall, in which the code is the first argument.
 		 */
 #if (!defined(__mips_n32) && !defined(__mips_n64)) || defined(COMPAT_FREEBSD32)
 		if (sa->code == SYS___syscall && SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 			/*
 			 * Like syscall, but code is a quad, so as to maintain alignment
 			 * for the rest of the arguments.
 			 */
 			if (_QUAD_LOWWORD == 0)
 				sa->code = locr0->a0;
 			else
 				sa->code = locr0->a1;
 			sa->args[0] = locr0->a2;
 			sa->args[1] = locr0->a3;
 			nsaved = 2;
 			break;
 		} 
 #endif
 		/*
 		 * This is either not a quad syscall, or is a quad syscall with a
 		 * new ABI in which quads fit in a single register.
 		 */
 		sa->code = locr0->a0;
 		sa->args[0] = locr0->a1;
 		sa->args[1] = locr0->a2;
 		sa->args[2] = locr0->a3;
 		nsaved = 3;
 #if defined(__mips_n32) || defined(__mips_n64)
 #ifdef COMPAT_FREEBSD32
 		if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 #endif
 			/*
 			 * Non-o32 ABIs support more arguments in registers.
 			 */
 			sa->args[3] = locr0->a4;
 			sa->args[4] = locr0->a5;
 			sa->args[5] = locr0->a6;
 			sa->args[6] = locr0->a7;
 			nsaved += 4;
 #ifdef COMPAT_FREEBSD32
 		}
 #endif
 #endif
 		break;
 	default:
 		/*
 		 * A direct syscall, arguments are just parameters to the syscall.
 		 */
 		sa->args[0] = locr0->a0;
 		sa->args[1] = locr0->a1;
 		sa->args[2] = locr0->a2;
 		sa->args[3] = locr0->a3;
 		nsaved = 4;
 #if defined (__mips_n32) || defined(__mips_n64)
 #ifdef COMPAT_FREEBSD32
 		if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 #endif
 			/*
 			 * Non-o32 ABIs support more arguments in registers.
 			 */
 			sa->args[4] = locr0->a4;
 			sa->args[5] = locr0->a5;
 			sa->args[6] = locr0->a6;
 			sa->args[7] = locr0->a7;
 			nsaved += 4;
 #ifdef COMPAT_FREEBSD32
 		}
 #endif
 #endif
 		break;
 	}
 
 #ifdef TRAP_DEBUG
 	if (trap_debug)
 		printf("SYSCALL #%d pid:%u\n", sa->code, td->td_proc->p_pid);
 #endif
 
 	se = td->td_proc->p_sysent;
 	/*
 	 * XXX
 	 * Shouldn't this go before switching on the code?
 	 */
 	if (se->sv_mask)
 		sa->code &= se->sv_mask;
 
 	/* Out-of-range codes are mapped to entry 0 of the syscall table. */
 	if (sa->code >= se->sv_size)
 		sa->callp = &se->sv_table[0];
 	else
 		sa->callp = &se->sv_table[sa->code];
 
 	sa->narg = sa->callp->sy_narg;
 
 	if (sa->narg > nsaved) {
 #if defined(__mips_n32) || defined(__mips_n64)
 		/*
 		 * XXX
 		 * Is this right for new ABIs?  I think the 4 there
 		 * should be 8, since there are 8 registers to skip,
 		 * not 4, but I'm not certain.
 		 */
 #ifdef COMPAT_FREEBSD32
 		if (!SV_PROC_FLAG(td->td_proc, SV_ILP32))
 #endif
 			printf("SYSCALL #%u pid:%u, narg (%u) > nsaved (%u).\n",
 			    sa->code, td->td_proc->p_pid, sa->narg, nsaved);
 #endif
 #if (defined(__mips_n32) || defined(__mips_n64)) && defined(COMPAT_FREEBSD32)
 		if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 			unsigned i;
 			int32_t arg;
 
 			/*
 			 * ILP32 process on a 64-bit kernel: fetch the stack
 			 * arguments one 32-bit word at a time and widen each
 			 * into its sa->args[] slot.
 			 */
 			error = 0; /* XXX GCC is awful.  */
 			for (i = nsaved; i < sa->narg; i++) {
 				error = copyin((caddr_t)(intptr_t)(locr0->sp +
 				    (4 + (i - nsaved)) * sizeof(int32_t)),
 				    (caddr_t)&arg, sizeof arg);
 				if (error != 0)
 					break;
 				sa->args[i] = arg;
 			}
 		} else
 #endif
 		/*
 		 * Native case: copy the remaining arguments in one go,
 		 * starting 4 register-sized slots above the user sp.
 		 */
 		error = copyin((caddr_t)(intptr_t)(locr0->sp +
 		    4 * sizeof(register_t)), (caddr_t)&sa->args[nsaved],
 		   (u_int)(sa->narg - nsaved) * sizeof(register_t));
 		if (error != 0) {
 			locr0->v0 = error;
 			locr0->a3 = 1;
 		}
 	} else
 		error = 0;
 
 	if (error == 0) {
 		td->td_retval[0] = 0;
 		td->td_retval[1] = locr0->v1;
 	}
 
 	return (error);
 }
 
 #undef __FBSDID
 #define __FBSDID(x)
 #include "../../kern/subr_syscall.c"
 
 /*
  * Handle an exception.
  * Called from MipsKernGenException() or MipsUserGenException()
  * when a processor trap occurs.
  * In the case of a kernel trap, we return the pc where to resume if
  * p->p_addr->u_pcb.pcb_onfault is set, otherwise, return old pc.
  */
 register_t
 trap(struct trapframe *trapframe)
 {
 	int type, usermode;
 	int i = 0;
 	unsigned ucode = 0;
 	struct thread *td = curthread;
 	struct proc *p = curproc;
 	vm_prot_t ftype;
 	pmap_t pmap;
 	int access_type;
 	ksiginfo_t ksi;
 	char *msg = NULL;
 	intptr_t addr = 0;
 	register_t pc;
 	int cop;
 	register_t *frame_regs;
 
 	trapdebug_enter(trapframe, 0);
 	
 	type = (trapframe->cause & MIPS_CR_EXC_CODE) >> MIPS_CR_EXC_CODE_SHIFT;
 	if (TRAPF_USERMODE(trapframe)) {
 		type |= T_USER;
 		usermode = 1;
 	} else {
 		usermode = 0;
 	}
 
 	/*
 	 * Enable hardware interrupts if they were on before the trap. If it
 	 * was off disable all so we don't accidently enable it when doing a
 	 * return to userland.
 	 */
 	if (trapframe->sr & MIPS_SR_INT_IE) {
 		set_intr_mask(trapframe->sr & MIPS_SR_INT_MASK);
 		intr_enable();
 	} else {
 		intr_disable();
 	}
 
 #ifdef TRAP_DEBUG
 	if (trap_debug) {
 		static vm_offset_t last_badvaddr = 0;
 		static vm_offset_t this_badvaddr = 0;
 		static int count = 0;
 		u_int32_t pid;
 
 		printf("trap type %x (%s - ", type,
 		    trap_type[type & (~T_USER)]);
 
 		if (type & T_USER)
 			printf("user mode)\n");
 		else
 			printf("kernel mode)\n");
 
 #ifdef SMP
 		printf("cpuid = %d\n", PCPU_GET(cpuid));
 #endif
 		pid = mips_rd_entryhi() & TLBHI_ASID_MASK;
 		printf("badaddr = %#jx, pc = %#jx, ra = %#jx, sp = %#jx, sr = %jx, pid = %d, ASID = %u\n",
 		    (intmax_t)trapframe->badvaddr, (intmax_t)trapframe->pc, (intmax_t)trapframe->ra,
 		    (intmax_t)trapframe->sp, (intmax_t)trapframe->sr,
 		    (curproc ? curproc->p_pid : -1), pid);
 
 		switch (type & ~T_USER) {
 		case T_TLB_MOD:
 		case T_TLB_LD_MISS:
 		case T_TLB_ST_MISS:
 		case T_ADDR_ERR_LD:
 		case T_ADDR_ERR_ST:
 			this_badvaddr = trapframe->badvaddr;
 			break;
 		case T_SYSCALL:
 			this_badvaddr = trapframe->ra;
 			break;
 		default:
 			this_badvaddr = trapframe->pc;
 			break;
 		}
 		if ((last_badvaddr == this_badvaddr) &&
 		    ((type & ~T_USER) != T_SYSCALL)) {
 			if (++count == 3) {
 				trap_frame_dump(trapframe);
 				panic("too many faults at %p\n", (void *)last_badvaddr);
 			}
 		} else {
 			last_badvaddr = this_badvaddr;
 			count = 0;
 		}
 	}
 #endif
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * A trap can occur while DTrace executes a probe. Before
 	 * executing the probe, DTrace blocks re-scheduling and sets
 	 * a flag in it's per-cpu flags to indicate that it doesn't
 	 * want to fault. On returning from the probe, the no-fault
 	 * flag is cleared and finally re-scheduling is enabled.
 	 *
 	 * If the DTrace kernel module has registered a trap handler,
 	 * call it and if it returns non-zero, assume that it has
 	 * handled the trap and modified the trap frame so that this
 	 * function can return normally.
 	 */
 	/*
 	 * XXXDTRACE: add pid probe handler here (if ever)
 	 */
 	if (!usermode) {
 		if (dtrace_trap_func != NULL && (*dtrace_trap_func)(trapframe, type))
 			return (trapframe->pc);
 	}
 #endif
 
 	switch (type) {
 	case T_MCHECK:
 #ifdef DDB
 		kdb_trap(type, 0, trapframe);
 #endif
 		panic("MCHECK\n");
 		break;
 	case T_TLB_MOD:
 		/* check for kernel address */
 		if (KERNLAND(trapframe->badvaddr)) {
 			if (pmap_emulate_modified(kernel_pmap, 
 			    trapframe->badvaddr) != 0) {
 				ftype = VM_PROT_WRITE;
 				goto kernel_fault;
 			}
 			return (trapframe->pc);
 		}
 		/* FALLTHROUGH */
 
 	case T_TLB_MOD + T_USER:
 		pmap = &p->p_vmspace->vm_pmap;
 		if (pmap_emulate_modified(pmap, trapframe->badvaddr) != 0) {
 			ftype = VM_PROT_WRITE;
 			goto dofault;
 		}
 		if (!usermode)
 			return (trapframe->pc);
 		goto out;
 
 	case T_TLB_LD_MISS:
 	case T_TLB_ST_MISS:
 		ftype = (type == T_TLB_ST_MISS) ? VM_PROT_WRITE : VM_PROT_READ;
 		/* check for kernel address */
 		if (KERNLAND(trapframe->badvaddr)) {
 			vm_offset_t va;
 			int rv;
 
 	kernel_fault:
 			va = trunc_page((vm_offset_t)trapframe->badvaddr);
 			rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL);
 			if (rv == KERN_SUCCESS)
 				return (trapframe->pc);
 			if (td->td_pcb->pcb_onfault != NULL) {
 				pc = (register_t)(intptr_t)td->td_pcb->pcb_onfault;
 				td->td_pcb->pcb_onfault = NULL;
 				return (pc);
 			}
 			goto err;
 		}
 
 		/*
 		 * It is an error for the kernel to access user space except
 		 * through the copyin/copyout routines.
 		 */
 		if (td->td_pcb->pcb_onfault == NULL)
 			goto err;
 
 		/* check for fuswintr() or suswintr() getting a page fault */
 		/* XXX There must be a nicer way to do this.  */
 		if (td->td_pcb->pcb_onfault == fswintrberr) {
 			pc = (register_t)(intptr_t)td->td_pcb->pcb_onfault;
 			td->td_pcb->pcb_onfault = NULL;
 			return (pc);
 		}
 
 		goto dofault;
 
 	case T_TLB_LD_MISS + T_USER:
 		ftype = VM_PROT_READ;
 		goto dofault;
 
 	case T_TLB_ST_MISS + T_USER:
 		ftype = VM_PROT_WRITE;
 dofault:
 		{
 			vm_offset_t va;
 			struct vmspace *vm;
 			vm_map_t map;
 			int rv = 0;
 
 			vm = p->p_vmspace;
 			map = &vm->vm_map;
 			va = trunc_page((vm_offset_t)trapframe->badvaddr);
 			if (KERNLAND(trapframe->badvaddr)) {
 				/*
 				 * Don't allow user-mode faults in kernel
 				 * address space.
 				 */
 				goto nogo;
 			}
 
 			/*
 			 * Keep swapout from messing with us during this
 			 * critical time.
 			 */
 			PROC_LOCK(p);
 			++p->p_lock;
 			PROC_UNLOCK(p);
 
 			rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 
 			PROC_LOCK(p);
 			--p->p_lock;
 			PROC_UNLOCK(p);
 			/*
 			 * XXXDTRACE: add dtrace_doubletrap_func here?
 			 */
 #ifdef VMFAULT_TRACE
 			printf("vm_fault(%p (pmap %p), %p (%p), %x, %d) -> %x at pc %p\n",
 			    map, &vm->vm_pmap, (void *)va, (void *)(intptr_t)trapframe->badvaddr,
 			    ftype, VM_FAULT_NORMAL, rv, (void *)(intptr_t)trapframe->pc);
 #endif
 
 			if (rv == KERN_SUCCESS) {
 				if (!usermode) {
 					return (trapframe->pc);
 				}
 				goto out;
 			}
 	nogo:
 			if (!usermode) {
 				if (td->td_pcb->pcb_onfault != NULL) {
 					pc = (register_t)(intptr_t)td->td_pcb->pcb_onfault;
 					td->td_pcb->pcb_onfault = NULL;
 					return (pc);
 				}
 				goto err;
 			}
 			ucode = ftype;
 			i = ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 			addr = trapframe->pc;
 
 			msg = "BAD_PAGE_FAULT";
 			log_bad_page_fault(msg, trapframe, type);
 
 			break;
 		}
 
 	case T_ADDR_ERR_LD + T_USER:	/* misaligned or kseg access */
 	case T_ADDR_ERR_ST + T_USER:	/* misaligned or kseg access */
 		if (trapframe->badvaddr < 0 ||
 		    trapframe->badvaddr >= VM_MAXUSER_ADDRESS) {
 			msg = "ADDRESS_SPACE_ERR";
 		} else if (allow_unaligned_acc) {
 			int mode;
 
 			if (type == (T_ADDR_ERR_LD + T_USER))
 				mode = VM_PROT_READ;
 			else
 				mode = VM_PROT_WRITE;
 
 			access_type = emulate_unaligned_access(trapframe, mode);
 			if (access_type != 0)
 				goto out;
 			msg = "ALIGNMENT_FIX_ERR";
 		} else {
 			msg = "ADDRESS_ERR";
 		}
 
 		/* FALL THROUGH */
 
 	case T_BUS_ERR_IFETCH + T_USER:	/* BERR asserted to cpu */
 	case T_BUS_ERR_LD_ST + T_USER:	/* BERR asserted to cpu */
 		ucode = 0;	/* XXX should be VM_PROT_something */
 		i = SIGBUS;
 		addr = trapframe->pc;
 		if (!msg)
 			msg = "BUS_ERR";
 		log_bad_page_fault(msg, trapframe, type);
 		break;
 
 	case T_SYSCALL + T_USER:
 		{
 			struct syscall_args sa;
 			int error;
 
 			sa.trapframe = trapframe;
 			error = syscallenter(td, &sa);
 
 #if !defined(SMP) && (defined(DDB) || defined(DEBUG))
 			if (trp == trapdebug)
 				trapdebug[TRAPSIZE - 1].code = sa.code;
 			else
 				trp[-1].code = sa.code;
 #endif
 			trapdebug_enter(td->td_frame, -sa.code);
 
 			/*
 			 * The sync'ing of I & D caches for SYS_ptrace() is
 			 * done by procfs_domem() through procfs_rwmem()
 			 * instead of being done here under a special check
 			 * for SYS_ptrace().
 			 */
 			syscallret(td, error, &sa);
 			return (trapframe->pc);
 		}
 
 #ifdef DDB
 	case T_BREAK:
 		kdb_trap(type, 0, trapframe);
 		return (trapframe->pc);
 #endif
 
 	case T_BREAK + T_USER:
 		{
 			intptr_t va;
 			uint32_t instr;
 
 			/* compute address of break instruction */
 			va = trapframe->pc;
 			if (DELAYBRANCH(trapframe->cause))
 				va += sizeof(int);
 
 			/* read break instruction */
 			instr = fuword32((caddr_t)va);
 #if 0
 			printf("trap: %s (%d) breakpoint %x at %x: (adr %x ins %x)\n",
 			    p->p_comm, p->p_pid, instr, trapframe->pc,
 			    p->p_md.md_ss_addr, p->p_md.md_ss_instr);	/* XXX */
 #endif
 			if (td->td_md.md_ss_addr != va ||
 			    instr != MIPS_BREAK_SSTEP) {
 				i = SIGTRAP;
 				addr = trapframe->pc;
 				break;
 			}
 			/*
 			 * The restoration of the original instruction and
 			 * the clearing of the breakpoint will be done later
 			 * by the call to ptrace_clear_single_step() in
 			 * issignal() when SIGTRAP is processed.
 			 */
 			addr = trapframe->pc;
 			i = SIGTRAP;
 			break;
 		}
 
 	case T_IWATCH + T_USER:
 	case T_DWATCH + T_USER:
 		{
 			intptr_t va;
 
 			/* compute address of trapped instruction */
 			va = trapframe->pc;
 			if (DELAYBRANCH(trapframe->cause))
 				va += sizeof(int);
 			printf("watch exception @ %p\n", (void *)va);
 			i = SIGTRAP;
 			addr = va;
 			break;
 		}
 
 	case T_TRAP + T_USER:
 		{
 			intptr_t va;
 			uint32_t instr;
 			struct trapframe *locr0 = td->td_frame;
 
 			/* compute address of trap instruction */
 			va = trapframe->pc;
 			if (DELAYBRANCH(trapframe->cause))
 				va += sizeof(int);
 			/* read break instruction */
 			instr = fuword32((caddr_t)va);
 
 			if (DELAYBRANCH(trapframe->cause)) {	/* Check BD bit */
 				locr0->pc = MipsEmulateBranch(locr0, trapframe->pc, 0,
 				    0);
 			} else {
 				locr0->pc += sizeof(int);
 			}
 			addr = va;
 			i = SIGEMT;	/* Stuff it with something for now */
 			break;
 		}
 
 	case T_RES_INST + T_USER:
 		{
 			InstFmt inst;
 			inst = *(InstFmt *)(intptr_t)trapframe->pc;
 			switch (inst.RType.op) {
 			case OP_SPECIAL3:
 				switch (inst.RType.func) {
 				case OP_RDHWR:
 					/* Register 29 used for TLS */
 					if (inst.RType.rd == 29) {
 						frame_regs = &(trapframe->zero);
 						frame_regs[inst.RType.rt] = (register_t)(intptr_t)td->td_md.md_tls;
 #if defined(__mips_n64) && defined(COMPAT_FREEBSD32)
 						if (SV_PROC_FLAG(td->td_proc, SV_ILP32))
 							frame_regs[inst.RType.rt] += TLS_TP_OFFSET + TLS_TCB_SIZE32;
 						else
 #endif
 						frame_regs[inst.RType.rt] += TLS_TP_OFFSET + TLS_TCB_SIZE;
 						trapframe->pc += sizeof(int);
 						goto out;
 					}
 				break;
 				}
 			break;
 			}
 
 			log_illegal_instruction("RES_INST", trapframe);
 			i = SIGILL;
 			addr = trapframe->pc;
 		}
 		break;
 	case T_C2E:
 	case T_C2E + T_USER:
 		goto err;
 		break;
 	case T_COP_UNUSABLE:
 #ifdef	CPU_CNMIPS
 		cop = (trapframe->cause & MIPS_CR_COP_ERR) >> MIPS_CR_COP_ERR_SHIFT;
 		/* Handle only COP2 exception */
 		if (cop != 2)
 			goto err;
 
 		addr = trapframe->pc;
 		/* save userland cop2 context if it has been touched */
 		if ((td->td_md.md_flags & MDTD_COP2USED) &&
 		    (td->td_md.md_cop2owner == COP2_OWNER_USERLAND)) {
 			if (td->td_md.md_ucop2)
 				octeon_cop2_save(td->td_md.md_ucop2);
 			else
 				panic("COP2 was used in user mode but md_ucop2 is NULL");
 		}
 
 		if (td->td_md.md_cop2 == NULL) {
 			td->td_md.md_cop2 = octeon_cop2_alloc_ctx();
 			if (td->td_md.md_cop2 == NULL)
 				panic("Failed to allocate COP2 context");
 			memset(td->td_md.md_cop2, 0, sizeof(*td->td_md.md_cop2));
 		}
 
 		octeon_cop2_restore(td->td_md.md_cop2);
 		
 		/* Make userland re-request its context */
 		td->td_frame->sr &= ~MIPS_SR_COP_2_BIT;
 		td->td_md.md_flags |= MDTD_COP2USED;
 		td->td_md.md_cop2owner = COP2_OWNER_KERNEL;
 		/* Enable COP2, it will be disabled in cpu_switch */
 		mips_wr_status(mips_rd_status() | MIPS_SR_COP_2_BIT);
 		return (trapframe->pc);
 #else
 		goto err;
 		break;
 #endif
 
 	case T_COP_UNUSABLE + T_USER:
 		cop = (trapframe->cause & MIPS_CR_COP_ERR) >> MIPS_CR_COP_ERR_SHIFT;
 		if (cop == 1) {
 #if !defined(CPU_HAVEFPU)
 		/* FP (COP1) instruction */
 			log_illegal_instruction("COP1_UNUSABLE", trapframe);
 			i = SIGILL;
 			break;
 #else
 			addr = trapframe->pc;
 			MipsSwitchFPState(PCPU_GET(fpcurthread), td->td_frame);
 			PCPU_SET(fpcurthread, td);
 			td->td_frame->sr |= MIPS_SR_COP_1_BIT;
 			td->td_md.md_flags |= MDTD_FPUSED;
 			goto out;
 #endif
 		}
 #ifdef	CPU_CNMIPS
 		else  if (cop == 2) {
 			addr = trapframe->pc;
 			if ((td->td_md.md_flags & MDTD_COP2USED) &&
 			    (td->td_md.md_cop2owner == COP2_OWNER_KERNEL)) {
 				if (td->td_md.md_cop2)
 					octeon_cop2_save(td->td_md.md_cop2);
 				else
 					panic("COP2 was used in kernel mode but md_cop2 is NULL");
 			}
 
 			if (td->td_md.md_ucop2 == NULL) {
 				td->td_md.md_ucop2 = octeon_cop2_alloc_ctx();
 				if (td->td_md.md_ucop2 == NULL)
 					panic("Failed to allocate userland COP2 context");
 				memset(td->td_md.md_ucop2, 0, sizeof(*td->td_md.md_ucop2));
 			}
 
 			octeon_cop2_restore(td->td_md.md_ucop2);
 
 			td->td_frame->sr |= MIPS_SR_COP_2_BIT;
 			td->td_md.md_flags |= MDTD_COP2USED;
 			td->td_md.md_cop2owner = COP2_OWNER_USERLAND;
 			goto out;
 		}
 #endif
 		else {
 			log_illegal_instruction("COPn_UNUSABLE", trapframe);
 			i = SIGILL;	/* only FPU instructions allowed */
 			break;
 		}
 
 	case T_FPE:
 #if !defined(SMP) && (defined(DDB) || defined(DEBUG))
 		trapDump("fpintr");
 #else
 		printf("FPU Trap: PC %#jx CR %x SR %x\n",
 		    (intmax_t)trapframe->pc, (unsigned)trapframe->cause, (unsigned)trapframe->sr);
 		goto err;
 #endif
 
 	case T_FPE + T_USER:
 		if (!emulate_fp) {
 			i = SIGILL;
 			addr = trapframe->pc;
 			break;
 		}
 		MipsFPTrap(trapframe->sr, trapframe->cause, trapframe->pc);
 		goto out;
 
 	case T_OVFLOW + T_USER:
 		i = SIGFPE;
 		addr = trapframe->pc;
 		break;
 
 	case T_ADDR_ERR_LD:	/* misaligned access */
 	case T_ADDR_ERR_ST:	/* misaligned access */
 #ifdef TRAP_DEBUG
 		if (trap_debug) {
 			printf("+++ ADDR_ERR: type = %d, badvaddr = %#jx\n", type,
 			    (intmax_t)trapframe->badvaddr);
 		}
 #endif
 		/* Only allow emulation on a user address */
 		if (allow_unaligned_acc &&
 		    ((vm_offset_t)trapframe->badvaddr < VM_MAXUSER_ADDRESS)) {
 			int mode;
 
 			if (type == T_ADDR_ERR_LD)
 				mode = VM_PROT_READ;
 			else
 				mode = VM_PROT_WRITE;
 
 			access_type = emulate_unaligned_access(trapframe, mode);
 			if (access_type != 0)
 				return (trapframe->pc);
 		}
 		/* FALLTHROUGH */
 
 	case T_BUS_ERR_LD_ST:	/* BERR asserted to cpu */
 		if (td->td_pcb->pcb_onfault != NULL) {
 			pc = (register_t)(intptr_t)td->td_pcb->pcb_onfault;
 			td->td_pcb->pcb_onfault = NULL;
 			return (pc);
 		}
 
 		/* FALLTHROUGH */
 
 	default:
 err:
 
 #if !defined(SMP) && defined(DEBUG)
 		stacktrace(!usermode ? trapframe : td->td_frame);
 		trapDump("trap");
 #endif
 #ifdef SMP
 		printf("cpu:%d-", PCPU_GET(cpuid));
 #endif
 		printf("Trap cause = %d (%s - ", type,
 		    trap_type[type & (~T_USER)]);
 
 		if (type & T_USER)
 			printf("user mode)\n");
 		else
 			printf("kernel mode)\n");
 
 #ifdef TRAP_DEBUG
 		if (trap_debug)
 			printf("badvaddr = %#jx, pc = %#jx, ra = %#jx, sr = %#jxx\n",
 			       (intmax_t)trapframe->badvaddr, (intmax_t)trapframe->pc, (intmax_t)trapframe->ra,
 			       (intmax_t)trapframe->sr);
 #endif
 
 #ifdef KDB
 		if (debugger_on_panic || kdb_active) {
 			kdb_trap(type, 0, trapframe);
 		}
 #endif
 		panic("trap");
 	}
 	td->td_frame->pc = trapframe->pc;
 	td->td_frame->cause = trapframe->cause;
 	td->td_frame->badvaddr = trapframe->badvaddr;
 	ksiginfo_init_trap(&ksi);
 	ksi.ksi_signo = i;
 	ksi.ksi_code = ucode;
 	ksi.ksi_addr = (void *)addr;
 	ksi.ksi_trapno = type;
 	trapsignal(td, &ksi);
 out:
 
 	/*
 	 * Note: we should only get here if returning to user mode.
 	 */
 	userret(td, trapframe);
 	return (trapframe->pc);
 }
 
 #if !defined(SMP) && (defined(DDB) || defined(DEBUG))
 /*
  * Print the trap history held in trapdebug[] to the console.
  *
  * The global cursor trp is stepped backwards one slot at a time,
  * wrapping from the first slot to the last, until either TRAPSIZE
  * entries have been printed or an entry with a zero cause is reached.
  * Interrupts are disabled for the duration of the dump.
  */
 void
 trapDump(char *msg)
 {
 	register_t saved_intr;
 	int count;
 
 	saved_intr = intr_disable();
 	printf("trapDump(%s)\n", msg);
 	for (count = 0; count < TRAPSIZE; count++) {
 		/* Move the cursor back one slot, wrapping at the start. */
 		trp = (trp == trapdebug) ? &trapdebug[TRAPSIZE - 1] : trp - 1;
 
 		if (trp->cause == 0)
 			break;
 
 		printf("%s: ADR %jx PC %jx CR %jx SR %jx\n",
 		    trap_type[(trp->cause & MIPS_CR_EXC_CODE) >>
 		    MIPS_CR_EXC_CODE_SHIFT],
 		    (intmax_t)trp->vadr,
 		    (intmax_t)trp->pc,
 		    (intmax_t)trp->cause,
 		    (intmax_t)trp->status);
 
 		printf("   RA %jx SP %jx code %d\n",
 		    (intmax_t)trp->ra,
 		    (intmax_t)trp->sp,
 		    (int)trp->code);
 	}
 	intr_restore(saved_intr);
 }
 #endif
 
 
 /*
  * Return the resulting PC as if the branch was executed.
  *
  *	framePtr  trap frame; also indexed as an array of register_t so
  *		  that general registers can be read by register number.
  *	instPC	  address of the branch (or trap) instruction.
  *	fpcCSR	  FP control/status value consulted for COP1 condition
  *		  branches.
  *	instptr	  if non-zero, address to fetch the instruction word
  *		  from instead of instPC.
  *
  * Instruction addresses below MIPS_KSEG0_START are read with
  * fuword32(); anything else is dereferenced directly.  Non-branch
  * instructions yield instPC + 4; a not-taken branch yields instPC + 8
  * (past the delay slot).  An unrecognised OP_BCOND sub-opcode panics.
  */
 uintptr_t
 MipsEmulateBranch(struct trapframe *framePtr, uintptr_t instPC, int fpcCSR,
     uintptr_t instptr)
 {
 	InstFmt inst;
 	register_t *regsPtr = (register_t *) framePtr;
 	uintptr_t retAddr = 0;
 	int condition;
 
 #define	GetBranchDest(InstPtr, inst) \
 	(InstPtr + 4 + ((short)inst.IType.imm << 2))
 
 
 	/* Fetch the instruction word, preferring instptr when supplied. */
 	if (instptr) {
 		if (instptr < MIPS_KSEG0_START)
 			inst.word = fuword32((void *)instptr);
 		else
 			inst = *(InstFmt *) instptr;
 	} else {
 		if ((vm_offset_t)instPC < MIPS_KSEG0_START)
 			inst.word = fuword32((void *)instPC);
 		else
 			inst = *(InstFmt *) instPC;
 	}
 
 	switch ((int)inst.JType.op) {
 	case OP_SPECIAL:
 		switch ((int)inst.RType.func) {
 		case OP_JR:
 		case OP_JALR:
 			/* Register-indirect jump: target is the rs value. */
 			retAddr = regsPtr[inst.RType.rs];
 			break;
 
 		default:
 			retAddr = instPC + 4;
 			break;
 		}
 		break;
 
 	case OP_BCOND:
 		/*
 		 * The sign tests below use the full register width.
 		 * Narrowing to int first would misjudge 64-bit values
 		 * whose bit 31 differs from bit 63.  This relies on
 		 * register_t being a signed type.
 		 */
 		switch ((int)inst.IType.rt) {
 		case OP_BLTZ:
 		case OP_BLTZL:
 		case OP_BLTZAL:
 		case OP_BLTZALL:
 			if (regsPtr[inst.RType.rs] < 0)
 				retAddr = GetBranchDest(instPC, inst);
 			else
 				retAddr = instPC + 8;
 			break;
 
 		case OP_BGEZ:
 		case OP_BGEZL:
 		case OP_BGEZAL:
 		case OP_BGEZALL:
 			if (regsPtr[inst.RType.rs] >= 0)
 				retAddr = GetBranchDest(instPC, inst);
 			else
 				retAddr = instPC + 8;
 			break;
 
 		case OP_TGEI:
 		case OP_TGEIU:
 		case OP_TLTI:
 		case OP_TLTIU:
 		case OP_TEQI:
 		case OP_TNEI:
 			retAddr = instPC + 4;	/* Like syscall... */
 			break;
 
 		default:
 			panic("MipsEmulateBranch: Bad branch cond");
 		}
 		break;
 
 	case OP_J:
 	case OP_JAL:
 		/*
 		 * The low 28 bits come from the instruction; the rest are
 		 * taken from the delay-slot address.  Mask at full pointer
 		 * width so the upper address bits survive on 64-bit
 		 * kernels.
 		 */
 		retAddr = (inst.JType.target << 2) |
 		    ((instPC + 4) & ~(uintptr_t)0x0FFFFFFF);
 		break;
 
 	case OP_BEQ:
 	case OP_BEQL:
 		if (regsPtr[inst.RType.rs] == regsPtr[inst.RType.rt])
 			retAddr = GetBranchDest(instPC, inst);
 		else
 			retAddr = instPC + 8;
 		break;
 
 	case OP_BNE:
 	case OP_BNEL:
 		if (regsPtr[inst.RType.rs] != regsPtr[inst.RType.rt])
 			retAddr = GetBranchDest(instPC, inst);
 		else
 			retAddr = instPC + 8;
 		break;
 
 	case OP_BLEZ:
 	case OP_BLEZL:
 		/* Full-width signed comparison, as for OP_BCOND above. */
 		if (regsPtr[inst.RType.rs] <= 0)
 			retAddr = GetBranchDest(instPC, inst);
 		else
 			retAddr = instPC + 8;
 		break;
 
 	case OP_BGTZ:
 	case OP_BGTZL:
 		if (regsPtr[inst.RType.rs] > 0)
 			retAddr = GetBranchDest(instPC, inst);
 		else
 			retAddr = instPC + 8;
 		break;
 
 	case OP_COP1:
 		switch (inst.RType.rs) {
 		case OP_BCx:
 		case OP_BCy:
 			/* Branch on the FP condition bit being set or clear. */
 			if ((inst.RType.rt & COPz_BC_TF_MASK) == COPz_BC_TRUE)
 				condition = fpcCSR & MIPS_FPU_COND_BIT;
 			else
 				condition = !(fpcCSR & MIPS_FPU_COND_BIT);
 			if (condition)
 				retAddr = GetBranchDest(instPC, inst);
 			else
 				retAddr = instPC + 8;
 			break;
 
 		default:
 			retAddr = instPC + 4;
 		}
 		break;
 
 	default:
 		retAddr = instPC + 4;
 	}
 	return (retAddr);
 }
 
 
 #if defined(DDB) || defined(DEBUG)
 /*
  * Print a stack backtrace.
  *
  * Thin wrapper: hands the pc, sp and ra saved in the given trap frame
  * to stacktrace_subr(), with printf as the output routine.
  */
 void
 stacktrace(struct trapframe *regs)
 {
 	stacktrace_subr(regs->pc, regs->sp, regs->ra, printf);
 }
 #endif
 
 /*
  * Write every register saved in the trap frame to the system log at
  * LOG_ERR priority, four registers per line.  The register set
  * printed depends on the ABI: n32/n64 builds have a4-a7 and t0-t3,
  * other builds have t0-t7.  The "zero" column is always printed as 0
  * rather than read from the frame.
  */
 static void
 log_frame_dump(struct trapframe *frame)
 {
 	log(LOG_ERR, "Trapframe Register Dump:\n");
 	log(LOG_ERR, "\tzero: %#jx\tat: %#jx\tv0: %#jx\tv1: %#jx\n",
 	    (intmax_t)0, (intmax_t)frame->ast, (intmax_t)frame->v0, (intmax_t)frame->v1);
 
 	log(LOG_ERR, "\ta0: %#jx\ta1: %#jx\ta2: %#jx\ta3: %#jx\n",
 	    (intmax_t)frame->a0, (intmax_t)frame->a1, (intmax_t)frame->a2, (intmax_t)frame->a3);
 
 #if defined(__mips_n32) || defined(__mips_n64)
 	/* The fourth column is a7; it must not repeat the a6 label. */
 	log(LOG_ERR, "\ta4: %#jx\ta5: %#jx\ta6: %#jx\ta7: %#jx\n",
 	    (intmax_t)frame->a4, (intmax_t)frame->a5, (intmax_t)frame->a6, (intmax_t)frame->a7);
 
 	log(LOG_ERR, "\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n",
 	    (intmax_t)frame->t0, (intmax_t)frame->t1, (intmax_t)frame->t2, (intmax_t)frame->t3);
 #else
 	log(LOG_ERR, "\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n",
 	    (intmax_t)frame->t0, (intmax_t)frame->t1, (intmax_t)frame->t2, (intmax_t)frame->t3);
 
 	log(LOG_ERR, "\tt4: %#jx\tt5: %#jx\tt6: %#jx\tt7: %#jx\n",
 	    (intmax_t)frame->t4, (intmax_t)frame->t5, (intmax_t)frame->t6, (intmax_t)frame->t7);
 #endif
 	log(LOG_ERR, "\tt8: %#jx\tt9: %#jx\ts0: %#jx\ts1: %#jx\n",
 	    (intmax_t)frame->t8, (intmax_t)frame->t9, (intmax_t)frame->s0, (intmax_t)frame->s1);
 
 	log(LOG_ERR, "\ts2: %#jx\ts3: %#jx\ts4: %#jx\ts5: %#jx\n",
 	    (intmax_t)frame->s2, (intmax_t)frame->s3, (intmax_t)frame->s4, (intmax_t)frame->s5);
 
 	log(LOG_ERR, "\ts6: %#jx\ts7: %#jx\tk0: %#jx\tk1: %#jx\n",
 	    (intmax_t)frame->s6, (intmax_t)frame->s7, (intmax_t)frame->k0, (intmax_t)frame->k1);
 
 	log(LOG_ERR, "\tgp: %#jx\tsp: %#jx\ts8: %#jx\tra: %#jx\n",
 	    (intmax_t)frame->gp, (intmax_t)frame->sp, (intmax_t)frame->s8, (intmax_t)frame->ra);
 
 	log(LOG_ERR, "\tsr: %#jx\tmullo: %#jx\tmulhi: %#jx\tbadvaddr: %#jx\n",
 	    (intmax_t)frame->sr, (intmax_t)frame->mullo, (intmax_t)frame->mulhi, (intmax_t)frame->badvaddr);
 
 #ifdef IC_REG
 	log(LOG_ERR, "\tcause: %#jx\tpc: %#jx\tic: %#jx\n",
 	    (intmax_t)frame->cause, (intmax_t)frame->pc, (intmax_t)frame->ic);
 #else
 	log(LOG_ERR, "\tcause: %#jx\tpc: %#jx\n",
 	    (intmax_t)frame->cause, (intmax_t)frame->pc);
 #endif
 }
 
 #ifdef TRAP_DEBUG
 /*
  * Console counterpart of log_frame_dump(): print every register saved
  * in the trap frame with printf, four registers per output line.  The
  * "zero" column is printed as a literal 0.
  */
 static void
 trap_frame_dump(struct trapframe *frame)
 {
 	printf("Trapframe Register Dump:\n");
 	printf("\tzero: %#jx\tat: %#jx\tv0: %#jx\tv1: %#jx\n",
 	    (intmax_t)0, (intmax_t)frame->ast,
 	    (intmax_t)frame->v0, (intmax_t)frame->v1);
 
 	printf("\ta0: %#jx\ta1: %#jx\ta2: %#jx\ta3: %#jx\n",
 	    (intmax_t)frame->a0, (intmax_t)frame->a1,
 	    (intmax_t)frame->a2, (intmax_t)frame->a3);
 #if defined(__mips_n32) || defined(__mips_n64)
 	/* ABIs with eight argument registers: a4-a7, then t0-t3. */
 	printf("\ta4: %#jx\ta5: %#jx\ta6: %#jx\ta7: %#jx\n",
 	    (intmax_t)frame->a4, (intmax_t)frame->a5,
 	    (intmax_t)frame->a6, (intmax_t)frame->a7);
 
 	printf("\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n",
 	    (intmax_t)frame->t0, (intmax_t)frame->t1,
 	    (intmax_t)frame->t2, (intmax_t)frame->t3);
 #else
 	/* Other ABIs: t0-t7. */
 	printf("\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n",
 	    (intmax_t)frame->t0, (intmax_t)frame->t1,
 	    (intmax_t)frame->t2, (intmax_t)frame->t3);
 
 	printf("\tt4: %#jx\tt5: %#jx\tt6: %#jx\tt7: %#jx\n",
 	    (intmax_t)frame->t4, (intmax_t)frame->t5,
 	    (intmax_t)frame->t6, (intmax_t)frame->t7);
 #endif
 	printf("\tt8: %#jx\tt9: %#jx\ts0: %#jx\ts1: %#jx\n",
 	    (intmax_t)frame->t8, (intmax_t)frame->t9,
 	    (intmax_t)frame->s0, (intmax_t)frame->s1);
 
 	printf("\ts2: %#jx\ts3: %#jx\ts4: %#jx\ts5: %#jx\n",
 	    (intmax_t)frame->s2, (intmax_t)frame->s3,
 	    (intmax_t)frame->s4, (intmax_t)frame->s5);
 
 	printf("\ts6: %#jx\ts7: %#jx\tk0: %#jx\tk1: %#jx\n",
 	    (intmax_t)frame->s6, (intmax_t)frame->s7,
 	    (intmax_t)frame->k0, (intmax_t)frame->k1);
 
 	printf("\tgp: %#jx\tsp: %#jx\ts8: %#jx\tra: %#jx\n",
 	    (intmax_t)frame->gp, (intmax_t)frame->sp,
 	    (intmax_t)frame->s8, (intmax_t)frame->ra);
 
 	printf("\tsr: %#jx\tmullo: %#jx\tmulhi: %#jx\tbadvaddr: %#jx\n",
 	    (intmax_t)frame->sr, (intmax_t)frame->mullo,
 	    (intmax_t)frame->mulhi, (intmax_t)frame->badvaddr);
 
 #ifdef IC_REG
 	printf("\tcause: %#jx\tpc: %#jx\tic: %#jx\n",
 	    (intmax_t)frame->cause, (intmax_t)frame->pc,
 	    (intmax_t)frame->ic);
 #else
 	printf("\tcause: %#jx\tpc: %#jx\n",
 	    (intmax_t)frame->cause, (intmax_t)frame->pc);
 #endif
 }
 
 #endif
 
 
 /*
  * Look up the mapping entries for va in the current process' pmap.
  *
  * *pdepp always receives the address of the segment-table slot
  * selected by va.  *ptepp receives the result of pmap_pte() when that
  * slot is non-zero, and a null pointer otherwise.
  */
 static void
 get_mapping_info(vm_offset_t va, pd_entry_t **pdepp, pt_entry_t **ptepp)
 {
 	struct proc *proc = curproc;
 	pd_entry_t *seg_slot;
 	pt_entry_t *pte = (pt_entry_t *)0;
 
 	seg_slot = &proc->p_vmspace->vm_pmap.pm_segtab[(va >> SEGSHIFT) &
 	    (NPDEPG - 1)];
 	if (*seg_slot)
 		pte = pmap_pte(&proc->p_vmspace->vm_pmap, va);
 
 	*pdepp = seg_slot;
 	*ptepp = pte;
 }
 
 static void
 log_illegal_instruction(const char *msg, struct trapframe *frame)
 {
 	pt_entry_t *ptep;
 	pd_entry_t *pdep;
 	unsigned int *addr;
 	struct thread *td;
 	struct proc *p;
 	register_t pc;
 
 	td = curthread;
 	p = td->td_proc;
 
 #ifdef SMP
 	printf("cpuid = %d\n", PCPU_GET(cpuid));
 #endif
 	pc = frame->pc + (DELAYBRANCH(frame->cause) ? 4 : 0);
 	log(LOG_ERR, "%s: pid %d tid %ld (%s), uid %d: pc %#jx ra %#jx\n",
 	    msg, p->p_pid, (long)td->td_tid, p->p_comm,
 	    p->p_ucred ? p->p_ucred->cr_uid : -1,
 	    (intmax_t)pc,
 	    (intmax_t)frame->ra);
 
 	/* log registers in trap frame */
 	log_frame_dump(frame);
 
 	get_mapping_info((vm_offset_t)pc, &pdep, &ptep);
 
 	/*
 	 * Dump a few words around faulting instruction, if the addres is
 	 * valid.
 	 */
 	if (!(pc & 3) &&
 	    useracc((caddr_t)(intptr_t)pc, sizeof(int) * 4, VM_PROT_READ)) {
 		/* dump page table entry for faulting instruction */
 		log(LOG_ERR, "Page table info for pc address %#jx: pde = %p, pte = %#jx\n",
 		    (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0));
 
 		addr = (unsigned int *)(intptr_t)pc;
 		log(LOG_ERR, "Dumping 4 words starting at pc address %p: \n",
 		    addr);
 		log(LOG_ERR, "%08x %08x %08x %08x\n",
 		    addr[0], addr[1], addr[2], addr[3]);
 	} else {
 		log(LOG_ERR, "pc address %#jx is inaccessible, pde = %p, pte = %#jx\n",
 		    (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0));
 	}
 }
 
 static void
 log_bad_page_fault(char *msg, struct trapframe *frame, int trap_type)
 {
 	pt_entry_t *ptep;
 	pd_entry_t *pdep;
 	unsigned int *addr;
 	struct thread *td;
 	struct proc *p;
 	char *read_or_write;
 	register_t pc;
 
 	trap_type &= ~T_USER;
 
 	td = curthread;
 	p = td->td_proc;
 
 #ifdef SMP
 	printf("cpuid = %d\n", PCPU_GET(cpuid));
 #endif
 	switch (trap_type) {
 	case T_TLB_MOD:
 	case T_TLB_ST_MISS:
 	case T_ADDR_ERR_ST:
 		read_or_write = "write";
 		break;
 	case T_TLB_LD_MISS:
 	case T_ADDR_ERR_LD:
 	case T_BUS_ERR_IFETCH:
 		read_or_write = "read";
 		break;
 	default:
 		read_or_write = "unknown";
 	}
 
 	pc = frame->pc + (DELAYBRANCH(frame->cause) ? 4 : 0);
 	log(LOG_ERR, "%s: pid %d tid %ld (%s), uid %d: pc %#jx got a %s fault "
 	    "(type %#x) at %#jx\n",
 	    msg, p->p_pid, (long)td->td_tid, p->p_comm,
 	    p->p_ucred ? p->p_ucred->cr_uid : -1,
 	    (intmax_t)pc,
 	    read_or_write,
 	    trap_type,
 	    (intmax_t)frame->badvaddr);
 
 	/* log registers in trap frame */
 	log_frame_dump(frame);
 
 	get_mapping_info((vm_offset_t)pc, &pdep, &ptep);
 
 	/*
 	 * Dump a few words around faulting instruction, if the addres is
 	 * valid.
 	 */
 	if (!(pc & 3) && (pc != frame->badvaddr) &&
 	    (trap_type != T_BUS_ERR_IFETCH) &&
 	    useracc((caddr_t)(intptr_t)pc, sizeof(int) * 4, VM_PROT_READ)) {
 		/* dump page table entry for faulting instruction */
 		log(LOG_ERR, "Page table info for pc address %#jx: pde = %p, pte = %#jx\n",
 		    (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0));
 
 		addr = (unsigned int *)(intptr_t)pc;
 		log(LOG_ERR, "Dumping 4 words starting at pc address %p: \n",
 		    addr);
 		log(LOG_ERR, "%08x %08x %08x %08x\n",
 		    addr[0], addr[1], addr[2], addr[3]);
 	} else {
 		log(LOG_ERR, "pc address %#jx is inaccessible, pde = %p, pte = %#jx\n",
 		    (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0));
 	}
 
 	get_mapping_info((vm_offset_t)frame->badvaddr, &pdep, &ptep);
 	log(LOG_ERR, "Page table info for bad address %#jx: pde = %p, pte = %#jx\n",
 	    (intmax_t)frame->badvaddr, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0));
 }
 
 
 /*
  * Unaligned load/store emulation
  *
  * Emulates the halfword, word or doubleword load/store instruction
  * found at pc against the (misaligned) address addr.
  *
  *	frame	trap frame; indexed as an array of register_t so that
  *		the instruction's rt register can be read or written.
  *	mode	VM_PROT_READ for loads, VM_PROT_WRITE for stores; used
  *		for the useracc() check and asserted per opcode.
  *	addr	the faulting data address.
  *	pc	address of the instruction to emulate; read directly.
  *
  * Returns one of the MIPS_*_ACCESS codes when the access was emulated,
  * or 0 when the opcode is not handled or the address range fails the
  * useracc() check.
  */
 static int
 mips_unaligned_load_store(struct trapframe *frame, int mode, register_t addr, register_t pc)
 {
 	register_t *reg = (register_t *) frame;
 	u_int32_t inst = *((u_int32_t *)(intptr_t)pc);
 	register_t value_msb, value;
 	unsigned size;
 
 	/*
 	 * ADDR_ERR faults have higher priority than TLB
 	 * Miss faults.  Therefore, it is necessary to
 	 * verify that the faulting address is a valid
 	 * virtual address within the process' address space
 	 * before trying to emulate the unaligned access.
 	 */
 	/*
 	 * NOTE(review): the paragraph above describes the useracc() check
 	 * further down; this switch only determines the access size.
 	 */
 	switch (MIPS_INST_OPCODE(inst)) {
 	case OP_LHU: case OP_LH:
 	case OP_SH:
 		size = 2;
 		break;
 	case OP_LWU: case OP_LW:
 	case OP_SW:
 		size = 4;
 		break;
 	case OP_LD:
 	case OP_SD:
 		size = 8;
 		break;
 	default:
 		printf("%s: unhandled opcode in address error: %#x\n", __func__, MIPS_INST_OPCODE(inst));
 		return (0);
 	}
 
 	/*
 	 * Check size * 2 bytes starting at addr rounded down to a
 	 * multiple of size, so that both naturally aligned units the
 	 * misaligned access straddles are covered.
 	 */
 	if (!useracc((void *)((vm_offset_t)addr & ~(size - 1)), size * 2, mode))
 		return (0);
 
 	/*
 	 * XXX
 	 * Handle LL/SC LLD/SCD.
 	 */
 	/*
 	 * Each case performs the access in two pieces with the *_macro
 	 * helpers.  NOTE(review): the halfword cases treat the byte at
 	 * the lower address as the most significant one, and the
 	 * word/doubleword cases apply the "left" macro to the lower
 	 * address -- this looks like big-endian ordering; confirm for
 	 * little-endian configurations.
 	 */
 	switch (MIPS_INST_OPCODE(inst)) {
 	case OP_LHU:
 		KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction."));
 		lbu_macro(value_msb, addr);
 		addr += 1;
 		lbu_macro(value, addr);
 		value |= value_msb << 8;
 		reg[MIPS_INST_RT(inst)] = value;
 		return (MIPS_LHU_ACCESS);
 
 	case OP_LH:
 		KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction."));
 		/* lb_macro rather than lbu_macro for the high byte of LH. */
 		lb_macro(value_msb, addr);
 		addr += 1;
 		lbu_macro(value, addr);
 		value |= value_msb << 8;
 		reg[MIPS_INST_RT(inst)] = value;
 		return (MIPS_LH_ACCESS);
 
 	case OP_LWU:
 		KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction."));
 		lwl_macro(value, addr);
 		addr += 3;
 		lwr_macro(value, addr);
 		/* LWU: keep only the low 32 bits of the loaded value. */
 		value &= 0xffffffff;
 		reg[MIPS_INST_RT(inst)] = value;
 		return (MIPS_LWU_ACCESS);
 
 	case OP_LW:
 		KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction."));
 		lwl_macro(value, addr);
 		addr += 3;
 		lwr_macro(value, addr);
 		reg[MIPS_INST_RT(inst)] = value;
 		return (MIPS_LW_ACCESS);
 
 #if defined(__mips_n32) || defined(__mips_n64)
 	case OP_LD:
 		KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction."));
 		ldl_macro(value, addr);
 		addr += 7;
 		ldr_macro(value, addr);
 		reg[MIPS_INST_RT(inst)] = value;
 		return (MIPS_LD_ACCESS);
 #endif
 
 	case OP_SH:
 		KASSERT(mode == VM_PROT_WRITE, ("access mode must be write for store instruction."));
 		value = reg[MIPS_INST_RT(inst)];
 		value_msb = value >> 8;
 		sb_macro(value_msb, addr);
 		addr += 1;
 		sb_macro(value, addr);
 		return (MIPS_SH_ACCESS);
 
 	case OP_SW:
 		KASSERT(mode == VM_PROT_WRITE, ("access mode must be write for store instruction."));
 		value = reg[MIPS_INST_RT(inst)];
 		swl_macro(value, addr);
 		addr += 3;
 		swr_macro(value, addr);
 		return (MIPS_SW_ACCESS);
 
 #if defined(__mips_n32) || defined(__mips_n64)
 	case OP_SD:
 		KASSERT(mode == VM_PROT_WRITE, ("access mode must be write for store instruction."));
 		value = reg[MIPS_INST_RT(inst)];
 		sdl_macro(value, addr);
 		addr += 7;
 		sdr_macro(value, addr);
 		return (MIPS_SD_ACCESS);
 #endif
 	}
 	/*
 	 * NOTE(review): when neither __mips_n32 nor __mips_n64 is defined,
 	 * OP_LD and OP_SD are accepted by the size switch above but have
 	 * no case here, so they would reach this panic -- confirm whether
 	 * that combination can occur.
 	 */
 	panic("%s: should not be reached.", __func__);
 }
 
 
 static int
 emulate_unaligned_access(struct trapframe *frame, int mode)
 {
 	register_t pc;
 	int access_type = 0;
 
 	pc = frame->pc + (DELAYBRANCH(frame->cause) ? 4 : 0);
 
 	/*
 	 * Fall through if it's instruction fetch exception
 	 */
 	if (!((pc & 3) || (pc == frame->badvaddr))) {
 
 		/*
 		 * Handle unaligned load and store
 		 */
 
 		/*
 		 * Return access type if the instruction was emulated.
 		 * Otherwise restore pc and fall through.
 		 */
 		access_type = mips_unaligned_load_store(frame,
 		    mode, frame->badvaddr, pc);
 
 		if (access_type) {
 			if (DELAYBRANCH(frame->cause))
 				frame->pc = MipsEmulateBranch(frame, frame->pc,
 				    0, 0);
 			else
 				frame->pc += 4;
 
 			log(LOG_INFO, "Unaligned %s: pc=%#jx, badvaddr=%#jx\n",
 			    access_name[access_type - 1], (intmax_t)pc,
 			    (intmax_t)frame->badvaddr);
 		}
 	}
 	return access_type;
 }
Index: head/sys/modules/crypto/Makefile
===================================================================
--- head/sys/modules/crypto/Makefile	(revision 258540)
+++ head/sys/modules/crypto/Makefile	(revision 258541)
@@ -1,21 +1,21 @@
 # $FreeBSD$
 
 .PATH:	${.CURDIR}/../../opencrypto
 .PATH:	${.CURDIR}/../../crypto
 .PATH:	${.CURDIR}/../../crypto/blowfish
 .PATH:	${.CURDIR}/../../crypto/des
 .PATH:	${.CURDIR}/../../crypto/rijndael
 .PATH:	${.CURDIR}/../../crypto/sha2
 .PATH:	${.CURDIR}/../../crypto/camellia
 
 KMOD	= crypto
 SRCS	= crypto.c cryptodev_if.c
 SRCS	+= criov.c cryptosoft.c xform.c
 SRCS	+= cast.c deflate.c rmd160.c rijndael-alg-fst.c rijndael-api.c
 SRCS	+= skipjack.c bf_enc.c bf_ecb.c bf_skey.c
 SRCS	+= des_ecb.c des_enc.c des_setkey.c
 SRCS	+= sha1.c sha2.c
 SRCS	+= opt_param.h cryptodev_if.h bus_if.h device_if.h
-SRCS	+= opt_ddb.h opt_kdtrace.h
+SRCS	+= opt_ddb.h
 SRCS	+= camellia.c camellia-api.c
 .include <bsd.kmod.mk>
Index: head/sys/modules/ctl/Makefile
===================================================================
--- head/sys/modules/ctl/Makefile	(revision 258540)
+++ head/sys/modules/ctl/Makefile	(revision 258541)
@@ -1,29 +1,28 @@
 # $FreeBSD$
 
 .PATH: ${.CURDIR}/../../cam/ctl
 
 KMOD=	ctl
 
 SRCS=	ctl.c
 SRCS+=	ctl_backend.c
 SRCS+=	ctl_backend_block.c
 SRCS+=	ctl_backend_ramdisk.c
 SRCS+=	ctl_cmd_table.c
 SRCS+=	ctl_frontend.c
 SRCS+=	ctl_frontend_cam_sim.c
 SRCS+=	ctl_frontend_internal.c
 SRCS+=	ctl_frontend_iscsi.c
 SRCS+=	ctl_mem_pool.c
 SRCS+=	ctl_scsi_all.c
 SRCS+=	ctl_error.c
 SRCS+=	ctl_util.c
 SRCS+=	scsi_ctl.c
 SRCS+=	bus_if.h
 SRCS+=	device_if.h
 SRCS+=	vnode_if.h
 SRCS+=	opt_cam.h
-SRCS+=	opt_kdtrace.h
 
 #CFLAGS+=-DICL_KERNEL_PROXY
 
 .include <bsd.kmod.mk>
Index: head/sys/modules/dtrace/dtrace_test/Makefile
===================================================================
--- head/sys/modules/dtrace/dtrace_test/Makefile	(revision 258540)
+++ head/sys/modules/dtrace/dtrace_test/Makefile	(revision 258541)
@@ -1,17 +1,16 @@
 # $FreeBSD$
 
 .PATH: ${.CURDIR}/../../../cddl/dev/dtrace
 
 KMOD=		dtrace_test
 SRCS=		dtrace_test.c
 
-SRCS+=		opt_kdtrace.h
 SRCS+=		vnode_if.h
 
 CFLAGS+=	-I${.CURDIR}/../../..
 
 CFLAGS+=	-D_KERNEL
 
 .include <bsd.kmod.mk>
 
 CFLAGS+=	-include ${.CURDIR}/../../../cddl/compat/opensolaris/sys/debug_compat.h
Index: head/sys/modules/dtrace/lockstat/Makefile
===================================================================
--- head/sys/modules/dtrace/lockstat/Makefile	(revision 258540)
+++ head/sys/modules/dtrace/lockstat/Makefile	(revision 258541)
@@ -1,16 +1,16 @@
 # $FreeBSD$
 
 .PATH: ${.CURDIR}/../../../cddl/dev/lockstat
 
 KMOD=		lockstat
 SRCS=		lockstat.c 
 
-SRCS+=		vnode_if.h opt_kdtrace.h
+SRCS+=		vnode_if.h
 
 CFLAGS+=	-I${.CURDIR}/../../../cddl/compat/opensolaris \
 		-I${.CURDIR}/../../../cddl/contrib/opensolaris/uts/common \
 		-I${.CURDIR}/../../..
 
 .include <bsd.kmod.mk>
 
 CFLAGS+=	-include ${.CURDIR}/../../../cddl/compat/opensolaris/sys/debug_compat.h
Index: head/sys/modules/dtrace/sdt/Makefile
===================================================================
--- head/sys/modules/dtrace/sdt/Makefile	(revision 258540)
+++ head/sys/modules/dtrace/sdt/Makefile	(revision 258541)
@@ -1,15 +1,15 @@
 # $FreeBSD$
 
 .PATH: ${.CURDIR}/../../../cddl/dev/sdt
 
 KMOD=		sdt
 SRCS=		sdt.c
-SRCS+=		vnode_if.h opt_kdtrace.h
+SRCS+=		vnode_if.h
 
 CFLAGS+=	-I${.CURDIR}/../../../cddl/compat/opensolaris \
 		-I${.CURDIR}/../../../cddl/contrib/opensolaris/uts/common \
 		-I${.CURDIR}/../../..
 
 .include <bsd.kmod.mk>
 
 CFLAGS+=	-include ${.CURDIR}/../../../cddl/compat/opensolaris/sys/debug_compat.h
Index: head/sys/modules/ip6_mroute_mod/Makefile
===================================================================
--- head/sys/modules/ip6_mroute_mod/Makefile	(revision 258540)
+++ head/sys/modules/ip6_mroute_mod/Makefile	(revision 258541)
@@ -1,19 +1,19 @@
 # $FreeBSD$
 
 .include <bsd.own.mk>
 
 .PATH: ${.CURDIR}/../../netinet6
 
 KMOD=	ip6_mroute
 
 SRCS=	ip6_mroute.c
-SRCS+=	opt_inet6.h opt_kdtrace.h opt_mrouting.h
+SRCS+=	opt_inet6.h opt_mrouting.h
 
 .if !defined(KERNBUILDDIR)
 opt_inet6.h:
 	echo "#define INET6 1" > ${.TARGET}
 opt_mrouting.h:
 	echo "#define MROUTING 1" > ${.TARGET}
 .endif
 
 .include <bsd.kmod.mk>
Index: head/sys/modules/linux/Makefile
===================================================================
--- head/sys/modules/linux/Makefile	(revision 258540)
+++ head/sys/modules/linux/Makefile	(revision 258541)
@@ -1,66 +1,66 @@
 # $FreeBSD$
 
 .if ${MACHINE_CPUARCH} == "amd64"
 SFX= 32
 CFLAGS+=-DCOMPAT_FREEBSD32 -DCOMPAT_LINUX32
 .endif
 
 .PATH: ${.CURDIR}/../../compat/linux ${.CURDIR}/../../${MACHINE_CPUARCH}/linux${SFX}
 
 KMOD=	linux
 SRCS=	linux_fork.c linux${SFX}_dummy.c linux_emul.c linux_file.c \
 	linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \
 	linux${SFX}_machdep.c linux_mib.c linux_misc.c linux_signal.c \
 	linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c \
 	linux${SFX}_sysvec.c linux_uid16.c linux_util.c linux_time.c \
-	opt_inet6.h opt_compat.h opt_kdtrace.h opt_posix.h opt_usb.h \
-	vnode_if.h device_if.h bus_if.h assym.s
+	opt_inet6.h opt_compat.h opt_posix.h opt_usb.h vnode_if.h \
+	device_if.h bus_if.h assym.s
 
 # XXX: for assym.s
 SRCS+=  opt_kstack_pages.h opt_nfs.h opt_compat.h opt_hwpmc_hooks.h
 .if ${MACHINE_CPUARCH} == "i386"
 SRCS+=	opt_apic.h
 .endif
 
 OBJS=	linux${SFX}_locore.o linux${SFX}_support.o
 
 .if ${MACHINE_CPUARCH} == "i386"
 SRCS+=	linux_ptrace.c imgact_linux.c opt_cpu.h
 .endif
 
 EXPORT_SYMS=
 EXPORT_SYMS+=	linux_emul_path
 EXPORT_SYMS+=	linux_get_osname
 EXPORT_SYMS+=	linux_get_osrelease
 EXPORT_SYMS+=	linux_ifname
 EXPORT_SYMS+=	linux_ioctl_register_handler
 EXPORT_SYMS+=	linux_ioctl_unregister_handler
 
 CLEANFILES=	linux${SFX}_assym.h linux${SFX}_genassym.o
 
 linux${SFX}_assym.h: linux${SFX}_genassym.o
 .if exists(@)
 linux${SFX}_assym.h: @/kern/genassym.sh
 .endif
 	sh @/kern/genassym.sh linux${SFX}_genassym.o > ${.TARGET}
 
 linux${SFX}_locore.o: linux${SFX}_locore.s linux${SFX}_assym.h
 	${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
 	    ${.IMPSRC} -o ${.TARGET}
 
 linux${SFX}_support.o: linux${SFX}_support.s assym.s linux${SFX}_assym.h
 	${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
 	    ${.IMPSRC} -o ${.TARGET}
 
 linux${SFX}_genassym.o: linux${SFX}_genassym.c linux.h @ machine x86
 	${CC} -c ${CFLAGS:N-fno-common} ${.IMPSRC}
 
 .if !defined(KERNBUILDDIR)
 opt_inet6.h:
 	echo "#define INET6 1" > ${.TARGET}
 .if defined(KTR)
 CFLAGS+=	-DKTR
 .endif
 .endif
 
 .include <bsd.kmod.mk>
Index: head/sys/modules/nfscl/Makefile
===================================================================
--- head/sys/modules/nfscl/Makefile	(revision 258540)
+++ head/sys/modules/nfscl/Makefile	(revision 258541)
@@ -1,58 +1,57 @@
 # $FreeBSD$
 
 .PATH: ${.CURDIR}/../../fs/nfsclient ${.CURDIR}/../../nfs
 KMOD=	nfscl
 SRCS=	vnode_if.h \
 	nfs_clrpcops.c \
 	nfs_clkrpc.c \
 	nfs_clstate.c \
 	nfs_clcomsubs.c \
 	nfs_clport.c \
 	nfs_clbio.c \
 	nfs_clnfsiod.c \
 	nfs_clnode.c \
 	nfs_clsubs.c \
 	nfs_clvfsops.c \
 	nfs_clvnops.c \
 	opt_bootp.h \
 	opt_inet.h \
 	opt_inet6.h \
-	opt_kdtrace.h \
 	opt_kgssapi.h \
 	opt_nfs.h \
 	opt_nfsroot.h \
 	opt_ufs.h
 
 .if !defined(KERNBUILDDIR)
 NFS_INET?=	1	# 0/1 - requires INET to be configured in kernel
 NFS_INET6?=	1	# 0/1 - requires INET6 to be configured in kernel
 NFS_ROOT?=	1	# 0/1 - requires NFS_ROOT to be configured in kernel
 
 .if ${NFS_INET} > 0
 opt_inet.h:
 	echo "#define INET 1" > ${.TARGET}
 .endif
 
 .if ${NFS_INET6} > 0
 opt_inet6.h:
 	echo "#define INET6 1" > ${.TARGET}
 .endif
 
 .if ${NFS_ROOT} > 0
 opt_nfsroot.h:
 	echo "#define NFS_ROOT 1" > ${.TARGET}
 .endif
 .else
 OPT_NFS_ROOT!=	cat ${KERNBUILDDIR}/opt_nfsroot.h
 .if empty(OPT_NFS_ROOT)
 NFS_ROOT=	0
 .else
 NFS_ROOT=	1
 .endif
 .endif
 
 .if ${NFS_ROOT} > 0
 SRCS+=	nfs_diskless.c
 .endif
 
 .include <bsd.kmod.mk>
Index: head/sys/modules/nfsclient/Makefile
===================================================================
--- head/sys/modules/nfsclient/Makefile	(revision 258540)
+++ head/sys/modules/nfsclient/Makefile	(revision 258541)
@@ -1,44 +1,44 @@
 # $FreeBSD$
 
 .PATH: ${.CURDIR}/../../nfsclient ${.CURDIR}/../../nfs ${.CURDIR}/../../rpc
 
 KMOD=	nfsclient
 SRCS=	vnode_if.h \
 	nfs_bio.c nfs_node.c nfs_subs.c nfs_nfsiod.c \
 	nfs_vfsops.c nfs_vnops.c nfs_krpc.c \
 	opt_inet.h opt_nfs.h opt_bootp.h opt_nfsroot.h
-SRCS+=	opt_inet6.h opt_kdtrace.h opt_kgssapi.h
+SRCS+=	opt_inet6.h opt_kgssapi.h
 
 .if !defined(KERNBUILDDIR)
 NFS_INET?=	1	# 0/1 - requires INET to be configured in kernel
 NFS_INET6?=	1	# 0/1 - requires INET6 to be configured in kernel
 NFS_ROOT?=	1	# 0/1 - requires NFS_ROOT to be configured in kernel
 
 .if ${NFS_INET} > 0
 opt_inet.h:
 	echo "#define INET 1" > ${.TARGET}
 .endif
 
 .if ${NFS_INET6} > 0
 opt_inet6.h:
 	echo "#define INET6 1" > ${.TARGET}
 .endif
 
 .if ${NFS_ROOT} > 0
 opt_nfsroot.h:
 	echo "#define NFS_ROOT 1" > ${.TARGET}
 .endif
 .else
 OPT_NFS_ROOT!=	cat ${KERNBUILDDIR}/opt_nfsroot.h
 .if empty(OPT_NFS_ROOT)
 NFS_ROOT=	0
 .else
 NFS_ROOT=	1
 .endif
 .endif
 
 .if ${NFS_ROOT} > 0
 SRCS+=	nfs_diskless.c
 .endif
 
 .include <bsd.kmod.mk>
Index: head/sys/modules/nfscommon/Makefile
===================================================================
--- head/sys/modules/nfscommon/Makefile	(revision 258540)
+++ head/sys/modules/nfscommon/Makefile	(revision 258541)
@@ -1,16 +1,15 @@
 # $FreeBSD$
 
 .PATH: ${.CURDIR}/../../fs/nfs 
 KMOD=	nfscommon
 SRCS=	vnode_if.h \
 	nfs_commonacl.c \
 	nfs_commonkrpc.c \
 	nfs_commonport.c \
 	nfs_commonsubs.c \
 	opt_inet6.h \
-	opt_kdtrace.h \
 	opt_kgssapi.h \
 	opt_nfs.h \
 	opt_ufs.h
 
 .include <bsd.kmod.mk>
Index: head/sys/modules/send/Makefile
===================================================================
--- head/sys/modules/send/Makefile	(revision 258540)
+++ head/sys/modules/send/Makefile	(revision 258541)
@@ -1,7 +1,7 @@
 # $FreeBSD$
 .PATH: ${.CURDIR}/../../netinet6
 
 KMOD=	send
-SRCS=	send.c opt_kdtrace.h
+SRCS=	send.c
 
 .include <bsd.kmod.mk>
Index: head/sys/net/vnet.c
===================================================================
--- head/sys/net/vnet.c	(revision 258540)
+++ head/sys/net/vnet.c	(revision 258541)
@@ -1,824 +1,823 @@
 /*-
  * Copyright (c) 2004-2009 University of Zagreb
  * Copyright (c) 2006-2009 FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by the University of Zagreb and the
  * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
  * FreeBSD Foundation.
  *
  * Copyright (c) 2009 Jeffrey Roberson <jeff@freebsd.org>
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_kdb.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/jail.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <machine/stdarg.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 /*-
  * This file implements core functions for virtual network stacks:
  *
  * - Virtual network stack management functions.
  *
  * - Virtual network stack memory allocator, which virtualizes global
  *   variables in the network stack
  *
  * - Virtualized SYSINIT's/SYSUNINIT's, which allow network stack subsystems
  *   to register startup/shutdown events to be run for each virtual network
  *   stack instance.
  */
 
 FEATURE(vimage, "VIMAGE kernel virtualization");
 
 static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block");
 
 /*
  * The virtual network stack list has two read-write locks, one sleepable and
  * the other not, so that the list can be stabilized and walked in a variety
  * of network stack contexts.  Both must be acquired exclusively to modify
  * the list, but a read lock of either lock is sufficient to walk the list.
  */
 struct rwlock		vnet_rwlock;
 struct sx		vnet_sxlock;
 
 /*
  * Acquire both vnet list locks exclusively: the sx lock first, then the
  * rwlock.
  */
 #define	VNET_LIST_WLOCK() do {						\
 	sx_xlock(&vnet_sxlock);						\
 	rw_wlock(&vnet_rwlock);						\
 } while (0)
 
 /*
  * Release both vnet list locks in the reverse order of VNET_LIST_WLOCK().
  */
 #define	VNET_LIST_WUNLOCK() do {					\
 	rw_wunlock(&vnet_rwlock);					\
 	sx_xunlock(&vnet_sxlock);					\
 } while (0)
 
 struct vnet_list_head vnet_head;
 struct vnet *vnet0;
 
 /*
  * The virtual network stack allocator provides storage for virtualized
  * global variables.  These variables are defined/declared using the
  * VNET_DEFINE()/VNET_DECLARE() macros, which place them in the 'set_vnet'
  * linker set.  The details of the implementation are somewhat subtle, but
  * allow the majority of network subsystems to remain
  * virtualization-agnostic.
  *
  * The virtual network stack allocator handles variables in the base kernel
  * vs. modules in similar but different ways.  In both cases, virtualized
  * global variables are marked as such by being declared to be part of the
  * vnet linker set.  These "master" copies of global variables serve two
  * functions:
  *
  * (1) They contain static initialization or "default" values for global
  *     variables which will be propagated to each virtual network stack
  *     instance when created.  As with normal global variables, they default
  *     to zero-filled.
  *
  * (2) They act as unique global names by which the variable can be referred
  *     to, regardless of network stack instance.  The single global symbol
  *     will be used to calculate the location of a per-virtual instance
  *     variable at run-time.
  *
  * Each virtual network stack instance has a complete copy of each
  * virtualized global variable, stored in a malloc'd block of memory
  * referred to by vnet->vnet_data_mem.  Critical to the design is that each
  * per-instance memory block is laid out identically to the master block so
  * that the offset of each global variable is the same across all blocks.  To
  * optimize run-time access, a precalculated 'base' address,
  * vnet->vnet_data_base, is stored in each vnet, and is the amount that can
  * be added to the address of a 'master' instance of a variable to get to the
  * per-vnet instance.
  *
  * Virtualized globals in modules are handled in a similar manner, but as each
  * module has its own 'set_vnet' linker set, and we want to keep all
  * virtualized globals together, we reserve space in the kernel's linker set
  * for potential module variables using a per-vnet character array,
  * 'modspace'.  The virtual network stack allocator maintains a free list to
  * track what space in the array is free (all, initially) and as modules are
  * linked, allocates portions of the space to specific globals.  The kernel
  * module linker queries the virtual network stack allocator and will
  * bind references of the global to the location during linking.  It also
  * calls into the virtual network stack allocator, once the memory is
  * initialized, in order to propagate the new static initializations to all
  * existing virtual network stack instances so that the soon-to-be executing
  * module will find every network stack instance with proper default values.
  */
 
 /*
  * Number of bytes of data in the 'set_vnet' linker set, and hence the total
  * size of all kernel virtualized global variables, and the malloc(9) type
  * that will be used to allocate it.
  */
 #define	VNET_BYTES	(VNET_STOP - VNET_START)
 
 static MALLOC_DEFINE(M_VNET_DATA, "vnet_data", "VNET data");
 
 /*
  * VNET_MODMIN is the minimum number of bytes we will reserve for the sum of
  * global variables across all loaded modules.  As this actually sizes an
  * array declared as a virtualized global variable in the kernel itself, and
  * we want the virtualized global variable space to be page-sized, we may
  * have more space than that in practice.
  */
 #define	VNET_MODMIN	8192
 #define	VNET_SIZE	roundup2(VNET_BYTES, PAGE_SIZE)
 #define	VNET_MODSIZE	(VNET_SIZE - (VNET_BYTES - VNET_MODMIN))
 
 /*
  * Space to store virtualized global variables from loadable kernel modules,
  * and the free list to manage it.
  */
 static VNET_DEFINE(char, modspace[VNET_MODMIN]);
 
 /*
  * Global lists of subsystem constructor and destructors for vnets.  They are
  * registered via VNET_SYSINIT() and VNET_SYSUNINIT().  Both lists are
  * protected by the vnet_sysinit_sxlock global lock.
  */
 static TAILQ_HEAD(vnet_sysinit_head, vnet_sysinit) vnet_constructors =
 	TAILQ_HEAD_INITIALIZER(vnet_constructors);
 static TAILQ_HEAD(vnet_sysuninit_head, vnet_sysinit) vnet_destructors =
 	TAILQ_HEAD_INITIALIZER(vnet_destructors);
 
 struct sx		vnet_sysinit_sxlock;
 
 /*
  * Exclusive/shared lock wrappers around vnet_sysinit_sxlock.  The macros
  * deliberately carry no trailing semicolon so that each use expands to a
  * single statement and remains safe inside unbraced if/else bodies.
  */
 #define	VNET_SYSINIT_WLOCK()	sx_xlock(&vnet_sysinit_sxlock)
 #define	VNET_SYSINIT_WUNLOCK()	sx_xunlock(&vnet_sysinit_sxlock)
 #define	VNET_SYSINIT_RLOCK()	sx_slock(&vnet_sysinit_sxlock)
 #define	VNET_SYSINIT_RUNLOCK()	sx_sunlock(&vnet_sysinit_sxlock)
 
 /*
  * One free segment of the module data space ('modspace'); segments are
  * linked on vnet_data_free_head.
  */
 struct vnet_data_free {
 	uintptr_t	vnd_start;	/* Address of the first free byte. */
 	int		vnd_len;	/* Length of the segment in bytes. */
 	TAILQ_ENTRY(vnet_data_free) vnd_link;	/* Free list linkage. */
 };
 
 static MALLOC_DEFINE(M_VNET_DATA_FREE, "vnet_data_free",
     "VNET resource accounting");
 static TAILQ_HEAD(, vnet_data_free) vnet_data_free_head =
 	    TAILQ_HEAD_INITIALIZER(vnet_data_free_head);
 static struct sx vnet_data_free_lock;
 
 /*
  * DTrace SDT provider and probes for vnet allocation and destruction.  In
  * every definition the probe identifier and its string name (4th and 5th
  * arguments) are kept identical.
  */
 SDT_PROVIDER_DEFINE(vnet);
 SDT_PROBE_DEFINE1(vnet, functions, vnet_alloc, entry, entry, "int");
 SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, alloc, alloc, "int",
     "struct vnet *");
 SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, return, return,
     "int", "struct vnet *");
 SDT_PROBE_DEFINE2(vnet, functions, vnet_destroy, entry, entry,
     "int", "struct vnet *");
 SDT_PROBE_DEFINE1(vnet, functions, vnet_destroy, return, return,
     "int");
 
 #ifdef DDB
 static void db_show_vnet_print_vs(struct vnet_sysinit *, int);
 #endif
 
 /*
  * Allocate a virtual network stack.
  *
  * Allocates and zeroes a struct vnet, gives it a private copy of the
  * virtualized globals, runs all registered vnet constructors with the new
  * vnet as the current vnet, and finally links it onto the global vnet list.
  * Both allocations use M_WAITOK.  Returns the new vnet.
  */
 struct vnet *
 vnet_alloc(void)
 {
 	struct vnet *vnet;
 
 	SDT_PROBE1(vnet, functions, vnet_alloc, entry, __LINE__);
 	vnet = malloc(sizeof(struct vnet), M_VNET, M_WAITOK | M_ZERO);
 	vnet->vnet_magic_n = VNET_MAGIC_N;
 	SDT_PROBE2(vnet, functions, vnet_alloc, alloc, __LINE__, vnet);
 
 	/*
 	 * Allocate storage for virtualized global variables and copy in
 	 * initial values from our 'master' copy.
 	 */
 	vnet->vnet_data_mem = malloc(VNET_SIZE, M_VNET_DATA, M_WAITOK);
 	memcpy(vnet->vnet_data_mem, (void *)VNET_START, VNET_BYTES);
 
 	/*
 	 * All use of vnet-specific data will immediately subtract VNET_START
 	 * from the base memory pointer, so pre-calculate that now to avoid
 	 * it on each use.
 	 */
 	vnet->vnet_data_base = (uintptr_t)vnet->vnet_data_mem - VNET_START;
 
 	/* Initialize / attach vnet module instances. */
 	CURVNET_SET_QUIET(vnet);
 	vnet_sysinit();
 	CURVNET_RESTORE();
 
 	/* Publish the vnet on the global list only after its constructors ran. */
 	VNET_LIST_WLOCK();
 	LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le);
 	VNET_LIST_WUNLOCK();
 
 	SDT_PROBE2(vnet, functions, vnet_alloc, return, __LINE__, vnet);
 	return (vnet);
 }
 
 /*
  * Destroy a virtual network stack.
  *
  * The vnet must have no sockets left (asserted).  It is unlinked from the
  * global list first, then borrowed interfaces are moved back to their home
  * vnets, the registered destructors are run, and the storage is released.
  */
 void
 vnet_destroy(struct vnet *vnet)
 {
 	struct ifnet *ifp, *nifp;
 
 	SDT_PROBE2(vnet, functions, vnet_destroy, entry, __LINE__, vnet);
 	KASSERT(vnet->vnet_sockcnt == 0,
 	    ("%s: vnet still has sockets", __func__));
 
 	/* Unlink from the global list before tearing anything down. */
 	VNET_LIST_WLOCK();
 	LIST_REMOVE(vnet, vnet_le);
 	VNET_LIST_WUNLOCK();
 
 	CURVNET_SET_QUIET(vnet);
 
 	/* Return all inherited interfaces to their parent vnets. */
 	TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) {
 		if (ifp->if_home_vnet != ifp->if_vnet)
 			if_vmove(ifp, ifp->if_home_vnet);
 	}
 
 	vnet_sysuninit();
 	CURVNET_RESTORE();
 
 	/*
 	 * Release storage for the virtual network stack instance.
 	 */
 	free(vnet->vnet_data_mem, M_VNET_DATA);
 	vnet->vnet_data_mem = NULL;
 	vnet->vnet_data_base = 0;
 	/* Overwrite the magic number before the structure is freed. */
 	vnet->vnet_magic_n = 0xdeadbeef;
 	free(vnet, M_VNET);
 	SDT_PROBE1(vnet, functions, vnet_destroy, return, __LINE__);
 }
 
 /*
  * Boot time initialization and allocation of virtual network stacks.
  */
 /*
  * Initialize the vnet list locks, the sysinit lock and the vnet list head.
  * Registered as a SYSINIT at SI_SUB_VNET_PRELINK; 'arg' is unused.
  */
 static void
 vnet_init_prelink(void *arg)
 {
 
 	rw_init(&vnet_rwlock, "vnet_rwlock");
 	sx_init(&vnet_sxlock, "vnet_sxlock");
 	sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock");
 	LIST_INIT(&vnet_head);
 }
 SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST,
     vnet_init_prelink, NULL);
 
 /*
  * Allocate the first vnet (vnet0) and install it as prison0's vnet and as
  * the current vnet.  Registered as a SYSINIT at SI_SUB_VNET; 'arg' is unused.
  */
 static void
 vnet0_init(void *arg)
 {
 
 	/* Warn people before take off - in case we crash early. */
 	printf("WARNING: VIMAGE (virtualized network stack) is a highly "
 	    "experimental feature.\n");
 
 	/*
 	 * We MUST clear curvnet in vnet_init_done() before going SMP,
 	 * otherwise CURVNET_SET() macros would scream about unnecessary
 	 * curvnet recursions.
 	 */
 	curvnet = prison0.pr_vnet = vnet0 = vnet_alloc();
 }
 SYSINIT(vnet0_init, SI_SUB_VNET, SI_ORDER_FIRST, vnet0_init, NULL);
 
 /*
  * Clear curvnet, which vnet0_init() left pointing at vnet0.  Registered as a
  * SYSINIT at SI_SUB_VNET_DONE; 'unused' is ignored.
  */
 static void
 vnet_init_done(void *unused)
 {
 
 	curvnet = NULL;
 }
 
 SYSINIT(vnet_init_done, SI_SUB_VNET_DONE, SI_ORDER_FIRST, vnet_init_done,
     NULL);
 
 /*
  * Once on boot, initialize the modspace freelist to entirely cover modspace.
  *
  * A single free segment starting at the master copy of 'modspace' and
  * VNET_MODMIN bytes long is queued, and the free list lock is initialized.
  * Registered as a SYSINIT at SI_SUB_KLD.
  */
 static void
 vnet_data_startup(void *dummy __unused)
 {
 	struct vnet_data_free *df;
 
 	df = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO);
 	df->vnd_start = (uintptr_t)&VNET_NAME(modspace);
 	df->vnd_len = VNET_MODMIN;
 	TAILQ_INSERT_HEAD(&vnet_data_free_head, df, vnd_link);
 	sx_init(&vnet_data_free_lock, "vnet_data alloc lock");
 }
 SYSINIT(vnet_data, SI_SUB_KLD, SI_ORDER_FIRST, vnet_data_startup, 0);
 
 /*
  * When a module is loaded and requires storage for a virtualized global
  * variable, allocate space from the modspace free list.  This interface
  * should be used only by the kernel linker.
  */
 void *
 vnet_data_alloc(int size)
 {
 	struct vnet_data_free *df;
 	void *s;
 
 	s = NULL;
 	size = roundup2(size, sizeof(void *));
 	sx_xlock(&vnet_data_free_lock);
 	TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) {
 		if (df->vnd_len < size)
 			continue;
 		if (df->vnd_len == size) {
 			s = (void *)df->vnd_start;
 			TAILQ_REMOVE(&vnet_data_free_head, df, vnd_link);
 			free(df, M_VNET_DATA_FREE);
 			break;
 		}
 		s = (void *)df->vnd_start;
 		df->vnd_len -= size;
 		df->vnd_start = df->vnd_start + size;
 		break;
 	}
 	sx_xunlock(&vnet_data_free_lock);
 
 	return (s);
 }
 
 /*
  * Free space for a virtualized global variable on module unload.
  */
 void
 vnet_data_free(void *start_arg, int size)
 {
 	struct vnet_data_free *df;
 	struct vnet_data_free *dn;
 	uintptr_t start;
 	uintptr_t end;
 
 	/* Use the same rounding as vnet_data_alloc() applied to the request. */
 	size = roundup2(size, sizeof(void *));
 	start = (uintptr_t)start_arg;
 	end = start + size;
 	/*
 	 * Free a region of space and merge it with as many neighbors as
 	 * possible.  Keeping the list sorted simplifies this operation.
 	 */
 	sx_xlock(&vnet_data_free_lock);
 	TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) {
 		/* Past the freed region: insert a new segment before df. */
 		if (df->vnd_start > end)
 			break;
 		/*
 		 * If we expand at the end of an entry we may have to merge
 		 * it with the one following it as well.
 		 */
 		if (df->vnd_start + df->vnd_len == start) {
 			df->vnd_len += size;
 			/*
 			 * df may be the last free segment, in which case
 			 * TAILQ_NEXT() yields NULL and there is nothing
 			 * further to merge with.
 			 */
 			dn = TAILQ_NEXT(df, vnd_link);
 			if (dn != NULL &&
 			    df->vnd_start + df->vnd_len == dn->vnd_start) {
 				df->vnd_len += dn->vnd_len;
 				TAILQ_REMOVE(&vnet_data_free_head, dn,
 				    vnd_link);
 				free(dn, M_VNET_DATA_FREE);
 			}
 			sx_xunlock(&vnet_data_free_lock);
 			return;
 		}
 		/* The freed region ends where df begins: grow df downwards. */
 		if (df->vnd_start == end) {
 			df->vnd_start = start;
 			df->vnd_len += size;
 			sx_xunlock(&vnet_data_free_lock);
 			return;
 		}
 	}
 	/* No adjacent segment: record the region as a new free segment. */
 	dn = malloc(sizeof(*dn), M_VNET_DATA_FREE, M_WAITOK | M_ZERO);
 	dn->vnd_start = start;
 	dn->vnd_len = size;
 	if (df)
 		TAILQ_INSERT_BEFORE(df, dn, vnd_link);
 	else
 		TAILQ_INSERT_TAIL(&vnet_data_free_head, dn, vnd_link);
 	sx_xunlock(&vnet_data_free_lock);
 }
 
 /*
  * When a new virtualized global variable has been allocated, propagate its
  * initial value to each already-allocated virtual network stack instance.
  */
 void
 vnet_data_copy(void *start, int size)
 {
 	struct vnet *vnet;
 
 	VNET_LIST_RLOCK();
 	LIST_FOREACH(vnet, &vnet_head, vnet_le)
 		memcpy((void *)((uintptr_t)vnet->vnet_data_base +
 		    (uintptr_t)start), start, size);
 	VNET_LIST_RUNLOCK();
 }
 
 /*
  * Variants on sysctl_handle_foo that know how to handle virtualized global
  * variables: if 'arg1' is a pointer, then we transform it to the local vnet
  * offset.
  */
 int
 vnet_sysctl_handle_int(SYSCTL_HANDLER_ARGS)
 {
 
 	/* A NULL arg1 is passed through untranslated. */
 	if (arg1 == NULL)
 		return (sysctl_handle_int(oidp, arg1, arg2, req));
 
 	/* Rebase the master address onto the current vnet's data block. */
 	return (sysctl_handle_int(oidp,
 	    (void *)(curvnet->vnet_data_base + (uintptr_t)arg1), arg2, req));
 }
 
 int
 vnet_sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
 {
 
 	/* A NULL arg1 is passed through untranslated. */
 	if (arg1 == NULL)
 		return (sysctl_handle_opaque(oidp, arg1, arg2, req));
 
 	/* Rebase the master address onto the current vnet's data block. */
 	return (sysctl_handle_opaque(oidp,
 	    (void *)(curvnet->vnet_data_base + (uintptr_t)arg1), arg2, req));
 }
 
 int
 vnet_sysctl_handle_string(SYSCTL_HANDLER_ARGS)
 {
 
 	/* A NULL arg1 is passed through untranslated. */
 	if (arg1 == NULL)
 		return (sysctl_handle_string(oidp, arg1, arg2, req));
 
 	/* Rebase the master address onto the current vnet's data block. */
 	return (sysctl_handle_string(oidp,
 	    (void *)(curvnet->vnet_data_base + (uintptr_t)arg1), arg2, req));
 }
 
 int
 vnet_sysctl_handle_uint(SYSCTL_HANDLER_ARGS)
 {
 
 	/* A NULL arg1 is passed through untranslated. */
 	if (arg1 == NULL)
 		return (sysctl_handle_int(oidp, arg1, arg2, req));
 
 	/*
 	 * Rebase the master address onto the current vnet's data block.  The
 	 * unsigned variant delegates to sysctl_handle_int() as well.
 	 */
 	return (sysctl_handle_int(oidp,
 	    (void *)(curvnet->vnet_data_base + (uintptr_t)arg1), arg2, req));
 }
 
 /*
  * Support for special SYSINIT handlers registered via VNET_SYSINIT()
  * and VNET_SYSUNINIT().
  */
 /*
  * Register a vnet constructor.  'arg' is a struct vnet_sysinit; it is
  * inserted into vnet_constructors ordered by (subsystem, order) and its
  * function is then run once for every vnet visited by VNET_FOREACH().
  */
 void
 vnet_register_sysinit(void *arg)
 {
 	struct vnet_sysinit *vs, *vs2;	
 	struct vnet *vnet;
 
 	vs = arg;
 	KASSERT(vs->subsystem > SI_SUB_VNET, ("vnet sysinit too early"));
 
 	/* Add the constructor to the global list of vnet constructors. */
 	VNET_SYSINIT_WLOCK();
 	/* Find the first entry that must run after the new one. */
 	TAILQ_FOREACH(vs2, &vnet_constructors, link) {
 		if (vs2->subsystem > vs->subsystem)
 			break;
 		if (vs2->subsystem == vs->subsystem && vs2->order > vs->order)
 			break;
 	}
 	if (vs2 != NULL)
 		TAILQ_INSERT_BEFORE(vs2, vs, link);
 	else
 		TAILQ_INSERT_TAIL(&vnet_constructors, vs, link);
 
 	/*
 	 * Invoke the constructor on all the existing vnets when it is
 	 * registered.
 	 */
 	VNET_FOREACH(vnet) {
 		CURVNET_SET_QUIET(vnet);
 		vs->func(vs->arg);
 		CURVNET_RESTORE();
 	}
 	VNET_SYSINIT_WUNLOCK();
 }
 
 /*
  * Deregister a vnet constructor.  'arg' is the struct vnet_sysinit that was
  * registered; it is only unlinked from the list, no function is invoked.
  */
 void
 vnet_deregister_sysinit(void *arg)
 {
 	struct vnet_sysinit *vs;
 
 	vs = arg;
 
 	/* Remove the constructor from the global list of vnet constructors. */
 	VNET_SYSINIT_WLOCK();
 	TAILQ_REMOVE(&vnet_constructors, vs, link);
 	VNET_SYSINIT_WUNLOCK();
 }
 
 /*
  * Register a vnet destructor.  'arg' is a struct vnet_sysinit; it is
  * inserted into vnet_destructors ordered by (subsystem, order).  Nothing is
  * invoked at registration time.
  */
 void
 vnet_register_sysuninit(void *arg)
 {
 	struct vnet_sysinit *vs, *vs2;
 
 	vs = arg;
 
 	/* Add the destructor to the global list of vnet destructors. */
 	VNET_SYSINIT_WLOCK();
 	/* Find the first entry that sorts after the new one. */
 	TAILQ_FOREACH(vs2, &vnet_destructors, link) {
 		if (vs2->subsystem > vs->subsystem)
 			break;
 		if (vs2->subsystem == vs->subsystem && vs2->order > vs->order)
 			break;
 	}
 	if (vs2 != NULL)
 		TAILQ_INSERT_BEFORE(vs2, vs, link);
 	else
 		TAILQ_INSERT_TAIL(&vnet_destructors, vs, link);
 	VNET_SYSINIT_WUNLOCK();
 }
 
 /*
  * Deregister a vnet destructor.  'arg' is the struct vnet_sysinit that was
  * registered; its function is run for every vnet visited by VNET_FOREACH()
  * before the entry is unlinked from vnet_destructors.
  */
 void
 vnet_deregister_sysuninit(void *arg)
 {
 	struct vnet_sysinit *vs;
 	struct vnet *vnet;
 
 	vs = arg;
 
 	/*
 	 * Invoke the destructor on all the existing vnets when it is
 	 * deregistered.
 	 */
 	VNET_SYSINIT_WLOCK();
 	VNET_FOREACH(vnet) {
 		CURVNET_SET_QUIET(vnet);
 		vs->func(vs->arg);
 		CURVNET_RESTORE();
 	}
 
 	/* Remove the destructor from the global list of vnet destructors. */
 	TAILQ_REMOVE(&vnet_destructors, vs, link);
 	VNET_SYSINIT_WUNLOCK();
 }
 
 /*
  * Invoke all registered vnet constructors on the current vnet.  Used during
  * vnet construction.  The caller is responsible for ensuring the new vnet is
  * the current vnet; the vnet_sysinit_sxlock is acquired shared here for the
  * duration of the walk.
  */
 void
 vnet_sysinit(void)
 {
 	struct vnet_sysinit *vs;
 
 	VNET_SYSINIT_RLOCK();
 	/* Constructors run in list order, i.e. sorted by (subsystem, order). */
 	TAILQ_FOREACH(vs, &vnet_constructors, link) {
 		vs->func(vs->arg);
 	}
 	VNET_SYSINIT_RUNLOCK();
 }
 
 /*
  * Invoke all registered vnet destructors on the current vnet.  Used during
  * vnet destruction.  The caller is responsible for ensuring the dying vnet
  * is the current vnet; the vnet_sysinit_sxlock is acquired shared here for
  * the duration of the walk.
  */
 void
 vnet_sysuninit(void)
 {
 	struct vnet_sysinit *vs;
 
 	VNET_SYSINIT_RLOCK();
 	/* Destructors run in reverse list order. */
 	TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head,
 	    link) {
 		vs->func(vs->arg);
 	}
 	VNET_SYSINIT_RUNLOCK();
 }
 
 /*
  * EVENTHANDLER(9) extensions.
  */
 /*
  * Invoke the eventhandler function originally registered with the possibly
  * registered argument for all virtual network stack instances.
  *
  * This iterator can only be used for eventhandlers that do not take any
  * additional arguments, as we do ignore the variadic arguments from the
  * EVENTHANDLER_INVOKE() call.
  */
 void
 vnet_global_eventhandler_iterator_func(void *arg, ...)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct eventhandler_entry_vimage *v_ee;
 
 	/*
 	 * There is a bug here in that we should actually cast things to
 	 * (struct eventhandler_entry_ ## name *)  but that's not easily
 	 * possible in here so just re-using the variadic version we
 	 * defined for the generic vimage case.
 	 */
 	v_ee = arg;
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		/* Only the registered argument is passed; varargs are dropped. */
 		((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 }
 
 #ifdef VNET_DEBUG
 /* One logged CURVNET_SET() recursion event (VNET_DEBUG only). */
 struct vnet_recursion {
 	SLIST_ENTRY(vnet_recursion)	 vnr_le;	/* List linkage. */
 	const char			*prev_fn;	/* Previous setter. */
 	const char			*where_fn;	/* Recursing function. */
 	int				 where_line;	/* Line of recursion. */
 	struct vnet			*old_vnet;	/* vnet before the set. */
 	struct vnet			*new_vnet;	/* vnet after the set. */
 };
 
 static SLIST_HEAD(, vnet_recursion) vnet_recursions =
     SLIST_HEAD_INITIALIZER(vnet_recursions);
 
 /*
  * Print one recorded recursion event.  With 'brief' set the event is printed
  * on a single line without the leading banner; otherwise the banner is
  * printed and the vnet pointers go on an indented second line.
  */
 static void
 vnet_print_recursion(struct vnet_recursion *vnr, int brief)
 {
 
 	if (!brief)
 		printf("CURVNET_SET() recursion in ");
 	printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line,
 	    vnr->prev_fn);
 	if (brief)
 		printf(", ");
 	else
 		printf("\n    ");
 	printf("%p -> %p\n", vnr->old_vnet, vnr->new_vnet);
 }
 
 /*
  * Record and print a CURVNET_SET() recursion, unless an equivalent event has
  * already been logged.  Panics if the log entry cannot be allocated.
  */
 void
 vnet_log_recursion(struct vnet *old_vnet, const char *old_fn, int line)
 {
 	struct vnet_recursion *vnr;
 
 	/* Skip already logged recursion events. */
 	/* Function names are compared by pointer, not by string contents. */
 	SLIST_FOREACH(vnr, &vnet_recursions, vnr_le)
 		if (vnr->prev_fn == old_fn &&
 		    vnr->where_fn == curthread->td_vnet_lpush &&
 		    vnr->where_line == line &&
 		    (vnr->old_vnet == vnr->new_vnet) == (curvnet == old_vnet))
 			return;
 
 	vnr = malloc(sizeof(*vnr), M_VNET, M_NOWAIT | M_ZERO);
 	if (vnr == NULL)
 		panic("%s: malloc failed", __func__);
 	vnr->prev_fn = old_fn;
 	vnr->where_fn = curthread->td_vnet_lpush;
 	vnr->where_line = line;
 	vnr->old_vnet = old_vnet;
 	vnr->new_vnet = curvnet;
 
 	SLIST_INSERT_HEAD(&vnet_recursions, vnr, vnr_le);
 
 	vnet_print_recursion(vnr, 0);
 #ifdef KDB
 	kdb_backtrace();
 #endif
 }
 #endif /* VNET_DEBUG */
 
 /*
  * DDB(4).
  */
 #ifdef DDB
 /*
  * DDB "show vnets": dump the magic number, interface and socket counts and
  * the data block pointers of every vnet, stopping if the pager is quit.
  */
 DB_SHOW_COMMAND(vnets, db_show_vnets)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_FOREACH(vnet_iter) {
 		db_printf("vnet            = %p\n", vnet_iter);
 		db_printf(" vnet_magic_n   = 0x%x (%s, orig 0x%x)\n",
 		    vnet_iter->vnet_magic_n,
 		    (vnet_iter->vnet_magic_n == VNET_MAGIC_N) ?
 			"ok" : "mismatch", VNET_MAGIC_N);
 		db_printf(" vnet_ifcnt     = %u\n", vnet_iter->vnet_ifcnt);
 		db_printf(" vnet_sockcnt   = %u\n", vnet_iter->vnet_sockcnt);
 		db_printf(" vnet_data_mem  = %p\n", vnet_iter->vnet_data_mem);
 		db_printf(" vnet_data_base = 0x%jx\n",
 		    (uintmax_t)vnet_iter->vnet_data_base);
 		db_printf("\n");
 		if (db_pager_quit)
 			break;
 	}
 }
 
 /*
  * Print one vnet_sysinit entry: its symbol name and address, subsystem and
  * order, and the function (with symbol name) and argument.  Output goes
  * through db_printf() when 'ddb' is non-zero and printf() otherwise.
  */
 static void
 db_show_vnet_print_vs(struct vnet_sysinit *vs, int ddb)
 {
 	const char *vsname, *funcname;
 	c_db_sym_t sym;
 	db_expr_t  offset;
 
 /* Local output helper selecting the printer based on 'ddb'. */
 #define xprint(...)							\
 	if (ddb)							\
 		db_printf(__VA_ARGS__);					\
 	else								\
 		printf(__VA_ARGS__)
 
 	if (vs == NULL) {
 		xprint("%s: no vnet_sysinit * given\n", __func__);
 		return;
 	}
 
 	/* Resolve symbol names for the entry itself and for its function. */
 	sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset);
 	db_symbol_values(sym, &vsname, NULL);
 	sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset);
 	db_symbol_values(sym, &funcname, NULL);
 	xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs);
 	xprint("  0x%08x 0x%08x\n", vs->subsystem, vs->order);
 	xprint("  %p(%s)(%p)\n",
 	    vs->func, (funcname != NULL) ? funcname : "", vs->arg);
 #undef xprint
 }
 
 /*
  * DDB "show vnet_sysinit": list the registered vnet constructors in list
  * order, stopping if the pager is quit.
  */
 DB_SHOW_COMMAND(vnet_sysinit, db_show_vnet_sysinit)
 {
 	struct vnet_sysinit *vs;
 
 	db_printf("VNET_SYSINIT vs Name(Ptr)\n");
 	db_printf("  Subsystem  Order\n");
 	db_printf("  Function(Name)(Arg)\n");
 	TAILQ_FOREACH(vs, &vnet_constructors, link) {
 		db_show_vnet_print_vs(vs, 1);
 		if (db_pager_quit)
 			break;
 	}
 }
 
 /*
  * DDB "show vnet_sysuninit": list the registered vnet destructors in reverse
  * list order (the order vnet_sysuninit() walks them), stopping if the pager
  * is quit.
  */
 DB_SHOW_COMMAND(vnet_sysuninit, db_show_vnet_sysuninit)
 {
 	struct vnet_sysinit *vs;
 
 	db_printf("VNET_SYSUNINIT vs Name(Ptr)\n");
 	db_printf("  Subsystem  Order\n");
 	db_printf("  Function(Name)(Arg)\n");
 	TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head,
 	    link) {
 		db_show_vnet_print_vs(vs, 1);
 		if (db_pager_quit)
 			break;
 	}
 }
 
 #ifdef VNET_DEBUG
 /*
  * DDB "show vnetrcrs": print every logged CURVNET_SET() recursion event in
  * brief (single line) form.
  */
 DB_SHOW_COMMAND(vnetrcrs, db_show_vnetrcrs)
 {
 	struct vnet_recursion *vnr;
 
 	SLIST_FOREACH(vnr, &vnet_recursions, vnr_le)
 		vnet_print_recursion(vnr, 1);
 }
 #endif
 #endif /* DDB */
Index: head/sys/netinet/in_kdtrace.c
===================================================================
--- head/sys/netinet/in_kdtrace.c	(revision 258540)
+++ head/sys/netinet/in_kdtrace.c	(revision 258541)
@@ -1,127 +1,125 @@
 /*-
  * Copyright (c) 2013 Mark Johnston <markj@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sdt.h>
 
 SDT_PROVIDER_DEFINE(ip);
 SDT_PROVIDER_DEFINE(tcp);
 SDT_PROVIDER_DEFINE(udp);
 
 SDT_PROBE_DEFINE6_XLATE(ip, , , receive, receive,
     "void *", "pktinfo_t *",
     "void *", "csinfo_t *",
     "uint8_t *", "ipinfo_t *",
     "struct ifnet *", "ifinfo_t *",
     "struct ip *", "ipv4info_t *",
     "struct ip6_hdr *", "ipv6info_t *");
 
 SDT_PROBE_DEFINE6_XLATE(ip, , , send, send,
     "void *", "pktinfo_t *",
     "void *", "csinfo_t *",
     "uint8_t *", "ipinfo_t *",
     "struct ifnet *", "ifinfo_t *",
     "struct ip *", "ipv4info_t *",
     "struct ip6_hdr *", "ipv6info_t *");
 
 SDT_PROBE_DEFINE5_XLATE(tcp, , , accept_established, accept-established,
     "void *", "pktinfo_t *",
     "struct tcpcb *", "csinfo_t *",
     "uint8_t *", "ipinfo_t *",
     "struct tcpcb *", "tcpsinfo_t *" ,
     "struct tcphdr *", "tcpinfoh_t *");
 
 SDT_PROBE_DEFINE5_XLATE(tcp, , , accept_refused, accept-refused,
     "void *", "pktinfo_t *",
     "struct tcpcb *", "csinfo_t *",
     "uint8_t *", "ipinfo_t *",
     "struct tcpcb *", "tcpsinfo_t *" ,
     "struct tcphdr *", "tcpinfo_t *");
 
 SDT_PROBE_DEFINE5_XLATE(tcp, , , connect_established, connect-established,
     "void *", "pktinfo_t *",
     "struct tcpcb *", "csinfo_t *",
     "uint8_t *", "ipinfo_t *",
     "struct tcpcb *", "tcpsinfo_t *" ,
     "struct tcphdr *", "tcpinfoh_t *");
 
 SDT_PROBE_DEFINE5_XLATE(tcp, , , connect_refused, connect-refused,
     "void *", "pktinfo_t *",
     "struct tcpcb *", "csinfo_t *",
     "uint8_t *", "ipinfo_t *",
     "struct tcpcb *", "tcpsinfo_t *" ,
     "struct tcphdr *", "tcpinfoh_t *");
 
 SDT_PROBE_DEFINE5_XLATE(tcp, , , connect_request, connect-request,
     "void *", "pktinfo_t *",
     "struct tcpcb *", "csinfo_t *",
     "uint8_t *", "ipinfo_t *",
     "struct tcpcb *", "tcpsinfo_t *" ,
     "struct tcphdr *", "tcpinfo_t *");
 
 SDT_PROBE_DEFINE5_XLATE(tcp, , , receive, receive,
     "void *", "pktinfo_t *",
     "struct tcpcb *", "csinfo_t *",
     "uint8_t *", "ipinfo_t *",
     "struct tcpcb *", "tcpsinfo_t *" ,
     "struct tcphdr *", "tcpinfoh_t *");
 
 SDT_PROBE_DEFINE5_XLATE(tcp, , , send, send,
     "void *", "pktinfo_t *",
     "struct tcpcb *", "csinfo_t *",
     "uint8_t *", "ipinfo_t *",
     "struct tcpcb *", "tcpsinfo_t *" ,
     "struct tcphdr *", "tcpinfo_t *");
 
 SDT_PROBE_DEFINE6_XLATE(tcp, , , state_change, state-change,
     "void *", "void *",
     "struct tcpcb *", "csinfo_t *",
     "void *", "void *",
     "struct tcpcb *", "tcpsinfo_t *",
     "void *", "void *",
     "int", "tcplsinfo_t *");
 
 SDT_PROBE_DEFINE5_XLATE(udp, , , receive, receive,
     "void *", "pktinfo_t *",
     "struct inpcb *", "csinfo_t *",
     "uint8_t *", "ipinfo_t *",
     "struct inpcb *", "udpsinfo_t *",
     "struct udphdr *", "udpinfo_t *");
 
 SDT_PROBE_DEFINE5_XLATE(udp, , , send, send,
     "void *", "pktinfo_t *",
     "struct inpcb *", "csinfo_t *",
     "uint8_t *", "ipinfo_t *",
     "struct inpcb *", "udpsinfo_t *",
     "struct udphdr *", "udpinfo_t *");
Index: head/sys/netinet/ip_fastfwd.c
===================================================================
--- head/sys/netinet/ip_fastfwd.c	(revision 258540)
+++ head/sys/netinet/ip_fastfwd.c	(revision 258541)
@@ -1,602 +1,601 @@
 /*-
  * Copyright (c) 2003 Andre Oppermann, Internet Business Solutions AG
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * ip_fastforward gets its speed from processing the forwarded packet to
  * completion (if_output on the other side) without any queues or netisr's.
  * The receiving interface DMAs the packet into memory, the upper half of
  * driver calls ip_fastforward, we do our routing table lookup and directly
  * send it off to the outgoing interface, which DMAs the packet to the
  * network card. The only part of the packet we touch with the CPU is the
  * IP header (unless there are complex firewall rules touching other parts
  * of the packet, but that is up to you). We are essentially limited by bus
  * bandwidth and how fast the network card/driver can set up receives and
  * transmits.
  *
  * We handle basic errors, IP header errors, checksum errors,
  * destination unreachable, fragmentation and fragmentation needed and
  * report them via ICMP to the sender.
  *
  * Else if something is not pure IPv4 unicast forwarding we fall back to
  * the normal ip_input processing path. We should only be called from
  * interfaces connected to the outside world.
  *
  * Firewalling is fully supported including divert, ipfw fwd and ipfilter
  * ipnat and address rewrite.
  *
  * IPSEC is not supported if this host is a tunnel broker. IPSEC is
  * supported for connections to/from local host.
  *
  * We try to do the least expensive (in CPU ops) checks and operations
  * first to catch junk with as little overhead as possible.
  * 
  * We take full advantage of hardware support for IP checksum and
  * fragmentation offloading.
  *
  * We don't do ICMP redirect in the fast forwarding path. I have had my own
  * cases where two core routers with Zebra routing suite would send millions
  * ICMP redirects to connected hosts if the destination router was not the
  * default gateway. In one case it was filling the routing table of a host
  * with approximately 300.000 cloned redirect entries until it ran out of
  * kernel memory. However the networking code proved very robust and it didn't
  * crash or fail in other ways.
  */
 
 /*
  * Many thanks to Matt Thomas of NetBSD for basic structure of ip_flow.c which
  * is being followed here.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipfw.h"
 #include "opt_ipstealth.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/pfil.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_options.h>
 
 #include <machine/in_cksum.h>
 
 static VNET_DEFINE(int, ipfastforward_active);
 #define	V_ipfastforward_active		VNET(ipfastforward_active)
 
 SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW,
     &VNET_NAME(ipfastforward_active), 0, "Enable fast IP forwarding");
 
 static struct sockaddr_in *
 ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m)
 {
 	struct sockaddr_in *dst;
 	struct rtentry *rt;
 
 	/*
 	 * Find route to destination.
 	 */
 	bzero(ro, sizeof(*ro));
 	dst = (struct sockaddr_in *)&ro->ro_dst;
 	dst->sin_family = AF_INET;
 	dst->sin_len = sizeof(*dst);
 	dst->sin_addr.s_addr = dest.s_addr;
 	in_rtalloc_ign(ro, 0, M_GETFIB(m));
 
 	/*
 	 * Route there and interface still up?
 	 */
 	rt = ro->ro_rt;
 	if (rt && (rt->rt_flags & RTF_UP) &&
 	    (rt->rt_ifp->if_flags & IFF_UP) &&
 	    (rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 		if (rt->rt_flags & RTF_GATEWAY)
 			dst = (struct sockaddr_in *)rt->rt_gateway;
 	} else {
 		IPSTAT_INC(ips_noroute);
 		IPSTAT_INC(ips_cantforward);
 		if (rt)
 			RTFREE(rt);
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		return NULL;
 	}
 	return dst;
 }
 
 /*
  * Try to forward a packet based on the destination address.
  * This is a fast path optimized for the plain forwarding case.
  * If the packet is handled (and consumed) here then we return NULL;
  * otherwise mbuf is returned and the packet should be delivered
  * to ip_input for full processing.
  */
 struct mbuf *
 ip_fastforward(struct mbuf *m)
 {
 	struct ip *ip;
 	struct mbuf *m0 = NULL;
 	struct route ro;
 	struct sockaddr_in *dst = NULL;
 	struct ifnet *ifp;
 	struct in_addr odest, dest;
 	uint16_t sum, ip_len, ip_off;
 	int error = 0;
 	int hlen, mtu;
 	struct m_tag *fwd_tag = NULL;
 
 	/*
 	 * Are we active and forwarding packets?
 	 */
 	if (!V_ipfastforward_active || !V_ipforwarding)
 		return m;
 
 	M_ASSERTVALID(m);
 	M_ASSERTPKTHDR(m);
 
 	bzero(&ro, sizeof(ro));
 
 	/*
 	 * Step 1: check for packet drop conditions (and sanity checks)
 	 */
 
 	/*
 	 * Is entire packet big enough?
 	 */
 	if (m->m_pkthdr.len < sizeof(struct ip)) {
 		IPSTAT_INC(ips_tooshort);
 		goto drop;
 	}
 
 	/*
 	 * Is first mbuf large enough for ip header and is header present?
 	 */
 	if (m->m_len < sizeof (struct ip) &&
 	   (m = m_pullup(m, sizeof (struct ip))) == NULL) {
 		IPSTAT_INC(ips_toosmall);
 		return NULL;	/* mbuf already free'd */
 	}
 
 	ip = mtod(m, struct ip *);
 
 	/*
 	 * Is it IPv4?
 	 */
 	if (ip->ip_v != IPVERSION) {
 		IPSTAT_INC(ips_badvers);
 		goto drop;
 	}
 
 	/*
 	 * Is IP header length correct and is it in first mbuf?
 	 */
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip)) {	/* minimum header length */
 		IPSTAT_INC(ips_badhlen);
 		goto drop;
 	}
 	if (hlen > m->m_len) {
 		if ((m = m_pullup(m, hlen)) == NULL) {
 			IPSTAT_INC(ips_badhlen);
 			return NULL;	/* mbuf already free'd */
 		}
 		ip = mtod(m, struct ip *);
 	}
 
 	/*
 	 * Checksum correct?
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED)
 		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
 	else {
 		if (hlen == sizeof(struct ip))
 			sum = in_cksum_hdr(ip);
 		else
 			sum = in_cksum(m, hlen);
 	}
 	if (sum) {
 		IPSTAT_INC(ips_badsum);
 		goto drop;
 	}
 
 	/*
 	 * Remember that we have checked the IP header and found it valid.
 	 */
 	m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID);
 
 	ip_len = ntohs(ip->ip_len);
 
 	/*
 	 * Is IP length longer than packet we have got?
 	 */
 	if (m->m_pkthdr.len < ip_len) {
 		IPSTAT_INC(ips_tooshort);
 		goto drop;
 	}
 
 	/*
 	 * Is packet longer than IP header tells us? If yes, truncate packet.
 	 */
 	if (m->m_pkthdr.len > ip_len) {
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = ip_len;
 			m->m_pkthdr.len = ip_len;
 		} else
 			m_adj(m, ip_len - m->m_pkthdr.len);
 	}
 
 	/*
 	 * Is packet from or to 127/8?
 	 */
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		IPSTAT_INC(ips_badaddr);
 		goto drop;
 	}
 
 #ifdef ALTQ
 	/*
 	 * Is packet dropped by traffic conditioner?
 	 */
 	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
 		goto drop;
 #endif
 
 	/*
 	 * Step 2: fallback conditions to normal ip_input path processing
 	 */
 
 	/*
 	 * Only IP packets without options
 	 */
 	if (ip->ip_hl != (sizeof(struct ip) >> 2)) {
 		if (ip_doopts == 1)
 			return m;
 		else if (ip_doopts == 2) {
 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB,
 				0, 0);
 			return NULL;	/* mbuf already free'd */
 		}
 		/* else ignore IP options and continue */
 	}
 
 	/*
 	 * Only unicast IP, not from loopback, no L2 or IP broadcast,
 	 * no multicast, no INADDR_ANY
 	 *
 	 * XXX: Probably some of these checks could be direct drop
 	 * conditions.  However it is not clear whether there are some
 	 * hacks or obscure behaviours which make it neccessary to
 	 * let ip_input handle it.  We play safe here and let ip_input
 	 * deal with it until it is proven that we can directly drop it.
 	 */
 	if ((m->m_flags & (M_BCAST|M_MCAST)) ||
 	    (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
 	    ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST ||
 	    ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST ||
 	    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 	    IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 	    IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
 	    IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
 	    ip->ip_src.s_addr == INADDR_ANY ||
 	    ip->ip_dst.s_addr == INADDR_ANY )
 		return m;
 
 	/*
 	 * Is it for a local address on this host?
 	 */
 	if (in_localip(ip->ip_dst))
 		return m;
 
 	IPSTAT_INC(ips_total);
 
 	/*
 	 * Step 3: incoming packet firewall processing
 	 */
 
 	odest.s_addr = dest.s_addr = ip->ip_dst.s_addr;
 
 	/*
 	 * Run through list of ipfilter hooks for input packets
 	 */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passin;
 
 	if (pfil_run_hooks(
 	    &V_inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) ||
 	    m == NULL)
 		goto drop;
 
 	M_ASSERTVALID(m);
 	M_ASSERTPKTHDR(m);
 
 	ip = mtod(m, struct ip *);	/* m may have changed by pfil hook */
 	dest.s_addr = ip->ip_dst.s_addr;
 
 	/*
 	 * Destination address changed?
 	 */
 	if (odest.s_addr != dest.s_addr) {
 		/*
 		 * Is it now for a local address on this host?
 		 */
 		if (in_localip(dest))
 			goto forwardlocal;
 		/*
 		 * Go on with new destination address
 		 */
 	}
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		/*
 		 * ipfw changed it for a local address on this host.
 		 */
 		goto forwardlocal;
 	}
 
 passin:
 	/*
 	 * Step 4: decrement TTL and look up route
 	 */
 
 	/*
 	 * Check TTL
 	 */
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 	if (ip->ip_ttl <= IPTTLDEC) {
 		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
 		return NULL;	/* mbuf already free'd */
 	}
 
 	/*
 	 * Decrement the TTL and incrementally change the IP header checksum.
 	 * Don't bother doing this with hw checksum offloading, it's faster
 	 * doing it right here.
 	 */
 	ip->ip_ttl -= IPTTLDEC;
 	if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8))
 		ip->ip_sum -= ~htons(IPTTLDEC << 8);
 	else
 		ip->ip_sum += htons(IPTTLDEC << 8);
 #ifdef IPSTEALTH
 	}
 #endif
 
 	/*
 	 * Find route to destination.
 	 */
 	if ((dst = ip_findroute(&ro, dest, m)) == NULL)
 		return NULL;	/* icmp unreach already sent */
 	ifp = ro.ro_rt->rt_ifp;
 
 	/*
 	 * Immediately drop blackholed traffic, and directed broadcasts
 	 * for either the all-ones or all-zero subnet addresses on
 	 * locally attached networks.
 	 */
 	if ((ro.ro_rt->rt_flags & (RTF_BLACKHOLE|RTF_BROADCAST)) != 0)
 		goto drop;
 
 	/*
 	 * Step 5: outgoing firewall packet processing
 	 */
 
 	/*
 	 * Run through list of hooks for output packets.
 	 */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passout;
 
 	if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, NULL) || m == NULL) {
 		goto drop;
 	}
 
 	M_ASSERTVALID(m);
 	M_ASSERTPKTHDR(m);
 
 	ip = mtod(m, struct ip *);
 	dest.s_addr = ip->ip_dst.s_addr;
 
 	/*
 	 * Destination address changed?
 	 */
 	if (m->m_flags & M_IP_NEXTHOP)
 		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 	if (odest.s_addr != dest.s_addr || fwd_tag != NULL) {
 		/*
 		 * Is it now for a local address on this host?
 		 */
 		if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) {
 forwardlocal:
 			/*
 			 * Return packet for processing by ip_input().
 			 */
 			m->m_flags |= M_FASTFWD_OURS;
 			if (ro.ro_rt)
 				RTFREE(ro.ro_rt);
 			return m;
 		}
 		/*
 		 * Redo route lookup with new destination address
 		 */
 		if (fwd_tag) {
 			dest.s_addr = ((struct sockaddr_in *)
 				    (fwd_tag + 1))->sin_addr.s_addr;
 			m_tag_delete(m, fwd_tag);
 			m->m_flags &= ~M_IP_NEXTHOP;
 		}
 		RTFREE(ro.ro_rt);
 		if ((dst = ip_findroute(&ro, dest, m)) == NULL)
 			return NULL;	/* icmp unreach already sent */
 		ifp = ro.ro_rt->rt_ifp;
 	}
 
 passout:
 	/*
 	 * Step 6: send off the packet
 	 */
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	/*
 	 * Check if route is dampned (when ARP is unable to resolve)
 	 */
 	if ((ro.ro_rt->rt_flags & RTF_REJECT) &&
 	    (ro.ro_rt->rt_rmx.rmx_expire == 0 ||
 	    time_uptime < ro.ro_rt->rt_rmx.rmx_expire)) {
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		goto consumed;
 	}
 
 #ifndef ALTQ
 	/*
 	 * Check if there is enough space in the interface queue
 	 */
 	if ((ifp->if_snd.ifq_len + ip_len / ifp->if_mtu + 1) >=
 	    ifp->if_snd.ifq_maxlen) {
 		IPSTAT_INC(ips_odropped);
 		/* would send source quench here but that is depreciated */
 		goto drop;
 	}
 #endif
 
 	/*
 	 * Check if media link state of interface is not down
 	 */
 	if (ifp->if_link_state == LINK_STATE_DOWN) {
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		goto consumed;
 	}
 
 	/*
 	 * Check if packet fits MTU or if hardware will fragment for us
 	 */
 	if (ro.ro_rt->rt_rmx.rmx_mtu)
 		mtu = min(ro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
 	else
 		mtu = ifp->if_mtu;
 
 	if (ip_len <= mtu ||
 	    (ifp->if_hwassist & CSUM_FRAGMENT && (ip_off & IP_DF) == 0)) {
 		/*
 		 * Avoid confusing lower layers.
 		 */
 		m_clrprotoflags(m);
 		/*
 		 * Send off the packet via outgoing interface
 		 */
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 		error = (*ifp->if_output)(ifp, m,
 				(struct sockaddr *)dst, &ro);
 	} else {
 		/*
 		 * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery
 		 */
 		if (ip_off & IP_DF) {
 			IPSTAT_INC(ips_cantfrag);
 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
 				0, mtu);
 			goto consumed;
 		} else {
 			/*
 			 * We have to fragment the packet
 			 */
 			m->m_pkthdr.csum_flags |= CSUM_IP;
 			if (ip_fragment(ip, &m, mtu, ifp->if_hwassist))
 				goto drop;
 			KASSERT(m != NULL, ("null mbuf and no error"));
 			/*
 			 * Send off the fragments via outgoing interface
 			 */
 			error = 0;
 			do {
 				m0 = m->m_nextpkt;
 				m->m_nextpkt = NULL;
 				/*
 				 * Avoid confusing lower layers.
 				 */
 				m_clrprotoflags(m);
 
 				IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 				error = (*ifp->if_output)(ifp, m,
 					(struct sockaddr *)dst, &ro);
 				if (error)
 					break;
 			} while ((m = m0) != NULL);
 			if (error) {
 				/* Reclaim remaining fragments */
 				for (m = m0; m; m = m0) {
 					m0 = m->m_nextpkt;
 					m_freem(m);
 				}
 			} else
 				IPSTAT_INC(ips_fragmented);
 		}
 	}
 
 	if (error != 0)
 		IPSTAT_INC(ips_odropped);
 	else {
 		ro.ro_rt->rt_rmx.rmx_pksent++;
 		IPSTAT_INC(ips_forward);
 		IPSTAT_INC(ips_fastforward);
 	}
 consumed:
 	RTFREE(ro.ro_rt);
 	return NULL;
 drop:
 	if (m)
 		m_freem(m);
 	if (ro.ro_rt)
 		RTFREE(ro.ro_rt);
 	return NULL;
 }
Index: head/sys/netinet/ip_input.c
===================================================================
--- head/sys/netinet/ip_input.c	(revision 258540)
+++ head/sys/netinet/ip_input.c	(revision 258541)
@@ -1,1769 +1,1768 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bootp.h"
 #include "opt_ipfw.h"
 #include "opt_ipstealth.h"
 #include "opt_ipsec.h"
-#include "opt_kdtrace.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 
 #include <net/pfil.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 #include <net/flowtable.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_options.h>
 #include <machine/in_cksum.h>
 #include <netinet/ip_carp.h>
 #ifdef IPSEC
 #include <netinet/ip_ipsec.h>
 #endif /* IPSEC */
 
 #include <sys/socketvar.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof(struct ip) == 20);
 #endif
 
 struct	rwlock in_ifaddr_lock;
 RW_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock");
 
 VNET_DEFINE(int, rsvp_on);
 
 VNET_DEFINE(int, ipforwarding);
 SYSCTL_VNET_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW,
     &VNET_NAME(ipforwarding), 0,
     "Enable IP forwarding between interfaces");
 
 static VNET_DEFINE(int, ipsendredirects) = 1;	/* XXX */
 #define	V_ipsendredirects	VNET(ipsendredirects)
 SYSCTL_VNET_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
     &VNET_NAME(ipsendredirects), 0,
     "Enable sending IP redirects");
 
 static VNET_DEFINE(int, ip_keepfaith);
 #define	V_ip_keepfaith		VNET(ip_keepfaith)
 SYSCTL_VNET_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW,
     &VNET_NAME(ip_keepfaith), 0,
     "Enable packet capture for FAITH IPv4->IPv6 translater daemon");
 
 static VNET_DEFINE(int, ip_sendsourcequench);
 #define	V_ip_sendsourcequench	VNET(ip_sendsourcequench)
 SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW,
     &VNET_NAME(ip_sendsourcequench), 0,
     "Enable the transmission of source quench packets");
 
 VNET_DEFINE(int, ip_do_randomid);
 SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW,
     &VNET_NAME(ip_do_randomid), 0,
     "Assign random ip_id values");
 
 /*
  * XXX - Setting ip_checkinterface mostly implements the receive side of
  * the Strong ES model described in RFC 1122, but since the routing table
  * and transmit implementation do not implement the Strong ES model,
  * setting this to 1 results in an odd hybrid.
  *
  * XXX - ip_checkinterface currently must be disabled if you use ipnat
  * to translate the destination address to another local interface.
  *
  * XXX - ip_checkinterface must be disabled if you add IP aliases
  * to the loopback interface instead of the interface where the
  * packets for those addresses are received.
  */
 static VNET_DEFINE(int, ip_checkinterface);
 #define	V_ip_checkinterface	VNET(ip_checkinterface)
 SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW,
     &VNET_NAME(ip_checkinterface), 0,
     "Verify packet arrives on correct interface");
 
 VNET_DEFINE(struct pfil_head, inet_pfil_hook);	/* Packet filter hooks */
 
 static struct netisr_handler ip_nh = {
 	.nh_name = "ip",
 	.nh_handler = ip_input,
 	.nh_proto = NETISR_IP,
 	.nh_policy = NETISR_POLICY_FLOW,
 };
 
 extern	struct domain inetdomain;
 extern	struct protosw inetsw[];
 u_char	ip_protox[IPPROTO_MAX];
 VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead);  /* first inet address */
 VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table  */
 VNET_DEFINE(u_long, in_ifaddrhmask);		/* mask for hash table */
 
 static VNET_DEFINE(uma_zone_t, ipq_zone);
 static VNET_DEFINE(TAILQ_HEAD(ipqhead, ipq), ipq[IPREASS_NHASH]);
 static struct mtx ipqlock;
 
 #define	V_ipq_zone		VNET(ipq_zone)
 #define	V_ipq			VNET(ipq)
 
 #define	IPQ_LOCK()	mtx_lock(&ipqlock)
 #define	IPQ_UNLOCK()	mtx_unlock(&ipqlock)
 #define	IPQ_LOCK_INIT()	mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF)
 #define	IPQ_LOCK_ASSERT()	mtx_assert(&ipqlock, MA_OWNED)
 
 static void	maxnipq_update(void);
 static void	ipq_zone_change(void *);
 static void	ip_drain_locked(void);
 
 static VNET_DEFINE(int, maxnipq);  /* Administrative limit on # reass queues. */
 static VNET_DEFINE(int, nipq);			/* Total # of reass queues */
 #define	V_maxnipq		VNET(maxnipq)
 #define	V_nipq			VNET(nipq)
 SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD,
     &VNET_NAME(nipq), 0,
     "Current number of IPv4 fragment reassembly queue entries");
 
 static VNET_DEFINE(int, maxfragsperpacket);
 #define	V_maxfragsperpacket	VNET(maxfragsperpacket)
 SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW,
     &VNET_NAME(maxfragsperpacket), 0,
     "Maximum number of IPv4 fragments allowed per packet");
 
 #ifdef IPCTL_DEFMTU
 SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
     &ip_mtu, 0, "Default MTU");
 #endif
 
 #ifdef IPSTEALTH
 VNET_DEFINE(int, ipstealth);
 SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
     &VNET_NAME(ipstealth), 0,
     "IP stealth mode, no TTL decrementation on forwarding");
 #endif
 
 #ifdef FLOWTABLE
 static VNET_DEFINE(int, ip_output_flowtable_size) = 2048;
 VNET_DEFINE(struct flowtable *, ip_ft);
 #define	V_ip_output_flowtable_size	VNET(ip_output_flowtable_size)
 
 SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN,
     &VNET_NAME(ip_output_flowtable_size), 2048,
     "number of entries in the per-cpu output flow caches");
 #endif
 
 static void	ip_freef(struct ipqhead *, struct ipq *);
 
 /*
  * IP statistics are stored in the "array" of counter(9)s.
  */
 VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat);
 VNET_PCPUSTAT_SYSINIT(ipstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat,
     "IP statistics (struct ipstat, netinet/ip_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ipstat);
 #endif /* VIMAGE */
 
 /*
  * Kernel module interface for updating ipstat.  The argument is an index
  * into ipstat treated as an array.
  */
 void
 kmod_ipstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], 1);
 }
 
 void
 kmod_ipstat_dec(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], -1);
 }
 
 static int
 sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I",
     "Maximum size of the IP input queue");
 
 static int
 sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t qdrops_long;
 	int error, qdrops;
 
 	netisr_getqdrops(&ip_nh, &qdrops_long);
 	qdrops = qdrops_long;
 	error = sysctl_handle_int(oidp, &qdrops, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qdrops != 0)
 		return (EINVAL);
 	netisr_clearqdrops(&ip_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops,
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I",
     "Number of packets dropped from the IP input queue");
 
 /*
  * IP initialization: fill in IP protocol switch table.
  * All protocols not implemented in kernel go to raw IP protocol handler.
  */
 void
 ip_init(void)
 {
 	struct protosw *pr;
 	int i;
 
 	V_ip_id = time_second & 0xffff;
 
 	TAILQ_INIT(&V_in_ifaddrhead);
 	V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
 
 	/* Initialize IP reassembly queue. */
 	for (i = 0; i < IPREASS_NHASH; i++)
 		TAILQ_INIT(&V_ipq[i]);
 	V_maxnipq = nmbclusters / 32;
 	V_maxfragsperpacket = 16;
 	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
 	    NULL, UMA_ALIGN_PTR, 0);
 	maxnipq_update();
 
 	/* Initialize packet filter hooks. */
 	V_inet_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_inet_pfil_hook.ph_af = AF_INET;
 	if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil hook, "
 			"error %d\n", __func__, i);
 
 #ifdef FLOWTABLE
 	if (TUNABLE_INT_FETCH("net.inet.ip.output_flowtable_size",
 		&V_ip_output_flowtable_size)) {
 		if (V_ip_output_flowtable_size < 256)
 			V_ip_output_flowtable_size = 256;
 		if (!powerof2(V_ip_output_flowtable_size)) {
 			printf("flowtable must be power of 2 size\n");
 			V_ip_output_flowtable_size = 2048;
 		}
 	} else {
 		/*
 		 * round up to the next power of 2
 		 */
 		V_ip_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1);
 	}
 	V_ip_ft = flowtable_alloc("ipv4", V_ip_output_flowtable_size, FL_PCPU);
 #endif
 
 	/* Skip initialization of globals for non-default instances. */
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		panic("ip_init: PF_INET not found");
 
 	/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
 	for (i = 0; i < IPPROTO_MAX; i++)
 		ip_protox[i] = pr - inetsw;
 	/*
 	 * Cycle through IP protocols and put them into the appropriate place
 	 * in ip_protox[].
 	 */
 	for (pr = inetdomain.dom_protosw;
 	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
 		if (pr->pr_domain->dom_family == PF_INET &&
 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
 			/* Be careful to only index valid IP protocols. */
 			if (pr->pr_protocol < IPPROTO_MAX)
 				ip_protox[pr->pr_protocol] = pr - inetsw;
 		}
 
 	EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change,
 		NULL, EVENTHANDLER_PRI_ANY);
 
 	/* Initialize various other remaining things. */
 	IPQ_LOCK_INIT();
 	netisr_register(&ip_nh);
 }
 
 #ifdef VIMAGE
 /*
  * Undo the per-instance part of ip_init(): unregister the pfil head,
  * destroy the interface address hash, drain the reassembly queues under
  * the reassembly lock and destroy the reassembly zone.
  */
 void
 ip_destroy(void)
 {
 	int error;
 
 	error = pfil_head_unregister(&V_inet_pfil_hook);
 	if (error != 0)
 		printf("%s: WARNING: unable to unregister pfil hook, "
 		    "error %d\n", __func__, error);
 
 	/* The in_ifaddr hash table is expected to be empty by now. */
 	hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask);
 
 	/* Release anything still waiting for reassembly before the zone goes. */
 	IPQ_LOCK();
 	ip_drain_locked();
 	IPQ_UNLOCK();
 
 	uma_zdestroy(V_ipq_zone);
 }
 #endif
 
 /*
  * Ip input routine.  Checksum and byte swap header.  If fragmented
  * try to reassemble.  Process options.  Pass to next level.
  *
  * In order, this function:
  *  - validates the header: packet and header length, IP version, 127/8
  *    addresses on non-loopback interfaces, header checksum, total length;
  *  - trims the mbuf chain down to the length given in the IP header;
  *  - runs the AF_INET pfil input hooks (skipped for packets previously
  *    handled by IPsec, or when no hooks are active);
  *  - processes IP options;
  *  - decides whether the packet is for this host (label "ours") or has to
  *    be forwarded or dropped.
  *
  * Packets for this host are reassembled if they are fragments and then
  * handed to inetsw[ip_protox[ip->ip_p]].pr_input with the header length.
  *
  * m must carry a packet header (M_ASSERTPKTHDR).  Nothing is returned;
  * the "bad" label frees the mbuf.
  */
 void
 ip_input(struct mbuf *m)
 {
 	struct ip *ip = NULL;
 	struct in_ifaddr *ia = NULL;
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	int    checkif, hlen = 0;
 	uint16_t sum, ip_len;
 	int dchg = 0;				/* dest changed after fw */
 	struct in_addr odst;			/* original dst address */
 
 	M_ASSERTPKTHDR(m);
 
 	/*
 	 * A packet that arrives with M_FASTFWD_OURS set skips all of the
 	 * validation and filtering below and goes straight to local delivery.
 	 */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		/* Set up some basics that will be used later. */
 		ip = mtod(m, struct ip *);
 		hlen = ip->ip_hl << 2;
 		ip_len = ntohs(ip->ip_len);
 		goto ours;
 	}
 
 	IPSTAT_INC(ips_total);
 
 	if (m->m_pkthdr.len < sizeof(struct ip))
 		goto tooshort;
 
 	/* Make the fixed part of the header contiguous in the first mbuf. */
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
 		IPSTAT_INC(ips_toosmall);
 		return;
 	}
 	ip = mtod(m, struct ip *);
 
 	if (ip->ip_v != IPVERSION) {
 		IPSTAT_INC(ips_badvers);
 		goto bad;
 	}
 
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip)) {	/* minimum header length */
 		IPSTAT_INC(ips_badhlen);
 		goto bad;
 	}
 	/* Same again for the full header including options. */
 	if (hlen > m->m_len) {
 		if ((m = m_pullup(m, hlen)) == NULL) {
 			IPSTAT_INC(ips_badhlen);
 			return;
 		}
 		ip = mtod(m, struct ip *);
 	}
 
 	IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL);
 
 	/* 127/8 must not appear on wire - RFC1122 */
 	ifp = m->m_pkthdr.rcvif;
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		/*
 		 * NOTE(review): ifp is dereferenced here without a NULL
 		 * check, while code further down tests ifp != NULL.
 		 */
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			goto bad;
 		}
 	}
 
 	/*
 	 * Header checksum.  If csum_flags says the checksum was already
 	 * checked, sum is nonzero exactly when CSUM_IP_VALID is clear;
 	 * otherwise compute it here.  A nonzero sum means a bad header.
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
 		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
 	} else {
 		if (hlen == sizeof(struct ip)) {
 			sum = in_cksum_hdr(ip);
 		} else {
 			sum = in_cksum(m, hlen);
 		}
 	}
 	if (sum) {
 		IPSTAT_INC(ips_badsum);
 		goto bad;
 	}
 
 #ifdef ALTQ
 	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
 		/* packet is dropped by traffic conditioner */
 		return;
 #endif
 
 	ip_len = ntohs(ip->ip_len);
 	if (ip_len < hlen) {
 		IPSTAT_INC(ips_badlen);
 		goto bad;
 	}
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IP header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (m->m_pkthdr.len < ip_len) {
 		/* Also entered by goto from the first length check above. */
 tooshort:
 		IPSTAT_INC(ips_tooshort);
 		goto bad;
 	}
 	if (m->m_pkthdr.len > ip_len) {
 		if (m->m_len == m->m_pkthdr.len) {
 			/* Single mbuf: just shorten the lengths. */
 			m->m_len = ip_len;
 			m->m_pkthdr.len = ip_len;
 		} else
 			/* The count passed to m_adj() is negative here. */
 			m_adj(m, ip_len - m->m_pkthdr.len);
 	}
 #ifdef IPSEC
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (ip_ipsec_filtertunnel(m))
 		goto passin;
 #endif /* IPSEC */
 
 	/*
 	 * Run through list of hooks for input packets.
 	 *
 	 * NB: Beware of the destination address changing (e.g.
 	 *     by NAT rewriting).  When this happens, tell
 	 *     ip_forward to do the right thing.
 	 */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passin;
 
 	odst = ip->ip_dst;
 	if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0)
 		return;
 	if (m == NULL)			/* consumed by filter */
 		return;
 
 	/*
 	 * The hooks were given &m, so reload everything derived from the
 	 * mbuf.  Note that hlen and ip_len are not recomputed here.
 	 */
 	ip = mtod(m, struct ip *);
 	dchg = (odst.s_addr != ip->ip_dst.s_addr);
 	ifp = m->m_pkthdr.rcvif;
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		goto ours;
 	}
 	if (m->m_flags & M_IP_NEXTHOP) {
 		dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL);
 		if (dchg != 0) {
 			/*
 			 * Directly ship the packet on.  This allows
 			 * forwarding packets originally destined to us
 			 * to some other directly connected host.
 			 */
 			ip_forward(m, 1);
 			return;
 		}
 	}
 passin:
 
 	/*
 	 * Process options and, if not destined for us,
 	 * ship it on.  ip_dooptions returns 1 when an
 	 * error was detected (causing an icmp message
 	 * to be sent and the original packet to be freed).
 	 */
 	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
 		return;
 
 	/*
 	 * Greedy RSVP: snatches any PATH packet of the RSVP protocol, no
 	 * matter if it is destined to another node or whether it is a
 	 * multicast one.  RSVP wants it, and prevents it from being
 	 * forwarded anywhere else.  Also checks if the rsvp daemon is
 	 * running before grabbing the packet.
 	 */
 	if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) 
 		goto ours;
 
 	/*
 	 * Check our list of addresses, to see if the packet is for us.
 	 * If we don't have any addresses, assume any unicast packet
 	 * we receive might be for us (and let the upper layers deal
 	 * with it).
 	 */
 	if (TAILQ_EMPTY(&V_in_ifaddrhead) &&
 	    (m->m_flags & (M_MCAST|M_BCAST)) == 0)
 		goto ours;
 
 	/*
 	 * Enable a consistency check between the destination address
 	 * and the arrival interface for a unicast packet (the RFC 1122
 	 * strong ES model) if IP forwarding is disabled and the packet
 	 * is not locally generated and the packet is not subject to
 	 * 'ipfw fwd'.
 	 *
 	 * XXX - Checking also should be disabled if the destination
 	 * address is ipnat'ed to a different interface.
 	 *
 	 * XXX - Checking is incompatible with IP aliases added
 	 * to the loopback interface instead of the interface where
 	 * the packets are received.
 	 *
 	 * XXX - This is the case for carp vhost IPs as well so we
 	 * insert a workaround. If the packet got here, we already
 	 * checked with carp_iamatch() and carp_forus().
 	 */
 	checkif = V_ip_checkinterface && (V_ipforwarding == 0) && 
 	    ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) &&
 	    ifp->if_carp == NULL && (dchg == 0);
 
 	/*
 	 * Check for exact addresses in the hash bucket.
 	 *
 	 * NOTE(review): the IN_IFADDR_RLOCK/RUNLOCK calls around this walk
 	 * are commented out, so the bucket is traversed without that lock.
 	 */
 	/* IN_IFADDR_RLOCK(); */
 	LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
 		/*
 		 * If the address matches, verify that the packet
 		 * arrived via the correct interface if checking is
 		 * enabled.
 		 */
 		if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && 
 		    (!checkif || ia->ia_ifp == ifp)) {
 			/* Charge the packet to the matching address. */
 			counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 			counter_u64_add(ia->ia_ifa.ifa_ibytes,
 			    m->m_pkthdr.len);
 			/* IN_IFADDR_RUNLOCK(); */
 			goto ours;
 		}
 	}
 	/* IN_IFADDR_RUNLOCK(); */
 
 	/*
 	 * Check for broadcast addresses.
 	 *
 	 * Only accept broadcast packets that arrive via the matching
 	 * interface.  Reception of forwarded directed broadcasts would
 	 * be handled via ip_forward() and ether_output() with the loopback
 	 * into the stack for SIMPLEX interfaces handled by ether_output().
 	 */
 	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
 		IF_ADDR_RLOCK(ifp);
 	        TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
 			    ip->ip_dst.s_addr) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #ifdef BOOTP_COMPAT
 			/*
 			 * With BOOTP_COMPAT, an interface address of
 			 * INADDR_ANY accepts the packet as well.
 			 */
 			if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #endif
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		ia = NULL;
 	}
 	/* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */
 	if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		if (V_ip_mrouter) {
 			/*
 			 * If we are acting as a multicast router, all
 			 * incoming multicast packets are passed to the
 			 * kernel-level multicast forwarding function.
 			 * The packet is returned (relatively) intact; if
 			 * ip_mforward() returns a non-zero value, the packet
 			 * must be discarded, else it may be accepted below.
 			 */
 			if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) {
 				IPSTAT_INC(ips_cantforward);
 				m_freem(m);
 				return;
 			}
 
 			/*
 			 * The process-level routing daemon needs to receive
 			 * all multicast IGMP packets, whether or not this
 			 * host belongs to their destination groups.
 			 */
 			if (ip->ip_p == IPPROTO_IGMP)
 				goto ours;
 			IPSTAT_INC(ips_forward);
 		}
 		/*
 		 * Assume the packet is for us, to avoid prematurely taking
 		 * a lock on the in_multi hash. Protocols must perform
 		 * their own filtering and update statistics accordingly.
 		 */
 		goto ours;
 	}
 	/* Limited broadcast and the all-zeros address are accepted too. */
 	if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
 		goto ours;
 	if (ip->ip_dst.s_addr == INADDR_ANY)
 		goto ours;
 
 	/*
 	 * FAITH(Firewall Aided Internet Translator)
 	 *
 	 * On a FAITH interface only TCP and ICMP are accepted, and only
 	 * when V_ip_keepfaith is set; everything else is dropped here.
 	 */
 	if (ifp && ifp->if_type == IFT_FAITH) {
 		if (V_ip_keepfaith) {
 			if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) 
 				goto ours;
 		}
 		m_freem(m);
 		return;
 	}
 
 	/*
 	 * Not for us; forward if possible and desirable.
 	 */
 	if (V_ipforwarding == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 	} else {
 #ifdef IPSEC
 		if (ip_ipsec_fwd(m))
 			goto bad;
 #endif /* IPSEC */
 		ip_forward(m, dchg);
 	}
 	return;
 
 ours:
 #ifdef IPSTEALTH
 	/*
 	 * IPSTEALTH: Process non-routing options only
 	 * if the packet is destined for us.
 	 */
 	if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1))
 		return;
 #endif /* IPSTEALTH */
 
 	/*
 	 * Attempt reassembly; if it succeeds, proceed.
 	 * ip_reass() will return a different mbuf.
 	 * A NULL return means there is nothing to deliver yet.
 	 */
 	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
 		m = ip_reass(m);
 		if (m == NULL)
 			return;
 		ip = mtod(m, struct ip *);
 		/* Get the header length of the reassembled packet */
 		hlen = ip->ip_hl << 2;
 	}
 
 #ifdef IPSEC
 	/*
 	 * enforce IPsec policy checking if we are seeing last header.
 	 * note that we do not visit this with protocols with pcb layer
 	 * code - like udp/tcp/raw ip.
 	 */
 	if (ip_ipsec_input(m))
 		goto bad;
 #endif /* IPSEC */
 
 	/*
 	 * Switch out to protocol's input routine.
 	 */
 	IPSTAT_INC(ips_delivered);
 
 	(*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen);
 	return;
 	/* Common drop path: free the packet. */
 bad:
 	m_freem(m);
 }
 
 /*
  * Push the current V_maxnipq down into the UMA zone behind the reassembly
  * queue headers.  For historical reasons the zone maximum is not expressed
  * the same way as the sysctl value:
  *
  *	V_maxnipq < 0	unlimited allocation; the zone maximum is set to 0
  *	V_maxnipq > 0	specific bound; the zone maximum is set to it
  *	V_maxnipq == 0	no further queue allocation; the zone maximum is set
  *			very low (1) and the implementation elsewhere is
  *			relied on to actually prevent allocation and reclaim
  *			current queues
  */
 static void
 maxnipq_update(void)
 {
 
 	if (V_maxnipq < 0)
 		uma_zone_set_max(V_ipq_zone, 0);
 	else if (V_maxnipq > 0)
 		uma_zone_set_max(V_ipq_zone, V_maxnipq);
 	else
 		uma_zone_set_max(V_ipq_zone, 1);
 }
 
 /*
  * Handler for the nmbclusters_change event (the tag argument is unused).
  * A positive V_maxnipq that is below nmbclusters / 32 is raised to that
  * value and propagated to the zone; a zero, negative or already large
  * enough limit is left untouched.
  */
 static void
 ipq_zone_change(void *tag)
 {
 
 	if (V_maxnipq <= 0 || V_maxnipq >= (nmbclusters / 32))
 		return;
 
 	V_maxnipq = nmbclusters / 32;
 	maxnipq_update();
 }
 
 static int
 sysctl_maxnipq(SYSCTL_HANDLER_ARGS)
 {
 	int error, i;
 
 	i = V_maxnipq;
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	/*
 	 * XXXRW: Might be a good idea to sanity check the argument and place
 	 * an extreme upper bound.
 	 */
 	if (i < -1)
 		return (EINVAL);
 	V_maxnipq = i;
 	maxnipq_update();
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW,
     NULL, 0, sysctl_maxnipq, "I",
     "Maximum number of IPv4 fragment reassembly queue entries");
 
 /*
  * Take incoming datagram fragment and try to reassemble it into
  * whole datagram.  If the argument is the first fragment or one
  * in between the function will return NULL and store the mbuf
  * in the fragment chain.  If the argument is the last fragment
  * the packet will be reassembled and the pointer to the new
  * mbuf returned for further processing.  Only m_tags attached
  * to the first packet/fragment are preserved.
  * The IP header is *NOT* adjusted out of iplen.
  *
  * NULL is also returned when the fragment is dropped.  The reassembly
  * lock (IPQ_LOCK) is taken here and released on every return path after
  * that point.
  */
 struct mbuf *
 ip_reass(struct mbuf *m)
 {
 	struct ip *ip;
 	struct mbuf *p, *q, *nq, *t;
 	struct ipq *fp = NULL;
 	struct ipqhead *head;
 	int i, hlen, next;
 	u_int8_t ecn, ecn0;
 	u_short hash;
 
 	/* If maxnipq or maxfragsperpacket are 0, never accept fragments. */
 	if (V_maxnipq == 0 || V_maxfragsperpacket == 0) {
 		IPSTAT_INC(ips_fragments);
 		IPSTAT_INC(ips_fragdropped);
 		m_freem(m);
 		return (NULL);
 	}
 
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 
 	/* The bucket is selected by source address and IP id. */
 	hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
 	head = &V_ipq[hash];
 	IPQ_LOCK();
 
 	/*
 	 * Look for queue of fragments
 	 * of this datagram.
 	 */
 	TAILQ_FOREACH(fp, head, ipq_list)
 		if (ip->ip_id == fp->ipq_id &&
 		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
 		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
 #ifdef MAC
 		    mac_ipq_match(m, fp) &&
 #endif
 		    ip->ip_p == fp->ipq_p)
 			goto found;
 
 	/* No queue matched; fp == NULL marks that for the code below. */
 	fp = NULL;
 
 	/*
 	 * Attempt to trim the number of allocated fragment queues if it
 	 * exceeds the administrative limit.
 	 */
 	if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) {
 		/*
 		 * drop something from the tail of the current queue
 		 * before proceeding further
 		 *
 		 * Note: this block-local q (a struct ipq *) shadows the
 		 * function-scope mbuf pointer of the same name.
 		 */
 		struct ipq *q = TAILQ_LAST(head, ipqhead);
 		if (q == NULL) {   /* gak */
 			/*
 			 * This bucket is empty: free the tail entry of the
 			 * first non-empty bucket instead.
 			 */
 			for (i = 0; i < IPREASS_NHASH; i++) {
 				struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead);
 				if (r) {
 					IPSTAT_ADD(ips_fragtimeout,
 					    r->ipq_nfrags);
 					ip_freef(&V_ipq[i], r);
 					break;
 				}
 			}
 		} else {
 			IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags);
 			ip_freef(head, q);
 		}
 	}
 
 	/* Reached with fp set (queue matched) or fp == NULL (no queue yet). */
 found:
 	/*
 	 * Adjust ip_len to not reflect header,
 	 * convert offset of this to bytes.
 	 */
 	ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
 	if (ip->ip_off & htons(IP_MF)) {
 		/*
 		 * Make sure that fragments have a data length
 		 * that's a non-zero multiple of 8 bytes.
 		 */
 		if (ip->ip_len == htons(0) || (ntohs(ip->ip_len) & 0x7) != 0) {
 			IPSTAT_INC(ips_toosmall); /* XXX */
 			goto dropfrag;
 		}
 		/* M_IP_FRAG on the mbuf records "more fragments follow". */
 		m->m_flags |= M_IP_FRAG;
 	} else
 		m->m_flags &= ~M_IP_FRAG;
 	ip->ip_off = htons(ntohs(ip->ip_off) << 3);
 
 	/*
 	 * Count the fragment and remember where its IP header is in the
 	 * mbuf packet header, so that GETIP() below can still find it once
 	 * the header has been hidden from the mbuf data.
 	 */
 	IPSTAT_INC(ips_fragments);
 	m->m_pkthdr.PH_loc.ptr = ip;
 
 	/* Previous ip_reass() started here. */
 	/*
 	 * Presence of header sizes in mbufs
 	 * would confuse code below.
 	 */
 	m->m_data += hlen;
 	m->m_len -= hlen;
 
 	/*
 	 * If first fragment to arrive, create a reassembly queue.
 	 */
 	if (fp == NULL) {
 		fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
 		if (fp == NULL)
 			goto dropfrag;
 #ifdef MAC
 		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
 			uma_zfree(V_ipq_zone, fp);
 			fp = NULL;
 			goto dropfrag;
 		}
 		mac_ipq_create(m, fp);
 #endif
 		TAILQ_INSERT_HEAD(head, fp, ipq_list);
 		V_nipq++;
 		fp->ipq_nfrags = 1;
 		fp->ipq_ttl = IPFRAGTTL;
 		fp->ipq_p = ip->ip_p;
 		fp->ipq_id = ip->ip_id;
 		fp->ipq_src = ip->ip_src;
 		fp->ipq_dst = ip->ip_dst;
 		fp->ipq_frags = m;
 		m->m_nextpkt = NULL;
 		goto done;
 	} else {
 		fp->ipq_nfrags++;
 #ifdef MAC
 		mac_ipq_update(m, fp);
 #endif
 	}
 
 	/* IP header of a queued fragment, as stashed in PH_loc.ptr above. */
 #define GETIP(m)	((struct ip*)((m)->m_pkthdr.PH_loc.ptr))
 
 	/*
 	 * Handle ECN by comparing this segment with the first one;
 	 * if CE is set, do not lose CE.
 	 * drop if CE and not-ECT are mixed for the same packet.
 	 */
 	ecn = ip->ip_tos & IPTOS_ECN_MASK;
 	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
 	if (ecn == IPTOS_ECN_CE) {
 		if (ecn0 == IPTOS_ECN_NOTECT)
 			goto dropfrag;
 		if (ecn0 != IPTOS_ECN_CE)
 			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
 	}
 	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
 		goto dropfrag;
 
 	/*
 	 * Find a segment which begins after this one does.
 	 * Afterwards p is the segment before the insertion point (or NULL)
 	 * and q the one after it (or NULL).
 	 */
 	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
 		if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
 			break;
 
 	/*
 	 * If there is a preceding segment, it may provide some of
 	 * our data already.  If so, drop the data from the incoming
 	 * segment.  If it provides all of our data, drop us, otherwise
 	 * stick new segment in the proper place.
 	 *
 	 * If some of the data is dropped from the preceding
 	 * segment, then it's checksum is invalidated.
 	 */
 	if (p) {
 		/* i = number of bytes at our front already covered by p. */
 		i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
 		    ntohs(ip->ip_off);
 		if (i > 0) {
 			if (i >= ntohs(ip->ip_len))
 				goto dropfrag;
 			m_adj(m, i);
 			m->m_pkthdr.csum_flags = 0;
 			ip->ip_off = htons(ntohs(ip->ip_off) + i);
 			ip->ip_len = htons(ntohs(ip->ip_len) - i);
 		}
 		m->m_nextpkt = p->m_nextpkt;
 		p->m_nextpkt = m;
 	} else {
 		m->m_nextpkt = fp->ipq_frags;
 		fp->ipq_frags = m;
 	}
 
 	/*
 	 * While we overlap succeeding segments trim them or,
 	 * if they are completely covered, dequeue them.
 	 */
 	for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
 	    ntohs(GETIP(q)->ip_off); q = nq) {
 		/* i = number of bytes at the front of q that we cover. */
 		i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
 		    ntohs(GETIP(q)->ip_off);
 		if (i < ntohs(GETIP(q)->ip_len)) {
 			/* Partial overlap: trim q and stop. */
 			GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
 			GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
 			m_adj(q, i);
 			q->m_pkthdr.csum_flags = 0;
 			break;
 		}
 		/* q is completely covered: unlink and free it. */
 		nq = q->m_nextpkt;
 		m->m_nextpkt = nq;
 		IPSTAT_INC(ips_fragdropped);
 		fp->ipq_nfrags--;
 		m_freem(q);
 	}
 
 	/*
 	 * Check for complete reassembly and perform frag per packet
 	 * limiting.
 	 *
 	 * Frag limiting is performed here so that the nth frag has
 	 * a chance to complete the packet before we drop the packet.
 	 * As a result, n+1 frags are actually allowed per packet, but
 	 * only n will ever be stored. (n = maxfragsperpacket.)
 	 *
 	 */
 	next = 0;
 	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
 		/* A gap before q: the datagram is not complete yet. */
 		if (ntohs(GETIP(q)->ip_off) != next) {
 			if (fp->ipq_nfrags > V_maxfragsperpacket) {
 				IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
 				ip_freef(head, fp);
 			}
 			goto done;
 		}
 		next += ntohs(GETIP(q)->ip_len);
 	}
 	/* Make sure the last packet didn't have the IP_MF flag */
 	if (p->m_flags & M_IP_FRAG) {
 		if (fp->ipq_nfrags > V_maxfragsperpacket) {
 			IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
 			ip_freef(head, fp);
 		}
 		goto done;
 	}
 
 	/*
 	 * Reassembly is complete.  Make sure the packet is a sane size.
 	 * At this point next is the total payload length.
 	 */
 	q = fp->ipq_frags;
 	ip = GETIP(q);
 	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
 		IPSTAT_INC(ips_toolong);
 		IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
 		ip_freef(head, fp);
 		goto done;
 	}
 
 	/*
 	 * Concatenate fragments.
 	 * While doing so, keep only the checksum flags common to all
 	 * fragments and accumulate their csum_data.
 	 */
 	m = q;
 	t = m->m_next;
 	m->m_next = NULL;
 	m_cat(m, t);
 	nq = q->m_nextpkt;
 	q->m_nextpkt = NULL;
 	for (q = nq; q != NULL; q = nq) {
 		nq = q->m_nextpkt;
 		q->m_nextpkt = NULL;
 		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
 		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
 		m_cat(m, q);
 	}
 	/*
 	 * In order to do checksumming faster we do 'end-around carry' here
 	 * (and not in for{} loop), though it implies we are not going to
 	 * reassemble more than 64k fragments.
 	 */
 	m->m_pkthdr.csum_data =
 	    (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16);
 #ifdef MAC
 	mac_ipq_reassemble(fp, m);
 	mac_ipq_destroy(fp);
 #endif
 
 	/*
 	 * Create header for new ip packet by modifying header of first
 	 * packet;  dequeue and discard fragment reassembly header.
 	 * Make header visible.
 	 */
 	ip->ip_len = htons((ip->ip_hl << 2) + next);
 	ip->ip_src = fp->ipq_src;
 	ip->ip_dst = fp->ipq_dst;
 	TAILQ_REMOVE(head, fp, ipq_list);
 	V_nipq--;
 	uma_zfree(V_ipq_zone, fp);
 	/* Undo the earlier m_data/m_len adjustment that hid the header. */
 	m->m_len += (ip->ip_hl << 2);
 	m->m_data -= (ip->ip_hl << 2);
 	/* some debugging cruft by sklower, below, will go away soon */
 	if (m->m_flags & M_PKTHDR)	/* XXX this should be done elsewhere */
 		m_fixhdr(m);
 	IPSTAT_INC(ips_reassembled);
 	IPQ_UNLOCK();
 	return (m);
 
 	/*
 	 * Drop the incoming fragment.
 	 *
 	 * NOTE(review): fp is also non-NULL when the length check right
 	 * after "found:" jumps here, which is before ipq_nfrags was
 	 * incremented for this fragment -- confirm the decrement is
 	 * intended on that path.
 	 */
 dropfrag:
 	IPSTAT_INC(ips_fragdropped);
 	if (fp != NULL)
 		fp->ipq_nfrags--;
 	m_freem(m);
 	/* Common exit when there is no complete datagram to return. */
 done:
 	IPQ_UNLOCK();
 	return (NULL);
 
 #undef GETIP
 }
 
 /*
  * Free a fragment reassembly header and all
  * associated datagrams.
  */
 static void
 ip_freef(struct ipqhead *fhp, struct ipq *fp)
 {
 	struct mbuf *q;
 
 	IPQ_LOCK_ASSERT();
 
 	while (fp->ipq_frags) {
 		q = fp->ipq_frags;
 		fp->ipq_frags = q->m_nextpkt;
 		m_freem(q);
 	}
 	TAILQ_REMOVE(fhp, fp, ipq_list);
 	uma_zfree(V_ipq_zone, fp);
 	V_nipq--;
 }
 
 /*
  * IP timer processing.
  *
  * For every network stack instance, with the reassembly lock held: age
  * each reassembly queue by one tick and discard those whose timer reaches
  * zero; then, if the number of queues exceeds a non-negative V_maxnipq
  * (e.g. because the limit was lowered), discard queues until it does not.
  */
 void
 ip_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct ipq *fp, *fp_next;
 	int i;
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	IPQ_LOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 
 		/* Pass 1: expire queues whose ttl runs out on this tick. */
 		for (i = 0; i < IPREASS_NHASH; i++) {
 			for (fp = TAILQ_FIRST(&V_ipq[i]); fp != NULL;
 			    fp = fp_next) {
 				/* Fetch the successor first; fp may be freed. */
 				fp_next = TAILQ_NEXT(fp, ipq_list);
 				if (--fp->ipq_ttl != 0)
 					continue;
 				IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
 				ip_freef(&V_ipq[i], fp);
 			}
 		}
 
 		/*
 		 * Pass 2: if we are over the maximum number of fragments
 		 * (due to the limit being lowered), drain off enough to get
 		 * down to the new limit.
 		 */
 		if (V_maxnipq >= 0 && V_nipq > V_maxnipq) {
 			for (i = 0; i < IPREASS_NHASH; i++) {
 				while (V_nipq > V_maxnipq &&
 				    (fp = TAILQ_FIRST(&V_ipq[i])) != NULL) {
 					IPSTAT_ADD(ips_fragdropped,
 					    fp->ipq_nfrags);
 					ip_freef(&V_ipq[i], fp);
 				}
 			}
 		}
 		CURVNET_RESTORE();
 	}
 	IPQ_UNLOCK();
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Drain off all datagram fragments.
  */
 static void
 ip_drain_locked(void)
 {
 	int     i;
 
 	IPQ_LOCK_ASSERT();
 
 	for (i = 0; i < IPREASS_NHASH; i++) {
 		while(!TAILQ_EMPTY(&V_ipq[i])) {
 			IPSTAT_ADD(ips_fragdropped,
 			    TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
 			ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i]));
 		}
 	}
 }
 
 /*
  * Drain the fragment reassembly queues of every network stack instance,
  * then call in_rtqdrain().  The instance list read lock and the
  * reassembly lock are held while the queues are walked and are both
  * released before in_rtqdrain() runs.
  */
 void
 ip_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	IPQ_LOCK();
 	VNET_FOREACH(vnet_iter) {
 		/* ip_drain_locked() acts on the instance made current here. */
 		CURVNET_SET(vnet_iter);
 		ip_drain_locked();
 		CURVNET_RESTORE();
 	}
 	IPQ_UNLOCK();
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	in_rtqdrain();
 }
 
 /*
  * Point the ip_protox[] slot of IP protocol number ipproto at that
  * protocol's entry in inetsw[].  The protocol must be already registered
  * in inetsw[], either statically or through pf_proto_register().
  *
  * Returns 0 on success, or:
  *	EPROTONOSUPPORT	ipproto is outside 1 .. IPPROTO_MAX - 1, or no
  *			matching PF_INET entry exists in the protocol switch
  *	EPFNOSUPPORT	the raw IP protocol entry could not be found
  *	EEXIST		the slot is already taken by another protocol
  */
 int
 ipproto_register(short ipproto)
 {
 	struct protosw *pr, *rawpr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * A slot counts as unused while it still holds the index of the
 	 * IPPROTO_RAW entry; any other index means it is occupied.
 	 */
 	rawpr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (rawpr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip_protox[ipproto] != rawpr - inetsw)
 		return (EEXIST);
 
 	/* Find the protocol position in inetsw[] and set the index. */
 	for (pr = inetdomain.dom_protosw;
 	    pr < inetdomain.dom_protoswNPROTOSW; pr++) {
 		if (pr->pr_domain->dom_family != PF_INET)
 			continue;
 		if (pr->pr_protocol == 0 || pr->pr_protocol != ipproto)
 			continue;
 		ip_protox[pr->pr_protocol] = pr - inetsw;
 		return (0);
 	}
 	return (EPROTONOSUPPORT);
 }
 
 /*
  * Reset the ip_protox[] slot of IP protocol number ipproto so that it
  * points at the raw IP protocol entry again.
  *
  * Returns 0 on success, or:
  *	EPROTONOSUPPORT	ipproto is outside 1 .. IPPROTO_MAX - 1
  *	EPFNOSUPPORT	the raw IP protocol entry could not be found
  *	ENOENT		the slot already points at the raw IP entry, i.e.
  *			nothing was registered for ipproto
  */
 int
 ipproto_unregister(short ipproto)
 {
 	struct protosw *rawpr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	rawpr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (rawpr == NULL)
 		return (EPFNOSUPPORT);
 
 	/* Only a slot that differs from the raw entry was ever registered. */
 	if (ip_protox[ipproto] != rawpr - inetsw) {
 		ip_protox[ipproto] = rawpr - inetsw;
 		return (0);
 	}
 	return (ENOENT);
 }
 
 /*
  * Given address of next destination (final or next hop), return (referenced)
  * internet address info of interface to be used to get there.
  *
  * The route is looked up in FIB fibnum.  NULL is returned when there is no
  * route.  Otherwise a reference is taken on the returned address with
  * ifa_ref() and the route itself is released before returning.
  */
 struct in_ifaddr *
 ip_rtaddr(struct in_addr dst, u_int fibnum)
 {
 	struct route ro;
 	struct sockaddr_in *dsin;
 	struct in_ifaddr *ia;
 
 	/* Build a route request for dst and look it up. */
 	bzero(&ro, sizeof(ro));
 	dsin = (struct sockaddr_in *)&ro.ro_dst;
 	dsin->sin_len = sizeof(*dsin);
 	dsin->sin_family = AF_INET;
 	dsin->sin_addr = dst;
 	in_rtalloc_ign(&ro, 0, fibnum);
 
 	if (ro.ro_rt != NULL) {
 		/* Reference the address before letting go of the route. */
 		ia = ifatoia(ro.ro_rt->rt_ifa);
 		ifa_ref(&ia->ia_ifa);
 		RTFREE(ro.ro_rt);
 		return (ia);
 	}
 	return (NULL);
 }
 
 /*
  * Table of errno values with PRC_NCMDS entries, one per line with its
  * index.  Presumably indexed by PRC_* control command number -- confirm
  * against the PRC_* definitions.
  */
 u_char inetctlerrmap[PRC_NCMDS] = {
 	0,		/*  0 */
 	0,		/*  1 */
 	0,		/*  2 */
 	0,		/*  3 */
 	0,		/*  4 */
 	EMSGSIZE,	/*  5 */
 	EHOSTDOWN,	/*  6 */
 	EHOSTUNREACH,	/*  7 */
 	EHOSTUNREACH,	/*  8 */
 	EHOSTUNREACH,	/*  9 */
 	ECONNREFUSED,	/* 10 */
 	ECONNREFUSED,	/* 11 */
 	EMSGSIZE,	/* 12 */
 	EHOSTUNREACH,	/* 13 */
 	0,		/* 14 */
 	0,		/* 15 */
 	0,		/* 16 */
 	0,		/* 17 */
 	EHOSTUNREACH,	/* 18 */
 	0,		/* 19 */
 	ENOPROTOOPT,	/* 20 */
 	ECONNREFUSED	/* 21 */
 };
 
 /*
  * Forward a packet.  If some error occurs return the sender
  * an icmp packet.  Note we can't always generate a meaningful
  * icmp message because icmp doesn't have a large enough repertoire
  * of codes and types.
  *
  * If not forwarding, just drop the packet.  This could be confusing
  * if ipforwarding was zero but some routing protocol was advancing
  * us as a gateway to somewhere.  However, we must let the routing
  * protocol deal with that.
  *
  * The srcrt parameter indicates whether the packet is being forwarded
  * via a source route.
  *
  * m is handed to ip_output() (or icmp_error(), or freed) on every path.
  * Once ip_output() has been called, neither m nor the header pointer ip
  * derived from it may be touched again; the saved copy in mcopy is what
  * any later ICMP error is built from.
  */
 void
 ip_forward(struct mbuf *m, int srcrt)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct in_ifaddr *ia;
 	struct mbuf *mcopy;
 	struct in_addr dest;
 	struct route ro;
 	int error, type = 0, code = 0, mtu = 0;
 
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 	/* The TTL check is skipped when IPSTEALTH is built in and enabled. */
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 		if (ip->ip_ttl <= IPTTLDEC) {
 			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
 			    0, 0);
 			return;
 		}
 #ifdef IPSTEALTH
 	}
 #endif
 
 	/* ia, when non-NULL, is referenced and must be ifa_free()d below. */
 	ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
 #ifndef IPSEC
 	/*
 	 * 'ia' may be NULL if there is no route for this destination.
 	 * In case of IPsec, Don't discard it just yet, but pass it to
 	 * ip_output in case of outgoing IPsec policy.
 	 */
 	if (!srcrt && ia == NULL) {
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		return;
 	}
 #endif
 
 	/*
 	 * Save the IP header and at most 8 bytes of the payload,
 	 * in case we need to generate an ICMP message to the src.
 	 *
 	 * XXX this can be optimized a lot by saving the data in a local
 	 * buffer on the stack (72 bytes at most), and only allocating the
 	 * mbuf if really necessary. The vast majority of the packets
 	 * are forwarded without having to send an ICMP back (either
 	 * because unnecessary, or because rate limited), so we are
 	 * really we are wasting a lot of work here.
 	 *
 	 * We don't use m_copy() because it might return a reference
 	 * to a shared cluster. Both this function and ip_output()
 	 * assume exclusive access to the IP header in `m', so any
 	 * data in a cluster may change before we reach icmp_error().
 	 */
 	mcopy = m_gethdr(M_NOWAIT, m->m_type);
 	if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) {
 		/*
 		 * It's probably ok if the pkthdr dup fails (because
 		 * the deep copy of the tag chain failed), but for now
 		 * be conservative and just discard the copy since
 		 * code below may some day want the tags.
 		 */
 		m_free(mcopy);
 		mcopy = NULL;
 	}
 	if (mcopy != NULL) {
 		mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy));
 		mcopy->m_pkthdr.len = mcopy->m_len;
 		m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
 	}
 
 	/* The copy above was taken before the TTL is decremented. */
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 		ip->ip_ttl -= IPTTLDEC;
 #ifdef IPSTEALTH
 	}
 #endif
 
 	/*
 	 * If forwarding packet using same interface that it came in on,
 	 * perhaps should send a redirect to sender to shortcut a hop.
 	 * Only send redirect if source is sending directly to us,
 	 * and if packet was not source routed (or has any options).
 	 * Also, don't send redirect if forwarding using a default route
 	 * or a route modified by a redirect.
 	 */
 	dest.s_addr = 0;
 	if (!srcrt && V_ipsendredirects &&
 	    ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
 		struct sockaddr_in *sin;
 		struct rtentry *rt;
 
 		bzero(&ro, sizeof(ro));
 		sin = (struct sockaddr_in *)&ro.ro_dst;
 		sin->sin_family = AF_INET;
 		sin->sin_len = sizeof(*sin);
 		sin->sin_addr = ip->ip_dst;
 		in_rtalloc_ign(&ro, 0, M_GETFIB(m));
 
 		rt = ro.ro_rt;
 
 		if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
 		    satosin(rt_key(rt))->sin_addr.s_addr != 0) {
 #define	RTA(rt)	((struct in_ifaddr *)(rt->rt_ifa))
 			u_long src = ntohl(ip->ip_src.s_addr);
 
 			/* Redirect only senders on the route's own subnet. */
 			if (RTA(rt) &&
 			    (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
 				if (rt->rt_flags & RTF_GATEWAY)
 					dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
 				else
 					dest.s_addr = ip->ip_dst.s_addr;
 				/* Router requirements says to only send host redirects */
 				type = ICMP_REDIRECT;
 				code = ICMP_REDIRECT_HOST;
 			}
 		}
 		if (rt)
 			RTFREE(rt);
 	}
 
 	/*
 	 * Try to cache the route MTU from ip_output so we can consider it for
 	 * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
 	 */
 	bzero(&ro, sizeof(ro));
 
 	/* From here on m belongs to ip_output(); do not use m or ip again. */
 	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
 
 	if (error == EMSGSIZE && ro.ro_rt)
 		mtu = ro.ro_rt->rt_rmx.rmx_mtu;
 	RO_RTFREE(&ro);
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);
 	else {
 		IPSTAT_INC(ips_forward);
 		if (type)
 			IPSTAT_INC(ips_redirectsent);
 		else {
 			/* Forwarded and no redirect wanted: all done. */
 			if (mcopy)
 				m_freem(mcopy);
 			if (ia != NULL)
 				ifa_free(&ia->ia_ifa);
 			return;
 		}
 	}
 	/* Without the saved copy there is nothing to build an ICMP from. */
 	if (mcopy == NULL) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return;
 	}
 
 	/* Choose the ICMP type and code that match the outcome. */
 	switch (error) {
 
 	case 0:				/* forwarded, but need redirect */
 		/* type, code set above */
 		break;
 
 	case ENETUNREACH:
 	case EHOSTUNREACH:
 	case ENETDOWN:
 	case EHOSTDOWN:
 	default:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_HOST;
 		break;
 
 	case EMSGSIZE:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_NEEDFRAG;
 
 #ifdef IPSEC
 		/* 
 		 * If IPsec is configured for this path,
 		 * override any possibly mtu value set by ip_output.
 		 */ 
 		mtu = ip_ipsec_mtu(mcopy, mtu);
 #endif /* IPSEC */
 		/*
 		 * If the MTU was set before make sure we are below the
 		 * interface MTU.
 		 * If the MTU wasn't set before use the interface mtu or
 		 * fall back to the next smaller mtu step compared to the
 		 * current packet size.
 		 */
 		if (mtu != 0) {
 			if (ia != NULL)
 				mtu = min(mtu, ia->ia_ifp->if_mtu);
 		} else {
 			if (ia != NULL)
 				mtu = ia->ia_ifp->if_mtu;
 			else {
 				/*
 				 * Take the packet length from the header
 				 * saved in mcopy: the original mbuf, and
 				 * with it *ip, was handed to ip_output()
 				 * above and must not be read any more.
 				 */
 				mtu = ip_next_mtu(
 				    ntohs(mtod(mcopy, struct ip *)->ip_len),
 				    0);
 			}
 		}
 		IPSTAT_INC(ips_cantfrag);
 		break;
 
 	case ENOBUFS:
 		/*
 		 * A router should not generate ICMP_SOURCEQUENCH as
 		 * required in RFC1812 Requirements for IP Version 4 Routers.
 		 * Source quench could be a big problem under DoS attacks,
 		 * or if the underlying interface is rate-limited.
 		 * Those who need source quench packets may re-enable them
 		 * via the net.inet.ip.sendsourcequench sysctl.
 		 */
 		if (V_ip_sendsourcequench == 0) {
 			m_freem(mcopy);
 			if (ia != NULL)
 				ifa_free(&ia->ia_ifa);
 			return;
 		} else {
 			type = ICMP_SOURCEQUENCH;
 			code = 0;
 		}
 		break;
 
 	case EACCES:			/* ipfw denied packet */
 		m_freem(mcopy);
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return;
 	}
 	/* Drop the address reference, then send the error using the copy. */
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	icmp_error(mcopy, type, code, dest.s_addr, mtu);
 }
 
 /*
  * Build the ancillary-data (control message) chain for a received IPv4
  * datagram according to the receive options enabled on the socket/pcb.
  *
  * inp	pcb whose socket options and inp_flags select what to report.
  * mp	in/out cursor: each successfully created control mbuf is stored
  *	through *mp and mp is advanced to that mbuf's m_next, so records
  *	are appended in the order tested below.  A failed
  *	sbcreatecontrol() (NULL) is silently skipped.
  * ip	IP header of the received datagram (source of dst/ttl/tos).
  * m	the received packet; only m_pkthdr.rcvif is consulted here.
  */
 void
 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
     struct mbuf *m)
 {
 
 	if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) {
 		struct bintime bt;
 
 		/* Sample the clock once so both records agree. */
 		bintime(&bt);
 		if (inp->inp_socket->so_options & SO_BINTIME) {
 			*mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt),
 			    SCM_BINTIME, SOL_SOCKET);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 		if (inp->inp_socket->so_options & SO_TIMESTAMP) {
 			struct timeval tv;
 
 			bintime2timeval(&bt, &tv);
 			*mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
 			    SCM_TIMESTAMP, SOL_SOCKET);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 	if (inp->inp_flags & INP_RECVDSTADDR) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_dst,
 		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTTL) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_ttl,
 		    sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #ifdef notyet
 	/* XXX
 	 * Moving these out of udp_input() made them even more broken
 	 * than they already were.
 	 */
 	/* options were tossed already */
 	if (inp->inp_flags & INP_RECVOPTS) {
 		*mp = sbcreatecontrol((caddr_t)opts_deleted_above,
 		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	/* ip_srcroute doesn't do what we want here, need to fix */
 	if (inp->inp_flags & INP_RECVRETOPTS) {
 		*mp = sbcreatecontrol((caddr_t)ip_srcroute(m),
 		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #endif
 	if (inp->inp_flags & INP_RECVIF) {
 		struct ifnet *ifp;
 		struct sdlbuf {
 			struct sockaddr_dl sdl;
 			u_char	pad[32];
 		} sdlbuf;
 		struct sockaddr_dl *sdp;
 		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
 
 		if ((ifp = m->m_pkthdr.rcvif) &&
 		    ifp->if_index && ifp->if_index <= V_if_index) {
 			sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
 			/*
 			 * Change our mind and don't try copy.
 			 */
 			if (sdp->sdl_family != AF_LINK ||
 			    sdp->sdl_len > sizeof(sdlbuf)) {
 				goto makedummy;
 			}
 			bcopy(sdp, sdl2, sdp->sdl_len);
 		} else {
 makedummy:	
 			/*
 			 * No usable link-level address: report an empty
 			 * AF_LINK sockaddr with index 0.  Every byte below
 			 * sdl_data[0] is copied out, so each header field,
 			 * including sdl_type, must be initialised; sdlbuf
 			 * lives on the stack and is not otherwise cleared.
 			 */
 			sdl2->sdl_len =
 			    offsetof(struct sockaddr_dl, sdl_data[0]);
 			sdl2->sdl_family = AF_LINK;
 			sdl2->sdl_index = 0;
 			sdl2->sdl_type = 0;
 			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
 		}
 		*mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len,
 		    IP_RECVIF, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTOS) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_tos,
 		    sizeof(u_char), IP_RECVTOS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 }
 
 /*
  * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
  * ip_rsvpd and ip_rsvp_on variables need to be interlocked with rsvp_on
  * locking.  This code remains in ip_input.c as ip_mroute.c is optionally
  * compiled.
  */
 /* Set to 1 by ip_rsvp_init() and back to 0 by ip_rsvp_done(). */
 static VNET_DEFINE(int, ip_rsvp_on);
 /* Socket registered by ip_rsvp_init(); NULL when none is registered. */
 VNET_DEFINE(struct socket *, ip_rsvpd);
 
 /* Per-vnet accessor for ip_rsvp_on. */
 #define	V_ip_rsvp_on		VNET(ip_rsvp_on)
 
 /*
  * Register 'so' as the RSVP daemon socket for this vnet.
  *
  * Returns EOPNOTSUPP unless 'so' is a raw socket whose protocol is
  * IPPROTO_RSVP, EADDRINUSE when a socket is already registered, and 0
  * on success.
  */
 int
 ip_rsvp_init(struct socket *so)
 {
 	int is_raw_rsvp;
 
 	is_raw_rsvp = (so->so_type == SOCK_RAW &&
 	    so->so_proto->pr_protocol == IPPROTO_RSVP);
 	if (!is_raw_rsvp)
 		return (EOPNOTSUPP);
 
 	if (V_ip_rsvpd != NULL)
 		return (EADDRINUSE);
 
 	V_ip_rsvpd = so;
 
 	/*
 	 * The local on/off flag guards the shared RSVP counter so that it
 	 * is bumped at most once per registration, even if something
 	 * slips up.
 	 */
 	if (V_ip_rsvp_on == 0) {
 		V_ip_rsvp_on = 1;
 		V_rsvp_on++;
 	}
 
 	return (0);
 }
 
 /*
  * Unregister the RSVP daemon socket for this vnet.  Always returns 0.
  */
 int
 ip_rsvp_done(void)
 {
 
 	V_ip_rsvpd = NULL;
 
 	/*
 	 * Only drop the shared RSVP counter if this code was the one that
 	 * raised it, so it can never be over-decremented.
 	 */
 	if (V_ip_rsvp_on == 0)
 		return (0);
 
 	V_ip_rsvp_on = 0;
 	V_rsvp_on--;
 	return (0);
 }
 
 /*
  * Input handler for RSVP packets.
  *
  * If the rsvp_input_p hook is set the packet is passed to it.  Otherwise
  * the packet is handed to rip_input() when RSVP is on and a daemon socket
  * is registered, and freed in every other case.
  */
 void
 rsvp_input(struct mbuf *m, int off)	/* XXX must fixup manually */
 {
 
 	/* Call the real one if loaded. */
 	if (rsvp_input_p != NULL) {
 		rsvp_input_p(m, off);
 		return;
 	}
 
 	/*
 	 * Packets can still arrive with rsvp_on = 0 if there is a local
 	 * member of the group to which the RSVP packet is addressed; those
 	 * are thrown away, as are packets arriving while no daemon socket
 	 * is registered.
 	 */
 	if (V_rsvp_on && V_ip_rsvpd != NULL) {
 		rip_input(m, off);
 		return;
 	}
 
 	/* Drop the packet */
 	m_freem(m);
 }
Index: head/sys/netinet/ip_output.c
===================================================================
--- head/sys/netinet/ip_output.c	(revision 258540)
+++ head/sys/netinet/ip_output.c	(revision 258541)
@@ -1,1314 +1,1313 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
-#include "opt_kdtrace.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mpath.h"
 #include "opt_route.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/ucred.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llatbl.h>
 #include <net/netisr.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/flowtable.h>
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #ifdef IPSEC
 #include <netinet/ip_ipsec.h>
 #include <netipsec/ipsec.h>
 #endif /* IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 /* Per-vnet 16-bit value; presumably the IP ID state -- TODO confirm. */
 VNET_DEFINE(u_short, ip_id);
 
 #ifdef MBUF_STRESS_TEST
 /*
  * When non-zero, ip_output() passes unfragmented packets longer than this
  * through m_fragment() before sending them.  Tunable at run time through
  * the sysctl declared below.
  */
 static int mbuf_frag_size = 0;
 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
 	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
 #endif
 
 /* Called by ip_output() to loop back a copy of an outgoing multicast. */
 static void	ip_mloopback
 	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
 
 
 /* Consulted by ip_output() for multicast loopback when imo is NULL. */
 extern int in_mcast_loop;
 extern	struct protosw inetsw[];
 
 /*
  * IP output.  The packet in mbuf chain m contains a skeletal IP
  * header (with len, off, ttl, proto, tos, src, dst).
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route ro is present and has ro_rt initialized, route lookup would be
  * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
  * then result of route lookup is stored in ro->ro_rt.
  *
  * In the IP forwarding case, the packet will arrive with options already
  * inserted, so must have a NULL opt pointer.
  *
  * Parameters:
  *	m	packet to send; always consumed, on success and on error.
  *	opt	optional IP options to insert, or NULL.
  *	ro	optional route cache; when NULL a local one is used and
  *		released before returning.
  *	flags	IP_FORWARDING, IP_RAWOUTPUT, IP_SENDONES, IP_ROUTETOIF,
  *		IP_ALLOWBROADCAST as tested below.
  *	imo	optional multicast options, or NULL.
  *	inp	optional pcb; when non-NULL it must be locked and supplies
  *		the FIB number and flow id.
  *
  * Returns 0 on success or an errno value: ENETUNREACH/EHOSTUNREACH when no
  * usable route or interface is found, ENOBUFS when the interface queue is
  * full, EADDRNOTAVAIL/EACCES for disallowed broadcast or loopback
  * addressing, EMSGSIZE when the packet does not fit and may not be
  * fragmented, or whatever the packet filter, netisr, ip_fragment() or the
  * interface output routine reports.
  */
 int
 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
     struct ip_moptions *imo, struct inpcb *inp)
 {
 	struct ip *ip;
 	struct ifnet *ifp = NULL;	/* keep compiler happy */
 	struct mbuf *m0;
 	int hlen = sizeof (struct ip);
 	int mtu;
 	int n;	/* scratchpad */
 	int error = 0;
 	struct sockaddr_in *dst;
 	const struct sockaddr_in *gw;
 	struct in_ifaddr *ia;
 	int isbroadcast;
 	uint16_t ip_len, ip_off;
 	struct route iproute;
 	struct rtentry *rte;	/* cache for ro->ro_rt */
 	struct in_addr odst;
 	struct m_tag *fwd_tag = NULL;
 #ifdef IPSEC
 	int no_route_but_check_spd = 0;
 #endif
 	M_ASSERTPKTHDR(m);
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			m->m_flags |= M_FLOWID;
 		}
 	}
 
 	/* No caller-supplied route cache: use a local one, freed at "done". */
 	if (ro == NULL) {
 		ro = &iproute;
 		bzero(ro, sizeof (*ro));
 	}
 
 #ifdef FLOWTABLE
 	if (ro->ro_rt == NULL) {
 		struct flentry *fle;
 
 		/*
 		 * The flow table returns route entries valid for up to 30
 		 * seconds; we rely on the remainder of ip_output() taking no
 		 * longer than that long for the stability of ro_rt. The
 		 * flow ID assignment must have happened before this point.
 		 */
 		fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET);
 		if (fle != NULL)
 			flow_to_route(fle, ro);
 	}
 #endif
 
 	if (opt) {
 		int len = 0;
 		m = ip_insertoptions(m, opt, &len);
 		if (len != 0)
 			hlen = len; /* ip->ip_hl is updated above */
 	}
 	ip = mtod(m, struct ip *);
 	/* Host-order copies; they are not refreshed later in this function. */
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	/*
 	 * Fill in IP header.  If we are not allowing fragmentation,
 	 * then the ip_id field is meaningless, but we don't set it
 	 * to zero.  Doing so causes various problems when devices along
 	 * the path (routers, load balancers, firewalls, etc.) illegally
 	 * disable DF on our packet.  Note that a 16-bit counter
 	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
 	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
 	 * for Counting NATted Hosts", Proc. IMW'02, available at
 	 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
 	 */
 	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = hlen >> 2;
 		ip->ip_id = ip_newid();
 		IPSTAT_INC(ips_localout);
 	} else {
 		/* Header already set, fetch hlen from there */
 		hlen = ip->ip_hl << 2;
 	}
 
 	gw = dst = (struct sockaddr_in *)&ro->ro_dst;
 	/*
 	 * Re-entered from the packet filter section below when the
 	 * destination or next hop was changed; any reference held through
 	 * 'ia' has been dropped before jumping back here.
 	 */
 again:
 	ia = NULL;
 	/*
 	 * If there is a cached route,
 	 * check that it is to the same destination
 	 * and is still up.  If not, free it and try again.
 	 * The address family should also be checked in case of sharing the
 	 * cache with IPv6.
 	 */
 	rte = ro->ro_rt;
 	if (rte && ((rte->rt_flags & RTF_UP) == 0 ||
 		    rte->rt_ifp == NULL ||
 		    !RT_LINK_IS_UP(rte->rt_ifp) ||
 			  dst->sin_family != AF_INET ||
 			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
 		RO_RTFREE(ro);
 		ro->ro_lle = NULL;
 		rte = NULL;
 	}
 	/* With a forward tag, dst already holds the next hop; keep it. */
 	if (rte == NULL && fwd_tag == NULL) {
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 	}
 	/*
 	 * If routing to interface only, short circuit routing lookup.
 	 * The use of an all-ones broadcast address implies this; an
 	 * interface is specified by the broadcast address of an interface,
 	 * or the destination address of a ptp interface.
 	 */
 	if (flags & IP_SENDONES) {
 		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		ip->ip_dst.s_addr = INADDR_BROADCAST;
 		dst->sin_addr = ip->ip_dst;
 		ifp = ia->ia_ifp;
 		ip->ip_ttl = 1;
 		isbroadcast = 1;
 	} else if (flags & IP_ROUTETOIF) {
 		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		ifp = ia->ia_ifp;
 		ip->ip_ttl = 1;
 		isbroadcast = in_broadcast(dst->sin_addr, ifp);
 	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
 	    imo != NULL && imo->imo_multicast_ifp != NULL) {
 		/*
 		 * Bypass the normal routing lookup for multicast
 		 * packets if the interface is specified.
 		 */
 		ifp = imo->imo_multicast_ifp;
 		IFP_TO_IA(ifp, ia);
 		isbroadcast = 0;	/* fool gcc */
 	} else {
 		/*
 		 * We want to do any cloning requested by the link layer,
 		 * as this is probably required in all cases for correct
 		 * operation (as it is for ARP).
 		 */
 		if (rte == NULL) {
 #ifdef RADIX_MPATH
 			rtalloc_mpath_fib(ro,
 			    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
 			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
 #else
 			in_rtalloc_ign(ro, 0,
 			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
 #endif
 			rte = ro->ro_rt;
 		}
 		if (rte == NULL ||
 		    rte->rt_ifp == NULL ||
 		    !RT_LINK_IS_UP(rte->rt_ifp)) {
 #ifdef IPSEC
 			/*
 			 * There is no route for this packet, but it is
 			 * possible that a matching SPD entry exists.
 			 */
 			no_route_but_check_spd = 1;
 			mtu = 0; /* Silence GCC warning. */
 			goto sendit;
 #endif
 			IPSTAT_INC(ips_noroute);
 			error = EHOSTUNREACH;
 			goto bad;
 		}
 		ia = ifatoia(rte->rt_ifa);
 		/* Hold a reference; it is dropped at "done" or before "again". */
 		ifa_ref(&ia->ia_ifa);
 		ifp = rte->rt_ifp;
 		rte->rt_rmx.rmx_pksent++;
 		if (rte->rt_flags & RTF_GATEWAY)
 			gw = (struct sockaddr_in *)rte->rt_gateway;
 		if (rte->rt_flags & RTF_HOST)
 			isbroadcast = (rte->rt_flags & RTF_BROADCAST);
 		else
 			isbroadcast = in_broadcast(gw->sin_addr, ifp);
 	}
 	/*
 	 * Calculate MTU.  If we have a route that is up, use that,
 	 * otherwise use the interface's MTU.
 	 */
 	if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) {
 		/*
 		 * This case can happen if the user changed the MTU
 		 * of an interface after enabling IP on it.  Because
 		 * most netifs don't keep track of routes pointing to
 		 * them, there is no way for one to update all its
 		 * routes when the MTU is changed.
 		 */
 		if (rte->rt_rmx.rmx_mtu > ifp->if_mtu)
 			rte->rt_rmx.rmx_mtu = ifp->if_mtu;
 		mtu = rte->rt_rmx.rmx_mtu;
 	} else {
 		mtu = ifp->if_mtu;
 	}
 	/* Catch a possible divide by zero later. */
 	KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p",
 	    __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp));
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		m->m_flags |= M_MCAST;
 		/*
 		 * See if the caller provided any multicast options
 		 */
 		if (imo != NULL) {
 			ip->ip_ttl = imo->imo_multicast_ttl;
 			if (imo->imo_multicast_vif != -1)
 				ip->ip_src.s_addr =
 				    ip_mcast_src ?
 				    ip_mcast_src(imo->imo_multicast_vif) :
 				    INADDR_ANY;
 		} else
 			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
 			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 				IPSTAT_INC(ips_noroute);
 				error = ENETUNREACH;
 				goto bad;
 			}
 		}
 		/*
 		 * If source address not specified yet, use address
 		 * of outgoing interface.
 		 */
 		if (ip->ip_src.s_addr == INADDR_ANY) {
 			/* Interface may have no addresses. */
 			if (ia != NULL)
 				ip->ip_src = IA_SIN(ia)->sin_addr;
 		}
 
 		if ((imo == NULL && in_mcast_loop) ||
 		    (imo && imo->imo_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we are not a member
 			 * of the group; ip_input() will filter it later,
 			 * thus deferring a hash lookup and mutex acquisition
 			 * at the expense of a cheap copy using m_copym().
 			 */
 			ip_mloopback(ifp, m, dst, hlen);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IP_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip_mloopback(),
 			 * above, will be forwarded by the ip_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
 				/*
 				 * If rsvp daemon is not running, do not
 				 * set ip_moptions. This ensures that the packet
 				 * is multicast and not just sent down one link
 				 * as prescribed by rsvpd.
 				 */
 				if (!V_rsvp_on)
 					imo = NULL;
 				if (ip_mforward &&
 				    ip_mforward(ip, ifp, m, imo) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 
 		/*
 		 * Multicasts with a time-to-live of zero may be looped-
 		 * back, above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip_mloopback() will
 		 * loop back a copy. ip_input() will drop the copy if
 		 * this host does not belong to the destination group on
 		 * the loopback interface.
 		 */
 		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
 			m_freem(m);
 			goto done;
 		}
 
 		goto sendit;
 	}
 
 	/*
 	 * If the source address is not specified yet, use the address
 	 * of the outgoing interface.
 	 */
 	if (ip->ip_src.s_addr == INADDR_ANY) {
 		/* Interface may have no addresses. */
 		if (ia != NULL) {
 			ip->ip_src = IA_SIN(ia)->sin_addr;
 		}
 	}
 
 	/*
 	 * Verify that we have any chance at all of being able to queue the
 	 * packet or packet fragments, unless ALTQ is enabled on the given
 	 * interface in which case packetdrop should be done by queueing.
 	 */
 	n = ip_len / mtu + 1; /* how many fragments ? */
 	if (
 #ifdef ALTQ
 	    (!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
 #endif /* ALTQ */
 	    (ifp->if_snd.ifq_len + n) >= ifp->if_snd.ifq_maxlen ) {
 		error = ENOBUFS;
 		IPSTAT_INC(ips_odropped);
 		ifp->if_snd.ifq_drops += n;
 		goto bad;
 	}
 
 	/*
 	 * Look for broadcast address and
 	 * verify user is allowed to send
 	 * such a packet.
 	 */
 	if (isbroadcast) {
 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 		if ((flags & IP_ALLOWBROADCAST) == 0) {
 			error = EACCES;
 			goto bad;
 		}
 		/* don't allow broadcast messages to be fragmented */
 		if (ip_len > mtu) {
 			error = EMSGSIZE;
 			goto bad;
 		}
 		m->m_flags |= M_BCAST;
 	} else {
 		m->m_flags &= ~M_BCAST;
 	}
 
 sendit:
 #ifdef IPSEC
 	switch(ip_ipsec_output(&m, inp, &flags, &error)) {
 	case 1:
 		goto bad;
 	case -1:
 		goto done;
 	case 0:
 	default:
 		break;	/* Continue with packet processing. */
 	}
 	/*
 	 * Check if there was a route for this packet; return error if not.
 	 */
 	if (no_route_but_check_spd) {
 		IPSTAT_INC(ips_noroute);
 		error = EHOSTUNREACH;
 		goto bad;
 	}
 	/* Update variables that are affected by ipsec4_output(). */
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 #endif /* IPSEC */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passout;
 
 	/* Run through list of hooks for output packets. */
 	odst.s_addr = ip->ip_dst.s_addr;
 	error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
 	if (error != 0 || m == NULL)
 		goto done;
 
 	/* The hooks may have replaced the mbuf; re-fetch the header. */
 	ip = mtod(m, struct ip *);
 
 	/* See if destination IP address was changed by packet filter. */
 	if (odst.s_addr != ip->ip_dst.s_addr) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip_input(). */
 		if (in_localip(ip->ip_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 				m->m_pkthdr.csum_flags |=
 				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 			m->m_pkthdr.csum_flags |=
 			    CSUM_IP_CHECKED | CSUM_IP_VALID;
 #ifdef SCTP
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			error = netisr_queue(NETISR_IP, m);
 			goto done;
 		} else {
 			if (ia != NULL)
 				ifa_free(&ia->ia_ifa);
 			goto again;	/* Redo the routing table lookup. */
 		}
 	}
 
 	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			m->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		m->m_pkthdr.csum_flags |=
 			    CSUM_IP_CHECKED | CSUM_IP_VALID;
 
 		error = netisr_queue(NETISR_IP, m);
 		goto done;
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		/* The tag payload (a sockaddr_in) follows the tag header. */
 		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		goto again;
 	}
 
 passout:
 	/* 127/8 must not appear on wire - RFC1122. */
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 	}
 
 	/* Finish in software any checksum the interface cannot offload. */
 	m->m_pkthdr.csum_flags |= CSUM_IP;
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
 		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 
 	/*
 	 * If small enough for interface, or the interface will take
 	 * care of the fragmentation for us, we can just send directly.
 	 */
 	if (ip_len <= mtu ||
 	    (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
 	    ((ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
 		ip->ip_sum = 0;
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
 			ip->ip_sum = in_cksum(m, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 
 		/*
 		 * Record statistics for this interface address.
 		 * With CSUM_TSO the byte/packet count will be slightly
 		 * incorrect because we count the IP+TCP headers only
 		 * once instead of for every generated packet.
 		 */
 		if (!(flags & IP_FORWARDING) && ia) {
 			if (m->m_pkthdr.csum_flags & CSUM_TSO)
 				counter_u64_add(ia->ia_ifa.ifa_opackets,
 				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
 			else
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 
 			counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
 		}
 #ifdef MBUF_STRESS_TEST
 		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
 			m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
 #endif
 		/*
 		 * Reset layer specific mbuf flags
 		 * to avoid confusing lower layers.
 		 */
 		m_clrprotoflags(m);
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 		error = (*ifp->if_output)(ifp, m,
 		    (const struct sockaddr *)gw, ro);
 		goto done;
 	}
 
 	/* Balk when DF bit is set or the interface didn't support TSO. */
 	if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
 		error = EMSGSIZE;
 		IPSTAT_INC(ips_cantfrag);
 		goto bad;
 	}
 
 	/*
 	 * Too large for interface; fragment if possible. If successful,
 	 * on return, m will point to a list of packets to be sent.
 	 */
 	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
 	if (error) {
 		/*
 		 * On failure ip_fragment() may hand back a partially built
 		 * list of fragments linked through m_nextpkt, which the
 		 * caller has to free.  m_freem() only follows m_next, so
 		 * going through "bad" would release the first packet and
 		 * leak the rest; free every packet on the list instead.
 		 */
 		for (; m != NULL; m = m0) {
 			m0 = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 			m_freem(m);
 		}
 		goto done;
 	}
 	/*
 	 * Send the fragments in order.  After the first transmit error the
 	 * remaining fragments are freed rather than sent.
 	 */
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia != NULL) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			/*
 			 * Reset layer specific mbuf flags
 			 * to avoid confusing lower layers.
 			 */
 			m_clrprotoflags(m);
 
 			/*
 			 * Report this fragment's own header.  'ip' points
 			 * into the first fragment, which has already been
 			 * handed to the interface on later iterations.
 			 */
 			IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
 			    mtod(m, struct ip *), NULL);
 			error = (*ifp->if_output)(ifp, m,
 			    (const struct sockaddr *)gw, ro);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IPSTAT_INC(ips_fragmented);
 
 done:
 	/* Release the local route cache and the interface address ref. */
 	if (ro == &iproute)
 		RO_RTFREE(ro);
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	return (error);
 bad:
 	m_freem(m);
 	goto done;
 }
 
 /*
  * Create a chain of fragments which fit the given mtu. m_frag points to the
  * mbuf to be fragmented; on return it points to the chain with the fragments.
  * Return 0 if no error. If error, m_frag may contain a partially built
  * chain of fragments that should be freed by the caller.
  *
  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
  *
  * ip is the IP header of the packet in *m_frag; it is rewritten in place to
  * become the header of the first fragment.  The fragments are linked through
  * m_nextpkt, starting with the original packet.
  *
  * Errors: EMSGSIZE when IP_DF is set or fewer than 8 payload bytes fit in a
  * fragment (the packet is left untouched in both cases); ENOBUFS when an
  * mbuf for a fragment header or payload copy cannot be allocated, in which
  * case the original packet has not been trimmed yet.
  */
 int
 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
     u_long if_hwassist_flags)
 {
 	int error = 0;
 	int hlen = ip->ip_hl << 2;
 	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
 	int off;
 	struct mbuf *m0 = *m_frag;	/* the original packet		*/
 	int firstlen;
 	struct mbuf **mnext;
 	int nfrags;
 	uint16_t ip_len, ip_off;
 
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if (ip_off & IP_DF) {	/* Fragmentation not allowed */
 		IPSTAT_INC(ips_cantfrag);
 		return EMSGSIZE;
 	}
 
 	/*
 	 * Must be able to put at least 8 bytes per fragment.
 	 */
 	if (len < 8)
 		return EMSGSIZE;
 
 	/*
 	 * If the interface will not calculate checksums on
 	 * fragmented packets, then do it here.
 	 */
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m0);
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
 		sctp_delayed_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 	if (len > PAGE_SIZE) {
 		/* 
 		 * Fragment large datagrams such that each segment 
 		 * contains a multiple of PAGE_SIZE amount of data, 
 		 * plus headers. This enables a receiver to perform 
 		 * page-flipping zero-copy optimizations.
 		 *
 		 * XXX When does this help given that sender and receiver
 		 * could have different page sizes, and also mtu could
 		 * be less than the receiver's page size ?
 		 */
 		int newlen;
 		struct mbuf *m;
 
 		/* Sum the leading mbufs of the chain that fit within mtu. */
 		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
 			off += m->m_len;
 
 		/*
 		 * firstlen (off - hlen) must be aligned on an 
 		 * 8-byte boundary
 		 */
 		if (off < hlen)
 			goto smart_frag_failure;
 		off = ((off - hlen) & ~7) + hlen;
 		/* Payload of later fragments: mtu rounded down to whole pages. */
 		newlen = (~PAGE_MASK) & mtu;
 		/*
 		 * NOTE(review): the check below adds sizeof(struct ip), not
 		 * hlen -- confirm fragments carrying copied options still
 		 * fit within mtu.
 		 */
 		if ((newlen + sizeof (struct ip)) > mtu) {
 			/* we failed, go back the default */
 smart_frag_failure:
 			newlen = len;
 			off = hlen + len;
 		}
 		len = newlen;
 
 	} else {
 		off = hlen + len;
 	}
 
 	/* 'off' is now the end of the first fragment within the packet. */
 	firstlen = off - hlen;
 	mnext = &m0->m_nextpkt;		/* pointer to next packet */
 
 	/*
 	 * Loop through length of segment after first fragment,
 	 * make new header and copy data of each part and link onto chain.
 	 * Here, m0 is the original packet, m is the fragment being created.
 	 * The fragments are linked off the m_nextpkt of the original
 	 * packet, which after processing serves as the first fragment.
 	 */
 	for (nfrags = 1; off < ip_len; off += len, nfrags++) {
 		struct ip *mhip;	/* ip header on the fragment */
 		struct mbuf *m;
 		int mhlen = sizeof (struct ip);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		m->m_flags |= (m0->m_flags & M_MCAST);
 		/*
 		 * In the first mbuf, leave room for the link header, then
 		 * copy the original IP header including options. The payload
 		 * goes into an additional mbuf chain returned by m_copym().
 		 */
 		m->m_data += max_linkhdr;
 		mhip = mtod(m, struct ip *);
 		*mhip = *ip;
 		if (hlen > sizeof (struct ip)) {
 			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
 			mhip->ip_v = IPVERSION;
 			mhip->ip_hl = mhlen >> 2;
 		}
 		m->m_len = mhlen;
 		/* XXX do we need to add ip_off below ? */
 		/* Offset in 8-byte units; still host order at this point. */
 		mhip->ip_off = ((off - hlen) >> 3) + ip_off;
 		/* Last fragment: shorten len; otherwise set IP_MF. */
 		if (off + len >= ip_len)
 			len = ip_len - off;
 		else
 			mhip->ip_off |= IP_MF;
 		mhip->ip_len = htons((u_short)(len + mhlen));
 		m->m_next = m_copym(m0, off, len, M_NOWAIT);
 		if (m->m_next == NULL) {	/* copy failed */
 			m_free(m);
 			error = ENOBUFS;	/* ??? */
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		m->m_pkthdr.len = mhlen + len;
 		m->m_pkthdr.rcvif = NULL;
 #ifdef MAC
 		mac_netinet_fragment(m0, m);
 #endif
 		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
 		mhip->ip_off = htons(mhip->ip_off);
 		mhip->ip_sum = 0;
 		/* Checksum the header here unless the flags offer CSUM_IP. */
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 			mhip->ip_sum = in_cksum(m, mhlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 	}
 	IPSTAT_ADD(ips_ofragments, nfrags);
 
 	/*
 	 * Update first fragment by trimming what's been copied out
 	 * and updating header.
 	 */
 	/* The argument is negative here, i.e. (hlen + firstlen) - ip_len. */
 	m_adj(m0, hlen + firstlen - ip_len);
 	m0->m_pkthdr.len = hlen + firstlen;
 	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
 	ip->ip_off = htons(ip_off | IP_MF);
 	ip->ip_sum = 0;
 	if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 		ip->ip_sum = in_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 	}
 
 done:
 	/* Reached on success and on ENOBUFS; hand the list back either way. */
 	*m_frag = m0;
 	return error;
 }
 
 /*
  * Compute the checksum over the part of the packet in 'm' that follows the
  * IP header and store it in the first mbuf, at the IP header length plus
  * m_pkthdr.csum_data.  With CSUM_UDP set, a computed value of 0 is stored
  * as 0xffff.  If the store location is not inside the first mbuf, a
  * diagnostic is printed and nothing is written.
  */
 void
 in_delayed_cksum(struct mbuf *m)
 {
 	struct ip *iph = mtod(m, struct ip *);
 	uint16_t hdrlen, totlen, sum, where;
 
 	hdrlen = iph->ip_hl << 2 ;
 	totlen = ntohs(iph->ip_len);
 
 	sum = in_cksum_skip(m, totlen, hdrlen);
 	if ((m->m_pkthdr.csum_flags & CSUM_UDP) && sum == 0)
 		sum = 0xffff;
 
 	/* Location of the checksum field within the packet. */
 	where = hdrlen;
 	where += m->m_pkthdr.csum_data;
 
 	if (where + sizeof(u_short) <= m->m_len) {
 		*(u_short *)(m->m_data + where) = sum;
 		return;
 	}
 
 	/*
 	 * XXX
 	 * this shouldn't happen, but if it does, the
 	 * correct behavior may be to insert the checksum
 	 * in the appropriate next mbuf in the chain.
 	 */
 	printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
 	    m->m_len, where, iph->ip_p);
 }
 
 /*
  * IP socket option processing.
  */
 int
 ip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 
 	error = optval = 0;
 	if (sopt->sopt_level != IPPROTO_IP) {
 		error = EINVAL;
 
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					inp->inp_flags2 |= INP_REUSEADDR;
 				else
 					inp->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			default:
 				break;
 			}
 		}
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 #ifdef notyet
 		case IP_RETOPTS:
 #endif
 		{
 			struct mbuf *m;
 			if (sopt->sopt_valsize > MLEN) {
 				error = EMSGSIZE;
 				break;
 			}
 			m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 			if (m == NULL) {
 				error = ENOBUFS;
 				break;
 			}
 			m->m_len = sopt->sopt_valsize;
 			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
 					    m->m_len);
 			if (error) {
 				m_free(m);
 				break;
 			}
 			INP_WLOCK(inp);
 			error = ip_pcbopts(inp, sopt->sopt_name, m);
 			INP_WUNLOCK(inp);
 			return (error);
 		}
 
 		case IP_BINDANY:
 			if (sopt->sopt_td != NULL) {
 				error = priv_check(sopt->sopt_td,
 				    PRIV_NETINET_BINDANY);
 				if (error)
 					break;
 			}
 			/* FALLTHROUGH */
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_FAITH:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_RECVTOS:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				inp->inp_ip_tos = optval;
 				break;
 
 			case IP_TTL:
 				inp->inp_ip_ttl = optval;
 				break;
 
 			case IP_MINTTL:
 				if (optval >= 0 && optval <= MAXTTL)
 					inp->inp_ip_minttl = optval;
 				else
 					error = EINVAL;
 				break;
 
 #define	OPTSET(bit) do {						\
 	INP_WLOCK(inp);							\
 	if (optval)							\
 		inp->inp_flags |= bit;					\
 	else								\
 		inp->inp_flags &= ~bit;					\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 			case IP_RECVOPTS:
 				OPTSET(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				OPTSET(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				OPTSET(INP_RECVDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				OPTSET(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				OPTSET(INP_RECVIF);
 				break;
 
 			case IP_FAITH:
 				OPTSET(INP_FAITH);
 				break;
 
 			case IP_ONESBCAST:
 				OPTSET(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				OPTSET(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				OPTSET(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				OPTSET(INP_RECVTOS);
 				break;
 			}
 			break;
 #undef OPTSET
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_ADD_MEMBERSHIP:
 		case IP_DROP_MEMBERSHIP:
 		case IP_ADD_SOURCE_MEMBERSHIP:
 		case IP_DROP_SOURCE_MEMBERSHIP:
 		case IP_BLOCK_SOURCE:
 		case IP_UNBLOCK_SOURCE:
 		case IP_MSFILTER:
 		case MCAST_JOIN_GROUP:
 		case MCAST_LEAVE_GROUP:
 		case MCAST_JOIN_SOURCE_GROUP:
 		case MCAST_LEAVE_SOURCE_GROUP:
 		case MCAST_BLOCK_SOURCE:
 		case MCAST_UNBLOCK_SOURCE:
 			error = inp_setmoptions(inp, sopt);
 			break;
 
 		case IP_PORTRANGE:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			INP_WLOCK(inp);
 			switch (optval) {
 			case IP_PORTRANGE_DEFAULT:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				break;
 
 			case IP_PORTRANGE_HIGH:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags |= INP_HIGHPORT;
 				break;
 
 			case IP_PORTRANGE_LOW:
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				inp->inp_flags |= INP_LOWPORT;
 				break;
 
 			default:
 				error = EINVAL;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			break;
 
 #ifdef IPSEC
 		case IP_IPSEC_POLICY:
 		{
 			caddr_t req;
 			struct mbuf *m;
 
 			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
 				break;
 			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
 				break;
 			req = mtod(m, caddr_t);
 			error = ipsec_set_policy(inp, sopt->sopt_name, req,
 			    m->m_len, (sopt->sopt_td != NULL) ?
 			    sopt->sopt_td->td_ucred : NULL);
 			m_freem(m);
 			break;
 		}
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 		case IP_RETOPTS:
 			if (inp->inp_options)
 				error = sooptcopyout(sopt, 
 						     mtod(inp->inp_options,
 							  char *),
 						     inp->inp_options->m_len);
 			else
 				sopt->sopt_valsize = 0;
 			break;
 
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_PORTRANGE:
 		case IP_FAITH:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_BINDANY:
 		case IP_RECVTOS:
 			switch (sopt->sopt_name) {
 
 			case IP_TOS:
 				optval = inp->inp_ip_tos;
 				break;
 
 			case IP_TTL:
 				optval = inp->inp_ip_ttl;
 				break;
 
 			case IP_MINTTL:
 				optval = inp->inp_ip_minttl;
 				break;
 
 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
 
 			case IP_RECVOPTS:
 				optval = OPTBIT(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				optval = OPTBIT(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				optval = OPTBIT(INP_RECVDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				optval = OPTBIT(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				optval = OPTBIT(INP_RECVIF);
 				break;
 
 			case IP_PORTRANGE:
 				if (inp->inp_flags & INP_HIGHPORT)
 					optval = IP_PORTRANGE_HIGH;
 				else if (inp->inp_flags & INP_LOWPORT)
 					optval = IP_PORTRANGE_LOW;
 				else
 					optval = 0;
 				break;
 
 			case IP_FAITH:
 				optval = OPTBIT(INP_FAITH);
 				break;
 
 			case IP_ONESBCAST:
 				optval = OPTBIT(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				optval = OPTBIT(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				optval = OPTBIT(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				optval = OPTBIT(INP_RECVTOS);
 				break;
 			}
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_MSFILTER:
 			error = inp_getmoptions(inp, sopt);
 			break;
 
 #ifdef IPSEC
 		case IP_IPSEC_POLICY:
 		{
 			struct mbuf *m = NULL;
 			caddr_t req = NULL;
 			size_t len = 0;
 
 			if (m != 0) {
 				req = mtod(m, caddr_t);
 				len = m->m_len;
 			}
 			error = ipsec_get_policy(sotoinpcb(so), req, len, &m);
 			if (error == 0)
 				error = soopt_mcopyout(sopt, m); /* XXX */
 			if (error == 0)
 				m_freem(m);
 			break;
 		}
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * Routine called from ip_output() to loop back a copy of an IP multicast
  * packet to the input queue of a specified interface.  Note that this
  * calls the output routine of the loopback "driver", but with an interface
  * pointer that might NOT be a loopback interface -- evil, but easier than
  * replicating that code here.
  */
 static void
 ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
     int hlen)
 {
 	register struct ip *ip;
 	struct mbuf *copym;
 
 	/*
 	 * Make a deep copy of the packet because we're going to
 	 * modify the pack in order to generate checksums.
 	 */
 	copym = m_dup(m, M_NOWAIT);
 	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
 		copym = m_pullup(copym, hlen);
 	if (copym != NULL) {
 		/* If needed, compute the checksum and mark it as valid. */
 		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			in_delayed_cksum(copym);
 			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 			copym->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			copym->m_pkthdr.csum_data = 0xffff;
 		}
 		/*
 		 * We don't bother to fragment if the IP length is greater
 		 * than the interface's MTU.  Can this possibly matter?
 		 */
 		ip = mtod(copym, struct ip *);
 		ip->ip_sum = 0;
 		ip->ip_sum = in_cksum(copym, hlen);
 #if 1 /* XXX */
 		if (dst->sin_family != AF_INET) {
 			printf("ip_mloopback: bad address family %d\n",
 						dst->sin_family);
 			dst->sin_family = AF_INET;
 		}
 #endif
 		if_simloop(ifp, copym, dst->sin_family, 0);
 	}
 }
Index: head/sys/netinet/sctp_dtrace_declare.h
===================================================================
--- head/sys/netinet/sctp_dtrace_declare.h	(revision 258540)
+++ head/sys/netinet/sctp_dtrace_declare.h	(revision 258541)
@@ -1,80 +1,79 @@
 /*-
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifndef _NETINET_SCTP_DTRACE_DECLARE_H_
 #define _NETINET_SCTP_DTRACE_DECLARE_H_
 
-#include "opt_kdtrace.h"
 #include <sys/kernel.h>
 #include <sys/sdt.h>
 
 /* Declare the SCTP provider */
 SDT_PROVIDER_DECLARE(sctp);
 
 /* The probes we have so far: */
 
 /* One to track a net's cwnd */
 /* initial */
 SDT_PROBE_DECLARE(sctp, cwnd, net, init);
 /* update at a ack -- increase */
 SDT_PROBE_DECLARE(sctp, cwnd, net, ack);
 /* update at a fast retransmit -- decrease */
 SDT_PROBE_DECLARE(sctp, cwnd, net, fr);
 /* update at a time-out -- decrease */
 SDT_PROBE_DECLARE(sctp, cwnd, net, to);
 /* update at a burst-limit -- decrease */
 SDT_PROBE_DECLARE(sctp, cwnd, net, bl);
 /* update at a ECN -- decrease */
 SDT_PROBE_DECLARE(sctp, cwnd, net, ecn);
 /* update at a Packet-Drop -- decrease */
 SDT_PROBE_DECLARE(sctp, cwnd, net, pd);
 /* Rttvar probe declaration */
 SDT_PROBE_DECLARE(sctp, cwnd, net, rttvar);
 SDT_PROBE_DECLARE(sctp, cwnd, net, rttstep);
 
 /* One to track an associations rwnd */
 SDT_PROBE_DECLARE(sctp, rwnd, assoc, val);
 
 /* One to track a net's flight size */
 SDT_PROBE_DECLARE(sctp, flightsize, net, val);
 
 /* One to track an associations flight size */
 SDT_PROBE_DECLARE(sctp, flightsize, assoc, val);
 
 
 
 
 
 
 #endif
Index: head/sys/netinet/sctp_dtrace_define.h
===================================================================
--- head/sys/netinet/sctp_dtrace_define.h	(revision 258540)
+++ head/sys/netinet/sctp_dtrace_define.h	(revision 258541)
@@ -1,176 +1,175 @@
 /*-
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifndef _NETINET_SCTP_DTRACE_DEFINE_H_
 #define _NETINET_SCTP_DTRACE_DEFINE_H_
 
-#include "opt_kdtrace.h"
 #include <sys/kernel.h>
 #include <sys/sdt.h>
 
 SDT_PROVIDER_DEFINE(sctp);
 
 /********************************************************/
 /* Cwnd probe - tracks changes in the congestion window on a netp */
 /********************************************************/
 /* Initial */
 SDT_PROBE_DEFINE5(sctp, cwnd, net, init, init,
     "uint32_t",		/* The Vtag for this end */
     "uint32_t",		/*
 			 * The port number of the local side << 16 | port number
 			 * of remote in network byte order.
 			 */
     "uintptr_t",	/* The pointer to the struct sctp_nets * changing */
     "int",		/* The old value of the cwnd */
     "int");		/* The new value of the cwnd */
 
 /* ACK-INCREASE */
 SDT_PROBE_DEFINE5(sctp, cwnd, net, ack, ack,
     "uint32_t",		/* The Vtag for this end */
     "uint32_t",		/*
 			 * The port number of the local side << 16 | port number
 			 * of remote in network byte order.
 			 */
     "uintptr_t",	/* The pointer to the struct sctp_nets * changing */
     "int",		/* The old value of the cwnd */
     "int");		/* The new value of the cwnd */
 
 /* ACK-INCREASE */
 SDT_PROBE_DEFINE5(sctp, cwnd, net, rttvar, rttvar,
     "uint64_t",		/* The Vtag << 32 | localport << 16 | remoteport */
     "uint64_t",		/* obw | nbw */
     "uint64_t",		/* bwrtt | newrtt */
     "uint64_t",		/* flight */
     "uint64_t");	/* (cwnd << 32) | point << 16 | retval(0/1) */
 
 SDT_PROBE_DEFINE5(sctp, cwnd, net, rttstep, rttstep,
     "uint64_t",		/* The Vtag << 32 | localport << 16 | remoteport */
     "uint64_t",		/* obw | nbw */
     "uint64_t",		/* bwrtt | newrtt */
     "uint64_t",		/* flight */
     "uint64_t");	/* (cwnd << 32) | point << 16 | retval(0/1) */
 
 /* FastRetransmit-DECREASE */
 SDT_PROBE_DEFINE5(sctp, cwnd, net, fr, fr,
     "uint32_t",		/* The Vtag for this end */
     "uint32_t",		/*
 			 * The port number of the local side << 16 | port number
 			 * of remote in network byte order.
 			 */
     "uintptr_t",	/* The pointer to the struct sctp_nets * changing */
     "int",		/* The old value of the cwnd */
     "int");		/* The new value of the cwnd */
 
 /* TimeOut-DECREASE */
 SDT_PROBE_DEFINE5(sctp, cwnd, net, to, to,
     "uint32_t",		/* The Vtag for this end */
     "uint32_t",		/*
 			 * The port number of the local side << 16 | port number
 			 * of remote in network byte order.
 			 */
     "uintptr_t",	/* The pointer to the struct sctp_nets * changing */
     "int",		/* The old value of the cwnd */
     "int");		/* The new value of the cwnd */
 
 /* BurstLimit-DECREASE */
 SDT_PROBE_DEFINE5(sctp, cwnd, net, bl, bl,
     "uint32_t",		/* The Vtag for this end */
     "uint32_t",		/*
 			 * The port number of the local side << 16 | port number
 			 * of remote in network byte order.
 			 */
     "uintptr_t",	/* The pointer to the struct sctp_nets * changing */
     "int",		/* The old value of the cwnd */
     "int");		/* The new value of the cwnd */
 
 /* ECN-DECREASE */
 SDT_PROBE_DEFINE5(sctp, cwnd, net, ecn, ecn,
     "uint32_t",		/* The Vtag for this end */
     "uint32_t",		/*
 			 * The port number of the local side << 16 | port number
 			 * of remote in network byte order.
 			 */
     "uintptr_t",	/* The pointer to the struct sctp_nets * changing */
     "int",		/* The old value of the cwnd */
     "int");		/* The new value of the cwnd */
 
 /* PacketDrop-DECREASE */
 SDT_PROBE_DEFINE5(sctp, cwnd, net, pd, pd,
     "uint32_t",		/* The Vtag for this end */
     "uint32_t",		/*
 			 * The port number of the local side << 16 | port number
 			 * of remote in network byte order.
 			 */
     "uintptr_t",	/* The pointer to the struct sctp_nets * changing */
     "int",		/* The old value of the cwnd */
     "int");		/* The new value of the cwnd */
 
 /********************************************************/
 /* Rwnd probe - tracks changes in the receiver window for an assoc */
 /********************************************************/
 SDT_PROBE_DEFINE4(sctp, rwnd, assoc, val, val,
     "uint32_t",		/* The Vtag for this end */
     "uint32_t",		/*
 			 * The port number of the local side << 16 | port number
 			 * of remote in network byte order.
 			 */
     "int",		/* The up/down amount */
     "int");		/* The new value of the cwnd */
 
 /********************************************************/
 /* flight probe - tracks changes in the flight size on a net or assoc */
 /********************************************************/
 SDT_PROBE_DEFINE5(sctp, flightsize, net, val, val,
     "uint32_t",		/* The Vtag for this end */
     "uint32_t",		/*
 			 * The port number of the local side << 16 | port number
 			 * of remote in network byte order.
 			 */
     "uintptr_t",        /* The pointer to the struct sctp_nets * changing */
     "int",		/* The up/down amount */
     "int");		/* The new value of the cwnd */
 
 /********************************************************/
 /* The total flight version */
 /********************************************************/
 SDT_PROBE_DEFINE4(sctp, flightsize, assoc, val, val,
     "uint32_t",		/* The Vtag for this end */
     "uint32_t",		/*
 			 * The port number of the local side << 16 | port number
 			 * of remote in network byte order.
 			 */
     "int",		/* The up/down amount */
     "int");		/* The new value of the cwnd */
 
 #endif
Index: head/sys/netinet/tcp_input.c
===================================================================
--- head/sys/netinet/tcp_input.c	(revision 258540)
+++ head/sys/netinet/tcp_input.c	(revision 258541)
@@ -1,3739 +1,3738 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2007-2008,2010
  *	Swinburne University of Technology, Melbourne, Australia.
  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010 The FreeBSD Foundation
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by Lawrence Stewart,
  * James Healy and David Hayes, made possible in part by a grant from the Cisco
  * University Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced
  * Internet Architectures, Swinburne University of Technology, Melbourne,
  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipfw.h"		/* for ipfw_fwd	*/
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
-#include "opt_kdtrace.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/hhook.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_syncache.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif /* TCPDEBUG */
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 const int tcprexmtthresh = 3;
 
 int tcp_log_in_vain = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
     &tcp_log_in_vain, 0,
     "Log all incoming TCP segments to closed ports");
 
 VNET_DEFINE(int, blackhole) = 0;
 #define	V_blackhole		VNET(blackhole)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
     &VNET_NAME(blackhole), 0,
     "Do not send RST on segments to closed ports");
 
 VNET_DEFINE(int, tcp_delack_enabled) = 1;
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
     &VNET_NAME(tcp_delack_enabled), 0,
     "Delay ACK to try and piggyback it onto a data packet");
 
 VNET_DEFINE(int, drop_synfin) = 0;
 #define	V_drop_synfin		VNET(drop_synfin)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
     &VNET_NAME(drop_synfin), 0,
     "Drop TCP packets with SYN+FIN set");
 
 VNET_DEFINE(int, tcp_do_rfc3042) = 1;
 #define	V_tcp_do_rfc3042	VNET(tcp_do_rfc3042)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3042), 0,
     "Enable RFC 3042 (Limited Transmit)");
 
 VNET_DEFINE(int, tcp_do_rfc3390) = 1;
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3390), 0,
     "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, experimental, CTLFLAG_RW, 0,
     "Experimental TCP extensions");
 
 VNET_DEFINE(int, tcp_do_initcwnd10) = 1;
 SYSCTL_VNET_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_RW,
     &VNET_NAME(tcp_do_initcwnd10), 0,
     "Enable draft-ietf-tcpm-initcwnd-05 (Increasing initial CWND to 10)");
 
 VNET_DEFINE(int, tcp_do_rfc3465) = 1;
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3465), 0,
     "Enable RFC 3465 (Appropriate Byte Counting)");
 
 VNET_DEFINE(int, tcp_abc_l_var) = 2;
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
     &VNET_NAME(tcp_abc_l_var), 2,
     "Cap the max cwnd increment during slow-start to this number of segments");
 
 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
 
 VNET_DEFINE(int, tcp_do_ecn) = 0;
 SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
     &VNET_NAME(tcp_do_ecn), 0,
     "TCP ECN support");
 
 VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
 SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
     &VNET_NAME(tcp_ecn_maxretries), 0,
     "Max retries before giving up on ECN");
 
 VNET_DEFINE(int, tcp_insecure_rst) = 0;
 #define	V_tcp_insecure_rst	VNET(tcp_insecure_rst)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
     &VNET_NAME(tcp_insecure_rst), 0,
     "Follow the old (insecure) criteria for accepting RST packets");
 
 VNET_DEFINE(int, tcp_recvspace) = 1024*64;
 #define	V_tcp_recvspace	VNET(tcp_recvspace)
 SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
     &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size");
 
 VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
 #define	V_tcp_do_autorcvbuf	VNET(tcp_do_autorcvbuf)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
     &VNET_NAME(tcp_do_autorcvbuf), 0,
     "Enable automatic receive buffer sizing");
 
 VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024;
 #define	V_tcp_autorcvbuf_inc	VNET(tcp_autorcvbuf_inc)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
     &VNET_NAME(tcp_autorcvbuf_inc), 0,
     "Incrementor step size of automatic receive buffer");
 
 VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024;
 #define	V_tcp_autorcvbuf_max	VNET(tcp_autorcvbuf_max)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
     &VNET_NAME(tcp_autorcvbuf_max), 0,
     "Max size of automatic receive buffer");
 
 VNET_DEFINE(struct inpcbhead, tcb);
 #define	tcb6	tcb  /* for KAME src sync over BSD*'s */
 VNET_DEFINE(struct inpcbinfo, tcbinfo);
 
 static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
 static void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
 		     struct socket *, struct tcpcb *, int, int, uint8_t,
 		     int);
 static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 		     struct tcpcb *, int, int);
 static void	 tcp_pulloutofband(struct socket *,
 		     struct tcphdr *, struct mbuf *, int);
 static void	 tcp_xmit_timer(struct tcpcb *, int);
 static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
 static void inline 	tcp_fields_to_host(struct tcphdr *);
 #ifdef TCP_SIGNATURE
 static void inline 	tcp_fields_to_net(struct tcphdr *);
 static int inline	tcp_signature_verify_input(struct mbuf *, int, int,
 			    int, struct tcpopt *, struct tcphdr *, u_int);
 #endif
 static void inline	cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
 			    uint16_t type);
 static void inline	cc_conn_init(struct tcpcb *tp);
 static void inline	cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
 static void inline	hhook_run_tcp_est_in(struct tcpcb *tp,
 			    struct tcphdr *th, struct tcpopt *to);
 
 /*
  * TCP statistics are stored in an "array" of counter(9)s.
  */
 VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat);
 VNET_PCPUSTAT_SYSINIT(tcpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat,
     tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(tcpstat);
 #endif /* VIMAGE */
 /*
  * Kernel module interface for updating tcpstat.  The argument is an index
  * into tcpstat treated as an array.
  */
 void
 kmod_tcpstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(tcpstat)[statnum], 1);
 }
 
 /*
  * Wrapper for the TCP established input helper hook.
  */
 static void inline
 hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
 {
 	struct tcp_hhook_data hhook_data;
 
 	if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) {
 		hhook_data.tp = tp;
 		hhook_data.th = th;
 		hhook_data.to = to;
 
 		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data,
 		    tp->osd);
 	}
 }
 
 /*
  * CC wrapper hook functions
  */
 static void inline
 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
 	if (tp->snd_cwnd <= tp->snd_wnd)
 		tp->ccv->flags |= CCF_CWND_LIMITED;
 	else
 		tp->ccv->flags &= ~CCF_CWND_LIMITED;
 
 	if (type == CC_ACK) {
 		if (tp->snd_cwnd > tp->snd_ssthresh) {
 			tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
 			     V_tcp_abc_l_var * tp->t_maxseg);
 			if (tp->t_bytes_acked >= tp->snd_cwnd) {
 				tp->t_bytes_acked -= tp->snd_cwnd;
 				tp->ccv->flags |= CCF_ABC_SENTAWND;
 			}
 		} else {
 				tp->ccv->flags &= ~CCF_ABC_SENTAWND;
 				tp->t_bytes_acked = 0;
 		}
 	}
 
 	if (CC_ALGO(tp)->ack_received != NULL) {
 		/* XXXLAS: Find a way to live without this */
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->ack_received(tp->ccv, type);
 	}
 }
 
 /*
  * Seed a connection's RTT estimate, slow-start threshold and initial
  * congestion window, then let the congestion control module run its own
  * conn_init hook if it has one.
  *
  * tp - connection control block; its inpcb must be write-locked.
  *
  * Cached per-destination metrics are fetched with tcp_hc_get() and are only
  * applied where they are non-zero.
  */
 static void inline
 cc_conn_init(struct tcpcb *tp)
 {
 	struct hc_metrics_lite metrics;
 	int rtt, segs;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tcp_hc_get(&tp->t_inpcb->inp_inc, &metrics);
 
 	/* Use the cached RTT only if we have no estimate of our own yet. */
 	rtt = metrics.rmx_rtt;
 	if (tp->t_srtt == 0 && rtt != 0) {
 		tp->t_srtt = rtt;
 		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
 		TCPSTAT_INC(tcps_usedrtt);
 		if (metrics.rmx_rttvar == 0) {
 			/* default variation is +- 1 rtt */
 			tp->t_rttvar =
 			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
 		} else {
 			tp->t_rttvar = metrics.rmx_rttvar;
 			TCPSTAT_INC(tcps_usedrttvar);
 		}
 		TCPT_RANGESET(tp->t_rxtcur,
 		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
 		    tp->t_rttmin, TCPTV_REXMTMAX);
 	}
 
 	if (metrics.rmx_ssthresh) {
 		/*
 		 * There's some sort of gateway or interface buffer limit on
 		 * the path.  Use this to set the slow start threshold, but
 		 * never let it drop below 2*mss.
 		 */
 		tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh);
 		TCPSTAT_INC(tcps_usedssthresh);
 	}
 
 	/*
 	 * Set the initial slow-start flight size.
 	 *
 	 * RFC5681 Section 3.1 specifies the default conservative values.
 	 * RFC3390 specifies slightly more aggressive values.
 	 * Draft-ietf-tcpm-initcwnd-05 increases it to ten segments.
 	 *
 	 * If a SYN or SYN/ACK was lost and retransmitted, we have to
 	 * reduce the initial CWND to one segment as congestion is likely
 	 * requiring us to be cautious.
 	 */
 	if (tp->snd_cwnd == 1) {
 		/* SYN(-ACK) lost */
 		tp->snd_cwnd = tp->t_maxseg;
 	} else if (V_tcp_do_initcwnd10) {
 		tp->snd_cwnd = min(10 * tp->t_maxseg,
 		    max(2 * tp->t_maxseg, 14600));
 	} else if (V_tcp_do_rfc3390) {
 		tp->snd_cwnd = min(4 * tp->t_maxseg,
 		    max(2 * tp->t_maxseg, 4380));
 	} else {
 		/* Per RFC5681 Section 3.1: 2, 3 or 4 segments by MSS size. */
 		if (tp->t_maxseg > 2190)
 			segs = 2;
 		else if (tp->t_maxseg > 1095)
 			segs = 3;
 		else
 			segs = 4;
 		tp->snd_cwnd = segs * tp->t_maxseg;
 	}
 
 	if (CC_ALGO(tp)->conn_init != NULL)
 		(*CC_ALGO(tp)->conn_init)(tp->ccv);
 }
 
 void inline
 cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	switch(type) {
 	case CC_NDUPACK:
 		if (!IN_FASTRECOVERY(tp->t_flags)) {
 			tp->snd_recover = tp->snd_max;
 			if (tp->t_flags & TF_ECN_PERMIT)
 				tp->t_flags |= TF_ECN_SND_CWR;
 		}
 		break;
 	case CC_ECN:
 		if (!IN_CONGRECOVERY(tp->t_flags)) {
 			TCPSTAT_INC(tcps_ecn_rcwnd);
 			tp->snd_recover = tp->snd_max;
 			if (tp->t_flags & TF_ECN_PERMIT)
 				tp->t_flags |= TF_ECN_SND_CWR;
 		}
 		break;
 	case CC_RTO:
 		tp->t_dupacks = 0;
 		tp->t_bytes_acked = 0;
 		EXIT_RECOVERY(tp->t_flags);
 		tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
 		    tp->t_maxseg) * tp->t_maxseg;
 		tp->snd_cwnd = tp->t_maxseg;
 		break;
 	case CC_RTO_ERR:
 		TCPSTAT_INC(tcps_sndrexmitbad);
 		/* RTO was unnecessary, so reset everything. */
 		tp->snd_cwnd = tp->snd_cwnd_prev;
 		tp->snd_ssthresh = tp->snd_ssthresh_prev;
 		tp->snd_recover = tp->snd_recover_prev;
 		if (tp->t_flags & TF_WASFRECOVERY)
 			ENTER_FASTRECOVERY(tp->t_flags);
 		if (tp->t_flags & TF_WASCRECOVERY)
 			ENTER_CONGRECOVERY(tp->t_flags);
 		tp->snd_nxt = tp->snd_max;
 		tp->t_flags &= ~TF_PREVVALID;
 		tp->t_badrxtwin = 0;
 		break;
 	}
 
 	if (CC_ALGO(tp)->cong_signal != NULL) {
 		if (th != NULL)
 			tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->cong_signal(tp->ccv, type);
 	}
 }
 
 /*
  * Notify the congestion control module (via its optional post_recovery
  * hook) and reset the acked-byte counter.
  *
  * tp - connection control block; its inpcb must be write-locked.
  * th - header of the current segment; its ACK number is handed to the
  *      module through ccv->curack before the hook runs.
  */
 static void inline
 cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/* XXXLAS: KASSERT that we're in recovery? */
 
 	if (CC_ALGO(tp)->post_recovery != NULL) {
 		tp->ccv->curack = th->th_ack;
 		(*CC_ALGO(tp)->post_recovery)(tp->ccv);
 	}
 
 	/* XXXLAS: EXIT_RECOVERY ? */
 	/* The byte count is cleared only after the hook has run. */
 	tp->t_bytes_acked = 0;
 }
 
 /*
  * Convert the multi-byte TCP header fields (sequence number, ACK number,
  * window and urgent pointer) from network to host byte order, in place.
  */
 static inline void
 tcp_fields_to_host(struct tcphdr *th)
 {
 
 	/* 16-bit fields. */
 	th->th_win = ntohs(th->th_win);
 	th->th_urp = ntohs(th->th_urp);
 	/* 32-bit fields. */
 	th->th_seq = ntohl(th->th_seq);
 	th->th_ack = ntohl(th->th_ack);
 }
 
 #ifdef TCP_SIGNATURE
 /*
  * Convert the multi-byte TCP header fields (sequence number, ACK number,
  * window and urgent pointer) from host to network byte order, in place.
  */
 static inline void
 tcp_fields_to_net(struct tcphdr *th)
 {
 
 	/* 16-bit fields. */
 	th->th_win = htons(th->th_win);
 	th->th_urp = htons(th->th_urp);
 	/* 32-bit fields. */
 	th->th_seq = htonl(th->th_seq);
 	th->th_ack = htonl(th->th_ack);
 }
 
 /*
  * Run tcp_signature_verify() on an input segment whose header fields have
  * already been converted to host byte order.  The header is flipped back
  * to network order for the duration of the call and restored to host
  * order afterwards.
  *
  * Returns whatever tcp_signature_verify() returned.
  */
 static inline int
 tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
     struct tcpopt *to, struct tcphdr *th, u_int tcpbflag)
 {
 	int verdict;
 
 	tcp_fields_to_net(th);
 	verdict = tcp_signature_verify(m, off0, tlen, optlen, to, th,
 	    tcpbflag);
 	tcp_fields_to_host(th);
 
 	return (verdict);
 }
 #endif
 
 /*
  * Neighbor Discovery / Neighbor Unreachability Detection upper layer hint.
  * Calls nd6_nud_hint() when tp is non-NULL and has an inpcb flagged
  * INP_IPV6.  Expands to nothing when INET6 is not configured.
  */
 #ifdef INET6
 #define ND6_HINT(tp)							\
 do {									\
 	if ((tp) != NULL && (tp)->t_inpcb != NULL &&			\
 	    ((tp)->t_inpcb->inp_vflag & INP_IPV6))			\
 		nd6_nud_hint(NULL, NULL, 0);				\
 } while (0)
 #else
 #define ND6_HINT(tp)
 #endif
 
 /*
  * Indicate whether this ack should be delayed.  We can delay the ack only
  * if ALL of the following hold:
  *	- there is no delayed ack timer in progress;
  *	- our last ack wasn't a 0-sized window (we never want to delay
  *	  the ack that opens up a 0-sized window);
  *	- the segment length does not exceed tp->t_maxopd;
  *	- delayed acks are enabled, or TF_NEEDSYN is set on the connection.
  *
  * tp   - pointer to the connection's tcpcb (evaluated more than once).
  * tlen - length of the received segment.
  *
  * Both arguments are fully parenthesised in the expansion so that
  * expression arguments (e.g. "p + 1" or "a + b") expand correctly.
  */
 #define DELAY_ACK(tp, tlen)						\
 	((!tcp_timer_active((tp), TT_DELACK) &&				\
 	    ((tp)->t_flags & TF_RXWIN0SENT) == 0) &&			\
 	    ((tlen) <= (tp)->t_maxopd) &&				\
 	    (V_tcp_delack_enabled || ((tp)->t_flags & TF_NEEDSYN)))
 
 /*
  * TCP input handling is split into multiple parts:
  *   tcp6_input is a thin wrapper around tcp_input for the extended
  *	ip6_protox[] call format in ip6_input
  *   tcp_input handles primary segment validation, inpcb lookup and
  *	SYN processing on listen sockets
  *   tcp_do_segment processes the ACK and text of the segment for
  *	establishing, established and closing connections
  */
 #ifdef INET6
 int
 tcp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct in6_ifaddr *ia6;
 
 	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
 
 	/*
 	 * draft-itojun-ipv6-tcp-to-anycast
 	 * better place to put this in?
 	 */
 	ia6 = ip6_getdstifaddr(m);
 	if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
 		struct ip6_hdr *ip6;
 
 		ifa_free(&ia6->ia_ifa);
 		ip6 = mtod(m, struct ip6_hdr *);
 		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
 			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
 		return IPPROTO_DONE;
 	}
 	if (ia6)
 		ifa_free(&ia6->ia_ifa);
 
 	tcp_input(m, *offp);
 	return IPPROTO_DONE;
 }
 #endif /* INET6 */
 
 /*
  * Entry point for received TCP segments (IPv4, and IPv6 via tcp6_input).
  *
  * m    - mbuf chain starting at the IP (or IPv6) header.
  * off0 - offset of the TCP header from the start of that header.
  *
  * Verifies the checksum and the TCP header length, converts the header
  * fields to host byte order, and looks up the inpcb for the segment
  * (honouring a PACKET_TAG_IPFORWARD tag when the mbuf is flagged with a
  * next hop).  Depending on what is found the segment is then
  *   - dropped, optionally answered with a RST,
  *   - handed to tcp_twcheck() for an inpcb in TIMEWAIT,
  *   - run through the syncache for a listening socket, or
  *   - passed to tcp_do_segment() for any other connection.
  *
  * ti_locked tracks whether this function currently holds the V_tcbinfo
  * write lock (TI_WLOCKED) or not (TI_UNLOCKED).  Every exit path either
  * passes the mbuf on to a callee that consumes it or frees it at "drop".
  */
 void
 tcp_input(struct mbuf *m, int off0)
 {
 	struct tcphdr *th = NULL;
 	struct ip *ip = NULL;
 	struct inpcb *inp = NULL;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 	u_char *optp = NULL;
 	int optlen = 0;
 #ifdef INET
 	int len;
 #endif
 	int tlen = 0, off;
 	int drop_hdrlen;
 	int thflags;
 	int rstreason = 0;	/* For badport_bandlim accounting purposes */
 #ifdef TCP_SIGNATURE
 	uint8_t sig_checked = 0;
 #endif
 	uint8_t iptos = 0;
 	struct m_tag *fwd_tag = NULL;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 #else
 	const void *ip6 = NULL;
 #endif /* INET6 */
 	struct tcpopt to;		/* options in this segment */
 	char *s = NULL;			/* address and port logging */
 	int ti_locked;
 #define	TI_UNLOCKED	1
 #define	TI_WLOCKED	2
 
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 #endif
 
 #ifdef INET6
 	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
 #endif
 
 	to.to_flags = 0;
 	TCPSTAT_INC(tcps_rcvtotal);
 
 #ifdef INET6
 	if (isipv6) {
 		/* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */
 
 		if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
 			m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
 			if (m == NULL) {
 				TCPSTAT_INC(tcps_rcvshort);
 				return;
 			}
 		}
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)((caddr_t)ip6 + off0);
 		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				th->th_sum = m->m_pkthdr.csum_data;
 			else
 				th->th_sum = in6_cksum_pseudo(ip6, tlen,
 				    IPPROTO_TCP, m->m_pkthdr.csum_data);
 			th->th_sum ^= 0xffff;
 		} else
 			th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen);
 		/* Any non-zero result here means the checksum did not verify. */
 		if (th->th_sum) {
 			TCPSTAT_INC(tcps_rcvbadsum);
 			goto drop;
 		}
 
 		/*
 		 * Be proactive about unspecified IPv6 address in source.
 		 * As we use all-zero to indicate unbounded/unconnected pcb,
 		 * unspecified IPv6 address can be used to confuse us.
 		 *
 		 * Note that packets with unspecified IPv6 destination is
 		 * already dropped in ip6_input.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
 			/* XXX stat */
 			goto drop;
 		}
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		/*
 		 * Get IP and TCP header together in first mbuf.
 		 * Note: IP leaves IP header in first mbuf.
 		 */
 		if (off0 > sizeof (struct ip)) {
 			ip_stripoptions(m);
 			off0 = sizeof(struct ip);
 		}
 		if (m->m_len < sizeof (struct tcpiphdr)) {
 			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
 			    == NULL) {
 				TCPSTAT_INC(tcps_rcvshort);
 				return;
 			}
 		}
 		ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)((caddr_t)ip + off0);
 		tlen = ntohs(ip->ip_len) - off0;
 
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				th->th_sum = m->m_pkthdr.csum_data;
 			else
 				th->th_sum = in_pseudo(ip->ip_src.s_addr,
 				    ip->ip_dst.s_addr,
 				    htonl(m->m_pkthdr.csum_data + tlen +
 				    IPPROTO_TCP));
 			th->th_sum ^= 0xffff;
 		} else {
 			struct ipovly *ipov = (struct ipovly *)ip;
 
 			/*
 			 * Checksum extended TCP header and data.
 			 */
 			len = off0 + tlen;
 			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
 			ipov->ih_len = htons(tlen);
 			th->th_sum = in_cksum(m, len);
 			/* Reset length for SDT probes. */
 			ip->ip_len = htons(tlen + off0);
 		}
 
 		/* Any non-zero result here means the checksum did not verify. */
 		if (th->th_sum) {
 			TCPSTAT_INC(tcps_rcvbadsum);
 			goto drop;
 		}
 		/* Re-initialization for later version check */
 		ip->ip_v = IPVERSION;
 	}
 #endif /* INET */
 
 #ifdef INET6
 	if (isipv6)
 		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 		iptos = ip->ip_tos;
 #endif
 
 	/*
 	 * Check that TCP offset makes sense,
 	 * pull out TCP options and adjust length.		XXX
 	 */
 	off = th->th_off << 2;
 	if (off < sizeof (struct tcphdr) || off > tlen) {
 		TCPSTAT_INC(tcps_rcvbadoff);
 		goto drop;
 	}
 	tlen -= off;	/* tlen is used instead of ti->ti_len */
 	if (off > sizeof (struct tcphdr)) {
 #ifdef INET6
 		if (isipv6) {
 			IP6_EXTHDR_CHECK(m, off0, off, );
 			ip6 = mtod(m, struct ip6_hdr *);
 			th = (struct tcphdr *)((caddr_t)ip6 + off0);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			if (m->m_len < sizeof(struct ip) + off) {
 				if ((m = m_pullup(m, sizeof (struct ip) + off))
 				    == NULL) {
 					TCPSTAT_INC(tcps_rcvshort);
 					return;
 				}
 				ip = mtod(m, struct ip *);
 				th = (struct tcphdr *)((caddr_t)ip + off0);
 			}
 		}
 #endif
 		optlen = off - sizeof (struct tcphdr);
 		optp = (u_char *)(th + 1);
 	}
 	thflags = th->th_flags;
 
 	/*
 	 * Convert TCP protocol specific fields to host format.
 	 */
 	tcp_fields_to_host(th);
 
 	/*
 	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
 	 */
 	drop_hdrlen = off0 + off;
 
 	/*
 	 * Locate pcb for segment; if we're likely to add or remove a
 	 * connection then first acquire pcbinfo lock.  There are two cases
 	 * where we might discover later we need a write lock despite the
 	 * flags: ACKs moving a connection out of the syncache, and ACKs for
 	 * a connection in TIMEWAIT.
 	 */
 	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) {
 		INP_INFO_WLOCK(&V_tcbinfo);
 		ti_locked = TI_WLOCKED;
 	} else
 		ti_locked = TI_UNLOCKED;
 
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
 	if (
 #ifdef INET6
 	    (isipv6 && (m->m_flags & M_IP6_NEXTHOP))
 #ifdef INET
 	    || (!isipv6 && (m->m_flags & M_IP_NEXTHOP))
 #endif
 #endif
 #if defined(INET) && !defined(INET6)
 	    (m->m_flags & M_IP_NEXTHOP)
 #endif
 	    )
 		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 
 findpcb:
 #ifdef INVARIANTS
 	if (ti_locked == TI_WLOCKED) {
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	} else {
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	}
 #endif
 #ifdef INET6
 	if (isipv6 && fwd_tag != NULL) {
 		struct sockaddr_in6 *next_hop6;
 
 		next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * Already got one like this?
 		 */
 		inp = in6_pcblookup_mbuf(&V_tcbinfo,
 		    &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport,
 		    INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src,
 			    th->th_sport, &next_hop6->sin6_addr,
 			    next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) :
 			    th->th_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
 		}
 	} else if (isipv6) {
 		inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
 		    th->th_sport, &ip6->ip6_dst, th->th_dport,
 		    INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	if (fwd_tag != NULL) {
 		struct sockaddr_in *next_hop;
 
 		next_hop = (struct sockaddr_in *)(fwd_tag+1);
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * already got one like this?
 		 */
 		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport,
 		    ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
 			    th->th_sport, next_hop->sin_addr,
 			    next_hop->sin_port ? ntohs(next_hop->sin_port) :
 			    th->th_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
 		}
 	} else
 		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
 		    th->th_sport, ip->ip_dst, th->th_dport,
 		    INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 #endif /* INET */
 
 	/*
 	 * If the INPCB does not exist then all data in the incoming
 	 * segment is discarded and an appropriate RST is sent back.
 	 * XXX MRT Send RST using which routing table?
 	 */
 	if (inp == NULL) {
 		/*
 		 * Log communication attempts to ports that are not
 		 * in use.
 		 */
 		if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
 		    tcp_log_in_vain == 2) {
 			if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6)))
 				log(LOG_INFO, "%s; %s: Connection attempt "
 				    "to closed port\n", s, __func__);
 		}
 		/*
 		 * When blackholing do not respond with a RST but
 		 * completely ignore the segment and drop it.
 		 */
 		if ((V_blackhole == 1 && (thflags & TH_SYN)) ||
 		    V_blackhole == 2)
 			goto dropunlock;
 
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
 	INP_WLOCK_ASSERT(inp);
 	if (!(inp->inp_flags & INP_HW_FLOWID)
 	    && (m->m_flags & M_FLOWID)
 	    && ((inp->inp_socket == NULL)
 		|| !(inp->inp_socket->so_options & SO_ACCEPTCONN))) {
 		inp->inp_flags |= INP_HW_FLOWID;
 		inp->inp_flags &= ~INP_SW_FLOWID;
 		inp->inp_flowid = m->m_pkthdr.flowid;
 	}
 #ifdef IPSEC
 #ifdef INET6
 	if (isipv6 && ipsec6_in_reject(m, inp)) {
 		IPSEC6STAT_INC(ips_in_polvio);
 		goto dropunlock;
 	} else
 #endif /* INET6 */
 	if (ipsec4_in_reject(m, inp) != 0) {
 		IPSECSTAT_INC(ips_in_polvio);
 		goto dropunlock;
 	}
 #endif /* IPSEC */
 
 	/*
 	 * Check the minimum TTL for socket.
 	 *
 	 * The address family is decided first and only then is the matching
 	 * header consulted: for an IPv6 segment "ip" is still NULL, so an
 	 * IPv6 packet whose hop limit passes the check must not fall through
 	 * to the IPv4 ip_ttl comparison.
 	 */
 	if (inp->inp_ip_minttl != 0) {
 #ifdef INET6
 		if (isipv6) {
 			if (inp->inp_ip_minttl > ip6->ip6_hlim)
 				goto dropunlock;
 		} else
 #endif
 		if (inp->inp_ip_minttl > ip->ip_ttl)
 			goto dropunlock;
 	}
 
 	/*
 	 * A previous connection in TIMEWAIT state is supposed to catch stray
 	 * or duplicate segments arriving late.  If this segment was a
 	 * legitimate new connection attempt, the old INPCB gets removed and
 	 * we can try again to find a listening socket.
 	 *
 	 * At this point, due to earlier optimism, we may hold only an inpcb
 	 * lock, and not the inpcbinfo write lock.  If so, we need to try to
 	 * acquire it, or if that fails, acquire a reference on the inpcb,
 	 * drop all locks, acquire a global write lock, and then re-acquire
 	 * the inpcb lock.  We may at that point discover that another thread
 	 * has tried to free the inpcb, in which case we need to loop back
 	 * and try to find a new inpcb to deliver to.
 	 *
 	 * XXXRW: It may be time to rethink timewait locking.
 	 */
 relocked:
 	if (inp->inp_flags & INP_TIMEWAIT) {
 		if (ti_locked == TI_UNLOCKED) {
 			if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
 				in_pcbref(inp);
 				INP_WUNLOCK(inp);
 				INP_INFO_WLOCK(&V_tcbinfo);
 				ti_locked = TI_WLOCKED;
 				INP_WLOCK(inp);
 				if (in_pcbrele_wlocked(inp)) {
 					inp = NULL;
 					goto findpcb;
 				}
 			} else
 				ti_locked = TI_WLOCKED;
 		}
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 		if (thflags & TH_SYN)
 			tcp_dooptions(&to, optp, optlen, TO_SYN);
 		/*
 		 * NB: tcp_twcheck unlocks the INP and frees the mbuf.
 		 */
 		if (tcp_twcheck(inp, &to, th, m, tlen))
 			goto findpcb;
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		return;
 	}
 	/*
 	 * The TCPCB may no longer exist if the connection is winding
 	 * down or it is in the CLOSED state.  Either way we drop the
 	 * segment and send an appropriate response.
 	 */
 	tp = intotcpcb(inp);
 	if (tp == NULL || tp->t_state == TCPS_CLOSED) {
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE) {
 		tcp_offload_input(tp, m);
 		m = NULL;	/* consumed by the TOE driver */
 		goto dropunlock;
 	}
 #endif
 
 	/*
 	 * We've identified a valid inpcb, but it could be that we need an
 	 * inpcbinfo write lock but don't hold it.  In this case, attempt to
 	 * acquire using the same strategy as the TIMEWAIT case above.  If we
 	 * relock, we have to jump back to 'relocked' as the connection might
 	 * now be in TIMEWAIT.
 	 */
 #ifdef INVARIANTS
 	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0)
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 #endif
 	if (tp->t_state != TCPS_ESTABLISHED) {
 		if (ti_locked == TI_UNLOCKED) {
 			if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
 				in_pcbref(inp);
 				INP_WUNLOCK(inp);
 				INP_INFO_WLOCK(&V_tcbinfo);
 				ti_locked = TI_WLOCKED;
 				INP_WLOCK(inp);
 				if (in_pcbrele_wlocked(inp)) {
 					inp = NULL;
 					goto findpcb;
 				}
 				goto relocked;
 			} else
 				ti_locked = TI_WLOCKED;
 		}
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	}
 
 #ifdef MAC
 	INP_WLOCK_ASSERT(inp);
 	if (mac_inpcb_check_deliver(inp, m))
 		goto dropunlock;
 #endif
 	so = inp->inp_socket;
 	KASSERT(so != NULL, ("%s: so == NULL", __func__));
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG) {
 		ostate = tp->t_state;
 #ifdef INET6
 		if (isipv6) {
 			bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6));
 		} else
 #endif
 			bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
 		tcp_savetcp = *th;
 	}
 #endif /* TCPDEBUG */
 	/*
 	 * When the socket is accepting connections (the INPCB is in LISTEN
 	 * state) we look into the SYN cache if this is a new connection
 	 * attempt or the completion of a previous one.  Because listen
 	 * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be
 	 * held in this case.
 	 */
 	if (so->so_options & SO_ACCEPTCONN) {
 		struct in_conninfo inc;
 
 		KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
 		    "tp not listening", __func__));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 		bzero(&inc, sizeof(inc));
 #ifdef INET6
 		if (isipv6) {
 			inc.inc_flags |= INC_ISIPV6;
 			inc.inc6_faddr = ip6->ip6_src;
 			inc.inc6_laddr = ip6->ip6_dst;
 		} else
 #endif
 		{
 			inc.inc_faddr = ip->ip_src;
 			inc.inc_laddr = ip->ip_dst;
 		}
 		inc.inc_fport = th->th_sport;
 		inc.inc_lport = th->th_dport;
 		inc.inc_fibnum = so->so_fibnum;
 
 		/*
 		 * Check for an existing connection attempt in syncache if
 		 * the flag is only ACK.  A successful lookup creates a new
 		 * socket appended to the listen queue in SYN_RECEIVED state.
 		 */
 		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
 			/*
 			 * Parse the TCP options here because
 			 * syncookies need access to the reflected
 			 * timestamp.
 			 */
 			tcp_dooptions(&to, optp, optlen, 0);
 			/*
 			 * NB: syncache_expand() doesn't unlock
 			 * inp and tcpinfo locks.
 			 */
 			if (!syncache_expand(&inc, &to, th, &so, m)) {
 				/*
 				 * No syncache entry or ACK was not
 				 * for our SYN/ACK.  Send a RST.
 				 * NB: syncache did its own logging
 				 * of the failure cause.
 				 */
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 			}
 			if (so == NULL) {
 				/*
 				 * We completed the 3-way handshake
 				 * but could not allocate a socket
 				 * either due to memory shortage,
 				 * listen queue length limits or
 				 * global socket limits.  Send RST
 				 * or wait and have the remote end
 				 * retransmit the ACK for another
 				 * try.
 				 */
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 					log(LOG_DEBUG, "%s; %s: Listen socket: "
 					    "Socket allocation failed due to "
 					    "limits or memory shortage, %s\n",
 					    s, __func__,
 					    V_tcp_sc_rst_sock_fail ?
 					    "sending RST" : "try again");
 				if (V_tcp_sc_rst_sock_fail) {
 					rstreason = BANDLIM_UNLIMITED;
 					goto dropwithreset;
 				} else
 					goto dropunlock;
 			}
 			/*
 			 * Socket is created in state SYN_RECEIVED.
 			 * Unlock the listen socket, lock the newly
 			 * created socket and update the tp variable.
 			 */
 			INP_WUNLOCK(inp);	/* listen socket */
 			inp = sotoinpcb(so);
 			INP_WLOCK(inp);		/* new connection */
 			tp = intotcpcb(inp);
 			KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
 			    ("%s: ", __func__));
 #ifdef TCP_SIGNATURE
 			if (sig_checked == 0)  {
 				tcp_dooptions(&to, optp, optlen,
 				    (thflags & TH_SYN) ? TO_SYN : 0);
 				if (!tcp_signature_verify_input(m, off0, tlen,
 				    optlen, &to, th, tp->t_flags)) {
 
 					/*
 					 * In SYN_SENT state if it receives an
 					 * RST, it is allowed for further
 					 * processing.
 					 */
 					if ((thflags & TH_RST) == 0 ||
 					    (tp->t_state == TCPS_SYN_SENT) == 0)
 						goto dropunlock;
 				}
 				sig_checked = 1;
 			}
 #endif
 
 			/*
 			 * Process the segment and the data it
 			 * contains.  tcp_do_segment() consumes
 			 * the mbuf chain and unlocks the inpcb.
 			 */
 			tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
 			    iptos, ti_locked);
 			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 			return;
 		}
 		/*
 		 * Segment flag validation for new connection attempts:
 		 *
 		 * Our (SYN|ACK) response was rejected.
 		 * Check with syncache and remove entry to prevent
 		 * retransmits.
 		 *
 		 * NB: syncache_chkrst does its own logging of failure
 		 * causes.
 		 */
 		if (thflags & TH_RST) {
 			syncache_chkrst(&inc, th);
 			goto dropunlock;
 		}
 		/*
 		 * We can't do anything without SYN.
 		 */
 		if ((thflags & TH_SYN) == 0) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN is missing, segment ignored\n",
 				    s, __func__);
 			TCPSTAT_INC(tcps_badsyn);
 			goto dropunlock;
 		}
 		/*
 		 * (SYN|ACK) is bogus on a listen socket.
 		 */
 		if (thflags & TH_ACK) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN|ACK invalid, segment rejected\n",
 				    s, __func__);
 			syncache_badack(&inc);	/* XXX: Not needed! */
 			TCPSTAT_INC(tcps_badsyn);
 			rstreason = BANDLIM_RST_OPENPORT;
 			goto dropwithreset;
 		}
 		/*
 		 * If the drop_synfin option is enabled, drop all
 		 * segments with both the SYN and FIN bits set.
 		 * This prevents e.g. nmap from identifying the
 		 * TCP/IP stack.
 		 * XXX: Poor reasoning.  nmap has other methods
 		 * and is constantly refining its stack detection
 		 * strategies.
 		 * XXX: This is a violation of the TCP specification
 		 * and was used by RFC1644.
 		 */
 		if ((thflags & TH_FIN) && V_drop_synfin) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN|FIN segment ignored (based on "
 				    "sysctl setting)\n", s, __func__);
 			TCPSTAT_INC(tcps_badsyn);
 			goto dropunlock;
 		}
 		/*
 		 * Segment's flags are (SYN) or (SYN|FIN).
 		 *
 		 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
 		 * as they do not affect the state of the TCP FSM.
 		 * The data pointed to by TH_URG and th_urp is ignored.
 		 */
 		KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
 		    ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
 		KASSERT(thflags & (TH_SYN),
 		    ("%s: Listen socket: TH_SYN not set", __func__));
 #ifdef INET6
 		/*
 		 * If deprecated address is forbidden,
 		 * we do not accept SYN to deprecated interface
 		 * address to prevent any new inbound connection from
 		 * getting established.
 		 * When we do not accept SYN, we send a TCP RST,
 		 * with deprecated source address (instead of dropping
 		 * it).  We compromise it as it is much better for peer
 		 * to send a RST, and RST will be the final packet
 		 * for the exchange.
 		 *
 		 * If we do not forbid deprecated addresses, we accept
 		 * the SYN packet.  RFC2462 does not suggest dropping
 		 * SYN in this case.
 		 * If we decipher RFC2462 5.5.4, it says like this:
 		 * 1. use of deprecated addr with existing
 		 *    communication is okay - "SHOULD continue to be
 		 *    used"
 		 * 2. use of it with new communication:
 		 *   (2a) "SHOULD NOT be used if alternate address
 		 *        with sufficient scope is available"
 		 *   (2b) nothing mentioned otherwise.
 		 * Here we fall into (2b) case as we have no choice in
 		 * our source address selection - we must obey the peer.
 		 *
 		 * The wording in RFC2462 is confusing, and there are
 		 * multiple description text for deprecated address
 		 * handling - worse, they are not exactly the same.
 		 * I believe 5.5.4 is the best one, so we follow 5.5.4.
 		 */
 		if (isipv6 && !V_ip6_use_deprecated) {
 			struct in6_ifaddr *ia6;
 
 			ia6 = ip6_getdstifaddr(m);
 			if (ia6 != NULL &&
 			    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
 				ifa_free(&ia6->ia_ifa);
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt to deprecated "
 					"IPv6 address rejected\n",
 					s, __func__);
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 			}
 			if (ia6)
 				ifa_free(&ia6->ia_ifa);
 		}
 #endif /* INET6 */
 		/*
 		 * Basic sanity checks on incoming SYN requests:
 		 *   Don't respond if the destination is a link layer
 		 *	broadcast according to RFC1122 4.2.3.10, p. 104.
 		 *   If it is from this socket it must be forged.
 		 *   Don't respond if the source or destination is a
 		 *	global or subnet broad- or multicast address.
 		 *   Note that it is quite possible to receive unicast
 		 *	link-layer packets with a broadcast IP address. Use
 		 *	in_broadcast() to find them.
 		 */
 		if (m->m_flags & (M_BCAST|M_MCAST)) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 			    log(LOG_DEBUG, "%s; %s: Listen socket: "
 				"Connection attempt from broad- or multicast "
 				"link layer address ignored\n", s, __func__);
 			goto dropunlock;
 		}
 #ifdef INET6
 		if (isipv6) {
 			if (th->th_dport == th->th_sport &&
 			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt to/from self "
 					"ignored\n", s, __func__);
 				goto dropunlock;
 			}
 			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to multicast "
 					"address ignored\n", s, __func__);
 				goto dropunlock;
 			}
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			if (th->th_dport == th->th_sport &&
 			    ip->ip_dst.s_addr == ip->ip_src.s_addr) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to self "
 					"ignored\n", s, __func__);
 				goto dropunlock;
 			}
 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 			    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 			    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
 			    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to broad- "
 					"or multicast address ignored\n",
 					s, __func__);
 				goto dropunlock;
 			}
 		}
 #endif
 		/*
 		 * SYN appears to be valid.  Create compressed TCP state
 		 * for syncache.
 		 */
 #ifdef TCPDEBUG
 		if (so->so_options & SO_DEBUG)
 			tcp_trace(TA_INPUT, ostate, tp,
 			    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 		tcp_dooptions(&to, optp, optlen, TO_SYN);
 		syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL);
 		/*
 		 * Entry added to syncache and mbuf consumed.
 		 * Everything already unlocked by syncache_add().
 		 */
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 		return;
 	} else if (tp->t_state == TCPS_LISTEN) {
 		/*
 		 * When a listen socket is torn down the SO_ACCEPTCONN
 		 * flag is removed first while connections are drained
 		 * from the accept queue in a unlock/lock cycle of the
 		 * ACCEPT_LOCK, opening a race condition allowing a SYN
 		 * attempt go through unhandled.
 		 */
 		goto dropunlock;
 	}
 
 #ifdef TCP_SIGNATURE
 	if (sig_checked == 0)  {
 		tcp_dooptions(&to, optp, optlen,
 		    (thflags & TH_SYN) ? TO_SYN : 0);
 		if (!tcp_signature_verify_input(m, off0, tlen, optlen, &to,
 		    th, tp->t_flags)) {
 
 			/*
 			 * In SYN_SENT state if it receives an RST, it is
 			 * allowed for further processing.
 			 */
 			if ((thflags & TH_RST) == 0 ||
 			    (tp->t_state == TCPS_SYN_SENT) == 0)
 				goto dropunlock;
 		}
 		sig_checked = 1;
 	}
 #endif
 
 	TCP_PROBE5(receive, NULL, tp, m->m_data, tp, th);
 
 	/*
 	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
 	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
 	 * the inpcb, and unlocks pcbinfo.
 	 */
 	tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	return;
 
 	/*
 	 * Exit path: release the pcbinfo lock if held, answer the segment
 	 * via tcp_dropwithreset() (which takes over the mbuf), unlock the
 	 * inpcb if there is one, then fall into the common cleanup.
 	 */
 dropwithreset:
 	TCP_PROBE5(receive, NULL, tp, m->m_data, tp, th);
 
 	if (ti_locked == TI_WLOCKED) {
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		ti_locked = TI_UNLOCKED;
 	}
 #ifdef INVARIANTS
 	else {
 		KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset "
 		    "ti_locked: %d", __func__, ti_locked));
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	}
 #endif
 
 	if (inp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(inp);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 	m = NULL;	/* mbuf chain got consumed. */
 	goto drop;
 
 	/*
 	 * Exit path: silently discard the segment, releasing the pcbinfo
 	 * lock (if held) and the inpcb lock (if an inpcb was found).
 	 */
 dropunlock:
 	if (m != NULL)
 		TCP_PROBE5(receive, NULL, tp, m->m_data, tp, th);
 
 	if (ti_locked == TI_WLOCKED) {
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		ti_locked = TI_UNLOCKED;
 	}
 #ifdef INVARIANTS
 	else {
 		KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock "
 		    "ti_locked: %d", __func__, ti_locked));
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	}
 #endif
 
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 
 	/* Common cleanup: free the log string and any unconsumed mbuf. */
 drop:
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	if (m != NULL)
 		m_freem(m);
 }
 
 static void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
     int ti_locked)
 {
 	int thflags, acked, ourfinisacked, needoutput = 0;
 	int rstreason, todrop, win;
 	u_long tiwin;
 	char *s;
 	struct in_conninfo *inc;
 	struct mbuf *mfree;
 	struct tcpopt to;
 
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 #endif
 	thflags = th->th_flags;
 	inc = &tp->t_inpcb->inp_inc;
 	tp->sackhint.last_sack_ack = 0;
 
 	/*
 	 * If this is either a state-changing packet or current state isn't
 	 * established, we require a write lock on tcbinfo.  Otherwise, we
 	 * allow the tcbinfo to be either locked or unlocked, as the
 	 * caller may have unnecessarily acquired a write lock due to a race.
 	 */
 	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
 	    tp->t_state != TCPS_ESTABLISHED) {
 		KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
 		    "SYN/FIN/RST/!EST", __func__, ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	} else {
 #ifdef INVARIANTS
 		if (ti_locked == TI_WLOCKED)
 			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 		else {
 			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
 			    "ti_locked: %d", __func__, ti_locked));
 			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 		}
 #endif
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
 	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
 	    __func__));
 
 	/*
 	 * Segment received on connection.
 	 * Reset idle time and keep-alive timer.
 	 * XXX: This should be done after segment
 	 * validation to ignore broken/spoofed segs.
 	 */
 	tp->t_rcvtime = ticks;
 	if (TCPS_HAVEESTABLISHED(tp->t_state))
 		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
 
 	/*
 	 * Unscale the window into a 32-bit value.
 	 * For the SYN_SENT state the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
 
 	/*
 	 * TCP ECN processing.
 	 */
 	if (tp->t_flags & TF_ECN_PERMIT) {
 		if (thflags & TH_CWR)
 			tp->t_flags &= ~TF_ECN_SND_ECE;
 		switch (iptos & IPTOS_ECN_MASK) {
 		case IPTOS_ECN_CE:
 			tp->t_flags |= TF_ECN_SND_ECE;
 			TCPSTAT_INC(tcps_ecn_ce);
 			break;
 		case IPTOS_ECN_ECT0:
 			TCPSTAT_INC(tcps_ecn_ect0);
 			break;
 		case IPTOS_ECN_ECT1:
 			TCPSTAT_INC(tcps_ecn_ect1);
 			break;
 		}
 		/* Congestion experienced. */
 		if (thflags & TH_ECE) {
 			cc_cong_signal(tp, th, CC_ECN);
 		}
 	}
 
 	/*
 	 * Parse options on any incoming segment.
 	 */
 	tcp_dooptions(&to, (u_char *)(th + 1),
 	    (th->th_off << 2) - sizeof(struct tcphdr),
 	    (thflags & TH_SYN) ? TO_SYN : 0);
 
 	/*
 	 * If echoed timestamp is later than the current time,
 	 * fall back to non RFC1323 RTT calculation.  Normalize
 	 * timestamp if syncookies were used when this connection
 	 * was established.
 	 */
 	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
 		to.to_tsecr -= tp->ts_offset;
 		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
 			to.to_tsecr = 0;
 	}
 	/*
 	 * If timestamps were negotiated during SYN/ACK they should
 	 * appear on every segment during this session and vice versa.
 	 */
 	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Timestamp missing, "
 			    "no action\n", s, __func__);
 			free(s, M_TCPLOG);
 		}
 	}
 	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
 			    "no action\n", s, __func__);
 			free(s, M_TCPLOG);
 		}
 	}
 
 	/*
 	 * Process options only when we get SYN/ACK back. The SYN case
 	 * for incoming connections is handled in tcp_syncache.
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 	 * or <SYN,ACK>) segment itself is never scaled.
 	 * XXX this is traditional behavior, may need to be cleaned up.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
 		if ((to.to_flags & TOF_SCALE) &&
 		    (tp->t_flags & TF_REQ_SCALE)) {
 			tp->t_flags |= TF_RCVD_SCALE;
 			tp->snd_scale = to.to_wscale;
 		}
 		/*
 		 * Initial send window.  It will be updated with
 		 * the next incoming segment to the scaled value.
 		 */
 		tp->snd_wnd = th->th_win;
 		if (to.to_flags & TOF_TS) {
 			tp->t_flags |= TF_RCVD_TSTMP;
 			tp->ts_recent = to.to_tsval;
 			tp->ts_recent_age = tcp_ts_getticks();
 		}
 		if (to.to_flags & TOF_MSS)
 			tcp_mss(tp, to.to_mss);
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (to.to_flags & TOF_SACKPERM) == 0)
 			tp->t_flags &= ~TF_SACK_PERMIT;
 	}
 
 	/*
 	 * Header prediction: check for the two common cases
 	 * of a uni-directional data xfer.  If the packet has
 	 * no control flags, is in-sequence, the window didn't
 	 * change and we're not retransmitting, it's a
 	 * candidate.  If the length is zero and the ack moved
 	 * forward, we're the sender side of the xfer.  Just
 	 * free the data acked & wake any higher level process
 	 * that was blocked waiting for space.  If the length
 	 * is non-zero and the ack didn't move, we're the
 	 * receiver side.  If we're getting packets in-order
 	 * (the reassembly queue is empty), add the data to
 	 * the socket buffer and note that we need a delayed ack.
 	 * Make sure that the hidden state-flags are also off.
 	 * Since we check for TCPS_ESTABLISHED first, it can only
 	 * be TF_NEEDSYN.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    th->th_seq == tp->rcv_nxt &&
 	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
 	    tp->snd_nxt == tp->snd_max &&
 	    tiwin && tiwin == tp->snd_wnd && 
 	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
 	    LIST_EMPTY(&tp->t_segq) &&
 	    ((to.to_flags & TOF_TS) == 0 ||
 	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
 
 		/*
 		 * If last ACK falls within this segment's sequence numbers,
 		 * record the timestamp.
 		 * NOTE that the test is modified according to the latest
 		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
 		 */
 		if ((to.to_flags & TOF_TS) != 0 &&
 		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 			tp->ts_recent_age = tcp_ts_getticks();
 			tp->ts_recent = to.to_tsval;
 		}
 
 		if (tlen == 0) {
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
 			    !IN_RECOVERY(tp->t_flags) &&
 			    (to.to_flags & TOF_SACK) == 0 &&
 			    TAILQ_EMPTY(&tp->snd_holes)) {
 				/*
 				 * This is a pure ack for outstanding data.
 				 */
 				if (ti_locked == TI_WLOCKED)
 					INP_INFO_WUNLOCK(&V_tcbinfo);
 				ti_locked = TI_UNLOCKED;
 
 				TCPSTAT_INC(tcps_predack);
 
 				/*
 				 * "bad retransmit" recovery.
 				 */
 				if (tp->t_rxtshift == 1 &&
 				    tp->t_flags & TF_PREVVALID &&
 				    (int)(ticks - tp->t_badrxtwin) < 0) {
 					cc_cong_signal(tp, th, CC_RTO_ERR);
 				}
 
 				/*
 				 * Recalculate the transmit timer / rtt.
 				 *
 				 * Some boxes send broken timestamp replies
 				 * during the SYN+ACK phase, ignore
 				 * timestamps of 0 or we could calculate a
 				 * huge RTT and blow up the retransmit timer.
 				 */
 				if ((to.to_flags & TOF_TS) != 0 &&
 				    to.to_tsecr) {
 					u_int t;
 
 					t = tcp_ts_getticks() - to.to_tsecr;
 					if (!tp->t_rttlow || tp->t_rttlow > t)
 						tp->t_rttlow = t;
 					tcp_xmit_timer(tp,
 					    TCP_TS_TO_TICKS(t) + 1);
 				} else if (tp->t_rtttime &&
 				    SEQ_GT(th->th_ack, tp->t_rtseq)) {
 					if (!tp->t_rttlow ||
 					    tp->t_rttlow > ticks - tp->t_rtttime)
 						tp->t_rttlow = ticks - tp->t_rtttime;
 					tcp_xmit_timer(tp,
 							ticks - tp->t_rtttime);
 				}
 				acked = BYTES_THIS_ACK(tp, th);
 
 				/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 				hhook_run_tcp_est_in(tp, th, &to);
 
 				TCPSTAT_INC(tcps_rcvackpack);
 				TCPSTAT_ADD(tcps_rcvackbyte, acked);
 				sbdrop(&so->so_snd, acked);
 				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
 				    SEQ_LEQ(th->th_ack, tp->snd_recover))
 					tp->snd_recover = th->th_ack - 1;
 				
 				/*
 				 * Let the congestion control algorithm update
 				 * congestion control related information. This
 				 * typically means increasing the congestion
 				 * window.
 				 */
 				cc_ack_received(tp, th, CC_ACK);
 
 				tp->snd_una = th->th_ack;
 				/*
 				 * Pull snd_wl2 up to prevent seq wrap relative
 				 * to th_ack.
 				 */
 				tp->snd_wl2 = th->th_ack;
 				tp->t_dupacks = 0;
 				m_freem(m);
 				ND6_HINT(tp); /* Some progress has been made. */
 
 				/*
 				 * If all outstanding data are acked, stop
 				 * retransmit timer, otherwise restart timer
 				 * using current (possibly backed-off) value.
 				 * If process is waiting for space,
 				 * wakeup/selwakeup/signal.  If data
 				 * are ready to send, let tcp_output
 				 * decide between more output or persist.
 				 */
 #ifdef TCPDEBUG
 				if (so->so_options & SO_DEBUG)
 					tcp_trace(TA_INPUT, ostate, tp,
 					    (void *)tcp_saveipgen,
 					    &tcp_savetcp, 0);
 #endif
 				if (tp->snd_una == tp->snd_max)
 					tcp_timer_activate(tp, TT_REXMT, 0);
 				else if (!tcp_timer_active(tp, TT_PERSIST))
 					tcp_timer_activate(tp, TT_REXMT,
 						      tp->t_rxtcur);
 				sowwakeup(so);
 				if (so->so_snd.sb_cc)
 					(void) tcp_output(tp);
 				goto check_delack;
 			}
 		} else if (th->th_ack == tp->snd_una &&
 		    tlen <= sbspace(&so->so_rcv)) {
 			int newsize = 0;	/* automatic sockbuf scaling */
 
 			/*
 			 * This is a pure, in-sequence data packet with
 			 * nothing on the reassembly queue and we have enough
 			 * buffer space to take it.
 			 */
 			if (ti_locked == TI_WLOCKED)
 				INP_INFO_WUNLOCK(&V_tcbinfo);
 			ti_locked = TI_UNLOCKED;
 
 			/* Clean receiver SACK report if present */
 			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
 				tcp_clean_sackreport(tp);
 			TCPSTAT_INC(tcps_preddat);
 			tp->rcv_nxt += tlen;
 			/*
 			 * Pull snd_wl1 up to prevent seq wrap relative to
 			 * th_seq.
 			 */
 			tp->snd_wl1 = th->th_seq;
 			/*
 			 * Pull rcv_up up to prevent seq wrap relative to
 			 * rcv_nxt.
 			 */
 			tp->rcv_up = tp->rcv_nxt;
 			TCPSTAT_INC(tcps_rcvpack);
 			TCPSTAT_ADD(tcps_rcvbyte, tlen);
 			ND6_HINT(tp);	/* Some progress has been made */
 #ifdef TCPDEBUG
 			if (so->so_options & SO_DEBUG)
 				tcp_trace(TA_INPUT, ostate, tp,
 				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 		/*
 		 * Automatic sizing of receive socket buffer.  Often the send
 		 * buffer size is not optimally adjusted to the actual network
 		 * conditions at hand (delay bandwidth product).  Setting the
 		 * buffer size too small limits throughput on links with high
 		 * bandwidth and high delay (eg. trans-continental/oceanic links).
 		 *
 		 * On the receive side the socket buffer memory is only rarely
 		 * used to any significant extent.  This allows us to be much
 		 * more aggressive in scaling the receive socket buffer.  For
 		 * the case that the buffer space is actually used to a large
 		 * extent and we run out of kernel memory we can simply drop
 		 * the new segments; TCP on the sender will just retransmit it
 		 * later.  Setting the buffer size too big may only consume too
 		 * much kernel memory if the application doesn't read() from
 		 * the socket or packet loss or reordering makes use of the
 		 * reassembly queue.
 		 *
 		 * The criteria to step up the receive buffer one notch are:
 		 *  1. the number of bytes received during the time it takes
 		 *     one timestamp to be reflected back to us (the RTT);
 		 *  2. received bytes per RTT is within seven eighth of the
 		 *     current socket buffer size;
 		 *  3. receive buffer size has not hit maximal automatic size;
 		 *
 		 * This algorithm does one step per RTT at most and only if
 		 * we receive a bulk stream w/o packet losses or reorderings.
 		 * Shrinking the buffer during idle times is not necessary as
 		 * it doesn't consume any memory when idle.
 		 *
 		 * TODO: Only step up if the application is actually serving
 		 * the buffer to better manage the socket buffer resources.
 		 */
 			if (V_tcp_do_autorcvbuf &&
 			    to.to_tsecr &&
 			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
 				if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
 				    to.to_tsecr - tp->rfbuf_ts < hz) {
 					if (tp->rfbuf_cnt >
 					    (so->so_rcv.sb_hiwat / 8 * 7) &&
 					    so->so_rcv.sb_hiwat <
 					    V_tcp_autorcvbuf_max) {
 						newsize =
 						    min(so->so_rcv.sb_hiwat +
 						    V_tcp_autorcvbuf_inc,
 						    V_tcp_autorcvbuf_max);
 					}
 					/* Start over with next RTT. */
 					tp->rfbuf_ts = 0;
 					tp->rfbuf_cnt = 0;
 				} else
 					tp->rfbuf_cnt += tlen;	/* add up */
 			}
 
 			/* Add data to socket buffer. */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				m_freem(m);
 			} else {
 				/*
 				 * Set new socket buffer size.
 				 * Give up when limit is reached.
 				 */
 				if (newsize)
 					if (!sbreserve_locked(&so->so_rcv,
 					    newsize, so, NULL))
 						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 				m_adj(m, drop_hdrlen);	/* delayed header drop */
 				sbappendstream_locked(&so->so_rcv, m);
 			}
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 			if (DELAY_ACK(tp, tlen)) {
 				tp->t_flags |= TF_DELACK;
 			} else {
 				tp->t_flags |= TF_ACKNOW;
 				tcp_output(tp);
 			}
 			goto check_delack;
 		}
 	}
 
 	/*
 	 * Calculate amount of space in receive window,
 	 * and then do TCP input processing.
 	 * Receive window is amount of space in rcv queue,
 	 * but not less than advertised window.
 	 */
 	win = sbspace(&so->so_rcv);
 	if (win < 0)
 		win = 0;
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
 	/* Reset receive buffer auto scaling when not in bulk receive mode. */
 	tp->rfbuf_ts = 0;
 	tp->rfbuf_cnt = 0;
 
 	switch (tp->t_state) {
 
 	/*
 	 * If the state is SYN_RECEIVED:
 	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
 	 */
 	case TCPS_SYN_RECEIVED:
 		if ((thflags & TH_ACK) &&
 		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 		     SEQ_GT(th->th_ack, tp->snd_max))) {
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 		}
 		break;
 
 	/*
 	 * If the state is SYN_SENT:
 	 *	if seg contains an ACK, but not for our SYN, drop the input.
 	 *	if seg contains a RST, then drop the connection.
 	 *	if seg does not contain SYN, then drop it.
 	 * Otherwise this is an acceptable SYN segment
 	 *	initialize tp->rcv_nxt and tp->irs
 	 *	if seg contains ack then advance tp->snd_una
 	 *	if seg contains an ECE and ECN support is enabled, the stream
 	 *	    is ECN capable.
 	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 	 *	arrange for segment to be acked (eventually)
 	 *	continue processing rest of data/controls, beginning with URG
 	 */
 	case TCPS_SYN_SENT:
 		if ((thflags & TH_ACK) &&
 		    (SEQ_LEQ(th->th_ack, tp->iss) ||
 		     SEQ_GT(th->th_ack, tp->snd_max))) {
 			rstreason = BANDLIM_UNLIMITED;
 			goto dropwithreset;
 		}
 		if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
 			TCP_PROBE5(connect_refused, NULL, tp, m->m_data, tp,
 			    th);
 			tp = tcp_drop(tp, ECONNREFUSED);
 		}
 		if (thflags & TH_RST)
 			goto drop;
 		if (!(thflags & TH_SYN))
 			goto drop;
 
 		tp->irs = th->th_seq;
 		tcp_rcvseqinit(tp);
 		if (thflags & TH_ACK) {
 			TCPSTAT_INC(tcps_connects);
 			soisconnected(so);
 #ifdef MAC
 			mac_socketpeer_set_from_mbuf(m, so);
 #endif
 			/* Do window scaling on this connection? */
 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 				tp->rcv_scale = tp->request_r_scale;
 			}
 			tp->rcv_adv += imin(tp->rcv_wnd,
 			    TCP_MAXWIN << tp->rcv_scale);
 			tp->snd_una++;		/* SYN is acked */
 			/*
 			 * If there's data, delay ACK; if there's also a FIN
 			 * ACKNOW will be turned on later.
 			 */
 			if (DELAY_ACK(tp, tlen) && tlen != 0)
 				tcp_timer_activate(tp, TT_DELACK,
 				    tcp_delacktime);
 			else
 				tp->t_flags |= TF_ACKNOW;
 
 			if ((thflags & TH_ECE) && V_tcp_do_ecn) {
 				tp->t_flags |= TF_ECN_PERMIT;
 				TCPSTAT_INC(tcps_ecn_shs);
 			}
 			
 			/*
 			 * Received <SYN,ACK> in SYN_SENT[*] state.
 			 * Transitions:
 			 *	SYN_SENT  --> ESTABLISHED
 			 *	SYN_SENT* --> FIN_WAIT_1
 			 */
 			tp->t_starttime = ticks;
 			if (tp->t_flags & TF_NEEDFIN) {
 				tcp_state_change(tp, TCPS_FIN_WAIT_1);
 				tp->t_flags &= ~TF_NEEDFIN;
 				thflags &= ~TH_SYN;
 			} else {
 				tcp_state_change(tp, TCPS_ESTABLISHED);
 				TCP_PROBE5(connect_established, NULL, tp,
 				    m->m_data, tp, th);
 				cc_conn_init(tp);
 				tcp_timer_activate(tp, TT_KEEP,
 				    TP_KEEPIDLE(tp));
 			}
 		} else {
 			/*
 			 * Received initial SYN in SYN-SENT[*] state =>
 			 * simultaneous open.  If segment contains CC option
 			 * and there is a cached CC, apply TAO test.
 			 * If it succeeds, connection is half-synchronized.
 			 * Otherwise, do 3-way handshake:
 			 *        SYN-SENT -> SYN-RECEIVED
 			 *        SYN-SENT* -> SYN-RECEIVED*
 			 * If there was no CC option, clear cached CC value.
 			 */
 			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			tcp_state_change(tp, TCPS_SYN_RECEIVED);
 		}
 
 		KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
 		    "ti_locked %d", __func__, ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		/*
 		 * Advance th->th_seq to correspond to first data byte.
 		 * If data, trim to stay within window,
 		 * dropping FIN if necessary.
 		 */
 		th->th_seq++;
 		if (tlen > tp->rcv_wnd) {
 			todrop = tlen - tp->rcv_wnd;
 			m_adj(m, -todrop);
 			tlen = tp->rcv_wnd;
 			thflags &= ~TH_FIN;
 			TCPSTAT_INC(tcps_rcvpackafterwin);
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 		}
 		tp->snd_wl1 = th->th_seq - 1;
 		tp->rcv_up = th->th_seq;
 		/*
 		 * Client side of transaction: already sent SYN and data.
 		 * If the remote host used T/TCP to validate the SYN,
 		 * our data will be ACK'd; if so, enter normal data segment
 		 * processing in the middle of step 5, ack processing.
 		 * Otherwise, goto step 6.
 		 */
 		if (thflags & TH_ACK)
 			goto process_ACK;
 
 		goto step6;
 
 	/*
 	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
 	 *      do normal processing.
 	 *
 	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
 	 */
 	case TCPS_LAST_ACK:
 	case TCPS_CLOSING:
 		break;  /* continue normal processing */
 	}
 
 	/*
 	 * States other than LISTEN or SYN_SENT.
 	 * First check the RST flag and sequence number since reset segments
 	 * are exempt from the timestamp and connection count tests.  This
 	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
 	 * below which allowed reset segments in half the sequence space
 	 * to fall through and be processed (which gives forged reset
 	 * segments with a random sequence number a 50 percent chance of
 	 * killing a connection).
 	 * Then check timestamp, if present.
 	 * Then check the connection count, if present.
 	 * Then check that at least some bytes of segment are within
 	 * receive window.  If segment begins before rcv_nxt,
 	 * drop leading data (and SYN); if nothing left, just ack.
 	 *
 	 *
 	 * If the RST bit is set, check the sequence number to see
 	 * if this is a valid reset segment.
 	 * RFC 793 page 37:
 	 *   In all states except SYN-SENT, all reset (RST) segments
 	 *   are validated by checking their SEQ-fields.  A reset is
 	 *   valid if its sequence number is in the window.
 	 * Note: this does not take into account delayed ACKs, so
 	 *   we should test against last_ack_sent instead of rcv_nxt.
 	 *   The sequence number in the reset segment is normally an
 	 *   echo of our outgoing acknowledgement numbers, but some hosts
 	 *   send a reset with the sequence number at the rightmost edge
 	 *   of our receive window, and we have to handle this case.
 	 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
 	 *   that brute force RST attacks are possible.  To combat this,
 	 *   we use a much stricter check while in the ESTABLISHED state,
 	 *   only accepting RSTs where the sequence number is equal to
 	 *   last_ack_sent.  In all other states (the states in which a
 	 *   RST is more likely), the more permissive check is used.
 	 * If we have multiple segments in flight, the initial reset
 	 * segment sequence numbers will be to the left of last_ack_sent,
 	 * but they will eventually catch up.
 	 * In any case, it never made sense to trim reset segments to
 	 * fit the receive window since RFC 1122 says:
 	 *   4.2.2.12  RST Segment: RFC-793 Section 3.4
 	 *
 	 *    A TCP SHOULD allow a received RST segment to include data.
 	 *
 	 *    DISCUSSION
 	 *         It has been suggested that a RST segment could contain
 	 *         ASCII text that encoded and explained the cause of the
 	 *         RST.  No standard has yet been established for such
 	 *         data.
 	 *
 	 * If the reset segment passes the sequence number test examine
 	 * the state:
 	 *    SYN_RECEIVED STATE:
 	 *	If passive open, return to LISTEN state.
 	 *	If active open, inform user that connection was refused.
 	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
 	 *	Inform user that connection was reset, and close tcb.
 	 *    CLOSING, LAST_ACK STATES:
 	 *	Close the tcb.
 	 *    TIME_WAIT STATE:
 	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
 	 *      RFC 1337.
 	 */
 	if (thflags & TH_RST) {
 		if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
 		    SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
 			switch (tp->t_state) {
 
 			case TCPS_SYN_RECEIVED:
 				so->so_error = ECONNREFUSED;
 				goto close;
 
 			case TCPS_ESTABLISHED:
 				if (V_tcp_insecure_rst == 0 &&
 				    !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) &&
 				    SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) &&
 				    !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
 				    SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) {
 					TCPSTAT_INC(tcps_badrst);
 					goto drop;
 				}
 				/* FALLTHROUGH */
 			case TCPS_FIN_WAIT_1:
 			case TCPS_FIN_WAIT_2:
 			case TCPS_CLOSE_WAIT:
 				so->so_error = ECONNRESET;
 			close:
 				KASSERT(ti_locked == TI_WLOCKED,
 				    ("tcp_do_segment: TH_RST 1 ti_locked %d",
 				    ti_locked));
 				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 				tcp_state_change(tp, TCPS_CLOSED);
 				TCPSTAT_INC(tcps_drops);
 				tp = tcp_close(tp);
 				break;
 
 			case TCPS_CLOSING:
 			case TCPS_LAST_ACK:
 				KASSERT(ti_locked == TI_WLOCKED,
 				    ("tcp_do_segment: TH_RST 2 ti_locked %d",
 				    ti_locked));
 				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 				tp = tcp_close(tp);
 				break;
 			}
 		}
 		goto drop;
 	}
 
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
 	 * and it's less than ts_recent, drop it.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
 
 		/* Check to see if ts_recent is over 24 days old.  */
 		if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
 			/*
 			 * Invalidate ts_recent.  If this segment updates
 			 * ts_recent, the age will be reset later and ts_recent
 			 * will get a valid value.  If it does not, setting
 			 * ts_recent to zero will at least satisfy the
 			 * requirement that zero be placed in the timestamp
 			 * echo reply when ts_recent isn't valid.  The
 			 * age isn't reset until we get a valid ts_recent
 			 * because we don't want out-of-order segments to be
 			 * dropped when ts_recent is old.
 			 */
 			tp->ts_recent = 0;
 		} else {
 			TCPSTAT_INC(tcps_rcvduppack);
 			TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
 			TCPSTAT_INC(tcps_pawsdrop);
 			if (tlen)
 				goto dropafterack;
 			goto drop;
 		}
 	}
 
 	/*
 	 * In the SYN-RECEIVED state, validate that the packet belongs to
 	 * this connection before trimming the data to fit the receive
 	 * window.  Check the sequence number versus IRS since we know
 	 * the sequence numbers haven't wrapped.  This is a partial fix
 	 * for the "LAND" DoS attack.
 	 */
 	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
 		rstreason = BANDLIM_RST_OPENPORT;
 		goto dropwithreset;
 	}
 
 	todrop = tp->rcv_nxt - th->th_seq;
 	if (todrop > 0) {
 		/*
 		 * If this is a duplicate SYN for our current connection,
 		 * advance over it and pretend it's not a SYN.
 		 */
 		if (thflags & TH_SYN && th->th_seq == tp->irs) {
 			thflags &= ~TH_SYN;
 			th->th_seq++;
 			if (th->th_urp > 1)
 				th->th_urp--;
 			else
 				thflags &= ~TH_URG;
 			todrop--;
 		}
 		/*
 		 * Following if statement from Stevens, vol. 2, p. 960.
 		 */
 		if (todrop > tlen
 		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
 			/*
 			 * Any valid FIN must be to the left of the window.
 			 * At this point the FIN must be a duplicate or out
 			 * of sequence; drop it.
 			 */
 			thflags &= ~TH_FIN;
 
 			/*
 			 * Send an ACK to resynchronize and drop any data.
 			 * But keep on processing for RST or ACK.
 			 */
 			tp->t_flags |= TF_ACKNOW;
 			todrop = tlen;
 			TCPSTAT_INC(tcps_rcvduppack);
 			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
 		} else {
 			TCPSTAT_INC(tcps_rcvpartduppack);
 			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
 		}
 		drop_hdrlen += todrop;	/* drop from the top afterwards */
 		th->th_seq += todrop;
 		tlen -= todrop;
 		if (th->th_urp > todrop)
 			th->th_urp -= todrop;
 		else {
 			thflags &= ~TH_URG;
 			th->th_urp = 0;
 		}
 	}
 
 	/*
 	 * If new data are received on a connection after the
 	 * user processes are gone, then RST the other end.
 	 */
 	if ((so->so_state & SS_NOFDREF) &&
 	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
 		KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && "
 		    "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
 			    "after socket was closed, "
 			    "sending RST and removing tcpcb\n",
 			    s, __func__, tcpstates[tp->t_state], tlen);
 			free(s, M_TCPLOG);
 		}
 		tp = tcp_close(tp);
 		TCPSTAT_INC(tcps_rcvafterclose);
 		rstreason = BANDLIM_UNLIMITED;
 		goto dropwithreset;
 	}
 
 	/*
 	 * If segment ends after window, drop trailing data
 	 * (and PUSH and FIN); if nothing left, just ACK.
 	 */
 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
 	if (todrop > 0) {
 		TCPSTAT_INC(tcps_rcvpackafterwin);
 		if (todrop >= tlen) {
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
 			/*
 			 * If window is closed can only take segments at
 			 * window edge, and have to drop data and PUSH from
 			 * incoming segments.  Continue processing, but
 			 * remember to ack.  Otherwise, drop segment
 			 * and ack.
 			 */
 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 				tp->t_flags |= TF_ACKNOW;
 				TCPSTAT_INC(tcps_rcvwinprobe);
 			} else
 				goto dropafterack;
 		} else
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 		m_adj(m, -todrop);
 		tlen -= todrop;
 		thflags &= ~(TH_PUSH|TH_FIN);
 	}
 
 	/*
 	 * If last ACK falls within this segment's sequence numbers,
 	 * record its timestamp.
 	 * NOTE: 
 	 * 1) That the test incorporates suggestions from the latest
 	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 * 2) That updating only on newer timestamps interferes with
 	 *    our earlier PAWS tests, so this check should be solely
 	 *    predicated on the sequence space of this segment.
 	 * 3) That we modify the segment boundary check to be 
 	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len  
 	 *    instead of RFC1323's
 	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
 	 *    This modified check allows us to overcome RFC1323's
 	 *    limitations as described in Stevens TCP/IP Illustrated
 	 *    Vol. 2 p.869. In such cases, we can still calculate the
 	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 		((thflags & (TH_SYN|TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to.to_tsval;
 	}
 
 	/*
 	 * If a SYN is in the window, then this is an
 	 * error and we send an RST and drop the connection.
 	 */
 	if (thflags & TH_SYN) {
 		KASSERT(ti_locked == TI_WLOCKED,
 		    ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 		tp = tcp_drop(tp, ECONNRESET);
 		rstreason = BANDLIM_UNLIMITED;
 		goto drop;
 	}
 
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
 	 * flag is on (half-synchronized state), then queue data for
 	 * later processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_state == TCPS_SYN_RECEIVED ||
 		    (tp->t_flags & TF_NEEDSYN))
 			goto step6;
 		else if (tp->t_flags & TF_ACKNOW)
 			goto dropafterack;
 		else
 			goto drop;
 	}
 
 	/*
 	 * Ack processing.
 	 */
 	switch (tp->t_state) {
 
 	/*
 	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
 	 * ESTABLISHED state and continue processing.
 	 * The ACK was checked above.
 	 */
 	case TCPS_SYN_RECEIVED:
 
 		TCPSTAT_INC(tcps_connects);
 		soisconnected(so);
 		/* Do window scaling? */
 		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 			tp->snd_wnd = tiwin;
 		}
 		/*
 		 * Make transitions:
 		 *      SYN-RECEIVED  -> ESTABLISHED
 		 *      SYN-RECEIVED* -> FIN-WAIT-1
 		 */
 		tp->t_starttime = ticks;
 		if (tp->t_flags & TF_NEEDFIN) {
 			tcp_state_change(tp, TCPS_FIN_WAIT_1);
 			tp->t_flags &= ~TF_NEEDFIN;
 		} else {
 			tcp_state_change(tp, TCPS_ESTABLISHED);
 			TCP_PROBE5(accept_established, NULL, tp, m->m_data, tp,
 			    th);
 			cc_conn_init(tp);
 			tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
 		}
 		/*
 		 * If segment contains data or ACK, will call tcp_reass()
 		 * later; if not, do so now to pass queued data to user.
 		 */
 		if (tlen == 0 && (thflags & TH_FIN) == 0)
 			(void) tcp_reass(tp, (struct tcphdr *)0, 0,
 			    (struct mbuf *)0);
 		tp->snd_wl1 = th->th_seq - 1;
 		/* FALLTHROUGH */
 
 	/*
 	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
 	 * ACKs.  If the ack is in the range
 	 *	tp->snd_una < th->th_ack <= tp->snd_max
 	 * then advance tp->snd_una to th->th_ack and drop
 	 * data from the retransmission queue.  If this ACK reflects
 	 * more up to date window information we update our window information.
 	 */
 	case TCPS_ESTABLISHED:
 	case TCPS_FIN_WAIT_1:
 	case TCPS_FIN_WAIT_2:
 	case TCPS_CLOSE_WAIT:
 	case TCPS_CLOSING:
 	case TCPS_LAST_ACK:
 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
 			TCPSTAT_INC(tcps_rcvacktoomuch);
 			goto dropafterack;
 		}
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    ((to.to_flags & TOF_SACK) ||
 		     !TAILQ_EMPTY(&tp->snd_holes)))
 			tcp_sack_doack(tp, &to, th->th_ack);
 
 		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 		hhook_run_tcp_est_in(tp, th, &to);
 
 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 			if (tlen == 0 && tiwin == tp->snd_wnd) {
 				TCPSTAT_INC(tcps_rcvdupack);
 				/*
 				 * If we have outstanding data (other than
 				 * a window probe), this is a completely
 				 * duplicate ack (ie, window info didn't
 				 * change), the ack is the biggest we've
 				 * seen and we've seen exactly our rexmt
 				 * threshold of them, assume a packet
 				 * has been dropped and retransmit it.
 				 * Kludge snd_nxt & the congestion
 				 * window so we send only this one
 				 * packet.
 				 *
 				 * We know we're losing at the current
 				 * window size so do congestion avoidance
 				 * (set ssthresh to half the current window
 				 * and pull our congestion window back to
 				 * the new ssthresh).
 				 *
 				 * Dup acks mean that packets have left the
 				 * network (they're now cached at the receiver)
 				 * so bump cwnd by the amount in the receiver
 				 * to keep a constant cwnd packets in the
 				 * network.
 				 *
 				 * When using TCP ECN, notify the peer that
 				 * we reduced the cwnd.
 				 */
 				if (!tcp_timer_active(tp, TT_REXMT) ||
 				    th->th_ack != tp->snd_una)
 					tp->t_dupacks = 0;
 				else if (++tp->t_dupacks > tcprexmtthresh ||
 				     IN_FASTRECOVERY(tp->t_flags)) {
 					cc_ack_received(tp, th, CC_DUPACK);
 					if ((tp->t_flags & TF_SACK_PERMIT) &&
 					    IN_FASTRECOVERY(tp->t_flags)) {
 						int awnd;
 						
 						/*
 						 * Compute the amount of data in flight first.
 						 * We can inject new data into the pipe iff 
 						 * we have less than 1/2 the original window's
 						 * worth of data in flight.
 						 */
 						awnd = (tp->snd_nxt - tp->snd_fack) +
 							tp->sackhint.sack_bytes_rexmit;
 						if (awnd < tp->snd_ssthresh) {
 							tp->snd_cwnd += tp->t_maxseg;
 							if (tp->snd_cwnd > tp->snd_ssthresh)
 								tp->snd_cwnd = tp->snd_ssthresh;
 						}
 					} else
 						tp->snd_cwnd += tp->t_maxseg;
 					if ((thflags & TH_FIN) &&
 					    (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
 						/* 
 						 * If its a fin we need to process
 						 * it to avoid a race where both
 						 * sides enter FIN-WAIT and send FIN|ACK
 						 * at the same time.
 						 */
 						break;
 					}
 					(void) tcp_output(tp);
 					goto drop;
 				} else if (tp->t_dupacks == tcprexmtthresh) {
 					tcp_seq onxt = tp->snd_nxt;
 
 					/*
 					 * If we're doing sack, check to
 					 * see if we're already in sack
 					 * recovery. If we're not doing sack,
 					 * check to see if we're in newreno
 					 * recovery.
 					 */
 					if (tp->t_flags & TF_SACK_PERMIT) {
 						if (IN_FASTRECOVERY(tp->t_flags)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					} else {
 						if (SEQ_LEQ(th->th_ack,
 						    tp->snd_recover)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					}
 					/* Congestion signal before ack. */
 					cc_cong_signal(tp, th, CC_NDUPACK);
 					cc_ack_received(tp, th, CC_DUPACK);
 					tcp_timer_activate(tp, TT_REXMT, 0);
 					tp->t_rtttime = 0;
 					if (tp->t_flags & TF_SACK_PERMIT) {
 						TCPSTAT_INC(
 						    tcps_sack_recovery_episode);
 						tp->sack_newdata = tp->snd_nxt;
 						tp->snd_cwnd = tp->t_maxseg;
 						(void) tcp_output(tp);
 						goto drop;
 					}
 					tp->snd_nxt = th->th_ack;
 					tp->snd_cwnd = tp->t_maxseg;
 					if ((thflags & TH_FIN) &&
 					    (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
 						/* 
 						 * If its a fin we need to process
 						 * it to avoid a race where both
 						 * sides enter FIN-WAIT and send FIN|ACK
 						 * at the same time.
 						 */
 						break;
 					}
 					(void) tcp_output(tp);
 					KASSERT(tp->snd_limited <= 2,
 					    ("%s: tp->snd_limited too big",
 					    __func__));
 					tp->snd_cwnd = tp->snd_ssthresh +
 					     tp->t_maxseg *
 					     (tp->t_dupacks - tp->snd_limited);
 					if (SEQ_GT(onxt, tp->snd_nxt))
 						tp->snd_nxt = onxt;
 					goto drop;
 				} else if (V_tcp_do_rfc3042) {
 					cc_ack_received(tp, th, CC_DUPACK);
 					u_long oldcwnd = tp->snd_cwnd;
 					tcp_seq oldsndmax = tp->snd_max;
 					u_int sent;
 					int avail;
 
 					KASSERT(tp->t_dupacks == 1 ||
 					    tp->t_dupacks == 2,
 					    ("%s: dupacks not 1 or 2",
 					    __func__));
 					if (tp->t_dupacks == 1)
 						tp->snd_limited = 0;
 					tp->snd_cwnd =
 					    (tp->snd_nxt - tp->snd_una) +
 					    (tp->t_dupacks - tp->snd_limited) *
 					    tp->t_maxseg;
 					if ((thflags & TH_FIN) &&
 					    (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
 						/* 
 						 * If its a fin we need to process
 						 * it to avoid a race where both
 						 * sides enter FIN-WAIT and send FIN|ACK
 						 * at the same time.
 						 */
 						break;
 					}
 					/*
 					 * Only call tcp_output when there
 					 * is new data available to be sent.
 					 * Otherwise we would send pure ACKs.
 					 */
 					SOCKBUF_LOCK(&so->so_snd);
 					avail = so->so_snd.sb_cc -
 					    (tp->snd_nxt - tp->snd_una);
 					SOCKBUF_UNLOCK(&so->so_snd);
 					if (avail > 0)
 						(void) tcp_output(tp);
 					sent = tp->snd_max - oldsndmax;
 					if (sent > tp->t_maxseg) {
 						KASSERT((tp->t_dupacks == 2 &&
 						    tp->snd_limited == 0) ||
 						   (sent == tp->t_maxseg + 1 &&
 						    tp->t_flags & TF_SENTFIN),
 						    ("%s: sent too much",
 						    __func__));
 						tp->snd_limited = 2;
 					} else if (sent > 0)
 						++tp->snd_limited;
 					tp->snd_cwnd = oldcwnd;
 					goto drop;
 				}
 			} else
 				tp->t_dupacks = 0;
 			break;
 		}
 
 		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
 		    ("%s: th_ack <= snd_una", __func__));
 
 		/*
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
 		if (IN_FASTRECOVERY(tp->t_flags)) {
 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 				if (tp->t_flags & TF_SACK_PERMIT)
 					tcp_sack_partialack(tp, th);
 				else
 					tcp_newreno_partial_ack(tp, th);
 			} else
 				cc_post_recovery(tp, th);
 		}
 		tp->t_dupacks = 0;
 		/*
 		 * If we reach this point, ACK is not a duplicate,
 		 *     i.e., it ACKs something we sent.
 		 */
 		if (tp->t_flags & TF_NEEDSYN) {
 			/*
 			 * T/TCP: Connection was half-synchronized, and our
 			 * SYN has been ACK'd (so connection is now fully
 			 * synchronized).  Go to non-starred state,
 			 * increment snd_una for ACK of SYN, and check if
 			 * we can do window scaling.
 			 */
 			tp->t_flags &= ~TF_NEEDSYN;
 			tp->snd_una++;
 			/* Do window scaling? */
 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 				tp->rcv_scale = tp->request_r_scale;
 				/* Send window already scaled. */
 			}
 		}
 
 process_ACK:
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		acked = BYTES_THIS_ACK(tp, th);
 		TCPSTAT_INC(tcps_rcvackpack);
 		TCPSTAT_ADD(tcps_rcvackbyte, acked);
 
 		/*
 		 * If we just performed our first retransmit, and the ACK
 		 * arrives within our recovery window, then it was a mistake
 		 * to do the retransmit in the first place.  Recover our
 		 * original cwnd and ssthresh, and proceed to transmit where
 		 * we left off.
 		 */
 		if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
 		    (int)(ticks - tp->t_badrxtwin) < 0)
 			cc_cong_signal(tp, th, CC_RTO_ERR);
 
 		/*
 		 * If we have a timestamp reply, update smoothed
 		 * round trip time.  If no timestamp is present but
 		 * transmit timer is running and timed sequence
 		 * number was acked, update smoothed round trip time.
 		 * Since we now have an rtt measurement, cancel the
 		 * timer backoff (cf., Phil Karn's retransmit alg.).
 		 * Recompute the initial retransmit timer.
 		 *
 		 * Some boxes send broken timestamp replies
 		 * during the SYN+ACK phase, ignore
 		 * timestamps of 0 or we could calculate a
 		 * huge RTT and blow up the retransmit timer.
 		 */
 		if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) {
 			u_int t;
 
 			t = tcp_ts_getticks() - to.to_tsecr;
 			if (!tp->t_rttlow || tp->t_rttlow > t)
 				tp->t_rttlow = t;
 			tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
 		} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
 			if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
 				tp->t_rttlow = ticks - tp->t_rtttime;
 			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
 		}
 
 		/*
 		 * If all outstanding data is acked, stop retransmit
 		 * timer and remember to restart (more output or persist).
 		 * If there is more data to be acked, restart retransmit
 		 * timer, using current (possibly backed-off) value.
 		 */
 		if (th->th_ack == tp->snd_max) {
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			needoutput = 1;
 		} else if (!tcp_timer_active(tp, TT_PERSIST))
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 
 		/*
 		 * If no data (only SYN) was ACK'd,
 		 *    skip rest of ACK processing.
 		 */
 		if (acked == 0)
 			goto step6;
 
 		/*
 		 * Let the congestion control algorithm update congestion
 		 * control related information. This typically means increasing
 		 * the congestion window.
 		 */
 		cc_ack_received(tp, th, CC_ACK);
 
 		SOCKBUF_LOCK(&so->so_snd);
 		if (acked > so->so_snd.sb_cc) {
 			tp->snd_wnd -= so->so_snd.sb_cc;
 			mfree = sbcut_locked(&so->so_snd,
 			    (int)so->so_snd.sb_cc);
 			ourfinisacked = 1;
 		} else {
 			mfree = sbcut_locked(&so->so_snd, acked);
 			tp->snd_wnd -= acked;
 			ourfinisacked = 0;
 		}
 		/* NB: sowwakeup_locked() does an implicit unlock. */
 		sowwakeup_locked(so);
 		m_freem(mfree);
 		/* Detect una wraparound. */
 		if (!IN_RECOVERY(tp->t_flags) &&
 		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
 		    SEQ_LEQ(th->th_ack, tp->snd_recover))
 			tp->snd_recover = th->th_ack - 1;
 		/* XXXLAS: Can this be moved up into cc_post_recovery? */
 		if (IN_RECOVERY(tp->t_flags) &&
 		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
 			EXIT_RECOVERY(tp->t_flags);
 		}
 		tp->snd_una = th->th_ack;
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (SEQ_GT(tp->snd_una, tp->snd_recover))
 				tp->snd_recover = tp->snd_una;
 		}
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
 
 		switch (tp->t_state) {
 
 		/*
 		 * In FIN_WAIT_1 STATE in addition to the processing
 		 * for the ESTABLISHED state if our FIN is now acknowledged
 		 * then enter FIN_WAIT_2.
 		 */
 		case TCPS_FIN_WAIT_1:
 			if (ourfinisacked) {
 				/*
 				 * If we can't receive any more
 				 * data, then closing user can proceed.
 				 * Starting the timer is contrary to the
 				 * specification, but if we don't get a FIN
 				 * we'll hang forever.
 				 *
 				 * XXXjl:
 				 * we should release the tp also, and use a
 				 * compressed state.
 				 */
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 					    (tcp_fast_finwait2_recycle ?
 					    tcp_finwait2_timeout :
 					    TP_MAXIDLE(tp)));
 				}
 				tcp_state_change(tp, TCPS_FIN_WAIT_2);
 			}
 			break;
 
 		/*
 		 * In CLOSING STATE in addition to the processing for
 		 * the ESTABLISHED state if the ACK acknowledges our FIN
 		 * then enter the TIME-WAIT state, otherwise ignore
 		 * the segment.
 		 */
 		case TCPS_CLOSING:
 			if (ourfinisacked) {
 				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 				tcp_twstart(tp);
 				INP_INFO_WUNLOCK(&V_tcbinfo);
 				m_freem(m);
 				return;
 			}
 			break;
 
 		/*
 		 * In LAST_ACK, we may still be waiting for data to drain
 		 * and/or to be acked, as well as for the ack of our FIN.
 		 * If our FIN is now acknowledged, delete the TCB,
 		 * enter the closed state and return.
 		 */
 		case TCPS_LAST_ACK:
 			if (ourfinisacked) {
 				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 				tp = tcp_close(tp);
 				goto drop;
 			}
 			break;
 		}
 	}
 
 step6:
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Update window information.
 	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
 	 */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 		/* keep track of pure window updates */
 		if (tlen == 0 &&
 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 			TCPSTAT_INC(tcps_rcvwinupd);
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
 		tp->snd_wl2 = th->th_ack;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 		needoutput = 1;
 	}
 
 	/*
 	 * Process segments with URG.
 	 */
 	if ((thflags & TH_URG) && th->th_urp &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		/*
 		 * This is a kludge, but if we receive and accept
 		 * random urgent pointers, we'll crash in
 		 * soreceive.  It's hard to imagine someone
 		 * actually wanting to send this much urgent data.
 		 */
 		SOCKBUF_LOCK(&so->so_rcv);
 		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
 			th->th_urp = 0;			/* XXX */
 			thflags &= ~TH_URG;		/* XXX */
 			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
 			goto dodata;			/* XXX */
 		}
 		/*
 		 * If this segment advances the known urgent pointer,
 		 * then mark the data stream.  This should not happen
 		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
 		 * a FIN has been received from the remote side.
 		 * In these states we ignore the URG.
 		 *
 		 * According to RFC961 (Assigned Protocols),
 		 * the urgent pointer points to the last octet
 		 * of urgent data.  We continue, however,
 		 * to consider it to indicate the first octet
 		 * of data past the urgent section as the original
 		 * spec states (in one of two places).
 		 */
 		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 			tp->rcv_up = th->th_seq + th->th_urp;
 			so->so_oobmark = so->so_rcv.sb_cc +
 			    (tp->rcv_up - tp->rcv_nxt) - 1;
 			if (so->so_oobmark == 0)
 				so->so_rcv.sb_state |= SBS_RCVATMARK;
 			sohasoutofband(so);
 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 		}
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		/*
 		 * Remove out of band data so doesn't get presented to user.
 		 * This can happen independent of advancing the URG pointer,
 		 * but if two URG's are pending at once, some out-of-band
 		 * data may creep in... ick.
 		 */
 		if (th->th_urp <= (u_long)tlen &&
 		    !(so->so_options & SO_OOBINLINE)) {
 			/* hdr drop is delayed */
 			tcp_pulloutofband(so, th, m, drop_hdrlen);
 		}
 	} else {
 		/*
 		 * If no out of band data is expected,
 		 * pull receive urgent pointer along
 		 * with the receive window.
 		 */
 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 			tp->rcv_up = tp->rcv_nxt;
 	}
 dodata:							/* XXX */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Process the segment text, merging it into the TCP sequencing queue,
 	 * and arranging for acknowledgment of receipt if necessary.
 	 * This process logically involves adjusting tp->rcv_wnd as data
 	 * is presented to the user (this happens in tcp_usrreq.c,
 	 * case PRU_RCVD).  If a FIN has already been received on this
 	 * connection then we just ignore the text.
 	 */
 	if ((tlen || (thflags & TH_FIN)) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		tcp_seq save_start = th->th_seq;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly queue
 		 * with control block tp.  Set thflags to whether reassembly now
 		 * includes a segment with FIN.  This handles the common case
 		 * inline (segment is the next to be received on an established
 		 * connection, and the queue is empty), avoiding linkage into
 		 * and removal from the queue and repetition of various
 		 * conversions.
 		 * Set DELACK for segments received in order, but ack
 		 * immediately when segments are out of order (so
 		 * fast retransmit can work).
 		 */
 		if (th->th_seq == tp->rcv_nxt &&
 		    LIST_EMPTY(&tp->t_segq) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			if (DELAY_ACK(tp, tlen))
 				tp->t_flags |= TF_DELACK;
 			else
 				tp->t_flags |= TF_ACKNOW;
 			tp->rcv_nxt += tlen;
 			thflags = th->th_flags & TH_FIN;
 			TCPSTAT_INC(tcps_rcvpack);
 			TCPSTAT_ADD(tcps_rcvbyte, tlen);
 			ND6_HINT(tp);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				m_freem(m);
 			else
 				sbappendstream_locked(&so->so_rcv, m);
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 		} else {
 			/*
 			 * XXX: Due to the header drop above "th" is
 			 * theoretically invalid by now.  Fortunately
 			 * m_adj() doesn't actually free any mbufs
 			 * when trimming from the head.
 			 */
 			thflags = tcp_reass(tp, th, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
 		if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
 			tcp_update_sack_list(tp, save_start, save_start + tlen);
 #if 0
 		/*
 		 * Note the amount of data that peer has sent into
 		 * our window, in order to estimate the sender's
 		 * buffer size.
 		 * XXX: Unused.
 		 */
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
 			len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
 		else
 			len = so->so_rcv.sb_hiwat;
 #endif
 	} else {
 		m_freem(m);
 		thflags &= ~TH_FIN;
 	}
 
 	/*
 	 * If FIN is received ACK the FIN and let the user know
 	 * that the connection is closing.
 	 */
 	if (thflags & TH_FIN) {
 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 			socantrcvmore(so);
 			/*
 			 * If connection is half-synchronized
 			 * (ie NEEDSYN flag on) then delay ACK,
 			 * so it may be piggybacked when SYN is sent.
 			 * Otherwise, since we received a FIN then no
 			 * more input can be expected, send ACK now.
 			 */
 			if (tp->t_flags & TF_NEEDSYN)
 				tp->t_flags |= TF_DELACK;
 			else
 				tp->t_flags |= TF_ACKNOW;
 			tp->rcv_nxt++;
 		}
 		switch (tp->t_state) {
 
 		/*
 		 * In SYN_RECEIVED and ESTABLISHED STATES
 		 * enter the CLOSE_WAIT state.
 		 */
 		case TCPS_SYN_RECEIVED:
 			tp->t_starttime = ticks;
 			/* FALLTHROUGH */
 		case TCPS_ESTABLISHED:
 			tcp_state_change(tp, TCPS_CLOSE_WAIT);
 			break;
 
 		/*
 		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
 		 * enter the CLOSING state.
 		 */
 		case TCPS_FIN_WAIT_1:
 			tcp_state_change(tp, TCPS_CLOSING);
 			break;
 
 		/*
 		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
 		 * starting the time-wait timer, turning off the other
 		 * standard timers.
 		 */
 		case TCPS_FIN_WAIT_2:
 			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 			KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
 			    "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
 			    ti_locked));
 
 			tcp_twstart(tp);
 			INP_INFO_WUNLOCK(&V_tcbinfo);
 			return;
 		}
 	}
 	if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	ti_locked = TI_UNLOCKED;
 
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 
 	/*
 	 * Return any desired output.
 	 */
 	if (needoutput || (tp->t_flags & TF_ACKNOW))
 		(void) tcp_output(tp);
 
 check_delack:
 	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
 	    __func__, ti_locked));
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (tp->t_flags & TF_DELACK) {
 		tp->t_flags &= ~TF_DELACK;
 		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
 	}
 	INP_WUNLOCK(tp->t_inpcb);
 	return;
 
 dropafterack:
 	/*
 	 * Generate an ACK dropping incoming segment if it occupies
 	 * sequence space, where the ACK reflects our state.
 	 *
 	 * We can now skip the test for the RST flag since all
 	 * paths to this code happen after packets containing
 	 * RST have been dropped.
 	 *
 	 * In the SYN-RECEIVED state, don't send an ACK unless the
 	 * segment we received passes the SYN-RECEIVED ACK test.
 	 * If it fails send a RST.  This breaks the loop in the
 	 * "LAND" DoS attack, and also prevents an ACK storm
 	 * between two listening ports that have been sent forged
 	 * SYN segments, each with the source address of the other.
 	 */
 	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
 	    (SEQ_GT(tp->snd_una, th->th_ack) ||
 	     SEQ_GT(th->th_ack, tp->snd_max)) ) {
 		rstreason = BANDLIM_RST_OPENPORT;
 		goto dropwithreset;
 	}
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 	if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	ti_locked = TI_UNLOCKED;
 
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 	INP_WUNLOCK(tp->t_inpcb);
 	m_freem(m);
 	return;
 
 dropwithreset:
 	if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	ti_locked = TI_UNLOCKED;
 
 	if (tp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(tp->t_inpcb);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 	return;
 
 drop:
 	if (ti_locked == TI_WLOCKED) {
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		ti_locked = TI_UNLOCKED;
 	}
 #ifdef INVARIANTS
 	else
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 #endif
 
 	/*
 	 * Drop space held by incoming segment and return.
 	 */
 #ifdef TCPDEBUG
 	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 	if (tp != NULL)
 		INP_WUNLOCK(tp->t_inpcb);
 	m_freem(m);
 }
 
 /*
  * Issue RST and make ACK acceptable to originator of segment.
  * The mbuf must still include the original packet header.
  * tp may be NULL.
  */
 static void
 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
     int tlen, int rstreason)
 {
 #ifdef INET
 	struct ip *ip;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 
 	if (tp != NULL) {
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 	}
 
 	/* Don't bother if destination was broadcast/multicast. */
 	if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
 		goto drop;
 #ifdef INET6
 	if (mtod(m, struct ip *)->ip_v == 6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
 			goto drop;
 		/* IPv6 anycast check is done at tcp6_input() */
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		ip = mtod(m, struct ip *);
 		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
 		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 			goto drop;
 	}
 #endif
 
 	/* Perform bandwidth limiting. */
 	if (badport_bandlim(rstreason) < 0)
 		goto drop;
 
 	/* tcp_respond consumes the mbuf chain. */
 	if (th->th_flags & TH_ACK) {
 		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
 		    th->th_ack, TH_RST);
 	} else {
 		if (th->th_flags & TH_SYN)
 			tlen++;
 		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
 		    (tcp_seq)0, TH_RST|TH_ACK);
 	}
 	return;
 drop:
 	m_freem(m);
 }
 
 /*
  * Walk the cnt bytes of TCP options at cp and record the recognized
  * ones in *to.
  *
  * to->to_flags is cleared first and then gains one TOF_* bit per
  * accepted option.  Parsing ends at TCPOPT_EOL, when fewer than two
  * bytes remain for an option that needs a length byte, or when a length
  * byte is below 2 or runs past the remaining bytes.  An option whose
  * length does not match its kind, or which is not allowed for this
  * kind of segment, is skipped.
  *
  * flags: with TO_SYN set, MSS, window scale and SACK-permitted are
  * accepted and SACK blocks are ignored; with TO_SYN clear it is the
  * other way around.
  *
  * to_signature and to_sacks are set to point into the caller's option
  * buffer; the option data itself is not copied.
  */
 static void
 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
 {
 	int kind, len;
 
 	to->to_flags = 0;
 	for (; cnt > 0; cnt -= len, cp += len) {
 		kind = cp[0];
 		if (kind == TCPOPT_EOL)
 			break;
 		if (kind == TCPOPT_NOP) {
 			len = 1;
 		} else {
 			/* Every other option carries a length byte. */
 			if (cnt < 2)
 				break;
 			len = cp[1];
 			if (len < 2 || len > cnt)
 				break;
 		}
 
 		/*
 		 * Nothing follows the switch, so leaving a case with
 		 * "break" simply advances to the next option.
 		 */
 		switch (kind) {
 		case TCPOPT_MAXSEG:
 			if (len == TCPOLEN_MAXSEG && (flags & TO_SYN)) {
 				to->to_flags |= TOF_MSS;
 				bcopy((char *)cp + 2,
 				    (char *)&to->to_mss, sizeof(to->to_mss));
 				to->to_mss = ntohs(to->to_mss);
 			}
 			break;
 		case TCPOPT_WINDOW:
 			if (len == TCPOLEN_WINDOW && (flags & TO_SYN)) {
 				to->to_flags |= TOF_SCALE;
 				/* Clamp the shift to the supported maximum. */
 				to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
 			}
 			break;
 		case TCPOPT_TIMESTAMP:
 			if (len == TCPOLEN_TIMESTAMP) {
 				to->to_flags |= TOF_TS;
 				bcopy((char *)cp + 2,
 				    (char *)&to->to_tsval,
 				    sizeof(to->to_tsval));
 				to->to_tsval = ntohl(to->to_tsval);
 				bcopy((char *)cp + 6,
 				    (char *)&to->to_tsecr,
 				    sizeof(to->to_tsecr));
 				to->to_tsecr = ntohl(to->to_tsecr);
 			}
 			break;
 #ifdef TCP_SIGNATURE
 		/*
 		 * XXX To answer a host that set the TCP_SIGNATURE option
 		 * in its initial SYN, the fact that the option was seen
 		 * has to be recorded here so the syncache code can build
 		 * the correct response.
 		 */
 		case TCPOPT_SIGNATURE:
 			if (len == TCPOLEN_SIGNATURE) {
 				to->to_flags |= TOF_SIGNATURE;
 				to->to_signature = cp + 2;
 			}
 			break;
 #endif
 		case TCPOPT_SACK_PERMITTED:
 			if (len == TCPOLEN_SACK_PERMITTED &&
 			    (flags & TO_SYN) && V_tcp_do_sack)
 				to->to_flags |= TOF_SACKPERM;
 			break;
 		case TCPOPT_SACK:
 			/* Need a whole number of blocks, at least one. */
 			if (len > 2 && (len - 2) % TCPOLEN_SACK == 0 &&
 			    !(flags & TO_SYN)) {
 				to->to_flags |= TOF_SACK;
 				to->to_nsacks = (len - 2) / TCPOLEN_SACK;
 				to->to_sacks = cp + 2;
 				TCPSTAT_INC(tcps_sack_rcv_blocks);
 			}
 			break;
 		default:
 			/* Unrecognized option: step over it. */
 			break;
 		}
 	}
 }
 
 /*
  * Remove the out-of-band byte from a segment so that it does not show
  * up in the user's data queue.  The byte is saved in tp->t_iobc and
  * TCPOOB_HAVEDATA is set.  It is still reflected in the segment length
  * for sequencing purposes.
  *
  * off is added to th->th_urp - 1 to locate the byte within the mbuf
  * chain starting at m.  Panics if the chain ends before that position.
  */
 static void
 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
     int off)
 {
 	struct tcpcb *tp;
 	char *oob;
 	int skip;
 
 	/* Bytes to step over before reaching the urgent byte. */
 	skip = off + th->th_urp - 1;
 	if (skip >= 0) {
 		do {
 			if (skip < m->m_len) {
 				/* The byte lives in this mbuf. */
 				oob = mtod(m, caddr_t) + skip;
 				tp = sototcpcb(so);
 
 				INP_WLOCK_ASSERT(tp->t_inpcb);
 
 				tp->t_iobc = *oob;
 				tp->t_oobflags |= TCPOOB_HAVEDATA;
 				/* Close the gap left by the removed byte. */
 				bcopy(oob + 1, oob,
 				    (unsigned)(m->m_len - skip - 1));
 				m->m_len--;
 				if (m->m_flags & M_PKTHDR)
 					m->m_pkthdr.len--;
 				return;
 			}
 			skip -= m->m_len;
 			m = m->m_next;
 		} while (m != NULL && skip >= 0);
 	}
 	panic("tcp_pulloutofband");
 }
 
 /*
  * Fold a new round-trip time sample (rtt) into the connection's
  * smoothed estimates and recompute the current retransmit timeout.
  *
  * Updates t_srtt, t_rttvar, t_rttbest and t_rxtcur, and resets
  * t_rtttime, t_rxtshift and t_softerror.  The inpcb write lock is
  * asserted.
  */
 static void
 tcp_xmit_timer(struct tcpcb *tp, int rtt)
 {
 	int err;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	TCPSTAT_INC(tcps_rttupdated);
 	tp->t_rttupdated++;
 	if (tp->t_srtt == 0) {
 		/*
 		 * First measurement: seed srtt with the raw sample,
 		 * scaled by 2^TCP_RTT_SHIFT, and seed the variance with
 		 * half the sample, scaled by 2^TCP_RTTVAR_SHIFT (so that
 		 * the first retransmit happens at 3*rtt).
 		 */
 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
 		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * srtt is held in fixed point with TCP_RTT_SHIFT bits
 		 * after the binary point.  Move it toward the new sample
 		 * (adjusted to origin 0) by the scaled difference.
 		 * NOTE(review): the exact smoothing gain (the historical
 		 * comment says rfc793 with alpha .875) depends on the
 		 * TCP_*_SHIFT values, which are defined outside this
 		 * block.
 		 */
 		err = ((rtt - 1) << TCP_DELTA_SHIFT) -
 		    (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
 		tp->t_srtt += err;
 		if (tp->t_srtt <= 0)
 			tp->t_srtt = 1;
 
 		/*
 		 * rttvar accumulates a smoothed mean difference rather
 		 * than a true variance; it is held in fixed point with
 		 * TCP_RTTVAR_SHIFT bits after the binary point and is
 		 * moved toward |err| the same way.  This replaces
 		 * rfc793's wired-in beta.
 		 */
 		if (err < 0)
 			err = -err;
 		err -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		tp->t_rttvar += err;
 		if (tp->t_rttvar <= 0)
 			tp->t_rttvar = 1;
 
 		/* Remember the lowest srtt + rttvar seen so far. */
 		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
 			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 	/* A sample was taken: stop timing and cancel any backoff. */
 	tp->t_rxtshift = 0;
 	tp->t_rtttime = 0;
 
 	/*
 	 * The retransmit should fire at rtt + 4 * rttvar.  The way the
 	 * smoothing is done leaves srtt and rttvar with about +1/2 tick
 	 * of bias each on average.  The timer wants 1/2 tick of rounding
 	 * plus 1 tick for the +-1/2 tick uncertainty in when it fires,
 	 * and the bias supplies exactly that 1.5 ticks.  Because the
 	 * bias is only statistical, the result is still clamped so it
 	 * never drops below the minimum feasible timer (2 ticks).
 	 */
 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
 
 	/*
 	 * The ack was for a packet that wasn't retransmitted, so it is
 	 * probably safe to forget recent error indications.  This isn't
 	 * quite right, but close enough for now (a route might have
 	 * failed after we sent a segment, and the return path might not
 	 * be symmetrical).
 	 */
 	tp->t_softerror = 0;
 }
 
 /*
  * Determine a reasonable value for maxseg size.
  * If the route is known, check route for mtu.
  * If none, use an mss that can be handled on the outgoing interface
  * without forcing IP to fragment.  If no route is found, route has no mtu,
  * or the destination isn't local, use a default, hopefully conservative
  * size (usually 512 or the default IP max size, but no more than the mtu
  * of the interface), as we can't discover anything about intervening
  * gateways or networks.  We also initialize the congestion/slow start
  * window to be a single segment if the destination isn't local.
  * While looking at the routing entry, we also initialize other path-dependent
  * parameters from pre-set or cached values in the routing entry.
  *
  * Also take into account the space needed for options that we
  * send regularly.  Make maxseg shorter by that amount to assure
  * that we can send maxseg amount of data even when the options
  * are present.  Store the upper limit of the length of options plus
  * data in maxopd.
  *
  * NOTE that this routine is only called when we process an incoming
  * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
  * settings are handled in tcp_mssopt().
  */
 void
 tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
     struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
 {
 	int mss = 0;
 	u_long maxmtu = 0;
 	struct inpcb *inp = tp->t_inpcb;
 	struct hc_metrics_lite metrics;
 	int origoffer;
 #ifdef INET6
 	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
 	size_t min_protoh = isipv6 ?
 			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
 			    sizeof (struct tcpiphdr);
 #else
 	const size_t min_protoh = sizeof(struct tcpiphdr);
 #endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (mtuoffer != -1) {
 		KASSERT(offer == -1, ("%s: conflict", __func__));
 		offer = mtuoffer - min_protoh;
 	}
 	origoffer = offer;
 
 	/* Initialize. */
 #ifdef INET6
 	if (isipv6) {
 		maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
 		tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
 		tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
 	}
 #endif
 
 	/*
 	 * No route to sender, stay with default mss and return.
 	 */
 	if (maxmtu == 0) {
 		/*
 		 * In case we return early we need to initialize metrics
 		 * to a defined state as tcp_hc_get() would do for us
 		 * if there was no cache hit.
 		 */
 		if (metricptr != NULL)
 			bzero(metricptr, sizeof(struct hc_metrics_lite));
 		return;
 	}
 
 	/* What have we got? */
 	switch (offer) {
 		case 0:
 			/*
 			 * Offer == 0 means that there was no MSS on the SYN
 			 * segment, in this case we use tcp_mssdflt as
 			 * already assigned to t_maxopd above.
 			 */
 			offer = tp->t_maxopd;
 			break;
 
 		case -1:
 			/*
 			 * Offer == -1 means that we didn't receive SYN yet.
 			 */
 			/* FALLTHROUGH */
 
 		default:
 			/*
 			 * Prevent DoS attack with too small MSS. Round up
 			 * to at least minmss.
 			 */
 			offer = max(offer, V_tcp_minmss);
 	}
 
 	/*
 	 * rmx information is now retrieved from tcp_hostcache.
 	 */
 	tcp_hc_get(&inp->inp_inc, &metrics);
 	if (metricptr != NULL)
 		bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
 
 	/*
 	 * If there's a discovered mtu int tcp hostcache, use it
 	 * else, use the link mtu.
 	 */
 	if (metrics.rmx_mtu)
 		mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
 	else {
 #ifdef INET6
 		if (isipv6) {
 			mss = maxmtu - min_protoh;
 			if (!V_path_mtu_discovery &&
 			    !in6_localaddr(&inp->in6p_faddr))
 				mss = min(mss, V_tcp_v6mssdflt);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			mss = maxmtu - min_protoh;
 			if (!V_path_mtu_discovery &&
 			    !in_localaddr(inp->inp_faddr))
 				mss = min(mss, V_tcp_mssdflt);
 		}
 #endif
 		/*
 		 * XXX - The above conditional (mss = maxmtu - min_protoh)
 		 * probably violates the TCP spec.
 		 * The problem is that, since we don't know the
 		 * other end's MSS, we are supposed to use a conservative
 		 * default.  But, if we do that, then MTU discovery will
 		 * never actually take place, because the conservative
 		 * default is much less than the MTUs typically seen
 		 * on the Internet today.  For the moment, we'll sweep
 		 * this under the carpet.
 		 *
 		 * The conservative default might not actually be a problem
 		 * if the only case this occurs is when sending an initial
 		 * SYN with options and data to a host we've never talked
 		 * to before.  Then, they will reply with an MSS value which
 		 * will get recorded and the new parameters should get
 		 * recomputed.  For Further Study.
 		 */
 	}
 	mss = min(mss, offer);
 
 	/*
 	 * Sanity check: make sure that maxopd will be large
 	 * enough to allow some data on segments even if
 	 * all the option space is used (40 bytes).  Otherwise
 	 * funny things may happen in tcp_output.
 	 */
 	mss = max(mss, 64);
 
 	/*
 	 * maxopd stores the maximum length of data AND options
 	 * in a segment; maxseg is the amount of data in a normal
 	 * segment.  We need to store this value (maxopd) apart
 	 * from maxseg, because now every segment carries options
 	 * and thus we normally have somewhat less data in segments.
 	 */
 	tp->t_maxopd = mss;
 
 	/*
 	 * origoffer==-1 indicates that no segments were received yet.
 	 * In this case we just guess.
 	 */
 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 	    (origoffer == -1 ||
 	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
 		mss -= TCPOLEN_TSTAMP_APPA;
 
 	tp->t_maxseg = mss;
 }
 
 /*
  * Settle the connection's MSS and size the socket buffers around it.
  *
  * tcp_mss_update() is called first (with -1 as its third argument) to
  * compute tp->t_maxseg; it also hands back the metrics it looked up.
  * The send and receive buffers are then rounded up to a whole number
  * of MSS-sized units, and finally TSO is enabled on the connection if
  * the capability word in 'cap' carries CSUM_TSO.
  *
  *	tp	TCP control block; must not be NULL.
  *	offer	MSS offer, passed through unchanged to tcp_mss_update().
  */
 void
 tcp_mss(struct tcpcb *tp, int offer)
 {
 	struct hc_metrics_lite metrics;
 	struct tcp_ifcap cap;
 	struct socket *so;
 	u_long space;
 	int mss;
 
 	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
 
 	/* Start from an all-zero capability record. */
 	bzero(&cap, sizeof(cap));
 	tcp_mss_update(tp, offer, -1, &metrics, &cap);
 
 	mss = tp->t_maxseg;
 	so = tp->t_inpcb->inp_socket;
 
 	/*
 	 * Send side.  A cached pipe size is only honoured while sb_hiwat
 	 * still equals the system default; any other value is taken to
 	 * have been set on purpose with setsockopt and is left alone.
 	 * If the buffer cannot even hold one segment, the MSS is shrunk
 	 * to the buffer size; otherwise the buffer is grown to the next
 	 * multiple of the MSS, limited by sb_max.
 	 */
 	SOCKBUF_LOCK(&so->so_snd);
 	space = so->so_snd.sb_hiwat;
 	if (so->so_snd.sb_hiwat == V_tcp_sendspace && metrics.rmx_sendpipe)
 		space = metrics.rmx_sendpipe;
 	if (space >= mss) {
 		space = roundup(space, mss);
 		if (space > sb_max)
 			space = sb_max;
 		/* Only ever grow the reservation; the result is ignored. */
 		if (space > so->so_snd.sb_hiwat)
 			(void)sbreserve_locked(&so->so_snd, space, so, NULL);
 	} else
 		mss = space;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	tp->t_maxseg = mss;
 
 	/*
 	 * Receive side.  Same default-versus-cached selection as above,
 	 * but a small receive buffer never reduces the MSS.
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	space = so->so_rcv.sb_hiwat;
 	if (so->so_rcv.sb_hiwat == V_tcp_recvspace && metrics.rmx_recvpipe)
 		space = metrics.rmx_recvpipe;
 	if (space > mss) {
 		space = roundup(space, mss);
 		if (space > sb_max)
 			space = sb_max;
 		if (space > so->so_rcv.sb_hiwat)
 			(void)sbreserve_locked(&so->so_rcv, space, so, NULL);
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/*
 	 * Check the interface for TSO capabilities.
 	 * NOTE(review): 'cap' is presumably filled in by tcp_mss_update();
 	 * that part of the callee is not visible from here.
 	 */
 	if ((cap.ifcap & CSUM_TSO) != 0) {
 		tp->t_flags |= TF_TSO;
 		tp->t_tsomax = cap.tsomax;
 	}
 }
 
 /*
  * Determine the MSS option to send on an outgoing SYN.
  *
  * The result starts out as 0 and is replaced by the per-family default
  * (V_tcp_v6mssdflt or V_tcp_mssdflt).  When an MTU is available from
  * tcp_maxmtu()/tcp_maxmtu6() and/or tcp_hc_getmtu(), the result becomes
  * that MTU (the smaller one if both are known) minus the minimal
  * IP + TCP header size for the address family.
  *
  *	inc	connection info; must not be NULL.
  *
  * Returns the MSS value as an int.
  */
 int
 tcp_mssopt(struct in_conninfo *inc)
 {
 	size_t hdrsize;
 	u_long ifmtu = 0;
 	u_long hcmtu = 0;
 	int mss = 0;
 
 	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
 
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		hdrsize = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		mss = V_tcp_v6mssdflt;
 		ifmtu = tcp_maxmtu6(inc, NULL);
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		hdrsize = sizeof(struct tcpiphdr);
 		mss = V_tcp_mssdflt;
 		ifmtu = tcp_maxmtu(inc, NULL);
 	}
 #endif
 #if defined(INET6) || defined(INET)
 	hcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
 #endif
 
 	/* Neither source produced an MTU: stay with the default. */
 	if (ifmtu == 0 && hcmtu == 0)
 		return (mss);
 
 	if (ifmtu == 0 || hcmtu == 0) {
 		/* Exactly one is non-zero; max() selects it. */
 		mss = max(ifmtu, hcmtu) - hdrsize;
 	} else {
 		/* Both known: be bound by the smaller of the two. */
 		mss = min(ifmtu, hcmtu) - hdrsize;
 	}
 
 	return (mss);
 }
 
 
 /*
  * On a partial ack arrives, force the retransmission of the
  * next unacknowledged segment.  Do not clear tp->t_dupacks.
  * By setting snd_nxt to ti_ack, this forces retransmission timer to
  * be started again.
  */
 static void
 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
 {
 	tcp_seq onxt = tp->snd_nxt;
 	u_long  ocwnd = tp->snd_cwnd;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tcp_timer_activate(tp, TT_REXMT, 0);
 	tp->t_rtttime = 0;
 	tp->snd_nxt = th->th_ack;
 	/*
 	 * Set snd_cwnd to one segment beyond acknowledged offset.
 	 * (tp->snd_una has not yet been updated when this function is called.)
 	 */
 	tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 	tp->snd_cwnd = ocwnd;
 	if (SEQ_GT(onxt, tp->snd_nxt))
 		tp->snd_nxt = onxt;
 	/*
 	 * Partial window deflation.  Relies on fact that tp->snd_una
 	 * not updated yet.
 	 */
 	if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
 		tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
 	else
 		tp->snd_cwnd = 0;
 	tp->snd_cwnd += tp->t_maxseg;
 }
Index: head/sys/netinet/tcp_output.c
===================================================================
--- head/sys/netinet/tcp_output.c	(revision 258540)
+++ head/sys/netinet/tcp_output.c	(revision 258541)
@@ -1,1581 +1,1580 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
-#include "opt_kdtrace.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domain.h>
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #define	TCPOUTFLAGS
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * TCP output tunables.  Each variable is introduced with VNET_DEFINE()
  * together with its initial value, and is then registered read-write
  * (CTLFLAG_RW) under the _net_inet_tcp sysctl parent by the
  * SYSCTL_VNET_INT() that follows it.  The trailing string is the
  * description attached to the sysctl node.
  */
 
 /* Path MTU discovery switch; initial value 1. */
 VNET_DEFINE(int, path_mtu_discovery) = 1;
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
 	&VNET_NAME(path_mtu_discovery), 1,
 	"Enable Path MTU Discovery");
 
 /* TCP segmentation offload switch (sysctl leaf "tso"); initial value 1. */
 VNET_DEFINE(int, tcp_do_tso) = 1;
 #define	V_tcp_do_tso		VNET(tcp_do_tso)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
 	&VNET_NAME(tcp_do_tso), 0,
 	"Enable TCP Segmentation Offload");
 
 /*
  * Initial send socket buffer size: 32 KiB.  Unlike the others this one
  * is registered under the fixed identifier TCPCTL_SENDSPACE rather than
  * OID_AUTO.
  */
 VNET_DEFINE(int, tcp_sendspace) = 1024*32;
 #define	V_tcp_sendspace	VNET(tcp_sendspace)
 SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
 	&VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
 
 /* Automatic send buffer sizing switch; initial value 1. */
 VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
 #define	V_tcp_do_autosndbuf	VNET(tcp_do_autosndbuf)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
 	&VNET_NAME(tcp_do_autosndbuf), 0,
 	"Enable automatic send buffer sizing");
 
 /* Step by which the automatic send buffer is enlarged: 8 KiB. */
 VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
 #define	V_tcp_autosndbuf_inc	VNET(tcp_autosndbuf_inc)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
 	&VNET_NAME(tcp_autosndbuf_inc), 0,
 	"Incrementor step size of automatic send buffer");
 
 /* Upper bound for the automatically sized send buffer: 2 MiB. */
 VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
 #define	V_tcp_autosndbuf_max	VNET(tcp_autosndbuf_max)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
 	&VNET_NAME(tcp_autosndbuf_max), 0,
 	"Max size of automatic send buffer");
 
 /* Forward declarations of the file-local inline helpers defined below. */
 static void inline	hhook_run_tcp_est_out(struct tcpcb *tp,
 			    struct tcphdr *th, struct tcpopt *to,
 			    long len, int tso);
 static void inline	cc_after_idle(struct tcpcb *tp);
 
 /*
  * Wrapper for the TCP established output helper hook.
  */
 static void inline
 hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
     struct tcpopt *to, long len, int tso)
 {
 	struct tcp_hhook_data hhook_data;
 
 	if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) {
 		hhook_data.tp = tp;
 		hhook_data.th = th;
 		hhook_data.to = to;
 		hhook_data.len = len;
 		hhook_data.tso = tso;
 
 		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data,
 		    tp->osd);
 	}
 }
 
 /*
  * CC wrapper hook functions
  */
 
 /*
  * Invoke the after_idle callback of the connection's congestion control
  * algorithm with tp->ccv, if the algorithm supplies one.  The inpcb
  * write lock must be held (asserted).
  */
 static void inline
 cc_after_idle(struct tcpcb *tp)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/* The callback is optional. */
 	if (CC_ALGO(tp)->after_idle == NULL)
 		return;
 
 	CC_ALGO(tp)->after_idle(tp->ccv);
 }
 
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
 int
 tcp_output(struct tcpcb *tp)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 	long len, recwin, sendwin;
 	int off, flags, error = 0;	/* Keep compiler happy */
 	struct mbuf *m;
 	struct ip *ip = NULL;
 	struct ipovly *ipov = NULL;
 	struct tcphdr *th;
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 #ifdef IPSEC
 	unsigned ipsec_optlen = 0;
 #endif
 	int idle, sendalot;
 	int sack_rxmit, sack_bytes_rxmt;
 	struct sackhole *p;
 	int tso, mtu;
 	struct tcpopt to;
 #if 0
 	int maxburst = TCP_MAXBURST;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 #endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return (tcp_offload_output(tp));
 #endif
 
 	/*
 	 * Determine length of data that should be transmitted,
 	 * and flags that will be used.
 	 * If there is some data or critical controls (SYN, RST)
 	 * to send, then transmit; otherwise, investigate further.
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 	if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
 		cc_after_idle(tp);
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
 		if (tp->t_flags & TF_MORETOCOME) {
 			tp->t_flags |= TF_LASTIDLE;
 			idle = 0;
 		}
 	}
 again:
 	/*
 	 * If we've recently taken a timeout, snd_max will be greater than
 	 * snd_nxt.  There may be SACK information that allows us to avoid
 	 * resending already delivered data.  Adjust snd_nxt accordingly.
 	 */
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    SEQ_LT(tp->snd_nxt, tp->snd_max))
 		tcp_sack_adjust(tp);
 	sendalot = 0;
 	tso = 0;
 	mtu = 0;
 	off = tp->snd_nxt - tp->snd_una;
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*
 	 * Send any SACK-generated retransmissions.  If we're explicitly trying
 	 * to send out new data (when sendalot is 1), bypass this function.
 	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
 	 * we're replacing a (future) new transmission with a retransmission
 	 * now, and we previously incremented snd_cwnd in tcp_input().
 	 */
 	/*
 	 * Still in sack recovery , reset rxmit flag to zero.
 	 */
 	sack_rxmit = 0;
 	sack_bytes_rxmt = 0;
 	len = 0;
 	p = NULL;
 	if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
 	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
 		long cwin;
 		
 		cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
 		if (cwin < 0)
 			cwin = 0;
 		/* Do not retransmit SACK segments beyond snd_recover */
 		if (SEQ_GT(p->end, tp->snd_recover)) {
 			/*
 			 * (At least) part of sack hole extends beyond
 			 * snd_recover. Check to see if we can rexmit data
 			 * for this hole.
 			 */
 			if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
 				/*
 				 * Can't rexmit any more data for this hole.
 				 * That data will be rexmitted in the next
 				 * sack recovery episode, when snd_recover
 				 * moves past p->rxmit.
 				 */
 				p = NULL;
 				goto after_sack_rexmit;
 			} else
 				/* Can rexmit part of the current hole */
 				len = ((long)ulmin(cwin,
 						   tp->snd_recover - p->rxmit));
 		} else
 			len = ((long)ulmin(cwin, p->end - p->rxmit));
 		off = p->rxmit - tp->snd_una;
 		KASSERT(off >= 0,("%s: sack block to the left of una : %d",
 		    __func__, off));
 		if (len > 0) {
 			sack_rxmit = 1;
 			sendalot = 1;
 			TCPSTAT_INC(tcps_sack_rexmits);
 			TCPSTAT_ADD(tcps_sack_rexmit_bytes,
 			    min(len, tp->t_maxseg));
 		}
 	}
 after_sack_rexmit:
 	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
 	 */
 	if (tp->t_flags & TF_NEEDFIN)
 		flags |= TH_FIN;
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	/*
 	 * If in persist timeout with window of 0, send 1 byte.
 	 * Otherwise, if window is small but nonzero
 	 * and timer expired, we will send what we can
 	 * and go to transmit state.
 	 */
 	if (tp->t_flags & TF_FORCEDATA) {
 		if (sendwin == 0) {
 			/*
 			 * If we still have some data to send, then
 			 * clear the FIN bit.  Usually this would
 			 * happen below when it realizes that we
 			 * aren't sending all the data.  However,
 			 * if we have exactly 1 byte of unsent data,
 			 * then it won't clear the FIN bit below,
 			 * and if we are in persist state, we wind
 			 * up sending the packet without recording
 			 * that we sent the FIN bit.
 			 *
 			 * We can't just blindly clear the FIN bit,
 			 * because if we don't have any more data
 			 * to send then the probe will be the FIN
 			 * itself.
 			 */
 			if (off < so->so_snd.sb_cc)
 				flags &= ~TH_FIN;
 			sendwin = 1;
 		} else {
 			tcp_timer_activate(tp, TT_PERSIST, 0);
 			tp->t_rxtshift = 0;
 		}
 	}
 
 	/*
 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
 	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
 	 * a negative length.  This can also occur when TCP opens up
 	 * its congestion window while receiving additional duplicate
 	 * acks after fast-retransmit because TCP will reset snd_nxt
 	 * to snd_max after the fast-retransmit.
 	 *
 	 * In the normal retransmit-FIN-only case, however, snd_nxt will
 	 * be set to snd_una, the offset will be 0, and the length may
 	 * wind up 0.
 	 *
 	 * If sack_rxmit is true we are retransmitting from the scoreboard
 	 * in which case len is already set.
 	 */
 	if (sack_rxmit == 0) {
 		if (sack_bytes_rxmt == 0)
 			len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
 		else {
 			long cwin;
 
                         /*
 			 * We are inside of a SACK recovery episode and are
 			 * sending new data, having retransmitted all the
 			 * data possible in the scoreboard.
 			 */
 			len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) 
 			       - off);
 			/*
 			 * Don't remove this (len > 0) check !
 			 * We explicitly check for len > 0 here (although it 
 			 * isn't really necessary), to work around a gcc 
 			 * optimization issue - to force gcc to compute
 			 * len above. Without this check, the computation
 			 * of len is bungled by the optimizer.
 			 */
 			if (len > 0) {
 				cwin = tp->snd_cwnd - 
 					(tp->snd_nxt - tp->sack_newdata) -
 					sack_bytes_rxmt;
 				if (cwin < 0)
 					cwin = 0;
 				len = lmin(len, cwin);
 			}
 		}
 	}
 
 	/*
 	 * Lop off SYN bit if it has already been sent.  However, if this
 	 * is SYN-SENT state and if segment contains data and if we don't
 	 * know that foreign host supports TAO, suppress sending segment.
 	 */
 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
 		if (tp->t_state != TCPS_SYN_RECEIVED)
 			flags &= ~TH_SYN;
 		off--, len++;
 	}
 
 	/*
 	 * Be careful not to send data and/or FIN on SYN segments.
 	 * This measure is needed to prevent interoperability problems
 	 * with not fully conformant TCP implementations.
 	 */
 	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 		len = 0;
 		flags &= ~TH_FIN;
 	}
 
 	if (len < 0) {
 		/*
 		 * If FIN has been sent but not acked,
 		 * but we haven't been called to retransmit,
 		 * len will be < 0.  Otherwise, window shrank
 		 * after we sent into it.  If window shrank to 0,
 		 * cancel pending retransmit, pull snd_nxt back
 		 * to (closed) window, and set the persist timer
 		 * if it isn't already going.  If the window didn't
 		 * close completely, just wait for an ACK.
 		 */
 		len = 0;
 		if (sendwin == 0) {
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			tp->t_rxtshift = 0;
 			tp->snd_nxt = tp->snd_una;
 			if (!tcp_timer_active(tp, TT_PERSIST))
 				tcp_setpersist(tp);
 		}
 	}
 
 	/* len will be >= 0 after this point. */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 
 	/*
 	 * Automatic sizing of send socket buffer.  Often the send buffer
 	 * size is not optimally adjusted to the actual network conditions
 	 * at hand (delay bandwidth product).  Setting the buffer size too
 	 * small limits throughput on links with high bandwidth and high
 	 * delay (eg. trans-continental/oceanic links).  Setting the
 	 * buffer size too big consumes too much real kernel memory,
 	 * especially with many connections on busy servers.
 	 *
 	 * The criteria to step up the send buffer one notch are:
 	 *  1. receive window of remote host is larger than send buffer
 	 *     (with a fudge factor of 5/4th);
 	 *  2. send buffer is filled to 7/8th with data (so we actually
 	 *     have data to make use of it);
 	 *  3. send buffer fill has not hit maximal automatic size;
 	 *  4. our send window (slow start and congestion controlled) is
 	 *     larger than sent but unacknowledged data in send buffer.
 	 *
 	 * The remote host receive window scaling factor may limit the
 	 * growing of the send buffer before it reaches its allowed
 	 * maximum.
 	 *
 	 * It scales directly with slow start or congestion window
 	 * and does at most one step per received ACK.  This fast
 	 * scaling has the drawback of growing the send buffer beyond
 	 * what is strictly necessary to make full use of a given
 	 * delay*bandwidth product.  However testing has shown this not
 	 * to be much of a problem.  At worst we are trading wasting
 	 * of available bandwidth (the non-use of it) for wasting some
 	 * socket buffer memory.
 	 *
 	 * TODO: Shrink send buffer during idle periods together
 	 * with congestion window.  Requires another timer.  Has to
 	 * wait for upcoming tcp timer rewrite.
 	 */
 	if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
 		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
 		    so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
 		    so->so_snd.sb_cc < V_tcp_autosndbuf_max &&
 		    sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
 			if (!sbreserve_locked(&so->so_snd,
 			    min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
 			     V_tcp_autosndbuf_max), so, curthread))
 				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
 		}
 	}
 
 	/*
 	 * Decide if we can use TCP Segmentation Offloading (if supported by
 	 * hardware).
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and
 	 * IP options prevent using TSO.  With TSO the TCP header is the same
 	 * (except for the sequence number) for all generated packets.  This
 	 * makes it impossible to transmit any options which vary per generated
 	 * segment or packet.
 	 */
 #ifdef IPSEC
 	/*
 	 * Pre-calculate here as we save another lookup into the darknesses
 	 * of IPsec that way and can actually decide if TSO is ok.
 	 */
 	ipsec_optlen = ipsec_hdrsiz_tcp(tp);
 #endif
 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
 	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
 #ifdef IPSEC
 	    ipsec_optlen == 0 &&
 #endif
 	    tp->t_inpcb->inp_options == NULL &&
 	    tp->t_inpcb->in6p_options == NULL)
 		tso = 1;
 
 	if (sack_rxmit) {
 		if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
 			flags &= ~TH_FIN;
 	} else {
 		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
 			flags &= ~TH_FIN;
 	}
 
 	recwin = sbspace(&so->so_rcv);
 
 	/*
 	 * Sender silly window avoidance.   We transmit under the following
 	 * conditions when len is non-zero:
 	 *
 	 *	- We have a full segment (or more with TSO)
 	 *	- This is the last buffer in a write()/send() and we are
 	 *	  either idle or running NODELAY
 	 *	- we've timed out (e.g. persist timer)
 	 *	- we have more then 1/2 the maximum send window's worth of
 	 *	  data (receiver may be limited the window size)
 	 *	- we need to retransmit
 	 */
 	if (len) {
 		if (len >= tp->t_maxseg)
 			goto send;
 		/*
 		 * NOTE! on localhost connections an 'ack' from the remote
 		 * end may occur synchronously with the output and cause
 		 * us to flush a buffer queued with moretocome.  XXX
 		 *
 		 * note: the len + off check is almost certainly unnecessary.
 		 */
 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
 		    (idle || (tp->t_flags & TF_NODELAY)) &&
 		    len + off >= so->so_snd.sb_cc &&
 		    (tp->t_flags & TF_NOPUSH) == 0) {
 			goto send;
 		}
 		if (tp->t_flags & TF_FORCEDATA)		/* typ. timeout case */
 			goto send;
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
 			goto send;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
 			goto send;
 		if (sack_rxmit)
 			goto send;
 	}
 
 	/*
 	 * Sending of standalone window updates.
 	 *
 	 * Window updates are important when we close our window due to a
 	 * full socket buffer and are opening it again after the application
 	 * reads data from it.  Once the window has opened again and the
 	 * remote end starts to send again the ACK clock takes over and
 	 * provides the most current window information.
 	 *
 	 * We must avoid the silly window syndrome whereas every read
 	 * from the receive buffer, no matter how small, causes a window
 	 * update to be sent.  We also should avoid sending a flurry of
 	 * window updates when the socket buffer had queued a lot of data
 	 * and the application is doing small reads.
 	 *
 	 * Prevent a flurry of pointless window updates by only sending
 	 * an update when we can increase the advertized window by more
 	 * than 1/4th of the socket buffer capacity.  When the buffer is
 	 * getting full or is very small be more aggressive and send an
 	 * update whenever we can increase by two mss sized segments.
 	 * In all other situations the ACK's to new incoming data will
 	 * carry further window increases.
 	 *
 	 * Don't send an independent window update if a delayed
 	 * ACK is pending (it will get piggy-backed on it) or the
 	 * remote side already has done a half-close and won't send
 	 * more data.  Skip this if the connection is in T/TCP
 	 * half-open state.
 	 */
 	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
 	    !(tp->t_flags & TF_DELACK) &&
 	    !TCPS_HAVERCVDFIN(tp->t_state)) {
 		/*
 		 * "adv" is the amount we could increase the window,
 		 * taking into account that we are limited by
 		 * TCP_MAXWIN << tp->rcv_scale.
 		 */
 		long adv;
 		int oldwin;
 
 		adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
 			oldwin = (tp->rcv_adv - tp->rcv_nxt);
 			adv -= oldwin;
 		} else
 			oldwin = 0;
 
 		/* 
 		 * If the new window size ends up being the same as the old
 		 * size when it is scaled, then don't force a window update.
 		 */
 		if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
 			goto dontupdate;
 
 		if (adv >= (long)(2 * tp->t_maxseg) &&
 		    (adv >= (long)(so->so_rcv.sb_hiwat / 4) ||
 		     recwin <= (long)(so->so_rcv.sb_hiwat / 8) ||
 		     so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg))
 			goto send;
 	}
 dontupdate:
 
 	/*
 	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 	 * is also a catch-all for the retransmit timer timeout case.
 	 */
 	if (tp->t_flags & TF_ACKNOW)
 		goto send;
 	if ((flags & TH_RST) ||
 	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
 		goto send;
 	if (SEQ_GT(tp->snd_up, tp->snd_una))
 		goto send;
 	/*
 	 * If our state indicates that FIN should be sent
 	 * and we have not yet done so, then we need to send.
 	 */
 	if (flags & TH_FIN &&
 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 		goto send;
 	/*
 	 * In SACK, it is possible for tcp_output to fail to send a segment
 	 * after the retransmission timer has been turned off.  Make sure
 	 * that the retransmission timer is set.
 	 */
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una) &&
 	    !tcp_timer_active(tp, TT_REXMT) &&
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 		goto just_return;
 	} 
 	/*
 	 * TCP window updates are not reliable, rather a polling protocol
 	 * using ``persist'' packets is used to insure receipt of window
 	 * updates.  The three ``states'' for the output side are:
 	 *	idle			not doing retransmits or persists
 	 *	persisting		to move a small or zero window
 	 *	(re)transmitting	and thereby not persisting
 	 *
 	 * tcp_timer_active(tp, TT_PERSIST)
 	 *	is true when we are in persist state.
 	 * (tp->t_flags & TF_FORCEDATA)
 	 *	is set when we are called to send a persist packet.
 	 * tcp_timer_active(tp, TT_REXMT)
 	 *	is set when we are retransmitting
 	 * The output side is idle when both timers are zero.
 	 *
 	 * If send window is too small, there is data to transmit, and no
 	 * retransmit or persist is pending, then go to persist state.
 	 * If nothing happens soon, send when timer expires:
 	 * if window is nonzero, transmit what we can,
 	 * otherwise force out a byte.
 	 */
 	if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) &&
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tp->t_rxtshift = 0;
 		tcp_setpersist(tp);
 	}
 
 	/*
 	 * No reason to send a segment, just return.
 	 */
 just_return:
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 
 send:
 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 	/*
 	 * Before ESTABLISHED, force sending of initial options
 	 * unless TCP set not to do any options.
 	 * NOTE: we assume that the IP/TCP header plus TCP options
 	 * always fit in a single mbuf, leaving room for a maximum
 	 * link header, i.e.
 	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
 	 */
 	optlen = 0;
 #ifdef INET6
 	if (isipv6)
 		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 	else
 #endif
 		hdrlen = sizeof (struct tcpiphdr);
 
 	/*
 	 * Compute options for segment.
 	 * We only have to care about SYN and established connection
 	 * segments.  Options for SYN-ACK segments are handled in TCP
 	 * syncache.
 	 */
 	if ((tp->t_flags & TF_NOOPT) == 0) {
 		to.to_flags = 0;
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			tp->snd_nxt = tp->iss;
 			to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
 			to.to_flags |= TOF_MSS;
 		}
 		/* Window scaling. */
 		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
 			to.to_wscale = tp->request_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		/* Timestamps. */
 		if ((tp->t_flags & TF_RCVD_TSTMP) ||
 		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
 			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 			/* Set receive buffer autosizing timestamp. */
 			if (tp->rfbuf_ts == 0 &&
 			    (so->so_rcv.sb_flags & SB_AUTOSIZE))
 				tp->rfbuf_ts = tcp_ts_getticks();
 		}
 		/* Selective ACK's. */
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (flags & TH_SYN)
 				to.to_flags |= TOF_SACKPERM;
 			else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 			    (tp->t_flags & TF_SACK_PERMIT) &&
 			    tp->rcv_numsacks > 0) {
 				to.to_flags |= TOF_SACK;
 				to.to_nsacks = tp->rcv_numsacks;
 				to.to_sacks = (u_char *)tp->sackblks;
 			}
 		}
 #ifdef TCP_SIGNATURE
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif /* TCP_SIGNATURE */
 
 		/* Processing the options. */
 		hdrlen += optlen = tcp_addoptions(&to, opt);
 	}
 
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
 	else
 #endif
 	if (tp->t_inpcb->inp_options)
 		ipoptlen = tp->t_inpcb->inp_options->m_len -
 				offsetof(struct ipoption, ipopt_list);
 	else
 		ipoptlen = 0;
 #ifdef IPSEC
 	ipoptlen += ipsec_optlen;
 #endif
 
 	/*
 	 * Adjust data length if insertion of options will
 	 * bump the packet length beyond the t_maxopd length.
 	 * Clear the FIN bit because we cut off the tail of
 	 * the segment.
 	 */
 	if (len + optlen + ipoptlen > tp->t_maxopd) {
 		flags &= ~TH_FIN;
 
 		if (tso) {
 			KASSERT(ipoptlen == 0,
 			    ("%s: TSO can't do IP options", __func__));
 
 			/*
 			 * Limit a burst to t_tsomax minus IP,
 			 * TCP and options length to keep ip->ip_len
 			 * from overflowing or exceeding the maximum
 			 * length allowed by the network interface.
 			 */
 			if (len > tp->t_tsomax - hdrlen) {
 				len = tp->t_tsomax - hdrlen;
 				sendalot = 1;
 			}
 
 			/*
 			 * Prevent the last segment from being
 			 * fractional unless the send sockbuf can
 			 * be emptied.
 			 */
 			if (sendalot && off + len < so->so_snd.sb_cc) {
 				len -= len % (tp->t_maxopd - optlen);
 				sendalot = 1;
 			}
 
 			/*
 			 * Send the FIN in a separate segment
 			 * after the bulk sending is done.
 			 * We don't trust the TSO implementations
 			 * to clear the FIN flag on all but the
 			 * last segment.
 			 */
 			if (tp->t_flags & TF_NEEDFIN)
 				sendalot = 1;
 
 		} else {
 			len = tp->t_maxopd - optlen - ipoptlen;
 			sendalot = 1;
 		}
 	} else
 		tso = 0;
 
 	KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
 	    ("%s: len > IP_MAXPACKET", __func__));
 
 /*#ifdef DIAGNOSTIC*/
 #ifdef INET6
 	if (max_linkhdr + hdrlen > MCLBYTES)
 #else
 	if (max_linkhdr + hdrlen > MHLEN)
 #endif
 		panic("tcphdr too big");
 /*#endif*/
 
 	/*
 	 * This KASSERT is here to catch edge cases at a well defined place.
 	 * Before, those had triggered (random) panic conditions further down.
 	 */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 
 	/*
 	 * Grab a header mbuf, attaching a copy of data to
 	 * be transmitted, and initialize the header from
 	 * the template for sends on this connection.
 	 */
 	if (len) {
 		struct mbuf *mb;
 		u_int moff;
 
 		if ((tp->t_flags & TF_FORCEDATA) && len == 1)
 			TCPSTAT_INC(tcps_sndprobe);
 		else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
 			tp->t_sndrexmitpack++;
 			TCPSTAT_INC(tcps_sndrexmitpack);
 			TCPSTAT_ADD(tcps_sndrexmitbyte, len);
 		} else {
 			TCPSTAT_INC(tcps_sndpack);
 			TCPSTAT_ADD(tcps_sndbyte, len);
 		}
 #ifdef INET6
 		if (MHLEN < hdrlen + max_linkhdr)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 #endif
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 
 		if (m == NULL) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 
 		/*
 		 * Start the m_copy functions from the closest mbuf
 		 * to the offset in the socket buffer chain.
 		 */
 		mb = sbsndptr(&so->so_snd, off, len, &moff);
 
 		if (len <= MHLEN - hdrlen - max_linkhdr) {
 			m_copydata(mb, moff, (int)len,
 			    mtod(m, caddr_t) + hdrlen);
 			m->m_len += len;
 		} else {
 			m->m_next = m_copy(mb, moff, (int)len);
 			if (m->m_next == NULL) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				(void) m_free(m);
 				error = ENOBUFS;
 				sack_rxmit = 0;
 				goto out;
 			}
 		}
 
 		/*
 		 * If we're sending everything we've got, set PUSH.
 		 * (This will keep happy those implementations which only
 		 * give data to the user when a buffer fills or
 		 * a PUSH comes in.)
 		 */
 		if (off + len == so->so_snd.sb_cc)
 			flags |= TH_PUSH;
 		SOCKBUF_UNLOCK(&so->so_snd);
 	} else {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (tp->t_flags & TF_ACKNOW)
 			TCPSTAT_INC(tcps_sndacks);
 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
 			TCPSTAT_INC(tcps_sndctrl);
 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
 			TCPSTAT_INC(tcps_sndurg);
 		else
 			TCPSTAT_INC(tcps_sndwinup);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 #ifdef INET6
 		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
 		    MHLEN >= hdrlen) {
 			MH_ALIGN(m, hdrlen);
 		} else
 #endif
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 	}
 	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 #ifdef MAC
 	mac_inpcb_create_mbuf(tp->t_inpcb, m);
 #endif
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip6, th);
 	} else
 #endif /* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 		ipov = (struct ipovly *)ip;
 		th = (struct tcphdr *)(ip + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip, th);
 	}
 
 	/*
 	 * Fill in fields, remembering maximum advertised
 	 * window for use in delaying messages about window sizes.
 	 * If resending a FIN, be sure not to use a new sequence number.
 	 */
 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 	    tp->snd_nxt == tp->snd_max)
 		tp->snd_nxt--;
 	/*
 	 * If we are starting a connection, send ECN setup
 	 * SYN packet. If we are on a retransmit, we may
 	 * resend those bits a number of times as per
 	 * RFC 3168.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
 		if (tp->t_rxtshift >= 1) {
 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
 				flags |= TH_ECE|TH_CWR;
 		} else
 			flags |= TH_ECE|TH_CWR;
 	}
 	
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (tp->t_flags & TF_ECN_PERMIT)) {
 		/*
 		 * If the peer has ECN, mark data packets with
 		 * ECN capable transmission (ECT).
 		 * Ignore pure ack packets, retransmissions and window probes.
 		 */
 		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
 		    !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
 #ifdef INET6
 			if (isipv6)
 				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
 			else
 #endif
 				ip->ip_tos |= IPTOS_ECN_ECT0;
 			TCPSTAT_INC(tcps_ecn_ect0);
 		}
 		
 		/*
 		 * Reply with proper ECN notifications.
 		 */
 		if (tp->t_flags & TF_ECN_SND_CWR) {
 			flags |= TH_CWR;
 			tp->t_flags &= ~TF_ECN_SND_CWR;
 		} 
 		if (tp->t_flags & TF_ECN_SND_ECE)
 			flags |= TH_ECE;
 	}
 	
 	/*
 	 * If we are doing retransmissions, then snd_nxt will
 	 * not reflect the first unsent octet.  For ACK only
 	 * packets, we do not want the sequence number of the
 	 * retransmitted packet, we want the sequence number
 	 * of the next unsent octet.  So, if there is no data
 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
 	 * when filling in ti_seq.  But if we are in persist
 	 * state, snd_max might reflect one byte beyond the
 	 * right edge of the window, so use snd_nxt in that
 	 * case, since we know we aren't doing a retransmission.
 	 * (retransmit and persist are mutually exclusive...)
 	 */
 	if (sack_rxmit == 0) {
 		if (len || (flags & (TH_SYN|TH_FIN)) ||
 		    tcp_timer_active(tp, TT_PERSIST))
 			th->th_seq = htonl(tp->snd_nxt);
 		else
 			th->th_seq = htonl(tp->snd_max);
 	} else {
 		th->th_seq = htonl(p->rxmit);
 		p->rxmit += len;
 		tp->sackhint.sack_bytes_rexmit += len;
 	}
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 	}
 	th->th_flags = flags;
 	/*
 	 * Calculate receive window.  Don't shrink window,
 	 * but avoid silly window syndrome.
 	 */
 	if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
 	    recwin < (long)tp->t_maxseg)
 		recwin = 0;
 	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
 	    recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
 		recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
 	if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
 		recwin = (long)TCP_MAXWIN << tp->rcv_scale;
 
 	/*
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 	 * or <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK>
 	 * case is handled in syncache.
 	 */
 	if (flags & TH_SYN)
 		th->th_win = htons((u_short)
 				(min(sbspace(&so->so_rcv), TCP_MAXWIN)));
 	else
 		th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
 
 	/*
 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised
 	 * a 0 window.  This may cause the remote transmitter to stall.  This
 	 * flag tells soreceive() to disable delayed acknowledgements when
 	 * draining the buffer.  This can occur if the receiver is attempting
 	 * to read more data than can be buffered prior to transmitting on
 	 * the connection.
 	 */
 	if (th->th_win == 0) {
 		tp->t_sndzerowin++;
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
 		th->th_flags |= TH_URG;
 	} else
 		/*
 		 * If no urgent pointer to send, then we pull
 		 * the urgent pointer to the left edge of the send window
 		 * so that it doesn't drift into the send window on sequence
 		 * number wraparound.
 		 */
 		tp->snd_up = tp->snd_una;		/* drag it along */
 
 #ifdef TCP_SIGNATURE
 	if (tp->t_flags & TF_SIGNATURE) {
 		int sigoff = to.to_signature - opt;
 		tcp_signature_compute(m, 0, len, optlen,
 		    (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
 	}
 #endif
 
 	/*
 	 * Put TCP length in extended header, and then
 	 * checksum extended header and data.
 	 */
 	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
 	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * ip6_plen does not need to be filled in now; it will be
 		 * filled in by ip6_output().
 		 */
 		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 		th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
 		    optlen + len, IPPROTO_TCP, 0);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
 
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 #endif
 
 	/*
 	 * Enable TSO and specify the size of the segments.
 	 * The TCP pseudo header checksum is always provided.
 	 * XXX: Fixme: This is currently not the case for IPv6.
 	 */
 	if (tso) {
 		KASSERT(len > tp->t_maxopd - optlen,
 		    ("%s: len <= tso_segsz", __func__));
 		m->m_pkthdr.csum_flags |= CSUM_TSO;
 		m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
 	}
 
 #ifdef IPSEC
 	KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
 	    ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u",
 	    __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
 #else
 	KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
 	    ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u",
 	    __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
 #endif
 
 	/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
 	hhook_run_tcp_est_out(tp, th, &to, len, tso);
 
 #ifdef TCPDEBUG
 	/*
 	 * Trace.
 	 */
 	if (so->so_options & SO_DEBUG) {
 		u_short save = 0;
 #ifdef INET6
 		if (!isipv6)
 #endif
 		{
 			save = ipov->ih_len;
 			ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
 		}
 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
 #ifdef INET6
 		if (!isipv6)
 #endif
 		ipov->ih_len = save;
 	}
 #endif /* TCPDEBUG */
 
 	/*
 	 * Fill in IP length and desired time to live and
 	 * send to IP level.  There should be a better way
 	 * to handle ttl and tos; we could keep them in
 	 * the template, but need a way to checksum without them.
 	 */
 	/*
 	 * m->m_pkthdr.len should have been set before cksum calculation,
 	 * because in6_cksum() needs it.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		struct route_in6 ro;
 
 		bzero(&ro, sizeof(ro));
 		/*
 		 * we separately set hoplimit for every segment, since the
 		 * user might want to change the value via setsockopt.
 		 * Also, desired default hop limit might be changed via
 		 * Neighbor Discovery.
 		 */
 		ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
 
 		/*
 		 * Set the packet size here for the benefit of DTrace probes.
 		 * ip6_output() will set it properly; it's supposed to include
 		 * the option header lengths as well.
 		 */
 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect_request, NULL, tp, ip6, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
 
 		/* TODO: IPv6 IP6TOS_ECT bit on */
 		error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro,
 		    ((so->so_options & SO_DONTROUTE) ?  IP_ROUTETOIF : 0),
 		    NULL, NULL, tp->t_inpcb);
 
 		if (error == EMSGSIZE && ro.ro_rt != NULL)
 			mtu = ro.ro_rt->rt_rmx.rmx_mtu;
 		RO_RTFREE(&ro);
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
     {
 	struct route ro;
 
 	bzero(&ro, sizeof(ro));
 	ip->ip_len = htons(m->m_pkthdr.len);
 #ifdef INET6
 	if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
 		ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
 #endif /* INET6 */
 	/*
 	 * If we do path MTU discovery, then we set DF on every packet.
 	 * This might not be the best thing to do according to RFC3390
 	 * Section 2. However the tcp hostcache mitigates the problem
 	 * so it affects only the first tcp connection with a host.
 	 *
 	 * NB: Don't set DF on small MTU/MSS to have a safe fallback.
 	 */
 	if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss)
 		ip->ip_off |= htons(IP_DF);
 
 	if (tp->t_state == TCPS_SYN_SENT)
 		TCP_PROBE5(connect_request, NULL, tp, ip, tp, th);
 
 	TCP_PROBE5(send, NULL, tp, ip, tp, th);
 
 	error = ip_output(m, tp->t_inpcb->inp_options, &ro,
 	    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
 	    tp->t_inpcb);
 
 	if (error == EMSGSIZE && ro.ro_rt != NULL)
 		mtu = ro.ro_rt->rt_rmx.rmx_mtu;
 	RO_RTFREE(&ro);
     }
 #endif /* INET */
 
 out:
 	/*
 	 * In transmit state, time the transmission and arrange for
 	 * the retransmit.  In persist state, just set snd_max.
 	 */
 	if ((tp->t_flags & TF_FORCEDATA) == 0 || 
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tcp_seq startseq = tp->snd_nxt;
 
 		/*
 		 * Advance snd_nxt over sequence space of this segment.
 		 */
 		if (flags & (TH_SYN|TH_FIN)) {
 			if (flags & TH_SYN)
 				tp->snd_nxt++;
 			if (flags & TH_FIN) {
 				tp->snd_nxt++;
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
 		if (sack_rxmit)
 			goto timer;
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			tp->snd_max = tp->snd_nxt;
 			/*
 			 * Time this transmission if not a retransmission and
 			 * not currently timing anything.
 			 */
 			if (tp->t_rtttime == 0) {
 				tp->t_rtttime = ticks;
 				tp->t_rtseq = startseq;
 				TCPSTAT_INC(tcps_segstimed);
 			}
 		}
 
 		/*
 		 * Set retransmit timer if not currently set,
 		 * and not doing a pure ack or a keep-alive probe.
 		 * Initial value for retransmit timer is smoothed
 		 * round-trip time + 2 * round-trip time variance.
 		 * Initialize shift counter which is used for backoff
 		 * of retransmit time.
 		 */
 timer:
 		if (!tcp_timer_active(tp, TT_REXMT) &&
 		    ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
 		     (tp->snd_nxt != tp->snd_una))) {
 			if (tcp_timer_active(tp, TT_PERSIST)) {
 				tcp_timer_activate(tp, TT_PERSIST, 0);
 				tp->t_rxtshift = 0;
 			}
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 		}
 	} else {
 		/*
 		 * Persist case, update snd_max but since we are in
 		 * persist mode (no window) we do not update snd_nxt.
 		 */
 		int xlen = len;
 		if (flags & TH_SYN)
 			++xlen;
 		if (flags & TH_FIN) {
 			++xlen;
 			tp->t_flags |= TF_SENTFIN;
 		}
 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
 			tp->snd_max = tp->snd_nxt + len;
 	}
 
 	if (error) {
 
 		/*
 		 * We know that the packet was lost, so back out the
 		 * sequence number advance, if any.
 		 *
 		 * If the error is EPERM the packet got blocked by the
 		 * local firewall.  Normally we should terminate the
 		 * connection but the blocking may have been spurious
 		 * due to a firewall reconfiguration cycle.  So we treat
 		 * it like a packet loss and let the retransmit timer and
 		 * timeouts do their work over time.
 		 * XXX: It is a POLA question whether calling tcp_drop right
 		 * away would be the really correct behavior instead.
 		 */
 		if (((tp->t_flags & TF_FORCEDATA) == 0 ||
 		    !tcp_timer_active(tp, TT_PERSIST)) &&
 		    ((flags & TH_SYN) == 0) &&
 		    (error != EPERM)) {
 			if (sack_rxmit) {
 				p->rxmit -= len;
 				tp->sackhint.sack_bytes_rexmit -= len;
 				KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
 				    ("sackhint bytes rtx >= 0"));
 			} else
 				tp->snd_nxt -= len;
 		}
 		SOCKBUF_UNLOCK_ASSERT(&so->so_snd);	/* Check gotos. */
 		switch (error) {
 		case EPERM:
 			tp->t_softerror = error;
 			return (error);
 		case ENOBUFS:
 	                if (!tcp_timer_active(tp, TT_REXMT) &&
 			    !tcp_timer_active(tp, TT_PERSIST))
 	                        tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 			tp->snd_cwnd = tp->t_maxseg;
 			return (0);
 		case EMSGSIZE:
 			/*
 			 * For some reason the interface we used initially
 			 * to send segments changed to another or lowered
 			 * its MTU.
 			 * If TSO was active we either got an interface
 			 * without TSO capabilities or TSO was turned off.
 			 * If we obtained mtu from ip_output() then update
 			 * it and try again.
 			 */
 			if (tso)
 				tp->t_flags &= ~TF_TSO;
 			if (mtu != 0) {
 				tcp_mss_update(tp, -1, mtu, NULL, NULL);
 				goto again;
 			}
 			return (error);
 		case EHOSTDOWN:
 		case EHOSTUNREACH:
 		case ENETDOWN:
 		case ENETUNREACH:
 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
 				tp->t_softerror = error;
 				return (0);
 			}
 			/* FALLTHROUGH */
 		default:
 			return (error);
 		}
 	}
 	TCPSTAT_INC(tcps_sndtotal);
 
 	/*
 	 * Data sent (as far as we can tell).
 	 * If this advertises a larger window than any other segment,
 	 * then remember the size of the advertised window.
 	 * Any pending ACK has now been sent.
 	 */
 	if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
 		tp->rcv_adv = tp->rcv_nxt + recwin;
 	tp->last_ack_sent = tp->rcv_nxt;
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 	if (tcp_timer_active(tp, TT_DELACK))
 		tcp_timer_activate(tp, TT_DELACK, 0);
 #if 0
 	/*
 	 * This completely breaks TCP if newreno is turned on.  What happens
 	 * is that if delayed-acks are turned on on the receiver, this code
 	 * on the transmitter effectively destroys the TCP window, forcing
 	 * it to four packets (1.5Kx4 = 6K window).
 	 */
 	if (sendalot && --maxburst)
 		goto again;
 #endif
 	if (sendalot)
 		goto again;
 	return (0);
 }
 
 /*
  * Arm (or re-arm) the persist timer for the connection 'tp' and step the
  * backoff index.  Panics if the retransmit timer is still active.
  */
 void
 tcp_setpersist(struct tcpcb *tp)
 {
 	int base, timeout;
 
 	/* Base interval, computed from t_srtt and t_rttvar. */
 	base = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
 
 	tp->t_flags &= ~TF_PREVVALID;
 
 	/* Retransmit and persist are mutually exclusive. */
 	if (tcp_timer_active(tp, TT_REXMT))
 		panic("tcp_setpersist: retransmit pending");
 
 	/*
 	 * Start/restart the persistence timer: scale the base interval by
 	 * the current backoff factor and let TCPT_RANGESET bound the result
 	 * with TCPTV_PERSMIN and TCPTV_PERSMAX.
 	 */
 	TCPT_RANGESET(timeout, base * tcp_backoff[tp->t_rxtshift],
 	    TCPTV_PERSMIN, TCPTV_PERSMAX);
 	tcp_timer_activate(tp, TT_PERSIST, timeout);
 
 	/* Move to the next backoff step, stopping at TCP_MAXRXTSHIFT. */
 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 		tp->t_rxtshift += 1;
 }
 
 /*
  * Insert TCP options according to the supplied parameters to the place
  * optp in a consistent way.  Can handle unaligned destinations.
  *
  * The order of the option processing is crucial for optimal packing and
  * alignment for the scarce option space.
  *
  * The optimal order for a SYN/SYN-ACK segment is:
  *   MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
  *   Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
  *
  * The SACK options should be last.  SACK blocks consume 8*n+2 bytes.
  * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
  * At minimum we need 10 bytes (to generate 1 SACK block).  If both
  * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
  * we only have 10 bytes for SACK options (40 - (12 + 18)).
  *
  * Parameters:
  *   to   - options to emit; to_flags selects which TOF_* options are
  *          written.  Modified in place: to_mss, to_tsval and to_tsecr are
  *          converted to network byte order, and to_signature is pointed at
  *          the zero-filled digest bytes inside the output buffer.
  *   optp - destination buffer for the encoded options.
  *          NOTE(review): presumably must hold TCP_MAXOLEN bytes; the
  *          buffer size is not passed in -- confirm at the callers.
  *
  * Returns the number of option bytes written, padded to a multiple of 4.
  */
 int
 tcp_addoptions(struct tcpopt *to, u_char *optp)
 {
 	u_int mask, optlen = 0;
 
 	/*
 	 * Walk the flag bits from the lowest upwards, so the numeric order
 	 * of the TOF_* bits is the order in which options are emitted.
 	 */
 	for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
 		if ((to->to_flags & mask) != mask)
 			continue;
 		/* Option space is completely used up; stop early. */
 		if (optlen == TCP_MAXOLEN)
 			break;
 		/*
 		 * In every case below the alignment NOPs are written first
 		 * and the space check comes second.  A "continue" therefore
 		 * skips the option (it jumps to the next mask of the for
 		 * loop) but leaves the NOPs already emitted in place.
 		 */
 		switch (to->to_flags & mask) {
 		case TOF_MSS:
 			/* Start the MSS option on a 4 byte boundary. */
 			while (optlen % 4) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
 				continue;
 			optlen += TCPOLEN_MAXSEG;
 			*optp++ = TCPOPT_MAXSEG;
 			*optp++ = TCPOLEN_MAXSEG;
 			/* Note: the caller's to_mss is byte-swapped in place. */
 			to->to_mss = htons(to->to_mss);
 			bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
 			optp += sizeof(to->to_mss);
 			break;
 		case TOF_SCALE:
 			/*
 			 * Pad until optlen is odd (at least one NOP when
 			 * nothing has been written yet), so that the 3 byte
 			 * option ends on an even offset.
 			 */
 			while (!optlen || optlen % 2 != 1) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
 				continue;
 			optlen += TCPOLEN_WINDOW;
 			*optp++ = TCPOPT_WINDOW;
 			*optp++ = TCPOLEN_WINDOW;
 			*optp++ = to->to_wscale;
 			break;
 		case TOF_SACKPERM:
 			/* Start on an even offset. */
 			while (optlen % 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
 				continue;
 			optlen += TCPOLEN_SACK_PERMITTED;
 			*optp++ = TCPOPT_SACK_PERMITTED;
 			*optp++ = TCPOLEN_SACK_PERMITTED;
 			break;
 		case TOF_TS:
 			/*
 			 * Pad to an offset of 2 modulo 4, so that the two
 			 * 32 bit timestamps following the kind/length bytes
 			 * start on a 4 byte boundary.
 			 */
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
 				continue;
 			optlen += TCPOLEN_TIMESTAMP;
 			*optp++ = TCPOPT_TIMESTAMP;
 			*optp++ = TCPOLEN_TIMESTAMP;
 			/* Note: both timestamp fields are byte-swapped in place. */
 			to->to_tsval = htonl(to->to_tsval);
 			to->to_tsecr = htonl(to->to_tsecr);
 			bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
 			optp += sizeof(to->to_tsval);
 			bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
 			optp += sizeof(to->to_tsecr);
 			break;
 		case TOF_SIGNATURE:
 			{
 			/* Digest bytes: option length minus kind and length. */
 			int siglen = TCPOLEN_SIGNATURE - 2;
 
 			/* Pad so the digest starts on a 4 byte boundary. */
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE)
 				continue;
 			optlen += TCPOLEN_SIGNATURE;
 			*optp++ = TCPOPT_SIGNATURE;
 			*optp++ = TCPOLEN_SIGNATURE;
 			/*
 			 * Remember where the digest lives and zero it here;
 			 * this function does not compute the signature.
 			 */
 			to->to_signature = optp;
 			while (siglen--)
 				 *optp++ = 0;
 			break;
 			}
 		case TOF_SACK:
 			{
 			int sackblks = 0;
 			struct sackblk *sack = (struct sackblk *)to->to_sacks;
 			tcp_seq sack_seq;
 
 			/* Pad so the SACK blocks start on a 4 byte boundary. */
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			/* Need room for the header plus at least one block. */
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
 				continue;
 			optlen += TCPOLEN_SACKHDR;
 			*optp++ = TCPOPT_SACK;
 			/*
 			 * Emit as many blocks as requested, limited by the
 			 * space that remains after the option header.
 			 */
 			sackblks = min(to->to_nsacks,
 					(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
 			*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
 			while (sackblks--) {
 				/* Each block is a (start, end) pair in network order. */
 				sack_seq = htonl(sack->start);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				sack_seq = htonl(sack->end);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				optlen += TCPOLEN_SACK;
 				sack++;
 			}
 			/* Counted once per SACK option, not once per block. */
 			TCPSTAT_INC(tcps_sack_send_blocks);
 			break;
 			}
 		default:
 			panic("%s: unknown TCP option type", __func__);
 			break;
 		}
 	}
 
 	/* Terminate and pad TCP options to a 4 byte boundary. */
 	if (optlen % 4) {
 		optlen += TCPOLEN_EOL;
 		*optp++ = TCPOPT_EOL;
 	}
 	/*
 	 * According to RFC 793 (STD0007):
 	 *   "The content of the header beyond the End-of-Option option
 	 *    must be header padding (i.e., zero)."
 	 *   and later: "The padding is composed of zeros."
 	 */
 	while (optlen % 4) {
 		optlen += TCPOLEN_PAD;
 		*optp++ = TCPOPT_PAD;
 	}
 
 	KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
 	return (optlen);
 }
Index: head/sys/netinet/tcp_subr.c
===================================================================
--- head/sys/netinet/tcp_subr.c	(revision 258540)
+++ head/sys/netinet/tcp_subr.c	(revision 258541)
@@ -1,2416 +1,2415 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
-#include "opt_kdtrace.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/sysctl.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #ifdef INET6
 #include <sys/domain.h>
 #endif
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef INET6
 #include <netinet6/ip6protosw.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/xform.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 #include <netipsec/key.h>
 #include <sys/syslog.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 #include <sys/md5.h>
 
 #include <security/mac/mac_framework.h>
 
 VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
 #ifdef INET6
 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
 #endif
 
 /*
  * Sysctl handler for the default TCP MSS (V_tcp_mssdflt).  Reports the
  * current value and, when a new value is supplied, stores it only if it is
  * at least TCP_MINMSS.
  *
  * Returns the error from sysctl_handle_int(), EINVAL for a value below
  * TCP_MINMSS, or 0 on success.
  */
 static int
 sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
 {
 	int mss, rc;
 
 	/* Work on a copy so a rejected value never reaches the live setting. */
 	mss = V_tcp_mssdflt;
 	rc = sysctl_handle_int(oidp, &mss, 0, req);
 
 	/* Transfer failed, or no new value was supplied: nothing to store. */
 	if (rc != 0 || !req->newptr)
 		return (rc);
 
 	if (mss < TCP_MINMSS)
 		return (EINVAL);
 
 	V_tcp_mssdflt = mss;
 	return (rc);
 }
 
 SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
     CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0,
     &sysctl_net_inet_tcp_mss_check, "I",
     "Default TCP Maximum Segment Size");
 
 #ifdef INET6
 /*
  * Sysctl handler for the default TCP MSS for IPv6 (V_tcp_v6mssdflt).
  * Reports the current value and, when a new value is supplied, stores it
  * only if it is at least TCP_MINMSS.
  *
  * Returns the error from sysctl_handle_int(), EINVAL for a value below
  * TCP_MINMSS, or 0 on success.
  */
 static int
 sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
 {
 	int mss, rc;
 
 	/* Work on a copy so a rejected value never reaches the live setting. */
 	mss = V_tcp_v6mssdflt;
 	rc = sysctl_handle_int(oidp, &mss, 0, req);
 
 	/* Transfer failed, or no new value was supplied: nothing to store. */
 	if (rc != 0 || !req->newptr)
 		return (rc);
 
 	if (mss < TCP_MINMSS)
 		return (EINVAL);
 
 	V_tcp_v6mssdflt = mss;
 	return (rc);
 }
 
 SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
     CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0,
     &sysctl_net_inet_tcp_mss_v6_check, "I",
    "Default TCP Maximum Segment Size for IPv6");
 #endif /* INET6 */
 
 /*
  * Minimum MSS we accept and use. This prevents DoS attacks where
  * we are forced to a ridiculous low MSS like 20 and send hundreds
  * of packets instead of one. The effect scales with the available
  * bandwidth and quickly saturates the CPU and network interface
  * with packet generation and sending. Set to zero to disable MINMSS
  * checking. This setting prevents us from sending too small packets.
  */
 VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
      &VNET_NAME(tcp_minmss), 0,
     "Minimum TCP Maximum Segment Size");
 
 /* Per-vnet switch for the RFC 1323 extensions; enabled by default. */
 VNET_DEFINE(int, tcp_do_rfc1323) = 1;
 SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc1323), 0,
     "Enable rfc1323 (high performance TCP) extensions");
 
 /* Global (not per-vnet) logging switch; off by default. */
 static int	tcp_log_debug = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
     &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
 
 /*
  * Exported copy of the PCB hash table size.  tcp_init() stores the size it
  * finally settled on here, after fetching the tunable and auto-tuning.
  */
 static int	tcp_tcbhashsize = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
 
 /* Global switch for the tcp_drain routine; enabled by default. */
 static int	do_tcpdrain = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");
 
 /* Read-only view of tcbinfo.ipi_count for the current vnet. */
 SYSCTL_VNET_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
     &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");
 
 /* Per-vnet, file-local setting; enabled by default. */
 static VNET_DEFINE(int, icmp_may_rst) = 1;
 #define	V_icmp_may_rst			VNET(icmp_may_rst)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW,
     &VNET_NAME(icmp_may_rst), 0,
     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
 
 /*
  * Per-vnet ISN secret reseed interval in seconds; defaults to 0.
  * NOTE(review): 0 presumably disables periodic reseeding -- confirm where
  * V_tcp_isn_reseed_interval is consumed.
  */
 static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0;
 #define	V_tcp_isn_reseed_interval	VNET(tcp_isn_reseed_interval)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
     &VNET_NAME(tcp_isn_reseed_interval), 0,
     "Seconds between reseeding of ISN secret");
 
 /*
  * Off by default; tcp_init() fetches the net.inet.tcp.soreceive_stream
  * tunable into this variable and acts on it when it is non-zero.
  */
 static int	tcp_soreceive_stream = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
     &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");
 
 #ifdef TCP_SIGNATURE
 /* Only present in TCP_SIGNATURE kernels; verification is on by default. */
 static int	tcp_sig_checksigs = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, signature_verify_input, CTLFLAG_RW,
     &tcp_sig_checksigs, 0, "Verify RFC2385 digests on inbound traffic");
 #endif
 
 /* Per-vnet UMA zone for struct sackhole; created in tcp_init(). */
 VNET_DEFINE(uma_zone_t, sack_hole_zone);
 #define	V_sack_hole_zone		VNET(sack_hole_zone)
 
 /*
  * Per-vnet helper hook heads, indexed by HHOOK_TCP_* id.  tcp_init()
  * registers the HHOOK_TCP_EST_IN and HHOOK_TCP_EST_OUT entries.
  */
 VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
 
 /* Forward declarations of file-local helpers defined later in this file. */
 static struct inpcb *tcp_notify(struct inpcb *, int);
 static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
 static char *	tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
 		    void *ip4hdr, const void *ip6hdr);
 
 /*
  * Target size of TCP PCB hash tables. Must be a power of two.
  *
  * Note that this can be overridden by the kernel environment
  * variable net.inet.tcp.tcbhashsize
  */
 #ifndef TCBHASHSIZE
 #define TCBHASHSIZE	0
 #endif
 
 /*
  * Allocation unit of the tcpcb zone: tcp_init() creates that zone with
  * sizeof(struct tcpcb_mem), so every tcpcb is allocated together with the
  * companion structures below.
  *
  * XXX
  * Callouts should be moved into struct tcp directly.  They are currently
  * separate because the tcpcb structure is exported to userland for sysctl
  * parsing purposes, which do not know about callouts.
  */
 struct tcpcb_mem {
 	struct	tcpcb		tcb;	/* the TCP control block itself */
 	struct	tcp_timer	tt;	/* timer state; presumably the callouts
 					   mentioned above -- confirm */
 	struct	cc_var		ccv;	/* presumably congestion control
 					   state -- confirm */
 	struct	osd		osd;	/* presumably per-object storage for
 					   helper modules -- confirm */
 };
 
 static VNET_DEFINE(uma_zone_t, tcpcb_zone);
 #define	V_tcpcb_zone			VNET(tcpcb_zone)
 
 MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
 static struct mtx isn_mtx;
 
 #define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
 #define	ISN_LOCK()	mtx_lock(&isn_mtx)
 #define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)
 
 /*
  * TCP initialization.
  */
 
 /*
  * Re-apply the current value of maxsockets as the item limit of the TCP
  * inpcb zone (V_tcbinfo.ipi_zone) and of the tcpcb zone, then call
  * tcp_tw_zone_change().  The 'tag' argument is not used.
  *
  * NOTE(review): presumably run as a callback whenever maxsockets changes;
  * the registration is not visible in this part of the file -- confirm.
  */
 static void
 tcp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
 	uma_zone_set_max(V_tcpcb_zone, maxsockets);
 	tcp_tw_zone_change();
 }
 
 /*
  * Item initializer handed to in_pcbinfo_init() by tcp_init(): runs
  * INP_LOCK_INIT on the inpcb found at 'mem'.  'size' and 'flags' are not
  * used.  Always returns 0.
  */
 static int
 tcp_inpcb_init(void *mem, int size, int flags)
 {
 	struct inpcb *inp;
 
 	inp = (struct inpcb *)mem;
 	INP_LOCK_INIT(inp, "inp", "tcpinp");
 
 	return (0);
 }
 
 /*
  * Take a value and get the next power of 2 that doesn't overflow.
  * Used to size the tcp_inpcb hash buckets.
  *
  * size:    requested number of hash buckets.
  * returns: 1 when size <= 0; otherwise the smallest power of 2 strictly
  *          greater than size, or half of that when the full value would
  *          not fit in an int.
  */
 static int
 maketcp_hashsize(int size)
 {
 	unsigned int hashsize;
 
 	/*
 	 * Nothing to round up.  This also keeps negative values (e.g. a
 	 * bogus tunable passed in by tcp_init()) away from the shift below,
 	 * where fls() of a negative int would yield a shift count equal to
 	 * the width of the type, which is undefined.
 	 */
 	if (size <= 0)
 		return (1);
 
 	/*
 	 * auto tune.
 	 * get the next power of 2 higher than size.  The shift is done on
 	 * an unsigned value: for large sizes fls() names the sign bit
 	 * position, and shifting a signed 1 that far is undefined.
 	 */
 	hashsize = 1U << fls(size);
 	/* catch overflow of int, and just go one power of 2 smaller */
 	if (hashsize > (~0U >> 1))
 		hashsize >>= 1;
 	return ((int)hashsize);
 }
 
 /*
  * TCP initialisation for the current vnet.
  *
  * Registers the two TCP helper-hook heads, chooses the inpcb hash size
  * (from the net.inet.tcp.tcbhashsize tunable, or auto-tuned from
  * maxsockets when that is 0), sets up the inpcb info structure and the
  * tcpcb and sackhole zones, and initialises the time-wait, syncache,
  * host cache and reassembly code.
  *
  * For the default vnet only it then sets the global timer defaults,
  * honours the net.inet.tcp.soreceive_stream tunable, checks the protocol
  * header size limits (panics if link + TCP/IP headers exceed MHLEN),
  * initialises the ISN lock and registers the shutdown_pre_sync and
  * maxsockets_change event handlers.
  */
 void
 tcp_init(void)
 {
 	const char *tcbhash_tuneable;
 	int hashsize;
 
 	tcbhash_tuneable = "net.inet.tcp.tcbhashsize";
 
 	/* Failure to register a hook head is reported but is not fatal. */
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
 	    &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
 	    &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 
 	/* Compile-time default, optionally overridden by the tunable. */
 	hashsize = TCBHASHSIZE;
 	TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
 	if (hashsize == 0) {
 		/*
 		 * Auto tune the hash size based on maxsockets.
 		 * A perfect hash would have a 1:1 mapping
 		 * (hashsize = maxsockets) however it's been
 		 * suggested that O(2) average is better.
 		 */
 		hashsize = maketcp_hashsize(maxsockets / 4);
 		/*
 		 * Our historical default is 512,
 		 * do not autotune lower than this.
 		 */
 		if (hashsize < 512)
 			hashsize = 512;
 		if (bootverbose)
 			printf("%s: %s auto tuned to %d\n", __func__,
 			    tcbhash_tuneable, hashsize);
 	}
 	/*
 	 * We require a hashsize to be a power of two.
 	 * Previously if it was not a power of two we would just reset it
 	 * back to 512, which could be a nasty surprise if you did not notice
 	 * the error message.
 	 * Instead we pass it through maketcp_hashsize() to obtain a power
 	 * of two and warn about the adjustment.
 	 * NOTE(review): maketcp_hashsize() rounds *up* to the next power of
 	 * two (only values too large to round up are reduced), even though
 	 * the warning below says "clipped"; confirm which is intended.
 	 */
 	if (!powerof2(hashsize)) {
 		int oldhashsize = hashsize;
 
 		hashsize = maketcp_hashsize(hashsize);
 		/* prevent absurdly low value */
 		if (hashsize < 16)
 			hashsize = 16;
 		printf("%s: WARNING: TCB hash size not a power of 2, "
 		    "clipped from %d to %d.\n", __func__, oldhashsize,
 		    hashsize);
 	}
 	/* The same size is passed for both hash-size arguments. */
 	in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
 	    "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE,
 	    IPI_HASHFIELDS_4TUPLE);
 
 	/*
 	 * These have to be type stable for the benefit of the timers.
 	 */
 	V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(V_tcpcb_zone, maxsockets);
 	uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached");
 
 	tcp_tw_init();
 	syncache_init();
 	tcp_hc_init();
 	tcp_reass_init();
 
 	TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
 	V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 
 	/* Skip initialization of globals for non-default instances. */
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
 	/* XXX virtualize those below? */
 	tcp_delacktime = TCPTV_DELACK;
 	tcp_keepinit = TCPTV_KEEP_INIT;
 	tcp_keepidle = TCPTV_KEEP_IDLE;
 	tcp_keepintvl = TCPTV_KEEPINTVL;
 	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
 	tcp_msl = TCPTV_MSL;
 	tcp_rexmit_min = TCPTV_MIN;
 	/* Never allow a minimum retransmit time below 1. */
 	if (tcp_rexmit_min < 1)
 		tcp_rexmit_min = 1;
 	tcp_rexmit_slop = TCPTV_CPU_VAR;
 	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
 	tcp_tcbhashsize = hashsize;
 
 	/* Optionally switch the receive path to soreceive_stream(). */
 	TUNABLE_INT_FETCH("net.inet.tcp.soreceive_stream", &tcp_soreceive_stream);
 	if (tcp_soreceive_stream) {
 #ifdef INET
 		tcp_usrreqs.pru_soreceive = soreceive_stream;
 #endif
 #ifdef INET6
 		tcp6_usrreqs.pru_soreceive = soreceive_stream;
 #endif /* INET6 */
 	}
 
 	/*
 	 * Make sure max_protohdr covers the smallest TCP/IP header, and
 	 * that link + protocol headers fit in a single header mbuf.
 	 */
 #ifdef INET6
 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
 #else /* INET6 */
 #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
 #endif /* INET6 */
 	if (max_protohdr < TCP_MINPROTOHDR)
 		max_protohdr = TCP_MINPROTOHDR;
 	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
 		panic("tcp_init");
 #undef TCP_MINPROTOHDR
 
 	ISN_LOCK_INIT();
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 	EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
 		EVENTHANDLER_PRI_ANY);
 }
 
 #ifdef VIMAGE
 /*
  * Tear down the TCP state of the current vnet (compiled only with
  * VIMAGE).  The reassembly, host cache, syncache and time-wait code are
  * destroyed in the reverse of the order tcp_init() initialised them,
  * followed by the inpcb info and the sackhole and tcpcb zones.  The two
  * helper-hook heads are deregistered last; a failure there is only
  * reported.
  */
 void
 tcp_destroy(void)
 {
 	int error;
 
 	tcp_reass_destroy();
 	tcp_hc_destroy();
 	syncache_destroy();
 	tcp_tw_destroy();
 	in_pcbinfo_destroy(&V_tcbinfo);
 	uma_zdestroy(V_sack_hole_zone);
 	uma_zdestroy(V_tcpcb_zone);
 
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error);
 	}
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error);
 	}
 }
 #endif
 
 /*
  * Handler for the shutdown_pre_sync event (registered by tcp_init()).
  * Intentionally empty: nothing is done at shutdown.  'xtp' is unused.
  */
 void
 tcp_fini(void *xtp)
 {
 
 }
 
 /*
  * Fill in the IP and TCP headers for an outgoing packet, given the inpcb.
  * tcp_template used to store this data in mbufs, but we now recopy it out
  * of the pcb each time to conserve mbufs.
  *
  *   inp     - the connection's inpcb; the caller must hold its write lock.
  *   ip_ptr  - buffer for the IP header.  It is written as a struct ip6_hdr
  *             when INP_IPV6 is set in inp->inp_vflag, otherwise as a
  *             struct ip.
  *   tcp_ptr - buffer for the struct tcphdr.
  *
  * Only the skeleton is filled in: addresses, ports, protocol and header
  * lengths.  Sequence numbers, flags, window and checksums are zeroed and
  * left for the caller.
  */
 void
 tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
 {
 	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
 
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		struct ip6_hdr *ip6;
 
 		ip6 = (struct ip6_hdr *)ip_ptr;
 		/*
 		 * Read-modify-write: only the flow-info and version bits
 		 * are replaced, other bits already in the buffer are kept.
 		 */
 		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 			(inp->inp_flow & IPV6_FLOWINFO_MASK);
 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
 			(IPV6_VERSION & IPV6_VERSION_MASK);
 		ip6->ip6_nxt = IPPROTO_TCP;
 		/* Payload length of a bare TCP header, no options or data. */
 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
 		ip6->ip6_src = inp->in6p_laddr;
 		ip6->ip6_dst = inp->in6p_faddr;
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		struct ip *ip;
 
 		ip = (struct ip *)ip_ptr;
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = 5;		/* 5 32-bit words: no IP options */
 		ip->ip_tos = inp->inp_ip_tos;
 		ip->ip_len = 0;
 		ip->ip_id = 0;
 		ip->ip_off = 0;
 		ip->ip_ttl = inp->inp_ip_ttl;
 		ip->ip_sum = 0;
 		ip->ip_p = IPPROTO_TCP;
 		ip->ip_src = inp->inp_laddr;
 		ip->ip_dst = inp->inp_faddr;
 	}
 #endif /* INET */
 	th->th_sport = inp->inp_lport;
 	th->th_dport = inp->inp_fport;
 	th->th_seq = 0;
 	th->th_ack = 0;
 	th->th_x2 = 0;
 	th->th_off = 5;		/* 5 32-bit words: no TCP options */
 	th->th_flags = 0;
 	th->th_win = 0;
 	th->th_urp = 0;
 	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
 }
 
 /*
  * Create a template to be used to send tcp packets on a connection.
  * A struct tcptemp is allocated with malloc(9) from M_TEMP without
  * sleeping and its skeletal tcp/ip header is filled in by
  * tcpip_fillheaders().  The only use for this function is in keepalives,
  * which use tcp_respond.
  *
  * Returns the new template, or NULL when the allocation fails.
  */
 struct tcptemp *
 tcpip_maketemplate(struct inpcb *inp)
 {
 	struct tcptemp *tmpl;
 
 	tmpl = malloc(sizeof(*tmpl), M_TEMP, M_NOWAIT);
 	if (tmpl != NULL)
 		tcpip_fillheaders(inp, (void *)&tmpl->tt_ipgen,
 		    (void *)&tmpl->tt_t);
 	return (tmpl);
 }
 
 /*
  * Send a single message to the TCP at address specified by
  * the given TCP/IP header.  If m == NULL, then we make a copy
  * of the IP header at ipgen and the TCP header at th and send directly
  * to the addressed host.  This is used to force keep alive messages out
  * using the TCP template for a connection; in this case flags is
  * overridden with TH_ACK.  If m != NULL, then we send a message back to
  * the TCP which originated the segment in m, reusing the first mbuf
  * and discarding any other attached mbufs.
  *
  * In any case the ack and sequence number of the transmitted
  * segment are as specified by the parameters.
  *
  * At least one of tp and m must be non-NULL.  When tp is non-NULL the
  * caller must hold the write lock of its inpcb.  An mbuf allocation
  * failure and the result of the IP output routine are silently ignored.
  *
  * NOTE: If m != NULL, then ipgen and th must point to *inside* the mbuf.
  */
 void
 tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
     tcp_seq ack, tcp_seq seq, int flags)
 {
 	int tlen;
 	int win = 0;
 	struct ip *ip;
 	struct tcphdr *nth;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	int isipv6;
 #endif /* INET6 */
 	int ipflags = 0;
 	struct inpcb *inp;
 
 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
 
 #ifdef INET6
 	/* The version nibble of the supplied header selects the family. */
 	isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4);
 	ip6 = ipgen;
 #endif /* INET6 */
 	ip = ipgen;
 
 	if (tp != NULL) {
 		inp = tp->t_inpcb;
 		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
 		INP_WLOCK_ASSERT(inp);
 	} else
 		inp = NULL;
 
 	/*
 	 * Advertise the receive buffer space, capped at the largest window
 	 * expressible with our scale factor.  A RST, or a reply without a
 	 * tcpcb, advertises a zero window.
 	 */
 	if (tp != NULL) {
 		if (!(flags & TH_RST)) {
 			win = sbspace(&inp->inp_socket->so_rcv);
 			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
 				win = (long)TCP_MAXWIN << tp->rcv_scale;
 		}
 	}
 	if (m == NULL) {
 		/* Fresh mbuf: copy the supplied IP and TCP headers into it. */
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return;
 		tlen = 0;
 		/* Leave room in front for the link-layer header. */
 		m->m_data += max_linkhdr;
 #ifdef INET6
 		if (isipv6) {
 			bcopy((caddr_t)ip6, mtod(m, caddr_t),
 			      sizeof(struct ip6_hdr));
 			ip6 = mtod(m, struct ip6_hdr *);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 		{
 			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
 			ip = mtod(m, struct ip *);
 			nth = (struct tcphdr *)(ip + 1);
 		}
 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 		flags = TH_ACK;
 	} else {
 		/*
 		 * Reuse the mbuf.
 		 * XXX MRT We inherit the FIB, which is lucky.
 		 */
 		m_freem(m->m_next);
 		m->m_next = NULL;
 		m->m_data = (caddr_t)ipgen;
 		/* m_len is set later */
 		tlen = 0;
 		/* Swap source and destination so the reply goes back. */
 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
 #ifdef INET6
 		if (isipv6) {
 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 		{
 			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
 			nth = (struct tcphdr *)(ip + 1);
 		}
 		if (th != nth) {
 			/*
 			 * this is usually a case when an extension header
 			 * exists between the IPv6 header and the
 			 * TCP header.
 			 */
 			nth->th_sport = th->th_sport;
 			nth->th_dport = th->th_dport;
 		}
 		xchg(nth->th_dport, nth->th_sport, uint16_t);
 #undef xchg
 	}
 	/* The reply carries bare IP and TCP headers: no options, no data. */
 #ifdef INET6
 	if (isipv6) {
 		ip6->ip6_flow = 0;
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_nxt = IPPROTO_TCP;
 		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 		ip6->ip6_plen = htons(tlen - sizeof(*ip6));
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		tlen += sizeof (struct tcpiphdr);
 		ip->ip_len = htons(tlen);
 		ip->ip_ttl = V_ip_defttl;
 		if (V_path_mtu_discovery)
 			ip->ip_off |= htons(IP_DF);
 	}
 #endif
 	m->m_len = tlen;
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 #ifdef MAC
 	if (inp != NULL) {
 		/*
 		 * Packet is associated with a socket, so allow the
 		 * label of the response to reflect the socket label.
 		 */
 		INP_WLOCK_ASSERT(inp);
 		mac_inpcb_create_mbuf(inp, m);
 	} else {
 		/*
 		 * Packet is not associated with a socket, so possibly
 		 * update the label in place.
 		 */
 		mac_netinet_tcp_reply(m);
 	}
 #endif
 	nth->th_seq = htonl(seq);
 	nth->th_ack = htonl(ack);
 	nth->th_x2 = 0;
 	nth->th_off = sizeof (struct tcphdr) >> 2;
 	nth->th_flags = flags;
 	/* The window is sent scaled only when we have connection state. */
 	if (tp != NULL)
 		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
 	else
 		nth->th_win = htons((u_short)win);
 	nth->th_urp = 0;
 
 	/*
 	 * Store only the pseudo-header sum and record where the checksum
 	 * field lives; csum_flags requests that the TCP checksum be
 	 * completed further down the output path.
 	 */
 	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 #ifdef INET6
 	if (isipv6) {
 		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 		nth->th_sum = in6_cksum_pseudo(ip6,
 		    tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0);
 		ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
 		    NULL, NULL);
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
 	}
 #endif /* INET */
 #ifdef TCPDEBUG
 	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
 #endif
 	if (flags & TH_RST)
 		TCP_PROBE5(accept_refused, NULL, NULL, m->m_data, tp, nth);
 
 	TCP_PROBE5(send, NULL, tp, m->m_data, tp, nth);
 	/* Hand the segment to the IP layer; the result is ignored. */
 #ifdef INET6
 	if (isipv6)
 		(void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 		(void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
 #endif
 }
 
 /*
  * Create a new TCP control block, making an
  * empty reassembly queue and hooking it to the argument
  * protocol control block.  The `inp' parameter must have
  * come from the zone allocator set up in tcp_init().
  *
  * The tcpcb is allocated zeroed and without sleeping from the tcpcb
  * zone, attached to the current default congestion control algorithm
  * and to the khelp framework, and given default MSS, timer and RTT
  * values.  On success inp->inp_ppcb points at the new tcpcb and
  * inp->inp_ip_ttl is set to the default TTL.
  *
  * Returns the new tcpcb, or NULL if the allocation, the congestion
  * control cb_init hook or khelp_init_osd() fails.  Nothing is left
  * allocated on failure.
  */
 struct tcpcb *
 tcp_newtcpcb(struct inpcb *inp)
 {
 	struct tcpcb_mem *tm;
 	struct tcpcb *tp;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO);
 	if (tm == NULL)
 		return (NULL);
 	tp = &tm->tcb;
 
 	/* Initialise cc_var struct for this tcpcb. */
 	tp->ccv = &tm->ccv;
 	tp->ccv->type = IPPROTO_TCP;
 	tp->ccv->ccvc.tcp = tp;
 
 	/*
 	 * Use the current system default CC algorithm.
 	 */
 	CC_LIST_RLOCK();
 	KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!"));
 	CC_ALGO(tp) = CC_DEFAULT();
 	CC_LIST_RUNLOCK();
 
 	if (CC_ALGO(tp)->cb_init != NULL)
 		if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
 			uma_zfree(V_tcpcb_zone, tm);
 			return (NULL);
 		}
 
 	tp->osd = &tm->osd;
 	if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
 		/*
 		 * cb_init() has already run for this tcpcb, so give the
 		 * CC algorithm the chance to release whatever it set up;
 		 * otherwise that state would be leaked along with the
 		 * tcpcb we are about to free.
 		 */
 		if (CC_ALGO(tp)->cb_destroy != NULL)
 			CC_ALGO(tp)->cb_destroy(tp->ccv);
 		uma_zfree(V_tcpcb_zone, tm);
 		return (NULL);
 	}
 
 #ifdef VIMAGE
 	tp->t_vnet = inp->inp_vnet;
 #endif
 	tp->t_timers = &tm->tt;
 	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
 	/* Start from the default MSS for the address family. */
 	tp->t_maxseg = tp->t_maxopd =
 #ifdef INET6
 		isipv6 ? V_tcp_v6mssdflt :
 #endif /* INET6 */
 		V_tcp_mssdflt;
 
 	/* Set up our timeouts. */
 	callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE);
 
 	/* t_flags starts out as 0 thanks to M_ZERO. */
 	if (V_tcp_do_rfc1323)
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 	if (V_tcp_do_sack)
 		tp->t_flags |= TF_SACK_PERMIT;
 	TAILQ_INIT(&tp->snd_holes);
 	tp->t_inpcb = inp;	/* XXX */
 	/*
 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
 	 * reasonable initial retransmit time.
 	 */
 	tp->t_srtt = TCPTV_SRTTBASE;
 	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
 	/*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
 	 * which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->inp_ip_ttl = V_ip_defttl;
 	inp->inp_ppcb = tp;
 	return (tp);		/* XXX */
 }
 
 /*
  * Switch the congestion control algorithm back to NewReno for any active
  * control blocks using an algorithm which is about to go away.
  * This ensures the CC framework can allow the unload to proceed without leaving
  * any dangling pointers which would trigger a panic.
  * Returning non-zero would inform the CC framework that something went wrong
  * and it would be unsafe to allow the unload to proceed. However, there is no
  * way for this to occur with this implementation so we always return zero.
  *
  *   unload_algo - the algorithm being unloaded; compared by pointer against
  *                 the algorithm of every tcpcb in every vnet.
  */
 int
 tcp_ccalgounload(struct cc_algo *unload_algo)
 {
 	struct cc_algo *tmpalgo;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	/*
 	 * Check all active control blocks across all network stacks and change
 	 * any that are using "unload_algo" back to NewReno. If "unload_algo"
 	 * requires cleanup code to be run, call it.
 	 */
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		INP_INFO_RLOCK(&V_tcbinfo);
 		/*
 		 * New connections already part way through being initialised
 		 * with the CC algo we're removing will not race with this code
 		 * because the INP_INFO_WLOCK is held during initialisation. We
 		 * therefore don't enter the loop below until the connection
 		 * list has stabilised.
 		 */
 		LIST_FOREACH(inp, &V_tcb, inp_list) {
 			INP_WLOCK(inp);
 			/* Important to skip tcptw structs. */
 			if (!(inp->inp_flags & INP_TIMEWAIT) &&
 			    (tp = intotcpcb(inp)) != NULL) {
 				/*
 				 * By holding INP_WLOCK here, we are assured
 				 * that the connection is not currently
 				 * executing inside the CC module's functions
 				 * i.e. it is safe to make the switch back to
 				 * NewReno.
 				 */
 				if (CC_ALGO(tp) == unload_algo) {
 					tmpalgo = CC_ALGO(tp);
 					/* NewReno does not require any init. */
 					CC_ALGO(tp) = &newreno_cc_algo;
 					/*
 					 * The old algorithm's cleanup runs
 					 * after the switch, on the same ccv.
 					 */
 					if (tmpalgo->cb_destroy != NULL)
 						tmpalgo->cb_destroy(tp->ccv);
 				}
 			}
 			INP_WUNLOCK(inp);
 		}
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 
 	return (0);
 }
 
 /*
  * Drop a TCP connection, reporting the specified error on its socket.
  * If the connection is synchronized (a SYN has been received), it is
  * moved to CLOSED and tcp_output() is called so that a RST goes to the
  * peer.
  *
  * The caller must hold the tcbinfo write lock and the inpcb write lock.
  * A pending soft error replaces ETIMEDOUT in the error reported.
  * Returns whatever tcp_close() returns for the tcpcb.
  */
 struct tcpcb *
 tcp_drop(struct tcpcb *tp, int errno)
 {
 	struct socket *so;
 
 	so = tp->t_inpcb->inp_socket;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (!TCPS_HAVERCVDSYN(tp->t_state)) {
 		TCPSTAT_INC(tcps_conndrops);
 	} else {
 		tcp_state_change(tp, TCPS_CLOSED);
 		(void) tcp_output(tp);
 		TCPSTAT_INC(tcps_drops);
 	}
 
 	/* Prefer a recorded soft error over a bare timeout. */
 	so->so_error = (errno == ETIMEDOUT && tp->t_softerror != 0) ?
 	    tp->t_softerror : errno;
 	return (tcp_close(tp));
 }
 
 /*
  * Release everything hanging off a tcpcb and free the tcpcb itself.
  *
  * Stops the five connection timers, optionally records the connection's
  * RTT/ssthresh/cwnd metrics in the host cache, flushes the reassembly
  * queue and SACK holes, detaches any offload device, lets the congestion
  * control algorithm and khelp clean up, unlinks the tcpcb from its inpcb
  * and returns it to the tcpcb zone.
  *
  * The caller must hold the inpcb write lock.  The inpcb itself is not
  * freed here; on return inp->inp_ppcb is NULL.
  */
 void
 tcp_discardcb(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Make sure that all of our timers are stopped before we delete the
 	 * PCB.
 	 *
 	 * XXXRW: Really, we would like to use callout_drain() here in order
 	 * to avoid races experienced in tcp_timer.c where a timer is already
 	 * executing at this point.  However, we can't, both because we're
 	 * running in a context where we can't sleep, and also because we
 	 * hold locks required by the timers.  What we instead need to do is
 	 * test to see if callout_drain() is required, and if so, defer some
 	 * portion of the remainder of tcp_discardcb() to an asynchronous
 	 * context that can callout_drain() and then continue.  Some care
 	 * will be required to ensure that no further processing takes place
 	 * on the tcpcb, even though it hasn't been freed (a flag?).
 	 */
 	callout_stop(&tp->t_timers->tt_rexmt);
 	callout_stop(&tp->t_timers->tt_persist);
 	callout_stop(&tp->t_timers->tt_keep);
 	callout_stop(&tp->t_timers->tt_2msl);
 	callout_stop(&tp->t_timers->tt_delack);
 
 	/*
 	 * If we got enough samples through the srtt filter,
 	 * save the rtt and rttvar in the routing entry.
 	 * 'Enough' is arbitrarily defined as 4 rtt samples.
 	 * 4 samples is enough for the srtt filter to converge
 	 * to within enough % of the correct value; fewer samples
 	 * and we could save a bogus rtt. The danger is not high
 	 * as tcp quickly recovers from everything.
 	 * XXX: Works very well but needs some more statistics!
 	 */
 	if (tp->t_rttupdated >= 4) {
 		struct hc_metrics_lite metrics;
 		u_long ssthresh;
 
 		bzero(&metrics, sizeof(metrics));
 		/*
 		 * Update the ssthresh always when the conditions below
 		 * are satisfied. This gives us better new start value
 		 * for the congestion avoidance for new connections.
 		 * ssthresh is only set if packet loss occurred on a session.
 		 *
 		 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
 		 * being torn down.  Ideally this code would not use 'so'.
 		 */
 		ssthresh = tp->snd_ssthresh;
 		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
 			/*
 			 * convert the limit from user data bytes to
 			 * packets then to packet data bytes.
 			 */
 			/* Round to the nearest whole segment, minimum 2. */
 			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
 			if (ssthresh < 2)
 				ssthresh = 2;
 			/* Scale by segment size plus per-family header size. */
 			ssthresh *= (u_long)(tp->t_maxseg +
 #ifdef INET6
 			    (isipv6 ? sizeof (struct ip6_hdr) +
 				sizeof (struct tcphdr) :
 #endif
 				sizeof (struct tcpiphdr)
 #ifdef INET6
 			    )
 #endif
 			    );
 		} else
 			ssthresh = 0;
 		metrics.rmx_ssthresh = ssthresh;
 
 		metrics.rmx_rtt = tp->t_srtt;
 		metrics.rmx_rttvar = tp->t_rttvar;
 		metrics.rmx_cwnd = tp->snd_cwnd;
 		metrics.rmx_sendpipe = 0;
 		metrics.rmx_recvpipe = 0;
 
 		tcp_hc_update(&inp->inp_inc, &metrics);
 	}
 
 	/* free the reassembly queue, if any */
 	tcp_reass_flush(tp);
 
 #ifdef TCP_OFFLOAD
 	/* Disconnect offload device, if any. */
 	if (tp->t_flags & TF_TOE)
 		tcp_offload_detach(tp);
 #endif
 		
 	tcp_free_sackholes(tp);
 
 	/* Allow the CC algorithm to clean up after itself. */
 	if (CC_ALGO(tp)->cb_destroy != NULL)
 		CC_ALGO(tp)->cb_destroy(tp->ccv);
 
 	khelp_destroy_osd(tp->osd);
 
 	/* Break the links in both directions before freeing the tcpcb. */
 	CC_ALGO(tp) = NULL;
 	inp->inp_ppcb = NULL;
 	tp->t_inpcb = NULL;
 	uma_zfree(V_tcpcb_zone, tp);
 }
 
 /*
  * Attempt to close a TCP control block, marking it as dropped, and freeing
  * the socket if we hold the only reference.
  *
  * The caller must hold the tcbinfo write lock and the inpcb write lock.
  *
  * Returns tp, with the inpcb still locked, when the inpcb does not carry
  * INP_SOCKREF.  Otherwise the inpcb lock is released, the protocol's
  * reference on the socket is dropped through sofree(), and NULL is
  * returned; the caller must not touch tp or the inpcb lock afterwards.
  */
 struct tcpcb *
 tcp_close(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_state == TCPS_LISTEN)
 		tcp_offload_listen_stop(tp);
 #endif
 	in_pcbdrop(inp);
 	TCPSTAT_INC(tcps_closed);
 	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
 	so = inp->inp_socket;
 	soisdisconnected(so);
 	if (inp->inp_flags & INP_SOCKREF) {
 		KASSERT(so->so_state & SS_PROTOREF,
 		    ("tcp_close: !SS_PROTOREF"));
 		inp->inp_flags &= ~INP_SOCKREF;
 		/* Drop the inpcb lock before taking the socket locks. */
 		INP_WUNLOCK(inp);
 		ACCEPT_LOCK();
 		SOCK_LOCK(so);
 		so->so_state &= ~SS_PROTOREF;
 		/* No unlock follows: presumably sofree() releases both. */
 		sofree(so);
 		return (NULL);
 	}
 	return (tp);
 }
 
 /*
  * Release memory that TCP can spare: for every non-time-wait connection
  * in every vnet, flush the reassembly queue and clean the SACK report.
  * Does nothing unless do_tcpdrain is set.
  */
 void
 tcp_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	if (!do_tcpdrain)
 		return;
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		struct inpcb *inpb;
 		struct tcpcb *tcpb;
 
 		/*
 		 * Walk the tcpbs, if existing, and flush the reassembly
 		 * queue, if there is one...
 		 * XXX: The "Net/3" implementation doesn't imply that the TCP
 		 *      reassembly queue should be flushed, but in a situation
 		 *	where we're really low on mbufs, this is potentially
 		 *	useful.
 		 */
 		INP_INFO_RLOCK(&V_tcbinfo);
 		LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
 			/* Note: inp_flags is tested before the inpcb lock. */
 			if (inpb->inp_flags & INP_TIMEWAIT)
 				continue;
 			INP_WLOCK(inpb);
 			if ((tcpb = intotcpcb(inpb)) != NULL) {
 				tcp_reass_flush(tcpb);
 				tcp_clean_sackreport(tcpb);
 			}
 			INP_WUNLOCK(inpb);
 		}
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Notify a tcp user of an asynchronous error;
  * store error as soft error, but wake up user
  * (for now, won't do anything until can select for soft error).
  *
  * Do not wake up user since there currently is no mechanism for
  * reporting soft errors (yet - a kqueue filter may be added).
  */
 static struct inpcb *
 tcp_notify(struct inpcb *inp, int error)
 {
 	struct tcpcb *tp;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if ((inp->inp_flags & INP_TIMEWAIT) ||
 	    (inp->inp_flags & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
 
 	/*
 	 * Ignore some errors if we are hooked up.
 	 * If connection hasn't completed, has retransmitted several times,
 	 * and receives a second error, give up now.  This is better
 	 * than waiting a long time to establish a connection that
 	 * can never complete.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
 	     error == EHOSTDOWN)) {
 		return (inp);
 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
 	    tp->t_softerror) {
 		tp = tcp_drop(tp, error);
 		if (tp != NULL)
 			return (inp);
 		else
 			return (NULL);
 	} else {
 		tp->t_softerror = error;
 		return (inp);
 	}
 #if 0
 	wakeup( &so->so_timeo);
 	sorwakeup(so);
 	sowwakeup(so);
 #endif
 }
 
 /*
  * Sysctl handler exporting the list of TCP connections.
  *
  * Output layout: a leading struct xinpgen, the entries produced by
  * syncache_pcblist(), one struct xtcpcb per visible inpcb, and a trailing
  * struct xinpgen carrying the generation count at completion, so that the
  * reader can detect changes made while the list was being produced.
  *
  * With no output buffer (req->oldptr == NULL) only a size estimate with
  * some slack is returned.  The node is read-only: a new value yields
  * EPERM.
  *
  * The work is split into phases so that no pcb lock is held across
  * SYSCTL_OUT(): (1) snapshot the count and generation, (2) collect and
  * reference the visible inpcbs under the tcbinfo read lock, (3) copy each
  * one out under its own read lock, (4) drop the references under the
  * tcbinfo write lock.
  */
 static int
 tcp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, m, n, pcb_count;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == NULL) {
 		n = V_tcbinfo.ipi_count + syncache_pcbcount();
 		/* Allow for growth: 1/8 extra, at least 10 entries. */
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
 		return (0);
 	}
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	INP_INFO_RLOCK(&V_tcbinfo);
 	gencnt = V_tcbinfo.ipi_gencnt;
 	n = V_tcbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	m = syncache_pcbcount();
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ (n + m) * sizeof(struct xtcpcb));
 	if (error != 0)
 		return (error);
 
 	/* Leading header: what we expect to emit. */
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n + m;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	error = syncache_pcblist(req, m, &pcb_count);
 	if (error)
 		return (error);
 
 	/*
 	 * NOTE(review): the allocation uses M_WAITOK, so the NULL check
 	 * below is presumably never taken -- confirm against malloc(9).
 	 */
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == NULL)
 		return (ENOMEM);
 
 	/*
 	 * Collect at most n inpcbs that are no newer than the snapshot and
 	 * that the requesting credential may see, taking a reference on each.
 	 */
 	INP_INFO_RLOCK(&V_tcbinfo);
 	for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
 	    inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
 		INP_WLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			/*
 			 * XXX: This use of cr_cansee(), introduced with
 			 * TCP state changes, is not quite right, but for
 			 * now, better than nothing.
 			 */
 			if (inp->inp_flags & INP_TIMEWAIT) {
 				if (intotw(inp) != NULL)
 					error = cr_cansee(req->td->td_ucred,
 					    intotw(inp)->tw_cred);
 				else
 					error = EINVAL;	/* Skip this inp. */
 			} else
 				error = cr_canseeinpcb(req->td->td_ucred, inp);
 			if (error == 0) {
 				in_pcbref(inp);
 				inp_list[i++] = inp;
 			}
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	n = i;
 
 	/*
 	 * Copy out the collected entries.  The visibility errors from the
 	 * previous loop are discarded here.  Note that 'error' is overwritten
 	 * by every SYSCTL_OUT() and the loop does not stop on failure.
 	 */
 	error = 0;
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			struct xtcpcb xt;
 			void *inp_ppcb;
 
 			bzero(&xt, sizeof(xt));
 			xt.xt_len = sizeof xt;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xt.xt_inp, sizeof *inp);
 			inp_ppcb = inp->inp_ppcb;
 			if (inp_ppcb == NULL)
 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
 			else if (inp->inp_flags & INP_TIMEWAIT) {
 				/* Time-wait: report only the state. */
 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
 				xt.xt_tp.t_state = TCPS_TIME_WAIT;
 			} else {
 				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
 				if (xt.xt_tp.t_timers)
 					tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer);
 			}
 			if (inp->inp_socket != NULL)
 				sotoxsocket(inp->inp_socket, &xt.xt_socket);
 			else {
 				bzero(&xt.xt_socket, sizeof xt.xt_socket);
 				xt.xt_socket.xso_protocol = IPPROTO_TCP;
 			}
 			xt.xt_inp.inp_gencnt = inp->inp_gencnt;
 			/* Unlock before copying out to the request buffer. */
 			INP_RUNLOCK(inp);
 			error = SYSCTL_OUT(req, &xt, sizeof xt);
 		} else
 			INP_RUNLOCK(inp);
 	}
 	/*
 	 * Drop the references taken above.  in_pcbrele_rlocked() returning
 	 * non-zero means the lock must not be released here (presumably
 	 * because the inpcb was freed).
 	 */
 	INP_INFO_WLOCK(&V_tcbinfo);
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (!in_pcbrele_rlocked(inp))
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&V_tcbinfo);
 		xig.xig_gen = V_tcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_tcbinfo.ipi_count + pcb_count;
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
     tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
 
 #ifdef INET
 /*
  * Sysctl handler returning the credential (struct xucred) of the IPv4
  * TCP connection identified by the two sockaddr_in structures supplied
  * as the new value.  addrs[1] and addrs[0] are passed to in_pcblookup()
  * in the positions that tcp_ctlinput() uses for the foreign and local
  * endpoint respectively.
  *
  * Requires PRIV_NETINET_GETCRED.  Returns ENOENT when no matching
  * connection with a socket exists, the cr_canseeinpcb() error when the
  * caller may not see it, or the result of copying the data in or out.
  */
 static int
 tcp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct sockaddr_in addrs[2];
 	struct xucred xuc;
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error != 0)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error != 0)
 		return (error);
 
 	inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL);
 	if (inp == NULL)
 		return (ENOENT);
 
 	/* The lookup returned the inpcb read-locked. */
 	if (inp->inp_socket == NULL)
 		error = ENOENT;
 	else
 		error = cr_canseeinpcb(req->td->td_ucred, inp);
 	if (error == 0)
 		cru2x(inp->inp_cred, &xuc);
 	INP_RUNLOCK(inp);
 
 	if (error != 0)
 		return (error);
 	return (SYSCTL_OUT(req, &xuc, sizeof(struct xucred)));
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
 #endif /* INET */
 
 #ifdef INET6
 /*
  * Sysctl handler returning the credential (struct xucred) of the TCP
  * connection identified by the two sockaddr_in6 structures supplied as
  * the new value.  addrs[1] and addrs[0] are handed to the pcb lookup in
  * the same order as in the IPv4 handler.
  *
  * If addrs[0] is an IPv4-mapped address, addrs[1] must be one too and the
  * kernel must be built with INET; the lookup is then done in IPv4 form
  * using the last four bytes of each address.  Otherwise EINVAL is
  * returned.
  *
  * Requires PRIV_NETINET_GETCRED.  Returns ENOENT when no matching
  * connection with a socket exists.
  */
 static int
 tcp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct inpcb *inp;
 	int error;
 #ifdef INET
 	int mapped = 0;
 #endif
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	/* Embed the scope zone into both addresses before the lookup. */
 	if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
 	    (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
 		return (error);
 	}
 	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
 #ifdef INET
 		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
 			mapped = 1;
 		else
 #endif
 			return (EINVAL);
 	}
 
 #ifdef INET
 	if (mapped == 1)
 		/* Bytes 12..15 of a v4-mapped address hold the IPv4 address. */
 		inp = in_pcblookup(&V_tcbinfo,
 			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
 			addrs[1].sin6_port,
 			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
 			addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL);
 	else
 #endif
 		inp = in6_pcblookup(&V_tcbinfo,
 			&addrs[1].sin6_addr, addrs[1].sin6_port,
 			&addrs[0].sin6_addr, addrs[0].sin6_port,
 			INPLOOKUP_RLOCKPCB, NULL);
 	if (inp != NULL) {
 		/* The lookup returned the inpcb read-locked. */
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	/* xuc is only valid, and only copied out, when error == 0. */
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
 #endif /* INET6 */
 
 
 #ifdef INET
 /*
  * Control input for IPv4 TCP: react to a PRC_* notification about the
  * destination 'sa'.
  *
  *   cmd - PRC_* command code.
  *   sa  - destination the notification refers to; ignored unless it is an
  *         AF_INET address other than INADDR_ANY.
  *   vip - the IP header of the offending packet as quoted inside an ICMP
  *         message (followed by the start of its TCP header), or NULL.
  *
  * With a quoted header, the matching connection is looked up and, if the
  * quoted sequence number lies within its unacknowledged send window,
  * either path MTU discovery is performed (PRC_MSGSIZE) or the selected
  * notify routine is called.  If no connection matches, the syncache is
  * told instead.  Without a quoted header every connection to the
  * destination is notified.
  */
 void
 tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct ip *ip = vip;
 	struct tcphdr *th;
 	struct in_addr faddr;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct icmp *icp;
 	struct in_conninfo inc;
 	tcp_seq icmp_tcp_seq;
 	int mtu;
 
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
 		return;
 
 	/* Choose the notify routine, or bail out, based on the command. */
 	if (cmd == PRC_MSGSIZE)
 		notify = tcp_mtudisc_notify;
 	else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
 		cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
 		notify = tcp_drop_syn_sent;
 	/*
 	 * Redirects don't need to be handled up here.
 	 */
 	else if (PRC_IS_REDIRECT(cmd))
 		return;
 	/*
 	 * Source quench is deprecated.
 	 */
 	else if (cmd == PRC_QUENCH)
 		return;
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 * XXX: We never get this from ICMP, otherwise it makes an
 	 * excellent DoS attack on machines with many connections.
 	 */
 	else if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 	if (ip != NULL) {
 		/*
 		 * 'ip' is the header embedded in the ICMP message: step back
 		 * to the enclosing struct icmp and forward past the quoted
 		 * IP header to the quoted TCP header.
 		 */
 		icp = (struct icmp *)((caddr_t)ip
 				      - offsetof(struct icmp, icmp_ip));
 		th = (struct tcphdr *)((caddr_t)ip
 				       + (ip->ip_hl << 2));
 		INP_INFO_WLOCK(&V_tcbinfo);
 		inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport,
 		    ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
 		if (inp != NULL)  {
 			if (!(inp->inp_flags & INP_TIMEWAIT) &&
 			    !(inp->inp_flags & INP_DROPPED) &&
 			    !(inp->inp_socket == NULL)) {
 				/*
 				 * Converts the quoted sequence number from
 				 * network to host order (htonl() performs
 				 * the same byte swap as ntohl()).
 				 */
 				icmp_tcp_seq = htonl(th->th_seq);
 				tp = intotcpcb(inp);
 				/*
 				 * Only act if the quoted segment is one we
 				 * have sent and that is still unacknowledged.
 				 */
 				if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
 				    SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
 					if (cmd == PRC_MSGSIZE) {
 					    /*
 					     * MTU discovery:
 					     * If we got a needfrag set the MTU
 					     * in the route to the suggested new
 					     * value (if given) and then notify.
 					     */
 					    bzero(&inc, sizeof(inc));
 					    inc.inc_faddr = faddr;
 					    inc.inc_fibnum =
 						inp->inp_inc.inc_fibnum;
 
 					    mtu = ntohs(icp->icmp_nextmtu);
 					    /*
 					     * If no alternative MTU was
 					     * proposed, try the next smaller
 					     * one.
 					     */
 					    if (!mtu)
 						mtu = ip_next_mtu(
 						 ntohs(ip->ip_len), 1);
 					    /* Never go below the minimum MSS. */
 					    if (mtu < V_tcp_minmss
 						 + sizeof(struct tcpiphdr))
 						mtu = V_tcp_minmss
 						 + sizeof(struct tcpiphdr);
 					    /*
 					     * Only cache the MTU if it
 					     * is smaller than the interface
 					     * or route MTU.  tcp_mtudisc()
 					     * will do right thing by itself.
 					     */
 					    if (mtu <= tcp_maxmtu(&inc, NULL))
 						tcp_hc_updatemtu(&inc, mtu);
 					    tcp_mtudisc(inp, mtu);
 					} else
 						inp = (*notify)(inp,
 						    inetctlerrmap[cmd]);
 				}
 			}
 			/* notify may return NULL; then there is no lock to drop. */
 			if (inp != NULL)
 				INP_WUNLOCK(inp);
 		} else {
 			/* No connection: report to the syncache instead. */
 			bzero(&inc, sizeof(inc));
 			inc.inc_fport = th->th_dport;
 			inc.inc_lport = th->th_sport;
 			inc.inc_faddr = faddr;
 			inc.inc_laddr = ip->ip_src;
 			syncache_unreach(&inc, th);
 		}
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	} else
 		in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
 }
 #endif /* INET */
 
 #ifdef INET6
 void
 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 	/* Only the leading port pair of the TCP header is needed here. */
 	struct tcp_portonly {
 		u_int16_t th_sport;
 		u_int16_t th_dport;
 	};
 	struct inpcb *(*notify)(struct inpcb *, int);
 	const struct sockaddr_in6 *sa6_src;
 	struct ip6ctlparam *ip6cp;
 	struct in_conninfo inc;
 	struct tcphdr th;
 	struct mbuf *m;
 	int off;
 
 	/* Ignore anything that is not a well-formed IPv6 socket address. */
 	if (sa->sa_family != AF_INET6 ||
 	    sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 
 	/* Pick the per-PCB handler and drop commands we do not act on. */
 	if (cmd == PRC_MSGSIZE) {
 		notify = tcp_mtudisc_notify;
 	} else {
 		notify = tcp_notify;
 		if (!PRC_IS_REDIRECT(cmd) &&
 		    ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
 			return;
 		/* Source quench is deprecated. */
 		if (cmd == PRC_QUENCH)
 			return;
 	}
 
 	/* A non-NULL d is the parameter block handed over by icmp6. */
 	ip6cp = (struct ip6ctlparam *)d;
 	if (ip6cp != NULL)
 		sa6_src = ip6cp->ip6c_src;
 	else
 		sa6_src = &sa6_any;
 
 	if (ip6cp == NULL || ip6cp->ip6c_ip6 == NULL) {
 		/* No embedded packet to inspect: notify with zero ports. */
 		in6_pcbnotify(&V_tcbinfo, sa, 0,
 		    (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify);
 		return;
 	}
 
 	/*
 	 * XXX: We assume that when the IPv6 header pointer is non-NULL,
 	 * the mbuf and the offset are valid as well.
 	 */
 	m = ip6cp->ip6c_m;
 	off = ip6cp->ip6c_off;
 
 	/* Make sure the source and destination ports can be read safely. */
 	if (m->m_pkthdr.len < off + sizeof(struct tcp_portonly))
 		return;
 
 	/* Copy just the ports; the rest of the header stays zeroed. */
 	bzero(&th, sizeof(th));
 	m_copydata(m, off, sizeof(struct tcp_portonly), (caddr_t)&th);
 
 	in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
 	    (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL,
 	    notify);
 
 	/* Describe the same connection to the syncache. */
 	bzero(&inc, sizeof(inc));
 	inc.inc_fport = th.th_dport;
 	inc.inc_lport = th.th_sport;
 	inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
 	inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
 	inc.inc_flags |= INC_ISIPV6;
 	INP_INFO_WLOCK(&V_tcbinfo);
 	syncache_unreach(&inc, &th);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 }
 #endif /* INET6 */
 
 
 /*
  * Following is where TCP initial sequence number generation occurs.
  *
  * There are two places where we must use initial sequence numbers:
  * 1.  In SYN-ACK packets.
  * 2.  In SYN packets.
  *
  * All ISNs for SYN-ACK packets are generated by the syncache.  See
  * tcp_syncache.c for details.
  *
  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
  * depends on this property.  In addition, these ISNs should be
  * unguessable so as to prevent connection hijacking.  To satisfy
  * the requirements of this situation, the algorithm outlined in
  * RFC 1948 is used, with only small modifications.
  *
  * Implementation details:
  *
  * Time is based off the system timer, and is corrected so that it
  * increases by one megabyte per second.  This allows for proper
  * recycling on high speed LANs while still leaving over an hour
  * before rollover.
  *
  * As reading the *exact* system time is too expensive to be done
  * whenever setting up a TCP connection, we increment the time
  * offset in two ways.  First, a small random positive increment
  * is added to isn_offset for each connection that is set up.
  * Second, the function tcp_isn_tick fires once per clock tick
  * and increments isn_offset as necessary so that sequence numbers
  * are incremented at approximately ISN_BYTES_PER_SECOND.  The
  * random positive increments serve only to ensure that the same
  * exact sequence number is never sent out twice (as could otherwise
  * happen when a port is recycled in less than the system tick
  * interval.)
  *
  * net.inet.tcp.isn_reseed_interval controls the number of seconds
  * between seeding of isn_secret.  This is normally set to zero,
  * as reseeding should not be necessary.
  *
  * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
  * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock.  In
  * general, this means holding an exclusive (write) lock.
  */
 
 /* Rate used by tcp_new_isn() when projecting isn_offset from elapsed ticks. */
 #define ISN_BYTES_PER_SECOND 1048576
 /* Fixed amount added to isn_offset for every ISN handed out. */
 #define ISN_STATIC_INCREMENT 4096
 /* Bit mask applied to arc4random() for the random part of the increment. */
 #define ISN_RANDOM_INCREMENT (4096 - 1)
 
 /* Secret mixed into the MD5 hash; filled from read_random() when (re)seeded. */
 static VNET_DEFINE(u_char, isn_secret[32]);
 /* Value of ticks when isn_offset was last projected forward. */
 static VNET_DEFINE(int, isn_last);
 /* Value of ticks at the last (re)seed; 0 means "never seeded". */
 static VNET_DEFINE(int, isn_last_reseed);
 /* Running offset that is added to the hash to form the ISN. */
 static VNET_DEFINE(u_int32_t, isn_offset);
 /* Copy of isn_offset taken the last time ticks was seen to change. */
 static VNET_DEFINE(u_int32_t, isn_offset_old);
 
 /* Per-vnet accessors for the variables above. */
 #define	V_isn_secret			VNET(isn_secret)
 #define	V_isn_last			VNET(isn_last)
 #define	V_isn_last_reseed		VNET(isn_last_reseed)
 #define	V_isn_offset			VNET(isn_offset)
 #define	V_isn_offset_old		VNET(isn_offset_old)
 
 tcp_seq
 tcp_new_isn(struct tcpcb *tp)
 {
 	MD5_CTX isn_ctx;
 	u_int32_t md5_buffer[4];
 	tcp_seq new_isn;
 	u_int32_t projected_offset;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	ISN_LOCK();
 	/* Seed if this is the first use, reseed if requested. */
 	if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
 	     (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
 		< (u_int)ticks))) {
 		read_random(&V_isn_secret, sizeof(V_isn_secret));
 		V_isn_last_reseed = ticks;
 	}
 
 	/* Compute the md5 hash and return the ISN. */
 	MD5Init(&isn_ctx);
 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
 #ifdef INET6
 	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
 			  sizeof(struct in6_addr));
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
 			  sizeof(struct in6_addr));
 	} else
 #endif
 	{
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
 			  sizeof(struct in_addr));
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
 			  sizeof(struct in_addr));
 	}
 	MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret));
 	MD5Final((u_char *) &md5_buffer, &isn_ctx);
 	new_isn = (tcp_seq) md5_buffer[0];
 	V_isn_offset += ISN_STATIC_INCREMENT +
 		(arc4random() & ISN_RANDOM_INCREMENT);
 	if (ticks != V_isn_last) {
 		projected_offset = V_isn_offset_old +
 		    ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last);
 		if (SEQ_GT(projected_offset, V_isn_offset))
 			V_isn_offset = projected_offset;
 		V_isn_offset_old = V_isn_offset;
 		V_isn_last = ticks;
 	}
 	new_isn += V_isn_offset;
 	ISN_UNLOCK();
 	return (new_isn);
 }
 
 /*
  * When a specific ICMP unreachable message is received and the
  * connection state is SYN-SENT, drop the connection.  This behavior
  * is controlled by the icmp_may_rst sysctl.
  */
 struct inpcb *
 tcp_drop_syn_sent(struct inpcb *inp, int errno)
 {
 	struct tcpcb *tcb;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/* Nothing to do for time-wait or already dropped PCBs. */
 	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0)
 		return (inp);
 
 	/*
 	 * Only connections still in SYN-SENT are dropped.  A NULL result
 	 * from tcp_drop() is passed on to the caller as a NULL inpcb.
 	 */
 	tcb = intotcpcb(inp);
 	if (tcb->t_state == TCPS_SYN_SENT && tcp_drop(tcb, errno) == NULL)
 		return (NULL);
 	return (inp);
 }
 
 /*
  * When `need fragmentation' ICMP is received, update our idea of the MSS
  * based on the new value. Also nudge TCP to send something, since we
  * know the packet we just sent was dropped.
  * This duplicates some code in the tcp_mss() function in tcp_input.c.
  */
 static struct inpcb *
 tcp_mtudisc_notify(struct inpcb *inp, int error)
 {
 	/* The error code is not used; tcp_mtudisc() is called with -1. */
 	const int mtuoffer = -1;
 
 	return (tcp_mtudisc(inp, mtuoffer));
 }
 
 struct inpcb *
 tcp_mtudisc(struct inpcb *inp, int mtuoffer)
 {
 	struct socket *sock;
 	struct tcpcb *tcb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Time-wait and dropped PCBs are returned untouched. */
 	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0)
 		return (inp);
 
 	tcb = intotcpcb(inp);
 	KASSERT(tcb != NULL, ("tcp_mtudisc: tp == NULL"));
 
 	tcp_mss_update(tcb, -1, mtuoffer, NULL, NULL);
 
 	/* Never let the mss exceed the send buffer's high-water mark. */
 	sock = inp->inp_socket;
 	SOCKBUF_LOCK(&sock->so_snd);
 	if (tcb->t_maxseg > sock->so_snd.sb_hiwat)
 		tcb->t_maxseg = sock->so_snd.sb_hiwat;
 	SOCKBUF_UNLOCK(&sock->so_snd);
 
 	/*
 	 * Rewind the send state to the oldest unacknowledged byte and
 	 * transmit again.
 	 */
 	TCPSTAT_INC(tcps_mturesent);
 	tcb->t_rtttime = 0;
 	tcb->snd_nxt = tcb->snd_una;
 	tcp_free_sackholes(tcb);
 	tcb->snd_recover = tcb->snd_max;
 	if ((tcb->t_flags & TF_SACK_PERMIT) != 0)
 		EXIT_FASTRECOVERY(tcb->t_flags);
 	tcp_output(tcb);
 	return (inp);
 }
 
 #ifdef INET
 /*
  * Look-up the routing entry to the peer of this inpcb.  If no route
  * is found and it cannot be allocated, then return 0.  This routine
  * is called by TCP routines that access the rmx structure and by
  * tcp_mss_update to get the peer/interface MTU.
  */
 u_long
 tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct route sro;
 	struct sockaddr_in *dst;
 	struct ifnet *ifp;
 	u_long maxmtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
 
 	/* Look up a route to the foreign address, if one is set. */
 	bzero(&sro, sizeof(sro));
 	if (inc->inc_faddr.s_addr != INADDR_ANY) {
 		dst = (struct sockaddr_in *)&sro.ro_dst;
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = inc->inc_faddr;
 		in_rtalloc_ign(&sro, 0, inc->inc_fibnum);
 	}
 	if (sro.ro_rt != NULL) {
 		ifp = sro.ro_rt->rt_ifp;
 		/* A route MTU of 0 means "use the interface MTU". */
 		if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
 			maxmtu = ifp->if_mtu;
 		else
 			maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
 
 		/*
 		 * Report additional interface capabilities.  The TSO size
 		 * limit is only reported together with the CSUM_TSO flag;
 		 * both assignments belong to the same branch.
 		 */
 		if (cap != NULL) {
 			if (ifp->if_capenable & IFCAP_TSO4 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
 				cap->tsomax = ifp->if_hw_tsomax;
 			}
 		}
 		RTFREE(sro.ro_rt);
 	}
 	/* 0 is returned when no route was found. */
 	return (maxmtu);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * IPv6 counterpart of tcp_maxmtu(): look up the route to the foreign
  * address in inc and return the smaller of the route MTU and the link
  * MTU, or 0 when no route is found.  When cap is non-NULL and the
  * interface has TSO6 enabled, CSUM_TSO and the TSO size limit are
  * reported through it.
  */
 u_long
 tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct route_in6 sro6;
 	struct ifnet *ifp;
 	u_long maxmtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
 
 	/* Look up a route to the foreign address, if one is set. */
 	bzero(&sro6, sizeof(sro6));
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
 		sro6.ro_dst.sin6_family = AF_INET6;
 		sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
 		sro6.ro_dst.sin6_addr = inc->inc6_faddr;
 		in6_rtalloc_ign(&sro6, 0, inc->inc_fibnum);
 	}
 	if (sro6.ro_rt != NULL) {
 		ifp = sro6.ro_rt->rt_ifp;
 		/* A route MTU of 0 means "use the link MTU". */
 		if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
 			maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
 		else
 			maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
 				     IN6_LINKMTU(sro6.ro_rt->rt_ifp));
 
 		/*
 		 * Report additional interface capabilities.  The TSO size
 		 * limit is only reported together with the CSUM_TSO flag;
 		 * both assignments belong to the same branch.
 		 */
 		if (cap != NULL) {
 			if (ifp->if_capenable & IFCAP_TSO6 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
 				cap->tsomax = ifp->if_hw_tsomax;
 			}
 		}
 		RTFREE(sro6.ro_rt);
 	}
 
 	return (maxmtu);
 }
 #endif /* INET6 */
 
 #ifdef IPSEC
 /* compute ESP/AH header size for TCP, including outer IP header. */
 size_t
 ipsec_hdrsiz_tcp(struct tcpcb *tp)
 {
 	struct inpcb *inp;
 	struct mbuf *m;
 	struct tcphdr *th;
 	struct ip *ip;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	void *iphdr;
 	size_t hdrsiz;
 
 	if (tp == NULL)
 		return (0);
 	inp = tp->t_inpcb;
 	if (inp == NULL)
 		return (0);
 
 	/* Scratch mbuf that holds a template IP + TCP header. */
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (0);
 
 	/* Lay out the header pair for the connection's address family. */
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		m->m_pkthdr.len = m->m_len =
 		    sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		iphdr = ip6;
 	} else
 #endif /* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)(ip + 1);
 		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
 		iphdr = ip;
 	}
 
 	/* Fill in the template and ask IPsec for the outbound overhead. */
 	tcpip_fillheaders(inp, iphdr, th);
 	hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
 
 	m_free(m);
 	return (hdrsiz);
 }
 #endif /* IPSEC */
 
 #ifdef TCP_SIGNATURE
 /*
  * Callback function invoked by m_apply() to digest TCP segment data
  * contained within an mbuf chain.
  */
 static int
 tcp_signature_apply(void *fstate, void *data, u_int len)
 {
 	/* fstate is the MD5 context supplied by the m_apply() caller. */
 	MD5_CTX *md5ctx = fstate;
 
 	MD5Update(md5ctx, (u_char *)data, len);
 
 	/* Always report success. */
 	return (0);
 }
 
 /*
  * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
  *
  * Parameters:
  * m		pointer to head of mbuf chain
  * _unused	
  * len		length of TCP segment data, excluding options
  * optlen	length of TCP segment options
  * buf		pointer to storage for computed MD5 digest
  * direction	direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
  *
  * We do this over ip, tcphdr, segment data, and the key in the SADB.
  * When called from tcp_input(), we can be sure that th_sum has been
  * zeroed out and verified already.
  *
  * Return 0 if successful, otherwise return -1.
  *
  * XXX The key is retrieved from the system's PF_KEY SADB, by keying a
  * search with the destination IP address, and a 'magic SPI' to be
  * determined by the application. This is hardcoded elsewhere to 1179
  * right now. Another branch of this code exists which uses the SPD to
  * specify per-application flows but it is unstable.
  */
 int
 tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
     u_char *buf, u_int direction)
 {
 	union sockaddr_union dst;
 #ifdef INET
 	struct ippseudo ippseudo;
 #endif
 	MD5_CTX ctx;
 	int doff;
 	struct ip *ip;
 #ifdef INET
 	struct ipovly *ipovly;
 #endif
 	struct secasvar *sav;
 	struct tcphdr *th;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	struct in6_addr in6;
 	char ip6buf[INET6_ADDRSTRLEN];
 	uint32_t plen;
 	uint16_t nhdr;
 #endif
 	u_short savecsum;
 
 	KASSERT(m != NULL, ("NULL mbuf chain"));
 	KASSERT(buf != NULL, ("NULL signature pointer"));
 
 	/*
 	 * Extract the peer address from the IP header in the mbuf: the
 	 * source for inbound segments, the destination otherwise.
 	 */
 	bzero(&dst, sizeof(union sockaddr_union));
 	ip = mtod(m, struct ip *);
 #ifdef INET6
 	ip6 = NULL;	/* Make the compiler happy. */
 #endif
 	switch (ip->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		dst.sa.sa_len = sizeof(struct sockaddr_in);
 		dst.sa.sa_family = AF_INET;
 		dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ?
 		    ip->ip_src : ip->ip_dst;
 		break;
 #endif
 #ifdef INET6
 	case (IPV6_VERSION >> 4):
 		ip6 = mtod(m, struct ip6_hdr *);
 		dst.sa.sa_len = sizeof(struct sockaddr_in6);
 		dst.sa.sa_family = AF_INET6;
 		dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ?
 		    ip6->ip6_src : ip6->ip6_dst;
 		break;
 #endif
 	default:
 		/*
 		 * Failure is reported as -1, which is the documented
 		 * contract and the value tcp_signature_verify() tests for.
 		 */
 		return (-1);
 	}
 
 	/* Look up an SADB entry which matches the address of the peer. */
 	sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
 	if (sav == NULL) {
 		ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__,
 		    (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) :
 #ifdef INET6
 			(ip->ip_v == (IPV6_VERSION >> 4)) ?
 			    ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) :
 #endif
 			"(unsupported)"));
 		return (-1);
 	}
 
 	MD5Init(&ctx);
 	/*
 	 * Step 1: Update MD5 hash with IP(v6) pseudo-header.
 	 *
 	 * XXX The ippseudo header MUST be digested in network byte order,
 	 * or else we'll fail the regression test. Assume all fields we've
 	 * been doing arithmetic on have been in host byte order.
 	 * XXX One cannot depend on ipovly->ih_len here. When called from
 	 * tcp_output(), the underlying ip_len member has not yet been set.
 	 */
 	switch (ip->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		ipovly = (struct ipovly *)ip;
 		ippseudo.ippseudo_src = ipovly->ih_src;
 		ippseudo.ippseudo_dst = ipovly->ih_dst;
 		ippseudo.ippseudo_pad = 0;
 		ippseudo.ippseudo_p = IPPROTO_TCP;
 		ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) +
 		    optlen);
 		MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo));
 
 		th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip));
 		doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen;
 		break;
 #endif
 #ifdef INET6
 	/*
 	 * RFC 2385, 2.0  Proposal
 	 * For IPv6, the pseudo-header is as described in RFC 2460, namely the
 	 * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero-
 	 * extended next header value (to form 32 bits), and 32-bit segment
 	 * length.
 	 * Note: Upper-Layer Packet Length comes before Next Header.
 	 */
 	case (IPV6_VERSION >> 4):
 		in6 = ip6->ip6_src;
 		in6_clearscope(&in6);
 		MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
 		in6 = ip6->ip6_dst;
 		in6_clearscope(&in6);
 		MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
 		plen = htonl(len + sizeof(struct tcphdr) + optlen);
 		MD5Update(&ctx, (char *)&plen, sizeof(uint32_t));
 		/* Three zero bytes followed by the next-header byte. */
 		nhdr = 0;
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 		nhdr = IPPROTO_TCP;
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 
 		th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr));
 		doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen;
 		break;
 #endif
 	default:
 		/*
 		 * The first switch already rejects unknown versions, but
 		 * if this path is ever taken the SA reference obtained
 		 * above must not be leaked.
 		 */
 		KEY_FREESAV(&sav);
 		return (-1);
 	}
 
 
 	/*
 	 * Step 2: Update MD5 hash with TCP header, excluding options.
 	 * The TCP checksum must be set to zero.
 	 */
 	savecsum = th->th_sum;
 	th->th_sum = 0;
 	MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
 	th->th_sum = savecsum;
 
 	/*
 	 * Step 3: Update MD5 hash with TCP segment data.
 	 *         Use m_apply() to avoid an early m_pullup().
 	 */
 	if (len > 0)
 		m_apply(m, doff, len, tcp_signature_apply, &ctx);
 
 	/*
 	 * Step 4: Update MD5 hash with shared secret.
 	 */
 	MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth));
 	MD5Final(buf, &ctx);
 
 	key_sa_recordxfer(sav, m);
 	KEY_FREESAV(&sav);
 	return (0);
 }
 
 /*
  * Verify the TCP-MD5 hash of a TCP segment. (RFC2385)
  *
  * Parameters:
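  * m		pointer to head of mbuf chain
  * off0	passed through to tcp_signature_compute(), which ignores it
  * tlen	length of TCP segment data, excluding options
  * optlen	length of TCP segment options
  * to		parsed TCP options; to_flags and to_signature are examined
  * th		TCP header of the segment (not referenced by this function)
  * tcpbflag	connection flags; TF_SIGNATURE means a signature is expected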
  *
  * Return 1 if successful, otherwise return 0.
  */
 int
 tcp_signature_verify(struct mbuf *m, int off0, int tlen, int optlen,
     struct tcpopt *to, struct tcphdr *th, u_int tcpbflag)
 {
 	char tmpdigest[TCP_SIGLEN];
 
 	/* Signature checking disabled: accept everything. */
 	if (tcp_sig_checksigs == 0)
 		return (1);
 	if ((tcpbflag & TF_SIGNATURE) == 0) {
 		if ((to->to_flags & TOF_SIGNATURE) != 0) {
 
 			/*
 			 * If this socket is not expecting signature but
 			 * the segment contains signature just fail.
 			 */
 			TCPSTAT_INC(tcps_sig_err_sigopt);
 			TCPSTAT_INC(tcps_sig_rcvbadsig);
 			return (0);
 		}
 
 		/* Signature is not expected, and not present in segment. */
 		return (1);
 	}
 
 	/*
 	 * If this socket is expecting signature but the segment does not
 	 * contain any just fail.
 	 */
 	if ((to->to_flags & TOF_SIGNATURE) == 0) {
 		TCPSTAT_INC(tcps_sig_err_nosigopt);
 		TCPSTAT_INC(tcps_sig_rcvbadsig);
 		return (0);
 	}
 
 	/*
 	 * Treat any non-zero result as failure.  Testing only for -1 would
 	 * let a positive error code slip through, and tmpdigest would then
 	 * be compared without ever having been filled in.
 	 */
 	if (tcp_signature_compute(m, off0, tlen, optlen, &tmpdigest[0],
 	    IPSEC_DIR_INBOUND) != 0) {
 		TCPSTAT_INC(tcps_sig_err_buildsig);
 		TCPSTAT_INC(tcps_sig_rcvbadsig);
 		return (0);
 	}
 
 	/* The computed digest must match the one carried in the segment. */
 	if (bcmp(to->to_signature, &tmpdigest[0], TCP_SIGLEN) != 0) {
 		TCPSTAT_INC(tcps_sig_rcvbadsig);
 		return (0);
 	}
 	TCPSTAT_INC(tcps_sig_rcvgoodsig);
 	return (1);
 }
 #endif /* TCP_SIGNATURE */
 
 /*
  * Sysctl handler that drops one TCP connection.  The new value must be
  * two sockaddr_storage structures (foreign address first, local address
  * second); no old value may be requested.  Returns EINVAL for malformed
  * requests, EPERM when no new value is supplied, ENOMEM when the new
  * value is too short, ESRCH when no matching PCB is found, and 0
  * otherwise.
  */
 static int
 sysctl_drop(SYSCTL_HANDLER_ARGS)
 {
 	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct tcptw *tw;
 	struct sockaddr_in *fin, *lin;
 #ifdef INET6
 	struct sockaddr_in6 *fin6, *lin6;
 #endif
 	int error;
 
 	inp = NULL;
 	fin = lin = NULL;
 #ifdef INET6
 	fin6 = lin6 = NULL;
 #endif
 	error = 0;
 
 	/* This node is write-only and needs the full address pair. */
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen < sizeof(addrs))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	/* Validate both addresses according to the foreign one's family. */
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		fin6 = (struct sockaddr_in6 *)&addrs[0];
 		lin6 = (struct sockaddr_in6 *)&addrs[1];
 		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
 		    lin6->sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
 			/* Both ends must be v4-mapped to be converted. */
 			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
 				return (EINVAL);
 			/*
 			 * NOTE(review): the conversion presumably rewrites
 			 * ss_family to AF_INET so that the lookup below takes
 			 * the IPv4 branch -- confirm against
 			 * in6_sin6_2_sin_in_sock().
 			 */
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
 			fin = (struct sockaddr_in *)&addrs[0];
 			lin = (struct sockaddr_in *)&addrs[1];
 			break;
 		}
 		error = sa6_embedscope(fin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		error = sa6_embedscope(lin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		fin = (struct sockaddr_in *)&addrs[0];
 		lin = (struct sockaddr_in *)&addrs[1];
 		if (fin->sin_len != sizeof(struct sockaddr_in) ||
 		    lin->sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		break;
 #endif
 	default:
 		return (EINVAL);
 	}
 	/* Look up the PCB; a hit is returned write-locked. */
 	INP_INFO_WLOCK(&V_tcbinfo);
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
 		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
 		    INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
 		    lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 	}
 	if (inp != NULL) {
 		if (inp->inp_flags & INP_TIMEWAIT) {
 			/*
 			 * XXXRW: There currently exists a state where an
 			 * inpcb is present, but its timewait state has been
 			 * discarded.  For now, don't allow dropping of this
 			 * type of inpcb.
 			 */
 			tw = intotw(inp);
 			/*
 			 * NOTE(review): no unlock follows tcp_twclose(), so
 			 * it presumably releases the inpcb lock -- confirm.
 			 */
 			if (tw != NULL)
 				tcp_twclose(tw, 0);
 			else
 				INP_WUNLOCK(inp);
 		} else if (!(inp->inp_flags & INP_DROPPED) &&
 			   !(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
 			/* Live, non-listening connection: abort it. */
 			tp = intotcpcb(inp);
 			tp = tcp_drop(tp, ECONNABORTED);
 			/* The lock is only released when tcp_drop() is non-NULL. */
 			if (tp != NULL)
 				INP_WUNLOCK(inp);
 		} else
 			INP_WUNLOCK(inp);
 	} else
 		error = ESRCH;
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 
 /*
  * Register the write-only "drop" node (TCPCTL_DROP) under _net_inet_tcp,
  * handled by sysctl_drop().
  */
 SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
     CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
     0, sysctl_drop, "", "Drop TCP connection");
 
 /*
  * Generate a standardized TCP log line for use throughout the
  * tcp subsystem.  Memory allocation is done with M_NOWAIT to
  * allow use in the interrupt context.
  *
  * NB: The caller MUST free(s, M_TCPLOG) the returned string.
  * NB: The function may return NULL if memory allocation failed.
  *
  * Due to header inclusion and ordering limitations the struct ip
  * and ip6_hdr pointers have to be passed as void pointers.
  */
 char *
 tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
     const void *ip6hdr)
 {
 
 	/* Build the log line only when tcp_log_in_vain is non-zero. */
 	if (tcp_log_in_vain != 0)
 		return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 	return (NULL);
 }
 
 char *
 tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
     const void *ip6hdr)
 {
 
 	/* Build the log line only when tcp_log_debug is non-zero. */
 	if (tcp_log_debug != 0)
 		return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 	return (NULL);
 }
 
 static char *
 tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
     const void *ip6hdr)
 {
 	char *s, *sp;
 	size_t size;
 	struct ip *ip;
 #ifdef INET6
 	const struct ip6_hdr *ip6;
 
 	ip6 = (const struct ip6_hdr *)ip6hdr;
 #endif /* INET6 */
 	ip = (struct ip *)ip4hdr;
 
 	/*
 	 * The log line looks like this:
 	 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>"
 	 *
 	 * Size the buffer for the fixed text, the flag names and two
 	 * addresses of the widest supported family.
 	 */
 	size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
 	    sizeof(PRINT_TH_FLAGS) + 1 +
 #ifdef INET6
 	    2 * INET6_ADDRSTRLEN;
 #else
 	    2 * INET_ADDRSTRLEN;
 #endif /* INET6 */
 
 	/* Zeroed buffer, so the string is terminated from the start. */
 	s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
 	if (s == NULL)
 		return (NULL);
 
 	strcat(s, "TCP: [");
 	sp = s + strlen(s);
 
 	/*
 	 * Append "addr]:port to [addr]:port".  The connection info is used
 	 * when given, otherwise the packet headers.  sp always points at
 	 * the terminating NUL, i.e. at the next position to write.
 	 */
 	if (inc != NULL && (inc->inc_flags & INC_ISIPV6) == 0) {
 		inet_ntoa_r(inc->inc_faddr, sp);
 		sp += strlen(sp);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp += strlen(sp);
 		inet_ntoa_r(inc->inc_laddr, sp);
 		sp += strlen(sp);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 #ifdef INET6
 	} else if (inc != NULL) {
 		ip6_sprintf(sp, &inc->inc6_faddr);
 		sp += strlen(sp);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp += strlen(sp);
 		ip6_sprintf(sp, &inc->inc6_laddr);
 		sp += strlen(sp);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 	} else if (ip6 != NULL && th != NULL) {
 		ip6_sprintf(sp, &ip6->ip6_src);
 		sp += strlen(sp);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp += strlen(sp);
 		ip6_sprintf(sp, &ip6->ip6_dst);
 		sp += strlen(sp);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET6 */
 #ifdef INET
 	} else if (ip != NULL && th != NULL) {
 		inet_ntoa_r(ip->ip_src, sp);
 		sp += strlen(sp);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp += strlen(sp);
 		inet_ntoa_r(ip->ip_dst, sp);
 		sp += strlen(sp);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET */
 	} else {
 		/* No usable address source: give up and free the buffer. */
 		free(s, M_TCPLOG);
 		return (NULL);
 	}
 	sp += strlen(sp);
 	if (th != NULL)
 		sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS);
 
 	/* The last byte must still be the NUL from the zeroed allocation. */
 	if (s[size - 1] != '\0')
 		panic("%s: string too long", __func__);
 	return (s);
 }
 
 /*
  * A subroutine which makes it easy to track TCP state changes with DTrace.
  * This function shouldn't be called for t_state initializations that don't
  * correspond to actual TCP state transitions.
  */
 void
 tcp_state_change(struct tcpcb *tp, int newstate)
 {
 #if defined(KDTRACE_HOOKS)
 	/* Capture the state being left before it is overwritten. */
 	int pstate = tp->t_state;
 #endif
 
 	tp->t_state = newstate;
 	/*
 	 * NOTE(review): pstate exists only under KDTRACE_HOOKS, so
 	 * TCP_PROBE6 presumably discards its arguments when the hooks are
 	 * compiled out -- confirm against the macro definition.
 	 */
 	TCP_PROBE6(state_change, NULL, tp, NULL, tp, NULL, pstate);
 }
Index: head/sys/netinet/udp_usrreq.c
===================================================================
--- head/sys/netinet/udp_usrreq.c	(revision 258540)
+++ head/sys/netinet/udp_usrreq.c	(revision 258541)
@@ -1,1636 +1,1635 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2008 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipfw.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/esp.h>
 #endif
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * UDP protocol implementation.
  * Per RFC 768, August, 1980.
  */
 
 /*
  * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
  * removes the only data integrity mechanism for packets and malformed
  * packets that would otherwise be discarded due to bad checksums, and may
  * cause problems (especially for NFS data blocks).
  *
  * udp_cksum is consulted on the output path: when it is zero the UDP
  * checksum field of outgoing datagrams is set to 0.
  */
 VNET_DEFINE(int, udp_cksum) = 1;
 SYSCTL_VNET_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW,
     &VNET_NAME(udp_cksum), 0, "compute udp checksum");
 
 /*
  * When non-zero, udp_input() logs every datagram for which no matching pcb
  * was found.
  */
 int	udp_log_in_vain = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
     &udp_log_in_vain, 0, "Log all incoming UDP packets");
 
 /*
  * When non-zero, udp_input() silently drops datagrams that match no pcb
  * instead of answering with an ICMP port unreachable.
  */
 VNET_DEFINE(int, udp_blackhole) = 0;
 SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
     &VNET_NAME(udp_blackhole), 0,
     "Do not send port unreachables for refused connects");
 
 u_long	udp_sendspace = 9216;		/* really max datagram size */
 SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
 
 /*
  * Receive space: room for 40 1K datagrams, each accompanied by one source
  * address (a sockaddr_in6 when INET6 is configured, a sockaddr_in
  * otherwise).
  */
 u_long	udp_recvspace = 40 * (1024 +
 #ifdef INET6
 				      sizeof(struct sockaddr_in6)
 #else
 				      sizeof(struct sockaddr_in)
 #endif
 				      );
 
 SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
     &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
 
 /* Per-vnet UDP pcb list head, pcb bookkeeping and udpcb allocation zone. */
 VNET_DEFINE(struct inpcbhead, udb);		/* from udp_var.h */
 VNET_DEFINE(struct inpcbinfo, udbinfo);
 static VNET_DEFINE(uma_zone_t, udpcb_zone);
 #define	V_udpcb_zone			VNET(udpcb_zone)
 
 /* Hash table size handed to in_pcbinfo_init(); may be overridden at build. */
 #ifndef UDBHASHSIZE
 #define	UDBHASHSIZE	128
 #endif
 
 /* Per-CPU UDP statistics, exported read-only through sysctl. */
 VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
 VNET_PCPUSTAT_SYSINIT(udpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
     udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(udpstat);
 #endif /* VIMAGE */
 /* Forward declarations of IPv4-only file-local routines. */
 #ifdef INET
 static void	udp_detach(struct socket *so);
 static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
 		    struct mbuf *, struct thread *);
 #endif
 
 #ifdef IPSEC
 #ifdef IPSEC_NAT_T
 /* Mask covering both ESP-in-UDP encapsulation flags kept in u_flags. */
 #define	UF_ESPINUDP_ALL	(UF_ESPINUDP_NON_IKE|UF_ESPINUDP)
 #ifdef INET
 static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int);
 #endif
 #endif /* IPSEC_NAT_T */
 #endif /* IPSEC */
 
 /*
  * Handler for the maxsockets_change event (registered by udp_init()):
  * re-apply the current value of maxsockets as the limit of both the UDP
  * inpcb zone and the udpcb zone.  The tag argument is not used.
  */
 static void
 udp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
 	uma_zone_set_max(V_udpcb_zone, maxsockets);
 }
 
 /*
  * Item initializer handed to in_pcbinfo_init() by udp_init().  Treats mem
  * as a struct inpcb and initializes its lock; size and flags are not used.
  * Always returns 0.
  */
 static int
 udp_inpcb_init(void *mem, int size, int flags)
 {
 	struct inpcb *inp;
 
 	inp = mem;
 	INP_LOCK_INIT(inp, "inp", "udpinp");
 	return (0);
 }
 
 /*
  * Protocol initialization for UDP: set up the pcb bookkeeping (V_udbinfo
  * and the V_udb list head), create the zone udpcbs are allocated from, cap
  * that zone at maxsockets, and arrange for the caps to follow later
  * changes of maxsockets.
  */
 void
 udp_init(void)
 {
 
 	/* pcb list, hash tables and the "udp_inpcb" zone. */
 	in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
 	    "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE,
 	    IPI_HASHFIELDS_2TUPLE);
 	/* Zone for the per-socket UDP control blocks. */
 	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(V_udpcb_zone, maxsockets);
 	uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
 	/* udp_zone_change() re-applies the limits when maxsockets changes. */
 	EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Kernel module interface for updating udpstat.  The argument is an index
  * into udpstat treated as an array of u_long.  While this encodes the
  * general layout of udpstat into the caller, it doesn't encode its location,
  * so that future changes to add, for example, per-CPU stats support won't
  * cause binary compatibility problems for kernel modules.
  *
  * The selected counter is incremented by one.  statnum is used as an array
  * index without any range check, so the caller must pass a valid index.
  */
 void
 kmod_udpstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(udpstat)[statnum], 1);
 }
 
 /*
  * Allocate a zero-filled UDP control block from the udpcb zone without
  * sleeping and hang it off the given inpcb.
  *
  * Returns 0 on success, or ENOBUFS when the zone could not supply a block
  * (in which case inp is left untouched).
  */
 int
 udp_newudpcb(struct inpcb *inp)
 {
 	struct udpcb *cb;
 
 	cb = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
 	if (cb != NULL) {
 		inp->inp_ppcb = cb;
 		return (0);
 	}
 	return (ENOBUFS);
 }
 
 /*
  * Return a UDP control block to the udpcb zone.  This is the counterpart
  * of udp_newudpcb(); no pointer that may still reference the block is
  * cleared here.
  */
 void
 udp_discardcb(struct udpcb *up)
 {
 
 	uma_zfree(V_udpcb_zone, up);
 }
 
 #ifdef VIMAGE
 /*
  * Tear down what udp_init() set up (VIMAGE kernels only): destroy the pcb
  * bookkeeping in V_udbinfo and the udpcb zone.
  */
 void
 udp_destroy(void)
 {
 
 	in_pcbinfo_destroy(&V_udbinfo);
 	uma_zdestroy(V_udpcb_zone);
 }
 #endif
 
 #ifdef INET
 /*
  * Subroutine of udp_input(), which appends the provided mbuf chain to the
  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
  * contains the source address.  If the socket ends up being an IPv6 socket,
  * udp_append() will convert to a sockaddr_in6 before passing the address
  * into the socket code.
  *
  * Parameters:
  *  inp     pcb to deliver to; the caller must hold its lock.
  *  ip      IP header of the datagram, used when saving control options.
  *  n       datagram to deliver, starting at the IP header.  May be NULL
  *          (e.g. a failed m_copy() in the caller), in which case nothing
  *          is done.  A non-NULL chain is always consumed: it is handed to
  *          the tunneling hook or the socket buffer, or freed.
  *  off     length of the IP header preceding the UDP header.
  *  udp_in  source address of the datagram.
  */
 static void
 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
     struct sockaddr_in *udp_in)
 {
 	struct sockaddr *append_sa;
 	struct socket *so;
 	struct mbuf *opts = NULL;
 #ifdef INET6
 	struct sockaddr_in6 udp_in6;
 #endif
 	struct udpcb *up;
 
 	INP_LOCK_ASSERT(inp);
 
 	/*
 	 * Nothing to deliver.  This must be checked before the tunneling
 	 * hook below is engaged, so that the hook is never handed a NULL
 	 * mbuf when the caller's copy of the datagram could not be
 	 * allocated.
 	 */
 	if (n == NULL)
 		return;
 
 	/*
 	 * Engage the tunneling protocol.  The hook takes over the mbuf and
 	 * sees the offset of the UDP header, not of the payload.
 	 */
 	up = intoudpcb(inp);
 	if (up->u_tun_func != NULL) {
 		(*up->u_tun_func)(n, off, inp);
 		return;
 	}
 
 	/* From here on, off is the offset of the UDP payload. */
 	off += sizeof(struct udphdr);
 
 #ifdef IPSEC
 	/* Check AH/ESP integrity. */
 	if (ipsec4_in_reject(n, inp)) {
 		m_freem(n);
 		IPSECSTAT_INC(ips_in_polvio);
 		return;
 	}
 #ifdef IPSEC_NAT_T
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
 	if (up->u_flags & UF_ESPINUDP_ALL) {	/* IPSec UDP encaps. */
 		n = udp4_espdecap(inp, n, off);
 		if (n == NULL)				/* Consumed. */
 			return;
 	}
 #endif /* IPSEC_NAT_T */
 #endif /* IPSEC */
 #ifdef MAC
 	if (mac_inpcb_check_deliver(inp, n) != 0) {
 		m_freem(n);
 		return;
 	}
 #endif /* MAC */
 	/*
 	 * Collect ancillary data if the socket asked for control options or
 	 * timestamps.
 	 */
 	if (inp->inp_flags & INP_CONTROLOPTS ||
 	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6)
 			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
 		else
 #endif /* INET6 */
 			ip_savecontrol(inp, &opts, ip, n);
 	}
 #ifdef INET6
 	/* IPv6 sockets get the source as a v4-mapped sockaddr_in6. */
 	if (inp->inp_vflag & INP_IPV6) {
 		bzero(&udp_in6, sizeof(udp_in6));
 		udp_in6.sin6_len = sizeof(udp_in6);
 		udp_in6.sin6_family = AF_INET6;
 		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
 		append_sa = (struct sockaddr *)&udp_in6;
 	} else
 #endif /* INET6 */
 		append_sa = (struct sockaddr *)udp_in;
 	/* Strip the IP and UDP headers; only the payload is queued. */
 	m_adj(n, off);
 
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
 		/* Receive buffer full: drop the datagram and its options. */
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		m_freem(n);
 		if (opts)
 			m_freem(opts);
 		UDPSTAT_INC(udps_fullsock);
 	} else
 		sorwakeup_locked(so);
 }
 
 /*
  * Input routine for IPv4 UDP datagrams.  m is the mbuf chain starting at
  * the IP header and off is the length of that IP header.
  *
  * The routine strips IP options, pulls the IP and UDP headers into the
  * first mbuf, rejects a zero destination port, reconciles the UDP length
  * with the IP length and verifies the checksum when one is present.  It
  * then delivers the datagram:
  *  - for a multicast or broadcast destination, every pcb on the global
  *    list is examined and each match receives the datagram (all but the
  *    last one through a copy);
  *  - otherwise the single pcb found by in_pcblookup_mbuf()/in_pcblookup()
  *    receives it, honouring a PACKET_TAG_IPFORWARD tag if one is attached.
  * When no pcb matches in the latter case, an ICMP port unreachable is sent
  * unless the datagram arrived as a link-level broadcast/multicast,
  * udp_blackhole is set, or the rate limiter refuses.
  *
  * On every path shown here the chain is either freed or passed on to
  * udp_append() or icmp_error().
  */
 void
 udp_input(struct mbuf *m, int off)
 {
 	int iphlen = off;
 	struct ip *ip;
 	struct udphdr *uh;
 	struct ifnet *ifp;
 	struct inpcb *inp;
 	uint16_t len, ip_len;
 	struct ip save_ip;
 	struct sockaddr_in udp_in;
 	struct m_tag *fwd_tag;
 
 	ifp = m->m_pkthdr.rcvif;
 	UDPSTAT_INC(udps_ipackets);
 
 	/*
 	 * Strip IP options, if any; should skip this, make available to
 	 * user, and use on returned packets, but we don't yet have a way to
 	 * check the checksum with options still present.
 	 */
 	if (iphlen > sizeof (struct ip)) {
 		ip_stripoptions(m);
 		iphlen = sizeof(struct ip);
 	}
 
 	/*
 	 * Get IP and UDP header together in first mbuf.
 	 */
 	ip = mtod(m, struct ip *);
 	if (m->m_len < iphlen + sizeof(struct udphdr)) {
 		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
 			UDPSTAT_INC(udps_hdrops);
 			return;
 		}
 		/* m_pullup() may have returned a different first mbuf. */
 		ip = mtod(m, struct ip *);
 	}
 	uh = (struct udphdr *)((caddr_t)ip + iphlen);
 
 	/*
 	 * Destination port of 0 is illegal, based on RFC768.
 	 */
 	if (uh->uh_dport == 0)
 		goto badunlocked;
 
 	/*
 	 * Construct sockaddr format source address.  Stuff source address
 	 * and datagram in user buffer.
 	 */
 	bzero(&udp_in, sizeof(udp_in));
 	udp_in.sin_len = sizeof(udp_in);
 	udp_in.sin_family = AF_INET;
 	udp_in.sin_port = uh->uh_sport;
 	udp_in.sin_addr = ip->ip_src;
 
 	/*
 	 * Make mbuf data length reflect UDP length.  If not enough data to
 	 * reflect UDP length, drop.
 	 */
 	len = ntohs((u_short)uh->uh_ulen);
 	ip_len = ntohs(ip->ip_len) - iphlen;
 	if (ip_len != len) {
 		if (len > ip_len || len < sizeof(struct udphdr)) {
 			UDPSTAT_INC(udps_badlen);
 			goto badunlocked;
 		}
 		/* len < ip_len here, so this trims the excess from the tail. */
 		m_adj(m, len - ip_len);
 	}
 
 	/*
 	 * Save a copy of the IP header in case we want restore it for
 	 * sending an ICMP error message in response.
 	 */
 	if (!V_udp_blackhole)
 		save_ip = *ip;
 	else
 		memset(&save_ip, 0, sizeof(save_ip));
 
 	/*
 	 * Checksum extended UDP header and data.
 	 */
 	if (uh->uh_sum) {
 		u_short uh_sum;
 
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 			/* Checksum data was supplied with the packet header. */
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				uh_sum = m->m_pkthdr.csum_data;
 			else
 				uh_sum = in_pseudo(ip->ip_src.s_addr,
 				    ip->ip_dst.s_addr, htonl((u_short)len +
 				    m->m_pkthdr.csum_data + IPPROTO_UDP));
 			uh_sum ^= 0xffff;
 		} else {
 			char b[9];
 
 			/*
 			 * Checksum in software: temporarily overlay the IP
 			 * header with a pseudo header (first 9 bytes zeroed,
 			 * length set to the UDP length), then restore the
 			 * saved bytes.
 			 */
 			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
 			bzero(((struct ipovly *)ip)->ih_x1, 9);
 			((struct ipovly *)ip)->ih_len = uh->uh_ulen;
 			uh_sum = in_cksum(m, len + sizeof (struct ip));
 			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
 		}
 		if (uh_sum) {
 			UDPSTAT_INC(udps_badsum);
 			m_freem(m);
 			return;
 		}
 	} else
 		UDPSTAT_INC(udps_nosum);
 
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 	    in_broadcast(ip->ip_dst, ifp)) {
 		struct inpcb *last;
 		struct ip_moptions *imo;
 
 		/*
 		 * Multicast/broadcast: walk the whole pcb list.  "last" is
 		 * the most recent match and is kept read-locked; delivery to
 		 * it is deferred until the next match is found (which then
 		 * gets the original chain's role) or the walk ends.
 		 */
 		INP_INFO_RLOCK(&V_udbinfo);
 		last = NULL;
 		LIST_FOREACH(inp, &V_udb, inp_list) {
 			if (inp->inp_lport != uh->uh_dport)
 				continue;
 #ifdef INET6
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_laddr.s_addr != INADDR_ANY &&
 			    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
 				continue;
 			if (inp->inp_faddr.s_addr != INADDR_ANY &&
 			    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
 				continue;
 			if (inp->inp_fport != 0 &&
 			    inp->inp_fport != uh->uh_sport)
 				continue;
 
 			INP_RLOCK(inp);
 
 			/*
 			 * XXXRW: Because we weren't holding either the inpcb
 			 * or the hash lock when we checked for a match
 			 * before, we should probably recheck now that the
 			 * inpcb lock is held.
 			 */
 
 			/*
 			 * Handle socket delivery policy for any-source
 			 * and source-specific multicast. [RFC3678]
 			 */
 			imo = inp->inp_moptions;
 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 				struct sockaddr_in	 group;
 				int			 blocked;
 				if (imo == NULL) {
 					INP_RUNLOCK(inp);
 					continue;
 				}
 				bzero(&group, sizeof(struct sockaddr_in));
 				group.sin_len = sizeof(struct sockaddr_in);
 				group.sin_family = AF_INET;
 				group.sin_addr = ip->ip_dst;
 
 				blocked = imo_multi_filter(imo, ifp,
 					(struct sockaddr *)&group,
 					(struct sockaddr *)&udp_in);
 				if (blocked != MCAST_PASS) {
 					if (blocked == MCAST_NOTGMEMBER)
 						IPSTAT_INC(ips_notmember);
 					if (blocked == MCAST_NOTSMEMBER ||
 					    blocked == MCAST_MUTED)
 						UDPSTAT_INC(udps_filtermcast);
 					INP_RUNLOCK(inp);
 					continue;
 				}
 			}
 			if (last != NULL) {
 				struct mbuf *n;
 
 				/*
 				 * NOTE(review): the result of m_copy() is
 				 * passed to udp_append() without a NULL
 				 * check at this call site.
 				 */
 				n = m_copy(m, 0, M_COPYALL);
 				udp_append(last, ip, n, iphlen, &udp_in);
 				INP_RUNLOCK(last);
 			}
 			last = inp;
 			/*
 			 * Don't look for additional matches if this one does
 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
 			 * socket options set.  This heuristic avoids
 			 * searching through all pcbs in the common case of a
 			 * non-shared port.  It assumes that an application
 			 * will never clear these options after setting them.
 			 */
 			if ((last->inp_socket->so_options &
 			    (SO_REUSEPORT|SO_REUSEADDR)) == 0)
 				break;
 		}
 
 		if (last == NULL) {
 			/*
 			 * No matching pcb found; discard datagram.  (No need
 			 * to send an ICMP Port Unreachable for a broadcast
 			 * or multicast datagram.)
 			 */
 			UDPSTAT_INC(udps_noportbcast);
 			/*
 			 * NOTE(review): the loop above only breaks after
 			 * setting last, so inp looks to be NULL whenever
 			 * last is NULL -- confirm this unlock is reachable.
 			 */
 			if (inp)
 				INP_RUNLOCK(inp);
 			INP_INFO_RUNLOCK(&V_udbinfo);
 			goto badunlocked;
 		}
 		/* The final match receives the original chain. */
 		udp_append(last, ip, m, iphlen, &udp_in);
 		INP_RUNLOCK(last);
 		INP_INFO_RUNLOCK(&V_udbinfo);
 		return;
 	}
 
 	/*
 	 * Locate pcb for datagram.
 	 */
 
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		struct sockaddr_in *next_hop;
 
 		/* The next-hop sockaddr is stored right behind the tag. */
 		next_hop = (struct sockaddr_in *)(fwd_tag + 1);
 
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * Already got one like this?
 		 */
 		inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport,
 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in_pcblookup(&V_udbinfo, ip->ip_src,
 			    uh->uh_sport, next_hop->sin_addr,
 			    next_hop->sin_port ? htons(next_hop->sin_port) :
 			    uh->uh_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_RLOCKPCB, ifp);
 		}
 		/* Remove the tag from the packet. We don't need it anymore. */
 		m_tag_delete(m, fwd_tag);
 		m->m_flags &= ~M_IP_NEXTHOP;
 	} else
 		inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport,
 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
 		    INPLOOKUP_RLOCKPCB, ifp, m);
 	if (inp == NULL) {
 		if (udp_log_in_vain) {
 			/* Sized for a dotted quad plus terminating NUL. */
 			char buf[4*sizeof "123"];
 
 			/*
 			 * The destination is copied out first because
 			 * inet_ntoa() is called a second time for the source
 			 * within the same log() call.
 			 */
 			strcpy(buf, inet_ntoa(ip->ip_dst));
 			log(LOG_INFO,
 			    "Connection attempt to UDP %s:%d from %s:%d\n",
 			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
 			    ntohs(uh->uh_sport));
 		}
 		UDPSTAT_INC(udps_noport);
 		if (m->m_flags & (M_BCAST | M_MCAST)) {
 			UDPSTAT_INC(udps_noportbcast);
 			goto badunlocked;
 		}
 		if (V_udp_blackhole)
 			goto badunlocked;
 		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
 			goto badunlocked;
 		/* Put back the header saved before checksum verification. */
 		*ip = save_ip;
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
 		return;
 	}
 
 	/*
 	 * Check the minimum TTL for socket.
 	 */
 	INP_RLOCK_ASSERT(inp);
 	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
 		INP_RUNLOCK(inp);
 		m_freem(m);
 		return;
 	}
 
 	UDP_PROBE(receive, NULL, inp, ip, inp, uh);
 	udp_append(inp, ip, m, iphlen, &udp_in);
 	INP_RUNLOCK(inp);
 	return;
 
 	/* Common drop path; no pcb or pcbinfo lock is held when we get here. */
 badunlocked:
 	m_freem(m);
 }
 #endif /* INET */
 
 /*
  * Notify a udp user of an asynchronous error; just wake up so that they can
  * collect error status.
  */
 struct inpcb *
 udp_notify(struct inpcb *inp, int errno)
 {
 
 	/*
 	 * While udp_ctlinput() always calls udp_notify() with a read lock
 	 * when invoking it directly, in_pcbnotifyall() currently uses write
 	 * locks due to sharing code with TCP.  For now, accept either a read
 	 * or a write lock, but a read lock is sufficient.
 	 */
 	INP_LOCK_ASSERT(inp);
 
 	inp->inp_socket->so_error = errno;
 	sorwakeup(inp->inp_socket);
 	sowwakeup(inp->inp_socket);
 	return (inp);
 }
 
 #ifdef INET
 void
 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct ip *ip = vip;
 	struct udphdr *uh;
 	struct in_addr faddr;
 	struct inpcb *inp;
 
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
 		return;
 
 	/*
 	 * Redirects don't need to be handled up here.
 	 */
 	if (PRC_IS_REDIRECT(cmd))
 		return;
 
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 *
 	 * XXX: We never get this from ICMP, otherwise it makes an excellent
 	 * DoS attack on machines with many connections.
 	 */
 	if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 	if (ip != NULL) {
 		uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 		inp = in_pcblookup(&V_udbinfo, faddr, uh->uh_dport,
 		    ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL);
 		if (inp != NULL) {
 			INP_RLOCK_ASSERT(inp);
 			if (inp->inp_socket != NULL) {
 				udp_notify(inp, inetctlerrmap[cmd]);
 			}
 			INP_RUNLOCK(inp);
 		}
 	} else
 		in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd],
 		    udp_notify);
 }
 #endif /* INET */
 
 /*
  * Sysctl handler (registered below as "pcblist" under _net_inet_udp) that
  * exports the list of UDP pcbs.  The output is a struct xinpgen header,
  * one struct xinpcb for each pcb the requesting credential may see, and a
  * closing struct xinpgen carrying the generation and count at the time
  * the export finished.
  *
  * A request without an output buffer only gets a size estimate.  The node
  * is read-only: supplying a new value yields EPERM.  Otherwise the result
  * is 0 or the error reported by the wiring/copy-out steps.
  */
 static int
 udp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the PCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == 0) {
 		/* Size probe: current count plus slack of max(n / 8, 10). */
 		n = V_udbinfo.ipi_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
 		return (0);
 	}
 
 	if (req->newptr != 0)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	/* Snapshot generation and count; both bound the export below. */
 	INP_INFO_RLOCK(&V_udbinfo);
 	gencnt = V_udbinfo.ipi_gencnt;
 	n = V_udbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&V_udbinfo);
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ n * sizeof(struct xinpcb));
 	if (error != 0)
 		return (error);
 
 	/* Leading header. */
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	/*
 	 * NOTE(review): the allocation uses M_WAITOK, so the NULL check
 	 * below is presumably never taken -- confirm against malloc(9).
 	 */
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return (ENOMEM);
 
 	/*
 	 * Pass 1: under the pcbinfo lock, take a reference on up to n pcbs
 	 * that are not newer than the snapshot and that the caller may see.
 	 */
 	INP_INFO_RLOCK(&V_udbinfo);
 	for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		INP_WLOCK(inp);
 		if (inp->inp_gencnt <= gencnt &&
 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
 			in_pcbref(inp);
 			inp_list[i++] = inp;
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&V_udbinfo);
 	n = i;
 
 	/*
 	 * Pass 2: copy each referenced pcb out.  The pcb lock is dropped
 	 * before SYSCTL_OUT() is called.
 	 *
 	 * NOTE(review): error is overwritten on every iteration, so only
 	 * the result of the last copy-out survives the loop.
 	 */
 	error = 0;
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
 
 			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			xi.xi_inp.inp_gencnt = inp->inp_gencnt;
 			INP_RUNLOCK(inp);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		} else
 			INP_RUNLOCK(inp);
 	}
 	/*
 	 * Pass 3: drop the references taken in pass 1.  The pcb is unlocked
 	 * here only when in_pcbrele_rlocked() returns zero.
 	 */
 	INP_INFO_WLOCK(&V_udbinfo);
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (!in_pcbrele_rlocked(inp))
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&V_udbinfo);
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.  If the
 		 * generation differs from what we told her before, she knows
 		 * that something happened while we were processing this
 		 * request, and it might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&V_udbinfo);
 		xig.xig_gen = V_udbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_udbinfo.ipi_count;
 		INP_INFO_RUNLOCK(&V_udbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
     udp_pcblist, "S,xinpcb", "List of active UDP sockets");
 
 #ifdef INET
 static int
 udp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port,
 	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
 	if (inp != NULL) {
 		INP_RLOCK_ASSERT(inp);
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
 #endif /* INET */
 
 /*
  * Socket option handler for UDP sockets.
  *
  * Options at a level other than IPPROTO_UDP are passed on to
  * ip6_ctloutput() for AF_INET6 sockets and to ip_ctloutput() otherwise.
  * At the UDP level only UDP_ENCAP is recognized:
  *  - SOPT_SET accepts 0 (clear encapsulation) and, when IPSEC_NAT_T is
  *    configured, UDP_ENCAP_ESPINUDP / UDP_ENCAP_ESPINUDP_NON_IKE; any
  *    other value yields EINVAL.
  *  - SOPT_GET is only available with IPSEC_NAT_T and returns the current
  *    encapsulation flags.
  * Every other UDP-level option yields ENOPROTOOPT.
  *
  * The inpcb write lock is taken on entry and released on each path before
  * returning; it is never held across sooptcopyin()/sooptcopyout() or the
  * calls into the IP layer.
  */
 int
 udp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error = 0, optval;
 	struct inpcb *inp;
 #ifdef IPSEC_NAT_T
 	struct udpcb *up;
 #endif
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 	INP_WLOCK(inp);
 	if (sopt->sopt_level != IPPROTO_UDP) {
 		/*
 		 * The preprocessor blocks below assemble an if/else between
 		 * the IPv6 and IPv4 handlers, depending on which address
 		 * families are configured.
 		 */
 #ifdef INET6
 		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
 			INP_WUNLOCK(inp);
 			error = ip6_ctloutput(so, sopt);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			INP_WUNLOCK(inp);
 			error = ip_ctloutput(so, sopt);
 		}
 #endif
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case UDP_ENCAP:
 			/* Drop the lock for the copy-in, then take it again. */
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 			inp = sotoinpcb(so);
 			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 			INP_WLOCK(inp);
 #ifdef IPSEC_NAT_T
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 #endif
 			switch (optval) {
 			case 0:
 				/* Clear all UDP encap. */
 #ifdef IPSEC_NAT_T
 				up->u_flags &= ~UF_ESPINUDP_ALL;
 #endif
 				break;
 #ifdef IPSEC_NAT_T
 			case UDP_ENCAP_ESPINUDP:
 			case UDP_ENCAP_ESPINUDP_NON_IKE:
 				/* The two modes are mutually exclusive. */
 				up->u_flags &= ~UF_ESPINUDP_ALL;
 				if (optval == UDP_ENCAP_ESPINUDP)
 					up->u_flags |= UF_ESPINUDP;
 				else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE)
 					up->u_flags |= UF_ESPINUDP_NON_IKE;
 				break;
 #endif
 			default:
 				error = EINVAL;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 #ifdef IPSEC_NAT_T
 		case UDP_ENCAP:
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 			/* Read the flags under the lock, copy out without it. */
 			optval = up->u_flags & UF_ESPINUDP_ALL;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 #endif
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}	
 	return (error);
 }
 
 #ifdef INET
 #define	UH_WLOCKED	2
 #define	UH_RLOCKED	1
 #define	UH_UNLOCKED	0
 static int
 udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *td)
 {
 	struct udpiphdr *ui;
 	int len = m->m_pkthdr.len;
 	struct in_addr faddr, laddr;
 	struct cmsghdr *cm;
 	struct sockaddr_in *sin, src;
 	int error = 0;
 	int ipflags;
 	u_short fport, lport;
 	int unlock_udbinfo;
 	u_char tos;
 
 	/*
 	 * udp_output() may need to temporarily bind or connect the current
 	 * inpcb.  As such, we don't know up front whether we will need the
 	 * pcbinfo lock or not.  Do any work to decide what is needed up
 	 * front before acquiring any locks.
 	 */
 	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
 		if (control)
 			m_freem(control);
 		m_freem(m);
 		return (EMSGSIZE);
 	}
 
 	src.sin_family = 0;
 	INP_RLOCK(inp);
 	tos = inp->inp_ip_tos;
 	if (control != NULL) {
 		/*
 		 * XXX: Currently, we assume all the optional information is
 		 * stored in a single mbuf.
 		 */
 		if (control->m_next) {
 			INP_RUNLOCK(inp);
 			m_freem(control);
 			m_freem(m);
 			return (EINVAL);
 		}
 		for (; control->m_len > 0;
 		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
 		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 			cm = mtod(control, struct cmsghdr *);
 			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
 			    || cm->cmsg_len > control->m_len) {
 				error = EINVAL;
 				break;
 			}
 			if (cm->cmsg_level != IPPROTO_IP)
 				continue;
 
 			switch (cm->cmsg_type) {
 			case IP_SENDSRCADDR:
 				if (cm->cmsg_len !=
 				    CMSG_LEN(sizeof(struct in_addr))) {
 					error = EINVAL;
 					break;
 				}
 				bzero(&src, sizeof(src));
 				src.sin_family = AF_INET;
 				src.sin_len = sizeof(src);
 				src.sin_port = inp->inp_lport;
 				src.sin_addr =
 				    *(struct in_addr *)CMSG_DATA(cm);
 				break;
 
 			case IP_TOS:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
 					error = EINVAL;
 					break;
 				}
 				tos = *(u_char *)CMSG_DATA(cm);
 				break;
 
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			if (error)
 				break;
 		}
 		m_freem(control);
 	}
 	if (error) {
 		INP_RUNLOCK(inp);
 		m_freem(m);
 		return (error);
 	}
 
 	/*
 	 * Depending on whether or not the application has bound or connected
 	 * the socket, we may have to do varying levels of work.  The optimal
 	 * case is for a connected UDP socket, as a global lock isn't
 	 * required at all.
 	 *
 	 * In order to decide which we need, we require stability of the
 	 * inpcb binding, which we ensure by acquiring a read lock on the
 	 * inpcb.  This doesn't strictly follow the lock order, so we play
 	 * the trylock and retry game; note that we may end up with more
 	 * conservative locks than required the second time around, so later
 	 * assertions have to accept that.  Further analysis of the number of
 	 * misses under contention is required.
 	 *
 	 * XXXRW: Check that hash locking update here is correct.
 	 */
 	sin = (struct sockaddr_in *)addr;
 	if (sin != NULL &&
 	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
 		INP_RUNLOCK(inp);
 		INP_WLOCK(inp);
 		INP_HASH_WLOCK(&V_udbinfo);
 		unlock_udbinfo = UH_WLOCKED;
 	} else if ((sin != NULL && (
 	    (sin->sin_addr.s_addr == INADDR_ANY) ||
 	    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
 	    (inp->inp_laddr.s_addr == INADDR_ANY) ||
 	    (inp->inp_lport == 0))) ||
 	    (src.sin_family == AF_INET)) {
 		INP_HASH_RLOCK(&V_udbinfo);
 		unlock_udbinfo = UH_RLOCKED;
 	} else
 		unlock_udbinfo = UH_UNLOCKED;
 
 	/*
 	 * If the IP_SENDSRCADDR control message was specified, override the
 	 * source address for this datagram.  Its use is invalidated if the
 	 * address thus specified is incomplete or clobbers other inpcbs.
 	 */
 	laddr = inp->inp_laddr;
 	lport = inp->inp_lport;
 	if (src.sin_family == AF_INET) {
 		INP_HASH_LOCK_ASSERT(&V_udbinfo);
 		if ((lport == 0) ||
 		    (laddr.s_addr == INADDR_ANY &&
 		     src.sin_addr.s_addr == INADDR_ANY)) {
 			error = EINVAL;
 			goto release;
 		}
 		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
 		    &laddr.s_addr, &lport, td->td_ucred);
 		if (error)
 			goto release;
 	}
 
 	/*
 	 * If a UDP socket has been connected, then a local address/port will
 	 * have been selected and bound.
 	 *
 	 * If a UDP socket has not been connected to, then an explicit
 	 * destination address must be used, in which case a local
 	 * address/port may not have been selected and bound.
 	 */
 	if (sin != NULL) {
 		INP_LOCK_ASSERT(inp);
 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
 			error = EISCONN;
 			goto release;
 		}
 
 		/*
 		 * Jail may rewrite the destination address, so let it do
 		 * that before we use it.
 		 */
 		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
 		if (error)
 			goto release;
 
 		/*
 		 * If a local address or port hasn't yet been selected, or if
 		 * the destination address needs to be rewritten due to using
 		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
 		 * to do the heavy lifting.  Once a port is selected, we
 		 * commit the binding back to the socket; we also commit the
 		 * binding of the address if in jail.
 		 *
 		 * If we already have a valid binding and we're not
 		 * requesting a destination address rewrite, use a fast path.
 		 */
 		if (inp->inp_laddr.s_addr == INADDR_ANY ||
 		    inp->inp_lport == 0 ||
 		    sin->sin_addr.s_addr == INADDR_ANY ||
 		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
 			INP_HASH_LOCK_ASSERT(&V_udbinfo);
 			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
 			    &lport, &faddr.s_addr, &fport, NULL,
 			    td->td_ucred);
 			if (error)
 				goto release;
 
 			/*
 			 * XXXRW: Why not commit the port if the address is
 			 * !INADDR_ANY?
 			 */
 			/* Commit the local port if newly assigned. */
 			if (inp->inp_laddr.s_addr == INADDR_ANY &&
 			    inp->inp_lport == 0) {
 				INP_WLOCK_ASSERT(inp);
 				INP_HASH_WLOCK_ASSERT(&V_udbinfo);
 				/*
 				 * Remember addr if jailed, to prevent
 				 * rebinding.
 				 */
 				if (prison_flag(td->td_ucred, PR_IP4))
 					inp->inp_laddr = laddr;
 				inp->inp_lport = lport;
 				if (in_pcbinshash(inp) != 0) {
 					inp->inp_lport = 0;
 					error = EAGAIN;
 					goto release;
 				}
 				inp->inp_flags |= INP_ANONPORT;
 			}
 		} else {
 			faddr = sin->sin_addr;
 			fport = sin->sin_port;
 		}
 	} else {
 		INP_LOCK_ASSERT(inp);
 		faddr = inp->inp_faddr;
 		fport = inp->inp_fport;
 		if (faddr.s_addr == INADDR_ANY) {
 			error = ENOTCONN;
 			goto release;
 		}
 	}
 
 	/*
 	 * Calculate data length and get a mbuf for UDP, IP, and possible
 	 * link-layer headers.  Immediate slide the data pointer back forward
 	 * since we won't use that space at this layer.
 	 */
 	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto release;
 	}
 	m->m_data += max_linkhdr;
 	m->m_len -= max_linkhdr;
 	m->m_pkthdr.len -= max_linkhdr;
 
 	/*
 	 * Fill in mbuf with extended UDP header and addresses and length put
 	 * into network format.
 	 */
 	ui = mtod(m, struct udpiphdr *);
 	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
 	ui->ui_v = IPVERSION << 4;
 	ui->ui_pr = IPPROTO_UDP;
 	ui->ui_src = laddr;
 	ui->ui_dst = faddr;
 	ui->ui_sport = lport;
 	ui->ui_dport = fport;
 	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
 
 	/*
 	 * Set the Don't Fragment bit in the IP header.
 	 */
 	if (inp->inp_flags & INP_DONTFRAG) {
 		struct ip *ip;
 
 		ip = (struct ip *)&ui->ui_i;
 		ip->ip_off |= htons(IP_DF);
 	}
 
 	ipflags = 0;
 	if (inp->inp_socket->so_options & SO_DONTROUTE)
 		ipflags |= IP_ROUTETOIF;
 	if (inp->inp_socket->so_options & SO_BROADCAST)
 		ipflags |= IP_ALLOWBROADCAST;
 	if (inp->inp_flags & INP_ONESBCAST)
 		ipflags |= IP_SENDONES;
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 
 	/*
 	 * Set up checksum and output datagram.
 	 */
 	if (V_udp_cksum) {
 		if (inp->inp_flags & INP_ONESBCAST)
 			faddr.s_addr = INADDR_BROADCAST;
 		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
 		    htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
 		m->m_pkthdr.csum_flags = CSUM_UDP;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 	} else
 		ui->ui_sum = 0;
 	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
 	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
 	((struct ip *)ui)->ip_tos = tos;		/* XXX */
 	UDPSTAT_INC(udps_opackets);
 
 	if (unlock_udbinfo == UH_WLOCKED)
 		INP_HASH_WUNLOCK(&V_udbinfo);
 	else if (unlock_udbinfo == UH_RLOCKED)
 		INP_HASH_RUNLOCK(&V_udbinfo);
 	UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
 	error = ip_output(m, inp->inp_options, NULL, ipflags,
 	    inp->inp_moptions, inp);
 	if (unlock_udbinfo == UH_WLOCKED)
 		INP_WUNLOCK(inp);
 	else
 		INP_RUNLOCK(inp);
 	return (error);
 
 release:
 	if (unlock_udbinfo == UH_WLOCKED) {
 		INP_HASH_WUNLOCK(&V_udbinfo);
 		INP_WUNLOCK(inp);
 	} else if (unlock_udbinfo == UH_RLOCKED) {
 		INP_HASH_RUNLOCK(&V_udbinfo);
 		INP_RUNLOCK(inp);
 	} else
 		INP_RUNLOCK(inp);
 	m_freem(m);
 	return (error);
 }
 
 
 #if defined(IPSEC) && defined(IPSEC_NAT_T)
 /*
  * Potentially decap ESP in UDP frame.  Check for an ESP header
  * and optional marker; if present, strip the UDP header and
  * push the result through IPSec.
  *
  * 'm' starts at the IP header and 'off' is the offset of the UDP payload
  * within it, i.e. the IP header length plus sizeof(struct udphdr).  The
  * read lock on 'inp' is asserted on entry.
  *
  * Returns mbuf to be processed (potentially re-allocated) or
  * NULL if consumed and/or processed.
  */
 static struct mbuf *
 udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
 {
 	size_t minlen, payload, skip, iphlen;
 	caddr_t data;
 	struct udpcb *up;
 	struct m_tag *tag;
 	struct udphdr *udphdr;
 	struct ip *ip;
 
 	INP_RLOCK_ASSERT(inp);
 
 	/*
 	 * Pull up data so the longest case is contiguous:
 	 *    IP/UDP hdr + non ESP marker + ESP hdr.
 	 * Never ask for more than the packet actually holds.
 	 */
 	minlen = off + sizeof(uint64_t) + sizeof(struct esp);
 	if (minlen > m->m_pkthdr.len)
 		minlen = m->m_pkthdr.len;
 	if ((m = m_pullup(m, minlen)) == NULL) {
 		IPSECSTAT_INC(ips_in_inval);
 		return (NULL);		/* Bypass caller processing. */
 	}
 	data = mtod(m, caddr_t);	/* Points to ip header. */
 	payload = m->m_len - off;	/* Size of payload. */
 
 	if (payload == 1 && data[off] == '\xff')
 		return (m);		/* NB: keepalive packet, no decap. */
 
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
 	KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0,
 	    ("u_flags 0x%x", up->u_flags));
 
 	/*
 	 * Check that the payload is large enough to hold an
 	 * ESP header and compute the amount of data to remove.
 	 *
 	 * NB: the m_pullup() above made the bytes examined here
 	 *     contiguous in the first mbuf.
 	 * XXX can we assume alignment and eliminate bcopys?
 	 */
 	if (up->u_flags & UF_ESPINUDP_NON_IKE) {
 		/*
 		 * draft-ietf-ipsec-nat-t-ike-0[01].txt and
 		 * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring
 		 * possible AH mode non-IKE marker+non-ESP marker
 		 * from draft-ietf-ipsec-udp-encaps-00.txt.
 		 */
 		uint64_t marker;
 
 		/* Too short for marker + ESP: hand back to the caller. */
 		if (payload <= sizeof(uint64_t) + sizeof(struct esp))
 			return (m);	/* NB: no decap. */
 		bcopy(data + off, &marker, sizeof(uint64_t));
 		if (marker != 0)	/* Non-IKE marker. */
 			return (m);	/* NB: no decap. */
 		/* Strip both the UDP header and the 8-byte marker. */
 		skip = sizeof(uint64_t) + sizeof(struct udphdr);
 	} else {
 		uint32_t spi;
 
 		/* Unlike the branch above, a short payload is dropped here. */
 		if (payload <= sizeof(struct esp)) {
 			IPSECSTAT_INC(ips_in_inval);
 			m_freem(m);
 			return (NULL);	/* Discard. */
 		}
 		bcopy(data + off, &spi, sizeof(uint32_t));
 		if (spi == 0)		/* Non-ESP marker. */
 			return (m);	/* NB: no decap. */
 		/* Only the UDP header is stripped. */
 		skip = sizeof(struct udphdr);
 	}
 
 	/*
 	 * Setup a PACKET_TAG_IPSEC_NAT_T_PORTS tag to remember
 	 * the UDP ports. This is required if we want to select
 	 * the right SPD for multiple hosts behind same NAT.
 	 *
 	 * NB: ports are maintained in network byte order everywhere
 	 *     in the NAT-T code.
 	 */
 	tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
 		2 * sizeof(uint16_t), M_NOWAIT);
 	if (tag == NULL) {
 		IPSECSTAT_INC(ips_in_nomem);
 		m_freem(m);
 		return (NULL);		/* Discard. */
 	}
 	iphlen = off - sizeof(struct udphdr);
 	udphdr = (struct udphdr *)(data + iphlen);
 	/* Tag payload: [0] = source port, [1] = destination port. */
 	((uint16_t *)(tag + 1))[0] = udphdr->uh_sport;
 	((uint16_t *)(tag + 1))[1] = udphdr->uh_dport;
 	m_tag_prepend(m, tag);
 
 	/*
 	 * Remove the UDP header (and possibly the non ESP marker)
 	 * IP header length is iphlen
 	 * Before:
 	 *   <--- off --->
 	 *   +----+------+-----+
 	 *   | IP |  UDP | ESP |
 	 *   +----+------+-----+
 	 *        <-skip->
 	 * After:
 	 *          +----+-----+
 	 *          | IP | ESP |
 	 *          +----+-----+
 	 *   <-skip->
 	 */
 	/* Slide the IP header forward by skip bytes, then trim the front. */
 	ovbcopy(data, data + skip, iphlen);
 	m_adj(m, skip);
 
 	/* Re-fetch the header at its new position and patch length/protocol. */
 	ip = mtod(m, struct ip *);
 	ip->ip_len = htons(ntohs(ip->ip_len) - skip);
 	ip->ip_p = IPPROTO_ESP;
 
 	/*
 	 * We cannot yet update the cksums so clear any
 	 * h/w cksum flags as they are no longer valid.
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)
 		m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
 
 	/* The result of IPsec input processing is deliberately ignored. */
 	(void) ipsec4_common_input(m, iphlen, ip->ip_p);
 	return (NULL);			/* NB: consumed, bypass processing. */
 }
 #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
 
 /*
  * pru_abort handler: if the socket has a foreign address, disconnect the
  * PCB, forget the local address and mark the socket disconnected.
  */
 static void
 udp_abort(struct socket *so)
 {
 	struct inpcb *pcb = sotoinpcb(so);
 
 	KASSERT(pcb != NULL, ("udp_abort: inp == NULL"));
 	INP_WLOCK(pcb);
 	if (pcb->inp_faddr.s_addr == INADDR_ANY) {
 		/* Not connected: nothing to tear down. */
 		INP_WUNLOCK(pcb);
 		return;
 	}
 	INP_HASH_WLOCK(&V_udbinfo);
 	in_pcbdisconnect(pcb);
 	pcb->inp_laddr.s_addr = INADDR_ANY;
 	INP_HASH_WUNLOCK(&V_udbinfo);
 	soisdisconnected(so);
 	INP_WUNLOCK(pcb);
 }
 
 /*
  * pru_attach handler: reserve socket buffer space, allocate an inpcb for
  * the socket, mark it IPv4 with the default TTL and attach a udpcb.
  * Returns 0 on success or the error from the failing step.
  */
 static int
 udp_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *pcb;
 	int rc;
 
 	pcb = sotoinpcb(so);
 	KASSERT(pcb == NULL, ("udp_attach: inp != NULL"));
 	rc = soreserve(so, udp_sendspace, udp_recvspace);
 	if (rc != 0)
 		return (rc);
 
 	INP_INFO_WLOCK(&V_udbinfo);
 	rc = in_pcballoc(so, &V_udbinfo);
 	if (rc != 0)
 		goto unlock;
 
 	pcb = sotoinpcb(so);
 	pcb->inp_vflag |= INP_IPV4;
 	pcb->inp_ip_ttl = V_ip_defttl;
 
 	rc = udp_newudpcb(pcb);
 	if (rc != 0) {
 		/* Undo the inpcb allocation; no explicit unlock on this path. */
 		in_pcbdetach(pcb);
 		in_pcbfree(pcb);
 	} else
 		INP_WUNLOCK(pcb);
 unlock:
 	INP_INFO_WUNLOCK(&V_udbinfo);
 	return (rc);
 }
 #endif /* INET */
 
 /*
  * Install 'f' as the tunneling callback of a datagram socket's udpcb.
  * Returns EBUSY if a callback is already installed, 0 otherwise.
  */
 int
 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f)
 {
 	struct inpcb *pcb;
 	struct udpcb *ucb;
 	int rc;
 
 	KASSERT(so->so_type == SOCK_DGRAM,
 	    ("udp_set_kernel_tunneling: !dgram"));
 	pcb = sotoinpcb(so);
 	KASSERT(pcb != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
 	INP_WLOCK(pcb);
 	ucb = intoudpcb(pcb);
 	if (ucb->u_tun_func == NULL) {
 		ucb->u_tun_func = f;
 		rc = 0;
 	} else
 		rc = EBUSY;
 	INP_WUNLOCK(pcb);
 	return (rc);
 }
 
 #ifdef INET
 /*
  * pru_bind handler: bind the socket's inpcb to 'nam' using the calling
  * thread's credentials, with the inpcb and the UDP hash write-locked.
  */
 static int
 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *pcb = sotoinpcb(so);
 	int rc;
 
 	KASSERT(pcb != NULL, ("udp_bind: inp == NULL"));
 	INP_WLOCK(pcb);
 	INP_HASH_WLOCK(&V_udbinfo);
 	rc = in_pcbbind(pcb, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(&V_udbinfo);
 	INP_WUNLOCK(pcb);
 	return (rc);
 }
 
 /*
  * pru_close handler: if the socket has a foreign address, disconnect the
  * PCB, forget the local address and mark the socket disconnected.
  */
 static void
 udp_close(struct socket *so)
 {
 	struct inpcb *pcb = sotoinpcb(so);
 
 	KASSERT(pcb != NULL, ("udp_close: inp == NULL"));
 	INP_WLOCK(pcb);
 	if (pcb->inp_faddr.s_addr == INADDR_ANY) {
 		/* Not connected: nothing to tear down. */
 		INP_WUNLOCK(pcb);
 		return;
 	}
 	INP_HASH_WLOCK(&V_udbinfo);
 	in_pcbdisconnect(pcb);
 	pcb->inp_laddr.s_addr = INADDR_ANY;
 	INP_HASH_WUNLOCK(&V_udbinfo);
 	soisdisconnected(so);
 	INP_WUNLOCK(pcb);
 }
 
 /*
  * pru_connect handler.  Fails with EISCONN when a foreign address is
  * already set; otherwise runs the jail check on the destination address
  * and connects the inpcb, marking the socket connected on success.
  */
 static int
 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *pcb;
 	struct sockaddr_in *dst;
 	int rc;
 
 	pcb = sotoinpcb(so);
 	KASSERT(pcb != NULL, ("udp_connect: inp == NULL"));
 	INP_WLOCK(pcb);
 	if (pcb->inp_faddr.s_addr != INADDR_ANY) {
 		rc = EISCONN;
 		goto out;
 	}
 	dst = (struct sockaddr_in *)nam;
 	rc = prison_remote_ip4(td->td_ucred, &dst->sin_addr);
 	if (rc != 0)
 		goto out;
 	INP_HASH_WLOCK(&V_udbinfo);
 	rc = in_pcbconnect(pcb, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(&V_udbinfo);
 	if (rc == 0)
 		soisconnected(so);
 out:
 	INP_WUNLOCK(pcb);
 	return (rc);
 }
 
 /*
  * pru_detach handler: detach and free the socket's inpcb, then discard the
  * udpcb that was attached to it.  The socket must already be disconnected
  * (asserted below).
  */
 static void
 udp_detach(struct socket *so)
 {
 	struct inpcb *inp;
 	struct udpcb *up;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
 	    ("udp_detach: not disconnected"));
 	INP_INFO_WLOCK(&V_udbinfo);
 	INP_WLOCK(inp);
 	/* Fetch the udpcb before inp_ppcb is cleared; it is discarded last. */
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: up == NULL", __func__));
 	inp->inp_ppcb = NULL;
 	in_pcbdetach(inp);
 	/*
 	 * NOTE(review): no INP_WUNLOCK() follows, so in_pcbfree() presumably
 	 * releases the inpcb lock taken above -- confirm against in_pcb.c.
 	 */
 	in_pcbfree(inp);
 	INP_INFO_WUNLOCK(&V_udbinfo);
 	udp_discardcb(up);
 }
 
 /*
  * pru_disconnect handler.  Returns ENOTCONN when no foreign address is
  * set; otherwise disconnects the inpcb, clears the local address and the
  * socket's SS_ISCONNECTED state bit, and returns 0.
  */
 static int
 udp_disconnect(struct socket *so)
 {
 	struct inpcb *pcb;
 	int rc;
 
 	pcb = sotoinpcb(so);
 	KASSERT(pcb != NULL, ("udp_disconnect: inp == NULL"));
 	INP_WLOCK(pcb);
 	if (pcb->inp_faddr.s_addr != INADDR_ANY) {
 		INP_HASH_WLOCK(&V_udbinfo);
 		in_pcbdisconnect(pcb);
 		pcb->inp_laddr.s_addr = INADDR_ANY;
 		INP_HASH_WUNLOCK(&V_udbinfo);
 		SOCK_LOCK(so);
 		so->so_state &= ~SS_ISCONNECTED;		/* XXX */
 		SOCK_UNLOCK(so);
 		rc = 0;
 	} else
 		rc = ENOTCONN;
 	INP_WUNLOCK(pcb);
 	return (rc);
 }
 
 /*
  * pru_send handler: hand the data, optional destination and control
  * mbufs to udp_output() for this socket's inpcb.  'flags' is not used.
  */
 static int
 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *td)
 {
 	struct inpcb *pcb = sotoinpcb(so);
 
 	KASSERT(pcb != NULL, ("udp_send: inp == NULL"));
 	return (udp_output(pcb, m, addr, control, td));
 }
 #endif /* INET */
 
 /*
  * pru_shutdown handler: mark the socket as unable to send any more data,
  * with the inpcb write-locked.  Always returns 0.
  */
 int
 udp_shutdown(struct socket *so)
 {
 	struct inpcb *pcb = sotoinpcb(so);
 
 	KASSERT(pcb != NULL, ("udp_shutdown: inp == NULL"));
 	INP_WLOCK(pcb);
 	socantsendmore(so);
 	INP_WUNLOCK(pcb);
 	return (0);
 }
 
 #ifdef INET
 /*
  * User-request dispatch table for UDP sockets, built only when INET is
  * configured.  Entries named udp_* are the handlers defined in this file;
  * the in_*, soreceive_dgram and sosend_dgram entries are shared routines
  * defined elsewhere.
  */
 struct pr_usrreqs udp_usrreqs = {
 	.pru_abort =		udp_abort,
 	.pru_attach =		udp_attach,
 	.pru_bind =		udp_bind,
 	.pru_connect =		udp_connect,
 	.pru_control =		in_control,
 	.pru_detach =		udp_detach,
 	.pru_disconnect =	udp_disconnect,
 	.pru_peeraddr =		in_getpeeraddr,
 	.pru_send =		udp_send,
 	.pru_soreceive =	soreceive_dgram,
 	.pru_sosend =		sosend_dgram,
 	.pru_shutdown =		udp_shutdown,
 	.pru_sockaddr =		in_getsockaddr,
 	.pru_sosetlabel =	in_pcbsosetlabel,
 	.pru_close =		udp_close,
 };
 #endif /* INET */
Index: head/sys/netinet6/ip6_input.c
===================================================================
--- head/sys/netinet6/ip6_input.c	(revision 258540)
+++ head/sys/netinet6/ip6_input.c	(revision 258541)
@@ -1,1890 +1,1889 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
-#include "opt_kdtrace.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_systm.h>
 #include <net/if_llatbl.h>
 #ifdef INET
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #endif /* INET */
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/icmp6.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/nd6.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netinet6/ip6_ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif /* IPSEC */
 
 #include <netinet6/ip6protosw.h>
 
 #ifdef FLOWTABLE
 #include <net/flowtable.h>
 VNET_DECLARE(int, ip6_output_flowtable_size);
 #define	V_ip6_output_flowtable_size	VNET(ip6_output_flowtable_size)
 #endif
 
 extern struct domain inet6domain;
 
 u_char ip6_protox[IPPROTO_MAX];
 VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead);
 VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl);
 VNET_DEFINE(u_long, in6_ifaddrhmask);
 
 /*
  * netisr handler descriptor for IPv6 input: packets queued under
  * NETISR_IPV6 are delivered to ip6_input().  Registered from ip6_init()
  * via netisr_register().
  */
 static struct netisr_handler ip6_nh = {
 	.nh_name = "ip6",
 	.nh_handler = ip6_input,
 	.nh_proto = NETISR_IPV6,
 	.nh_policy = NETISR_POLICY_FLOW,
 };
 
 VNET_DECLARE(struct callout, in6_tmpaddrtimer_ch);
 #define	V_in6_tmpaddrtimer_ch		VNET(in6_tmpaddrtimer_ch)
 
 VNET_DEFINE(struct pfil_head, inet6_pfil_hook);
 
 VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat);
 VNET_PCPUSTAT_SYSINIT(ip6stat);
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ip6stat);
 #endif /* VIMAGE */
 
 struct rwlock in6_ifaddr_lock;
 RW_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock");
 
 static void ip6_init2(void *);
 static struct ip6aux *ip6_setdstifaddr(struct mbuf *, struct in6_ifaddr *);
 static struct ip6aux *ip6_addaux(struct mbuf *);
 static struct ip6aux *ip6_findaux(struct mbuf *m);
 static void ip6_delaux (struct mbuf *);
 static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
 #ifdef PULLDOWN_TEST
 static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int);
 #endif
 
 /*
  * IP6 initialization: fill in IP6 protocol switch table.
  * All protocols not implemented in kernel go to raw IP6 protocol handler.
  *
  * Everything up to the IS_DEFAULT_VNET() check (tunables, address list and
  * hash, pfil head, scope6/addrsel/nd6/frag6 setup, optional flowtable,
  * desync factor) runs for every vnet.  Filling ip6_protox[] and
  * registering the netisr handler happen only for the default vnet.
  */
 void
 ip6_init(void)
 {
 	struct ip6protosw *pr;
 	int i;
 
 	TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal",
 	    &V_ip6_auto_linklocal);
 	TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv);
 	TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr);
 
 	TAILQ_INIT(&V_in6_ifaddrhead);
 	V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR,
 	    &V_in6_ifaddrhmask);
 
 	/* Initialize packet filter hooks. */
 	V_inet6_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_inet6_pfil_hook.ph_af = AF_INET6;
 	/* A registration failure is only reported, not treated as fatal. */
 	if ((i = pfil_head_register(&V_inet6_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil hook, "
 			"error %d\n", __func__, i);
 
 	scope6_init();
 	addrsel_policy_init();
 	nd6_init();
 	frag6_init();
 
 #ifdef FLOWTABLE
 	/*
 	 * A size supplied through the tunable is clamped to at least 256 and
 	 * replaced by 2048 if it is not a power of two.
 	 */
 	if (TUNABLE_INT_FETCH("net.inet6.ip6.output_flowtable_size",
 		&V_ip6_output_flowtable_size)) {
 		if (V_ip6_output_flowtable_size < 256)
 			V_ip6_output_flowtable_size = 256;
 		if (!powerof2(V_ip6_output_flowtable_size)) {
 			printf("flowtable must be power of 2 size\n");
 			V_ip6_output_flowtable_size = 2048;
 		}
 	} else {
 		/*
 		 * round up to the next power of 2
 		 */
 		V_ip6_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1);
 	}
 	V_ip6_ft = flowtable_alloc("ipv6", V_ip6_output_flowtable_size, FL_IPV6|FL_PCPU);
 #endif	
 	
 	V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR;
 
 	/* Skip global initialization stuff for non-default instances. */
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
 #ifdef DIAGNOSTIC
 	if (sizeof(struct protosw) != sizeof(struct ip6protosw))
 		panic("sizeof(protosw) != sizeof(ip6protosw)");
 #endif
 	pr = (struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		panic("ip6_init");
 
 	/*
 	 * Initialize the entire ip6_protox[] array to IPPROTO_RAW.
 	 * Each slot stores an index into inet6sw[] (pr - inet6sw).
 	 */
 	for (i = 0; i < IPPROTO_MAX; i++)
 		ip6_protox[i] = pr - inet6sw;
 	/*
 	 * Cycle through IP protocols and put them into the appropriate place
 	 * in ip6_protox[].
 	 */
 	for (pr = (struct ip6protosw *)inet6domain.dom_protosw;
 	    pr < (struct ip6protosw *)inet6domain.dom_protoswNPROTOSW; pr++)
 		if (pr->pr_domain->dom_family == PF_INET6 &&
 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
 			/* Be careful to only index valid IP protocols. */
 			if (pr->pr_protocol < IPPROTO_MAX)
 				ip6_protox[pr->pr_protocol] = pr - inet6sw;
 		}
 
 	netisr_register(&ip6_nh);
 }
 
 /*
  * Point the ip6_protox[] slot for 'ip6proto' at that protocol's entry in
  * inet6sw[].  The protocol must already be present in inet6sw[], either
  * statically or through pf_proto_register().
  *
  * Returns 0 on success, EPROTONOSUPPORT for an out-of-range or unknown
  * protocol, EPFNOSUPPORT if the raw IPv6 protosw cannot be found, and
  * EEXIST if the slot is already taken by another protocol.
  */
 int
 ip6proto_register(short ip6proto)
 {
 	struct ip6protosw *rawsw, *sw;
 
 	/* Sanity checks. */
 	if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * A slot is free only while it still holds the index of the
 	 * IPPROTO_RAW entry; anything else means it is occupied.
 	 */
 	rawsw = (struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW,
 	    SOCK_RAW);
 	if (rawsw == NULL)
 		return (EPFNOSUPPORT);
 	if (ip6_protox[ip6proto] != rawsw - inet6sw)	/* IPPROTO_RAW */
 		return (EEXIST);
 
 	/* Locate the protocol in inet6sw[] and record its index. */
 	for (sw = (struct ip6protosw *)inet6domain.dom_protosw;
 	    sw < (struct ip6protosw *)inet6domain.dom_protoswNPROTOSW; sw++) {
 		if (sw->pr_domain->dom_family != PF_INET6)
 			continue;
 		if (sw->pr_protocol == 0 || sw->pr_protocol != ip6proto)
 			continue;
 		ip6_protox[sw->pr_protocol] = sw - inet6sw;
 		return (0);
 	}
 	return (EPROTONOSUPPORT);
 }
 
 /*
  * Reset the ip6_protox[] slot for 'ip6proto' back to the IPPROTO_RAW
  * entry.  Returns 0 on success, EPROTONOSUPPORT for an out-of-range
  * protocol, EPFNOSUPPORT if the raw IPv6 protosw cannot be found, and
  * ENOENT if the slot was not registered in the first place.
  */
 int
 ip6proto_unregister(short ip6proto)
 {
 	struct ip6protosw *rawsw;
 
 	/* Sanity checks. */
 	if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	rawsw = (struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW,
 	    SOCK_RAW);
 	if (rawsw == NULL)
 		return (EPFNOSUPPORT);
 
 	/* A slot that differs from the IPPROTO_RAW index is registered. */
 	if (ip6_protox[ip6proto] != rawsw - inet6sw) {
 		ip6_protox[ip6proto] = rawsw - inet6sw;
 		return (0);
 	}
 	return (ENOENT);
 }
 
 #ifdef VIMAGE
 void
 ip6_destroy()
 {
 	int i;
 
 	if ((i = pfil_head_unregister(&V_inet6_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to unregister pfil hook, "
 		    "error %d\n", __func__, i);
 	hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask);
 	nd6_destroy();
 	callout_drain(&V_in6_tmpaddrtimer_ch);
 }
 #endif
 
 /*
  * Second-stage per-vnet initialization: initialize and arm the ND6 timer
  * and the temporary-address regeneration timer.  The argument is unused;
  * always returns 0.
  */
 static int
 ip6_init2_vnet(const void *unused __unused)
 {
 
 	/* nd6_timer_init: first expiry after hz ticks. */
 	callout_init(&V_nd6_timer_ch, 0);
 	callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet);
 
 	/*
 	 * Timer for regeneration of the temporary addresses' randomized ID.
 	 * The delay is the preferred lifetime minus the desync factor and the
 	 * regeneration advance, scaled by hz.
 	 */
 	callout_init(&V_in6_tmpaddrtimer_ch, 0);
 	callout_reset(&V_in6_tmpaddrtimer_ch,
 		      (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor -
 		       V_ip6_temp_regen_advance) * hz,
 		      in6_tmpaddrtimer, curvnet);
 
 	return (0);
 }
 
 /*
  * SYSINIT entry point for second-stage IPv6 initialization; forwards to
  * ip6_init2_vnet() and discards its result.  'dummy' is unused.
  */
 static void
 ip6_init2(void *dummy)
 {
 
 	(void)ip6_init2_vnet(NULL);
 }
 
 /* cheat */
 /* This must be after route_init(), which is now SI_ORDER_THIRD */
 SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL);
 
 /*
  * Process the Hop-by-Hop options header following the fixed IPv6 header.
  *
  * Returns 0 on success, with *nxt set to the next-header value from the
  * Hop-by-Hop header and *ours set to 1 for an MLD router alert while
  * forwarding is enabled.  Returns 1 when input processing must stop.
  */
 static int
 ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off,
     int *nxt, int *ours)
 {
 	struct ip6_hdr *ip6;
 	struct ip6_hbh *hbh;
 
 	if (ip6_hopopts_input(plen, rtalert, &m, off) != 0) {
 #if 0	/*touches NULL pointer*/
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 #endif
 		/* m has already been freed. */
 		return (1);
 	}
 
 	/* The options parser may have moved the data; re-fetch the header. */
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * A zero payload length together with a Hop-by-Hop header requires
 	 * a Jumbo Payload option.  When a valid one is present,
 	 * ip6_hopopts_input() stores a non-zero length in *plen, so both
 	 * being zero means the option is missing.
 	 */
 	if (ip6->ip6_plen == 0 && *plen == 0) {
 		IP6STAT_INC(ip6s_badoptions);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
 		icmp6_error(m, ICMP6_PARAM_PROB,
 			    ICMP6_PARAMPROB_HEADER,
 			    (caddr_t)&ip6->ip6_plen - (caddr_t)ip6);
 		return (1);
 	}
 #ifndef PULLDOWN_TEST
 	/* ip6_hopopts_input() ensures that mbuf is contiguous */
 	hbh = (struct ip6_hbh *)(ip6 + 1);
 #else
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
 		sizeof(struct ip6_hbh));
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		return (1);
 	}
 #endif
 	*nxt = hbh->ip6h_nxt;
 
 	/*
 	 * When acting as a router, a packet carrying the MLD router alert
 	 * value is claimed so it can be passed to the multicast routing
 	 * daemon.  MLD is the only value supported; RFC2711 requires that
 	 * unrecognized values be silently ignored.
 	 */
 	if (*rtalert != ~0 && *rtalert == IP6OPT_RTALERT_MLD &&
 	    V_ip6_forwarding)
 		*ours = 1;
 
 	return (0);
 }
 
 void
 ip6_input(struct mbuf *m)
 {
 	struct ip6_hdr *ip6;
 	int off = sizeof(struct ip6_hdr), nest;
 	u_int32_t plen;
 	u_int32_t rtalert = ~0;
 	int nxt, ours = 0;
 	struct ifnet *deliverifp = NULL, *ifp = NULL;
 	struct in6_addr odst;
 	struct route_in6 rin6;
 	int srcrt = 0;
 	struct llentry *lle = NULL;
 	struct sockaddr_in6 dst6, *dst;
 
 	bzero(&rin6, sizeof(struct route_in6));
 #ifdef IPSEC
 	/*
 	 * should the inner packet be considered authentic?
 	 * see comment in ah4_input().
 	 * NB: m cannot be NULL when passed to the input routine
 	 */
 
 	m->m_flags &= ~M_AUTHIPHDR;
 	m->m_flags &= ~M_AUTHIPDGM;
 
 #endif /* IPSEC */
 
 	/*
 	 * make sure we don't have onion peering information into m_tag.
 	 */
 	ip6_delaux(m);
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		/*
 		 * Firewall changed destination to local.
 		 */
 		m->m_flags &= ~M_FASTFWD_OURS;
 		ours = 1;
 		deliverifp = m->m_pkthdr.rcvif;
 		ip6 = mtod(m, struct ip6_hdr *);
 		goto hbhcheck;
 	}
 
 	/*
 	 * mbuf statistics
 	 */
 	if (m->m_flags & M_EXT) {
 		if (m->m_next)
 			IP6STAT_INC(ip6s_mext2m);
 		else
 			IP6STAT_INC(ip6s_mext1);
 	} else {
 		if (m->m_next) {
 			if (m->m_flags & M_LOOP) {
 				IP6STAT_INC(ip6s_m2m[V_loif->if_index]);
 			} else if (m->m_pkthdr.rcvif->if_index < IP6S_M2MMAX)
 				IP6STAT_INC(
 				    ip6s_m2m[m->m_pkthdr.rcvif->if_index]);
 			else
 				IP6STAT_INC(ip6s_m2m[0]);
 		} else
 			IP6STAT_INC(ip6s_m1);
 	}
 
 	/* drop the packet if IPv6 operation is disabled on the IF */
 	if ((ND_IFINFO(m->m_pkthdr.rcvif)->flags & ND6_IFF_IFDISABLED)) {
 		m_freem(m);
 		return;
 	}
 
 	in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive);
 	IP6STAT_INC(ip6s_total);
 
 #ifndef PULLDOWN_TEST
 	/*
 	 * L2 bridge code and some other code can return mbuf chain
 	 * that does not conform to KAME requirement.  too bad.
 	 * XXX: fails to join if interface MTU > MCLBYTES.  jumbogram?
 	 */
 	if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) {
 		struct mbuf *n;
 
 		if (m->m_pkthdr.len > MHLEN)
 			n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 			n = m_gethdr(M_NOWAIT, MT_DATA);
 		if (n == NULL) {
 			m_freem(m);
 			return;	/* ENOBUFS */
 		}
 
 		m_move_pkthdr(n, m);
 		m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t));
 		n->m_len = n->m_pkthdr.len;
 		m_freem(m);
 		m = n;
 	}
 	IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */);
 #endif
 
 	if (m->m_len < sizeof(struct ip6_hdr)) {
 		struct ifnet *inifp;
 		inifp = m->m_pkthdr.rcvif;
 		if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
 			IP6STAT_INC(ip6s_toosmall);
 			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
 			return;
 		}
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 		IP6STAT_INC(ip6s_badvers);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
 		goto bad;
 	}
 
 	IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]);
 
 	IP_PROBE(receive, NULL, NULL, ip6, m->m_pkthdr.rcvif, NULL, ip6);
 
 	/*
 	 * Check against address spoofing/corruption.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
 		/*
 		 * XXX: "badscope" is not very suitable for a multicast source.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) &&
 	    !(m->m_flags & M_LOOP)) {
 		/*
 		 * In this case, the packet should come from the loopback
 		 * interface.  However, we cannot just check the if_flags,
 		 * because ip6_mloopback() passes the "actual" interface
 		 * as the outgoing/incoming interface.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 
 #ifdef ALTQ
 	if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) {
 		/* packet is dropped by traffic conditioner */
 		return;
 	}
 #endif
 	/*
 	 * The following check is not documented in specs.  A malicious
 	 * party may be able to use IPv4 mapped addr to confuse tcp/udp stack
 	 * and bypass security checks (act as if it was from 127.0.0.1 by using
 	 * IPv6 src ::ffff:127.0.0.1).  Be cautious.
 	 *
 	 * This check chokes if we are in an SIIT cloud.  As none of BSDs
 	 * support IPv4-less kernel compilation, we cannot support SIIT
 	 * environment at all.  So, it makes more sense for us to reject any
 	 * malicious packets for non-SIIT environment, than try to do a
 	 * partial support for SIIT environment.
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 #if 0
 	/*
 	 * Reject packets with IPv4 compatible addresses (auto tunnel).
 	 *
 	 * The code forbids auto tunnel relay case in RFC1933 (the check is
 	 * stronger than RFC1933).  We may want to re-enable it if mech-xx
 	 * is revised to forbid relaying case.
 	 */
 	if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 #endif
 #ifdef IPSEC
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (ip6_ipsec_filtertunnel(m))
 		goto passin;
 #endif /* IPSEC */
 
 	/*
 	 * Run through list of hooks for input packets.
 	 *
 	 * NB: Beware of the destination address changing
 	 *     (e.g. by NAT rewriting).  When this happens,
 	 *     tell ip6_forward to do the right thing.
 	 */
 	odst = ip6->ip6_dst;
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet6_pfil_hook))
 		goto passin;
 
 	if (pfil_run_hooks(&V_inet6_pfil_hook, &m,
 	    m->m_pkthdr.rcvif, PFIL_IN, NULL))
 		return;
 	if (m == NULL)			/* consumed by filter */
 		return;
 	ip6 = mtod(m, struct ip6_hdr *);
 	srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst);
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		ours = 1;
 		deliverifp = m->m_pkthdr.rcvif;
 		goto hbhcheck;
 	}
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) {
 		/*
 		 * Directly ship the packet on.  This allows forwarding
 		 * packets originally destined to us to some other directly
 		 * connected host.
 		 */
 		ip6_forward(m, 1);
 		goto out;
 	}
 
 passin:
 	/*
 	 * Disambiguate address scope zones (if there is ambiguity).
 	 * We first make sure that the original source or destination address
 	 * is not in our internal form for scoped addresses.  Such addresses
 	 * are not necessarily invalid spec-wise, but we cannot accept them due
 	 * to the usage conflict.
 	 * in6_setscope() then also checks and rejects the cases where src or
 	 * dst are the loopback address and the receiving interface
 	 * is not loopback.
 	 */
 	if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope); /* XXX */
 		goto bad;
 	}
 	if (in6_setscope(&ip6->ip6_src, m->m_pkthdr.rcvif, NULL) ||
 	    in6_setscope(&ip6->ip6_dst, m->m_pkthdr.rcvif, NULL)) {
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 
 	/*
 	 * Multicast check. Assume packet is for us to avoid
 	 * prematurely taking locks.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		ours = 1;
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast);
 		deliverifp = m->m_pkthdr.rcvif;
 		goto hbhcheck;
 	}
 
 	/*
 	 *  Unicast check
 	 */
 
 	bzero(&dst6, sizeof(dst6));
 	dst6.sin6_family = AF_INET6;
 	dst6.sin6_len = sizeof(struct sockaddr_in6);
 	dst6.sin6_addr = ip6->ip6_dst;
 	ifp = m->m_pkthdr.rcvif;
 	IF_AFDATA_RLOCK(ifp);
 	lle = lla_lookup(LLTABLE6(ifp), 0,
 	     (struct sockaddr *)&dst6);
 	IF_AFDATA_RUNLOCK(ifp);
 	if ((lle != NULL) && (lle->la_flags & LLE_IFADDR)) {
 		struct ifaddr *ifa;
 		struct in6_ifaddr *ia6;
 		int bad;
 
 		bad = 1;
 #define	sa_equal(a1, a2)						\
 	(bcmp((a1), (a2), ((a1))->sin6_len) == 0)
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != dst6.sin6_family)
 				continue;
 			if (sa_equal(&dst6, ifa->ifa_addr))
 				break;
 		}
 		KASSERT(ifa != NULL, ("%s: ifa not found for lle %p",
 		    __func__, lle));
 #undef sa_equal
 
 		ia6 = (struct in6_ifaddr *)ifa;
 		if (!(ia6->ia6_flags & IN6_IFF_NOTREADY)) {
 			/* Count the packet in the ip address stats */
 			counter_u64_add(ia6->ia_ifa.ifa_ipackets, 1);
 			counter_u64_add(ia6->ia_ifa.ifa_ibytes,
 			    m->m_pkthdr.len);
 
 			/*
 			 * record address information into m_tag.
 			 */
 			(void)ip6_setdstifaddr(m, ia6);
 
 			bad = 0;
 		} else {
 			char ip6bufs[INET6_ADDRSTRLEN];
 			char ip6bufd[INET6_ADDRSTRLEN];
 			/* address is not ready, so discard the packet. */
 			nd6log((LOG_INFO,
 			    "ip6_input: packet to an unready address %s->%s\n",
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst)));
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		LLE_RUNLOCK(lle);
 		if (bad)
 			goto bad;
 		else {
 			ours = 1;
 			deliverifp = ifp;
 			goto hbhcheck;
 		}
 	}
 	if (lle != NULL)
 		LLE_RUNLOCK(lle);
 
 	dst = &rin6.ro_dst;
 	dst->sin6_len = sizeof(struct sockaddr_in6);
 	dst->sin6_family = AF_INET6;
 	dst->sin6_addr = ip6->ip6_dst;
 	rin6.ro_rt = in6_rtalloc1((struct sockaddr *)dst, 0, 0, M_GETFIB(m));
 	if (rin6.ro_rt)
 		RT_UNLOCK(rin6.ro_rt);
 
 #define rt6_key(r) ((struct sockaddr_in6 *)((r)->rt_nodes->rn_key))
 
 	/*
 	 * Accept the packet if the forwarding interface to the destination
 	 * according to the routing table is the loopback interface,
 	 * unless the associated route has a gateway.
 	 * Note that this approach causes to accept a packet if there is a
 	 * route to the loopback interface for the destination of the packet.
 	 * But we think it's even useful in some situations, e.g. when using
 	 * a special daemon which wants to intercept the packet.
 	 *
 	 * XXX: some OSes automatically make a cloned route for the destination
 	 * of an outgoing packet.  If the outgoing interface of the packet
 	 * is a loopback one, the kernel would consider the packet to be
 	 * accepted, even if we have no such address assigned on the interface.
 	 * We check the cloned flag of the route entry to reject such cases,
 	 * assuming that route entries for our own addresses are not made by
 	 * cloning (it should be true because in6_addloop explicitly installs
 	 * the host route).  However, we might have to do an explicit check
 	 * while it would be less efficient.  Or, should we rather install a
 	 * reject route for such a case?
 	 */
 	if (rin6.ro_rt &&
 	    (rin6.ro_rt->rt_flags &
 	     (RTF_HOST|RTF_GATEWAY)) == RTF_HOST &&
 #ifdef RTF_WASCLONED
 	    !(rin6.ro_rt->rt_flags & RTF_WASCLONED) &&
 #endif
 #ifdef RTF_CLONED
 	    !(rin6.ro_rt->rt_flags & RTF_CLONED) &&
 #endif
 #if 0
 	    /*
 	     * The check below is redundant since the comparison of
 	     * the destination and the key of the rtentry has
 	     * already done through looking up the routing table.
 	     */
 	    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
 	    &rt6_key(rin6.ro_rt)->sin6_addr)
 #endif
 	    rin6.ro_rt->rt_ifp->if_type == IFT_LOOP) {
 		int free_ia6 = 0;
 		struct in6_ifaddr *ia6;
 
 		/*
 		 * found the loopback route to the interface address
 		 */
 		if (rin6.ro_rt->rt_gateway->sa_family == AF_LINK) {
 			struct sockaddr_in6 dest6;
 
 			bzero(&dest6, sizeof(dest6));
 			dest6.sin6_family = AF_INET6;
 			dest6.sin6_len = sizeof(dest6);
 			dest6.sin6_addr = ip6->ip6_dst;
 			ia6 = (struct in6_ifaddr *)
 			    ifa_ifwithaddr((struct sockaddr *)&dest6);
 			if (ia6 == NULL)
 				goto bad;
 			free_ia6 = 1;
 		}
 		else
 			ia6 = (struct in6_ifaddr *)rin6.ro_rt->rt_ifa;
 
 		/*
 		 * record address information into m_tag.
 		 */
 		(void)ip6_setdstifaddr(m, ia6);
 
 		/*
 		 * packets to a tentative, duplicated, or somehow invalid
 		 * address must not be accepted.
 		 */
 		if (!(ia6->ia6_flags & IN6_IFF_NOTREADY)) {
 			/* this address is ready */
 			ours = 1;
 			deliverifp = ia6->ia_ifp;	/* correct? */
 			/* Count the packet in the ip address stats */
 			counter_u64_add(ia6->ia_ifa.ifa_ipackets, 1);
 			counter_u64_add(ia6->ia_ifa.ifa_ibytes,
 			    m->m_pkthdr.len);
 			if (free_ia6)
 				ifa_free(&ia6->ia_ifa);
 			goto hbhcheck;
 		} else {
 			char ip6bufs[INET6_ADDRSTRLEN];
 			char ip6bufd[INET6_ADDRSTRLEN];
 			/* address is not ready, so discard the packet. */
 			nd6log((LOG_INFO,
 			    "ip6_input: packet to an unready address %s->%s\n",
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst)));
 
 			if (free_ia6)
 				ifa_free(&ia6->ia_ifa);
 			goto bad;
 		}
 	}
 
 	/*
 	 * FAITH (Firewall Aided Internet Translator)
 	 */
 	if (V_ip6_keepfaith) {
 		if (rin6.ro_rt && rin6.ro_rt->rt_ifp &&
 		    rin6.ro_rt->rt_ifp->if_type == IFT_FAITH) {
 			/* XXX do we need more sanity checks? */
 			ours = 1;
 			deliverifp = rin6.ro_rt->rt_ifp; /* faith */
 			goto hbhcheck;
 		}
 	}
 
 	/*
 	 * Now there is no reason to process the packet if it's not our own
 	 * and we're not a router.
 	 */
 	if (!V_ip6_forwarding) {
 		IP6STAT_INC(ip6s_cantforward);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 		goto bad;
 	}
 
   hbhcheck:
 	/*
 	 * record address information into m_tag, if we don't have one yet.
 	 * note that we are unable to record it, if the address is not listed
 	 * as our interface address (e.g. multicast addresses, addresses
 	 * within FAITH prefixes and such).
 	 */
 	if (deliverifp) {
 		struct in6_ifaddr *ia6;
 
  		if ((ia6 = ip6_getdstifaddr(m)) != NULL) {
 			ifa_free(&ia6->ia_ifa);
 		} else {
 			ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst);
 			if (ia6) {
 				if (!ip6_setdstifaddr(m, ia6)) {
 					/*
 					 * XXX maybe we should drop the packet here,
 					 * as we could not provide enough information
 					 * to the upper layers.
 					 */
 				}
 				ifa_free(&ia6->ia_ifa);
 			}
 		}
 	}
 
 	/*
 	 * Process Hop-by-Hop options header if it's contained.
 	 * m may be modified in ip6_hopopts_input().
 	 * If a JumboPayload option is included, plen will also be modified.
 	 */
 	plen = (u_int32_t)ntohs(ip6->ip6_plen);
 	if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
 		int error;
 
 		error = ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours);
 		if (error != 0)
 			goto out;
 	} else
 		nxt = ip6->ip6_nxt;
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IPv6 header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) {
 		IP6STAT_INC(ip6s_tooshort);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
 		goto bad;
 	}
 	if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) {
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = sizeof(struct ip6_hdr) + plen;
 			m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
 		} else
 			m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len);
 	}
 
 	/*
 	 * Forward if desirable.
 	 */
 	if (V_ip6_mrouter &&
 	    IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		/*
 		 * If we are acting as a multicast router, all
 		 * incoming multicast packets are passed to the
 		 * kernel-level multicast forwarding function.
 		 * The packet is returned (relatively) intact; if
 		 * ip6_mforward() returns a non-zero value, the packet
 		 * must be discarded, else it may be accepted below.
 		 *
 		 * XXX TODO: Check hlim and multicast scope here to avoid
 		 * unnecessarily calling into ip6_mforward().
 		 */
 		if (ip6_mforward &&
 		    ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) {
 			IP6STAT_INC(ip6s_cantforward);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 			goto bad;
 		}
 	} else if (!ours) {
 		ip6_forward(m, srcrt);
 		goto out;
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * Malicious party may be able to use IPv4 mapped addr to confuse
 	 * tcp/udp stack and bypass security checks (act as if it was from
 	 * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1).  Be cautious.
 	 *
 	 * For SIIT end node behavior, you may want to disable the check.
 	 * However, you will  become vulnerable to attacks using IPv4 mapped
 	 * source.
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 
 	/*
 	 * Tell launch routine the next header
 	 */
 	IP6STAT_INC(ip6s_delivered);
 	in6_ifstat_inc(deliverifp, ifs6_in_deliver);
 	nest = 0;
 
 	while (nxt != IPPROTO_DONE) {
 		if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
 			IP6STAT_INC(ip6s_toomanyhdr);
 			goto bad;
 		}
 
 		/*
 		 * protection against faulty packet - there should be
 		 * more sanity checks in header chain processing.
 		 */
 		if (m->m_pkthdr.len < off) {
 			IP6STAT_INC(ip6s_tooshort);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
 			goto bad;
 		}
 
 #ifdef IPSEC
 		/*
 		 * enforce IPsec policy checking if we are seeing last header.
 		 * note that we do not visit this with protocols with pcb layer
 		 * code - like udp/tcp/raw ip.
 		 */
 		if (ip6_ipsec_input(m, nxt))
 			goto bad;
 #endif /* IPSEC */
 
 		/*
 		 * Use mbuf flags to propagate Router Alert option to
 		 * ICMPv6 layer, as hop-by-hop options have been stripped.
 		 */
 		if (nxt == IPPROTO_ICMPV6 && rtalert != ~0)
 			m->m_flags |= M_RTALERT_MLD;
 
 		nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
 	}
 	goto out;
 bad:
 	m_freem(m);
 out:
 	if (rin6.ro_rt)
 		RTFREE(rin6.ro_rt);
 }
 
 /*
  * set/grab in6_ifaddr correspond to IPv6 destination address.
  * XXX backward compatibility wrapper
  *
  * XXXRW: We should bump the refcount on ia6 before sticking it in the m_tag,
  * and then bump it when the tag is copied, and release it when the tag is
  * freed.  Unfortunately, m_tags don't support deep copies (yet), so instead
  * we just bump the ia refcount when we receive it.  This should be fixed.
  */
 static struct ip6aux *
 ip6_setdstifaddr(struct mbuf *m, struct in6_ifaddr *ia6)
 {
 	struct ip6aux *ip6a;
 
 	ip6a = ip6_addaux(m);
 	if (ip6a)
 		ip6a->ip6a_dstia6 = ia6;
 	return ip6a;	/* NULL if failed to set */
 }
 
 /*
  * Return the destination in6_ifaddr recorded in the IPv6 input aux tag of
  * the mbuf, after taking a reference on it with ifa_ref().  The caller is
  * expected to drop that reference with ifa_free() when done.
  * Returns NULL when the mbuf has no aux tag or the tag holds no address.
  */
 struct in6_ifaddr *
 ip6_getdstifaddr(struct mbuf *m)
 {
 	struct ip6aux *ip6a;
 	struct in6_ifaddr *ia;
 
 	ip6a = ip6_findaux(m);
 	if (ip6a == NULL)
 		return (NULL);
 
 	/*
 	 * A freshly created aux tag is zero-filled, so the recorded address
 	 * may be NULL; never take a reference through a NULL pointer.
 	 */
 	ia = ip6a->ip6a_dstia6;
 	if (ia == NULL)
 		return (NULL);
 
 	ifa_ref(&ia->ia_ifa);
 	return (ia);
 }
 
 /*
  * Hop-by-Hop options header processing. If a valid jumbo payload option is
  * included, the real payload length will be stored in plenp.
  *
  * rtalertp - XXX: should be stored more smart way
  */
 static int
 ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp,
     struct mbuf **mp, int *offp)
 {
 	struct mbuf *m = *mp;
 	int off = *offp, hbhlen;
 	struct ip6_hbh *hbh;
 	u_int8_t *opt;
 
 	/*
 	 * Validation of the length of the header, in two steps: first make
 	 * the fixed part reachable so that ip6h_len can be read, then the
 	 * whole header of (ip6h_len + 1) * 8 bytes.
 	 */
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1);
 	hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off);
 	hbhlen = (hbh->ip6h_len + 1) << 3;
 
 	IP6_EXTHDR_CHECK(m, off, hbhlen, -1);
 	hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off);
 #else
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m,
 		sizeof(struct ip6_hdr), sizeof(struct ip6_hbh));
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		return -1;
 	}
 	hbhlen = (hbh->ip6h_len + 1) << 3;
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
 		hbhlen);
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		return -1;
 	}
 #endif
 	/* The next header starts right after the complete HBH header. */
 	off += hbhlen;
 
 	/* Only the option area (header minus its fixed part) is parsed. */
 	hbhlen -= sizeof(struct ip6_hbh);
 	opt = (u_int8_t *)hbh + sizeof(struct ip6_hbh);
 
 	if (ip6_process_hopopts(m, opt, hbhlen, rtalertp, plenp) < 0)
 		return (-1);
 
 	/* Publish the results only on success. */
 	*offp = off;
 	*mp = m;
 	return (0);
 }
 
 /*
  * Search header for all Hop-by-hop options and process each option.
  * This function is separate from ip6_hopopts_input() in order to
  * handle a case where the sending node itself process its hop-by-hop
  * options header. In such a case, the function is called from ip6_output().
  *
  * The function assumes that hbh header is located right after the IPv6 header
  * (RFC2460 p7), opthead is pointer into data content in m, and opthead to
  * opthead + hbhlen is located in contiguous memory region.
  */
 int
 ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen,
     u_int32_t *rtalertp, u_int32_t *plenp)
 {
 	struct ip6_hdr *ip6;
 	int optlen = 0;
 	u_int8_t *opt = opthead;
 	u_int16_t rtalert_val;
 	u_int32_t jumboplen;
 	/* Offset of the option area from the start of the IPv6 header. */
 	const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh);
 
 	/*
 	 * Returns 0 when every option was accepted.  Returns -1 otherwise;
 	 * on every -1 path visible here the mbuf has been disposed of, either
 	 * by m_freem() or by handing it to icmp6_error()/ip6_unknown_opt().
 	 */
 	for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) {
 		switch (*opt) {
 		case IP6OPT_PAD1:
 			/* Pad1 is a single byte with no length field. */
 			optlen = 1;
 			break;
 		case IP6OPT_PADN:
 			if (hbhlen < IP6OPT_MINLEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			/* Length byte plus the two-byte type/length prefix. */
 			optlen = *(opt + 1) + 2;
 			break;
 		case IP6OPT_ROUTER_ALERT:
 			/* XXX may need check for alignment */
 			if (hbhlen < IP6OPT_RTALERT_LEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) {
 				/* XXX stat */
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 1 - opthead);
 				return (-1);
 			}
 			optlen = IP6OPT_RTALERT_LEN;
 			/* The value may be unaligned, so copy it out first. */
 			bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2);
 			*rtalertp = ntohs(rtalert_val);
 			break;
 		case IP6OPT_JUMBO:
 			/* XXX may need check for alignment */
 			if (hbhlen < IP6OPT_JUMBO_LEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) {
 				/* XXX stat */
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 1 - opthead);
 				return (-1);
 			}
 			optlen = IP6OPT_JUMBO_LEN;
 
 			/*
 			 * IPv6 packets that have non 0 payload length
 			 * must not contain a jumbo payload option.
 			 */
 			ip6 = mtod(m, struct ip6_hdr *);
 			if (ip6->ip6_plen) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt - opthead);
 				return (-1);
 			}
 
 			/*
 			 * We may see jumbolen in unaligned location, so
 			 * we'd need to perform bcopy().  The value comes
 			 * from the wire, hence network-to-host conversion.
 			 */
 			bcopy(opt + 2, &jumboplen, sizeof(jumboplen));
 			jumboplen = (u_int32_t)ntohl(jumboplen);
 
 #if 1
 			/*
 			 * if there are multiple jumbo payload options,
 			 * *plenp will be non-zero and the packet will be
 			 * rejected.
 			 * the behavior may need some debate in ipngwg -
 			 * multiple options does not make sense, however,
 			 * there's no explicit mention in specification.
 			 */
 			if (*plenp != 0) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 2 - opthead);
 				return (-1);
 			}
 #endif
 
 			/*
 			 * jumbo payload length must be larger than 65535.
 			 */
 			if (jumboplen <= IPV6_MAXPACKET) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 2 - opthead);
 				return (-1);
 			}
 			*plenp = jumboplen;
 
 			break;
 		default:		/* unknown option */
 			if (hbhlen < IP6OPT_MINLEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			/*
 			 * ip6_unknown_opt() returns the option's data length,
 			 * or -1 after it has disposed of the mbuf itself.
 			 */
 			optlen = ip6_unknown_opt(opt, m,
 			    erroff + opt - opthead);
 			if (optlen == -1)
 				return (-1);
 			optlen += 2;
 			break;
 		}
 	}
 
 	return (0);
 
   bad:
 	m_freem(m);
 	return (-1);
 }
 
 /*
  * Unknown option processing.
  * The third argument `off' is the offset from the IPv6 header to the option,
  * which is necessary in order to return an ICMPv6 error if the IPv6 header
  * and the option header are not contiguous.
  */
 int
 ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off)
 {
 	struct ip6_hdr *ip6;
 	int action = IP6OPT_TYPE(*optp);
 
 	/* Skip: tell the caller how many option data bytes to step over. */
 	if (action == IP6OPT_TYPE_SKIP)
 		return ((int)*(optp + 1));
 
 	/* Always answer with an ICMPv6 error, even for multicast. */
 	if (action == IP6OPT_TYPE_FORCEICMP) {
 		IP6STAT_INC(ip6s_badoptions);
 		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off);
 		return (-1);
 	}
 
 	/* Answer with an ICMPv6 error only for non-multicast packets. */
 	if (action == IP6OPT_TYPE_ICMP) {
 		IP6STAT_INC(ip6s_badoptions);
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 		    (m->m_flags & (M_BCAST|M_MCAST)) == 0)
 			icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_OPTION, off);
 		else
 			m_freem(m);
 		return (-1);
 	}
 
 	/*
 	 * IP6OPT_TYPE_DISCARD (silently discard), plus any value that none
 	 * of the tests above matched: drop the packet.
 	 */
 	m_freem(m);
 	return (-1);
 }
 
 /*
  * Create the "control" list for this pcb.
  * These functions will not modify mbuf chain at all.
  *
  * With KAME mbuf chain restriction:
  * The routine will be called from upper layer handlers like tcp6_input().
  * Thus the routine assumes that the caller (tcp6_input) have already
  * called IP6_EXTHDR_CHECK() and all the extension headers are located in the
  * very first mbuf on the mbuf chain.
  *
  * ip6_savecontrol_v4 will handle those options that are possible to be
  * set on a v4-mapped socket.
  * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those
  * options and handle the v6-only ones itself.
  */
 struct mbuf **
 ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp,
     int *v4only)
 {
 	/*
 	 * Appends one control mbuf per enabled option to the chain whose
 	 * tail pointer is mp, and returns the new tail pointer.  When v4only
 	 * is non-NULL, *v4only is set to 1 if the packet header is not IPv6
 	 * and to 0 otherwise.
 	 */
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 
 #ifdef SO_TIMESTAMP
 	/* Receive timestamp, taken now with microtime(). */
 	if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) {
 		struct timeval tv;
 
 		microtime(&tv);
 		*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
 		    SCM_TIMESTAMP, SOL_SOCKET);
 		/* Advance the tail only if the control mbuf was created. */
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #endif
 
 /* Pick x for sockets flagged IN6P_RFC2292, y otherwise. */
 #define IS2292(inp, x, y)	(((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y))
 	/* RFC 2292 sec. 5 */
 	if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
 		struct in6_pktinfo pi6;
 
 		/* The header version is not IPv6: read it as an IPv4 header. */
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			/*
 			 * Presumably builds the IPv4-mapped form of the IPv4
 			 * destination -- confirm IPV6_ADDR_INT32_SMP.
 			 */
 			pi6.ipi6_addr.s6_addr32[0] = 0;
 			pi6.ipi6_addr.s6_addr32[1] = 0;
 			pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP;
 			pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr;
 #else
 			/* We won't hit this code */
 			bzero(&pi6.ipi6_addr, sizeof(struct in6_addr));
 #endif
 		} else {	
 			bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr));
 			in6_clearscope(&pi6.ipi6_addr);	/* XXX */
 		}
 		/* Index of the receiving interface, or 0 if none is recorded. */
 		pi6.ipi6_ifindex =
 		    (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0;
 
 		*mp = sbcreatecontrol((caddr_t) &pi6,
 		    sizeof(struct in6_pktinfo),
 		    IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	/* Hop limit (taken from the TTL when the header is IPv4). */
 	if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) {
 		int hlim;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			hlim = ip->ip_ttl;
 #else
 			/* We won't hit this code */
 			hlim = 0;
 #endif
 		} else {
 			hlim = ip6->ip6_hlim & 0xff;
 		}
 		*mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int),
 		    IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT),
 		    IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	/* Traffic class (taken from the TOS byte when the header is IPv4). */
 	if ((inp->inp_flags & IN6P_TCLASS) != 0) {
 		int tclass;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			tclass = ip->ip_tos;
 #else
 			/* We won't hit this code */
 			tclass = 0;
 #endif
 		} else {
 			u_int32_t flowinfo;
 
 			/*
 			 * Convert the masked flow word to host order, then
 			 * keep the 8 bits that sit above the low 20 bits.
 			 */
 			flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK);
 			flowinfo >>= 20;
 			tclass = flowinfo & 0xff;
 		}
 		*mp = sbcreatecontrol((caddr_t) &tclass, sizeof(int),
 		    IPV6_TCLASS, IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	/* Tell the caller whether the header was something other than IPv6. */
 	if (v4only != NULL) {
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 			*v4only = 1;
 		} else {
 			*v4only = 0;
 		}
 	}
 
 	return (mp);
 }
 
 /*
  * Build the ancillary-data ("control") chain for a received packet: the
  * options handled by ip6_savecontrol_v4() first, then, for IPv6 packets
  * only, the hop-by-hop, destination options and routing headers requested
  * through the inpcb flags.  Control mbufs are appended through mp.
  */
 void
 ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	int v4only = 0;
 
 	/* Nothing below applies when the packet is not IPv6. */
 	mp = ip6_savecontrol_v4(in6p, m, mp, &v4only);
 	if (v4only)
 		return;
 
 	/*
 	 * IPV6_HOPOPTS socket option.  Recall that we required super-user
 	 * privilege for the option (see ip6_ctloutput), but it might be too
 	 * strict, since there might be some hop-by-hop options which can be
 	 * returned to normal user.
 	 * See also RFC 2292 section 6 (or RFC 3542 section 8).
 	 */
 	if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) {
 		/*
 		 * Check if a hop-by-hop options header is contained in the
 		 * received packet, and if so, store the options as ancillary
 		 * data. Note that a hop-by-hop options header must be
 		 * just after the IPv6 header, which is assured through the
 		 * IPv6 input processing.
 		 */
 		if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
 			struct ip6_hbh *hbh;
 			int hbhlen = 0;
 #ifdef PULLDOWN_TEST
 			struct mbuf *ext;
 #endif
 
 #ifndef PULLDOWN_TEST
 			/* The header is read in place, right after ip6. */
 			hbh = (struct ip6_hbh *)(ip6 + 1);
 			hbhlen = (hbh->ip6h_len + 1) << 3;
 #else
 			/* Work on a private copy of the header. */
 			ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr),
 			    ip6->ip6_nxt);
 			if (ext == NULL) {
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 			hbh = mtod(ext, struct ip6_hbh *);
 			hbhlen = (hbh->ip6h_len + 1) << 3;
 			/* The copy must be exactly as long as the header says. */
 			if (hbhlen != ext->m_len) {
 				m_freem(ext);
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 #endif
 
 			/*
 			 * XXX: We copy the whole header even if a
 			 * jumbo payload option is included, the option which
 			 * is to be removed before returning according to
 			 * RFC2292.
 			 * Note: this constraint is removed in RFC3542
 			 */
 			*mp = sbcreatecontrol((caddr_t)hbh, hbhlen,
 			    IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS),
 			    IPPROTO_IPV6);
 			if (*mp)
 				mp = &(*mp)->m_next;
 #ifdef PULLDOWN_TEST
 			/* The private copy is no longer needed. */
 			m_freem(ext);
 #endif
 		}
 	}
 
 	if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) {
 		int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr);
 
 		/*
 		 * Search for destination options headers or routing
 		 * header(s) through the header chain, and stores each
 		 * header as ancillary data.
 		 * Note that the order of the headers remains in
 		 * the chain of ancillary data.
 		 */
 		while (1) {	/* is explicit loop prevention necessary? */
 			struct ip6_ext *ip6e = NULL;
 			int elen;
 #ifdef PULLDOWN_TEST
 			struct mbuf *ext = NULL;
 #endif
 
 			/*
 			 * if it is not an extension header, don't try to
 			 * pull it from the chain.
 			 */
 			switch (nxt) {
 			case IPPROTO_DSTOPTS:
 			case IPPROTO_ROUTING:
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_AH: /* is it possible? */
 				break;
 			default:
 				goto loopend;
 			}
 
 #ifndef PULLDOWN_TEST
 			/*
 			 * Stop as soon as the header would extend past the
 			 * first mbuf; AH expresses its length in 4-byte
 			 * units, the other headers in 8-byte units.
 			 */
 			if (off + sizeof(*ip6e) > m->m_len)
 				goto loopend;
 			ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off);
 			if (nxt == IPPROTO_AH)
 				elen = (ip6e->ip6e_len + 2) << 2;
 			else
 				elen = (ip6e->ip6e_len + 1) << 3;
 			if (off + elen > m->m_len)
 				goto loopend;
 #else
 			/* Work on a private copy of this extension header. */
 			ext = ip6_pullexthdr(m, off, nxt);
 			if (ext == NULL) {
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 			ip6e = mtod(ext, struct ip6_ext *);
 			if (nxt == IPPROTO_AH)
 				elen = (ip6e->ip6e_len + 2) << 2;
 			else
 				elen = (ip6e->ip6e_len + 1) << 3;
 			if (elen != ext->m_len) {
 				m_freem(ext);
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 #endif
 
 			/* Export the header only if the socket asked for it. */
 			switch (nxt) {
 			case IPPROTO_DSTOPTS:
 				if (!(in6p->inp_flags & IN6P_DSTOPTS))
 					break;
 
 				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
 				    IS2292(in6p,
 					IPV6_2292DSTOPTS, IPV6_DSTOPTS),
 				    IPPROTO_IPV6);
 				if (*mp)
 					mp = &(*mp)->m_next;
 				break;
 			case IPPROTO_ROUTING:
 				if (!(in6p->inp_flags & IN6P_RTHDR))
 					break;
 
 				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
 				    IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR),
 				    IPPROTO_IPV6);
 				if (*mp)
 					mp = &(*mp)->m_next;
 				break;
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_AH: /* is it possible? */
 				/* Walked over, but not exported here. */
 				break;
 
 			default:
 				/*
 				 * other cases have been filtered in the above.
 				 * none will visit this case.  here we supply
 				 * the code just in case (nxt overwritten or
 				 * other cases).
 				 */
 #ifdef PULLDOWN_TEST
 				m_freem(ext);
 #endif
 				goto loopend;
 
 			}
 
 			/* proceed with the next header. */
 			off += elen;
 			nxt = ip6e->ip6e_nxt;
 			ip6e = NULL;
 #ifdef PULLDOWN_TEST
 			m_freem(ext);
 			ext = NULL;
 #endif
 		}
 	  loopend:
 		;
 	}
 }
 #undef IS2292
 
 /*
  * Queue an IPV6_PATHMTU control message carrying *mtu and the destination
  * dst on the receive buffer of the socket attached to in6p, and wake up
  * its readers.  Does nothing when mtu is NULL, when the scope of the
  * address cannot be recovered, or when no control mbuf can be created.
  */
 void
 ip6_notify_pmtu(struct inpcb *in6p, struct sockaddr_in6 *dst, u_int32_t *mtu)
 {
 	struct socket *sock;
 	struct mbuf *ctl;
 	struct ip6_mtuinfo info;
 
 	sock = in6p->inp_socket;
 
 	if (mtu == NULL)
 		return;
 
 #ifdef DIAGNOSTIC
 	if (sock == NULL)		/* I believe this is impossible */
 		panic("ip6_notify_pmtu: socket is NULL");
 #endif
 
 	/* Start from an all-zero structure, then fill in MTU and address. */
 	bzero(&info, sizeof(info));
 	info.ip6m_mtu = *mtu;
 	info.ip6m_addr = *dst;
 	if (sa6_recoverscope(&info.ip6m_addr) != 0)
 		return;
 
 	ctl = sbcreatecontrol((caddr_t)&info, sizeof(info),
 	    IPV6_PATHMTU, IPPROTO_IPV6);
 	if (ctl == NULL)
 		return;
 
 	if (sbappendaddr(&sock->so_rcv, (struct sockaddr *)dst, NULL, ctl)
 	    != 0) {
 		/* Queued: let the readers know. */
 		sorwakeup(sock);
 		return;
 	}
 
 	/* Could not be queued; the control mbuf is still ours to free. */
 	m_freem(ctl);
 	/* XXX: should count statistics */
 }
 
 #ifdef PULLDOWN_TEST
 /*
  * pull single extension header from mbuf chain.  returns single mbuf that
  * contains the result, or NULL on error.
  */
 static struct mbuf *
 ip6_pullexthdr(struct mbuf *m, size_t off, int nxt)
 {
 	struct ip6_ext exthdr;
 	struct mbuf *copy;
 	size_t extlen;
 
 #ifdef DIAGNOSTIC
 	/* Only these header types are expected here. */
 	if (nxt != IPPROTO_DSTOPTS && nxt != IPPROTO_ROUTING &&
 	    nxt != IPPROTO_HOPOPTS && nxt != IPPROTO_AH) /* is it possible? */
 		printf("ip6_pullexthdr: invalid nxt=%d\n", nxt);
 #endif
 
 	/* Peek at the generic extension header to learn its length. */
 	m_copydata(m, off, sizeof(exthdr), (caddr_t)&exthdr);
 	if (nxt != IPPROTO_AH)
 		extlen = (exthdr.ip6e_len + 1) << 3;
 	else
 		extlen = (exthdr.ip6e_len + 2) << 2;
 
 	/* A plain mbuf is used when the header fits in MLEN bytes. */
 	if (extlen <= MLEN)
 		copy = m_get(M_NOWAIT, MT_DATA);
 	else
 		copy = m_getcl(M_NOWAIT, MT_DATA, 0);
 	if (copy == NULL)
 		return (NULL);
 
 	/* Copy the complete header into the new mbuf. */
 	m_copydata(m, off, extlen, mtod(copy, caddr_t));
 	copy->m_len = extlen;
 	return (copy);
 }
 #endif
 
 /*
  * Get pointer to the previous header followed by the header
  * currently processed.
  * XXX: This function supposes that
  *	M includes all headers,
  *	the next header field and the header length field of each header
  *	are valid, and
  *	the sum of each header length equals to OFF.
  * Because of these assumptions, this function must be called very
  * carefully. Moreover, it will not be used in the near future when
  * we develop `neater' mechanism to process extension headers.
  */
 char *
 ip6_get_prevhdr(struct mbuf *m, int off)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct ip6_ext *ext = NULL;
 	int pos, proto;
 
 	/* The current header directly follows the IPv6 header. */
 	if (off == sizeof(struct ip6_hdr))
 		return (&ip6->ip6_nxt);
 
 	/*
 	 * Walk the chain until pos reaches off; ext then points at the
 	 * last header that was stepped over.
 	 */
 	proto = ip6->ip6_nxt;
 	pos = sizeof(struct ip6_hdr);
 	for (; pos < off; proto = ext->ip6e_nxt) {
 		ext = (struct ip6_ext *)(mtod(m, caddr_t) + pos);
 
 		if (proto == IPPROTO_FRAGMENT)
 			pos += sizeof(struct ip6_frag);
 		else if (proto == IPPROTO_AH)
 			pos += (ext->ip6e_len + 2) << 2;
 		else
 			pos += (ext->ip6e_len + 1) << 3;
 	}
 
 	/* No header was stepped over (off below the IPv6 header size). */
 	if (ext == NULL)
 		return NULL;
 	return (&ext->ip6e_nxt);
 }
 
 /*
  * get next header offset.  m will be retained.
  */
 int
 ip6_nexthdr(struct mbuf *m, int off, int proto, int *nxtp)
 {
 	struct ip6_hdr ip6;
 	struct ip6_ext ip6e;
 	struct ip6_frag fh;
 
 	/*
 	 * Returns the offset just past the header of type proto found at
 	 * off and, when nxtp is non-NULL, stores that header's next-header
 	 * value in *nxtp.  Returns -1 when parsing cannot continue.
 	 */
 
 	/* just in case */
 	if (m == NULL)
 		panic("ip6_nexthdr: m == NULL");
 	if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off)
 		return (-1);
 
 	switch (proto) {
 	case IPPROTO_IPV6:
 		if (m->m_pkthdr.len < off + sizeof(ip6))
 			return (-1);
 		m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6);
 		if (nxtp != NULL)
 			*nxtp = ip6.ip6_nxt;
 		return (off + sizeof(ip6));
 
 	case IPPROTO_FRAGMENT:
 		/*
 		 * terminate parsing if it is not the first fragment,
 		 * it does not make sense to parse through it.
 		 */
 		if (m->m_pkthdr.len < off + sizeof(fh))
 			return (-1);
 		m_copydata(m, off, sizeof(fh), (caddr_t)&fh);
 		/* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */
 		if (fh.ip6f_offlg & IP6F_OFF_MASK)
 			return (-1);
 		if (nxtp != NULL)
 			*nxtp = fh.ip6f_nxt;
 		return (off + sizeof(struct ip6_frag));
 
 	case IPPROTO_AH:
 	case IPPROTO_HOPOPTS:
 	case IPPROTO_ROUTING:
 	case IPPROTO_DSTOPTS:
 		/* All four start with a generic extension header. */
 		if (m->m_pkthdr.len < off + sizeof(ip6e))
 			return (-1);
 		m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
 		if (nxtp != NULL)
 			*nxtp = ip6e.ip6e_nxt;
 		/* AH counts its length in 4-byte units, the rest in 8. */
 		if (proto == IPPROTO_AH)
 			off += (ip6e.ip6e_len + 2) << 2;
 		else
 			off += (ip6e.ip6e_len + 1) << 3;
 		return (off);
 
 	case IPPROTO_NONE:
 	case IPPROTO_ESP:
 	case IPPROTO_IPCOMP:
 		/* give up */
 		break;
 
 	default:
 		break;
 	}
 
 	return (-1);
 }
 
 /*
  * get offset for the last header in the chain.  m will be kept untainted.
  */
 int
 ip6_lasthdr(struct mbuf *m, int off, int proto, int *nxtp)
 {
 	int newoff;
 	int nxt;
 
 	if (!nxtp) {
 		nxt = -1;
 		nxtp = &nxt;
 	}
 	while (1) {
 		newoff = ip6_nexthdr(m, off, proto, nxtp);
 		if (newoff < 0)
 			return off;
 		else if (newoff < off)
 			return -1;	/* invalid */
 		else if (newoff == off)
 			return newoff;
 
 		off = newoff;
 		proto = *nxtp;
 	}
 }
 
 /*
  * Return the ip6aux area stored behind the PACKET_TAG_IPV6_INPUT tag of m,
  * attaching a new zero-filled tag first if the mbuf has none.
  * Returns NULL if a new tag cannot be obtained.
  */
 static struct ip6aux *
 ip6_addaux(struct mbuf *m)
 {
 	struct m_tag *tag;
 
 	/* Reuse an existing tag as it is. */
 	tag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL);
 	if (tag != NULL)
 		return ((struct ip6aux *)(tag + 1));
 
 	tag = m_tag_get(PACKET_TAG_IPV6_INPUT, sizeof(struct ip6aux),
 	    M_NOWAIT);
 	if (tag == NULL)
 		return (NULL);
 
 	m_tag_prepend(m, tag);
 	bzero(tag + 1, sizeof(struct ip6aux));
 	return ((struct ip6aux *)(tag + 1));
 }
 
 /*
  * Return the ip6aux area stored behind the PACKET_TAG_IPV6_INPUT tag of m,
  * or NULL if the mbuf carries no such tag.
  */
 static struct ip6aux *
 ip6_findaux(struct mbuf *m)
 {
 	struct m_tag *tag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL);
 
 	if (tag == NULL)
 		return (NULL);
 	/* The aux data starts right after the tag header. */
 	return ((struct ip6aux *)(tag + 1));
 }
 
 /*
  * Delete the PACKET_TAG_IPV6_INPUT tag of m, if the mbuf carries one.
  */
 static void
 ip6_delaux(struct mbuf *m)
 {
 	struct m_tag *tag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL);
 
 	if (tag == NULL)
 		return;
 	m_tag_delete(m, tag);
 }
 
 /*
  * System control for IP6
  */
 
 /*
  * Table of errno values with PRC_NCMDS entries.
  * NOTE(review): presumably indexed by PRC_* protocol control command --
  * confirm against the users of this table.  Entries left out of the
  * initializer are zero, as are the explicit 0 slots.
  */
 u_char	inet6ctlerrmap[PRC_NCMDS] = {
 	0,		0,		0,		0,
 	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
 	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
 	EMSGSIZE,	EHOSTUNREACH,	0,		0,
 	0,		0,		0,		0,
 	ENOPROTOOPT
 };
Index: head/sys/netinet6/ip6_mroute.c
===================================================================
--- head/sys/netinet6/ip6_mroute.c	(revision 258540)
+++ head/sys/netinet6/ip6_mroute.c	(revision 258541)
@@ -1,2072 +1,2071 @@
 /*-
  * Copyright (C) 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_mroute.c,v 1.58 2001/12/18 02:36:31 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1989 Stephen Deering
  * Copyright (c) 1992, 1993
  *      The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_mroute.c	8.2 (Berkeley) 11/15/93
  *	BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp
  */
 
 /*
  * IP multicast forwarding procedures
  *
  * Written by David Waitzman, BBN Labs, August 1988.
  * Modified by Steve Deering, Stanford, February 1989.
  * Modified by Mark J. Steiglitz, Stanford, May, 1991
  * Modified by Van Jacobson, LBL, January 1993
  * Modified by Ajit Thyagarajan, PARC, August 1993
  * Modified by Bill Fenner, PARC, April 1994
  *
  * MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/callout.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/raw_cb.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/icmp6.h>
 #include <netinet/ip_encap.h>
 
 #include <netinet/ip6.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/ip6_mroute.h>
 #include <netinet6/ip6protosw.h>
 #include <netinet6/pim6.h>
 #include <netinet6/pim6_var.h>
 
 static MALLOC_DEFINE(M_MRTABLE6, "mf6c", "multicast forwarding cache entry");
 
 /* XXX: this is a very common idiom; move to <sys/mbuf.h> ? */
 #define M_HASCL(m) ((m)->m_flags & M_EXT)
 
 static int	ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *);
 static void	phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *);
 static int	register_send(struct ip6_hdr *, struct mif6 *, struct mbuf *);
 static int	set_pim6(int *);
 static int	socket_send(struct socket *, struct mbuf *,
 		    struct sockaddr_in6 *);
 
 extern int in6_mcast_loop;
 extern struct domain inet6domain;
 
 static const struct encaptab *pim6_encap_cookie;
 static const struct ip6protosw in6_pim_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_PIM,
 	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_LASTHDR,
 	.pr_input =		pim6_input,
 	.pr_output =		rip6_output,
 	.pr_ctloutput =		rip6_ctloutput,
 	.pr_usrreqs =		&rip6_usrreqs
 };
 static int pim6_encapcheck(const struct mbuf *, int, int, void *);
 
 static VNET_DEFINE(int, ip6_mrouter_ver) = 0;
 #define	V_ip6_mrouter_ver	VNET(ip6_mrouter_ver)
 
 SYSCTL_DECL(_net_inet6);
 SYSCTL_DECL(_net_inet6_ip6);
 static SYSCTL_NODE(_net_inet6, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
 
 static struct mrt6stat mrt6stat;
 SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RW,
     &mrt6stat, mrt6stat,
     "Multicast Routing Statistics (struct mrt6stat, netinet6/ip6_mroute.h)");
 
 #define	MRT6STAT_INC(name)	mrt6stat.name += 1
 #define NO_RTE_FOUND	0x1
 #define RTE_FOUND	0x2
 
 static struct mtx mrouter6_mtx;
 #define	MROUTER6_LOCK()		mtx_lock(&mrouter6_mtx)
 #define	MROUTER6_UNLOCK()	mtx_unlock(&mrouter6_mtx)
 #define	MROUTER6_LOCK_ASSERT()	do {					\
 	mtx_assert(&mrouter6_mtx, MA_OWNED);				\
 	NET_ASSERT_GIANT();						\
 } while (0)
 #define	MROUTER6_LOCK_INIT()	\
 	mtx_init(&mrouter6_mtx, "IPv6 multicast forwarding", NULL, MTX_DEF)
 #define	MROUTER6_LOCK_DESTROY()	mtx_destroy(&mrouter6_mtx)
 
 static struct mf6c *mf6ctable[MF6CTBLSIZ];
 SYSCTL_OPAQUE(_net_inet6_ip6, OID_AUTO, mf6ctable, CTLFLAG_RD,
     &mf6ctable, sizeof(mf6ctable), "S,*mf6ctable[MF6CTBLSIZ]",
     "IPv6 Multicast Forwarding Table (struct *mf6ctable[MF6CTBLSIZ], "
     "netinet6/ip6_mroute.h)");
 
 static struct mtx mfc6_mtx;
 #define	MFC6_LOCK()		mtx_lock(&mfc6_mtx)
 #define	MFC6_UNLOCK()		mtx_unlock(&mfc6_mtx)
 #define	MFC6_LOCK_ASSERT()	do {					\
 	mtx_assert(&mfc6_mtx, MA_OWNED);				\
 	NET_ASSERT_GIANT();						\
 } while (0)
 #define	MFC6_LOCK_INIT()		\
 	mtx_init(&mfc6_mtx, "IPv6 multicast forwarding cache", NULL, MTX_DEF)
 #define	MFC6_LOCK_DESTROY()	mtx_destroy(&mfc6_mtx)
 
 static u_char n6expire[MF6CTBLSIZ];
 
 static struct mif6 mif6table[MAXMIFS];
 SYSCTL_OPAQUE(_net_inet6_ip6, OID_AUTO, mif6table, CTLFLAG_RD,
     &mif6table, sizeof(mif6table), "S,mif6[MAXMIFS]",
     "IPv6 Multicast Interfaces (struct mif6[MAXMIFS], netinet6/ip6_mroute.h)");
 
 static struct mtx mif6_mtx;
 #define	MIF6_LOCK()		mtx_lock(&mif6_mtx)
 #define	MIF6_UNLOCK()		mtx_unlock(&mif6_mtx)
 #define	MIF6_LOCK_ASSERT()	mtx_assert(&mif6_mtx, MA_OWNED)
 #define	MIF6_LOCK_INIT()	\
 	mtx_init(&mif6_mtx, "IPv6 multicast interfaces", NULL, MTX_DEF)
 #define	MIF6_LOCK_DESTROY()	mtx_destroy(&mif6_mtx)
 
 #ifdef MRT6DEBUG
 static VNET_DEFINE(u_int, mrt6debug) = 0;	/* debug level */
 #define	V_mrt6debug		VNET(mrt6debug)
 #define DEBUG_MFC	0x02
 #define DEBUG_FORWARD	0x04
 #define DEBUG_EXPIRE	0x08
 #define DEBUG_XMIT	0x10
 #define DEBUG_REG	0x20
 #define DEBUG_PIM	0x40
 #endif
 
 static void	expire_upcalls(void *);
 #define	EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second */
 #define	UPCALL_EXPIRE	6		/* number of timeouts */
 
 /*
  * XXX TODO: maintain a count of if_allmulti() calls in struct ifnet.
  */
 
 /*
  * 'Interfaces' associated with decapsulator (so we can tell
  * packets that went through it from ones that get reflected
  * by a broken gateway).  Different from IPv4 register_if,
  * these interfaces are linked into the system ifnet list,
  * because per-interface IPv6 statistics are maintained in
  * ifp->if_afdata.  But it does not have any routes point
  * to them.  I.e., packets can't be sent this way.  They
  * only exist as a placeholder for multicast source
  * verification.
  */
 static struct ifnet *multicast_register_if6;
 
 #define ENCAP_HOPS 64
 
 /*
  * Private variables.
  */
 static mifi_t nummifs = 0;
 static mifi_t reg_mif_num = (mifi_t)-1;
 
 static struct pim6stat pim6stat;
 SYSCTL_STRUCT(_net_inet6_pim, PIM6CTL_STATS, stats, CTLFLAG_RW,
     &pim6stat, pim6stat,
     "PIM Statistics (struct pim6stat, netinet6/pim6_var.h)");
 
 #define	PIM6STAT_INC(name)	pim6stat.name += 1
 static VNET_DEFINE(int, pim6);
 #define	V_pim6		VNET(pim6)
 
 /*
  * Hash function for a source, group entry
  *
  * XORs the four 32-bit words of the source address with the four 32-bit
  * words of the group address and reduces the result with MF6CHASHMOD().
  * Both arguments are struct in6_addr lvalues (not pointers) and are
  * evaluated several times.
  */
 #define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \
 				   (a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \
 				   (g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \
 				   (g).s6_addr32[2] ^ (g).s6_addr32[3])
 
 /*
  * Find a route for a given origin IPv6 address and Multicast group address.
  *
  * Walks the mf6ctable hash chain selected by MF6CHASH(o, g) and stores in
  * "rt" the first entry whose origin and group match and whose mf6c_stall
  * queue is empty, i.e. entries still waiting for an upcall reply are
  * skipped.  "rt" is set to NULL when nothing matches.
  *
  * Side effects: mrt6s_mfc_lookups is incremented on every use and
  * mrt6s_mfc_misses on every miss.  "o" and "g" are evaluated more than
  * once.  The users visible in this file invoke it with MFC6_LOCK held.
  */
 #define MF6CFIND(o, g, rt) do { \
 	struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \
 	rt = NULL; \
 	MRT6STAT_INC(mrt6s_mfc_lookups); \
 	while (_rt) { \
 		if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \
 		    IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \
 		    (_rt->mf6c_stall == NULL)) { \
 			rt = _rt; \
 			break; \
 		} \
 		_rt = _rt->mf6c_next; \
 	} \
 	if (rt == NULL) { \
 		MRT6STAT_INC(mrt6s_mfc_misses); \
 	} \
 } while (/*CONSTCOND*/ 0)
 
 /*
  * Macros to compute elapsed time efficiently
  * Borrowed from Van Jacobson's scheduling code
  * XXX: replace with timersub() ?
  *
  * TV_DELTA(a, b, delta) assigns to "delta" the difference a - b in
  * microseconds.  The microsecond fields are subtracted first; a seconds
  * difference of 1 or 2 is then added without a multiplication, any other
  * non-zero difference is multiplied out.  "delta" must be an assignable
  * lvalue, and the macro declares a local named "xxs".
  */
 #define TV_DELTA(a, b, delta) do { \
 	    int xxs; \
 		\
 	    delta = (a).tv_usec - (b).tv_usec; \
 	    if ((xxs = (a).tv_sec - (b).tv_sec)) { \
 	       switch (xxs) { \
 		      case 2: \
 			  delta += 1000000; \
 			      /* FALLTHROUGH */ \
 		      case 1: \
 			  delta += 1000000; \
 			  break; \
 		      default: \
 			  delta += (1000000 * xxs); \
 	       } \
 	    } \
 } while (/*CONSTCOND*/ 0)
 
 /*
  * True when timeval "a" is strictly earlier than timeval "b".
  */
 /* XXX: replace with timercmp(a, b, <) ? */
 #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
 	      (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
 
 #ifdef UPCALL_TIMING
 /* Upcall delay histogram; the last slot collects everything >= UPCALL_MAX. */
 #define UPCALL_MAX	50
 static u_long upcall_data[UPCALL_MAX + 1];
 /* Full prototype so that calls are checked against the definition. */
 static void collate(struct timeval *);
 #endif /* UPCALL_TIMING */
 
 static int ip6_mrouter_init(struct socket *, int, int);
 static int add_m6fc(struct mf6cctl *);
 static int add_m6if(struct mif6ctl *);
 static int del_m6fc(struct mf6cctl *);
 static int del_m6if(mifi_t *);
 static int del_m6if_locked(mifi_t *);
 static int get_mif6_cnt(struct sioc_mif_req6 *);
 static int get_sg_cnt(struct sioc_sg_req6 *);
 
 static struct callout expire_upcalls_ch;
 
 int X_ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *);
 int X_ip6_mrouter_done(void);
 int X_ip6_mrouter_set(struct socket *, struct sockopt *);
 int X_ip6_mrouter_get(struct socket *, struct sockopt *);
 int X_mrt6_ioctl(u_long, caddr_t);
 
 /*
  * Handle MRT setsockopt commands to modify the multicast routing tables.
  *
  * Every command except MRT6_INIT is refused with EACCES unless "so" is the
  * registered multicast routing socket.  Each command copies its argument
  * in from the sockopt and hands it to the matching helper; unknown
  * commands yield EOPNOTSUPP.
  */
 int
 X_ip6_mrouter_set(struct socket *so, struct sockopt *sopt)
 {
 	struct mf6cctl mfcc;
 	struct mif6ctl mifc;
 	mifi_t mifi;
 	int optval;
 	int error = 0;
 
 	if (sopt->sopt_name != MRT6_INIT && so != V_ip6_mrouter)
 		return (EACCES);
 
 	switch (sopt->sopt_name) {
 	case MRT6_INIT:
 #ifdef MRT6_OINIT
 	case MRT6_OINIT:
 #endif
 		error = sooptcopyin(sopt, &optval, sizeof(optval),
 		    sizeof(optval));
 		if (error == 0)
 			error = ip6_mrouter_init(so, optval, sopt->sopt_name);
 		break;
 	case MRT6_DONE:
 		error = X_ip6_mrouter_done();
 		break;
 	case MRT6_ADD_MIF:
 		error = sooptcopyin(sopt, &mifc, sizeof(mifc), sizeof(mifc));
 		if (error == 0)
 			error = add_m6if(&mifc);
 		break;
 	case MRT6_ADD_MFC:
 		error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc));
 		if (error == 0)
 			error = add_m6fc(&mfcc);
 		break;
 	case MRT6_DEL_MFC:
 		error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc));
 		if (error == 0)
 			error = del_m6fc(&mfcc);
 		break;
 	case MRT6_DEL_MIF:
 		error = sooptcopyin(sopt, &mifi, sizeof(mifi), sizeof(mifi));
 		if (error == 0)
 			error = del_m6if(&mifi);
 		break;
 	case MRT6_PIM:
 		error = sooptcopyin(sopt, &optval, sizeof(optval),
 		    sizeof(optval));
 		if (error == 0)
 			error = set_pim6(&optval);
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Handle MRT getsockopt commands
  *
  * Only the registered multicast routing socket may query (EACCES
  * otherwise).  MRT6_PIM copies out the current pim6 setting; every other
  * option name is accepted and returns 0 without copying anything out.
  */
 int
 X_ip6_mrouter_get(struct socket *so, struct sockopt *sopt)
 {
 
 	if (so != V_ip6_mrouter)
 		return (EACCES);
 
 	if (sopt->sopt_name == MRT6_PIM)
 		return (sooptcopyout(sopt, &V_pim6, sizeof(V_pim6)));
 
 	return (0);
 }
 
 /*
  * Handle ioctl commands to obtain information from the cache
  *
  * SIOCGETSGCNT_IN6 and SIOCGETMIFCNT_IN6 are dispatched to their counter
  * helpers with "data" cast to the matching request structure; any other
  * command returns EINVAL.
  */
 int
 X_mrt6_ioctl(u_long cmd, caddr_t data)
 {
 
 	if (cmd == SIOCGETSGCNT_IN6)
 		return (get_sg_cnt((struct sioc_sg_req6 *)data));
 	if (cmd == SIOCGETMIFCNT_IN6)
 		return (get_mif6_cnt((struct sioc_mif_req6 *)data));
 
 	return (EINVAL);
 }
 
 /*
  * returns the packet, byte, rpf-failure count for the source group provided
  */
 static int
 get_sg_cnt(struct sioc_sg_req6 *req)
 {
 	struct mf6c *rt;
 	int ret;
 
 	ret = 0;
 
 	MFC6_LOCK();
 
 	MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt);
 	if (rt == NULL) {
 		ret = ESRCH;
 	} else {
 		req->pktcnt = rt->mf6c_pkt_cnt;
 		req->bytecnt = rt->mf6c_byte_cnt;
 		req->wrong_if = rt->mf6c_wrong_if;
 	}
 
 	MFC6_UNLOCK();
 
 	return (ret);
 }
 
 /*
  * returns the input and output packet and byte counts on the mif provided
  */
 static int
 get_mif6_cnt(struct sioc_mif_req6 *req)
 {
 	mifi_t mifi;
 	int ret;
 
 	ret = 0;
 	mifi = req->mifi;
 
 	MIF6_LOCK();
 
 	if (mifi >= nummifs) {
 		ret = EINVAL;
 	} else {
 		req->icount = mif6table[mifi].m6_pkt_in;
 		req->ocount = mif6table[mifi].m6_pkt_out;
 		req->ibytes = mif6table[mifi].m6_bytes_in;
 		req->obytes = mif6table[mifi].m6_bytes_out;
 	}
 
 	MIF6_UNLOCK();
 
 	return (ret);
 }
 
 /*
  * Store the PIM on/off switch.  Only the values 0 and 1 are accepted;
  * anything else is rejected with EINVAL and leaves the setting unchanged.
  */
 static int
 set_pim6(int *i)
 {
 	const int val = *i;
 
 	if (val != 0 && val != 1)
 		return (EINVAL);
 
 	V_pim6 = val;
 
 	return (0);
 }
 
 /*
  * Enable multicast routing
  *
  * Registers "so" as the multicast routing socket and starts the upcall
  * expiry timer.
  *
  *   so   socket issuing the request; must be a raw ICMPv6 socket,
  *        otherwise EOPNOTSUPP is returned
  *   v    option value supplied by the caller; must be 1, otherwise
  *        ENOPROTOOPT is returned
  *   cmd  the sockopt name used, remembered in V_ip6_mrouter_ver
  *
  * Returns EADDRINUSE if a routing socket is already registered, else 0.
  */
 static int
 ip6_mrouter_init(struct socket *so, int v, int cmd)
 {
 
 #ifdef MRT6DEBUG
 	if (V_mrt6debug)
 		log(LOG_DEBUG,
 		    "ip6_mrouter_init: so_type = %d, pr_protocol = %d\n",
 		    so->so_type, so->so_proto->pr_protocol);
 #endif
 
 	if (so->so_type != SOCK_RAW ||
 	    so->so_proto->pr_protocol != IPPROTO_ICMPV6)
 		return (EOPNOTSUPP);
 
 	if (v != 1)
 		return (ENOPROTOOPT);
 
 	MROUTER6_LOCK();
 
 	/* Only one routing socket may be registered at a time. */
 	if (V_ip6_mrouter != NULL) {
 		MROUTER6_UNLOCK();
 		return (EADDRINUSE);
 	}
 
 	V_ip6_mrouter = so;
 	V_ip6_mrouter_ver = cmd;
 
 	/* Start from an empty forwarding cache and expiry bookkeeping. */
 	bzero((caddr_t)mf6ctable, sizeof(mf6ctable));
 	bzero((caddr_t)n6expire, sizeof(n6expire));
 
 	V_pim6 = 0;/* used for stubbing out/in pim stuff */
 
 	/* Schedule expire_upcalls() to run after EXPIRE_TIMEOUT ticks. */
 	callout_init(&expire_upcalls_ch, 0);
 	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
 	    expire_upcalls, NULL);
 
 	MROUTER6_UNLOCK();
 
 #ifdef MRT6DEBUG
 	if (V_mrt6debug)
 		log(LOG_DEBUG, "ip6_mrouter_init\n");
 #endif
 
 	return (0);
 }
 
 /*
  * Disable IPv6 multicast forwarding.
  *
  * Undoes ip6_mrouter_init() and everything configured since: all-multicast
  * reception is dropped on every physical mif, the mif table and the
  * forwarding cache (including packets queued on stalled entries) are
  * emptied, the register interface is destroyed and the routing socket is
  * unregistered.  Returns EINVAL if no routing socket is registered, else 0.
  */
 int
 X_ip6_mrouter_done(void)
 {
 	mifi_t mifi;
 	u_long i;
 	struct mf6c *rt;
 	struct rtdetq *rte;
 
 	MROUTER6_LOCK();
 
 	if (V_ip6_mrouter == NULL) {
 		MROUTER6_UNLOCK();
 		return (EINVAL);
 	}
 
 	/*
 	 * For each phyint in use, disable promiscuous reception of all IPv6
 	 * multicasts.
 	 */
 	for (mifi = 0; mifi < nummifs; mifi++) {
 		if (mif6table[mifi].m6_ifp &&
 		    !(mif6table[mifi].m6_flags & MIFF_REGISTER)) {
 			if_allmulti(mif6table[mifi].m6_ifp, 0);
 		}
 	}
 	bzero((caddr_t)mif6table, sizeof(mif6table));
 	nummifs = 0;
 
 	V_pim6 = 0; /* used to stub out/in pim specific code */
 
 	callout_stop(&expire_upcalls_ch);
 
 	/*
 	 * Free all multicast forwarding cache entries.
 	 */
 	MFC6_LOCK();
 	for (i = 0; i < MF6CTBLSIZ; i++) {
 		rt = mf6ctable[i];
 		while (rt) {
 			struct mf6c *frt;
 
 			/* Drop the packets still waiting on this entry. */
 			for (rte = rt->mf6c_stall; rte != NULL; ) {
 				struct rtdetq *n = rte->next;
 
 				m_freem(rte->m);
 				free(rte, M_MRTABLE6);
 				rte = n;
 			}
 			/* Advance before freeing the entry we stand on. */
 			frt = rt;
 			rt = rt->mf6c_next;
 			free(frt, M_MRTABLE6);
 		}
 	}
 	bzero((caddr_t)mf6ctable, sizeof(mf6ctable));
 	MFC6_UNLOCK();
 
 	/*
 	 * Reset register interface
 	 */
 	if (reg_mif_num != (mifi_t)-1 && multicast_register_if6 != NULL) {
 		if_detach(multicast_register_if6);
 		if_free(multicast_register_if6);
 		reg_mif_num = (mifi_t)-1;
 		multicast_register_if6 = NULL;
 	}
 
 	V_ip6_mrouter = NULL;
 	V_ip6_mrouter_ver = 0;
 
 	MROUTER6_UNLOCK();
 
 #ifdef MRT6DEBUG
 	if (V_mrt6debug)
 		log(LOG_DEBUG, "ip6_mrouter_done\n");
 #endif
 
 	return (0);
 }
 
 static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 };
 
 /*
  * Add a mif to the mif table
  */
 static int
 add_m6if(struct mif6ctl *mifcp)
 {
 	struct mif6 *mifp;
 	struct ifnet *ifp;
 	int error;
 
 	MIF6_LOCK();
 
 	if (mifcp->mif6c_mifi >= MAXMIFS) {
 		MIF6_UNLOCK();
 		return (EINVAL);
 	}
 	mifp = mif6table + mifcp->mif6c_mifi;
 	if (mifp->m6_ifp != NULL) {
 		MIF6_UNLOCK();
 		return (EADDRINUSE); /* XXX: is it appropriate? */
 	}
 	if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > V_if_index) {
 		MIF6_UNLOCK();
 		return (ENXIO);
 	}
 
 	ifp = ifnet_byindex(mifcp->mif6c_pifi);
 
 	if (mifcp->mif6c_flags & MIFF_REGISTER) {
 		if (reg_mif_num == (mifi_t)-1) {
 			ifp = if_alloc(IFT_OTHER);
 
 			if_initname(ifp, "register_mif", 0);
 			ifp->if_flags |= IFF_LOOPBACK;
 			if_attach(ifp);
 			multicast_register_if6 = ifp;
 			reg_mif_num = mifcp->mif6c_mifi;
 			/*
 			 * it is impossible to guess the ifindex of the
 			 * register interface.  So mif6c_pifi is automatically
 			 * calculated.
 			 */
 			mifcp->mif6c_pifi = ifp->if_index;
 		} else {
 			ifp = multicast_register_if6;
 		}
 	} else {
 		/* Make sure the interface supports multicast */
 		if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 			MIF6_UNLOCK();
 			return (EOPNOTSUPP);
 		}
 
 		error = if_allmulti(ifp, 1);
 		if (error) {
 			MIF6_UNLOCK();
 			return (error);
 		}
 	}
 
 	mifp->m6_flags     = mifcp->mif6c_flags;
 	mifp->m6_ifp       = ifp;
 
 	/* initialize per mif pkt counters */
 	mifp->m6_pkt_in    = 0;
 	mifp->m6_pkt_out   = 0;
 	mifp->m6_bytes_in  = 0;
 	mifp->m6_bytes_out = 0;
 
 	/* Adjust nummifs up if the mifi is higher than nummifs */
 	if (nummifs <= mifcp->mif6c_mifi)
 		nummifs = mifcp->mif6c_mifi + 1;
 
 	MIF6_UNLOCK();
 
 #ifdef MRT6DEBUG
 	if (V_mrt6debug)
 		log(LOG_DEBUG,
 		    "add_mif #%d, phyint %s\n",
 		    mifcp->mif6c_mifi,
 		    ifp->if_xname);
 #endif
 
 	return (0);
 }
 
 /*
  * Delete a mif from the mif table
  *
  * Must be called with MIF6_LOCK held.  *mifip is the slot to release.
  * A physical mif has all-multicast reception switched off again; a
  * register mif tears down the register pseudo interface.  The slot is
  * then zeroed and nummifs is lowered to one past the highest slot still
  * in use.
  *
  * Returns EINVAL if the index is not below nummifs or the slot is empty,
  * otherwise 0.
  */
 static int
 del_m6if_locked(mifi_t *mifip)
 {
 	struct mif6 *mifp;
 	mifi_t mifi;
 	struct ifnet *ifp;
 
 	MIF6_LOCK_ASSERT();
 
 	if (*mifip >= nummifs)
 		return (EINVAL);
 	/* Form the slot pointer only once the index is known to be valid. */
 	mifp = mif6table + *mifip;
 	if (mifp->m6_ifp == NULL)
 		return (EINVAL);
 
 	if (!(mifp->m6_flags & MIFF_REGISTER)) {
 		/* XXX: TODO: Maintain an ALLMULTI refcount in struct ifnet. */
 		ifp = mifp->m6_ifp;
 		if_allmulti(ifp, 0);
 	} else {
 		if (reg_mif_num != (mifi_t)-1 &&
 		    multicast_register_if6 != NULL) {
 			if_detach(multicast_register_if6);
 			if_free(multicast_register_if6);
 			reg_mif_num = (mifi_t)-1;
 			multicast_register_if6 = NULL;
 		}
 	}
 
 	bzero((caddr_t)mifp, sizeof(*mifp));
 
 	/* Adjust nummifs down */
 	for (mifi = nummifs; mifi > 0; mifi--)
 		if (mif6table[mifi - 1].m6_ifp)
 			break;
 	nummifs = mifi;
 
 #ifdef MRT6DEBUG
 	if (V_mrt6debug)
 		log(LOG_DEBUG, "del_m6if %d, nummifs %d\n", *mifip, nummifs);
 #endif
 
 	return (0);
 }
 
 /*
  * Locking wrapper around del_m6if_locked(): takes MIF6_LOCK for the
  * duration of the removal and passes the result through.
  */
 static int
 del_m6if(mifi_t *mifip)
 {
 	int error;
 
 	MIF6_LOCK();
 	error = del_m6if_locked(mifip);
 	MIF6_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Add an mfc entry
  *
  * Installs or refreshes the forwarding cache entry for the (origin,
  * group) pair in "mfccp", all under MFC6_LOCK:
  *
  *  1. A resolved entry (empty stall queue) only has its parent mif and
  *     outgoing interface set replaced.
  *  2. Otherwise every stalled entry for the pair is filled in, and the
  *     packets queued on it are passed to ip6_mdq() and then freed.
  *  3. If no stalled entry was found either, a new entry is allocated and
  *     linked at the head of its hash chain.
  *
  * Returns 0 on success, or ENOBUFS if a new entry cannot be allocated.
  */
 static int
 add_m6fc(struct mf6cctl *mfccp)
 {
 	struct mf6c *rt;
 	u_long hash;
 	struct rtdetq *rte;
 	u_short nstl;
 	char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN];
 
 	MFC6_LOCK();
 
 	MF6CFIND(mfccp->mf6cc_origin.sin6_addr,
 		 mfccp->mf6cc_mcastgrp.sin6_addr, rt);
 
 	/* If an entry already exists, just update the fields */
 	if (rt) {
 #ifdef MRT6DEBUG
 		if (V_mrt6debug & DEBUG_MFC) {
 		    /* The hash is not computed yet here, so it is not logged. */
 		    log(LOG_DEBUG,
 			"add_m6fc no upcall o %s g %s p %x\n",
 			ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr),
 			ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr),
 			mfccp->mf6cc_parent);
 		}
 #endif
 
 		rt->mf6c_parent = mfccp->mf6cc_parent;
 		rt->mf6c_ifset = mfccp->mf6cc_ifset;
 
 		MFC6_UNLOCK();
 		return (0);
 	}
 
 	/*
 	 * Find the entry for which the upcall was made and update
 	 */
 	hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr,
 			mfccp->mf6cc_mcastgrp.sin6_addr);
 	for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) {
 		if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
 				       &mfccp->mf6cc_origin.sin6_addr) &&
 		    IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
 				       &mfccp->mf6cc_mcastgrp.sin6_addr) &&
 		    (rt->mf6c_stall != NULL)) {
 
 			/* More than one stalled entry per pair is unexpected. */
 			if (nstl++)
 				log(LOG_ERR,
 				    "add_m6fc: %s o %s g %s p %x dbx %p\n",
 				    "multiple kernel entries",
 				    ip6_sprintf(ip6bufo,
 					    &mfccp->mf6cc_origin.sin6_addr),
 				    ip6_sprintf(ip6bufg,
 					    &mfccp->mf6cc_mcastgrp.sin6_addr),
 				    mfccp->mf6cc_parent, rt->mf6c_stall);
 
 #ifdef MRT6DEBUG
 			if (V_mrt6debug & DEBUG_MFC)
 				log(LOG_DEBUG,
 				    "add_m6fc o %s g %s p %x dbg %p\n",
 				    ip6_sprintf(ip6bufo,
 					    &mfccp->mf6cc_origin.sin6_addr),
 				    ip6_sprintf(ip6bufg,
 					    &mfccp->mf6cc_mcastgrp.sin6_addr),
 				    mfccp->mf6cc_parent, rt->mf6c_stall);
 #endif
 
 			rt->mf6c_origin     = mfccp->mf6cc_origin;
 			rt->mf6c_mcastgrp   = mfccp->mf6cc_mcastgrp;
 			rt->mf6c_parent     = mfccp->mf6cc_parent;
 			rt->mf6c_ifset	    = mfccp->mf6cc_ifset;
 			/* initialize pkt counters per src-grp */
 			rt->mf6c_pkt_cnt    = 0;
 			rt->mf6c_byte_cnt   = 0;
 			rt->mf6c_wrong_if   = 0;
 
 			rt->mf6c_expire = 0;	/* Don't clean this guy up */
 			n6expire[hash]--;
 
 			/* free packets Qed at the end of this entry */
 			for (rte = rt->mf6c_stall; rte != NULL; ) {
 				struct rtdetq *n = rte->next;
 				ip6_mdq(rte->m, rte->ifp, rt);
 				m_freem(rte->m);
 #ifdef UPCALL_TIMING
 				collate(&(rte->t));
 #endif /* UPCALL_TIMING */
 				free(rte, M_MRTABLE6);
 				rte = n;
 			}
 			rt->mf6c_stall = NULL;
 		}
 	}
 
 	/*
 	 * It is possible that an entry is being inserted without an upcall
 	 */
 	if (nstl == 0) {
 #ifdef MRT6DEBUG
 		if (V_mrt6debug & DEBUG_MFC)
 		    log(LOG_DEBUG,
 			"add_mfc no upcall h %lu o %s g %s p %x\n",
 			hash,
 			ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr),
 			ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr),
 			mfccp->mf6cc_parent);
 #endif
 
 		/*
 		 * This loop has no break, so rt is always NULL once it
 		 * finishes.
 		 */
 		for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
 
 			if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
 					       &mfccp->mf6cc_origin.sin6_addr)&&
 			    IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
 					       &mfccp->mf6cc_mcastgrp.sin6_addr)) {
 
 				rt->mf6c_origin     = mfccp->mf6cc_origin;
 				rt->mf6c_mcastgrp   = mfccp->mf6cc_mcastgrp;
 				rt->mf6c_parent     = mfccp->mf6cc_parent;
 				rt->mf6c_ifset	    = mfccp->mf6cc_ifset;
 				/* initialize pkt counters per src-grp */
 				rt->mf6c_pkt_cnt    = 0;
 				rt->mf6c_byte_cnt   = 0;
 				rt->mf6c_wrong_if   = 0;
 
 				if (rt->mf6c_expire)
 					n6expire[hash]--;
 				rt->mf6c_expire	   = 0;
 			}
 		}
 		if (rt == NULL) {
 			/* no upcall, so make a new entry */
 			rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6,
 						  M_NOWAIT);
 			if (rt == NULL) {
 				MFC6_UNLOCK();
 				return (ENOBUFS);
 			}
 
 			/* insert new entry at head of hash chain */
 			rt->mf6c_origin     = mfccp->mf6cc_origin;
 			rt->mf6c_mcastgrp   = mfccp->mf6cc_mcastgrp;
 			rt->mf6c_parent     = mfccp->mf6cc_parent;
 			rt->mf6c_ifset	    = mfccp->mf6cc_ifset;
 			/* initialize pkt counters per src-grp */
 			rt->mf6c_pkt_cnt    = 0;
 			rt->mf6c_byte_cnt   = 0;
 			rt->mf6c_wrong_if   = 0;
 			rt->mf6c_expire     = 0;
 			rt->mf6c_stall = NULL;
 
 			/* link into table */
 			rt->mf6c_next  = mf6ctable[hash];
 			mf6ctable[hash] = rt;
 		}
 	}
 
 	MFC6_UNLOCK();
 	return (0);
 }
 
 #ifdef UPCALL_TIMING
 /*
  * collect delay statistics on the upcalls
  *
  * "t" is the timestamp recorded for a queued packet.  If it lies before
  * the time read by GET_TIME(), the difference in microseconds is scaled
  * down by 2^10 and the matching upcall_data[] bucket is incremented;
  * anything beyond UPCALL_MAX lands in the last bucket.
  * NOTE(review): GET_TIME() is not defined in the visible part of the
  * file; presumably it stores the current time in its argument -- confirm.
  */
 static void
 collate(struct timeval *t)
 {
 	u_long d;
 	struct timeval tp;
 	u_long delta;
 
 	GET_TIME(tp);
 
 	/* Ignore timestamps that are not in the past. */
 	if (TV_LT(*t, tp))
 	{
 		TV_DELTA(tp, *t, delta);
 
 		d = delta >> 10;
 		if (d > UPCALL_MAX)
 			d = UPCALL_MAX;
 
 		++upcall_data[d];
 	}
 }
 #endif /* UPCALL_TIMING */
 
 /*
  * Delete an mfc entry
  */
 static int
 del_m6fc(struct mf6cctl *mfccp)
 {
 	struct sockaddr_in6	origin;
 	struct sockaddr_in6	mcastgrp;
 	struct mf6c		*rt;
 	struct mf6c		**nptr;
 	u_long		hash;
 
 	origin = mfccp->mf6cc_origin;
 	mcastgrp = mfccp->mf6cc_mcastgrp;
 	hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr);
 
 #ifdef MRT6DEBUG
 	if (V_mrt6debug & DEBUG_MFC) {
 		char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN];
 		log(LOG_DEBUG,"del_m6fc orig %s mcastgrp %s\n",
 		    ip6_sprintf(ip6bufo, &origin.sin6_addr),
 		    ip6_sprintf(ip6bufg, &mcastgrp.sin6_addr));
 	}
 #endif
 
 	MFC6_LOCK();
 
 	nptr = &mf6ctable[hash];
 	while ((rt = *nptr) != NULL) {
 		if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr,
 				       &rt->mf6c_origin.sin6_addr) &&
 		    IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr,
 				       &rt->mf6c_mcastgrp.sin6_addr) &&
 		    rt->mf6c_stall == NULL)
 			break;
 
 		nptr = &rt->mf6c_next;
 	}
 	if (rt == NULL) {
 		MFC6_UNLOCK();
 		return (EADDRNOTAVAIL);
 	}
 
 	*nptr = rt->mf6c_next;
 	free(rt, M_MRTABLE6);
 
 	MFC6_UNLOCK();
 
 	return (0);
 }
 
 static int
 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in6 *src)
 {
 
 	if (s) {
 		if (sbappendaddr(&s->so_rcv,
 				 (struct sockaddr *)src,
 				 mm, (struct mbuf *)0) != 0) {
 			sorwakeup(s);
 			return (0);
 		}
 	}
 	m_freem(mm);
 	return (-1);
 }
 
 /*
  * IPv6 multicast forwarding function. This function assumes that the packet
  * pointed to by "ip6" has arrived on (or is about to be sent to) the interface
  * pointed to by "ifp", and the packet is to be relayed to other networks
  * that have members of the packet's destination IPv6 multicast group.
  *
  * The packet is returned unscathed to the caller, unless it is
  * erroneous, in which case a non-zero return value tells the caller to
  * discard it.
  *
  * NOTE: this implementation assumes that m->m_pkthdr.rcvif is NULL iff
  * this function is called in the originating context (i.e., not when
  * forwarding a packet from another node).  ip6_output(), which is currently
  * the only function that calls this function in the originating context,
  * explicitly ensures this condition.  If this function is ever called from
  * somewhere else in the originating context, it is that caller's
  * responsibility to ensure the same condition.
  */
 int
 X_ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m)
 {
 	struct mf6c *rt;
 	struct mif6 *mifp;
 	struct mbuf *mm;
 	mifi_t mifi;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 
 #ifdef MRT6DEBUG
 	if (V_mrt6debug & DEBUG_FORWARD)
 		log(LOG_DEBUG, "ip6_mforward: src %s, dst %s, ifindex %d\n",
 		    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 		    ifp->if_index);
 #endif
 
 	/*
 	 * Don't forward a packet with Hop limit of zero or one,
 	 * or a packet destined to a local-only group.
 	 */
 	if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) ||
 	    IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
 		return (0);
 	ip6->ip6_hlim--;
 
 	/*
 	 * Source address check: do not forward packets with unspecified
 	 * source. It was discussed in July 2000, on ipngwg mailing list.
 	 * This is rather more serious than unicast cases, because some
 	 * MLD packets can be sent with the unspecified source address
 	 * (although such packets must normally set 1 to the hop limit field).
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
 		IP6STAT_INC(ip6s_cantforward);
 		if (V_ip6_log_time + V_ip6_log_interval < time_uptime) {
 			V_ip6_log_time = time_uptime;
 			log(LOG_DEBUG,
 			    "cannot forward "
 			    "from %s to %s nxt %d received on %s\n",
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 			    ip6->ip6_nxt,
 			    if_name(m->m_pkthdr.rcvif));
 		}
 		return (0);
 	}
 
 	MFC6_LOCK();
 
 	/*
 	 * Determine forwarding mifs from the forwarding cache table
 	 */
 	MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt);
 
 	/* Entry exists, so forward if necessary */
 	if (rt) {
 		MFC6_UNLOCK();
 		return (ip6_mdq(m, ifp, rt));
 	} else {
 		/*
 		 * If we don't have a route for packet's origin,
 		 * Make a copy of the packet &
 		 * send message to routing daemon
 		 */
 
 		struct mbuf *mb0;
 		struct rtdetq *rte;
 		u_long hash;
 /*		int i, npkts;*/
 #ifdef UPCALL_TIMING
 		struct timeval tp;
 
 		GET_TIME(tp);
 #endif /* UPCALL_TIMING */
 
 		MRT6STAT_INC(mrt6s_no_route);
 #ifdef MRT6DEBUG
 		if (V_mrt6debug & (DEBUG_FORWARD | DEBUG_MFC))
 			log(LOG_DEBUG, "ip6_mforward: no rte s %s g %s\n",
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst));
 #endif
 
 		/*
 		 * Allocate mbufs early so that we don't do extra work if we
 		 * are just going to fail anyway.
 		 */
 		rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE6,
 					      M_NOWAIT);
 		if (rte == NULL) {
 			MFC6_UNLOCK();
 			return (ENOBUFS);
 		}
 		mb0 = m_copy(m, 0, M_COPYALL);
 		/*
 		 * Pullup packet header if needed before storing it,
 		 * as other references may modify it in the meantime.
 		 */
 		if (mb0 &&
 		    (M_HASCL(mb0) || mb0->m_len < sizeof(struct ip6_hdr)))
 			mb0 = m_pullup(mb0, sizeof(struct ip6_hdr));
 		if (mb0 == NULL) {
 			free(rte, M_MRTABLE6);
 			MFC6_UNLOCK();
 			return (ENOBUFS);
 		}
 
 		/* is there an upcall waiting for this packet? */
 		hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst);
 		for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
 			if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
 					       &rt->mf6c_origin.sin6_addr) &&
 			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
 					       &rt->mf6c_mcastgrp.sin6_addr) &&
 			    (rt->mf6c_stall != NULL))
 				break;
 		}
 
 		if (rt == NULL) {
 			struct mrt6msg *im;
 #ifdef MRT6_OINIT
 			struct omrt6msg *oim;
 #endif
 
 			/* no upcall, so make a new entry */
 			rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6,
 						  M_NOWAIT);
 			if (rt == NULL) {
 				free(rte, M_MRTABLE6);
 				m_freem(mb0);
 				MFC6_UNLOCK();
 				return (ENOBUFS);
 			}
 			/*
 			 * Make a copy of the header to send to the user
 			 * level process
 			 */
 			mm = m_copy(mb0, 0, sizeof(struct ip6_hdr));
 
 			if (mm == NULL) {
 				free(rte, M_MRTABLE6);
 				m_freem(mb0);
 				free(rt, M_MRTABLE6);
 				MFC6_UNLOCK();
 				return (ENOBUFS);
 			}
 
 			/*
 			 * Send message to routing daemon
 			 */
 			sin6.sin6_addr = ip6->ip6_src;
 
 			im = NULL;
 #ifdef MRT6_OINIT
 			oim = NULL;
 #endif
 			switch (V_ip6_mrouter_ver) {
 #ifdef MRT6_OINIT
 			case MRT6_OINIT:
 				oim = mtod(mm, struct omrt6msg *);
 				oim->im6_msgtype = MRT6MSG_NOCACHE;
 				oim->im6_mbz = 0;
 				break;
 #endif
 			case MRT6_INIT:
 				im = mtod(mm, struct mrt6msg *);
 				im->im6_msgtype = MRT6MSG_NOCACHE;
 				im->im6_mbz = 0;
 				break;
 			default:
 				free(rte, M_MRTABLE6);
 				m_freem(mb0);
 				free(rt, M_MRTABLE6);
 				MFC6_UNLOCK();
 				return (EINVAL);
 			}
 
 #ifdef MRT6DEBUG
 			if (V_mrt6debug & DEBUG_FORWARD)
 				log(LOG_DEBUG,
 				    "getting the iif info in the kernel\n");
 #endif
 
 			for (mifp = mif6table, mifi = 0;
 			     mifi < nummifs && mifp->m6_ifp != ifp;
 			     mifp++, mifi++)
 				;
 
 			switch (V_ip6_mrouter_ver) {
 #ifdef MRT6_OINIT
 			case MRT6_OINIT:
 				oim->im6_mif = mifi;
 				break;
 #endif
 			case MRT6_INIT:
 				im->im6_mif = mifi;
 				break;
 			}
 
 			if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) {
 				log(LOG_WARNING, "ip6_mforward: ip6_mrouter "
 				    "socket queue full\n");
 				MRT6STAT_INC(mrt6s_upq_sockfull);
 				free(rte, M_MRTABLE6);
 				m_freem(mb0);
 				free(rt, M_MRTABLE6);
 				MFC6_UNLOCK();
 				return (ENOBUFS);
 			}
 
 			MRT6STAT_INC(mrt6s_upcalls);
 
 			/* insert new entry at head of hash chain */
 			bzero(rt, sizeof(*rt));
 			rt->mf6c_origin.sin6_family = AF_INET6;
 			rt->mf6c_origin.sin6_len = sizeof(struct sockaddr_in6);
 			rt->mf6c_origin.sin6_addr = ip6->ip6_src;
 			rt->mf6c_mcastgrp.sin6_family = AF_INET6;
 			rt->mf6c_mcastgrp.sin6_len = sizeof(struct sockaddr_in6);
 			rt->mf6c_mcastgrp.sin6_addr = ip6->ip6_dst;
 			rt->mf6c_expire = UPCALL_EXPIRE;
 			n6expire[hash]++;
 			rt->mf6c_parent = MF6C_INCOMPLETE_PARENT;
 
 			/* link into table */
 			rt->mf6c_next  = mf6ctable[hash];
 			mf6ctable[hash] = rt;
 			/* Add this entry to the end of the queue */
 			rt->mf6c_stall = rte;
 		} else {
 			/* determine if q has overflowed */
 			struct rtdetq **p;
 			int npkts = 0;
 
 			for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next)
 				if (++npkts > MAX_UPQ6) {
 					MRT6STAT_INC(mrt6s_upq_ovflw);
 					free(rte, M_MRTABLE6);
 					m_freem(mb0);
 					MFC6_UNLOCK();
 					return (0);
 				}
 
 			/* Add this entry to the end of the queue */
 			*p = rte;
 		}
 
 		rte->next = NULL;
 		rte->m = mb0;
 		rte->ifp = ifp;
 #ifdef UPCALL_TIMING
 		rte->t = tp;
 #endif /* UPCALL_TIMING */
 
 		MFC6_UNLOCK();
 
 		return (0);
 	}
 }
 
 /*
  * Reap cache entries whose upcall was never answered.
  * Runs as a callout handler and re-arms itself with EXPIRE_TIMEOUT at the
  * end of every pass.
  */
 static void
 expire_upcalls(void *unused)
 {
 	struct mf6c *entry, **linkp;
 	struct rtdetq *pending, *following;
 	u_long bucket;
 
 	MFC6_LOCK();
 	for (bucket = 0; bucket < MF6CTBLSIZ; bucket++) {
 		/* No entry in this hash bucket is awaiting an upcall. */
 		if (n6expire[bucket] == 0)
 			continue;
 
 		linkp = &mf6ctable[bucket];
 		while ((entry = *linkp) != NULL) {
 			pending = entry->mf6c_stall;
 
 			/*
 			 * Keep the entry when it is a real cache entry
 			 * (nothing stalled), when it is marked to not expire
 			 * (shouldn't happen), or when its countdown has not
 			 * reached zero yet.
 			 */
 			if (pending == NULL || entry->mf6c_expire == 0 ||
 			    --entry->mf6c_expire != 0) {
 				linkp = &entry->mf6c_next;
 				continue;
 			}
 
 #ifdef MRT6DEBUG
 			if (V_mrt6debug & DEBUG_EXPIRE) {
 				char ip6bufo[INET6_ADDRSTRLEN];
 				char ip6bufg[INET6_ADDRSTRLEN];
 				log(LOG_DEBUG, "expire_upcalls: expiring (%s %s)\n",
 				    ip6_sprintf(ip6bufo, &entry->mf6c_origin.sin6_addr),
 				    ip6_sprintf(ip6bufg, &entry->mf6c_mcastgrp.sin6_addr));
 			}
 #endif
 			/*
 			 * Drop every stalled packet together with its queue
 			 * record (pending is known to be non-NULL here).
 			 */
 			while (pending != NULL) {
 				following = pending->next;
 				m_freem(pending->m);
 				free(pending, M_MRTABLE6);
 				pending = following;
 			}
 			MRT6STAT_INC(mrt6s_cache_cleanups);
 			n6expire[bucket]--;
 
 			/* Unlink; linkp already addresses the successor. */
 			*linkp = entry->mf6c_next;
 			free(entry, M_MRTABLE6);
 		}
 	}
 	MFC6_UNLOCK();
 	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
 	    expire_upcalls, NULL);
 }
 
 /*
  * Packet forwarding routine once entry in the cache is made.
  *
  * m   - the multicast packet; copies are sent, m itself is not consumed here
  * ifp - interface the packet arrived on
  * rt  - forwarding cache entry matching the packet
  *
  * Returns 0 on success (including the "wrong interface, nothing to do"
  * case), ENOBUFS when a copy for the daemon could not be made or queued,
  * EINVAL for an unknown mrouter API version, or the in6_setscope() error.
  */
 static int
 ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	mifi_t mifi, iif;
 	struct mif6 *mifp;
 	int plen = m->m_pkthdr.len;
 	struct in6_addr src0, dst0; /* copies for local work */
 	u_int32_t iszone, idzone, oszone, odzone;
 	int error = 0;
 
 /*
  * Macro to send packet on mif.  Since RSVP packets don't get counted on
  * input, they shouldn't get counted on output, so statistics keeping is
  * separate.
  */
 
 #define MC6_SEND(ip6, mifp, m) do {				\
 	if ((mifp)->m6_flags & MIFF_REGISTER)			\
 		register_send((ip6), (mifp), (m));		\
 	else							\
 		phyint_send((ip6), (mifp), (m));		\
 } while (/*CONSTCOND*/ 0)
 
 	/*
 	 * Don't forward if it didn't arrive from the parent mif
 	 * for its origin.
 	 */
 	mifi = rt->mf6c_parent;
 	if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) {
 		/* came in the wrong interface */
 #ifdef MRT6DEBUG
 		/*
 		 * This branch is also taken when mifi is out of range, and
 		 * the slot's ifp may be NULL, so only look at the parent
 		 * mif's interface when both are known to be valid.
 		 */
 		if (V_mrt6debug & DEBUG_FORWARD)
 			log(LOG_DEBUG,
 			    "wrong if: ifid %d mifi %d mififid %x\n",
 			    ifp->if_index, mifi,
 			    (mifi < nummifs &&
 			     mif6table[mifi].m6_ifp != NULL) ?
 			    mif6table[mifi].m6_ifp->if_index : 0);
 #endif
 		MRT6STAT_INC(mrt6s_wrong_if);
 		rt->mf6c_wrong_if++;
 		/*
 		 * If we are doing PIM processing, and we are forwarding
 		 * packets on this interface, send a message to the
 		 * routing daemon.
 		 */
 		/* have to make sure this is a valid mif */
 		if (mifi < nummifs && mif6table[mifi].m6_ifp)
 			if (V_pim6 && (m->m_flags & M_LOOP) == 0) {
 				/*
 				 * Check the M_LOOP flag to avoid an
 				 * unnecessary PIM assert.
 				 * XXX: M_LOOP is an ad-hoc hack...
 				 */
 				static struct sockaddr_in6 sin6 =
 				{ sizeof(sin6), AF_INET6 };
 
 				struct mbuf *mm;
 				struct mrt6msg *im;
 #ifdef MRT6_OINIT
 				struct omrt6msg *oim;
 #endif
 
 				/*
 				 * Copy just the IPv6 header and make sure it
 				 * is writable and contiguous before it is
 				 * overlaid with the upcall message.
 				 */
 				mm = m_copy(m, 0, sizeof(struct ip6_hdr));
 				if (mm &&
 				    (M_HASCL(mm) ||
 				     mm->m_len < sizeof(struct ip6_hdr)))
 					mm = m_pullup(mm, sizeof(struct ip6_hdr));
 				if (mm == NULL)
 					return (ENOBUFS);
 
 #ifdef MRT6_OINIT
 				oim = NULL;
 #endif
 				im = NULL;
 				switch (V_ip6_mrouter_ver) {
 #ifdef MRT6_OINIT
 				case MRT6_OINIT:
 					oim = mtod(mm, struct omrt6msg *);
 					oim->im6_msgtype = MRT6MSG_WRONGMIF;
 					oim->im6_mbz = 0;
 					break;
 #endif
 				case MRT6_INIT:
 					im = mtod(mm, struct mrt6msg *);
 					im->im6_msgtype = MRT6MSG_WRONGMIF;
 					im->im6_mbz = 0;
 					break;
 				default:
 					m_freem(mm);
 					return (EINVAL);
 				}
 
 				/* Find the mif the packet actually came in on. */
 				for (mifp = mif6table, iif = 0;
 				     iif < nummifs && mifp &&
 					     mifp->m6_ifp != ifp;
 				     mifp++, iif++)
 					;
 
 				switch (V_ip6_mrouter_ver) {
 #ifdef MRT6_OINIT
 				case MRT6_OINIT:
 					oim->im6_mif = iif;
 					sin6.sin6_addr = oim->im6_src;
 					break;
 #endif
 				case MRT6_INIT:
 					im->im6_mif = iif;
 					sin6.sin6_addr = im->im6_src;
 					break;
 				}
 
 				MRT6STAT_INC(mrt6s_upcalls);
 
 				if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) {
 #ifdef MRT6DEBUG
 					if (V_mrt6debug)
 						log(LOG_WARNING, "mdq, ip6_mrouter socket queue full\n");
 #endif
 					MRT6STAT_INC(mrt6s_upq_sockfull);
 					return (ENOBUFS);
 				}	/* if socket Q full */
 			}		/* if PIM */
 		return (0);
 	}			/* if wrong iif */
 
 	/* If I sourced this packet, it counts as output, else it was input. */
 	if (m->m_pkthdr.rcvif == NULL) {
 		/* XXX: is rcvif really NULL when output?? */
 		mif6table[mifi].m6_pkt_out++;
 		mif6table[mifi].m6_bytes_out += plen;
 	} else {
 		mif6table[mifi].m6_pkt_in++;
 		mif6table[mifi].m6_bytes_in += plen;
 	}
 	rt->mf6c_pkt_cnt++;
 	rt->mf6c_byte_cnt += plen;
 
 	/*
 	 * For each mif, forward a copy of the packet if there are group
 	 * members downstream on the interface.
 	 */
 	src0 = ip6->ip6_src;
 	dst0 = ip6->ip6_dst;
 	if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 ||
 	    (error = in6_setscope(&dst0, ifp, &idzone)) != 0) {
 		IP6STAT_INC(ip6s_badscope);
 		return (error);
 	}
 	for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) {
 		if (IF_ISSET(mifi, &rt->mf6c_ifset)) {
 			/*
 			 * check if the outgoing packet is going to break
 			 * a scope boundary.
 			 * XXX For packets through PIM register tunnel
 			 * interface, we believe a routing daemon.
 			 */
 			if (!(mif6table[rt->mf6c_parent].m6_flags &
 			      MIFF_REGISTER) &&
 			    !(mif6table[mifi].m6_flags & MIFF_REGISTER)) {
 				if (in6_setscope(&src0, mif6table[mifi].m6_ifp,
 				    &oszone) ||
 				    in6_setscope(&dst0, mif6table[mifi].m6_ifp,
 				    &odzone) ||
 				    iszone != oszone ||
 				    idzone != odzone) {
 					IP6STAT_INC(ip6s_badscope);
 					continue;
 				}
 			}
 
 			mifp->m6_pkt_out++;
 			mifp->m6_bytes_out += plen;
 			MC6_SEND(ip6, mifp, m);
 		}
 	}
 	return (0);
 }
 
 /*
  * Send a copy of packet m out of the physical interface behind mifp.
  * The original m is left for the caller; only the private copy is handed
  * to ip6_output(), if_output(), icmp6_error() or freed.  Allocation
  * failures are silent: the function simply returns.
  */
 static void
 phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m)
 {
 	struct mbuf *mb_copy;
 	struct ifnet *ifp = mifp->m6_ifp;
 	int error = 0;
 	u_long linkmtu;
 
 	/*
 	 * Make a new reference to the packet; make sure that
 	 * the IPv6 header is actually copied, not just referenced,
 	 * so that ip6_output() only scribbles on the copy.
 	 */
 	mb_copy = m_copy(m, 0, M_COPYALL);
 	if (mb_copy &&
 	    (M_HASCL(mb_copy) || mb_copy->m_len < sizeof(struct ip6_hdr)))
 		mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr));
 	if (mb_copy == NULL) {
 		return;
 	}
 	/* set MCAST flag to the outgoing packet */
 	mb_copy->m_flags |= M_MCAST;
 
 	/*
 	 * If we sourced the packet, call ip6_output since we may divide
 	 * the packet into fragments when the packet is too big for the
 	 * outgoing interface.
 	 * Otherwise, we can simply send the packet to the interface
 	 * sending queue.
 	 */
 	if (m->m_pkthdr.rcvif == NULL) {
 		struct ip6_moptions im6o;
 
 		im6o.im6o_multicast_ifp = ifp;
 		/* XXX: ip6_output will override ip6->ip6_hlim */
 		im6o.im6o_multicast_hlim = ip6->ip6_hlim;
 		im6o.im6o_multicast_loop = 1;
 		error = ip6_output(mb_copy, NULL, NULL, IPV6_FORWARDING, &im6o,
 		    NULL, NULL);
 
 #ifdef MRT6DEBUG
 		if (V_mrt6debug & DEBUG_XMIT)
 			log(LOG_DEBUG, "phyint_send on mif %d err %d\n",
 			    mifp - mif6table, error);
 #endif
 		return;
 	}
 
 	/*
 	 * If configured to loop back multicasts by default,
 	 * loop back a copy now.
 	 */
 	if (in6_mcast_loop) {
 		struct sockaddr_in6 dst6;
 
 		bzero(&dst6, sizeof(dst6));
 		dst6.sin6_len = sizeof(struct sockaddr_in6);
 		dst6.sin6_family = AF_INET6;
 		dst6.sin6_addr = ip6->ip6_dst;
 		ip6_mloopback(ifp, m, &dst6);
 	}
 
 	/*
 	 * Put the packet into the sending queue of the outgoing interface
 	 * if it would fit in the MTU of the interface.
 	 * Note that a link MTU below IPV6_MMTU also takes this path,
 	 * whatever the packet size.
 	 */
 	linkmtu = IN6_LINKMTU(ifp);
 	if (mb_copy->m_pkthdr.len <= linkmtu || linkmtu < IPV6_MMTU) {
 		struct sockaddr_in6 dst6;
 
 		bzero(&dst6, sizeof(dst6));
 		dst6.sin6_len = sizeof(struct sockaddr_in6);
 		dst6.sin6_family = AF_INET6;
 		dst6.sin6_addr = ip6->ip6_dst;
 
 		IP_PROBE(send, NULL, NULL, ip6, ifp, NULL, ip6);
 		/*
 		 * We just call if_output instead of nd6_output here, since
 		 * we need no ND for a multicast forwarded packet...right?
 		 */
 		/*
 		 * NOTE(review): the flags are cleared on the original m,
 		 * while mb_copy is what is handed to if_output below --
 		 * confirm whether mb_copy was intended.
 		 */
 		m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 		error = (*ifp->if_output)(ifp, mb_copy,
 		    (struct sockaddr *)&dst6, NULL);
 #ifdef MRT6DEBUG
 		if (V_mrt6debug & DEBUG_XMIT)
 			log(LOG_DEBUG, "phyint_send on mif %d err %d\n",
 			    mifp - mif6table, error);
 #endif
 	} else {
 		/*
 		 * pMTU discovery is intentionally disabled by default, since
 		 * various routers may notify pMTU in multicast, which can be
 		 * a DDoS to a router
 		 */
 		if (V_ip6_mcast_pmtu)
 			icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, linkmtu);
 		else {
 #ifdef MRT6DEBUG
 			if (V_mrt6debug & DEBUG_XMIT) {
 				char ip6bufs[INET6_ADDRSTRLEN];
 				char ip6bufd[INET6_ADDRSTRLEN];
 				log(LOG_DEBUG,
 				    "phyint_send: packet too big on %s o %s "
 				    "g %s size %d(discarded)\n",
 				    if_name(ifp),
 				    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 				    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 				    mb_copy->m_pkthdr.len);
 			}
 #endif /* MRT6DEBUG */
 			m_freem(mb_copy); /* simply discard the packet */
 		}
 	}
 }
 
 /*
  * Pass a whole copy of packet m up to the routing daemon as an
  * MRT6MSG_WHOLEPKT message on behalf of register mif `mif'.
  * Returns 0 on success, or ENOBUFS when an mbuf could not be obtained or
  * socket_send() failed.
  */
 static int
 register_send(struct ip6_hdr *ip6, struct mif6 *mif, struct mbuf *m)
 {
 	static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 };
 	struct mrt6msg *upcall;
 	struct mbuf *msg;
 	int pktlen, pullcnt;
 
 	pktlen = m->m_pkthdr.len;
 
 #ifdef MRT6DEBUG
 	if (V_mrt6debug) {
 		char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 		log(LOG_DEBUG, "** IPv6 register_send **\n src %s dst %s\n",
 		    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst));
 	}
 #endif
 	PIM6STAT_INC(pim6s_snd_registers);
 
 	/*
 	 * Build the message: a fresh header mbuf with room reserved for an
 	 * ip6_hdr-sized prefix, followed by a copy of the whole packet.
 	 */
 	msg = m_gethdr(M_NOWAIT, MT_DATA);
 	if (msg == NULL)
 		return (ENOBUFS);
 	msg->m_data += max_linkhdr;
 	msg->m_len = sizeof(struct ip6_hdr);
 
 	msg->m_next = m_copy(m, 0, M_COPYALL);
 	if (msg->m_next == NULL) {
 		m_freem(msg);
 		return (ENOBUFS);
 	}
 
 	/* Pull up as much as the header mbuf holds, capped at the packet. */
 	pullcnt = MHLEN - M_LEADINGSPACE(msg);
 	if (pullcnt > pktlen)
 		pullcnt = pktlen;
 	msg = m_pullup(msg, pullcnt);
 	if (msg == NULL)
 		return (ENOBUFS);
 /* TODO: check it! */
 	msg->m_pkthdr.len = pktlen + sizeof(struct ip6_hdr);
 
 	/* Address the message with the packet's source. */
 	sin6.sin6_addr = ip6->ip6_src;
 
 	/* Fill in the upcall header at the front of the message. */
 	upcall = mtod(msg, struct mrt6msg *);
 	upcall->im6_msgtype = MRT6MSG_WHOLEPKT;
 	upcall->im6_mbz = 0;
 	upcall->im6_mif = mif - mif6table;
 
 	/* iif info is not given for register encapsulation */
 	MRT6STAT_INC(mrt6s_upcalls);
 
 	if (socket_send(V_ip6_mrouter, msg, &sin6) >= 0)
 		return (0);
 
 #ifdef MRT6DEBUG
 	if (V_mrt6debug)
 		log(LOG_WARNING,
 		    "register_send: ip6_mrouter socket queue full\n");
 #endif
 	MRT6STAT_INC(mrt6s_upq_sockfull);
 	return (ENOBUFS);
 }
 
 /*
  * pim6_encapcheck() is called by the encap6_input() path at runtime to
  * determine if a packet is for PIM; allowing PIM to be dynamically loaded
  * into the kernel.
  *
  * Returns 64 to claim a datagram whose protocol is IPPROTO_PIM and 0 to
  * reject anything else.  m, off and arg are not examined.
  */
 static int
 pim6_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
 {
 
 #ifdef DIAGNOSTIC
 	KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM"));
 #endif
 	/* Zero rejects the datagram; a non-zero value claims it. */
 	return ((proto == IPPROTO_PIM) ? 64 : 0);
 }
 
 /*
  * PIM sparse mode hook
  * Receives the pim control messages, and passes them up to the listening
  * socket, using rip6_input.
  * The only message processed is the REGISTER pim message; the pim header
  * is stripped off, and the inner packet is passed to register_mforward.
  */
 int
 pim6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct pim *pim; /* pointer to a pim struct */
 	struct ip6_hdr *ip6;
 	int pimlen;
 	struct mbuf *m = *mp;
 	int minlen;
 	int off = *offp;
 
 	PIM6STAT_INC(pim6s_rcv_total);
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	pimlen = m->m_pkthdr.len - *offp;
 
 	/*
 	 * Validate lengths
 	 */
 	if (pimlen < PIM_MINLEN) {
 		PIM6STAT_INC(pim6s_rcv_tooshort);
 #ifdef MRT6DEBUG
 		if (V_mrt6debug & DEBUG_PIM)
 			log(LOG_DEBUG,"pim6_input: PIM packet too short\n");
 #endif
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * if the packet is at least as big as a REGISTER, go ahead
 	 * and grab the PIM REGISTER header size, to avoid another
 	 * possible m_pullup() later.
 	 *
 	 * PIM_MINLEN       == pimhdr + u_int32 == 8
 	 * PIM6_REG_MINLEN   == pimhdr + reghdr + eip6hdr == 4 + 4 + 40
 	 */
 	minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN;
 
 	/*
 	 * Make sure that the IP6 and PIM headers in contiguous memory, and
 	 * possibly the PIM REGISTER header
 	 */
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, minlen, IPPROTO_DONE);
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* adjust mbuf to point to the PIM header */
 	pim = (struct pim *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen);
 	if (pim == NULL) {
 		PIM6STAT_INC(pim6s_rcv_tooshort);
 		return (IPPROTO_DONE);
 	}
 #endif
 
 #define PIM6_CHECKSUM
 #ifdef PIM6_CHECKSUM
 	{
 		int cksumlen;
 
 		/*
 		 * Validate checksum.
 		 * If PIM REGISTER, exclude the data packet
 		 */
 		if (pim->pim_type == PIM_REGISTER)
 			cksumlen = PIM_MINLEN;
 		else
 			cksumlen = pimlen;
 
 		if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) {
 			PIM6STAT_INC(pim6s_rcv_badsum);
 #ifdef MRT6DEBUG
 			if (V_mrt6debug & DEBUG_PIM)
 				log(LOG_DEBUG,
 				    "pim6_input: invalid checksum\n");
 #endif
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	}
 #endif /* PIM_CHECKSUM */
 
 	/* PIM version check */
 	if (pim->pim_ver != PIM_VERSION) {
 		PIM6STAT_INC(pim6s_rcv_badversion);
 #ifdef MRT6DEBUG
 		log(LOG_ERR,
 		    "pim6_input: incorrect version %d, expecting %d\n",
 		    pim->pim_ver, PIM_VERSION);
 #endif
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	if (pim->pim_type == PIM_REGISTER) {
 		/*
 		 * since this is a REGISTER, we'll make a copy of the register
 		 * headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the
 		 * routing daemon.
 		 */
 		static struct sockaddr_in6 dst = { sizeof(dst), AF_INET6 };
 
 		struct mbuf *mcp;
 		struct ip6_hdr *eip6;
 		u_int32_t *reghdr;
 		int rc;
 #ifdef MRT6DEBUG
 		char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 #endif
 
 		PIM6STAT_INC(pim6s_rcv_registers);
 
 		if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) {
 #ifdef MRT6DEBUG
 			if (V_mrt6debug & DEBUG_PIM)
 				log(LOG_DEBUG,
 				    "pim6_input: register mif not set: %d\n",
 				    reg_mif_num);
 #endif
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		reghdr = (u_int32_t *)(pim + 1);
 
 		if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 			goto pim6_input_to_daemon;
 
 		/*
 		 * Validate length
 		 */
 		if (pimlen < PIM6_REG_MINLEN) {
 			PIM6STAT_INC(pim6s_rcv_tooshort);
 			PIM6STAT_INC(pim6s_rcv_badregisters);
 #ifdef MRT6DEBUG
 			log(LOG_ERR,
 			    "pim6_input: register packet size too "
 			    "small %d from %s\n",
 			    pimlen, ip6_sprintf(ip6bufs, &ip6->ip6_src));
 #endif
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		eip6 = (struct ip6_hdr *) (reghdr + 1);
 #ifdef MRT6DEBUG
 		if (V_mrt6debug & DEBUG_PIM)
 			log(LOG_DEBUG,
 			    "pim6_input[register], eip6: %s -> %s, "
 			    "eip6 plen %d\n",
 			    ip6_sprintf(ip6bufs, &eip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &eip6->ip6_dst),
 			    ntohs(eip6->ip6_plen));
 #endif
 
 		/* verify the version number of the inner packet */
 		if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 			PIM6STAT_INC(pim6s_rcv_badregisters);
 #ifdef MRT6DEBUG
 			log(LOG_DEBUG, "pim6_input: invalid IP version (%d) "
 			    "of the inner packet\n",
 			    (eip6->ip6_vfc & IPV6_VERSION));
 #endif
 			m_freem(m);
 			return (IPPROTO_NONE);
 		}
 
 		/* verify the inner packet is destined to a mcast group */
 		if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) {
 			PIM6STAT_INC(pim6s_rcv_badregisters);
 #ifdef MRT6DEBUG
 			if (V_mrt6debug & DEBUG_PIM)
 				log(LOG_DEBUG,
 				    "pim6_input: inner packet of register "
 				    "is not multicast %s\n",
 				    ip6_sprintf(ip6bufd, &eip6->ip6_dst));
 #endif
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		/*
 		 * make a copy of the whole header to pass to the daemon later.
 		 */
 		mcp = m_copy(m, 0, off + PIM6_REG_MINLEN);
 		if (mcp == NULL) {
 #ifdef MRT6DEBUG
 			log(LOG_ERR,
 			    "pim6_input: pim register: "
 			    "could not copy register head\n");
 #endif
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		/*
 		 * forward the inner ip6 packet; point m_data at the inner ip6.
 		 */
 		m_adj(m, off + PIM_MINLEN);
 #ifdef MRT6DEBUG
 		if (V_mrt6debug & DEBUG_PIM) {
 			log(LOG_DEBUG,
 			    "pim6_input: forwarding decapsulated register: "
 			    "src %s, dst %s, mif %d\n",
 			    ip6_sprintf(ip6bufs, &eip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &eip6->ip6_dst),
 			    reg_mif_num);
 		}
 #endif
 
 		rc = if_simloop(mif6table[reg_mif_num].m6_ifp, m,
 				dst.sin6_family, 0);
 
 		/* prepare the register head to send to the mrouting daemon */
 		m = mcp;
 	}
 
 	/*
 	 * Pass the PIM message up to the daemon; if it is a register message
 	 * pass the 'head' only up to the daemon. This includes the
 	 * encapsulator ip6 header, pim header, register header and the
 	 * encapsulated ip6 header.
 	 */
   pim6_input_to_daemon:
 	rip6_input(&m, offp, proto);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Module event handler.
  * MOD_LOAD sets up the locks, attaches the PIM encapsulation handler and
  * installs the multicast routing hooks; MOD_UNLOAD undoes all of that but
  * refuses with EINVAL while V_ip6_mrouter is still set.  Any other event
  * yields EOPNOTSUPP.
  */
 static int
 ip6_mroute_modevent(module_t mod, int type, void *unused)
 {
 
 	if (type == MOD_LOAD) {
 		MROUTER6_LOCK_INIT();
 		MFC6_LOCK_INIT();
 		MIF6_LOCK_INIT();
 
 		pim6_encap_cookie = encap_attach_func(AF_INET6, IPPROTO_PIM,
 			pim6_encapcheck,
 			(const struct protosw *)&in6_pim_protosw, NULL);
 		if (pim6_encap_cookie == NULL) {
 			/* Back out the locks created just above. */
 			printf("ip6_mroute: unable to attach pim6 encap\n");
 			MIF6_LOCK_DESTROY();
 			MFC6_LOCK_DESTROY();
 			MROUTER6_LOCK_DESTROY();
 			return (EINVAL);
 		}
 
 		/* Publish the entry points. */
 		ip6_mforward = X_ip6_mforward;
 		ip6_mrouter_done = X_ip6_mrouter_done;
 		ip6_mrouter_get = X_ip6_mrouter_get;
 		ip6_mrouter_set = X_ip6_mrouter_set;
 		mrt6_ioctl = X_mrt6_ioctl;
 		return (0);
 	}
 
 	if (type == MOD_UNLOAD) {
 		if (V_ip6_mrouter != NULL)
 			return (EINVAL);
 
 		if (pim6_encap_cookie != NULL) {
 			encap_detach(pim6_encap_cookie);
 			pim6_encap_cookie = NULL;
 		}
 		X_ip6_mrouter_done();
 
 		/* Withdraw the entry points. */
 		ip6_mforward = NULL;
 		ip6_mrouter_done = NULL;
 		ip6_mrouter_get = NULL;
 		ip6_mrouter_set = NULL;
 		mrt6_ioctl = NULL;
 
 		MIF6_LOCK_DESTROY();
 		MFC6_LOCK_DESTROY();
 		MROUTER6_LOCK_DESTROY();
 		return (0);
 	}
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Module descriptor tying the name "ip6_mroute" to its event handler.
  * NOTE(review): the trailing 0 is presumably the handler's extra argument
  * (ip6_mroute_modevent ignores its third parameter) -- confirm against
  * moduledata_t.
  */
 static moduledata_t ip6_mroutemod = {
 	"ip6_mroute",
 	ip6_mroute_modevent,
 	0
 };
 
 /* Declare the module to the kernel using the descriptor above. */
 DECLARE_MODULE(ip6_mroute, ip6_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: head/sys/netinet6/nd6.c
===================================================================
--- head/sys/netinet6/nd6.c	(revision 258540)
+++ head/sys/netinet6/nd6.c	(revision 258541)
@@ -1,2377 +1,2376 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/protosw.h>
 #include <sys/errno.h>
 #include <sys/syslog.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/queue.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arc.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/iso88025.h>
 #include <net/fddi.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <net/if_llatbl.h>
 #define	L3_ADDR_SIN6(le)	((struct sockaddr_in6 *) L3_ADDR(le))
 #include <netinet/if_ether.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet/icmp6.h>
 #include <netinet6/send.h>
 
 #include <sys/limits.h>
 
 #include <security/mac/mac_framework.h>
 
 #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */
 #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */
 
 #define SIN6(s) ((const struct sockaddr_in6 *)(s))
 
 /* timer values */
 VNET_DEFINE(int, nd6_prune)	= 1;	/* walk list every 1 second */
 VNET_DEFINE(int, nd6_delay)	= 5;	/* delay first probe time: 5 seconds */
 VNET_DEFINE(int, nd6_umaxtries)	= 3;	/* maximum unicast query */
 VNET_DEFINE(int, nd6_mmaxtries)	= 3;	/* maximum multicast query */
 VNET_DEFINE(int, nd6_useloopback) = 1;	/* use loopback interface for
 					 * local traffic */
 VNET_DEFINE(int, nd6_gctimer)	= (60 * 60 * 24); /* 1 day: garbage
 					 * collection timer */
 
 /* preventing too many loops in ND option parsing */
 static VNET_DEFINE(int, nd6_maxndopt) = 10; /* max # of ND options allowed */
 
 VNET_DEFINE(int, nd6_maxnudhint) = 0;	/* max # of subsequent upper
 					 * layer hints */
 static VNET_DEFINE(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved
 					 * ND entries */
 #define	V_nd6_maxndopt			VNET(nd6_maxndopt)
 #define	V_nd6_maxqueuelen		VNET(nd6_maxqueuelen)
 
 /* Debug output defaults to on only when built with ND6_DEBUG. */
 #ifdef ND6_DEBUG
 VNET_DEFINE(int, nd6_debug) = 1;
 #else
 VNET_DEFINE(int, nd6_debug) = 0;
 #endif
 
 /* for debugging? */
 #if 0
 static int nd6_inuse, nd6_allocated;
 #endif
 
 /* Default router list and prefix list; initialized in nd6_init(). */
 VNET_DEFINE(struct nd_drhead, nd_defrouter);
 VNET_DEFINE(struct nd_prhead, nd_prefix);
 
 VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL;
 #define	V_nd6_recalc_reachtm_interval	VNET(nd6_recalc_reachtm_interval)
 
 int	(*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int);
 
 /* Forward declarations of file-local helpers. */
 static int nd6_is_new_addr_neighbor(struct sockaddr_in6 *,
 	struct ifnet *);
 static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *);
 static void nd6_slowtimo(void *);
 static int regen_tmpaddr(struct in6_ifaddr *);
 static struct llentry *nd6_free(struct llentry *, int);
 static void nd6_llinfo_timer(void *);
 static void clear_llinfo_pqueue(struct llentry *);
 
 /* Callout that drives nd6_slowtimo(); armed in nd6_init(). */
 static VNET_DEFINE(struct callout, nd6_slowtimo_ch);
 #define	V_nd6_slowtimo_ch		VNET(nd6_slowtimo_ch)
 
 VNET_DEFINE(struct callout, nd6_timer_ch);
 
 /*
  * Set up the ND6 state: empty default router and prefix lists, and the
  * slow timer armed to call nd6_slowtimo() with the current vnet.
  */
 void
 nd6_init(void)
 {
 
 	/* Both lists start out empty. */
 	TAILQ_INIT(&V_nd_defrouter);
 	LIST_INIT(&V_nd_prefix);
 
 	/* First slow-timer run is ND6_SLOWTIMER_INTERVAL seconds away. */
 	callout_init(&V_nd6_slowtimo_ch, 0);
 	callout_reset(&V_nd6_slowtimo_ch, hz * ND6_SLOWTIMER_INTERVAL,
 	    nd6_slowtimo, curvnet);
 }
 
 #ifdef VIMAGE
 /*
  * Drain both ND6 callouts (slow timer and nd6_timer_ch).
  * Only built for VIMAGE kernels.
  */
 void
 nd6_destroy(void)
 {
 
 	callout_drain(&V_nd6_slowtimo_ch);
 	callout_drain(&V_nd6_timer_ch);
 }
 #endif
 
 /*
  * Allocate and initialize the per-interface ND information for ifp.
  * The allocation uses M_WAITOK, so the result is returned without a NULL
  * check; the caller releases it with nd6_ifdetach().
  */
 struct nd_ifinfo *
 nd6_ifattach(struct ifnet *ifp)
 {
 	struct nd_ifinfo *ndi;
 	int loopback, bridge;
 
 	ndi = (struct nd_ifinfo *)malloc(sizeof(*ndi), M_IP6NDP, M_WAITOK|M_ZERO);
 	ndi->initialized = 1;
 
 	/* Protocol defaults. */
 	ndi->chlim = IPV6_DEFHLIM;
 	ndi->basereachable = REACHABLE_TIME;
 	ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable);
 	ndi->retrans = RETRANS_TIMER;
 
 	ndi->flags = ND6_IFF_PERFORMNUD;
 
 	loopback = (ifp->if_flags & IFF_LOOPBACK) != 0;
 	bridge = (ifp->if_type == IFT_BRIDGE);
 
 	/* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL.
 	 * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by
 	 * default regardless of the V_ip6_auto_linklocal configuration to
 	 * give a reasonable default behavior.
 	 */
 	if ((V_ip6_auto_linklocal && !bridge) || loopback)
 		ndi->flags |= ND6_IFF_AUTO_LINKLOCAL;
 
 	/*
 	 * A loopback interface does not need to accept RTADV.
 	 * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by
 	 * default regardless of the V_ip6_accept_rtadv configuration to
 	 * prevent the interface from accepting RA messages arrived
 	 * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV.
 	 */
 	if (V_ip6_accept_rtadv && !loopback && !bridge)
 		ndi->flags |= ND6_IFF_ACCEPT_RTADV;
 
 	if (V_ip6_no_radr && !loopback)
 		ndi->flags |= ND6_IFF_NO_RADR;
 
 	/* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */
 	nd6_setmtu0(ifp, ndi);
 
 	return (ndi);
 }
 
 /*
  * Release per-interface ND information; nd is freed with the same
  * M_IP6NDP malloc type that nd6_ifattach() allocates with.
  */
 void
 nd6_ifdetach(struct nd_ifinfo *nd)
 {
 
 	free(nd, M_IP6NDP);
 }
 
 /*
  * Reset ND level link MTU. This function is called when the physical MTU
  * changes, which means we might have to adjust the ND level MTU.
  * It is a thin wrapper that passes ifp's own ND info to nd6_setmtu0().
  */
 void
 nd6_setmtu(struct ifnet *ifp)
 {
 
 	nd6_setmtu0(ifp, ND_IFINFO(ifp));
 }
 
 /*
  * Recompute ndi->maxmtu from ifp->if_mtu, capped by a per-link-type
  * maximum for ARCnet, FDDI and ISO 88025 interfaces.  A notice is logged
  * when the MTU drops below IPV6_MMTU, and in6_setmaxmtu() is called when
  * the new value exceeds V_in6_maxmtu.
  *
  * XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu
  */
 static void
 nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi)
 {
 	u_int32_t omaxmtu;
 
 	omaxmtu = ndi->maxmtu;
 
 	switch (ifp->if_type) {
 	case IFT_ARCNET:
 		ndi->maxmtu = MIN(ARC_PHDS_MAXMTU, ifp->if_mtu); /* RFC2497 */
 		break;
 	case IFT_FDDI:
 		ndi->maxmtu = MIN(FDDIIPMTU, ifp->if_mtu); /* RFC2467 */
 		break;
 	case IFT_ISO88025:
 		ndi->maxmtu = MIN(ISO88025_MAX_MTU, ifp->if_mtu);
 		break;
 	default:
 		ndi->maxmtu = ifp->if_mtu;
 		break;
 	}
 
 	/*
 	 * Decreasing the interface MTU under IPV6 minimum MTU may cause
 	 * undesirable situation.  We thus notify the operator of the change
 	 * explicitly.  The check for omaxmtu is necessary to restrict the
 	 * log to the case of changing the MTU, not initializing it.
 	 */
 	if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) {
 		log(LOG_NOTICE, "nd6_setmtu0: "
 		    "new link MTU on %s (%lu) is too small for IPv6\n",
 		    if_name(ifp), (unsigned long)ndi->maxmtu);
 	}
 
 	if (ndi->maxmtu > V_in6_maxmtu)
 		in6_setmaxmtu(); /* check all interfaces just in case */
 
 }
 
 /*
  * Prepare ndopts for walking the options that start at opt and span
  * icmp6len bytes.  With a zero length the walk is marked as already
  * finished.
  */
 void
 nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts)
 {
 
 	bzero(ndopts, sizeof(*ndopts));
 
 	/* nd_opts_last addresses the first byte past the option area. */
 	ndopts->nd_opts_last =
 	    (struct nd_opt_hdr *)((u_char *)opt + icmp6len);
 
 	if (icmp6len != 0) {
 		ndopts->nd_opts_search = (struct nd_opt_hdr *)opt;
 	} else {
 		ndopts->nd_opts_done = 1;
 		ndopts->nd_opts_search = NULL;
 	}
 }
 
 /*
  * Take one ND option.
  */
 struct nd_opt_hdr *
 nd6_option(union nd_opts *ndopts)
 {
 	struct nd_opt_hdr *nd_opt;
 	int olen;
 
 	KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
 	KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
 	    __func__));
 	if (ndopts->nd_opts_search == NULL)
 		return NULL;
 	if (ndopts->nd_opts_done)
 		return NULL;
 
 	nd_opt = ndopts->nd_opts_search;
 
 	/* make sure nd_opt_len is inside the buffer */
 	if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) {
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	}
 
 	olen = nd_opt->nd_opt_len << 3;
 	if (olen == 0) {
 		/*
 		 * Message validation requires that all included
 		 * options have a length that is greater than zero.
 		 */
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	}
 
 	ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen);
 	if (ndopts->nd_opts_search > ndopts->nd_opts_last) {
 		/* option overruns the end of buffer, invalid */
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	} else if (ndopts->nd_opts_search == ndopts->nd_opts_last) {
 		/* reached the end of options chain */
 		ndopts->nd_opts_done = 1;
 		ndopts->nd_opts_search = NULL;
 	}
 	return nd_opt;
 }
 
 /*
  * Parse multiple ND options.
  *
  * Convenience front end to nd6_option() for ND routines that do not need
  * several options of the same type: each recognized option is recorded in
  * ndopts->nd_opt_array[], indexed by its type.  Returns 0 on success
  * (including when the option limit V_nd6_maxndopt stops the walk early)
  * and -1 when a malformed option was found, in which case ndopts is
  * zeroed.
  */
 int
 nd6_options(union nd_opts *ndopts)
 {
 	struct nd_opt_hdr *opt;
 	int seen;
 
 	KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
 	KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
 	    __func__));
 	if (ndopts->nd_opts_search == NULL)
 		return (0);
 
 	/* "seen" counts loop passes, starting at one for the first pass. */
 	for (seen = 1; ; seen++) {
 		opt = nd6_option(ndopts);
 		if (opt == NULL && ndopts->nd_opts_last == NULL) {
 			/*
 			 * nd6_option() wiped the state: the option was
 			 * malformed (e.g. zero length or overrunning).
 			 */
 			ICMP6STAT_INC(icp6s_nd_badopt);
 			bzero(ndopts, sizeof(*ndopts));
 			return (-1);
 		}
 
 		if (opt != NULL) {
 			switch (opt->nd_opt_type) {
 			case ND_OPT_SOURCE_LINKADDR:
 			case ND_OPT_TARGET_LINKADDR:
 			case ND_OPT_MTU:
 			case ND_OPT_REDIRECTED_HEADER:
 				/* Keep the first instance, report later ones. */
 				if (ndopts->nd_opt_array[opt->nd_opt_type] ==
 				    NULL) {
 					ndopts->nd_opt_array[opt->nd_opt_type] =
 					    opt;
 				} else {
 					nd6log((LOG_INFO,
 					    "duplicated ND6 option found (type=%d)\n",
 					    opt->nd_opt_type));
 					/* XXX bark? */
 				}
 				break;
 			case ND_OPT_PREFIX_INFORMATION:
 				/*
 				 * Remember the first prefix option and keep
 				 * moving the end marker to the latest one.
 				 */
 				if (ndopts->nd_opt_array[opt->nd_opt_type] ==
 				    0) {
 					ndopts->nd_opt_array[opt->nd_opt_type] =
 					    opt;
 				}
 				ndopts->nd_opts_pi_end =
 				    (struct nd_opt_prefix_info *)opt;
 				break;
 			/* What about ND_OPT_ROUTE_INFO? RFC 4191 */
 			case ND_OPT_RDNSS:	/* RFC 6106 */
 			case ND_OPT_DNSSL:	/* RFC 6106 */
 				/*
 				 * Known options the kernel does not care
 				 * about: silently ignored.
 				 */
 				break;
 			default:
 				/*
 				 * Unknown options must be silently ignored,
 				 * to accommodate future protocol extensions.
 				 */
 				nd6log((LOG_DEBUG,
 				    "nd6_options: unsupported option %d - "
 				    "option ignored\n", opt->nd_opt_type));
 			}
 		}
 
 		if (seen > V_nd6_maxndopt) {
 			ICMP6STAT_INC(icp6s_nd_toomanyopt);
 			nd6log((LOG_INFO, "too many loop in nd opt\n"));
 			break;
 		}
 
 		if (ndopts->nd_opts_done)
 			break;
 	}
 
 	return (0);
 }
 
 /*
  * ND6 timer routine to handle ND6 entries.
  *
  * Arms the entry's callout to fire nd6_llinfo_timer() after "tick" ticks,
  * or cancels it when tick is negative.  The caller must hold the entry's
  * write lock.  Arming takes a reference on the entry; that reference is
  * dropped again when the callout call reports a non-zero result.
  */
 void
 nd6_llinfo_settimer_locked(struct llentry *ln, long tick)
 {
 	int first_leg, stopped;
 
 	LLE_WLOCK_ASSERT(ln);
 
 	if (tick >= 0) {
 		ln->la_expire = time_uptime + tick / hz;
 		LLE_ADDREF(ln);
 		/*
 		 * Delays above INT_MAX ticks are split: run INT_MAX ticks
 		 * now and park the excess in ln_ntick.
 		 */
 		if (tick > INT_MAX) {
 			ln->ln_ntick = tick - INT_MAX;
 			first_leg = INT_MAX;
 		} else {
 			ln->ln_ntick = 0;
 			first_leg = tick;
 		}
 		stopped = callout_reset(&ln->ln_timer_ch, first_leg,
 		    nd6_llinfo_timer, ln);
 	} else {
 		ln->la_expire = 0;
 		ln->ln_ntick = 0;
 		stopped = callout_stop(&ln->ln_timer_ch);
 	}
 	if (stopped)
 		LLE_REMREF(ln);
 }
 
 /*
  * Unlocked front end to nd6_llinfo_settimer_locked(): takes the entry's
  * write lock around the call and releases it afterwards.  "tick" is passed
  * through unchanged (negative cancels the timer).
  */
 void
 nd6_llinfo_settimer(struct llentry *ln, long tick)
 {
 
 	LLE_WLOCK(ln);
 	nd6_llinfo_settimer_locked(ln, tick);
 	LLE_WUNLOCK(ln);
 }
 
 static void
 nd6_llinfo_timer(void *arg)
 {
 	struct llentry *ln;
 	struct in6_addr *dst;
 	struct ifnet *ifp;
 	struct nd_ifinfo *ndi = NULL;
 
 	KASSERT(arg != NULL, ("%s: arg NULL", __func__));
 	ln = (struct llentry *)arg;
 	LLE_WLOCK_ASSERT(ln);
 	ifp = ln->lle_tbl->llt_ifp;
 
 	CURVNET_SET(ifp->if_vnet);
 
 	if (ln->ln_ntick > 0) {
 		if (ln->ln_ntick > INT_MAX) {
 			ln->ln_ntick -= INT_MAX;
 			nd6_llinfo_settimer_locked(ln, INT_MAX);
 		} else {
 			ln->ln_ntick = 0;
 			nd6_llinfo_settimer_locked(ln, ln->ln_ntick);
 		}
 		goto done;
 	}
 
 	ndi = ND_IFINFO(ifp);
 	dst = &L3_ADDR_SIN6(ln)->sin6_addr;
 	if (ln->la_flags & LLE_STATIC) {
 		goto done;
 	}
 
 	if (ln->la_flags & LLE_DELETED) {
 		(void)nd6_free(ln, 0);
 		ln = NULL;
 		goto done;
 	}
 
 	switch (ln->ln_state) {
 	case ND6_LLINFO_INCOMPLETE:
 		if (ln->la_asked < V_nd6_mmaxtries) {
 			ln->la_asked++;
 			nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000);
 			LLE_WUNLOCK(ln);
 			nd6_ns_output(ifp, NULL, dst, ln, 0);
 			LLE_WLOCK(ln);
 		} else {
 			struct mbuf *m = ln->la_hold;
 			if (m) {
 				struct mbuf *m0;
 
 				/*
 				 * assuming every packet in la_hold has the
 				 * same IP header.  Send error after unlock.
 				 */
 				m0 = m->m_nextpkt;
 				m->m_nextpkt = NULL;
 				ln->la_hold = m0;
 				clear_llinfo_pqueue(ln);
 			}
 			EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_TIMEDOUT);
 			(void)nd6_free(ln, 0);
 			ln = NULL;
 			if (m != NULL)
 				icmp6_error2(m, ICMP6_DST_UNREACH,
 				    ICMP6_DST_UNREACH_ADDR, 0, ifp);
 		}
 		break;
 	case ND6_LLINFO_REACHABLE:
 		if (!ND6_LLINFO_PERMANENT(ln)) {
 			ln->ln_state = ND6_LLINFO_STALE;
 			nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz);
 		}
 		break;
 
 	case ND6_LLINFO_STALE:
 		/* Garbage Collection(RFC 2461 5.3) */
 		if (!ND6_LLINFO_PERMANENT(ln)) {
 			EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED);
 			(void)nd6_free(ln, 1);
 			ln = NULL;
 		}
 		break;
 
 	case ND6_LLINFO_DELAY:
 		if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) {
 			/* We need NUD */
 			ln->la_asked = 1;
 			ln->ln_state = ND6_LLINFO_PROBE;
 			nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000);
 			LLE_WUNLOCK(ln);
 			nd6_ns_output(ifp, dst, dst, ln, 0);
 			LLE_WLOCK(ln);
 		} else {
 			ln->ln_state = ND6_LLINFO_STALE; /* XXX */
 			nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz);
 		}
 		break;
 	case ND6_LLINFO_PROBE:
 		if (ln->la_asked < V_nd6_umaxtries) {
 			ln->la_asked++;
 			nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000);
 			LLE_WUNLOCK(ln);
 			nd6_ns_output(ifp, dst, dst, ln, 0);
 			LLE_WLOCK(ln);
 		} else {
 			EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED);
 			(void)nd6_free(ln, 0);
 			ln = NULL;
 		}
 		break;
 	default:
 		panic("%s: paths in a dark night can be confusing: %d",
 		    __func__, ln->ln_state);
 	}
 done:
 	if (ln != NULL)
 		LLE_FREE_LOCKED(ln);
 	CURVNET_RESTORE();
 }
 
 
 /*
  * ND6 timer routine to expire default route list and prefix list.
  *
  * arg is the vnet this timer instance runs for; it is made the current
  * vnet for the duration of the call.  The routine first re-arms itself to
  * fire again after V_nd6_prune * hz ticks, then expires default routers,
  * interface addresses and prefixes, in that order.
  */
 void
 nd6_timer(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 	struct nd_defrouter *dr, *ndr;
 	struct nd_prefix *pr, *npr;
 	struct in6_ifaddr *ia6, *nia6;
 
 	callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz,
 	    nd6_timer, curvnet);
 
 	/* expire default router list */
 	TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) {
 		/* An expire value of 0 means the router never expires here. */
 		if (dr->expire && dr->expire < time_uptime)
 			defrtrlist_del(dr);
 	}
 
 	/*
 	 * expire interface addresses.
 	 * in the past the loop was inside prefix expiry processing.
 	 * However, from a stricter spec-conformance standpoint, we should
 	 * rather separate address lifetimes and prefix lifetimes.
 	 *
 	 * XXXRW: in6_ifaddrhead locking.
 	 */
   addrloop:
 	TAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) {
 		/* check address lifetime */
 		if (IFA6_IS_INVALID(ia6)) {
 			int regen = 0;
 
 			/*
 			 * If the expiring address is temporary, try
 			 * regenerating a new one.  This would be useful when
 			 * we suspended a laptop PC, then turned it on after a
 			 * period that could invalidate all temporary
 			 * addresses.  Although we may have to restart the
 			 * loop (see below), it must be after purging the
 			 * address.  Otherwise, we'd see an infinite loop of
 			 * regeneration.
 			 */
 			if (V_ip6_use_tempaddr &&
 			    (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) {
 				if (regen_tmpaddr(ia6) == 0)
 					regen = 1;
 			}
 
 			in6_purgeaddr(&ia6->ia_ifa);
 
 			if (regen)
 				goto addrloop; /* XXX: see below */
 		} else if (IFA6_IS_DEPRECATED(ia6)) {
 			/* Remember the old flags to detect a fresh deprecation. */
 			int oldflags = ia6->ia6_flags;
 
 			ia6->ia6_flags |= IN6_IFF_DEPRECATED;
 
 			/*
 			 * If a temporary address has just become deprecated,
 			 * regenerate a new one if possible.
 			 */
 			if (V_ip6_use_tempaddr &&
 			    (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (oldflags & IN6_IFF_DEPRECATED) == 0) {
 
 				if (regen_tmpaddr(ia6) == 0) {
 					/*
 					 * A new temporary address is
 					 * generated.
 					 * XXX: this means the address chain
 					 * has changed while we are still in
 					 * the loop.  Although the change
 					 * would not cause disaster (because
 					 * it's not a deletion, but an
 					 * addition,) we'd rather restart the
 					 * loop just for safety.  Or does this
 					 * significantly reduce performance??
 					 */
 					goto addrloop;
 				}
 			}
 		} else {
 			/*
 			 * A new RA might have made a deprecated address
 			 * preferred.
 			 */
 			ia6->ia6_flags &= ~IN6_IFF_DEPRECATED;
 		}
 	}
 
 	/* expire prefix list */
 	LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) {
 		/*
 		 * check prefix lifetime.
 		 * since pltime is just for autoconf, pltime processing for
 		 * prefix is not necessary.
 		 */
 		if (pr->ndpr_vltime != ND6_INFINITE_LIFETIME &&
 		    time_uptime - pr->ndpr_lastupdate > pr->ndpr_vltime) {
 
 			/*
 			 * address expiration and prefix expiration are
 			 * separate.  NEVER perform in6_purgeaddr here.
 			 */
 			prelist_remove(pr);
 		}
 	}
 	CURVNET_RESTORE();
 }
 
 /*
  * Try to create a replacement for a temporary address.
  *
  * ia6 - deprecated/invalidated temporary address
  *
  * The interface's address list is searched for a still-preferred public
  * autoconf address that shares ia6's prefix; a new temporary address is
  * then derived from it with in6_tmpifadd().  Nothing is created when the
  * prefix already has a still-preferred temporary address.
  *
  * Returns 0 when a new temporary address was created, -1 otherwise.
  */
 static int
 regen_tmpaddr(struct in6_ifaddr *ia6)
 {
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	struct in6_ifaddr *public_ifa6 = NULL;
 	int e;
 
 	ifp = ia6->ia_ifa.ifa_ifp;
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in6_ifaddr *it6;
 
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 
 		it6 = (struct in6_ifaddr *)ifa;
 
 		/* ignore no autoconf addresses. */
 		if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 			continue;
 
 		/* ignore autoconf addresses with different prefixes. */
 		if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr)
 			continue;
 
 		/*
 		 * Now we are looking at an autoconf address with the same
 		 * prefix as ours.  If the address is temporary and is still
 		 * preferred, do not create another one.  It would be rare, but
 		 * could happen, for example, when we resume a laptop PC after
 		 * a long period.
 		 */
 		if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 		    !IFA6_IS_DEPRECATED(it6)) {
 			public_ifa6 = NULL;
 			break;
 		}
 
 		/*
 		 * This is a public autoconf address that has the same prefix
 		 * as ours.  If it is preferred, keep it.  We can't break the
 		 * loop here, because there may be a still-preferred temporary
 		 * address with the prefix.
 		 */
 		if (!IFA6_IS_DEPRECATED(it6))
 			public_ifa6 = it6;
 	}
 	/*
 	 * Take exactly one reference on the final candidate, while the
 	 * address list is still locked.  Referencing inside the loop would
 	 * take one reference per remaining iteration and leak all of them
 	 * whenever the candidate is replaced or reset to NULL.
 	 */
 	if (public_ifa6 != NULL)
 		ifa_ref(&public_ifa6->ia_ifa);
 	IF_ADDR_RUNLOCK(ifp);
 
 	if (public_ifa6 == NULL)
 		return (-1);
 
 	/* The reference is only needed for the duration of the call. */
 	e = in6_tmpifadd(public_ifa6, 0, 0);
 	ifa_free(&public_ifa6->ia_ifa);
 	if (e != 0) {
 		log(LOG_NOTICE, "regen_tmpaddr: failed to create a new"
 		    " tmp addr,errno=%d\n", e);
 		return (-1);
 	}
 	return (0);
 }
 
 /*
  * Nuke neighbor cache/prefix/default router management table, right before
  * ifp goes away.
  *
  * Removes the default routers and prefixes that belong to ifp, clears the
  * default outgoing interface if it was ifp, and refreshes the default
  * router selection when ifp accepts router advertisements.
  */
 void
 nd6_purge(struct ifnet *ifp)
 {
 	struct nd_defrouter *rtr, *rtr_next;
 	struct nd_prefix *pfx, *pfx_next;
 
 	/*
 	 * Default routers toward ifp are removed in two passes.  Entries
 	 * that are not installed in the routing table go first; the
 	 * installed ones are deferred to the second pass in order to keep
 	 * additional side effects as small as possible.
 	 */
 	TAILQ_FOREACH_SAFE(rtr, &V_nd_defrouter, dr_entry, rtr_next) {
 		if (!rtr->installed && rtr->ifp == ifp)
 			defrtrlist_del(rtr);
 	}
 	TAILQ_FOREACH_SAFE(rtr, &V_nd_defrouter, dr_entry, rtr_next) {
 		if (rtr->installed && rtr->ifp == ifp)
 			defrtrlist_del(rtr);
 	}
 
 	/* Prefix list entries toward ifp. */
 	LIST_FOREACH_SAFE(pfx, &V_nd_prefix, ndpr_entry, pfx_next) {
 		if (pfx->ndpr_ifp != ifp)
 			continue;
 
 		/*
 		 * if_detach() does *not* release prefixes while purging
 		 * addresses, so the reference count is still above zero.
 		 * Reset it to make sure the prefix really gets purged.
 		 */
 		pfx->ndpr_refcnt = 0;
 
 		/*
 		 * The addresses on the prefix used to be removed here as
 		 * well.  That is believed to be unnecessary: nd6_purge() is
 		 * only called from in6_ifdetach(), which removes all the
 		 * associated interface addresses by itself.
 		 * (jinmei@kame.net 20010129)
 		 */
 		prelist_remove(pfx);
 	}
 
 	/* Cancel the default outgoing interface setting. */
 	if (V_nd6_defifindex == ifp->if_index)
 		nd6_setdefaultiface(0);
 
 	/* Refresh the default router list. */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0)
 		defrouter_select();
 
 	/*
 	 * XXXXX
 	 * The neighbor cache entries are no longer nuked here because the
 	 * neighbor cache is kept in if_afdata[AF_INET6].  nd6_purge() is
 	 * invoked by in6_ifdetach(), which is called from if_detach() where
 	 * everything gets purged, so in6_domifdetach() does the actual L2
 	 * table purging work.
 	 */
 }
 
 /* 
  * the caller acquires and releases the lock on the lltbls
  * Returns the llentry locked
  */
 struct llentry *
 nd6_lookup(struct in6_addr *addr6, int flags, struct ifnet *ifp)
 {
 	struct sockaddr_in6 sin6;
 	struct llentry *ln;
 	int llflags;
 	
 	bzero(&sin6, sizeof(sin6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_addr = *addr6;
 
 	IF_AFDATA_LOCK_ASSERT(ifp);
 
 	llflags = 0;
 	if (flags & ND6_CREATE)
 	    llflags |= LLE_CREATE;
 	if (flags & ND6_EXCLUSIVE)
 	    llflags |= LLE_EXCLUSIVE;	
 	
 	ln = lla_lookup(LLTABLE6(ifp), llflags, (struct sockaddr *)&sin6);
 	if ((ln != NULL) && (llflags & LLE_CREATE))
 		ln->ln_state = ND6_LLINFO_NOSTATE;
 	
 	return (ln);
 }
 
 /*
  * Test whether a given IPv6 address is a neighbor or not, ignoring
  * the actual neighbor cache.  The neighbor cache is ignored in order
  * to not reenter the routing code from within itself.
  */
 static int
 nd6_is_new_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp)
 {
 	struct nd_prefix *pr;
 	struct ifaddr *dstaddr;
 
 	/*
 	 * A link-local address is always a neighbor.
 	 * XXX: a link does not necessarily specify a single interface.
 	 */
 	if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) {
 		struct sockaddr_in6 sin6_copy;
 		u_int32_t zone;
 
 		/*
 		 * We need sin6_copy since sa6_recoverscope() may modify the
 		 * content (XXX).
 		 */
 		sin6_copy = *addr;
 		if (sa6_recoverscope(&sin6_copy))
 			return (0); /* XXX: should be impossible */
 		if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone))
 			return (0);
 		if (sin6_copy.sin6_scope_id == zone)
 			return (1);
 		else
 			return (0);
 	}
 
 	/*
 	 * If the address matches one of our addresses,
 	 * it should be a neighbor.
 	 * If the address matches one of our on-link prefixes, it should be a
 	 * neighbor.
 	 */
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		if (pr->ndpr_ifp != ifp)
 			continue;
 
 		if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) {
 			struct rtentry *rt;
 
 			/* Always use the default FIB here. */
 			rt = in6_rtalloc1((struct sockaddr *)&pr->ndpr_prefix,
 			    0, 0, RT_DEFAULT_FIB);
 			if (rt == NULL)
 				continue;
 			/*
 			 * This is the case where multiple interfaces
 			 * have the same prefix, but only one is installed 
 			 * into the routing table and that prefix entry
 			 * is not the one being examined here. In the case
 			 * where RADIX_MPATH is enabled, multiple route
 			 * entries (of the same rt_key value) will be 
 			 * installed because the interface addresses all
 			 * differ.
 			 */
 			if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
 			       &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr)) {
 				RTFREE_LOCKED(rt);
 				continue;
 			}
 			RTFREE_LOCKED(rt);
 		}
 
 		if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
 		    &addr->sin6_addr, &pr->ndpr_mask))
 			return (1);
 	}
 
 	/*
 	 * If the address is assigned on the node of the other side of
 	 * a p2p interface, the address should be a neighbor.
 	 */
 	dstaddr = ifa_ifwithdstaddr((struct sockaddr *)addr);
 	if (dstaddr != NULL) {
 		if (dstaddr->ifa_ifp == ifp) {
 			ifa_free(dstaddr);
 			return (1);
 		}
 		ifa_free(dstaddr);
 	}
 
 	/*
 	 * If the default router list is empty, all addresses are regarded
 	 * as on-link, and thus, as a neighbor.
 	 */
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV &&
 	    TAILQ_EMPTY(&V_nd_defrouter) &&
 	    V_nd6_defifindex == ifp->if_index) {
 		return (1);
 	}
 
 	return (0);
 }
 
 
 /*
  * Detect if a given IPv6 address identifies a neighbor on a given link.
  * The address/prefix based test is tried first; failing that, presence in
  * the neighbor cache of ifp is sufficient.  Returns 1 or 0.
  * XXX: should take care of the destination of a p2p link?
  */
 int
 nd6_is_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp)
 {
 	struct llentry *cached;
 
 	IF_AFDATA_UNLOCK_ASSERT(ifp);
 	if (nd6_is_new_addr_neighbor(addr, ifp))
 		return (1);
 
 	/*
 	 * Even if the address matches none of our addresses, it might be
 	 * in the neighbor cache.
 	 */
 	IF_AFDATA_RLOCK(ifp);
 	cached = nd6_lookup(&addr->sin6_addr, 0, ifp);
 	if (cached != NULL)
 		LLE_RUNLOCK(cached);
 	IF_AFDATA_RUNLOCK(ifp);
 
 	/* Only the pointer value is inspected after the unlock. */
 	return (cached != NULL);
 }
 
 /*
  * Free an nd6 llinfo entry.
  * Since the function would cause significant changes in the kernel, DO NOT
  * make it global, unless you have a strong reason for the change, and are sure
  * that the change is safe.
  *
  * ln - entry to free; must be write-locked by the caller (asserted).
  * gc - non-zero when the reason for the call is garbage collection.  In
  *      that case a STALE entry belonging to a default router with a
  *      non-zero expire time is kept and only has its timer reset.
  *
  * Returns the entry that followed ln on its list (LIST_NEXT(ln, lle_next)).
  * On every path ln is unlocked (or released) before returning.
  */
 static struct llentry *
 nd6_free(struct llentry *ln, int gc)
 {
         struct llentry *next;
 	struct nd_defrouter *dr;
 	struct ifnet *ifp;
 
 	LLE_WLOCK_ASSERT(ln);
 
 	/*
 	 * we used to have pfctlinput(PRC_HOSTDEAD) here.
 	 * even though it is not harmful, it was not really necessary.
 	 */
 
 	/* cancel timer */
 	nd6_llinfo_settimer_locked(ln, -1);
 
 	ifp = ln->lle_tbl->llt_ifp;
 
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
 		dr = defrouter_lookup(&L3_ADDR_SIN6(ln)->sin6_addr, ifp);
 
 		if (dr != NULL && dr->expire &&
 		    ln->ln_state == ND6_LLINFO_STALE && gc) {
 			/*
 			 * If the reason for the deletion is just garbage
 			 * collection, and the neighbor is an active default
 			 * router, do not delete it.  Instead, reset the GC
 			 * timer using the router's lifetime.
 			 * Simply deleting the entry would affect default
 			 * router selection, which is not necessarily a good
 			 * thing, especially when we're using router preference
 			 * values.
 			 * XXX: the check for ln_state would be redundant,
 			 *      but we intentionally keep it just in case.
 			 */
 			if (dr->expire > time_uptime)
 				nd6_llinfo_settimer_locked(ln,
 				    (dr->expire - time_uptime) * hz);
 			else
 				nd6_llinfo_settimer_locked(ln,
 				    (long)V_nd6_gctimer * hz);
 
 			/* Entry is kept: drop one reference and unlock. */
 			next = LIST_NEXT(ln, lle_next);
 			LLE_REMREF(ln);
 			LLE_WUNLOCK(ln);
 			return (next);
 		}
 
 		if (dr) {
 			/*
 			 * Unreachability of a router might affect the default
 			 * router selection and on-link detection of advertised
 			 * prefixes.
 			 */
 
 			/*
 			 * Temporarily fake the state to choose a new default
 			 * router and to perform on-link determination of
 			 * prefixes correctly.
 			 * Below the state will be set correctly,
 			 * or the entry itself will be deleted.
 			 */
 			ln->ln_state = ND6_LLINFO_INCOMPLETE;
 		}
 
 		if (ln->ln_router || dr) {
 
 			/*
 			 * We need to unlock to avoid a LOR with rt6_flush() with the
 			 * rnh and for the calls to pfxlist_onlink_check() and
 			 * defrouter_select() in the block further down for calls
 			 * into nd6_lookup().  We still hold a ref.
 			 */
 			LLE_WUNLOCK(ln);
 
 			/*
 			 * rt6_flush must be called whether or not the neighbor
 			 * is in the Default Router List.
 			 * See a corresponding comment in nd6_na_input().
 			 */
 			rt6_flush(&L3_ADDR_SIN6(ln)->sin6_addr, ifp);
 		}
 
 		if (dr) {
 			/*
 			 * Since defrouter_select() does not affect the
 			 * on-link determination and MIP6 needs the check
 			 * before the default router selection, we perform
 			 * the check now.
 			 */
 			pfxlist_onlink_check();
 
 			/*
 			 * Refresh default router list.
 			 */
 			defrouter_select();
 		}
 
 		/* Re-take the lock dropped above, under the same condition. */
 		if (ln->ln_router || dr)
 			LLE_WLOCK(ln);
 	}
 
 	/*
 	 * Before deleting the entry, remember the next entry as the
 	 * return value.  We need this because pfxlist_onlink_check() above
 	 * might have freed other entries (particularly the old next entry) as
 	 * a side effect (XXX).
 	 */
 	next = LIST_NEXT(ln, lle_next);
 
 	/*
 	 * Safe to unlock. We still hold an extra reference and will not
 	 * free(9) in llentry_free() if someone else holds one as well.
 	 * The entry lock is dropped so that the afdata lock can be taken
 	 * first, and is then re-acquired.
 	 */
 	LLE_WUNLOCK(ln);
 	IF_AFDATA_LOCK(ifp);
 	LLE_WLOCK(ln);
 
 	/* Guard against race with other llentry_free(). */
 	if (ln->la_flags & LLE_LINKED) {
 		LLE_REMREF(ln);
 		llentry_free(ln);
 	} else
 		LLE_FREE_LOCKED(ln);
 
 	IF_AFDATA_UNLOCK(ifp);
 
 	return (next);
 }
 
 /*
  * Upper-layer reachability hint for Neighbor Unreachability Detection.
  *
  * XXX cost-effective methods?
  */
 void
 nd6_nud_hint(struct rtentry *rt, struct in6_addr *dst6, int force)
 {
 	struct llentry *ln;
 	struct ifnet *ifp;
 
 	if ((dst6 == NULL) || (rt == NULL))
 		return;
 
 	ifp = rt->rt_ifp;
 	IF_AFDATA_LOCK(ifp);
 	ln = nd6_lookup(dst6, ND6_EXCLUSIVE, NULL);
 	IF_AFDATA_UNLOCK(ifp);
 	if (ln == NULL)
 		return;
 
 	if (ln->ln_state < ND6_LLINFO_REACHABLE)
 		goto done;
 
 	/*
 	 * if we get upper-layer reachability confirmation many times,
 	 * it is possible we have false information.
 	 */
 	if (!force) {
 		ln->ln_byhint++;
 		if (ln->ln_byhint > V_nd6_maxnudhint) {
 			goto done;
 		}
 	}
 
  	ln->ln_state = ND6_LLINFO_REACHABLE;
 	if (!ND6_LLINFO_PERMANENT(ln)) {
 		nd6_llinfo_settimer_locked(ln,
 		    (long)ND_IFINFO(rt->rt_ifp)->reachable * hz);
 	}
 done:
 	LLE_WUNLOCK(ln);
 }
 
 
 /*
  * Routing-request hook for ND6.
  *
  * Only RTM_DELETE of a default route through a gateway is acted upon: the
  * matching default router entry, if any, is marked as no longer installed.
  * Every other request is ignored.  The route must be locked (asserted).
  */
 void
 nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info)
 {
 	struct sockaddr_in6 *gw;
 	struct nd_defrouter *router;
 	struct ifnet *ifp;
 
 	RT_LOCK_ASSERT(rt);
 	gw = (struct sockaddr_in6 *)rt->rt_gateway;
 	ifp = rt->rt_ifp;
 
 	/* RTM_ADD and anything else need no processing. */
 	if (req != RTM_DELETE)
 		return;
 	if (ifp == NULL)
 		return;
 
 	/* Only indirect routes are interesting. */
 	if ((rt->rt_flags & RTF_GATEWAY) == 0)
 		return;
 
 	/* Only the default route (all-zero key) is interesting. */
 	if (!IN6_ARE_ADDR_EQUAL(&in6addr_any, &SIN6(rt_key(rt))->sin6_addr))
 		return;
 
 	router = defrouter_lookup(&gw->sin6_addr, ifp);
 	if (router != NULL)
 		router->installed = 0;
 }
 
 
 /*
  * Handle the ND6-related ioctl commands for interface ifp.
  *
  * cmd  - ioctl command; commands not listed in the switch are ignored.
  * data - command-specific request buffer.  Depending on cmd it is
  *        interpreted as struct in6_drlist, in6_oprlist, in6_ndireq,
  *        in6_nbrinfo or in6_ndifreq.
  * ifp  - interface the request refers to.
  *
  * Returns 0 on success or an errno value: EPFNOSUPPORT when ifp has no
  * AF_INET6 afdata, EINVAL for an out-of-range link MTU or an unknown
  * neighbor, or whatever in6_setscope()/nd6_setdefaultiface() report.
  */
 int
 nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
 {
 	/* All five pointers alias the same buffer; cmd decides which is valid. */
 	struct in6_drlist *drl = (struct in6_drlist *)data;
 	struct in6_oprlist *oprl = (struct in6_oprlist *)data;
 	struct in6_ndireq *ndi = (struct in6_ndireq *)data;
 	struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data;
 	struct in6_ndifreq *ndif = (struct in6_ndifreq *)data;
 	struct nd_defrouter *dr;
 	struct nd_prefix *pr;
 	int i = 0, error = 0;
 
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return (EPFNOSUPPORT);
 	switch (cmd) {
 	case SIOCGDRLST_IN6:
 		/*
 		 * obsolete API, use sysctl under net.inet6.icmp6
 		 */
 		bzero(drl, sizeof(*drl));
 		TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) {
 			/* The reply holds at most DRLSTSIZ routers. */
 			if (i >= DRLSTSIZ)
 				break;
 			drl->defrouter[i].rtaddr = dr->rtaddr;
 			in6_clearscope(&drl->defrouter[i].rtaddr);
 
 			drl->defrouter[i].flags = dr->flags;
 			drl->defrouter[i].rtlifetime = dr->rtlifetime;
 			/* Shift the uptime-based expiry by the time_second offset. */
 			drl->defrouter[i].expire = dr->expire +
 			    (time_second - time_uptime);
 			drl->defrouter[i].if_index = dr->ifp->if_index;
 			i++;
 		}
 		break;
 	case SIOCGPRLST_IN6:
 		/*
 		 * obsolete API, use sysctl under net.inet6.icmp6
 		 *
 		 * XXX the structure in6_prlist was changed in backward-
 		 * incompatible manner.  in6_oprlist is used for SIOCGPRLST_IN6,
 		 * in6_prlist is used for nd6_sysctl() - fill_prlist().
 		 */
 		/*
 		 * XXX meaning of fields, especially "raflags", is very
 		 * different between RA prefix list and RR/static prefix list.
 		 * how about separating ioctls into two?
 		 */
 		bzero(oprl, sizeof(*oprl));
 		LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 			struct nd_pfxrouter *pfr;
 			int j;
 
 			/* The reply holds at most PRLSTSIZ prefixes. */
 			if (i >= PRLSTSIZ)
 				break;
 			oprl->prefix[i].prefix = pr->ndpr_prefix.sin6_addr;
 			oprl->prefix[i].raflags = pr->ndpr_raf;
 			oprl->prefix[i].prefixlen = pr->ndpr_plen;
 			oprl->prefix[i].vltime = pr->ndpr_vltime;
 			oprl->prefix[i].pltime = pr->ndpr_pltime;
 			oprl->prefix[i].if_index = pr->ndpr_ifp->if_index;
 			if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME)
 				oprl->prefix[i].expire = 0;
 			else {
 				time_t maxexpire;
 
 				/* XXX: we assume time_t is signed. */
 				/* All bits set except the top one. */
 				maxexpire = (-1) &
 				    ~((time_t)1 <<
 				    ((sizeof(maxexpire) * 8) - 1));
 				/* Clamp to maxexpire instead of overflowing. */
 				if (pr->ndpr_vltime <
 				    maxexpire - pr->ndpr_lastupdate) {
 					oprl->prefix[i].expire =
 					    pr->ndpr_lastupdate +
 					    pr->ndpr_vltime +
 					    (time_second - time_uptime);
 				} else
 					oprl->prefix[i].expire = maxexpire;
 			}
 
 			/*
 			 * Copy at most DRLSTSIZ advertising routers, but keep
 			 * counting so that advrtrs reports the real total.
 			 */
 			j = 0;
 			LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
 				if (j < DRLSTSIZ) {
 #define RTRADDR oprl->prefix[i].advrtr[j]
 					RTRADDR = pfr->router->rtaddr;
 					in6_clearscope(&RTRADDR);
 #undef RTRADDR
 				}
 				j++;
 			}
 			oprl->prefix[i].advrtrs = j;
 			oprl->prefix[i].origin = PR_ORIG_RA;
 
 			i++;
 		}
 
 		break;
 	case OSIOCGIFINFO_IN6:
 #define ND	ndi->ndi
 		/* XXX: old ndp(8) assumes a positive value for linkmtu. */
 		bzero(&ND, sizeof(ND));
 		ND.linkmtu = IN6_LINKMTU(ifp);
 		ND.maxmtu = ND_IFINFO(ifp)->maxmtu;
 		ND.basereachable = ND_IFINFO(ifp)->basereachable;
 		ND.reachable = ND_IFINFO(ifp)->reachable;
 		ND.retrans = ND_IFINFO(ifp)->retrans;
 		ND.flags = ND_IFINFO(ifp)->flags;
 		ND.recalctm = ND_IFINFO(ifp)->recalctm;
 		ND.chlim = ND_IFINFO(ifp)->chlim;
 		break;
 	case SIOCGIFINFO_IN6:
 		/* Hand back a verbatim copy of the per-interface ND state. */
 		ND = *ND_IFINFO(ifp);
 		break;
 	case SIOCSIFINFO_IN6:
 		/*
 		 * used to change host variables from userland.
 		 * intended for a use on router to reflect RA configurations.
 		 */
 		/* 0 means 'unspecified' */
 		if (ND.linkmtu != 0) {
 			if (ND.linkmtu < IPV6_MMTU ||
 			    ND.linkmtu > IN6_LINKMTU(ifp)) {
 				error = EINVAL;
 				break;
 			}
 			ND_IFINFO(ifp)->linkmtu = ND.linkmtu;
 		}
 
 		if (ND.basereachable != 0) {
 			int obasereachable = ND_IFINFO(ifp)->basereachable;
 
 			ND_IFINFO(ifp)->basereachable = ND.basereachable;
 			/* Recompute the reachable time only on a real change. */
 			if (ND.basereachable != obasereachable)
 				ND_IFINFO(ifp)->reachable =
 				    ND_COMPUTE_RTIME(ND.basereachable);
 		}
 		if (ND.retrans != 0)
 			ND_IFINFO(ifp)->retrans = ND.retrans;
 		if (ND.chlim != 0)
 			ND_IFINFO(ifp)->chlim = ND.chlim;
 		/* FALLTHROUGH */
 	case SIOCSIFINFO_FLAGS:
 	{
 		struct ifaddr *ifa;
 		struct in6_ifaddr *ia;
 
 		if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
 		    !(ND.flags & ND6_IFF_IFDISABLED)) {
 			/* ifdisabled 1->0 transition */
 
 			/*
 			 * If the interface is marked as ND6_IFF_IFDISABLED and
 			 * has a link-local address with IN6_IFF_DUPLICATED,
 			 * do not clear ND6_IFF_IFDISABLED.
 			 * See RFC 4862, Section 5.4.5.
 			 */
 			int duplicated_linklocal = 0;
 
 			IF_ADDR_RLOCK(ifp);
 			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				if (ifa->ifa_addr->sa_family != AF_INET6)
 					continue;
 				ia = (struct in6_ifaddr *)ifa;
 				if ((ia->ia6_flags & IN6_IFF_DUPLICATED) &&
 				    IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) {
 					duplicated_linklocal = 1;
 					break;
 				}
 			}
 			IF_ADDR_RUNLOCK(ifp);
 
 			if (duplicated_linklocal) {
 				/* Keep the flag set in the value stored below. */
 				ND.flags |= ND6_IFF_IFDISABLED;
 				log(LOG_ERR, "Cannot enable an interface"
 				    " with a link-local address marked"
 				    " duplicate.\n");
 			} else {
 				ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED;
 				if (ifp->if_flags & IFF_UP)
 					in6_if_up(ifp);
 			}
 		} else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
 			    (ND.flags & ND6_IFF_IFDISABLED)) {
 			/* ifdisabled 0->1 transition */
 			/* Mark all IPv6 address as tentative. */
 
 			ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
 			IF_ADDR_RLOCK(ifp);
 			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				if (ifa->ifa_addr->sa_family != AF_INET6)
 					continue;
 				ia = (struct in6_ifaddr *)ifa;
 				ia->ia6_flags |= IN6_IFF_TENTATIVE;
 			}
 			IF_ADDR_RUNLOCK(ifp);
 		}
 
 		if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) {
 			if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) {
 				/* auto_linklocal 0->1 transition */
 
 				/* If no link-local address on ifp, configure */
 				ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL;
 				in6_ifattach(ifp, NULL);
 			} else if (!(ND.flags & ND6_IFF_IFDISABLED) &&
 			    ifp->if_flags & IFF_UP) {
 				/*
 				 * When the IF already has
 				 * ND6_IFF_AUTO_LINKLOCAL, no link-local
 				 * address is assigned, and IFF_UP, try to
 				 * assign one.
 				 */
 				int haslinklocal = 0;
 			
 				IF_ADDR_RLOCK(ifp);
 				TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 					if (ifa->ifa_addr->sa_family != AF_INET6)
 						continue;
 					ia = (struct in6_ifaddr *)ifa;
 					if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) {
 						haslinklocal = 1;
 						break;
 					}
 				}
 				IF_ADDR_RUNLOCK(ifp);
 				if (!haslinklocal)
 					in6_ifattach(ifp, NULL);
 			}
 		}
 	}
 		/* Finally store the (possibly adjusted) requested flags. */
 		ND_IFINFO(ifp)->flags = ND.flags;
 		break;
 #undef ND
 	case SIOCSNDFLUSH_IN6:	/* XXX: the ioctl name is confusing... */
 		/* sync kernel routing table with the default router list */
 		defrouter_reset();
 		defrouter_select();
 		break;
 	case SIOCSPFXFLUSH_IN6:
 	{
 		/* flush all the prefix advertised by routers */
 		struct nd_prefix *pr, *next;
 
 		LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) {
 			struct in6_ifaddr *ia, *ia_next;
 
 			if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
 				continue; /* XXX */
 
 			/* do we really have to remove addresses as well? */
 			/* XXXRW: in6_ifaddrhead locking. */
 			TAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link,
 			    ia_next) {
 				if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 					continue;
 
 				if (ia->ia6_ndpr == pr)
 					in6_purgeaddr(&ia->ia_ifa);
 			}
 			prelist_remove(pr);
 		}
 		break;
 	}
 	case SIOCSRTRFLUSH_IN6:
 	{
 		/* flush all the default routers */
 		struct nd_defrouter *dr, *next;
 
 		defrouter_reset();
 		TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, next) {
 			defrtrlist_del(dr);
 		}
 		defrouter_select();
 		break;
 	}
 	case SIOCGNBRINFO_IN6:
 	{
 		struct llentry *ln;
 		struct in6_addr nb_addr = nbi->addr; /* make local for safety */
 
 		if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0)
 			return (error);
 
 		/* Flags 0: no ND6_EXCLUSIVE, hence LLE_RUNLOCK() below. */
 		IF_AFDATA_RLOCK(ifp);
 		ln = nd6_lookup(&nb_addr, 0, ifp);
 		IF_AFDATA_RUNLOCK(ifp);
 
 		if (ln == NULL) {
 			error = EINVAL;
 			break;
 		}
 		nbi->state = ln->ln_state;
 		nbi->asked = ln->la_asked;
 		nbi->isrouter = ln->ln_router;
 		if (ln->la_expire == 0)
 			nbi->expire = 0;
 		else
 			nbi->expire = ln->la_expire +
 			    (time_second - time_uptime);
 		LLE_RUNLOCK(ln);
 		break;
 	}
 	case SIOCGDEFIFACE_IN6:	/* XXX: should be implemented as a sysctl? */
 		ndif->ifindex = V_nd6_defifindex;
 		break;
 	case SIOCSDEFIFACE_IN6:	/* XXX: should be implemented as a sysctl? */
 		return (nd6_setdefaultiface(ndif->ifindex));
 	}
 	return (error);
 }
 
 /*
  * Create or update the neighbor cache entry for 'from' on 'ifp' and cache
  * the link-layer address carried by an inbound ND6 packet
  * (RS/RA/NS/redirect).
  *
  * ifp       - receiving interface (asserted non-NULL)
  * from      - IPv6 source address of the packet (asserted non-NULL)
  * lladdr    - link-layer address taken from the packet, or NULL if the
  *             packet carried none
  * lladdrlen - not referenced in this function; validation against
  *             ifp->if_addrlen is left to the caller (see below)
  * type - ICMP6 type
  * code - type dependent information
  *
  * Returns the neighbor cache entry with its lock already released, or NULL
  * when 'from' is the unspecified address, when no entry could be found or
  * created, or when the entry is a static one.
  *
  * XXXXX
  *  The IF_AFDATA lock must NOT be held on entry (it is asserted below and
  *  taken here around the lookups).  Because the entry lock is dropped
  *  before returning, the returned pointer is not protected by any lock
  *  acquired in this function.
  */
 struct llentry *
 nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr,
     int lladdrlen, int type, int code)
 {
 	struct llentry *ln = NULL;
 	int is_newentry;
 	int do_update;
 	int olladdr;
 	int llchange;
 	int flags;
 	int newstate = 0;
 	uint16_t router = 0;
 	struct sockaddr_in6 sin6;
 	struct mbuf *chain = NULL;
 	int static_route = 0;
 
 	IF_AFDATA_UNLOCK_ASSERT(ifp);
 
 	KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__));
 	KASSERT(from != NULL, ("%s: from == NULL", __func__));
 
 	/* nothing must be updated for unspecified address */
 	if (IN6_IS_ADDR_UNSPECIFIED(from))
 		return NULL;
 
 	/*
 	 * Validation about ifp->if_addrlen and lladdrlen must be done in
 	 * the caller.
 	 *
 	 * XXX If the link does not have link-layer address, what should
 	 * we do? (ifp->if_addrlen == 0)
 	 * Spec says nothing in sections for RA, RS and NA.  There's small
 	 * description on it in NS section (RFC 2461 7.2.3).
 	 */
 	/*
 	 * Only ask for an exclusive entry lock up front when there is an
 	 * address to record; 'flags' is remembered so the matching unlock
 	 * flavor can be chosen at the end.
 	 */
 	flags = lladdr ? ND6_EXCLUSIVE : 0;
 	IF_AFDATA_LOCK(ifp);
 	ln = nd6_lookup(from, flags, ifp);
 
 	if (ln == NULL) {
 		/* No entry yet: create one, always exclusively locked. */
 		flags |= ND6_EXCLUSIVE;
 		ln = nd6_lookup(from, flags | ND6_CREATE, ifp);
 		IF_AFDATA_UNLOCK(ifp);
 		is_newentry = 1;
 	} else {
 		IF_AFDATA_UNLOCK(ifp);		
 		/* do nothing if static ndp is set */
 		if (ln->la_flags & LLE_STATIC) {
 			static_route = 1;
 			goto done;
 		}
 		is_newentry = 0;
 	}
 	/* Creation failed. */
 	if (ln == NULL)
 		return (NULL);
 
 	/* Did the entry already hold a valid link-layer address? */
 	olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0;
 	if (olladdr && lladdr) {
 		llchange = bcmp(lladdr, &ln->ll_addr,
 		    ifp->if_addrlen);
 	} else
 		llchange = 0;
 
 	/*
 	 * newentry olladdr  lladdr  llchange	(*=record)
 	 *	0	n	n	--	(1)
 	 *	0	y	n	--	(2)
 	 *	0	n	y	--	(3) * STALE
 	 *	0	y	y	n	(4) *
 	 *	0	y	y	y	(5) * STALE
 	 *	1	--	n	--	(6)   NOSTATE(= PASSIVE)
 	 *	1	--	y	--	(7) * STALE
 	 */
 
 	if (lladdr) {		/* (3-5) and (7) */
 		/*
 		 * Record source link-layer address
 		 * XXX is it dependent to ifp->if_type?
 		 */
 		bcopy(lladdr, &ln->ll_addr, ifp->if_addrlen);
 		ln->la_flags |= LLE_VALID;
 		EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
 	}
 
 	/* Decide whether the entry's state changes, per the table above. */
 	if (!is_newentry) {
 		if ((!olladdr && lladdr != NULL) ||	/* (3) */
 		    (olladdr && lladdr != NULL && llchange)) {	/* (5) */
 			do_update = 1;
 			newstate = ND6_LLINFO_STALE;
 		} else					/* (1-2,4) */
 			do_update = 0;
 	} else {
 		do_update = 1;
 		if (lladdr == NULL)			/* (6) */
 			newstate = ND6_LLINFO_NOSTATE;
 		else					/* (7) */
 			newstate = ND6_LLINFO_STALE;
 	}
 
 	if (do_update) {
 		/*
 		 * Update the state of the neighbor cache.
 		 */
 		ln->ln_state = newstate;
 
 		if (ln->ln_state == ND6_LLINFO_STALE) {
 			/*
 			 * XXX: since nd6_output() below will cause
 			 * state transition to DELAY and reset the timer,
 			 * we must set the timer now, although it is actually
 			 * meaningless.
 			 */
 			nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz);
 
 			if (ln->la_hold) {
 				struct mbuf *m_hold, *m_hold_next;
 
 				/*
 				 * reset the la_hold in advance, to explicitly
 				 * prevent a la_hold lookup in nd6_output()
 				 * (wouldn't happen, though...)
 				 */
 				for (m_hold = ln->la_hold, ln->la_hold = NULL;
 				    m_hold; m_hold = m_hold_next) {
 					m_hold_next = m_hold->m_nextpkt;
 					m_hold->m_nextpkt = NULL;
 
 					/*
 					 * we assume ifp is not a p2p here, so
 					 * just set the 2nd argument as the
 					 * 1st one.
 					 */
 					nd6_output_lle(ifp, ifp, m_hold, L3_ADDR_SIN6(ln), NULL, ln, &chain);
 				}
 				/*
 				 * If we have mbufs in the chain we need to do
 				 * deferred transmit. Copy the address from the
 				 * llentry before dropping the lock down below.
 				 */
 				if (chain != NULL)
 					memcpy(&sin6, L3_ADDR_SIN6(ln), sizeof(sin6));
 			}
 		} else if (ln->ln_state == ND6_LLINFO_INCOMPLETE) {
 			/* probe right away */
 			nd6_llinfo_settimer_locked((void *)ln, 0);
 		}
 	}
 
 	/*
 	 * ICMP6 type dependent behavior.
 	 *
 	 * NS: clear IsRouter if new entry
 	 * RS: clear IsRouter
 	 * RA: set IsRouter if there's lladdr
 	 * redir: clear IsRouter if new entry
 	 *
 	 * RA case, (1):
 	 * The spec says that we must set IsRouter in the following cases:
 	 * - If lladdr exist, set IsRouter.  This means (1-5).
 	 * - If it is old entry (!newentry), set IsRouter.  This means (7).
 	 * So, based on the spec, in (1-5) and (7) cases we must set IsRouter.
 	 * A question arises for (1) case.  (1) case has no lladdr in the
 	 * neighbor cache, this is similar to (6).
 	 * This case is rare but we figured that we MUST NOT set IsRouter.
 	 *
 	 * newentry olladdr  lladdr  llchange	    NS  RS  RA	redir
 	 *							D R
 	 *	0	n	n	--	(1)	c   ?     s
 	 *	0	y	n	--	(2)	c   s     s
 	 *	0	n	y	--	(3)	c   s     s
 	 *	0	y	y	n	(4)	c   s     s
 	 *	0	y	y	y	(5)	c   s     s
 	 *	1	--	n	--	(6) c	c	c s
 	 *	1	--	y	--	(7) c	c   s	c s
 	 *
 	 *					(c=clear s=set)
 	 */
 	switch (type & 0xff) {
 	case ND_NEIGHBOR_SOLICIT:
 		/*
 		 * New entry must have is_router flag cleared.
 		 */
 		if (is_newentry)	/* (6-7) */
 			ln->ln_router = 0;
 		break;
 	case ND_REDIRECT:
 		/*
 		 * If the icmp is a redirect to a better router, always set the
 		 * is_router flag.  Otherwise, if the entry is newly created,
 		 * clear the flag.  [RFC 2461, sec 8.3]
 		 */
 		if (code == ND_REDIRECT_ROUTER)
 			ln->ln_router = 1;
 		else if (is_newentry) /* (6-7) */
 			ln->ln_router = 0;
 		break;
 	case ND_ROUTER_SOLICIT:
 		/*
 		 * is_router flag must always be cleared.
 		 */
 		ln->ln_router = 0;
 		break;
 	case ND_ROUTER_ADVERT:
 		/*
 		 * Mark an entry with lladdr as a router.
 		 */
 		if ((!is_newentry && (olladdr || lladdr)) ||	/* (2-5) */
 		    (is_newentry && lladdr)) {			/* (7) */
 			ln->ln_router = 1;
 		}
 		break;
 	}
 
 	/*
 	 * Snapshot what is needed after the unlock, then release the entry
 	 * with the unlock flavor matching how it was acquired.
 	 */
 	if (ln != NULL) {
 		static_route = (ln->la_flags & LLE_STATIC);
 		router = ln->ln_router;
 
 		if (flags & ND6_EXCLUSIVE)
 			LLE_WUNLOCK(ln);
 		else
 			LLE_RUNLOCK(ln);
 		if (static_route)
 			ln = NULL;
 	}
 	/* Transmit the packets deferred above, now that the lock is gone. */
 	if (chain)
 		nd6_output_flush(ifp, ifp, chain, &sin6, NULL);
 	
 	/*
 	 * When the link-layer address of a router changes, select the
 	 * best router again.  In particular, when the neighbor entry is newly
 	 * created, it might affect the selection policy.
 	 * Question: can we restrict the first condition to the "is_newentry"
 	 * case?
 	 * XXX: when we hear an RA from a new router with the link-layer
 	 * address option, defrouter_select() is called twice, since
 	 * defrtrlist_update called the function as well.  However, I believe
 	 * we can compromise the overhead, since it only happens the first
 	 * time.
 	 * XXX: although defrouter_select() should not have a bad effect
 	 * for those are not autoconfigured hosts, we explicitly avoid such
 	 * cases for safety.
 	 */
 	if (do_update && router &&
 	    ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
 		/*
 		 * guaranteed recursion
 		 */
 		defrouter_select();
 	}
 	
 	return (ln);
 done:	
 	/* Early exit for static entries: just unlock and return NULL. */
 	if (ln != NULL) {
 		if (flags & ND6_EXCLUSIVE)
 			LLE_WUNLOCK(ln);
 		else
 			LLE_RUNLOCK(ln);
 		if (static_route)
 			ln = NULL;
 	}
 	return (ln);
 }
 
 /*
  * Periodic callout handler: re-arms itself and, for every interface that
  * has IPv6 per-interface data, counts down recalctm and recomputes the
  * randomized reachable time once the countdown expires.
  *
  * arg - the vnet this callout instance belongs to
  */
 static void
 nd6_slowtimo(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 	struct ifnet *ifp;
 	struct nd_ifinfo *ndi;
 
 	/* Schedule the next tick before doing any work. */
 	callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
 	    nd6_slowtimo, curvnet);
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_list) {
 		/* Interface without IPv6 state: nothing to age. */
 		if (ifp->if_afdata[AF_INET6] == NULL)
 			continue;
 
 		ndi = ND_IFINFO(ifp);
 
 		/* basereachable == 0 means "not initialized yet". */
 		if (ndi->basereachable == 0)
 			continue;
 
 		ndi->recalctm -= ND6_SLOWTIMER_INTERVAL;
 		if (ndi->recalctm > 0)
 			continue;
 
 		/*
 		 * Since reachable time rarely changes by router
 		 * advertisements, we SHOULD ensure that a new random
 		 * value gets recomputed at least once every few hours.
 		 * (RFC 2461, 6.3.4)
 		 */
 		ndi->recalctm = V_nd6_recalc_reachtm_interval;
 		ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable);
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Convenience wrapper around nd6_output_lle() for callers that neither
  * hold a neighbor cache entry lock nor want deferred transmission: both the
  * 'lle' and 'chain' arguments are passed as NULL.
  *
  * Returns whatever nd6_output_lle() returns.
  */
 int
 nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0,
     struct sockaddr_in6 *dst, struct rtentry *rt0)
 {
 
 	return (nd6_output_lle(ifp, origifp, m0, dst, rt0, NULL, NULL));
 }
 
 
 /*
  * Resolve the next hop for an outbound IPv6 packet and then either
  * transmit it, queue it on the neighbor cache entry until address
  * resolution completes, or (when the caller passed in a locked entry)
  * append it to a deferred transmit chain.
  *
  * ifp     - outgoing interface
  * origifp - interface handed to if_output() when ifp is a loopback
  * m0      - packet to send
  * dst     - next-hop address
  * rt0     - not referenced by this function
  * lle     - optional neighbor cache entry, write-locked by the caller;
  *           when non-NULL, 'chain' must be non-NULL too and the entry is
  *           still write-locked on return
  * chain   - list the packet is appended to (instead of calling
  *           if_output()) when 'lle' is non-NULL; the caller transmits it
  *           after dropping the entry lock
  *
  * Returns 0 when the packet was sent, queued or deferred, otherwise an
  * error number.  On the "bad" path the packet is freed here.
  *
  * Note that I'm not enforcing any global serialization
  * lle state or asked changes here as the logic is too
  * complicated to avoid having to always acquire an exclusive
  * lock
  * KMM
  *
  */
 #define senderr(e) { error = (e); goto bad;}
 
 int
 nd6_output_lle(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0,
     struct sockaddr_in6 *dst, struct rtentry *rt0, struct llentry *lle,
 	struct mbuf **chain)
 {
 	struct mbuf *m = m0;
 	struct m_tag *mtag;
 	struct llentry *ln = lle;
 	struct ip6_hdr *ip6;
 	int error = 0;
 	int flags = 0;
 	int ip6len;
 
 #ifdef INVARIANTS
 	if (lle != NULL) {
 		
 		LLE_WLOCK_ASSERT(lle);
 
 		KASSERT(chain != NULL, (" lle locked but no mbuf chain pointer passed"));
 	}
 #endif
 	/* Multicast destinations need no neighbor cache entry. */
 	if (IN6_IS_ADDR_MULTICAST(&dst->sin6_addr))
 		goto sendpkt;
 
 	/* Neither do interface types that keep no neighbor cache. */
 	if (nd6_need_cache(ifp) == 0)
 		goto sendpkt;
 
 	/*
 	 * next hop determination.  This routine is derived from ether_output.
 	 */
 
 	/*
 	 * Address resolution or Neighbor Unreachability Detection
 	 * for the next hop.
 	 * At this point, the destination of the packet must be a unicast
 	 * or an anycast address(i.e. not a multicast).
 	 */
 
 	/*
 	 * 'flags' records how the entry is locked so that every exit path
 	 * can pick the matching unlock.  Whenever a path below discovers it
 	 * needs the write lock while only holding the read lock, it must
 	 * drop the read lock, set LLE_EXCLUSIVE and redo the lookup.
 	 */
 	flags = ((m != NULL) || (lle != NULL)) ? LLE_EXCLUSIVE : 0;
 	if (ln == NULL) {
 	retry:
 		IF_AFDATA_LOCK(ifp);
 		ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)dst);
 		IF_AFDATA_UNLOCK(ifp);
 		if ((ln == NULL) && nd6_is_addr_neighbor(dst, ifp))  {
 			/*
 			 * Since nd6_is_addr_neighbor() internally calls nd6_lookup(),
 			 * the condition below is not very efficient.  But we believe
 			 * it is tolerable, because this should be a rare case.
 			 */
 			flags = ND6_CREATE | (m ? ND6_EXCLUSIVE : 0);
 			IF_AFDATA_LOCK(ifp);
 			ln = nd6_lookup(&dst->sin6_addr, flags, ifp);
 			IF_AFDATA_UNLOCK(ifp);
 		}
 	} 
 	if (ln == NULL) {
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0 &&
 		    !(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) {
 			char ip6buf[INET6_ADDRSTRLEN];
 			log(LOG_DEBUG,
 			    "nd6_output: can't allocate llinfo for %s "
 			    "(ln=%p)\n",
 			    ip6_sprintf(ip6buf, &dst->sin6_addr), ln);
 			senderr(EIO);	/* XXX: good error? */
 		}
 		goto sendpkt;	/* send anyway */
 	}
 
 	/* We don't have to do link-layer address resolution on a p2p link. */
 	if ((ifp->if_flags & IFF_POINTOPOINT) != 0 &&
 	    ln->ln_state < ND6_LLINFO_REACHABLE) {
 		if ((flags & LLE_EXCLUSIVE) == 0) {
 			flags |= LLE_EXCLUSIVE;
 			/*
 			 * Drop the read lock before looking the entry up
 			 * again exclusively, exactly as the other retry
 			 * sites below do; otherwise the read lock would
 			 * still be held while the write lock is requested.
 			 */
 			LLE_RUNLOCK(ln);
 			goto retry;
 		}
 		ln->ln_state = ND6_LLINFO_STALE;
 		nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz);
 	}
 
 	/*
 	 * The first time we send a packet to a neighbor whose entry is
 	 * STALE, we have to change the state to DELAY and set a timer to
 	 * expire in DELAY_FIRST_PROBE_TIME seconds to ensure we do
 	 * neighbor unreachability detection on expiration.
 	 * (RFC 2461 7.3.3)
 	 */
 	if (ln->ln_state == ND6_LLINFO_STALE) {
 		if ((flags & LLE_EXCLUSIVE) == 0) {
 			flags |= LLE_EXCLUSIVE;
 			LLE_RUNLOCK(ln);
 			goto retry;
 		}
 		ln->la_asked = 0;
 		ln->ln_state = ND6_LLINFO_DELAY;
 		nd6_llinfo_settimer_locked(ln, (long)V_nd6_delay * hz);
 	}
 
 	/*
 	 * If the neighbor cache entry has a state other than INCOMPLETE
 	 * (i.e. its link-layer address is already resolved), just
 	 * send the packet.
 	 */
 	if (ln->ln_state > ND6_LLINFO_INCOMPLETE)
 		goto sendpkt;
 
 	/*
 	 * Everything below modifies the entry, so make sure the write lock
 	 * is held before touching it (including the state change just
 	 * below, which must not be done under a read lock).
 	 */
 	if ((flags & LLE_EXCLUSIVE) == 0) {
 		flags |= LLE_EXCLUSIVE;
 		LLE_RUNLOCK(ln);
 		goto retry;
 	}
 
 	LLE_WLOCK_ASSERT(ln);
 
 	/*
 	 * There is a neighbor cache entry, but no ethernet address
 	 * response yet.  Append this latest packet to the end of the
 	 * packet queue in the mbuf, unless the number of the packet
 	 * does not exceed nd6_maxqueuelen.  When it exceeds nd6_maxqueuelen,
 	 * the oldest packet in the queue will be removed.
 	 */
 	if (ln->ln_state == ND6_LLINFO_NOSTATE)
 		ln->ln_state = ND6_LLINFO_INCOMPLETE;
 
 	if (ln->la_hold) {
 		struct mbuf *m_hold;
 		int i;
 		
 		/* Walk to the tail, counting the packets already queued. */
 		i = 0;
 		for (m_hold = ln->la_hold; m_hold; m_hold = m_hold->m_nextpkt) {
 			i++;
 			if (m_hold->m_nextpkt == NULL) {
 				m_hold->m_nextpkt = m;
 				break;
 			}
 		}
 		/* Drop from the head until the queue fits the limit. */
 		while (i >= V_nd6_maxqueuelen) {
 			m_hold = ln->la_hold;
 			ln->la_hold = ln->la_hold->m_nextpkt;
 			m_freem(m_hold);
 			i--;
 		}
 	} else {
 		ln->la_hold = m;
 	}
 
 	/*
 	 * If there has been no NS for the neighbor after entering the
 	 * INCOMPLETE state, send the first solicitation.
 	 */
 	if (!ND6_LLINFO_PERMANENT(ln) && ln->la_asked == 0) {
 		ln->la_asked++;
 		
 		nd6_llinfo_settimer_locked(ln,
 		    (long)ND_IFINFO(ifp)->retrans * hz / 1000);
 		/* The solicitation is sent with the entry unlocked. */
 		LLE_WUNLOCK(ln);
 		nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0);
 		/* Hand a caller-supplied entry back locked, as received. */
 		if (lle != NULL && ln == lle)
 			LLE_WLOCK(lle);
 
 	} else if (lle == NULL || ln != lle) {
 		/*
 		 * We did the lookup (no lle arg) so we
 		 * need to do the unlock here.
 		 */
 		LLE_WUNLOCK(ln);
 	}
 
 	return (0);
 
   sendpkt:
 	/* discard the packet if IPv6 operation is disabled on the interface */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) {
 		error = ENETDOWN; /* better error? */
 		goto bad;
 	}
 	/*
 	 * ln is valid and the caller did not pass in 
 	 * an llentry
 	 */
 	if ((ln != NULL) && (lle == NULL)) {
 		if (flags & LLE_EXCLUSIVE)
 			LLE_WUNLOCK(ln);
 		else
 			LLE_RUNLOCK(ln);
 	}
 
 #ifdef MAC
 	mac_netinet6_nd6_send(ifp, m);
 #endif
 
 	/*
 	 * If called from nd6_ns_output() (NS), nd6_na_output() (NA),
 	 * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA
 	 * as handled by rtsol and rtadvd), mbufs will be tagged for SeND
 	 * to be diverted to user space.  When re-injected into the kernel,
 	 * send_output() will directly dispatch them to the outgoing interface.
 	 */
 	if (send_sendso_input_hook != NULL) {
 		mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL);
 		if (mtag != NULL) {
 			ip6 = mtod(m, struct ip6_hdr *);
 			ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
 			/* Use the SEND socket */
 			error = send_sendso_input_hook(m, ifp, SND_OUT,
 			    ip6len);
 			/*
 			 * -1 == no app on SEND socket; any other value
 			 * (including 0) is the final result.
 			 */
 			if (error != -1)
 			    return (error);
 		}
 	}
 
 	/*
 	 * We were passed in a pointer to an lle with the lock held 
 	 * this means that we can't call if_output as we will
 	 * recurse on the lle lock - so what we do is we create
 	 * a list of mbufs to send and transmit them in the caller
 	 * after the lock is dropped
 	 */
 	if (lle != NULL) {
 		if (*chain == NULL)
 			*chain = m;
 		else {
 			struct mbuf *mb;
 
 			/*
 			 * append mbuf to end of deferred chain
 			 */
 			mb = *chain;
 			while (mb->m_nextpkt != NULL)
 				mb = mb->m_nextpkt;
 			mb->m_nextpkt = m;
 		}
 		return (error);
 	}
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL,
 	    mtod(m, struct ip6_hdr *));
 	if ((ifp->if_flags & IFF_LOOPBACK) != 0) {
 		return ((*ifp->if_output)(origifp, m, (struct sockaddr *)dst,
 		    NULL));
 	}
 	error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, NULL);
 	return (error);
 
   bad:
 	/*
 	 * ln is valid and the caller did not pass in 
 	 * an llentry
 	 */
 	if ((ln != NULL) && (lle == NULL)) {
 		if (flags & LLE_EXCLUSIVE)
 			LLE_WUNLOCK(ln);
 		else
 			LLE_RUNLOCK(ln);
 	}
 	if (m)
 		m_freem(m);
 	return (error);
 }
 #undef senderr
 
 
 int
 nd6_output_flush(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *chain,
     struct sockaddr_in6 *dst, struct route *ro)
 {
 	struct mbuf *m, *m_head;
 	struct ifnet *outifp;
 	int error = 0;
 
 	m_head = chain;
 	if ((ifp->if_flags & IFF_LOOPBACK) != 0)
 		outifp = origifp;
 	else
 		outifp = ifp;
 	
 	while (m_head) {
 		m = m_head;
 		m_head = m_head->m_nextpkt;
 		error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro);			       
 	}
 
 	/*
 	 * XXX
 	 * note that intermediate errors are blindly ignored - but this is 
 	 * the same convention as used with nd6_output when called by
 	 * nd6_cache_lladdr
 	 */
 	return (error);
 }	
 
 
 /*
  * Tell whether a neighbor cache is kept for the given interface, based
  * solely on ifp->if_type.
  *
  * Returns 1 for the interface types listed below, 0 for everything else.
  */
 int
 nd6_need_cache(struct ifnet *ifp)
 {
 	int cacheable;
 
 	/*
 	 * XXX: we currently do not make neighbor cache on any interface
 	 * other than the types enumerated here.
 	 *
 	 * RFC2893 says:
 	 * - unidirectional tunnels needs no ND
 	 */
 	switch (ifp->if_type) {
 	case IFT_ARCNET:
 	case IFT_ETHER:
 	case IFT_FDDI:
 	case IFT_IEEE1394:
 #ifdef IFT_L2VLAN
 	case IFT_L2VLAN:
 #endif
 #ifdef IFT_IEEE80211
 	case IFT_IEEE80211:
 #endif
 	case IFT_INFINIBAND:
 	case IFT_GIF:		/* XXX need more cases? */
 	case IFT_PPP:
 	case IFT_TUNNEL:
 	case IFT_BRIDGE:
 	case IFT_PROPVIRTUAL:
 		cacheable = 1;
 		break;
 	default:
 		cacheable = 0;
 		break;
 	}
 
 	return (cacheable);
 }
 
 /*
  * Fill 'desten' with the link-layer destination for a packet going to
  * 'dst' on 'ifp'.
  *
  * Multicast packets (M_MCAST set on 'm') get an address derived from the
  * interface type; everything else is looked up in the interface's IPv6
  * link-layer table.  On a successful unicast lookup '*lle' is set to the
  * entry that was used; in every other case it is left NULL.
  *
  * Returns 0 on success.  On failure the mbuf is freed and either
  * EAFNOSUPPORT (multicast on an unsupported interface type) or 1 (no
  * valid entry) is returned.
  *
  * the callers of this function need to be re-worked to drop
  * the lle lock, drop here for now
  */
 int
 nd6_storelladdr(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *dst, u_char *desten, struct llentry **lle)
 {
 	struct llentry *entry;
 	int idx;
 
 	*lle = NULL;
 	IF_AFDATA_UNLOCK_ASSERT(ifp);
 
 	if (m != NULL && (m->m_flags & M_MCAST) != 0) {
 		switch (ifp->if_type) {
 		case IFT_ETHER:
 		case IFT_FDDI:
 #ifdef IFT_L2VLAN
 		case IFT_L2VLAN:
 #endif
 #ifdef IFT_IEEE80211
 		case IFT_IEEE80211:
 #endif
 		case IFT_BRIDGE:
 		case IFT_ISO88025:
 			ETHER_MAP_IPV6_MULTICAST(&SIN6(dst)->sin6_addr,
 			    desten);
 			return (0);
 
 		case IFT_IEEE1394:
 			/*
 			 * netbsd can use if_broadcastaddr, but we don't do so
 			 * to reduce # of ifdef.
 			 */
 			idx = 0;
 			while (idx < ifp->if_addrlen) {
 				desten[idx] = ~0;
 				idx++;
 			}
 			return (0);
 
 		case IFT_ARCNET:
 			*desten = 0;
 			return (0);
 
 		default:
 			m_freem(m);
 			return (EAFNOSUPPORT);
 		}
 	}
 
 	/*
 	 * Unicast: the entry is expected to exist already
 	 * (NOTE(review): the original comment names "nd6_store_lladdr" as
 	 * its creator, which looks like a typo -- confirm).
 	 */
 	IF_AFDATA_RLOCK(ifp);
 	entry = lla_lookup(LLTABLE6(ifp), 0, dst);
 	IF_AFDATA_RUNLOCK(ifp);
 
 	/* this could happen, if we could not allocate memory */
 	if (entry == NULL) {
 		m_freem(m);
 		return (1);
 	}
 	if ((entry->la_flags & LLE_VALID) == 0) {
 		LLE_RUNLOCK(entry);
 		m_freem(m);
 		return (1);
 	}
 
 	bcopy(&entry->ll_addr, desten, ifp->if_addrlen);
 	*lle = entry;
 	LLE_RUNLOCK(entry);
 	/*
 	 * A *small* use after free race exists here
 	 */
 	return (0);
 }
 
 /*
  * Free every packet queued on the entry's hold list and leave the list
  * empty.
  */
 static void 
 clear_llinfo_pqueue(struct llentry *ln)
 {
 	struct mbuf *pkt, *rest;
 
 	rest = ln->la_hold;
 	while (rest != NULL) {
 		pkt = rest;
 		/* Remember the successor before the packet goes away. */
 		rest = pkt->m_nextpkt;
 		m_freem(pkt);
 	}
 
 	ln->la_hold = NULL;
 }
 
 /* Read-only sysctl handlers exporting the default router and prefix lists. */
 static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS);
 static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS);
 #ifdef SYSCTL_DECL
 SYSCTL_DECL(_net_inet6_icmp6);
 #endif
 /* Default router list, served by nd6_sysctl_drlist(). */
 SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist,
 	CTLFLAG_RD, nd6_sysctl_drlist, "");
 /* Prefix list, served by nd6_sysctl_prlist(). */
 SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist,
 	CTLFLAG_RD, nd6_sysctl_prlist, "");
 /* Read-write, per-vnet knob backed by the nd6_maxqueuelen variable. */
 SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen,
 	CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, "");
 
 static int
 nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_defrouter d;
 	struct nd_defrouter *dr;
 	int error;
 
 	if (req->newptr)
 		return (EPERM);
 
 	bzero(&d, sizeof(d));
 	d.rtaddr.sin6_family = AF_INET6;
 	d.rtaddr.sin6_len = sizeof(d.rtaddr);
 
 	/*
 	 * XXX locking
 	 */
 	TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) {
 		d.rtaddr.sin6_addr = dr->rtaddr;
 		error = sa6_recoverscope(&d.rtaddr);
 		if (error != 0)
 			return (error);
 		d.flags = dr->flags;
 		d.rtlifetime = dr->rtlifetime;
 		d.expire = dr->expire + (time_second - time_uptime);
 		d.if_index = dr->ifp->if_index;
 		error = SYSCTL_OUT(req, &d, sizeof(d));
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 static int
 nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_prefix p;
 	struct sockaddr_in6 s6;
 	struct nd_prefix *pr;
 	struct nd_pfxrouter *pfr;
 	time_t maxexpire;
 	int error;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	if (req->newptr)
 		return (EPERM);
 
 	bzero(&p, sizeof(p));
 	p.origin = PR_ORIG_RA;
 	bzero(&s6, sizeof(s6));
 	s6.sin6_family = AF_INET6;
 	s6.sin6_len = sizeof(s6);
 
 	/*
 	 * XXX locking
 	 */
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		p.prefix = pr->ndpr_prefix;
 		if (sa6_recoverscope(&p.prefix)) {
 			log(LOG_ERR, "scope error in prefix list (%s)\n",
 			    ip6_sprintf(ip6buf, &p.prefix.sin6_addr));
 			/* XXX: press on... */
 		}
 		p.raflags = pr->ndpr_raf;
 		p.prefixlen = pr->ndpr_plen;
 		p.vltime = pr->ndpr_vltime;
 		p.pltime = pr->ndpr_pltime;
 		p.if_index = pr->ndpr_ifp->if_index;
 		if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME)
 			p.expire = 0;
 		else {
 			/* XXX: we assume time_t is signed. */
 			maxexpire = (-1) &
 			    ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
 			if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate)
 				p.expire = pr->ndpr_lastupdate +
 				    pr->ndpr_vltime +
 				    (time_second - time_uptime);
 			else
 				p.expire = maxexpire;
 		}
 		p.refcnt = pr->ndpr_refcnt;
 		p.flags = pr->ndpr_stateflags;
 		p.advrtrs = 0;
 		LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry)
 			p.advrtrs++;
 		error = SYSCTL_OUT(req, &p, sizeof(p));
 		if (error != 0)
 			return (error);
 		LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
 			s6.sin6_addr = pfr->router->rtaddr;
 			if (sa6_recoverscope(&s6))
 				log(LOG_ERR,
 				    "scope error in prefix list (%s)\n",
 				    ip6_sprintf(ip6buf, &pfr->router->rtaddr));
 			error = SYSCTL_OUT(req, &s6, sizeof(s6));
 			if (error != 0)
 				return (error);
 		}
 	}
 	return (0);
 }
Index: head/sys/netinet6/send.c
===================================================================
--- head/sys/netinet6/send.c	(revision 258540)
+++ head/sys/netinet6/send.c	(revision 258541)
@@ -1,375 +1,373 @@
 /*-
  * Copyright (c) 2009-2010 Ana Kukec <anchie@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
-
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/sockstate.h>
 #include <sys/sockbuf.h>
 #include <sys/socketvar.h>
 #include <sys/types.h>
 
 #include <net/route.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/send.h>
 
 static MALLOC_DEFINE(M_SEND, "send", "Secure Neighbour Discovery");
 
 /*
  * The socket used to communicate with the SeND daemon.
  * Per-vnet; NULL while no daemon socket is attached.
  */
 static VNET_DEFINE(struct socket *, send_so);
 #define	V_send_so	VNET(send_so)
 
 /* Buffer reservations passed to soreserve() when the socket attaches. */
 u_long	send_sendspace	= 8 * (1024 + sizeof(struct sockaddr_send));
 u_long	send_recvspace	= 9216;
 
 /* Mutex taken around every access to V_send_so in this file. */
 struct mtx	send_mtx;
 #define SEND_LOCK_INIT()	mtx_init(&send_mtx, "send_mtx", NULL, MTX_DEF)
 #define SEND_LOCK()		mtx_lock(&send_mtx)
 #define SEND_UNLOCK()		mtx_unlock(&send_mtx)
 #define SEND_LOCK_DESTROY()     mtx_destroy(&send_mtx)
 
 static int
 send_attach(struct socket *so, int proto, struct thread *td)
 {
 	int error;
 
 	SEND_LOCK();
 	if (V_send_so != NULL) {
 		SEND_UNLOCK();
 		return (EEXIST);
 	}
 
 	error = priv_check(td, PRIV_NETINET_RAW);
 	if (error) {
 		SEND_UNLOCK();
 		return(error);
 	}
 
 	if (proto != IPPROTO_SEND) {
 		SEND_UNLOCK();
 		return (EPROTONOSUPPORT);
 	}
 	error = soreserve(so, send_sendspace, send_recvspace);
 	if (error) {
 		SEND_UNLOCK();
 		return(error);
 	}
 
 	V_send_so = so;
 	SEND_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Re-inject a packet handed back by the SeND user space application.
  *
  * m         - the packet; consumed on every return path
  * ifp       - interface the packet was received on (SND_IN) or is to be
  *             sent from (SND_OUT)
  * direction - SND_IN to feed the packet to the ND6 input routines,
  *             SND_OUT to pass it to the interface output routine;
  *             any other value panics
  *
  * Returns 0 on success, ENOBUFS when m_pullup() fails, ENOSYS for an
  * incoming packet whose ICMPv6 type is not an ND message, or ENOENT when
  * the interface output routine reports an error.
  */
 static int
 send_output(struct mbuf *m, struct ifnet *ifp, int direction)
 {
 	struct ip6_hdr *ip6;
 	struct sockaddr_in6 dst;
 	struct icmp6_hdr *icmp6;
 	int icmp6len;
 	int error;
 
 	/*
 	 * Receive incoming (SeND-protected) or outgoing traffic
 	 * (SeND-validated) from the SeND user space application.
 	 */
 
 	switch (direction) {
 	case SND_IN:
 		/* Make the IPv6 and ICMPv6 headers contiguous. */
 		if (m->m_len < (sizeof(struct ip6_hdr) +
 		    sizeof(struct icmp6_hdr))) {
 			m = m_pullup(m, sizeof(struct ip6_hdr) +
 			    sizeof(struct icmp6_hdr));
 			if (!m)
 				return (ENOBUFS);
 		}
 
 		/* The packet header is needed for rcvif and the length. */
 		if ((m->m_flags & M_PKTHDR) == 0)
 			panic("Doh! not the first mbuf.");
 
 		/* Before passing off the mbuf record the proper interface. */
 		m->m_pkthdr.rcvif = ifp;
 		icmp6len = m->m_pkthdr.len - sizeof(struct ip6_hdr);
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		icmp6 = (struct icmp6_hdr *)(ip6 + 1);
 
 		/*
 		 * Output the packet as icmp6.c:icmp6_input() would do.
 		 * The input routines consume the mbuf; for an unsupported
 		 * type it has to be freed here, because the caller treats
 		 * the mbuf as consumed no matter what is returned.
 		 */
 		switch (icmp6->icmp6_type) {
 		case ND_NEIGHBOR_SOLICIT:
 			nd6_ns_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		case ND_NEIGHBOR_ADVERT:
 			nd6_na_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		case ND_REDIRECT:
 			icmp6_redirect_input(m, sizeof(struct ip6_hdr));
 			break;
 		case ND_ROUTER_SOLICIT:
 			nd6_rs_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		case ND_ROUTER_ADVERT:
 			nd6_ra_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		default:
 			m_freem(m);
 			return (ENOSYS);
 		}
 		return (0);
 
 	case SND_OUT:
 		if (m->m_len < sizeof(struct ip6_hdr)) {
 			m = m_pullup(m, sizeof(struct ip6_hdr));
 			if (!m)
 				return (ENOBUFS);
 		}
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
 			m->m_flags |= M_MCAST;
 
 		/* Build the destination from the IPv6 header. */
 		bzero(&dst, sizeof(dst));
 		dst.sin6_family = AF_INET6;
 		dst.sin6_len = sizeof(dst);
 		dst.sin6_addr = ip6->ip6_dst;
 
 		m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 
 		IP_PROBE(send, NULL, NULL, ip6, ifp, NULL, ip6);
 
 		/*
 		 * Output the packet as nd6.c:nd6_output_lle() would do.
 		 * The mbuf is always consumed, so we do not have to care
 		 * about that.
 		 * XXX-BZ as we added data, what about fragmenting,
 		 * if now needed?
 		 */
 		error = ((*ifp->if_output)(ifp, m, (struct sockaddr *)&dst,
 		    NULL));
 		/* Every output failure is reported to the daemon as ENOENT. */
 		if (error)
 			error = ENOENT;
 		return (error);
 
 	default:
 		panic("%s: direction %d neither SND_IN nor SND_OUT.",
 		     __func__, direction);
 	}
 }
 
 /*
  * Receive a SeND message from user space to be either send out by the kernel
  * or, with SeND ICMPv6 options removed, to be further processed by the icmp6
  * input path.
  */
 static int
 send_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct sockaddr_send *sendsrc;
 	struct ifnet *ifp;
 	int error;
 
 	KASSERT(V_send_so == so, ("%s: socket %p not send socket %p",
 		__func__, so, V_send_so));
 
 	sendsrc = (struct sockaddr_send *)nam;
 	ifp = ifnet_byindex_ref(sendsrc->send_ifidx);
 	if (ifp == NULL) {
 		error = ENETUNREACH;
 		goto err;
 	}
 
 	error = send_output(m, ifp, sendsrc->send_direction);
 	if_rele(ifp);
 	m = NULL;
 
 err:
 	if (m != NULL)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * pru_detach handler: forget the registered SeND socket so that
  * send_input() reports "no daemon" again.
  */
 static void
 send_close(struct socket *so)
 {
 
 	SEND_LOCK();
 	/* Storing NULL over NULL is harmless, so no need to test first. */
 	V_send_so = NULL;
 	SEND_UNLOCK();
 }
 
 /*
  * Send a SeND message to user space, that was either received and has to be
  * validated or was about to be sent out and has to be handled by the SEND
  * daemon adding SeND ICMPv6 options.
  *
  * Returns -1 when no daemon socket is attached (the packet is left
  * untouched), otherwise 0; in the latter case the packet has either been
  * queued on the daemon's receive buffer or freed.
  */
 static int
 send_input(struct mbuf *m, struct ifnet *ifp, int direction, int msglen __unused)
 {
 	struct sockaddr_send from;
 	struct ip6_hdr *hdr;
 
 	SEND_LOCK();
 	if (V_send_so == NULL) {
 		SEND_UNLOCK();
 		return (-1);
 	}
 
 	/*
 	 * Make sure to clear any possible internally embedded scope before
 	 * passing the packet to user space for SeND cryptographic signature
 	 * validation to succeed.
 	 */
 	hdr = mtod(m, struct ip6_hdr *);
 	in6_clearscope(&hdr->ip6_src);
 	in6_clearscope(&hdr->ip6_dst);
 
 	/* Describe the packet to the daemon: interface and direction. */
 	bzero(&from, sizeof(from));
 	from.send_len = sizeof(from);
 	from.send_family = AF_INET6;
 	from.send_direction = direction;
 	from.send_ifidx = ifp->if_index;
 
 	/*
 	 * Send incoming or outgoing traffic to user space either to be
 	 * protected (outgoing) or validated (incoming) according to rfc3971.
 	 */
 	SOCKBUF_LOCK(&V_send_so->so_rcv);
 	if (sbappendaddr_locked(&V_send_so->so_rcv,
 	    (struct sockaddr *)&from, m, NULL) != 0) {
 		/* Queued; wake the reader. */
 		sorwakeup_locked(V_send_so);
 	} else {
 		/* Could not be appended: unlock ourselves and drop it. */
 		SOCKBUF_UNLOCK(&V_send_so->so_rcv);
 		/* XXX stats. */
 		m_freem(m);
 	}
 
 	SEND_UNLOCK();
 	return (0);
 }
 
 /* User request handlers for the SeND socket; only these three are set. */
 struct pr_usrreqs send_usrreqs = {
 	.pru_attach =		send_attach,
 	.pru_send =		send_send,
 	.pru_detach =		send_close
 };
 /* Protocol switch entry registered for PF_INET6 on module load. */
 struct protosw send_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_protocol =		IPPROTO_SEND,
 	.pr_usrreqs =		&send_usrreqs
 };
 
 /*
  * Module event handler.
  *
  * MOD_LOAD initializes the lock, registers the protocol for PF_INET6 and
  * installs send_input() as the SeND input hook; when registration fails
  * the lock is destroyed again and the error is returned.
  * MOD_UNLOAD is always refused with EBUSY; the teardown code that follows
  * it is only compiled under __notyet__ and sits after the return.
  * All other event types return 0.
  */
 static int
 send_modevent(module_t mod, int type, void *unused)
 {
 #ifdef __notyet__
 	VNET_ITERATOR_DECL(vnet_iter);
 #endif
 	int error;
 
 	switch (type) {
 	case MOD_LOAD:
 		SEND_LOCK_INIT();
 
 		error = pf_proto_register(PF_INET6, &send_protosw);
 		if (error != 0) {
 			printf("%s:%d: MOD_LOAD pf_proto_register(): %d\n",
 			   __func__, __LINE__, error);
 			SEND_LOCK_DESTROY();
 			break;
 		}
 		/* From here on ND6 output can divert packets to the daemon. */
 		send_sendso_input_hook = send_input;
 		break;
 	case MOD_UNLOAD:
 		/* Do not allow unloading w/o locking. */
 		return (EBUSY);
 #ifdef __notyet__
 		/* Refuse to unload while any vnet still has a SeND socket. */
 		VNET_LIST_RLOCK_NOSLEEP();
 		SEND_LOCK();
 		VNET_FOREACH(vnet_iter) {
 			CURVNET_SET(vnet_iter);
 			if (V_send_so != NULL) {
 				CURVNET_RESTORE();
 				SEND_UNLOCK();
 				VNET_LIST_RUNLOCK_NOSLEEP();
 				return (EBUSY);
 			}
 			CURVNET_RESTORE();
 		}
 		SEND_UNLOCK();
 		VNET_LIST_RUNLOCK_NOSLEEP();
 		error = pf_proto_unregister(PF_INET6, IPPROTO_SEND, SOCK_RAW);
 		if (error == 0)
 			SEND_LOCK_DESTROY();
 		send_sendso_input_hook = NULL;
 		break;
 #endif
 	default:
 		error = 0;
 		break;
 	}
 
 	return (error);
 }
 
 /* Module description: name, event handler, and no extra argument. */
 static moduledata_t sendmod = {
 	"send",
 	send_modevent,
 	0
 };
 
 /* Register the module at the protocol-domain initialization stage. */
 DECLARE_MODULE(send, sendmod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
Index: head/sys/netinet6/udp6_usrreq.c
===================================================================
--- head/sys/netinet6/udp6_usrreq.c	(revision 258540)
+++ head/sys/netinet6/udp6_usrreq.c	(revision 258541)
@@ -1,1166 +1,1165 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $
  *	$KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp_var.h>
 #include <netinet/icmp6.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <netinet6/ip6protosw.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/udp6_var.h>
 #include <netinet6/scope6_var.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif /* IPSEC */
 
 #include <security/mac/mac_framework.h>
 
 /*
  * UDP protocol implementation.
  * Per RFC 768, August, 1980.
  */
 
 /*
  * IPv4 protocol switch table; indexed through ip_protox[IPPROTO_UDP] in this
  * file to reach the IPv4 UDP user-request handlers.
  */
 extern struct protosw	inetsw[];
 /* Forward declaration; the definition appears later in this file. */
 static void		udp6_detach(struct socket *so);
 
 /*
  * Hand one received datagram to the socket attached to inp.
  *
  * n is the packet, off the offset of the UDP header within it, and fromsa
  * the sender's address.  The caller must hold the inpcb lock.  The mbuf is
  * consumed on every path: it is either appended to the receive buffer or
  * freed (policy rejection or full socket buffer).
  */
 static void
 udp6_append(struct inpcb *inp, struct mbuf *n, int off,
     struct sockaddr_in6 *fromsa)
 {
 	struct mbuf *ctl = NULL;
 	struct socket *so;
 
 	INP_LOCK_ASSERT(inp);
 
 #ifdef IPSEC
 	/* Check AH/ESP integrity. */
 	if (ipsec6_in_reject(n, inp)) {
 		m_freem(n);
 		IPSEC6STAT_INC(ips_in_polvio);
 		return;
 	}
 #endif /* IPSEC */
 #ifdef MAC
 	if (mac_inpcb_check_deliver(inp, n) != 0) {
 		m_freem(n);
 		return;
 	}
 #endif
 	so = inp->inp_socket;
 
 	/* Collect ancillary data only if the receiver asked for some. */
 	if ((inp->inp_flags & INP_CONTROLOPTS) != 0 ||
 	    (so->so_options & SO_TIMESTAMP) != 0)
 		ip6_savecontrol(inp, n, &ctl);
 
 	/* Strip everything up to and including the UDP header. */
 	m_adj(n, off + sizeof(struct udphdr));
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)fromsa, n,
 	    ctl) != 0) {
 		/* Queued; wake up any reader. */
 		sorwakeup_locked(so);
 		return;
 	}
 
 	/* Append failed: drop the datagram and count it. */
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	m_freem(n);
 	if (ctl != NULL)
 		m_freem(ctl);
 	UDPSTAT_INC(udps_fullsock);
 }
 
 /*
  * Input routine for UDP over IPv6.
  *
  * *mp is the received packet and *offp the offset of the UDP header within
  * it; proto is not used by this function.  The packet is validated
  * (non-zero destination port, UDP length equal to the IPv6 payload length,
  * non-zero and correct checksum) and then delivered:
  *
  *  - multicast destinations: every matching pcb on the V_udb list receives
  *    a copy, with the final match receiving the original mbuf;
  *  - unicast destinations: a single pcb is looked up (honouring an
  *    IPFORWARD tag if present); when none is found an ICMPv6 port
  *    unreachable may be generated.
  *
  * The mbuf is consumed on every path and the function always returns
  * IPPROTO_DONE.
  */
 int
 udp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6;
 	struct udphdr *uh;
 	struct inpcb *inp;
 	struct udpcb *up;
 	int off = *offp;
 	int plen, ulen;
 	struct sockaddr_in6 fromsa;
 	struct m_tag *fwd_tag;
 	uint16_t uh_sum;
 
 	ifp = m->m_pkthdr.rcvif;
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) {
 		/* XXX send icmp6 host/port unreach? */
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* Obtain a pointer to the UDP header, by whichever method is built in. */
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), IPPROTO_DONE);
 	ip6 = mtod(m, struct ip6_hdr *);
 	uh = (struct udphdr *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(*uh));
 	if (!uh)
 		return (IPPROTO_DONE);
 #endif
 
 	UDPSTAT_INC(udps_ipackets);
 
 	/*
 	 * Destination port of 0 is illegal, based on RFC768.
 	 */
 	if (uh->uh_dport == 0)
 		goto badunlocked;
 
 	/*
 	 * plen is the number of bytes from the UDP header to the end of the
 	 * IPv6 payload; it must equal the length claimed in the UDP header.
 	 */
 	plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6);
 	ulen = ntohs((u_short)uh->uh_ulen);
 
 	if (plen != ulen) {
 		UDPSTAT_INC(udps_badlen);
 		goto badunlocked;
 	}
 
 	/*
 	 * Checksum extended UDP header and data.
 	 * A zero checksum is rejected outright.
 	 */
 	if (uh->uh_sum == 0) {
 		UDPSTAT_INC(udps_nosum);
 		goto badunlocked;
 	}
 
 	/*
 	 * Use the checksum data recorded in the packet header when it is
 	 * flagged valid, otherwise compute the checksum in software.
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
 		if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 			uh_sum = m->m_pkthdr.csum_data;
 		else
 			uh_sum = in6_cksum_pseudo(ip6, ulen,
 			    IPPROTO_UDP, m->m_pkthdr.csum_data);
 		uh_sum ^= 0xffff;
 	} else
 		uh_sum = in6_cksum(m, IPPROTO_UDP, off, ulen);
 
 	if (uh_sum != 0) {
 		UDPSTAT_INC(udps_badsum);
 		goto badunlocked;
 	}
 
 	/*
 	 * Construct sockaddr format source address.
 	 */
 	init_sin6(&fromsa, m);
 	fromsa.sin6_port = uh->uh_sport;
 
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		struct inpcb *last;
 		struct ip6_moptions *imo;
 
 		INP_INFO_RLOCK(&V_udbinfo);
 		/*
 		 * In the event that laddr should be set to the link-local
 		 * address (this happens in RIPng), the multicast address
 		 * specified in the received packet will not match laddr.  To
 		 * handle this situation, matching is relaxed if the
 		 * receiving interface is the same as one specified in the
 		 * socket and if the destination multicast address matches
 		 * one of the multicast groups specified in the socket.
 		 */
 
 		/*
 		 * KAME note: traditionally we dropped udpiphdr from mbuf
 		 * here.  We need udphdr for IPsec processing so we do that
 		 * later.
 		 */
 		/*
 		 * "last" trails the scan by one match: each time a new match
 		 * is found, the previous one is given a copy of the packet.
 		 * The final match gets the original mbuf after the loop.
 		 */
 		last = NULL;
 		LIST_FOREACH(inp, &V_udb, inp_list) {
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 			if (inp->inp_lport != uh->uh_dport)
 				continue;
 			if (inp->inp_fport != 0 &&
 			    inp->inp_fport != uh->uh_sport)
 				continue;
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 				if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
 							&ip6->ip6_dst))
 					continue;
 			}
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 				if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
 							&ip6->ip6_src) ||
 				    inp->inp_fport != uh->uh_sport)
 					continue;
 			}
 
 			/*
 			 * XXXRW: Because we weren't holding either the inpcb
 			 * or the hash lock when we checked for a match
 			 * before, we should probably recheck now that the
 			 * inpcb lock is (supposed to be) held.
 			 */
 
 			/*
 			 * Handle socket delivery policy for any-source
 			 * and source-specific multicast. [RFC3678]
 			 */
 			imo = inp->in6p_moptions;
 			if (imo && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 				struct sockaddr_in6	 mcaddr;
 				int			 blocked;
 
 				INP_RLOCK(inp);
 
 				bzero(&mcaddr, sizeof(struct sockaddr_in6));
 				mcaddr.sin6_len = sizeof(struct sockaddr_in6);
 				mcaddr.sin6_family = AF_INET6;
 				mcaddr.sin6_addr = ip6->ip6_dst;
 
 				blocked = im6o_mc_filter(imo, ifp,
 					(struct sockaddr *)&mcaddr,
 					(struct sockaddr *)&fromsa);
 				if (blocked != MCAST_PASS) {
 					if (blocked == MCAST_NOTGMEMBER)
 						IP6STAT_INC(ip6s_notmember);
 					if (blocked == MCAST_NOTSMEMBER ||
 					    blocked == MCAST_MUTED)
 						UDPSTAT_INC(udps_filtermcast);
 					INP_RUNLOCK(inp); /* XXX */
 					continue;
 				}
 
 				INP_RUNLOCK(inp);
 			}
 			if (last != NULL) {
 				struct mbuf *n;
 
 				/*
 				 * Deliver a copy to the previous match; if the
 				 * copy cannot be made that match is silently
 				 * skipped.
 				 */
 				if ((n = m_copy(m, 0, M_COPYALL)) != NULL) {
 					INP_RLOCK(last);
 					up = intoudpcb(last);
 					if (up->u_tun_func == NULL) {
 						udp6_append(last, n, off, &fromsa);
 					} else {
 						/*
 						 * Engage the tunneling
 						 * protocol we will have to
 						 * leave the info_lock up,
 						 * since we are hunting
 						 * through multiple UDP's.
 						 *
 						 */
 						(*up->u_tun_func)(n, off, last);
 					}
 					INP_RUNLOCK(last);
 				}
 			}
 			last = inp;
 			/*
 			 * Don't look for additional matches if this one does
 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
 			 * socket options set.  This heuristic avoids
 			 * searching through all pcbs in the common case of a
 			 * non-shared port.  It assumes that an application
 			 * will never clear these options after setting them.
 			 */
 			if ((last->inp_socket->so_options &
 			     (SO_REUSEPORT|SO_REUSEADDR)) == 0)
 				break;
 		}
 
 		if (last == NULL) {
 			/*
 			 * No matching pcb found; discard datagram.  (No need
 			 * to send an ICMP Port Unreachable for a broadcast
 			 * or multicast datagram.)
 			 */
 			UDPSTAT_INC(udps_noport);
 			UDPSTAT_INC(udps_noportmcast);
 			goto badheadlocked;
 		}
 		/*
 		 * Lock the final match before dropping the list lock, then
 		 * give it the original mbuf.
 		 */
 		INP_RLOCK(last);
 		INP_INFO_RUNLOCK(&V_udbinfo);
 		up = intoudpcb(last);
 		UDP_PROBE(receive, NULL, last, ip6, last, uh);
 		if (up->u_tun_func == NULL) {
 			udp6_append(last, m, off, &fromsa);
 		} else {
 			/*
 			 * Engage the tunneling protocol.
 			 */
 			(*up->u_tun_func)(m, off, last);
 		}
 		INP_RUNLOCK(last);
 		return (IPPROTO_DONE);
 	}
 	/*
 	 * Locate pcb for datagram.
 	 */
 
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		struct sockaddr_in6 *next_hop6;
 
 		/* The next-hop sockaddr is stored directly after the tag header. */
 		next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
 
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * Already got one like this?
 		 */
 		inp = in6_pcblookup_mbuf(&V_udbinfo,
 		    &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport,
 		    INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in6_pcblookup(&V_udbinfo, &ip6->ip6_src,
 			    uh->uh_sport, &next_hop6->sin6_addr,
 			    next_hop6->sin6_port ? htons(next_hop6->sin6_port) :
 			    uh->uh_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif);
 		}
 		/* Remove the tag from the packet. We don't need it anymore. */
 		m_tag_delete(m, fwd_tag);
 		m->m_flags &= ~M_IP6_NEXTHOP;
 	} else
 		inp = in6_pcblookup_mbuf(&V_udbinfo, &ip6->ip6_src,
 		    uh->uh_sport, &ip6->ip6_dst, uh->uh_dport,
 		    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 	if (inp == NULL) {
 		/*
 		 * Nobody is listening: optionally log, then either drop the
 		 * packet quietly or answer with an ICMPv6 port unreachable.
 		 */
 		if (udp_log_in_vain) {
 			char ip6bufs[INET6_ADDRSTRLEN];
 			char ip6bufd[INET6_ADDRSTRLEN];
 
 			log(LOG_INFO,
 			    "Connection attempt to UDP [%s]:%d from [%s]:%d\n",
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 			    ntohs(uh->uh_dport),
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ntohs(uh->uh_sport));
 		}
 		UDPSTAT_INC(udps_noport);
 		if (m->m_flags & M_MCAST) {
 			printf("UDP6: M_MCAST is set in a unicast packet.\n");
 			UDPSTAT_INC(udps_noportmcast);
 			goto badunlocked;
 		}
 		if (V_udp_blackhole)
 			goto badunlocked;
 		if (badport_bandlim(BANDLIM_ICMP6_UNREACH) < 0)
 			goto badunlocked;
 		/* m is handed to icmp6_error() and not touched again here. */
 		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0);
 		return (IPPROTO_DONE);
 	}
 	/* The lookups above were asked (INPLOOKUP_RLOCKPCB) to return inp read-locked. */
 	INP_RLOCK_ASSERT(inp);
 	up = intoudpcb(inp);
 	UDP_PROBE(receive, NULL, inp, ip6, inp, uh);
 	if (up->u_tun_func == NULL) {
 		udp6_append(inp, m, off, &fromsa);
 	} else {
 		/*
 		 * Engage the tunneling protocol.
 		 */
 
 		(*up->u_tun_func)(m, off, inp);
 	}
 	INP_RUNLOCK(inp);
 	return (IPPROTO_DONE);
 
 	/* Error exits: release the pcb list lock if it is held, then free the packet. */
 badheadlocked:
 	INP_INFO_RUNLOCK(&V_udbinfo);
 badunlocked:
 	if (m)
 		m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Control-input handler for UDP over IPv6.
  *
  * cmd is a PRC_* code, sa the address the notification concerns and d an
  * optional struct ip6ctlparam supplied by the ICMPv6 layer.  Matching pcbs
  * are notified through in6_pcbnotify(); when the offending packet is
  * available its UDP ports narrow the match, otherwise zero ports are used.
  */
 void
 udp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 	/* Only the two port fields of the UDP header are needed. */
 	struct udp_portonly {
 		u_int16_t uh_sport;
 		u_int16_t uh_dport;
 	};
 	struct inpcb *(*notify)(struct inpcb *, int) = udp_notify;
 	const struct sockaddr_in6 *sa6_src;
 	struct ip6ctlparam *ip6cp;
 	struct udphdr uh;
 	struct mbuf *m;
 	void *cmdarg;
 	int off;
 
 	if (sa->sa_family != AF_INET6)
 		return;
 	if (sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 	if ((unsigned)cmd >= PRC_NCMDS)
 		return;
 
 	if (PRC_IS_REDIRECT(cmd)) {
 		notify = in6_rtchange;
 		d = NULL;
 	} else if (cmd == PRC_HOSTDEAD) {
 		d = NULL;
 	} else if (inet6ctlerrmap[cmd] == 0) {
 		return;
 	}
 
 	/* No ICMPv6 context: notify using the wildcard source and no ports. */
 	if (d == NULL) {
 		(void) in6_pcbnotify(&V_udbinfo, sa, 0,
 		    (const struct sockaddr *)&sa6_any, 0, cmd, NULL, notify);
 		return;
 	}
 
 	/* The parameter is from icmp6; decode it. */
 	ip6cp = (struct ip6ctlparam *)d;
 	cmdarg = ip6cp->ip6c_cmdarg;
 	sa6_src = ip6cp->ip6c_src;
 
 	if (ip6cp->ip6c_ip6 == NULL) {
 		(void) in6_pcbnotify(&V_udbinfo, sa, 0,
 		    (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify);
 		return;
 	}
 
 	/*
 	 * XXX: We assume that when the IPv6 header pointer is non NULL,
 	 * the mbuf and offset are valid.
 	 */
 	m = ip6cp->ip6c_m;
 	off = ip6cp->ip6c_off;
 
 	/* Check if we can safely examine src and dst ports. */
 	if (m->m_pkthdr.len < off + sizeof(struct udp_portonly))
 		return;
 
 	bzero(&uh, sizeof(uh));
 	m_copydata(m, off, sizeof(struct udp_portonly), (caddr_t)&uh);
 
 	(void) in6_pcbnotify(&V_udbinfo, sa, uh.uh_dport,
 	    (const struct sockaddr *)sa6_src, uh.uh_sport, cmd,
 	    cmdarg, notify);
 }
 
 static int
 udp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 
 	if (req->newlen != sizeof(addrs))
 		return (EINVAL);
 	if (req->oldlen != sizeof(struct xucred))
 		return (EINVAL);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
 	    (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
 		return (error);
 	}
 	inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr,
 	    addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port,
 	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
 	if (inp != NULL) {
 		INP_RLOCK_ASSERT(inp);
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseesocket(req->td->td_ucred,
 			    inp->inp_socket);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0,
     0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection");
 
 static int
 udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
     struct mbuf *control, struct thread *td)
 {
 	u_int32_t ulen = m->m_pkthdr.len;
 	u_int32_t plen = sizeof(struct udphdr) + ulen;
 	struct ip6_hdr *ip6;
 	struct udphdr *udp6;
 	struct in6_addr *laddr, *faddr, in6a;
 	struct sockaddr_in6 *sin6 = NULL;
 	struct ifnet *oifp = NULL;
 	int scope_ambiguous = 0;
 	u_short fport;
 	int error = 0;
 	struct ip6_pktopts *optp, opt;
 	int af = AF_INET6, hlen = sizeof(struct ip6_hdr);
 	int flags;
 	struct sockaddr_in6 tmp;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (addr6) {
 		/* addr6 has been validated in udp6_send(). */
 		sin6 = (struct sockaddr_in6 *)addr6;
 
 		/* protect *sin6 from overwrites */
 		tmp = *sin6;
 		sin6 = &tmp;
 
 		/*
 		 * Application should provide a proper zone ID or the use of
 		 * default zone IDs should be enabled.  Unfortunately, some
 		 * applications do not behave as it should, so we need a
 		 * workaround.  Even if an appropriate ID is not determined,
 		 * we'll see if we can determine the outgoing interface.  If we
 		 * can, determine the zone ID based on the interface below.
 		 */
 		if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
 			scope_ambiguous = 1;
 		if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
 			return (error);
 	}
 
 	if (control) {
 		if ((error = ip6_setpktopts(control, &opt,
 		    inp->in6p_outputopts, td->td_ucred, IPPROTO_UDP)) != 0)
 			goto release;
 		optp = &opt;
 	} else
 		optp = inp->in6p_outputopts;
 
 	if (sin6) {
 		faddr = &sin6->sin6_addr;
 
 		/*
 		 * Since we saw no essential reason for calling in_pcbconnect,
 		 * we get rid of such kind of logic, and call in6_selectsrc
 		 * and in6_pcbsetport in order to fill in the local address
 		 * and the local port.
 		 */
 		if (sin6->sin6_port == 0) {
 			error = EADDRNOTAVAIL;
 			goto release;
 		}
 
 		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 			/* how about ::ffff:0.0.0.0 case? */
 			error = EISCONN;
 			goto release;
 		}
 
 		fport = sin6->sin6_port; /* allow 0 port */
 
 		if (IN6_IS_ADDR_V4MAPPED(faddr)) {
 			if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
 				/*
 				 * I believe we should explicitly discard the
 				 * packet when mapped addresses are disabled,
 				 * rather than send the packet as an IPv6 one.
 				 * If we chose the latter approach, the packet
 				 * might be sent out on the wire based on the
 				 * default route, the situation which we'd
 				 * probably want to avoid.
 				 * (20010421 jinmei@kame.net)
 				 */
 				error = EINVAL;
 				goto release;
 			}
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
 			    !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) {
 				/*
 				 * when remote addr is an IPv4-mapped address,
 				 * local addr should not be an IPv6 address,
 				 * since you cannot determine how to map IPv6
 				 * source address to IPv4.
 				 */
 				error = EINVAL;
 				goto release;
 			}
 
 			af = AF_INET;
 		}
 
 		if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
 			error = in6_selectsrc(sin6, optp, inp, NULL,
 			    td->td_ucred, &oifp, &in6a);
 			if (error)
 				goto release;
 			if (oifp && scope_ambiguous &&
 			    (error = in6_setscope(&sin6->sin6_addr,
 			    oifp, NULL))) {
 				goto release;
 			}
 			laddr = &in6a;
 		} else
 			laddr = &inp->in6p_laddr;	/* XXX */
 		if (laddr == NULL) {
 			if (error == 0)
 				error = EADDRNOTAVAIL;
 			goto release;
 		}
 		if (inp->inp_lport == 0 &&
 		    (error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) {
 			/* Undo an address bind that may have occurred. */
 			inp->in6p_laddr = in6addr_any;
 			goto release;
 		}
 	} else {
 		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 			error = ENOTCONN;
 			goto release;
 		}
 		if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) {
 			if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
 				/*
 				 * XXX: this case would happen when the
 				 * application sets the V6ONLY flag after
 				 * connecting the foreign address.
 				 * Such applications should be fixed,
 				 * so we bark here.
 				 */
 				log(LOG_INFO, "udp6_output: IPV6_V6ONLY "
 				    "option was set for a connected socket\n");
 				error = EINVAL;
 				goto release;
 			} else
 				af = AF_INET;
 		}
 		laddr = &inp->in6p_laddr;
 		faddr = &inp->in6p_faddr;
 		fport = inp->inp_fport;
 	}
 
 	if (af == AF_INET)
 		hlen = sizeof(struct ip);
 
 	/*
 	 * Calculate data length and get a mbuf
 	 * for UDP and IP6 headers.
 	 */
 	M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT);
 	if (m == 0) {
 		error = ENOBUFS;
 		goto release;
 	}
 
 	/*
 	 * Stuff checksum and output datagram.
 	 */
 	udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen);
 	udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */
 	udp6->uh_dport = fport;
 	if (plen <= 0xffff)
 		udp6->uh_ulen = htons((u_short)plen);
 	else
 		udp6->uh_ulen = 0;
 	udp6->uh_sum = 0;
 
 	switch (af) {
 	case AF_INET6:
 		ip6 = mtod(m, struct ip6_hdr *);
 		ip6->ip6_flow	= inp->inp_flow & IPV6_FLOWINFO_MASK;
 		ip6->ip6_vfc	&= ~IPV6_VERSION_MASK;
 		ip6->ip6_vfc	|= IPV6_VERSION;
 		ip6->ip6_plen	= htons((u_short)plen);
 		ip6->ip6_nxt	= IPPROTO_UDP;
 		ip6->ip6_hlim	= in6_selecthlim(inp, NULL);
 		ip6->ip6_src	= *laddr;
 		ip6->ip6_dst	= *faddr;
 
 		udp6->uh_sum = in6_cksum_pseudo(ip6, plen, IPPROTO_UDP, 0);
 		m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 
 		flags = 0;
 
 		UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
 		UDPSTAT_INC(udps_opackets);
 		error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions,
 		    NULL, inp);
 		break;
 	case AF_INET:
 		error = EAFNOSUPPORT;
 		goto release;
 	}
 	goto releaseopt;
 
 release:
 	m_freem(m);
 
 releaseopt:
 	if (control) {
 		ip6_clearpktopts(&opt, -1);
 		m_freem(control);
 	}
 	return (error);
 }
 
 /*
  * pru_abort handler: tear down any association on the socket.
  *
  * Sockets currently flagged INP_IPV4 are handed to the IPv4 UDP abort
  * routine.  Otherwise, if the pcb has a foreign address, it is disconnected,
  * its local address is reset to the wildcard and the socket is marked
  * disconnected.
  */
 static void
 udp6_abort(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 
 	KASSERT(inp != NULL, ("udp6_abort: inp == NULL"));
 
 #ifdef INET
 	if ((inp->inp_vflag & INP_IPV4) != 0) {
 		struct pr_usrreqs *v4reqs;
 
 		v4reqs = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs;
 		(*v4reqs->pru_abort)(so);
 		return;
 	}
 #endif
 
 	INP_WLOCK(inp);
 	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		/* Not connected: nothing to undo. */
 		INP_WUNLOCK(inp);
 		return;
 	}
 	INP_HASH_WLOCK(&V_udbinfo);
 	in6_pcbdisconnect(inp);
 	inp->in6p_laddr = in6addr_any;
 	INP_HASH_WUNLOCK(&V_udbinfo);
 	soisdisconnected(so);
 	INP_WUNLOCK(inp);
 }
 
 /*
  * pru_attach handler: give a new socket its inpcb and udpcb.
  *
  * Buffer space is reserved if the socket has none, a pcb is allocated and
  * set up for IPv6 (and IPv4 too unless IN6P_IPV6_V6ONLY is set), then the
  * UDP control block is created.  Returns 0 or the error from the failing
  * step; if creating the udpcb fails the pcb is detached and freed again.
  */
 static int
 udp6_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	int error;
 
 	KASSERT(inp == NULL, ("udp6_attach: inp != NULL"));
 
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 		error = soreserve(so, udp_sendspace, udp_recvspace);
 		if (error != 0)
 			return (error);
 	}
 
 	INP_INFO_WLOCK(&V_udbinfo);
 	error = in_pcballoc(so, &V_udbinfo);
 	if (error != 0)
 		goto done;
 
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_vflag |= INP_IPV6;
 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
 		inp->inp_vflag |= INP_IPV4;
 	inp->in6p_hops = -1;	/* use kernel default */
 	inp->in6p_cksum = -1;	/* just to be sure */
 	/*
 	 * XXX: ugly!!
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
 	 * which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->inp_ip_ttl = V_ip_defttl;
 
 	error = udp_newudpcb(inp);
 	if (error != 0) {
 		/* Roll back the pcb allocation. */
 		in_pcbdetach(inp);
 		in_pcbfree(inp);
 	} else {
 		INP_WUNLOCK(inp);
 	}
 done:
 	INP_INFO_WUNLOCK(&V_udbinfo);
 	return (error);
 }
 
 /*
  * pru_bind handler.
  *
  * The pcb starts out marked IPv6-only.  Unless IN6P_IPV6_V6ONLY is set, a
  * wildcard address additionally enables IPv4, and (with INET) a v4-mapped
  * address turns the pcb into an IPv4 one and binds it through in_pcbbind().
  * All other cases bind through in6_pcbbind().  Returns the bind result.
  */
 static int
 udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
 	struct inpcb *inp = sotoinpcb(so);
 	int error;
 
 	KASSERT(inp != NULL, ("udp6_bind: inp == NULL"));
 
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(&V_udbinfo);
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
 
 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
 		goto bind6;
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 		/* Wildcard bind also accepts IPv4 traffic. */
 		inp->inp_vflag |= INP_IPV4;
 		goto bind6;
 	}
 #ifdef INET
 	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 		struct sockaddr_in sin;
 
 		in6_sin6_2_sin(&sin, sin6);
 		inp->inp_vflag |= INP_IPV4;
 		inp->inp_vflag &= ~INP_IPV6;
 		error = in_pcbbind(inp, (struct sockaddr *)&sin,
 		    td->td_ucred);
 		goto unlock;
 	}
 #endif
 
 bind6:
 	error = in6_pcbbind(inp, nam, td->td_ucred);
 #ifdef INET
 unlock:
 #endif
 	INP_HASH_WUNLOCK(&V_udbinfo);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * pru_close handler.
  *
  * Sockets currently flagged INP_IPV4 are passed to the IPv4 UDP disconnect
  * routine.  Otherwise a connected pcb is disconnected, its local address is
  * reset to the wildcard and the socket is marked disconnected.
  */
 static void
 udp6_close(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 
 	KASSERT(inp != NULL, ("udp6_close: inp == NULL"));
 
 #ifdef INET
 	if ((inp->inp_vflag & INP_IPV4) != 0) {
 		struct pr_usrreqs *v4reqs;
 
 		v4reqs = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs;
 		(*v4reqs->pru_disconnect)(so);
 		return;
 	}
 #endif
 	INP_WLOCK(inp);
 	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		/* Never connected: nothing to tear down. */
 		INP_WUNLOCK(inp);
 		return;
 	}
 	INP_HASH_WLOCK(&V_udbinfo);
 	in6_pcbdisconnect(inp);
 	inp->in6p_laddr = in6addr_any;
 	INP_HASH_WUNLOCK(&V_udbinfo);
 	soisdisconnected(so);
 	INP_WUNLOCK(inp);
 }
 
 /*
  * pru_connect handler.
  *
  * With INET, a v4-mapped destination converts the pcb to IPv4 and connects
  * through in_pcbconnect(); this is refused with EINVAL on a V6ONLY socket
  * and with EISCONN if an IPv4 peer is already set.  Any other destination
  * connects through in6_pcbconnect(), failing with EISCONN if an IPv6 peer
  * is already set.  The jail check (prison_remote_ip4/6) runs before the
  * connect.  Returns 0 or an errno value.
  *
  * NOTE(review): inp_vflag is updated before the prison check and the
  * connect call, and is not restored if either of them fails.
  */
 static int
 udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	struct sockaddr_in6 *sin6;
 	int error;
 
 	inp = sotoinpcb(so);
 	sin6 = (struct sockaddr_in6 *)nam;
 	KASSERT(inp != NULL, ("udp6_connect: inp == NULL"));
 
 	/*
 	 * XXXRW: Need to clarify locking of v4/v6 flags.
 	 */
 	INP_WLOCK(inp);
 #ifdef INET
 	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 		struct sockaddr_in sin;
 
 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
 			error = EINVAL;
 			goto out;
 		}
 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
 			error = EISCONN;
 			goto out;
 		}
 		/* Convert to a plain IPv4 address and switch the pcb to IPv4. */
 		in6_sin6_2_sin(&sin, sin6);
 		inp->inp_vflag |= INP_IPV4;
 		inp->inp_vflag &= ~INP_IPV6;
 		error = prison_remote_ip4(td->td_ucred, &sin.sin_addr);
 		if (error != 0)
 			goto out;
 		INP_HASH_WLOCK(&V_udbinfo);
 		error = in_pcbconnect(inp, (struct sockaddr *)&sin,
 		    td->td_ucred);
 		INP_HASH_WUNLOCK(&V_udbinfo);
 		if (error == 0)
 			soisconnected(so);
 		goto out;
 	}
 #endif
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		error = EISCONN;
 		goto out;
 	}
 	/* Native IPv6 destination: mark the pcb IPv6-only. */
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
 	error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr);
 	if (error != 0)
 		goto out;
 	INP_HASH_WLOCK(&V_udbinfo);
 	error = in6_pcbconnect(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(&V_udbinfo);
 	if (error == 0)
 		soisconnected(so);
 out:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * pru_detach handler: release the pcb and the UDP control block.
  *
  * The udpcb pointer is fetched while the pcb is still valid, the pcb is
  * detached and freed under the pcbinfo write lock, and the udpcb is
  * discarded once that lock has been dropped.
  */
 static void
 udp6_detach(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct udpcb *ucb;
 
 	KASSERT(inp != NULL, ("udp6_detach: inp == NULL"));
 
 	INP_INFO_WLOCK(&V_udbinfo);
 	INP_WLOCK(inp);
 
 	ucb = intoudpcb(inp);
 	KASSERT(ucb != NULL, ("%s: up == NULL", __func__));
 
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 	INP_INFO_WUNLOCK(&V_udbinfo);
 
 	udp_discardcb(ucb);
 }
 
 /*
  * pru_disconnect handler.
  *
  * Sockets currently flagged INP_IPV4 are handed to the IPv4 UDP disconnect
  * routine and 0 is returned regardless of its result.  For IPv6, returns
  * ENOTCONN when the pcb has no foreign address; otherwise the pcb is
  * disconnected, its local address is reset to the wildcard, SS_ISCONNECTED
  * is cleared and 0 is returned.
  *
  * The ENOTCONN case used to be computed and then discarded by an
  * unconditional "return (0)"; it is now reported to the caller.
  */
 static int
 udp6_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL"));
 
 #ifdef INET
 	if (inp->inp_vflag & INP_IPV4) {
 		struct pr_usrreqs *pru;
 
 		pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs;
 		(void)(*pru->pru_disconnect)(so);
 		return (0);
 	}
 #endif
 
 	INP_WLOCK(inp);
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		INP_WUNLOCK(inp);
 		return (ENOTCONN);
 	}
 
 	INP_HASH_WLOCK(&V_udbinfo);
 	in6_pcbdisconnect(inp);
 	inp->in6p_laddr = in6addr_any;
 	INP_HASH_WUNLOCK(&V_udbinfo);
 	SOCK_LOCK(so);
 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 /*
  * pru_send handler.
  *
  * so      - sending socket.
  * flags   - send flags, forwarded unchanged to the IPv4 send routine.
  * m       - payload; consumed on every path.
  * addr    - optional destination; must be a sockaddr_in6 of the right
  *           length (EINVAL) and family (EAFNOSUPPORT).
  * control - optional ancillary data; consumed on every path.
  * td      - calling thread.
  *
  * With INET and no IN6P_IPV6_V6ONLY, a v4-mapped destination (or, with no
  * destination, a pcb flagged INP_IPV4) is delegated to the IPv4 UDP send
  * routine.  Everything else goes through udp6_output().
  *
  * The validation-failure path used to free only m and leaked control; it
  * now releases both.
  */
 static int
 udp6_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *addr, struct mbuf *control, struct thread *td)
 {
 	struct inpcb *inp;
 	int error = 0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_send: inp == NULL"));
 
 	INP_WLOCK(inp);
 	if (addr) {
 		if (addr->sa_len != sizeof(struct sockaddr_in6)) {
 			error = EINVAL;
 			goto bad;
 		}
 		if (addr->sa_family != AF_INET6) {
 			error = EAFNOSUPPORT;
 			goto bad;
 		}
 	}
 
 #ifdef INET
 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
 		int hasv4addr;
 		struct sockaddr_in6 *sin6 = 0;
 
 		if (addr == 0)
 			hasv4addr = (inp->inp_vflag & INP_IPV4);
 		else {
 			sin6 = (struct sockaddr_in6 *)addr;
 			hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
 			    ? 1 : 0;
 		}
 		if (hasv4addr) {
 			struct pr_usrreqs *pru;
 
 			/*
 			 * XXXRW: We release UDP-layer locks before calling
 			 * udp_send() in order to avoid recursion.  However,
 			 * this does mean there is a short window where inp's
 			 * fields are unstable.  Could this lead to a
 			 * potential race in which the factors causing us to
 			 * select the UDPv4 output routine are invalidated?
 			 */
 			INP_WUNLOCK(inp);
 			/* Rewrite addr in place as an IPv4 sockaddr. */
 			if (sin6)
 				in6_sin6_2_sin_in_sock(addr);
 			pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs;
 			/* addr will just be freed in sendit(). */
 			return ((*pru->pru_send)(so, flags, m, addr, control,
 			    td));
 		}
 	}
 #endif
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 	/* udp6_output() asserts both locks and consumes m and control. */
 	INP_HASH_WLOCK(&V_udbinfo);
 	error = udp6_output(inp, m, addr, control, td);
 	INP_HASH_WUNLOCK(&V_udbinfo);
 	INP_WUNLOCK(inp);
 	return (error);
 
 bad:
 	INP_WUNLOCK(inp);
 	/* Nothing was handed on, so release both chains here. */
 	if (control)
 		m_freem(control);
 	m_freem(m);
 	return (error);
 }
 
 /*
  * User-request switch for UDP over IPv6.  Entries are listed
  * alphabetically; handlers without a udp6_ prefix are not defined in this
  * part of the file.
  */
 struct pr_usrreqs udp6_usrreqs = {
 	.pru_abort	= udp6_abort,
 	.pru_attach	= udp6_attach,
 	.pru_bind	= udp6_bind,
 	.pru_close	= udp6_close,
 	.pru_connect	= udp6_connect,
 	.pru_control	= in6_control,
 	.pru_detach	= udp6_detach,
 	.pru_disconnect	= udp6_disconnect,
 	.pru_peeraddr	= in6_mapped_peeraddr,
 	.pru_send	= udp6_send,
 	.pru_shutdown	= udp_shutdown,
 	.pru_sockaddr	= in6_mapped_sockaddr,
 	.pru_soreceive	= soreceive_dgram,
 	.pru_sosend	= sosend_dgram,
 	.pru_sosetlabel	= in_pcbsosetlabel,
 };
Index: head/sys/nfsclient/nfs_bio.c
===================================================================
--- head/sys/nfsclient/nfs_bio.c	(revision 258540)
+++ head/sys/nfsclient/nfs_bio.c	(revision 258541)
@@ -1,1807 +1,1805 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 
 #include <nfs/nfsproto.h>
 #include <nfsclient/nfs.h>
 #include <nfsclient/nfsmount.h>
 #include <nfsclient/nfsnode.h>
 #include <nfs/nfs_kdtrace.h>
 
 static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
 		    struct thread *td);
 static int nfs_directio_write(struct vnode *vp, struct uio *uiop, 
 			      struct ucred *cred, int ioflag);
 
 extern int nfs_directio_enable;
 extern int nfs_directio_allow_mmap;
 
 /*
  * Vnode op for VM getpages.
  *
  * Fills the pages in ap->a_m (ap->a_count bytes, starting at the file
  * offset of pages[0]) with one read RPC issued through a temporary kernel
  * mapping of those pages.
  *
  * Returns VM_PAGER_ERROR when the vnode has no VM object, when the node is
  * flagged NNONCACHE while direct I/O is enabled and mmap of such files is
  * not allowed, or when the read RPC fails without transferring any data.
  * Otherwise returns 0 after marking the pages valid for the number of
  * bytes actually read.
  */
 int
 nfs_getpages(struct vop_getpages_args *ap)
 {
 	int i, error, nextoff, size, toff, count, npages;
 	struct uio uio;
 	struct iovec iov;
 	vm_offset_t kva;
 	struct buf *bp;
 	struct vnode *vp;
 	struct thread *td;
 	struct ucred *cred;
 	struct nfsmount *nmp;
 	vm_object_t object;
 	vm_page_t *pages;
 	struct nfsnode *np;
 
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 	td = curthread;				/* XXX */
 	cred = curthread->td_ucred;		/* XXX */
 	nmp = VFSTONFS(vp->v_mount);
 	pages = ap->a_m;
 	count = ap->a_count;
 
 	if ((object = vp->v_object) == NULL) {
 		nfs_printf("nfs_getpages: called with non-merged cache vnode??\n");
 		return (VM_PAGER_ERROR);
 	}
 
 	/* n_flag is sampled under the node mutex. */
 	if (nfs_directio_enable && !nfs_directio_allow_mmap) {
 		mtx_lock(&np->n_mtx);
 		if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
 			mtx_unlock(&np->n_mtx);
 			nfs_printf("nfs_getpages: called on non-cacheable vnode??\n");
 			return (VM_PAGER_ERROR);
 		} else
 			mtx_unlock(&np->n_mtx);
 	}
 
 	/*
 	 * On an NFSv3 mount that has not yet recorded fsinfo, fetch it now.
 	 * The mount mutex is dropped before the call and the result is
 	 * deliberately ignored.
 	 */
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {	
 		mtx_unlock(&nmp->nm_mtx);
 		/* We'll never get here for v4, because we always have fsinfo */
 		(void)nfs_fsinfo(nmp, vp, cred, td);
 	} else
 		mtx_unlock(&nmp->nm_mtx);
 
 	/* Convert the byte count of the request into a page count. */
 	npages = btoc(count);
 
 	/*
 	 * If the requested page is partially valid, just return it and
 	 * allow the pager to zero-out the blanks.  Partially valid pages
 	 * can only occur at the file EOF.
 	 */
 	VM_OBJECT_WLOCK(object);
 	if (pages[ap->a_reqpage]->valid != 0) {
 		/* Release every page other than the requested one. */
 		for (i = 0; i < npages; ++i) {
 			if (i != ap->a_reqpage) {
 				vm_page_lock(pages[i]);
 				vm_page_free(pages[i]);
 				vm_page_unlock(pages[i]);
 			}
 		}
 		VM_OBJECT_WUNLOCK(object);
 		return (0);
 	}
 	VM_OBJECT_WUNLOCK(object);
 
 	/*
 	 * We use only the kva address for the buffer, but this is extremely
 	 * convenient and fast.
 	 */
 	bp = getpbuf(&nfs_pbuf_freecnt);
 
 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
 	PCPU_INC(cnt.v_vnodein);
 	PCPU_ADD(cnt.v_vnodepgsin, npages);
 
 	/* Describe the mapped pages as one kernel-space read request. */
 	iov.iov_base = (caddr_t) kva;
 	iov.iov_len = count;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
 	uio.uio_resid = count;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = td;
 
 	error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred);
 	pmap_qremove(kva, npages);
 
 	relpbuf(bp, &nfs_pbuf_freecnt);
 
 	/*
 	 * Only a failure that moved no data at all is fatal; an error that
 	 * still transferred some bytes falls through to the validation loop.
 	 */
 	if (error && (uio.uio_resid == count)) {
 		nfs_printf("nfs_getpages: error %d\n", error);
 		VM_OBJECT_WLOCK(object);
 		for (i = 0; i < npages; ++i) {
 			if (i != ap->a_reqpage) {
 				vm_page_lock(pages[i]);
 				vm_page_free(pages[i]);
 				vm_page_unlock(pages[i]);
 			}
 		}
 		VM_OBJECT_WUNLOCK(object);
 		return (VM_PAGER_ERROR);
 	}
 
 	/*
 	 * Calculate the number of bytes read and validate only that number
 	 * of bytes.  Note that due to pending writes, size may be 0.  This
 	 * does not mean that the remaining data is invalid!
 	 */
 
 	size = count - uio.uio_resid;
 	VM_OBJECT_WLOCK(object);
 	/* toff/nextoff bracket page i within the request, in bytes. */
 	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
 		vm_page_t m;
 		nextoff = toff + PAGE_SIZE;
 		m = pages[i];
 
 		if (nextoff <= size) {
 			/*
 			 * Read operation filled an entire page
 			 */
 			m->valid = VM_PAGE_BITS_ALL;
 			KASSERT(m->dirty == 0,
 			    ("nfs_getpages: page %p is dirty", m));
 		} else if (size > toff) {
 			/*
 			 * Read operation filled a partial page.
 			 */
 			m->valid = 0;
 			vm_page_set_valid_range(m, 0, size - toff);
 			KASSERT(m->dirty == 0,
 			    ("nfs_getpages: page %p is dirty", m));
 		} else {
 			/*
 			 * Read operation was short.  If no error
 			 * occurred we may have hit a zero-fill
 			 * section.  We leave valid set to 0, and page
 			 * is freed by vm_page_readahead_finish() if
 			 * its index is not equal to requested, or
 			 * page is zeroed and set valid by
 			 * vm_pager_get_pages() for requested page.
 			 */
 			;
 		}
 		if (i != ap->a_reqpage)
 			vm_page_readahead_finish(m);
 	}
 	VM_OBJECT_WUNLOCK(object);
 	return (0);
 }
 
 /*
  * Vnode op for VM putpages.
  *
  * Writes the pages in ap->a_m (ap->a_count bytes, starting at the file
  * offset of pages[0]) to the server with a single write RPC issued through
  * a temporary kernel mapping.  The write is clipped so that it never
  * extends the file past np->n_size.
  *
  * Every slot of ap->a_rtvals is preset to VM_PAGER_ERROR; on a successful
  * RPC vnode_pager_undirty_pages() updates the slots for the bytes that
  * were written.  The function returns rtvals[0].
  */
 int
 nfs_putpages(struct vop_putpages_args *ap)
 {
 	struct uio uio;
 	struct iovec iov;
 	vm_offset_t kva;
 	struct buf *bp;
 	int iomode, must_commit, i, error, npages, count;
 	off_t offset;
 	int *rtvals;
 	struct vnode *vp;
 	struct thread *td;
 	struct ucred *cred;
 	struct nfsmount *nmp;
 	struct nfsnode *np;
 	vm_page_t *pages;
 
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 	td = curthread;				/* XXX */
 	/* Set the cred to n_writecred for the write rpcs. */
 	if (np->n_writecred != NULL)
 		cred = crhold(np->n_writecred);
 	else
 		cred = crhold(curthread->td_ucred);	/* XXX */
 	nmp = VFSTONFS(vp->v_mount);
 	pages = ap->a_m;
 	count = ap->a_count;
 	rtvals = ap->a_rtvals;
 	npages = btoc(count);
 	offset = IDX_TO_OFF(pages[0]->pindex);
 
 	/*
 	 * On an NFSv3 mount that has not yet recorded fsinfo, fetch it now.
 	 * The mount mutex is dropped before the call and the result is
 	 * deliberately ignored.
 	 */
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		(void)nfs_fsinfo(nmp, vp, cred, td);
 	} else
 		mtx_unlock(&nmp->nm_mtx);
 
 	/*
 	 * A non-cacheable regular file is only reported here, not rejected.
 	 * The node mutex is dropped around the printf and retaken so that
 	 * n_size below is still read under it.
 	 */
 	mtx_lock(&np->n_mtx);
 	if (nfs_directio_enable && !nfs_directio_allow_mmap &&
 	    (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
 		mtx_unlock(&np->n_mtx);
 		nfs_printf("nfs_putpages: called on noncache-able vnode??\n");
 		mtx_lock(&np->n_mtx);
 	}
 
 	/* Assume failure for every page until the write has succeeded. */
 	for (i = 0; i < npages; i++)
 		rtvals[i] = VM_PAGER_ERROR;
 
 	/*
 	 * When putting pages, do not extend file past EOF.
 	 */
 	if (offset + count > np->n_size) {
 		count = np->n_size - offset;
 		if (count < 0)
 			count = 0;
 	}
 	mtx_unlock(&np->n_mtx);
 
 	/*
 	 * We use only the kva address for the buffer, but this is extremely
 	 * convenient and fast.
 	 */
 	bp = getpbuf(&nfs_pbuf_freecnt);
 
 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
 	PCPU_INC(cnt.v_vnodeout);
 	/*
 	 * count is a byte count (possibly clipped at EOF above); the
 	 * counter is kept in pages, as nfs_getpages() does for
 	 * v_vnodepgsin, so convert before accumulating.
 	 */
 	PCPU_ADD(cnt.v_vnodepgsout, btoc(count));
 
 	/* Describe the mapped pages as one kernel-space write request. */
 	iov.iov_base = (caddr_t) kva;
 	iov.iov_len = count;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = offset;
 	uio.uio_resid = count;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_WRITE;
 	uio.uio_td = td;
 
 	/* Only a synchronous pageout asks the server for a FILESYNC write. */
 	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
 	    iomode = NFSV3WRITE_UNSTABLE;
 	else
 	    iomode = NFSV3WRITE_FILESYNC;
 
 	error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit);
 	crfree(cred);
 
 	pmap_qremove(kva, npages);
 	relpbuf(bp, &nfs_pbuf_freecnt);
 
 	if (!error) {
 		vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid);
 		if (must_commit) {
 			nfs_clearcommit(vp->v_mount);
 		}
 	}
 	return rtvals[0];
 }
 
 /*
  * For nfs, cache consistency can only be maintained approximately.
  * Although RFC1094 does not specify the criteria, the following is
  * believed to be compatible with the reference port.
  * For nfs:
  * If the file's modify time on the server has changed since the
  * last read rpc or you have written to the file,
  * you may have lost data cache consistency with the
  * server, so flush all of the file's data out of the cache.
  * Then force a getattr rpc to ensure that you have up to date
  * attributes.
  * NB: This implies that cache data can be read when up to
  * NFS_ATTRTIMEO seconds out of date. If you find that you need current
  * attributes this could be forced by setting n_attrstamp to 0 before
  * the VOP_GETATTR() call.
  *
  * The check runs with the vnode lock upgraded by nfs_upgrade_vnlock();
  * every exit taken after the upgrade hands the saved state back to
  * nfs_downgrade_vnlock().
  *
  * Returns 0 on success, EBADF if the vnode is doomed, or the error
  * reported by nfs_vinvalbuf() or VOP_GETATTR().
  */
 static inline int
 nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
 {
 	int error = 0;
 	struct vattr vattr;
 	struct nfsnode *np = VTONFS(vp);
 	int old_lock;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 
 	/*
 	 * Grab the exclusive lock before checking whether the cache is
 	 * consistent.
 	 * XXX - We can make this cheaper later (by acquiring cheaper locks).
 	 * But for now, this suffices.
 	 */
 	old_lock = nfs_upgrade_vnlock(vp);
 	if (vp->v_iflag & VI_DOOMED) {
 		nfs_downgrade_vnlock(vp, old_lock);
 		return (EBADF);
 	}
 
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NMODIFIED) {
 		/*
 		 * Locally modified node: invalidate directory data, then
 		 * force fresh attributes and record the new mtime.
 		 */
 		mtx_unlock(&np->n_mtx);
 		if (vp->v_type != VREG) {
 			if (vp->v_type != VDIR)
 				panic("nfs: bioread, not dir");
 			(nmp->nm_rpcops->nr_invaldir)(vp);
 			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
 			if (error)
 				goto out;
 		}
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		if (error)
 			goto out;
 		mtx_lock(&np->n_mtx);
 		np->n_mtime = vattr.va_mtime;
 		mtx_unlock(&np->n_mtx);
 	} else {
 		mtx_unlock(&np->n_mtx);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		/*
 		 * Leave through "out" so the vnode lock state saved in
 		 * old_lock is restored on this failure path as well.
 		 */
 		if (error)
 			goto out;
 		mtx_lock(&np->n_mtx);
 		/*
 		 * Flush the cache if the size change flag is set or the
 		 * cached mtime no longer matches the fetched one.
 		 */
 		if ((np->n_flag & NSIZECHANGED)
 		    || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
 			mtx_unlock(&np->n_mtx);
 			if (vp->v_type == VDIR)
 				(nmp->nm_rpcops->nr_invaldir)(vp);
 			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
 			if (error)
 				goto out;
 			mtx_lock(&np->n_mtx);
 			np->n_mtime = vattr.va_mtime;
 			np->n_flag &= ~NSIZECHANGED;
 		}
 		mtx_unlock(&np->n_mtx);
 	}
 out:
 	nfs_downgrade_vnlock(vp, old_lock);
 	return error;
 }
 
 /*
  * Vnode op for read using bio
  *
  * Copies data for regular files, symbolic links and directories from
  * buffer-cache blocks into uio, reading blocks from the server when they
  * are not already cached and starting asynchronous read-aheads when the
  * mount has nm_readahead set.
  *
  * Returns 0 immediately for a zero-length request, EINVAL for a negative
  * offset, EFBIG when a non-directory read would pass nm_maxfilesize or
  * wrap, and otherwise 0 or the first error met.  When no cache block can
  * be obtained the result of nfs_sigintr() is returned, or EINTR if that
  * is 0.  With direct I/O enabled and IO_DIRECT set on a regular file the
  * cache is bypassed and nfs_readrpc() is called directly.
  */
 int
 nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
 {
 	struct nfsnode *np = VTONFS(vp);
 	int biosize, i;
 	struct buf *bp, *rabp;
 	struct thread *td;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	daddr_t lbn, rabn;
 	off_t end;
 	int bcount;
 	int seqcount;
 	int nra, error = 0, n = 0, on = 0;
 
 	KASSERT(uio->uio_rw == UIO_READ, ("nfs_read mode"));
 	if (uio->uio_resid == 0)
 		return (0);
 	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
 		return (EINVAL);
 	td = uio->uio_td;
 
 	/*
 	 * On an NFSv3 mount that has not yet recorded fsinfo, fetch it now.
 	 * The mount mutex is dropped before the call and the result is
 	 * deliberately ignored.
 	 */
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		(void)nfs_fsinfo(nmp, vp, cred, td);
 	} else
 		mtx_unlock(&nmp->nm_mtx);		
 
 	/* "end < uio_offset" catches wrap-around of the end offset. */
 	end = uio->uio_offset + uio->uio_resid;
 	if (vp->v_type != VDIR &&
 	    (end > nmp->nm_maxfilesize || end < uio->uio_offset))
 		return (EFBIG);
 
 	if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
 		/* No caching/ no readaheads. Just read data into the user buffer */
 		return nfs_readrpc(vp, uio, cred);
 
 	biosize = vp->v_bufobj.bo_bsize;
 	/* seqcount bounds how many read-ahead blocks are started below. */
 	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
 	
 	error = nfs_bioread_check_cons(vp, td, cred);
 	if (error)
 		return error;
 
 	/*
 	 * Each pass handles one cache block: obtain it (reading it in if
 	 * needed), work out how many bytes (n) at which offset (on) can be
 	 * copied, copy them out, and release the block.
 	 */
 	do {
 	    u_quad_t nsize;
 			
 	    /* Sample the file size under the node mutex for this pass. */
 	    mtx_lock(&np->n_mtx);
 	    nsize = np->n_size;
 	    mtx_unlock(&np->n_mtx);		    
 
 	    switch (vp->v_type) {
 	    case VREG:
 		nfsstats.biocache_reads++;
 		lbn = uio->uio_offset / biosize;
 		on = uio->uio_offset - (lbn * biosize);
 
 		/*
 		 * Start the read ahead(s), as required.
 		 */
 		if (nmp->nm_readahead > 0) {
 		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
 			(off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
 			rabn = lbn + 1 + nra;
 			if (incore(&vp->v_bufobj, rabn) == NULL) {
 			    rabp = nfs_getcacheblk(vp, rabn, biosize, td);
 			    if (!rabp) {
 				error = nfs_sigintr(nmp, td);
 				return (error ? error : EINTR);
 			    }
 			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
 				rabp->b_flags |= B_ASYNC;
 				rabp->b_iocmd = BIO_READ;
 				vfs_busy_pages(rabp, 0);
 				/*
 				 * If the async request cannot be queued,
 				 * discard the buffer and stop reading ahead.
 				 */
 				if (nfs_asyncio(nmp, rabp, cred, td)) {
 				    rabp->b_flags |= B_INVAL;
 				    rabp->b_ioflags |= BIO_ERROR;
 				    vfs_unbusy_pages(rabp);
 				    brelse(rabp);
 				    break;
 				}
 			    } else {
 				brelse(rabp);
 			    }
 			}
 		    }
 		}
 
 		/* Note that bcount is *not* DEV_BSIZE aligned. */
 		bcount = biosize;
 		if ((off_t)lbn * biosize >= nsize) {
 			bcount = 0;
 		} else if ((off_t)(lbn + 1) * biosize > nsize) {
 			bcount = nsize - (off_t)lbn * biosize;
 		}
 		bp = nfs_getcacheblk(vp, lbn, bcount, td);
 
 		if (!bp) {
 			error = nfs_sigintr(nmp, td);
 			return (error ? error : EINTR);
 		}
 
 		/*
 		 * If B_CACHE is not set, we must issue the read.  If this
 		 * fails, we return an error.
 		 */
 
 		if ((bp->b_flags & B_CACHE) == 0) {
 		    bp->b_iocmd = BIO_READ;
 		    vfs_busy_pages(bp, 0);
 		    error = nfs_doio(vp, bp, cred, td);
 		    if (error) {
 			brelse(bp);
 			return (error);
 		    }
 		}
 
 		/*
 		 * on is the offset into the current bp.  Figure out how many
 		 * bytes we can copy out of the bp.  Note that bcount is
 		 * NOT DEV_BSIZE aligned.
 		 *
 		 * Then figure out how many bytes we can copy into the uio.
 		 */
 
 		n = 0;
 		if (on < bcount)
 			n = MIN((unsigned)(bcount - on), uio->uio_resid);
 		break;
 	    case VLNK:
 		nfsstats.biocache_readlinks++;
 		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
 		if (!bp) {
 			error = nfs_sigintr(nmp, td);
 			return (error ? error : EINTR);
 		}
 		if ((bp->b_flags & B_CACHE) == 0) {
 		    bp->b_iocmd = BIO_READ;
 		    vfs_busy_pages(bp, 0);
 		    error = nfs_doio(vp, bp, cred, td);
 		    if (error) {
 			bp->b_ioflags |= BIO_ERROR;
 			brelse(bp);
 			return (error);
 		    }
 		}
 		n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
 		on = 0;
 		break;
 	    case VDIR:
 		nfsstats.biocache_readdirs++;
 		/* Nothing to return at or beyond the recorded directory EOF. */
 		if (np->n_direofoffset
 		    && uio->uio_offset >= np->n_direofoffset) {
 		    return (0);
 		}
 		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
 		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
 		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
 		if (!bp) {
 		    error = nfs_sigintr(nmp, td);
 		    return (error ? error : EINTR);
 		}
 		if ((bp->b_flags & B_CACHE) == 0) {
 		    bp->b_iocmd = BIO_READ;
 		    vfs_busy_pages(bp, 0);
 		    error = nfs_doio(vp, bp, cred, td);
 		    if (error) {
 			    brelse(bp);
 		    }
 		    while (error == NFSERR_BAD_COOKIE) {
 			(nmp->nm_rpcops->nr_invaldir)(vp);
 			error = nfs_vinvalbuf(vp, 0, td, 1);
 			/*
 			 * Yuck! The directory has been modified on the
 			 * server. The only way to get the block is by
 			 * reading from the beginning to get all the
 			 * offset cookies.
 			 *
 			 * Leave the last bp intact unless there is an error.
 			 * Loop back up to the while if the error is another
 			 * NFSERR_BAD_COOKIE (double yuch!).
 			 */
 			for (i = 0; i <= lbn && !error; i++) {
 			    if (np->n_direofoffset
 				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
 				    return (0);
 			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
 			    if (!bp) {
 				error = nfs_sigintr(nmp, td);
 				return (error ? error : EINTR);
 			    }
 			    if ((bp->b_flags & B_CACHE) == 0) {
 				    bp->b_iocmd = BIO_READ;
 				    vfs_busy_pages(bp, 0);
 				    error = nfs_doio(vp, bp, cred, td);
 				    /*
 				     * no error + B_INVAL == directory EOF,
 				     * use the block.
 				     */
 				    if (error == 0 && (bp->b_flags & B_INVAL))
 					    break;
 			    }
 			    /*
 			     * An error will throw away the block and the
 			     * for loop will break out.  If no error and this
 			     * is not the block we want, we throw away the
 			     * block and go for the next one via the for loop.
 			     */
 			    if (error || i < lbn)
 				    brelse(bp);
 			}
 		    }
 		    /*
 		     * The above while is repeated if we hit another cookie
 		     * error.  If we hit an error and it wasn't a cookie error,
 		     * we give up.
 		     */
 		    if (error)
 			    return (error);
 		}
 
 		/*
 		 * If not eof and read aheads are enabled, start one.
 		 * (You need the current block first, so that you have the
 		 *  directory offset cookie of the next block.)
 		 */
 		if (nmp->nm_readahead > 0 &&
 		    (bp->b_flags & B_INVAL) == 0 &&
 		    (np->n_direofoffset == 0 ||
 		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
 		    incore(&vp->v_bufobj, lbn + 1) == NULL) {
 			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
 			if (rabp) {
 			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
 				rabp->b_flags |= B_ASYNC;
 				rabp->b_iocmd = BIO_READ;
 				vfs_busy_pages(rabp, 0);
 				if (nfs_asyncio(nmp, rabp, cred, td)) {
 				    rabp->b_flags |= B_INVAL;
 				    rabp->b_ioflags |= BIO_ERROR;
 				    vfs_unbusy_pages(rabp);
 				    brelse(rabp);
 				}
 			    } else {
 				brelse(rabp);
 			    }
 			}
 		}
 		/*
 		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
 		 * chopped for the EOF condition, we cannot tell how large
 		 * NFS directories are going to be until we hit EOF.  So
 		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
 		 * it just so happens that b_resid will effectively chop it
 		 * to EOF.  *BUT* this information is lost if the buffer goes
 		 * away and is reconstituted into a B_CACHE state ( due to
 		 * being VMIO ) later.  So we keep track of the directory eof
 		 * in np->n_direofoffset and chop it off as an extra step
 		 * right here.
 		 */
 		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
 		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
 			n = np->n_direofoffset - uio->uio_offset;
 		break;
 	    default:
 		nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
 		bp = NULL;
 		break;
 	    };
 
 	    if (n > 0) {
 		    error = uiomove(bp->b_data + on, (int)n, uio);
 	    }
 	    /* A symbolic link is read in one pass; n = 0 ends the loop. */
 	    if (vp->v_type == VLNK)
 		n = 0;
 	    if (bp != NULL)
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n > 0);
 	return (error);
 }
 
 /*
  * The NFS write path cannot handle iovecs with len > 1. So we need to 
  * break up iovecs accordingly (restricting them to wsize).
  * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf). 
  * For the ASYNC case, 2 copies are needed. The first a copy from the 
  * user buffer to a staging buffer and then a second copy from the staging
  * buffer to mbufs. This can be optimized by copying from the user buffer
  * directly into mbufs and passing the chain down, but that requires a 
  * fair amount of re-working of the relevant codepaths (and can be done
  * later).
  */
 static int
 nfs_directio_write(vp, uiop, cred, ioflag)
 	struct vnode *vp;
 	struct uio *uiop;
 	struct ucred *cred;
 	int ioflag;
 {
 	int error;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct thread *td = uiop->uio_td;
 	int size;
 	int wsize;
 	
 	mtx_lock(&nmp->nm_mtx);
 	wsize = nmp->nm_wsize;
 	mtx_unlock(&nmp->nm_mtx);
 	if (ioflag & IO_SYNC) {
 		int iomode, must_commit;
 		struct uio uio;
 		struct iovec iov;
 do_sync:
 		while (uiop->uio_resid > 0) {
 			size = MIN(uiop->uio_resid, wsize);
 			size = MIN(uiop->uio_iov->iov_len, size);
 			iov.iov_base = uiop->uio_iov->iov_base;
 			iov.iov_len = size;
 			uio.uio_iov = &iov;
 			uio.uio_iovcnt = 1;
 			uio.uio_offset = uiop->uio_offset;
 			uio.uio_resid = size;
 			uio.uio_segflg = UIO_USERSPACE;
 			uio.uio_rw = UIO_WRITE;
 			uio.uio_td = td;
 			iomode = NFSV3WRITE_FILESYNC;
 			error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, 
 						      &iomode, &must_commit);
 			KASSERT((must_commit == 0), 
 				("nfs_directio_write: Did not commit write"));
 			if (error)
 				return (error);
 			uiop->uio_offset += size;
 			uiop->uio_resid -= size;
 			if (uiop->uio_iov->iov_len <= size) {
 				uiop->uio_iovcnt--;
 				uiop->uio_iov++;
 			} else {
 				uiop->uio_iov->iov_base = 
 					(char *)uiop->uio_iov->iov_base + size;
 				uiop->uio_iov->iov_len -= size;
 			}
 		}
 	} else {
 		struct uio *t_uio;
 		struct iovec *t_iov;
 		struct buf *bp;
 		
 		/*
 		 * Break up the write into blocksize chunks and hand these
 		 * over to nfsiod's for write back.
 		 * Unfortunately, this incurs a copy of the data. Since 
 		 * the user could modify the buffer before the write is 
 		 * initiated.
 		 * 
 		 * The obvious optimization here is that one of the 2 copies
 		 * in the async write path can be eliminated by copying the
 		 * data here directly into mbufs and passing the mbuf chain
 		 * down. But that will require a fair amount of re-working
 		 * of the code and can be done if there's enough interest
 		 * in NFS directio access.
 		 */
 		while (uiop->uio_resid > 0) {
 			size = MIN(uiop->uio_resid, wsize);
 			size = MIN(uiop->uio_iov->iov_len, size);
 			bp = getpbuf(&nfs_pbuf_freecnt);
 			t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
 			t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
 			t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
 			t_iov->iov_len = size;
 			t_uio->uio_iov = t_iov;
 			t_uio->uio_iovcnt = 1;
 			t_uio->uio_offset = uiop->uio_offset;
 			t_uio->uio_resid = size;
 			t_uio->uio_segflg = UIO_SYSSPACE;
 			t_uio->uio_rw = UIO_WRITE;
 			t_uio->uio_td = td;
 			KASSERT(uiop->uio_segflg == UIO_USERSPACE ||
 			    uiop->uio_segflg == UIO_SYSSPACE,
 			    ("nfs_directio_write: Bad uio_segflg"));
 			if (uiop->uio_segflg == UIO_USERSPACE) {
 				error = copyin(uiop->uio_iov->iov_base,
 				    t_iov->iov_base, size);
 				if (error != 0)
 					goto err_free;
 			} else
 				/*
 				 * UIO_SYSSPACE may never happen, but handle
 				 * it just in case it does.
 				 */
 				bcopy(uiop->uio_iov->iov_base, t_iov->iov_base,
 				    size);
 			bp->b_flags |= B_DIRECT;
 			bp->b_iocmd = BIO_WRITE;
 			if (cred != NOCRED) {
 				crhold(cred);
 				bp->b_wcred = cred;
 			} else 
 				bp->b_wcred = NOCRED;			
 			bp->b_caller1 = (void *)t_uio;
 			bp->b_vp = vp;
 			error = nfs_asyncio(nmp, bp, NOCRED, td);
 err_free:
 			if (error) {
 				free(t_iov->iov_base, M_NFSDIRECTIO);
 				free(t_iov, M_NFSDIRECTIO);
 				free(t_uio, M_NFSDIRECTIO);
 				bp->b_vp = NULL;
 				relpbuf(bp, &nfs_pbuf_freecnt);
 				if (error == EINTR)
 					return (error);
 				goto do_sync;
 			}
 			uiop->uio_offset += size;
 			uiop->uio_resid -= size;
 			if (uiop->uio_iov->iov_len <= size) {
 				uiop->uio_iovcnt--;
 				uiop->uio_iov++;
 			} else {
 				uiop->uio_iov->iov_base = 
 					(char *)uiop->uio_iov->iov_base + size;
 				uiop->uio_iov->iov_len -= size;
 			}
 		}
 	}
 	return (0);
 }
 
 /*
  * Vnode op for write using bio
  */
 int
 nfs_write(struct vop_write_args *ap)
 {
 	int biosize;
 	struct uio *uio = ap->a_uio;
 	struct thread *td = uio->uio_td;
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct ucred *cred = ap->a_cred;
 	int ioflag = ap->a_ioflag;
 	struct buf *bp;
 	struct vattr vattr;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	daddr_t lbn;
 	off_t end;
 	int bcount;
 	int n, on, error = 0;
 
 	KASSERT(uio->uio_rw == UIO_WRITE, ("nfs_write mode"));
 	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
 	    ("nfs_write proc"));
 	if (vp->v_type != VREG)
 		return (EIO);
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NWRITEERR) {
 		np->n_flag &= ~NWRITEERR;
 		mtx_unlock(&np->n_mtx);
 		return (np->n_error);
 	} else
 		mtx_unlock(&np->n_mtx);
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		(void)nfs_fsinfo(nmp, vp, cred, td);
 	} else
 		mtx_unlock(&nmp->nm_mtx);
 
 	/*
 	 * Synchronously flush pending buffers if we are in synchronous
 	 * mode or if we are appending.
 	 */
 	if (ioflag & (IO_APPEND | IO_SYNC)) {
 		mtx_lock(&np->n_mtx);
 		if (np->n_flag & NMODIFIED) {
 			mtx_unlock(&np->n_mtx);
 #ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */
 			/*
 			 * Require non-blocking, synchronous writes to
 			 * dirty files to inform the program it needs
 			 * to fsync(2) explicitly.
 			 */
 			if (ioflag & IO_NDELAY)
 				return (EAGAIN);
 #endif
 flush_and_restart:
 			np->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
 			if (error)
 				return (error);
 		} else
 			mtx_unlock(&np->n_mtx);
 	}
 
 	/*
 	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
 	 * get the append lock.
 	 */
 	if (ioflag & IO_APPEND) {
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		if (error)
 			return (error);
 		mtx_lock(&np->n_mtx);
 		uio->uio_offset = np->n_size;
 		mtx_unlock(&np->n_mtx);
 	}
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	end = uio->uio_offset + uio->uio_resid;
 	if (end > nmp->nm_maxfilesize || end < uio->uio_offset)
 		return (EFBIG);
 	if (uio->uio_resid == 0)
 		return (0);
 
 	if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG)
 		return nfs_directio_write(vp, uio, cred, ioflag);
 
 	/*
 	 * Maybe this should be above the vnode op call, but so long as
 	 * file servers have no limits, i don't think it matters
 	 */
 	if (vn_rlimit_fsize(vp, uio, td))
 		return (EFBIG);
 
 	biosize = vp->v_bufobj.bo_bsize;
 	/*
 	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
 	 * would exceed the local maximum per-file write commit size when
 	 * combined with those, we must decide whether to flush,
 	 * go synchronous, or return error.  We don't bother checking
 	 * IO_UNIT -- we just make all writes atomic anyway, as there's
 	 * no point optimizing for something that really won't ever happen.
 	 */
 	if (!(ioflag & IO_SYNC)) {
 		int nflag;
 
 		mtx_lock(&np->n_mtx);
 		nflag = np->n_flag;
 		mtx_unlock(&np->n_mtx);		
 		int needrestart = 0;
 		if (nmp->nm_wcommitsize < uio->uio_resid) {
 			/*
 			 * If this request could not possibly be completed
 			 * without exceeding the maximum outstanding write
 			 * commit size, see if we can convert it into a
 			 * synchronous write operation.
 			 */
 			if (ioflag & IO_NDELAY)
 				return (EAGAIN);
 			ioflag |= IO_SYNC;
 			if (nflag & NMODIFIED)
 				needrestart = 1;
 		} else if (nflag & NMODIFIED) {
 			int wouldcommit = 0;
 			BO_LOCK(&vp->v_bufobj);
 			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
 				TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
 				    b_bobufs) {
 					if (bp->b_flags & B_NEEDCOMMIT)
 						wouldcommit += bp->b_bcount;
 				}
 			}
 			BO_UNLOCK(&vp->v_bufobj);
 			/*
 			 * Since we're not operating synchronously and
 			 * bypassing the buffer cache, we are in a commit
 			 * and holding all of these buffers whether
 			 * transmitted or not.  If not limited, this
 			 * will lead to the buffer cache deadlocking,
 			 * as no one else can flush our uncommitted buffers.
 			 */
 			wouldcommit += uio->uio_resid;
 			/*
 			 * If we would initially exceed the maximum
 			 * outstanding write commit size, flush and restart.
 			 */
 			if (wouldcommit > nmp->nm_wcommitsize)
 				needrestart = 1;
 		}
 		if (needrestart)
 			goto flush_and_restart;
 	}
 
 	do {
 		nfsstats.biocache_writes++;
 		lbn = uio->uio_offset / biosize;
 		on = uio->uio_offset - (lbn * biosize);
 		n = MIN((unsigned)(biosize - on), uio->uio_resid);
 again:
 		/*
 		 * Handle direct append and file extension cases, calculate
 		 * unaligned buffer size.
 		 */
 		mtx_lock(&np->n_mtx);
 		if (uio->uio_offset == np->n_size && n) {
 			mtx_unlock(&np->n_mtx);
 			/*
 			 * Get the buffer (in its pre-append state to maintain
 			 * B_CACHE if it was previously set).  Resize the
 			 * nfsnode after we have locked the buffer to prevent
 			 * readers from reading garbage.
 			 */
 			bcount = on;
 			bp = nfs_getcacheblk(vp, lbn, bcount, td);
 
 			if (bp != NULL) {
 				long save;
 
 				mtx_lock(&np->n_mtx);
 				np->n_size = uio->uio_offset + n;
 				np->n_flag |= NMODIFIED;
 				vnode_pager_setsize(vp, np->n_size);
 				mtx_unlock(&np->n_mtx);
 
 				save = bp->b_flags & B_CACHE;
 				bcount += n;
 				allocbuf(bp, bcount);
 				bp->b_flags |= save;
 			}
 		} else {
 			/*
 			 * Obtain the locked cache block first, and then
 			 * adjust the file's size as appropriate.
 			 */
 			bcount = on + n;
 			if ((off_t)lbn * biosize + bcount < np->n_size) {
 				if ((off_t)(lbn + 1) * biosize < np->n_size)
 					bcount = biosize;
 				else
 					bcount = np->n_size - (off_t)lbn * biosize;
 			}
 			mtx_unlock(&np->n_mtx);
 			bp = nfs_getcacheblk(vp, lbn, bcount, td);
 			mtx_lock(&np->n_mtx);
 			if (uio->uio_offset + n > np->n_size) {
 				np->n_size = uio->uio_offset + n;
 				np->n_flag |= NMODIFIED;
 				vnode_pager_setsize(vp, np->n_size);
 			}
 			mtx_unlock(&np->n_mtx);
 		}
 
 		if (!bp) {
 			error = nfs_sigintr(nmp, td);
 			if (!error)
 				error = EINTR;
 			break;
 		}
 
 		/*
 		 * Issue a READ if B_CACHE is not set.  In special-append
 		 * mode, B_CACHE is based on the buffer prior to the write
 		 * op and is typically set, avoiding the read.  If a read
 		 * is required in special append mode, the server will
 		 * probably send us a short-read since we extended the file
 		 * on our end, resulting in b_resid == 0 and, thusly,
 		 * B_CACHE getting set.
 		 *
 		 * We can also avoid issuing the read if the write covers
 		 * the entire buffer.  We have to make sure the buffer state
 		 * is reasonable in this case since we will not be initiating
 		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
 		 * more information.
 		 *
 		 * B_CACHE may also be set due to the buffer being cached
 		 * normally.
 		 */
 
 		if (on == 0 && n == bcount) {
 			bp->b_flags |= B_CACHE;
 			bp->b_flags &= ~B_INVAL;
 			bp->b_ioflags &= ~BIO_ERROR;
 		}
 
 		if ((bp->b_flags & B_CACHE) == 0) {
 			bp->b_iocmd = BIO_READ;
 			vfs_busy_pages(bp, 0);
 			error = nfs_doio(vp, bp, cred, td);
 			if (error) {
 				brelse(bp);
 				break;
 			}
 		}
 		if (bp->b_wcred == NOCRED)
 			bp->b_wcred = crhold(cred);
 		mtx_lock(&np->n_mtx);
 		np->n_flag |= NMODIFIED;
 		mtx_unlock(&np->n_mtx);
 
 		/*
 		 * If dirtyend exceeds file size, chop it down.  This should
 		 * not normally occur but there is an append race where it
 		 * might occur XXX, so we log it.
 		 *
 		 * If the chopping creates a reverse-indexed or degenerate
 		 * situation with dirtyoff/end, we 0 both of them.
 		 */
 
 		if (bp->b_dirtyend > bcount) {
 			nfs_printf("NFS append race @%lx:%d\n",
 			    (long)bp->b_blkno * DEV_BSIZE,
 			    bp->b_dirtyend - bcount);
 			bp->b_dirtyend = bcount;
 		}
 
 		if (bp->b_dirtyoff >= bp->b_dirtyend)
 			bp->b_dirtyoff = bp->b_dirtyend = 0;
 
 		/*
 		 * If the new write will leave a contiguous dirty
 		 * area, just update the b_dirtyoff and b_dirtyend,
 		 * otherwise force a write rpc of the old dirty area.
 		 *
 		 * While it is possible to merge discontiguous writes due to
 		 * our having a B_CACHE buffer ( and thus valid read data
 		 * for the hole), we don't because it could lead to
 		 * significant cache coherency problems with multiple clients,
 		 * especially if locking is implemented later on.
 		 *
 		 * as an optimization we could theoretically maintain
 		 * a linked list of discontinuous areas, but we would still
 		 * have to commit them separately so there isn't much
 		 * advantage to it except perhaps a bit of asynchronization.
 		 */
 
 		if (bp->b_dirtyend > 0 &&
 		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
 			if (bwrite(bp) == EINTR) {
 				error = EINTR;
 				break;
 			}
 			goto again;
 		}
 
 		error = uiomove((char *)bp->b_data + on, n, uio);
 
 		/*
 		 * Since this block is being modified, it must be written
 		 * again and not just committed.  Since write clustering does
 		 * not work for the stage 1 data write, only the stage 2
 		 * commit rpc, we have to clear B_CLUSTEROK as well.
 		 */
 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 
 		if (error) {
 			bp->b_ioflags |= BIO_ERROR;
 			brelse(bp);
 			break;
 		}
 
 		/*
 		 * Only update dirtyoff/dirtyend if not a degenerate
 		 * condition.
 		 */
 		if (n) {
 			if (bp->b_dirtyend > 0) {
 				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
 				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
 			} else {
 				bp->b_dirtyoff = on;
 				bp->b_dirtyend = on + n;
 			}
 			vfs_bio_set_valid(bp, on, n);
 		}
 
 		/*
 		 * If IO_SYNC do bwrite().
 		 *
 		 * IO_INVAL appears to be unused.  The idea appears to be
 		 * to turn off caching in this case.  Very odd.  XXX
 		 */
 		if ((ioflag & IO_SYNC)) {
 			if (ioflag & IO_INVAL)
 				bp->b_flags |= B_NOCACHE;
 			error = bwrite(bp);
 			if (error)
 				break;
 		} else if ((n + on) == biosize) {
 			bp->b_flags |= B_ASYNC;
 			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, NULL);
 		} else {
 			bdwrite(bp);
 		}
 	} while (uio->uio_resid > 0 && n > 0);
 
 	return (error);
 }
 
 /*
  * Get an nfs cache block.
  *
  * Allocate a new one if the block isn't currently in the cache
  * and return the block marked busy. If the calling process is
  * interrupted by a signal for an interruptible mount point, return
  * NULL.
  *
  * The caller must carefully deal with the possible B_INVAL state of
  * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
  * indirectly), so synchronous reads can be issued without worrying about
  * the B_INVAL state.  We have to be a little more careful when dealing
  * with writes (see comments in nfs_write()) when extending a file past
  * its EOF.
  */
 static struct buf *
 nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
 {
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct buf *blk;
 
 	if ((nmp->nm_flag & NFSMNT_INT) == 0) {
 		/* Mount is not interruptible: plain getblk() with no timeout. */
 		blk = getblk(vp, bn, size, 0, 0, 0);
 	} else {
 		sigset_t saved_mask;
 
 		/*
 		 * First attempt passes PCATCH while the signal mask set up
 		 * by nfs_set_sigmask() is in effect.
 		 */
 		nfs_set_sigmask(td, &saved_mask);
 		blk = getblk(vp, bn, size, PCATCH, 0, 0);
 		nfs_restore_sigmask(td, &saved_mask);
 
 		/*
 		 * No buffer yet: give up if nfs_sigintr() reports an
 		 * interruption, otherwise retry with a 2 * hz timeout.
 		 */
 		for (; blk == NULL; blk = getblk(vp, bn, size, 0, 2 * hz, 0)) {
 			if (nfs_sigintr(nmp, td))
 				return (NULL);
 		}
 	}
 
 	/* Regular files get b_blkno expressed in DEV_BSIZE units. */
 	if (vp->v_type == VREG)
 		blk->b_blkno = bn * (vp->v_bufobj.bo_bsize / DEV_BSIZE);
 	return (blk);
 }
 
 /*
  * Flush and invalidate all dirty buffers. If another process is already
  * doing the flush, just wait for completion.
  */
 int
 nfs_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, slpflag, slptimeo;
  	int old_lock = 0;
 
 	ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf");
 
 	/*
 	 * The caller's intrflg is only honoured on mounts that have
 	 * NFSMNT_INT set; otherwise it is forced off.
 	 */
 	if ((nmp->nm_flag & NFSMNT_INT) == 0)
 		intrflg = 0;
 	if (intrflg) {
 		slpflag = PCATCH;
 		slptimeo = 2 * hz;
 	} else {
 		slpflag = 0;
 		slptimeo = 0;
 	}
 
 	/*
 	 * old_lock is handed back to nfs_downgrade_vnlock() on every
 	 * return path below.
 	 */
 	old_lock = nfs_upgrade_vnlock(vp);
 	if (vp->v_iflag & VI_DOOMED) {
 		/*
 		 * Since vgonel() uses the generic vinvalbuf() to flush
 		 * dirty buffers and it does not call this function, it
 		 * is safe to just return OK when VI_DOOMED is set.
 		 */
 		nfs_downgrade_vnlock(vp, old_lock);
 		return (0);
 	}
 
 	/*
 	 * Now, flush as required.
 	 */
 	if ((flags & V_SAVE) && (vp->v_bufobj.bo_object != NULL)) {
 		VM_OBJECT_WLOCK(vp->v_bufobj.bo_object);
 		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
 		VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object);
 		/*
 		 * If the page clean was interrupted, fail the invalidation.
 		 * Not doing so, we run the risk of losing dirty pages in the
 		 * vinvalbuf() call below.
 		 */
 		if (intrflg && (error = nfs_sigintr(nmp, td)))
 			goto out;
 	}
 
 	/*
 	 * The first attempt uses slpflag with no timeout.  While
 	 * vinvalbuf() keeps failing, retry with slptimeo instead, bailing
 	 * out with the nfs_sigintr() result if the operation is
 	 * interruptible and an interruption is reported.
 	 */
 	error = vinvalbuf(vp, flags, slpflag, 0);
 	while (error) {
 		if (intrflg && (error = nfs_sigintr(nmp, td)))
 			goto out;
 		error = vinvalbuf(vp, flags, 0, slptimeo);
 	}
 	/*
 	 * NMODIFIED is only cleared when no direct-I/O asynchronous writes
 	 * are still counted in n_directio_asyncwr.
 	 */
 	mtx_lock(&np->n_mtx);
 	if (np->n_directio_asyncwr == 0)
 		np->n_flag &= ~NMODIFIED;
 	mtx_unlock(&np->n_mtx);
 out:
 	nfs_downgrade_vnlock(vp, old_lock);
 	return error;
 }
 
 /*
  * Initiate asynchronous I/O. Return an error if no nfsiods are available.
  * This is mainly to avoid queueing async I/O requests when the nfsiods
  * are all hung on a dead server.
  *
  * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
  * is eventually dequeued by the async daemon, nfs_doio() *will*.
  */
 int
 nfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
 {
 	int iod;
 	int gotiod;
 	int slpflag = 0;
 	int slptimeo = 0;
 	int error, error2;
 
 	/*
 	 * Commits are usually short and sweet so lets save some cpu and
 	 * leave the async daemons for more important rpc's (such as reads
 	 * and writes).
 	 *
 	 * Readdirplus RPCs do vget()s to acquire the vnodes for entries
 	 * in the directory in order to update attributes. This can deadlock
 	 * with another thread that is waiting for async I/O to be done by
 	 * an nfsiod thread while holding a lock on one of these vnodes.
 	 * To avoid this deadlock, don't allow the async nfsiod threads to
 	 * perform Readdirplus RPCs.
 	 */
 	mtx_lock(&nfs_iod_mtx);
 	if ((bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
 	     (nmp->nm_bufqiods > nfs_numasync / 2)) ||
 	    (bp->b_vp->v_type == VDIR && (nmp->nm_flag & NFSMNT_RDIRPLUS))) {
 		mtx_unlock(&nfs_iod_mtx);
 		return(EIO);
 	}
 	/*
 	 * Everything from here to the final unlock runs with nfs_iod_mtx
 	 * held; the search is restarted from this label whenever the iod
 	 * state may have changed underneath us.
 	 */
 again:
 	if (nmp->nm_flag & NFSMNT_INT)
 		slpflag = PCATCH;
 	gotiod = FALSE;
 
 	/*
 	 * Find a free iod to process this request.
 	 */
 	for (iod = 0; iod < nfs_numasync; iod++)
 		if (nfs_iodwant[iod] == NFSIOD_AVAILABLE) {
 			gotiod = TRUE;
 			break;
 		}
 
 	/*
 	 * Try to create one if none are free.
 	 */
 	if (!gotiod)
 		nfs_nfsiodnew();
 	else {
 		/*
 		 * Found one, so wake it up and tell it which
 		 * mount to process.
 		 */
 		NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n",
 		    iod, nmp));
 		nfs_iodwant[iod] = NFSIOD_NOT_AVAILABLE;
 		nfs_iodmount[iod] = nmp;
 		nmp->nm_bufqiods++;
 		wakeup(&nfs_iodwant[iod]);
 	}
 
 	/*
 	 * If none are free, we may already have an iod working on this mount
 	 * point.  If so, it will process our request.
 	 */
 	if (!gotiod) {
 		if (nmp->nm_bufqiods > 0) {
 			NFS_DPF(ASYNCIO,
 		("nfs_asyncio: %d iods are already processing mount %p\n",
 				 nmp->nm_bufqiods, nmp));
 			gotiod = TRUE;
 		}
 	}
 
 	/*
 	 * If we have an iod which can process the request, then queue
 	 * the buffer.
 	 */
 	if (gotiod) {
 		/*
 		 * Ensure that the queue never grows too large.  We still want
 		 * to asynchronize so we block rather than return EIO.
 		 *
 		 * Note that the body always ends in "goto again", so this
 		 * "while" runs at most once per pass through the label.
 		 */
 		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
 			NFS_DPF(ASYNCIO,
 		("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
 			nmp->nm_bufqwant = TRUE;
  			error = nfs_msleep(td, &nmp->nm_bufq, &nfs_iod_mtx, 
 					   slpflag | PRIBIO,
  					   "nfsaio", slptimeo);
 			if (error) {
 				error2 = nfs_sigintr(nmp, td);
 				if (error2) {
 					mtx_unlock(&nfs_iod_mtx);
 					return (error2);
 				}
 				/*
 				 * The sleep failed but nfs_sigintr() reported
 				 * nothing: stop passing PCATCH and use a
 				 * 2 * hz timeout for later sleeps instead.
 				 */
 				if (slpflag == PCATCH) {
 					slpflag = 0;
 					slptimeo = 2 * hz;
 				}
 			}
 			/*
 			 * We might have lost our iod while sleeping,
 			 * so check and loop if necessary.
 			 */
 			goto again;
 		}
 
 		/* We might have lost our nfsiod */
 		if (nmp->nm_bufqiods == 0) {
 			NFS_DPF(ASYNCIO,
 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
 			goto again;
 		}
 
 		/*
 		 * Attach the caller's credential to the buffer, for the
 		 * matching direction, if it does not have one yet.
 		 */
 		if (bp->b_iocmd == BIO_READ) {
 			if (bp->b_rcred == NOCRED && cred != NOCRED)
 				bp->b_rcred = crhold(cred);
 		} else {
 			if (bp->b_wcred == NOCRED && cred != NOCRED)
 				bp->b_wcred = crhold(cred);
 		}
 
 		if (bp->b_flags & B_REMFREE)
 			bremfreef(bp);
 		BUF_KERNPROC(bp);
 		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
 		nmp->nm_bufqlen++;
 		/*
 		 * Count queued direct-I/O writes on the nfsnode;
 		 * nfs_doio_directwrite() decrements the counter again.
 		 */
 		if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
 			mtx_lock(&(VTONFS(bp->b_vp))->n_mtx);			
 			VTONFS(bp->b_vp)->n_flag |= NMODIFIED;
 			VTONFS(bp->b_vp)->n_directio_asyncwr++;
 			mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx);
 		}
 		mtx_unlock(&nfs_iod_mtx);
 		return (0);
 	}
 
 	mtx_unlock(&nfs_iod_mtx);
 
 	/*
 	 * All the iods are busy on other mounts, so return EIO to
 	 * force the caller to process the i/o synchronously.
 	 */
 	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
 	return (EIO);
 }
 
 void
 nfs_doio_directwrite(struct buf *bp)
 {
 	int iomode, must_commit;
 	struct uio *uiop = (struct uio *)bp->b_caller1;
 	char *iov_base = uiop->uio_iov->iov_base;
 	struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount);
 	
 	iomode = NFSV3WRITE_FILESYNC;
 	uiop->uio_td = NULL; /* NULL since we're in nfsiod */
 	(nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit);
 	KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write"));
 	free(iov_base, M_NFSDIRECTIO);
 	free(uiop->uio_iov, M_NFSDIRECTIO);
 	free(uiop, M_NFSDIRECTIO);
 	if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
 		struct nfsnode *np = VTONFS(bp->b_vp);
 		mtx_lock(&np->n_mtx);
 		np->n_directio_asyncwr--;
 		if (np->n_directio_asyncwr == 0) {
 			VTONFS(bp->b_vp)->n_flag &= ~NMODIFIED;
 			if ((np->n_flag & NFSYNCWAIT)) {
 				np->n_flag &= ~NFSYNCWAIT;
 				wakeup((caddr_t)&np->n_directio_asyncwr);
 			}
 		}
 		mtx_unlock(&np->n_mtx);
 	}
 	bp->b_vp = NULL;
 	relpbuf(bp, &nfs_pbuf_freecnt);
 }
 
 /*
  * Do an I/O operation to/from a cache block. This may be called
  * synchronously or from an nfsiod.
  */
 int
 nfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td)
 {
 	struct uio *uiop;
 	struct nfsnode *np;
 	struct nfsmount *nmp;
 	int error = 0, iomode, must_commit = 0;
 	struct uio uio;
 	struct iovec io;
 	struct proc *p = td ? td->td_proc : NULL;
 	uint8_t	iocmd;
 	
 	/*
 	 * Describe the transfer with a single-segment, kernel-space uio
 	 * that lives on this function's stack.
 	 */
 	np = VTONFS(vp);
 	nmp = VFSTONFS(vp->v_mount);
 	uiop = &uio;
 	uiop->uio_iov = &io;
 	uiop->uio_iovcnt = 1;
 	uiop->uio_segflg = UIO_SYSSPACE;
 	uiop->uio_td = td;
 
 	/*
 	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
 	 * do this here so we do not have to do it in all the code that
 	 * calls us.
 	 */
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
 
 	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
 	iocmd = bp->b_iocmd;
 	if (iocmd == BIO_READ) {
 	    /* Reads always cover the whole buffer (b_bcount bytes). */
 	    io.iov_len = uiop->uio_resid = bp->b_bcount;
 	    io.iov_base = bp->b_data;
 	    uiop->uio_rw = UIO_READ;
 
 	    switch (vp->v_type) {
 	    case VREG:
 		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
 		nfsstats.read_bios++;
 		error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr);
 
 		if (!error) {
 		    if (uiop->uio_resid) {
 			/*
 			 * If we had a short read with no error, we must have
 			 * hit a file hole.  We should zero-fill the remainder.
 			 * This can also occur if the server hits the file EOF.
 			 *
 			 * Holes used to be able to occur due to pending
 			 * writes, but that is not possible any longer.
 			 */
 			int nread = bp->b_bcount - uiop->uio_resid;
 			int left  = uiop->uio_resid;
 
 			if (left > 0)
 				bzero((char *)bp->b_data + nread, left);
 			uiop->uio_resid = 0;
 		    }
 		}
 		/* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */
 		/*
 		 * For a VV_TEXT vnode read on behalf of a process, kill
 		 * that process if the cached n_mtime and the attribute
 		 * cache's va_mtime compare as different.
 		 */
 		if (p && (vp->v_vflag & VV_TEXT)) {
 			mtx_lock(&np->n_mtx);
 			if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime)) {
 				mtx_unlock(&np->n_mtx);
 				PROC_LOCK(p);
 				killproc(p, "text file modification");
 				PROC_UNLOCK(p);
 			} else
 				mtx_unlock(&np->n_mtx);
 		}
 		break;
 	    case VLNK:
 		uiop->uio_offset = (off_t)0;
 		nfsstats.readlink_bios++;
 		error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr);
 		break;
 	    case VDIR:
 		nfsstats.readdir_bios++;
 		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
 		/*
 		 * Try READDIRPLUS first when the mount asks for it.  If the
 		 * server reports NFSERR_NOTSUPP the mount flag is cleared,
 		 * which makes the plain READDIR below run instead.
 		 */
 		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
 			error = nfs_readdirplusrpc(vp, uiop, cr);
 			if (error == NFSERR_NOTSUPP)
 				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
 		}
 		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
 			error = nfs_readdirrpc(vp, uiop, cr);
 		/*
 		 * end-of-directory sets B_INVAL but does not generate an
 		 * error.
 		 */
 		if (error == 0 && uiop->uio_resid == bp->b_bcount)
 			bp->b_flags |= B_INVAL;
 		break;
 	    default:
 		/* Unexpected vnode type: only logged, error stays as is. */
 		nfs_printf("nfs_doio:  type %x unexpected\n", vp->v_type);
 		break;
 	    };
 	    if (error) {
 		bp->b_ioflags |= BIO_ERROR;
 		bp->b_error = error;
 	    }
 	} else {
 	    /*
 	     * If we only need to commit, try to commit
 	     */
 	    if (bp->b_flags & B_NEEDCOMMIT) {
 		    int retv;
 		    off_t off;
 
 		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
 		    retv = (nmp->nm_rpcops->nr_commit)(
 				vp, off, bp->b_dirtyend-bp->b_dirtyoff,
 				bp->b_wcred, td);
 		    if (retv == 0) {
 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
 			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			    bp->b_resid = 0;
 			    bufdone(bp);
 			    return (0);
 		    }
 		    /*
 		     * Any commit failure falls through to a regular write
 		     * below; a stale write verifier additionally clears the
 		     * commit state for the whole mount.
 		     */
 		    if (retv == NFSERR_STALEWRITEVERF) {
 			    nfs_clearcommit(vp->v_mount);
 		    }
 	    }
 
 	    /*
 	     * Setup for actual write
 	     */
 	    /* Never write past the current file size recorded in n_size. */
 	    mtx_lock(&np->n_mtx);
 	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
 		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
 	    mtx_unlock(&np->n_mtx);
 
 	    if (bp->b_dirtyend > bp->b_dirtyoff) {
 		/* Only the dirty range [b_dirtyoff, b_dirtyend) is sent. */
 		io.iov_len = uiop->uio_resid = bp->b_dirtyend
 		    - bp->b_dirtyoff;
 		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
 		    + bp->b_dirtyoff;
 		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
 		uiop->uio_rw = UIO_WRITE;
 		nfsstats.write_bios++;
 
 		/*
 		 * An unstable write is requested only when B_ASYNC is set
 		 * and none of B_NEEDCOMMIT, B_NOCACHE or B_CLUSTER are.
 		 */
 		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
 		    iomode = NFSV3WRITE_UNSTABLE;
 		else
 		    iomode = NFSV3WRITE_FILESYNC;
 
 		error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit);
 
 		/*
 		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
 		 * to cluster the buffers needing commit.  This will allow
 		 * the system to submit a single commit rpc for the whole
 		 * cluster.  We can do this even if the buffer is not 100%
 		 * dirty (relative to the NFS blocksize), so we optimize the
 		 * append-to-file-case.
 		 *
 		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
 		 * cleared because write clustering only works for commit
 		 * rpc's, not for the data portion of the write).
 		 */
 
 		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
 		    bp->b_flags |= B_NEEDCOMMIT;
 		    if (bp->b_dirtyoff == 0
 			&& bp->b_dirtyend == bp->b_bcount)
 			bp->b_flags |= B_CLUSTEROK;
 		} else {
 		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 
 		/*
 		 * For an interrupted write, the buffer is still valid
 		 * and the write hasn't been pushed to the server yet,
 		 * so we can't set BIO_ERROR and report the interruption
 		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
 		 * is not relevant, so the rpc attempt is essentially
 		 * a noop.  For the case of a V3 write rpc not being
 		 * committed to stable storage, the block is still
 		 * dirty and requires either a commit rpc or another
 		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
 		 * the block is reused. This is indicated by setting
 		 * the B_DELWRI and B_NEEDCOMMIT flags.
 		 *
 		 * If the buffer is marked B_PAGING, it does not reside on
 		 * the vp's paging queues so we cannot call bdirty().  The
 		 * bp in this case is not an NFS cache block so we should
 		 * be safe. XXX
 		 *
 		 * The logic below breaks up errors into recoverable and
 		 * unrecoverable. For the former, we clear B_INVAL|B_NOCACHE
 		 * and keep the buffer around for potential write retries.
 		 * For the latter (eg ESTALE), we toss the buffer away (B_INVAL)
 		 * and save the error in the nfsnode. This is less than ideal
 		 * but necessary. Keeping such buffers around could potentially
 		 * cause buffer exhaustion eventually (they can never be written
 		 * out, so will constantly be re-dirtied). It also causes
 		 * all sorts of vfs panics. For non-recoverable write errors,
 		 * also invalidate the attrcache, so we'll be forced to go over
 		 * the wire for this object, returning an error to user on next
 		 * call (most of the time).
 		 */
     		if (error == EINTR || error == EIO || error == ETIMEDOUT
 		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
 			int s;
 
 			s = splbio();
 			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
 			if ((bp->b_flags & B_PAGING) == 0) {
 			    bdirty(bp);
 			    bp->b_flags &= ~B_DONE;
 			}
 			if (error && (bp->b_flags & B_ASYNC) == 0)
 			    bp->b_flags |= B_EINTR;
 			splx(s);
 	    	} else {
 		    if (error) {
 			bp->b_ioflags |= BIO_ERROR;
 			bp->b_flags |= B_INVAL;
 			bp->b_error = np->n_error = error;
 			mtx_lock(&np->n_mtx);
 			np->n_flag |= NWRITEERR;
 			np->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 			mtx_unlock(&np->n_mtx);
 		    }
 		    bp->b_dirtyoff = bp->b_dirtyend = 0;
 		}
 	    } else {
 		/* Empty dirty range: nothing to send, finish the buffer. */
 		bp->b_resid = 0;
 		bufdone(bp);
 		return (0);
 	    }
 	}
 	/*
 	 * Common completion: record the residual count, clear the commit
 	 * state for the mount if the write RPC set must_commit, and mark
 	 * the buffer done.
 	 */
 	bp->b_resid = uiop->uio_resid;
 	if (must_commit)
 	    nfs_clearcommit(vp->v_mount);
 	bufdone(bp);
 	return (error);
 }
 
 /*
  * Used to aid in handling ftruncate() operations on the NFS client side.
  * Truncation creates a number of special problems for NFS.  We have to
  * throw away VM pages and buffer cache buffers that are beyond EOF, and
  * we have to properly handle VM pages or (potentially dirty) buffers
  * that straddle the truncation point.
  */
 
 int
 nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct buf *tailbp;
 	u_quad_t oldsize;
 	daddr_t lastblk;
 	int blksz, keep, error;
 
 	blksz = vp->v_bufobj.bo_bsize;
 
 	/* Install the new size and remember the one it replaces. */
 	mtx_lock(&np->n_mtx);
 	oldsize = np->n_size;
 	np->n_size = nsize;
 	mtx_unlock(&np->n_mtx);
 
 	/* Not shrinking: only the pager needs to learn the new size. */
 	if (nsize >= oldsize) {
 		vnode_pager_setsize(vp, nsize);
 		return (0);
 	}
 
 	/*
 	 * vtruncbuf() doesn't get the buffer overlapping the
 	 * truncation point.  We may have a B_DELWRI and/or B_CACHE
 	 * buffer that now needs to be truncated.
 	 */
 	error = vtruncbuf(vp, cred, nsize, blksz);
 	lastblk = nsize / blksz;
 	keep = nsize - (lastblk * blksz);
 	tailbp = nfs_getcacheblk(vp, lastblk, keep, td);
 	if (tailbp == NULL)
 		return (EINTR);
 
 	/* Pull the dirty range back inside the (now shorter) buffer. */
 	if (tailbp->b_dirtyoff > tailbp->b_bcount)
 		tailbp->b_dirtyoff = tailbp->b_bcount;
 	if (tailbp->b_dirtyend > tailbp->b_bcount)
 		tailbp->b_dirtyend = tailbp->b_bcount;
 	tailbp->b_flags |= B_RELBUF;	/* don't leave garbage around */
 	brelse(tailbp);
 
 	return (error);
 }
 
Index: head/sys/nfsclient/nfs_krpc.c
===================================================================
--- head/sys/nfsclient/nfs_krpc.c	(revision 258540)
+++ head/sys/nfsclient/nfs_krpc.c	(revision 258541)
@@ -1,888 +1,887 @@
 /*-
  * Copyright (c) 1989, 1991, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Socket operations for use by nfs
  */
 
 #include "opt_inet6.h"
-#include "opt_kdtrace.h"
 #include "opt_kgssapi.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vnode.h>
 
 #include <rpc/rpc.h>
 
 #include <nfs/nfsproto.h>
 #include <nfsclient/nfs.h>
 #include <nfs/xdr_subs.h>
 #include <nfsclient/nfsm_subs.h>
 #include <nfsclient/nfsmount.h>
 #include <nfsclient/nfsnode.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 dtrace_nfsclient_nfs23_start_probe_func_t
     dtrace_nfsclient_nfs23_start_probe;
 
 dtrace_nfsclient_nfs23_done_probe_func_t
     dtrace_nfsclient_nfs23_done_probe;
 
 /*
  * Registered probes by RPC type.
  */
 uint32_t	nfsclient_nfs2_start_probes[NFS_NPROCS];
 uint32_t	nfsclient_nfs2_done_probes[NFS_NPROCS];
 
 uint32_t	nfsclient_nfs3_start_probes[NFS_NPROCS];
 uint32_t	nfsclient_nfs3_done_probes[NFS_NPROCS];
 #endif
 
 static int	nfs_bufpackets = 4;
 static int	nfs_reconnects;
 static int	nfs3_jukebox_delay = 10;
 static int	nfs_skip_wcc_data_onerr = 1;
 static int	fake_wchan;
 
 SYSCTL_DECL(_vfs_oldnfs);
 
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
     "Buffer reservation size 2 < x < 64");
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
     "Number of times the nfs client has had to reconnect");
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW,
     &nfs3_jukebox_delay, 0,
     "Number of seconds to delay a retry after receiving EJUKEBOX");
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW,
     &nfs_skip_wcc_data_onerr, 0,
     "Disable weak cache consistency checking when server returns an error");
 
 static void	nfs_down(struct nfsmount *, struct thread *, const char *,
     int, int);
 static void	nfs_up(struct nfsmount *, struct thread *, const char *,
     int, int);
 static int	nfs_msg(struct thread *, const char *, const char *, int);
 
 extern int nfsv2_procid[];
 
 /*
  * A reference-counted RPC auth handle cached for one uid.
  */
 struct nfs_cached_auth {
 	int		ca_refs; /* refcount, including 1 from the cache */
 	uid_t		ca_uid;	 /* uid that corresponds to this auth */
 	AUTH		*ca_auth; /* RPC auth handle */
 };
 
 /*
  * RTT estimator
  */
 
 /*
  * Timer class used for each NFS procedure, indexed by procedure number.
  */
 static enum nfs_rto_timer_t nfs_proct[NFS_NPROCS] = {
 	NFS_DEFAULT_TIMER,	/* NULL */
 	NFS_GETATTR_TIMER,	/* GETATTR */
 	NFS_DEFAULT_TIMER,	/* SETATTR */
 	NFS_LOOKUP_TIMER,	/* LOOKUP */
 	NFS_GETATTR_TIMER,	/* ACCESS */
 	NFS_READ_TIMER,		/* READLINK */
 	NFS_READ_TIMER,		/* READ */
 	NFS_WRITE_TIMER,	/* WRITE */
 	NFS_DEFAULT_TIMER,	/* CREATE */
 	NFS_DEFAULT_TIMER,	/* MKDIR */
 	NFS_DEFAULT_TIMER,	/* SYMLINK */
 	NFS_DEFAULT_TIMER,	/* MKNOD */
 	NFS_DEFAULT_TIMER,	/* REMOVE */
 	NFS_DEFAULT_TIMER,	/* RMDIR */
 	NFS_DEFAULT_TIMER,	/* RENAME */
 	NFS_DEFAULT_TIMER,	/* LINK */
 	NFS_READ_TIMER,		/* READDIR */
 	NFS_READ_TIMER,		/* READDIRPLUS */
 	NFS_DEFAULT_TIMER,	/* FSSTAT */
 	NFS_DEFAULT_TIMER,	/* FSINFO */
 	NFS_DEFAULT_TIMER,	/* PATHCONF */
 	NFS_DEFAULT_TIMER,	/* COMMIT */
 	NFS_DEFAULT_TIMER,	/* NOOP */
 };
 
 /*
  * Choose the correct RTT timer for this NFS procedure.
  *
  * A procedure number outside the table falls back to NFS_DEFAULT_TIMER
  * instead of reading past the end of nfs_proct[].
  */
 static inline enum nfs_rto_timer_t
 nfs_rto_timer(u_int32_t procnum)
 {
 
 	if (procnum >= NFS_NPROCS)
 		return (NFS_DEFAULT_TIMER);
 	return (nfs_proct[procnum]);
 }
 
 /*
  * Initialize the RTT estimator state for a new mount point.
  */
 static void
 nfs_init_rtt(struct nfsmount *nmp)
 {
 	int slot = 0;
 
 	/* Every timer starts from hz with no recorded deviation. */
 	while (slot < NFS_MAX_TIMER) {
 		nmp->nm_timers[slot].rt_rtxcur = hz;
 		nmp->nm_timers[slot].rt_deviate = 0;
 		nmp->nm_timers[slot].rt_srtt = hz;
 		slot++;
 	}
 }
 
 /*
  * Initialize sockets and congestion for a new NFS connection.
  * We do not free the sockaddr if error.
  */
 int
 nfs_connect(struct nfsmount *nmp)
 {
 	int rcvreserve, sndreserve;
 	int pktscale;
 	struct sockaddr *saddr;
 	struct ucred *origcred;
 	struct thread *td = curthread;
 	CLIENT *client;
 	struct netconfig *nconf;
 	rpcvers_t vers;
 	int one = 1, retries;
 	struct timeval timo;
 
 	/*
 	 * We need to establish the socket using the credentials of
 	 * the mountpoint.  Some parts of this process (such as
 	 * sobind() and soconnect()) will use the current thread's
 	 * credential instead of the socket credential.  To work
 	 * around this, temporarily change the current thread's
 	 * credential to that of the mountpoint.
 	 *
 	 * XXX: It would be better to explicitly pass the correct
 	 * credential to sobind() and soconnect().
 	 */
 	origcred = td->td_ucred;
 	td->td_ucred = nmp->nm_mountp->mnt_cred;
 	saddr = nmp->nm_nam;
 
 	/* Pick the RPC program version from the mount flags (default v2). */
 	vers = NFS_VER2;
 	if (nmp->nm_flag & NFSMNT_NFSV3)
 		vers = NFS_VER3;
 	else if (nmp->nm_flag & NFSMNT_NFSV4)
 		vers = NFS_VER4;
 
 	/* Pick the netconfig entry from address family and socket type. */
 	if (saddr->sa_family == AF_INET) {
 		if (nmp->nm_sotype == SOCK_DGRAM)
 			nconf = getnetconfigent("udp");
 		else
 			nconf = getnetconfigent("tcp");
 	} else {
 		if (nmp->nm_sotype == SOCK_DGRAM)
 			nconf = getnetconfigent("udp6");
 		else
 			nconf = getnetconfigent("tcp6");
 	}
 
 	/*
 	 * Get buffer reservation size from sysctl, but impose reasonable
 	 * limits.
 	 */
 	pktscale = nfs_bufpackets;
 	if (pktscale < 2)
 		pktscale = 2;
 	if (pktscale > 64)
 		pktscale = 64;
 	mtx_lock(&nmp->nm_mtx);
 	if (nmp->nm_sotype == SOCK_DGRAM) {
 		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
 		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
 		    NFS_MAXPKTHDR) * pktscale;
 	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
 		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
 		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
 		    NFS_MAXPKTHDR) * pktscale;
 	} else {
 		if (nmp->nm_sotype != SOCK_STREAM)
 			panic("nfscon sotype");
 		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
 		    sizeof (u_int32_t)) * pktscale;
 		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
 		    sizeof (u_int32_t)) * pktscale;
 	}
 	mtx_unlock(&nmp->nm_mtx);
 
 	client = clnt_reconnect_create(nconf, saddr, NFS_PROG, vers,
 	    sndreserve, rcvreserve);
 	CLNT_CONTROL(client, CLSET_WAITCHAN, "nfsreq");
 	if (nmp->nm_flag & NFSMNT_INT)
 		CLNT_CONTROL(client, CLSET_INTERRUPTIBLE, &one);
 	if (nmp->nm_flag & NFSMNT_RESVPORT)
 		CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
 	if ((nmp->nm_flag & NFSMNT_SOFT) != 0) {
 		if (nmp->nm_sotype == SOCK_DGRAM)
 			/*
 			 * For UDP, the large timeout for a reconnect will
 			 * be set to "nm_retry * nm_timeo / 2", so we only
 			 * want to do 2 reconnect timeout retries.
 			 */
 			retries = 2;
 		else
 			retries = nmp->nm_retry;
 	} else
 		retries = INT_MAX;
 	CLNT_CONTROL(client, CLSET_RETRIES, &retries);
 
 	/*
 	 * For UDP, there are 2 timeouts:
 	 * - CLSET_RETRY_TIMEOUT sets the initial timeout for the timer
 	 *   that does a retransmit of an RPC request using the same socket
 	 *   and xid. This is what you normally want to do, since NFS
 	 *   servers depend on "same xid" for their Duplicate Request Cache.
 	 * - timeout specified in CLNT_CALL_MBUF(), which specifies when
 	 *   retransmits on the same socket should fail and a fresh socket
 	 *   created. Each of these timeouts counts as one CLSET_RETRIES,
 	 *   as set above.
 	 * Set the initial retransmit timeout for UDP. This timeout doesn't
 	 * exist for TCP and the following call just fails, which is ok.
 	 */
 	timo.tv_sec = nmp->nm_timeo / NFS_HZ;
 	timo.tv_usec = (nmp->nm_timeo % NFS_HZ) * 1000000 / NFS_HZ;
 	CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, &timo);
 
 	mtx_lock(&nmp->nm_mtx);
 	if (nmp->nm_client != NULL) {
 		/*
 		 * Someone else already connected.  Drop our reference and
 		 * do not touch "client" again: it may be gone once released.
 		 */
 		mtx_unlock(&nmp->nm_mtx);
 		CLNT_RELEASE(client);
 	} else {
 		nmp->nm_client = client;
 
 		/*
 		 * Protocols that do not require connections may be
 		 * optionally left unconnected for servers that reply from
 		 * a port other than NFS_PORT.
 		 */
 		if (!(nmp->nm_flag & NFSMNT_NOCONN)) {
 			mtx_unlock(&nmp->nm_mtx);
 			CLNT_CONTROL(client, CLSET_CONNECT, &one);
 		} else
 			mtx_unlock(&nmp->nm_mtx);
 	}
 
 	/* Restore current thread's credentials. */
 	td->td_ucred = origcred;
 
 	mtx_lock(&nmp->nm_mtx);
 	/* Initialize other non-zero congestion variables. */
 	nfs_init_rtt(nmp);
 	mtx_unlock(&nmp->nm_mtx);
 	return (0);
 }
 
 /*
  * NFS disconnect.  Clean up and unlink.
  */
 void
 nfs_disconnect(struct nfsmount *nmp)
 {
 	CLIENT *old;
 
 	/* Detach the client handle from the mount while holding nm_mtx. */
 	mtx_lock(&nmp->nm_mtx);
 	old = nmp->nm_client;
 	if (old != NULL)
 		nmp->nm_client = NULL;
 	mtx_unlock(&nmp->nm_mtx);
 
 	if (old == NULL)
 		return;
 
 	/* Tear the detached handle down with the mutex dropped. */
 	rpc_gss_secpurge_call(old);
 	CLNT_CLOSE(old);
 	CLNT_RELEASE(old);
 }
 
 /*
  * Disconnect the mount's RPC client.  This simply forwards to
  * nfs_disconnect() and adds nothing of its own.
  */
 void
 nfs_safedisconnect(struct nfsmount *nmp)
 {
 
 	nfs_disconnect(nmp);
 }
 
 /*
  * Build the RPC auth handle for a request made with "cred" on this mount.
  *
  * For the Kerberos 5 flavors an RPCSEC_GSS handle is looked up first; if
  * the mechanism OID cannot be resolved NULL is returned, and if no GSS
  * handle is found the function falls back to authunix_create(), which is
  * also what every other flavor uses.
  */
 static AUTH *
 nfs_getauth(struct nfsmount *nmp, struct ucred *cred)
 {
 	rpc_gss_service_t gss_svc;
 	AUTH *handle;
 
 	if (nmp->nm_secflavor == RPCSEC_GSS_KRB5 ||
 	    nmp->nm_secflavor == RPCSEC_GSS_KRB5I ||
 	    nmp->nm_secflavor == RPCSEC_GSS_KRB5P) {
 		/* Resolve the mechanism OID on first use. */
 		if (!nmp->nm_mech_oid &&
 		    !rpc_gss_mech_to_oid_call("kerberosv5", &nmp->nm_mech_oid))
 			return (NULL);
 
 		/* Map the flavor onto the matching GSS service level. */
 		switch (nmp->nm_secflavor) {
 		case RPCSEC_GSS_KRB5:
 			gss_svc = rpc_gss_svc_none;
 			break;
 		case RPCSEC_GSS_KRB5I:
 			gss_svc = rpc_gss_svc_integrity;
 			break;
 		default:
 			gss_svc = rpc_gss_svc_privacy;
 			break;
 		}
 
 		handle = rpc_gss_secfind_call(nmp->nm_client, cred,
 		    nmp->nm_principal, nmp->nm_mech_oid, gss_svc);
 		if (handle != NULL)
 			return (handle);
 	}
 
 	return (authunix_create(cred));
 }
 
 /*
  * Callback from the RPC code to generate up/down notifications.
  */
 
 struct nfs_feedback_arg {
 	struct nfsmount *nf_mount;
 	int		nf_lastmsg;	/* last tprintf */
 	int		nf_tprintfmsg;
 	struct thread	*nf_td;
 };
 
 /*
  * RPC feedback hook.  FEEDBACK_OK reports the server as alive again;
  * FEEDBACK_REXMIT2 and FEEDBACK_RECONNECT report it as not responding,
  * rate limited by nm_tprintf_delay.  Other types are ignored.
  */
 static void
 nfs_feedback(int type, int proc, void *arg)
 {
 	struct nfs_feedback_arg *fb = arg;
 	struct nfsmount *mnt = fb->nf_mount;
 	time_t uptime;
 
 	if (type == FEEDBACK_OK) {
 		nfs_up(mnt, fb->nf_td,
 		    "is alive again", NFSSTA_TIMEO, fb->nf_tprintfmsg);
 		return;
 	}
 	if (type != FEEDBACK_REXMIT2 && type != FEEDBACK_RECONNECT)
 		return;
 
 	/* Stay quiet until the delay since the last message has passed. */
 	uptime = time_uptime;
 	if (fb->nf_lastmsg + mnt->nm_tprintf_delay >= uptime)
 		return;
 
 	nfs_down(mnt, fb->nf_td,
 	    "not responding", 0, NFSSTA_TIMEO);
 	fb->nf_tprintfmsg = TRUE;
 	fb->nf_lastmsg = uptime;
 }
 
 /*
  * nfs_request - issue one NFS RPC and hand back the reply
  *	- reject the request if a forced unmount is in progress
  *	- connect the mount's client handle if it is not connected yet
  *	- pick the authentication handle and the RTT timer for procnum
  *	- send the request with CLNT_CALL_MBUF(), sleeping and retrying
  *	  when an NFSv3 server answers NFSERR_TRYLATER
  *	- on success return 0 with the reply chain in *mrp and the
  *	  current dissect position in *mdp and *dposp
  *	- on failure return an error value; NFSERR_RETERR is or'ed in
  *	  when an NFSv3 error reply is still handed back through *mrp
  * nb: always frees up mreq mbuf list
  */
 int
 nfs_request(struct vnode *vp, struct mbuf *mreq, int procnum,
     struct thread *td, struct ucred *cred, struct mbuf **mrp,
     struct mbuf **mdp, caddr_t *dposp)
 {
 	struct mbuf *mrep;
 	u_int32_t *tl;
 	struct nfsmount *nmp;
 	struct mbuf *md;
 	time_t waituntil;
 	caddr_t dpos;
 	int error = 0, timeo;
 	AUTH *auth = NULL;
 	enum nfs_rto_timer_t timer;
 	struct nfs_feedback_arg nf;
 	struct rpc_callextra ext;
 	enum clnt_stat stat;
 	struct timeval timo;
 
 	/* Reject requests while attempting a forced unmount. */
 	if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
 		m_freem(mreq);
 		return (ESTALE);
 	}
 	nmp = VFSTONFS(vp->v_mount);
 	bzero(&nf, sizeof(struct nfs_feedback_arg));
 	nf.nf_mount = nmp;
 	nf.nf_td = td;
 	/*
 	 * Backdate the last message time so that the first "not
 	 * responding" message is governed by nm_tprintf_initial_delay
 	 * rather than by nm_tprintf_delay.
 	 */
 	nf.nf_lastmsg = time_uptime -
 	    ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
 
 	/*
 	 * XXX if not already connected call nfs_connect now.  Longer
 	 * term, change nfs_mount to call nfs_connect unconditionally
 	 * and let clnt_reconnect_create handle reconnects.
 	 */
 	if (!nmp->nm_client)
 		nfs_connect(nmp);
 
 	auth = nfs_getauth(nmp, cred);
 	if (!auth) {
 		m_freem(mreq);
 		return (EACCES);
 	}
 	bzero(&ext, sizeof(ext));
 	ext.rc_auth = auth;
 
 	ext.rc_feedback = nfs_feedback;
 	ext.rc_feedback_arg = &nf;
 
 	/*
 	 * Use a conservative timeout for RPCs other than getattr,
 	 * lookup, read or write.  The justification for doing "other"
 	 * this way is that these RPCs happen so infrequently that
 	 * timer est. would probably be stale.  Also, since many of
 	 * these RPCs are non-idempotent, a conservative timeout is
 	 * desired.
 	 */
 	timer = nfs_rto_timer(procnum);
 	if (timer != NFS_DEFAULT_TIMER)
 		ext.rc_timers = &nmp->nm_timers[timer - 1];
 	else
 		ext.rc_timers = NULL;
 
 #ifdef KDTRACE_HOOKS
 	if (dtrace_nfsclient_nfs23_start_probe != NULL) {
 		uint32_t probe_id;
 		int probe_procnum;
 
 		if (nmp->nm_flag & NFSMNT_NFSV3) {
 			probe_id = nfsclient_nfs3_start_probes[procnum];
 			probe_procnum = procnum;
 		} else {
 			probe_id = nfsclient_nfs2_start_probes[procnum];
 			probe_procnum = nfsv2_procid[procnum];
 		}
 		if (probe_id != 0)
 			(dtrace_nfsclient_nfs23_start_probe)(probe_id, vp,
 			    mreq, cred, probe_procnum);
 	}
 #endif
 
 	nfsstats.rpcrequests++;
 tryagain:
 	/*
 	 * This timeout specifies when a new socket should be created,
 	 * along with new xid values. For UDP, this should be done
 	 * infrequently, since retransmits of RPC requests should normally
 	 * use the same xid.
 	 */
 	if (nmp->nm_sotype == SOCK_DGRAM) {
 		if ((nmp->nm_flag & NFSMNT_SOFT) != 0) {
 			/*
 			 * CLSET_RETRIES is set to 2, so this should be half
 			 * of the total timeout required.
 			 */
 			timeo = nmp->nm_retry * nmp->nm_timeo / 2;
 			if (timeo < 1)
 				timeo = 1;
 			timo.tv_sec = timeo / NFS_HZ;
 			timo.tv_usec = (timeo % NFS_HZ) * 1000000 / NFS_HZ;
 		} else {
 			/* For UDP hard mounts, use a large value. */
 			timo.tv_sec = NFS_MAXTIMEO / NFS_HZ;
 			timo.tv_usec = 0;
 		}
 	} else {
 		timo.tv_sec = nmp->nm_timeo / NFS_HZ;
 		timo.tv_usec = (nmp->nm_timeo % NFS_HZ) * 1000000 / NFS_HZ;
 	}
 	mrep = NULL;
 	stat = CLNT_CALL_MBUF(nmp->nm_client, &ext,
 	    (nmp->nm_flag & NFSMNT_NFSV3) ? procnum : nfsv2_procid[procnum],
 	    mreq, &mrep, timo);
 
 	/*
 	 * Translate the RPC status into an error value and account
 	 * for timeouts and invalid replies.
 	 */
 	if (stat == RPC_SUCCESS)
 		error = 0;
 	else if (stat == RPC_TIMEDOUT) {
 		nfsstats.rpctimeouts++;
 		error = ETIMEDOUT;
 	} else if (stat == RPC_VERSMISMATCH) {
 		nfsstats.rpcinvalid++;
 		error = EOPNOTSUPP;
 	} else if (stat == RPC_PROGVERSMISMATCH) {
 		nfsstats.rpcinvalid++;
 		error = EPROTONOSUPPORT;
 	} else if (stat == RPC_INTR) {
 		error = EINTR;
 	} else {
 		nfsstats.rpcinvalid++;
 		error = EACCES;
 	}
 	if (error)
 		goto nfsmout;
 
 	KASSERT(mrep != NULL, ("mrep shouldn't be NULL if no error\n"));
 
 	/*
 	 * Search for any mbufs that are not a multiple of 4 bytes long
 	 * or with m_data not longword aligned.
 	 * These could cause pointer alignment problems, so copy them to
 	 * well aligned mbufs.
 	 */
 	error = nfs_realign(&mrep, M_NOWAIT);
 	if (error == ENOMEM) {
 		/*
 		 * Leave through nfsmout so that mreq is freed, auth is
 		 * destroyed and the done probe fires, as on every other
 		 * failure path.
 		 */
 		m_freem(mrep);
 		nfsstats.rpcinvalid++;
 		goto nfsmout;
 	}
 
 	md = mrep;
 	dpos = mtod(mrep, caddr_t);
 	tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 	if (*tl != 0) {
 		error = fxdr_unsigned(int, *tl);
 		if ((nmp->nm_flag & NFSMNT_NFSV3) &&
 		    error == NFSERR_TRYLATER) {
 			/* Wait nfs3_jukebox_delay seconds, then resend. */
 			m_freem(mrep);
 			error = 0;
 			waituntil = time_second + nfs3_jukebox_delay;
 			while (time_second < waituntil)
 				(void)tsleep(&fake_wchan, PSOCK, "nqnfstry",
 				    hz);
 			goto tryagain;
 		}
 		/*
 		 * Make sure NFSERR_RETERR isn't bogusly set by a server
 		 * such as amd. (No actual NFS error has bit 31 set.)
 		 */
 		error &= ~NFSERR_RETERR;
 
 		/*
 		 * If the File Handle was stale, invalidate the lookup
 		 * cache, just in case.
 		 */
 		if (error == ESTALE)
 			nfs_purgecache(vp);
 		/*
 		 * Skip wcc data on non-ENOENT NFS errors for now.
 		 * NetApp filers return corrupt postop attrs in the
 		 * wcc data for NFS err EROFS.  Not sure if they could
 		 * return corrupt postop attrs for others errors.
 		 * Blocking ENOENT post-op attributes breaks negative
 		 * name caching, so always allow it through.
 		 */
 		if ((nmp->nm_flag & NFSMNT_NFSV3) &&
 		    (!nfs_skip_wcc_data_onerr || error == ENOENT)) {
 			*mrp = mrep;
 			*mdp = md;
 			*dposp = dpos;
 			error |= NFSERR_RETERR;
 		} else
 			m_freem(mrep);
 		goto nfsmout;
 	}
 
 #ifdef KDTRACE_HOOKS
 	if (dtrace_nfsclient_nfs23_done_probe != NULL) {
 		uint32_t probe_id;
 		int probe_procnum;
 
 		if (nmp->nm_flag & NFSMNT_NFSV3) {
 			probe_id = nfsclient_nfs3_done_probes[procnum];
 			probe_procnum = procnum;
 		} else {
 			probe_id = nfsclient_nfs2_done_probes[procnum];
 			probe_procnum = (nmp->nm_flag & NFSMNT_NFSV3) ?
 			    procnum : nfsv2_procid[procnum];
 		}
 		if (probe_id != 0)
 			(dtrace_nfsclient_nfs23_done_probe)(probe_id, vp,
 			    mreq, cred, probe_procnum, 0);
 	}
 #endif
 	m_freem(mreq);
 	*mrp = mrep;
 	*mdp = md;
 	*dposp = dpos;
 	AUTH_DESTROY(auth);
 	return (0);
 
 	/* Common failure exit: fire the done probe, then release resources. */
 nfsmout:
 #ifdef KDTRACE_HOOKS
 	if (dtrace_nfsclient_nfs23_done_probe != NULL) {
 		uint32_t probe_id;
 		int probe_procnum;
 
 		if (nmp->nm_flag & NFSMNT_NFSV3) {
 			probe_id = nfsclient_nfs3_done_probes[procnum];
 			probe_procnum = procnum;
 		} else {
 			probe_id = nfsclient_nfs2_done_probes[procnum];
 			probe_procnum = (nmp->nm_flag & NFSMNT_NFSV3) ?
 			    procnum : nfsv2_procid[procnum];
 		}
 		if (probe_id != 0)
 			(dtrace_nfsclient_nfs23_done_probe)(probe_id, vp,
 			    mreq, cred, probe_procnum, error);
 	}
 #endif
 	m_freem(mreq);
 	if (auth)
 		AUTH_DESTROY(auth);
 	return (error);
 }
 
 /*
  * Close the mount's RPC client handle, if it has one.  This is used
  * by forced unmounts to terminate any outstanding RPCs.
  * NOTE(review): presumably CLNT_CLOSE() is what aborts the calls in
  * flight on the handle -- confirm against the RPC client code.
  * Always returns 0.
  */
 int
 nfs_nmcancelreqs(struct nfsmount *nmp)
 {
 
 	if (nmp->nm_client)
 		CLNT_CLOSE(nmp->nm_client);
 	return (0);
 }
 
 /*
  * Any signal that can interrupt an NFS operation in an intr mount
  * should be added to this set.  SIGSTOP and SIGKILL cannot be masked.
  */
 /*
  * Scanned by nfs_sig_pending() and left unblocked by nfs_set_sigmask()
  * unless the thread already masks, or the process ignores, the signal.
  */
 int nfs_sig_set[] = {
 	SIGINT,
 	SIGTERM,
 	SIGHUP,
 	SIGKILL,
 	SIGQUIT
 };
 
 /*
  * Check to see if one of the signals in our subset is pending on
  * the process (in an intr mount).
  */
 static int
 nfs_sig_pending(sigset_t set)
 {
 	const int *sigp, *end;
 
 	/* Walk nfs_sig_set and stop at the first signal found in set. */
 	end = nfs_sig_set + sizeof(nfs_sig_set) / sizeof(nfs_sig_set[0]);
 	for (sigp = nfs_sig_set; sigp != end; sigp++) {
 		if (SIGISMEMBER(set, *sigp))
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * The set/restore sigmask functions are used to (temporarily) overwrite
  * the thread td_sigmask during an RPC call (for example).  These are also
  * used in other places in the NFS client that might tsleep().
  */
 void
 nfs_set_sigmask(struct thread *td, sigset_t *oldset)
 {
 	struct proc *proc;
 	sigset_t mask;
 	int idx, signo;
 
 	if (td == NULL)
 		td = curthread; /* XXX */
 	proc = td->td_proc;
 
 	/* Start by blocking everything, then open up the NFS signals. */
 	SIGFILLSET(mask);
 	PROC_LOCK(proc);
 	mtx_lock(&proc->p_sigacts->ps_mtx);
 	for (idx = 0; idx < sizeof(nfs_sig_set) / sizeof(int); idx++) {
 		signo = nfs_sig_set[idx];
 		/*
 		 * A signal that the thread already masks, or that the
 		 * process ignores, stays blocked in the temporary mask.
 		 */
 		if (SIGISMEMBER(td->td_sigmask, signo) ||
 		    SIGISMEMBER(proc->p_sigacts->ps_sigignore, signo))
 			continue;
 		SIGDELSET(mask, signo);
 	}
 	mtx_unlock(&proc->p_sigacts->ps_mtx);
 
 	/* Install the new mask; the previous one is stored in *oldset. */
 	kern_sigprocmask(td, SIG_SETMASK, &mask, oldset,
 	    SIGPROCMASK_PROC_LOCKED);
 	PROC_UNLOCK(proc);
 }
 
 /*
  * Install *set as the thread's signal mask, undoing nfs_set_sigmask().
  * A NULL td means the current thread.
  */
 void
 nfs_restore_sigmask(struct thread *td, sigset_t *set)
 {
 	if (td == NULL)
 		td = curthread; /* XXX */
 	kern_sigprocmask(td, SIG_SETMASK, set, NULL, 0);
 }
 
 /*
  * NFS wrapper to msleep(), that shoves a new p_sigmask and restores the
  * old one after msleep() returns.
  */
 /*
  * Sleep via msleep() and return its result.  When PCATCH is set in
  * priority, the NFS signal mask is installed around the sleep and the
  * previous mask is restored afterwards; otherwise msleep() is called
  * directly.  A NULL td means the current thread.
  */
 int
 nfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority,
     char *wmesg, int timo)
 {
 	sigset_t oldset;
 	int error;
 
 	/* Signals cannot interrupt this sleep: leave the mask alone. */
 	if ((priority & PCATCH) == 0)
 		return (msleep(ident, mtx, priority, wmesg, timo));
 	if (td == NULL)
 		td = curthread; /* XXX */
 	nfs_set_sigmask(td, &oldset);
 	error = msleep(ident, mtx, priority, wmesg, timo);
 	nfs_restore_sigmask(td, &oldset);
 	return (error);
 }
 
 /*
  * Test for a termination condition pending on the process.
  * This is used for NFSMNT_INT mounts.
  */
 int
 nfs_sigintr(struct nfsmount *nmp, struct thread *td)
 {
 	struct proc *proc;
 	sigset_t pending;
 	int interrupted;
 
 	/* Terminate all requests while attempting a forced unmount. */
 	if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
 		return (EIO);
 	/* Only NFSMNT_INT mounts with a known thread are checked. */
 	if ((nmp->nm_flag & NFSMNT_INT) == 0 || td == NULL)
 		return (0);
 
 	proc = td->td_proc;
 	PROC_LOCK(proc);
 	/* Pending on process or thread, minus masked and ignored signals. */
 	pending = proc->p_siglist;
 	SIGSETOR(pending, td->td_siglist);
 	SIGSETNAND(pending, td->td_sigmask);
 	mtx_lock(&proc->p_sigacts->ps_mtx);
 	SIGSETNAND(pending, proc->p_sigacts->ps_sigignore);
 	mtx_unlock(&proc->p_sigacts->ps_mtx);
 	interrupted = (SIGNOTEMPTY(proc->p_siglist) ||
 	    SIGNOTEMPTY(td->td_siglist)) && nfs_sig_pending(pending);
 	PROC_UNLOCK(proc);
 
 	return (interrupted ? EINTR : 0);
 }
 
 /*
  * tprintf() a message about an NFS server, appending the error number
  * when it is non-zero.  A NULL td is passed on as a NULL process.
  * Always returns 0.
  */
 static int
 nfs_msg(struct thread *td, const char *server, const char *msg, int error)
 {
 	struct proc *proc = NULL;
 
 	if (td != NULL)
 		proc = td->td_proc;
 	if (error == 0) {
 		tprintf(proc, LOG_INFO, "nfs server %s: %s\n", server, msg);
 		return (0);
 	}
 	tprintf(proc, LOG_INFO, "nfs server %s: %s, error %d\n", server,
 	    msg, error);
 	return (0);
 }
 
 /*
  * Record that the server stopped responding.  Each state bit named in
  * flags that is not yet set in nm_state is set and a VFS event is
  * signalled for it; the message is then reported through nfs_msg().
  */
 static void
 nfs_down(struct nfsmount *nmp, struct thread *td, const char *msg,
     int error, int flags)
 {
 	int notify;
 
 	if (nmp == NULL)
 		return;
 
 	/* Decide under the mutex, signal once it has been dropped. */
 	mtx_lock(&nmp->nm_mtx);
 	notify = (flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO);
 	if (notify)
 		nmp->nm_state |= NFSSTA_TIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (notify)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESP, 0);
 
 	mtx_lock(&nmp->nm_mtx);
 	notify = (flags & NFSSTA_LOCKTIMEO) &&
 	    !(nmp->nm_state & NFSSTA_LOCKTIMEO);
 	if (notify)
 		nmp->nm_state |= NFSSTA_LOCKTIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (notify)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESPLOCK, 0);
 
 	nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
 }
 
 /*
  * Record that the server is responding again.  The message is printed
  * only when tprintfmsg is set; each state bit named in flags that is
  * currently set in nm_state is cleared and a VFS event is signalled.
  */
 static void
 nfs_up(struct nfsmount *nmp, struct thread *td, const char *msg,
     int flags, int tprintfmsg)
 {
 	int notify;
 
 	if (nmp == NULL)
 		return;
 	if (tprintfmsg)
 		nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
 
 	/* Decide under the mutex, signal once it has been dropped. */
 	mtx_lock(&nmp->nm_mtx);
 	notify = (flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO);
 	if (notify)
 		nmp->nm_state &= ~NFSSTA_TIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (notify)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESP, 1);
 
 	mtx_lock(&nmp->nm_mtx);
 	notify = (flags & NFSSTA_LOCKTIMEO) &&
 	    (nmp->nm_state & NFSSTA_LOCKTIMEO);
 	if (notify)
 		nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (notify)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESPLOCK, 1);
 }
Index: head/sys/nfsclient/nfs_subs.c
===================================================================
--- head/sys/nfsclient/nfs_subs.c	(revision 258540)
+++ head/sys/nfsclient/nfs_subs.c	(revision 258541)
@@ -1,1142 +1,1140 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)nfs_subs.c  8.8 (Berkeley) 5/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * These functions support the macros and help fiddle mbuf chains for
  * the nfs op functions. They do things like create the rpc header and
  * copy data between mbuf chains and uio lists.
  */
 
-#include "opt_kdtrace.h"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/rwlock.h>
 #include <sys/sysent.h>
 #include <sys/syscall.h>
 #include <sys/sysproto.h>
 #include <sys/taskqueue.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <nfs/nfsproto.h>
 #include <nfsclient/nfs.h>
 #include <nfsclient/nfsnode.h>
 #include <nfs/nfs_kdtrace.h>
 #include <nfs/xdr_subs.h>
 #include <nfsclient/nfsm_subs.h>
 #include <nfsclient/nfsmount.h>
 
 #include <netinet/in.h>
 
 /*
  * Note that stdarg.h and the ANSI style va_start macro is used for both
  * ANSI and traditional C compilers.
  */
 #include <machine/stdarg.h>
 
 /*
  * Hook pointers and probe ids for the attribute cache DTrace probes
  * (flush, get-hit, get-miss and load).
  * NOTE(review): presumably filled in by the DTrace provider when it
  * loads -- confirm against nfs_kdtrace.
  */
 #ifdef KDTRACE_HOOKS
 dtrace_nfsclient_attrcache_flush_probe_func_t
     dtrace_nfsclient_attrcache_flush_done_probe;
 uint32_t nfsclient_attrcache_flush_done_id;
 
 dtrace_nfsclient_attrcache_get_hit_probe_func_t
     dtrace_nfsclient_attrcache_get_hit_probe;
 uint32_t nfsclient_attrcache_get_hit_id;
 
 dtrace_nfsclient_attrcache_get_miss_probe_func_t
     dtrace_nfsclient_attrcache_get_miss_probe;
 uint32_t nfsclient_attrcache_get_miss_id;
 
 dtrace_nfsclient_attrcache_load_probe_func_t
     dtrace_nfsclient_attrcache_load_done_probe;
 uint32_t nfsclient_attrcache_load_done_id;
 #endif /* KDTRACE_HOOKS */
 
 /*
  * Data items converted to xdr at startup (see nfs_init()), since they
  * are constant.  This is kinda hokey, but may save a little time doing
  * byte swaps.
  */
 u_int32_t	nfs_xdrneg1;
 u_int32_t	nfs_true, nfs_false;
 
 /* And other global data */
 /* Last xid handed out by nfs_xid_gen(); 0 until the first call. */
 static u_int32_t nfs_xid = 0;
 /* NOTE(review): presumably maps NFSv2 file types to vnode types. */
 static enum vtype nv2tov_type[8]= {
 	VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON,  VNON
 };
 
 /* Computed in nfs_init() from hz and NFS_TICKINTVL, never below 1. */
 int		nfs_ticks;
 int		nfs_pbuf_freecnt = -1;	/* start out unlimited */
 
 struct nfs_bufq	nfs_bufq;
 /* Held by nfs_xid_gen() while it updates nfs_xid. */
 static struct mtx nfs_xid_mtx;
 /* Set up in nfs_init() to run nfs_nfsiodnew_tq. */
 struct task	nfs_nfsiodnew_task;
 
 /*
  * and the reverse mapping from generic to Version 2 procedure numbers
  */
 /*
  * Indexed by the generic procedure number.
  * NOTE(review): NFSV2PROC_NOOP entries presumably mark procedures
  * that have no NFSv2 counterpart -- confirm against nfsproto.h.
  */
 int nfsv2_procid[NFS_NPROCS] = {
 	NFSV2PROC_NULL,
 	NFSV2PROC_GETATTR,
 	NFSV2PROC_SETATTR,
 	NFSV2PROC_LOOKUP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_READLINK,
 	NFSV2PROC_READ,
 	NFSV2PROC_WRITE,
 	NFSV2PROC_CREATE,
 	NFSV2PROC_MKDIR,
 	NFSV2PROC_SYMLINK,
 	NFSV2PROC_CREATE,
 	NFSV2PROC_REMOVE,
 	NFSV2PROC_RMDIR,
 	NFSV2PROC_RENAME,
 	NFSV2PROC_LINK,
 	NFSV2PROC_READDIR,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_STATFS,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 };
 
 LIST_HEAD(nfsnodehashhead, nfsnode);
 
 u_int32_t
 nfs_xid_gen(void)
 {
 	uint32_t xid;
 
 	mtx_lock(&nfs_xid_mtx);
 
 	/* Get a pretty random xid to start with */
 	if (!nfs_xid)
 		nfs_xid = random();
 	/*
 	 * Skip zero xid if it should ever happen.
 	 */
 	if (++nfs_xid == 0)
 		nfs_xid++;
 	xid = nfs_xid;
 	mtx_unlock(&nfs_xid_mtx);
 	return xid;
 }
 
 /*
  * Copies siz bytes from a uio scatter/gather list onto the end of the
  * mbuf chain *mq, allocating further mbufs (clusters when siz > MLEN)
  * as needed, and zero pads the data to the nfsm_rndup() boundary.
  * On return *mq is the last mbuf written and *bpos points just past
  * the data.  Always returns 0.
  * NOTE: can only handle iovcnt == 1
  */
 int
 nfsm_uiotombuf(struct uio *uiop, struct mbuf **mq, int siz, caddr_t *bpos)
 {
 	char *uiocp;
 	struct mbuf *mp, *mp2;
 	int xfer, left, mlen;
 	int uiosiz, clflg, rem;
 	char *cp;
 
 	KASSERT(uiop->uio_iovcnt == 1, ("nfsm_uiotombuf: iovcnt != 1"));
 
 	if (siz > MLEN)		/* or should it >= MCLBYTES ?? */
 		clflg = 1;
 	else
 		clflg = 0;
 	/* Number of pad bytes needed after the data. */
 	rem = nfsm_rndup(siz)-siz;
 	mp = mp2 = *mq;
 	while (siz > 0) {
 		left = uiop->uio_iov->iov_len;
 		uiocp = uiop->uio_iov->iov_base;
 		if (left > siz)
 			left = siz;
 		uiosiz = left;
 		while (left > 0) {
 			mlen = M_TRAILINGSPACE(mp);
 			if (mlen == 0) {
 				/* Current mbuf is full: chain a new one. */
 				if (clflg)
 					mp = m_getcl(M_WAITOK, MT_DATA, 0);
 				else
 					mp = m_get(M_WAITOK, MT_DATA);
 				mp2->m_next = mp;
 				mp2 = mp;
 				mlen = M_TRAILINGSPACE(mp);
 			}
 			xfer = (left > mlen) ? mlen : left;
 #ifdef notdef
 			/* Not Yet.. */
 			if (uiop->uio_iov->iov_op != NULL)
 				(*(uiop->uio_iov->iov_op))
 				(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);
 			else
 #endif
 			if (uiop->uio_segflg == UIO_SYSSPACE)
 				bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);
 			else
 				/*
 				 * NOTE(review): the copyin() result is not
 				 * checked, so a faulting user address is
 				 * not reported to the caller.
 				 */
 				copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);
 			mp->m_len += xfer;
 			left -= xfer;
 			uiocp += xfer;
 			uiop->uio_offset += xfer;
 			uiop->uio_resid -= xfer;
 		}
 		/* Advance the iovec past what was consumed. */
 		uiop->uio_iov->iov_base =
 		    (char *)uiop->uio_iov->iov_base + uiosiz;
 		uiop->uio_iov->iov_len -= uiosiz;
 		siz -= uiosiz;
 	}
 	if (rem > 0) {
 		/* Zero fill up to the rounded length. */
 		if (rem > M_TRAILINGSPACE(mp)) {
 			mp = m_get(M_WAITOK, MT_DATA);
 			mp2->m_next = mp;
 		}
 		cp = mtod(mp, caddr_t)+mp->m_len;
 		for (left = 0; left < rem; left++)
 			*cp++ = '\0';
 		mp->m_len += rem;
 		*bpos = cp;
 	} else
 		*bpos = mtod(mp, caddr_t)+mp->m_len;
 	*mq = mp;
 	return (0);
 }
 
 /*
  * Copy a string into mbufs for the hard cases...
  */
 /*
  * Append the length word txdr_unsigned(siz) followed by the siz bytes
  * at cp to the mbuf chain *mb, adding mbufs as needed.  On return *mb
  * is the last mbuf added and *bpos points just past its data.
  * Always returns 0.
  */
 int
 nfsm_strtmbuf(struct mbuf **mb, char **bpos, const char *cp, long siz)
 {
 	struct mbuf *m1 = NULL, *m2;
 	long left, xfer, len, tlen;
 	u_int32_t *tl;
 	int putsize;
 
 	/* putsize stays 1 until the length word has been written. */
 	putsize = 1;
 	m2 = *mb;
 	left = M_TRAILINGSPACE(m2);
 	if (left > 0) {
 		/* Put the length, and what fits of the string, in place. */
 		tl = ((u_int32_t *)(*bpos));
 		*tl++ = txdr_unsigned(siz);
 		putsize = 0;
 		left -= NFSX_UNSIGNED;
 		m2->m_len += NFSX_UNSIGNED;
 		if (left > 0) {
 			/*
 			 * NOTE(review): copies `left' bytes without
 			 * comparing against siz; presumably callers only
 			 * get here when the string does not fit -- confirm.
 			 */
 			bcopy(cp, (caddr_t) tl, left);
 			siz -= left;
 			cp += left;
 			m2->m_len += left;
 			left = 0;
 		}
 	}
 	/* Loop around adding mbufs */
 	while (siz > 0) {
 		if (siz > MLEN) {
 			m1 = m_getcl(M_WAITOK, MT_DATA, 0);
 			m1->m_len = MCLBYTES;
 		} else {
 			m1 = m_get(M_WAITOK, MT_DATA);
 			m1->m_len = MLEN;
 		}
 		m2->m_next = m1;
 		m2 = m1;
 		tl = mtod(m1, u_int32_t *);
 		tlen = 0;
 		if (putsize) {
 			*tl++ = txdr_unsigned(siz);
 			m1->m_len -= NFSX_UNSIGNED;
 			tlen = NFSX_UNSIGNED;
 			putsize = 0;
 		}
 		if (siz < m1->m_len) {
 			/* Last piece: zero the final word before copying. */
 			len = nfsm_rndup(siz);
 			xfer = siz;
 			if (xfer < len)
 				*(tl+(xfer>>2)) = 0;
 		} else {
 			xfer = len = m1->m_len;
 		}
 		bcopy(cp, (caddr_t) tl, xfer);
 		m1->m_len = len+tlen;
 		siz -= xfer;
 		cp += xfer;
 	}
 	/*
 	 * NOTE(review): m1 is still NULL here if the loop above never
 	 * ran; presumably callers guarantee it runs -- confirm.
 	 */
 	*mb = m1;
 	*bpos = mtod(m1, caddr_t)+m1->m_len;
 	return (0);
 }
 
 /*
  * Called once to initialize data structures...
  */
 int
 nfs_init(struct vfsconf *vfsp)
 {
 	int i;
 
 	/* Zone that struct nfsmount allocations come from. */
 	nfsmount_zone = uma_zcreate("NFSMOUNT", sizeof(struct nfsmount),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/* Precompute the XDR forms of the constant values. */
 	nfs_true = txdr_unsigned(TRUE);
 	nfs_false = txdr_unsigned(FALSE);
 	nfs_xdrneg1 = txdr_unsigned(-1);
 	/* NFS_TICKINTVL scaled by hz, rounded, at least one tick. */
 	nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000;
 	if (nfs_ticks < 1)
 		nfs_ticks = 1;
 	/* Ensure async daemons disabled */
 	for (i = 0; i < NFS_MAXASYNCDAEMON; i++) {
 		nfs_iodwant[i] = NFSIOD_NOT_AVAILABLE;
 		nfs_iodmount[i] = NULL;
 	}
 	nfs_nhinit();			/* Init the nfsnode table */
 
 	/*
 	 * Initialize the nfsiod and xid mutexes and the task that runs
 	 * nfs_nfsiodnew_tq.
 	 */
 	mtx_init(&nfs_iod_mtx, "NFS iod lock", NULL, MTX_DEF);
 	mtx_init(&nfs_xid_mtx, "NFS xid lock", NULL, MTX_DEF);
 	TASK_INIT(&nfs_nfsiodnew_task, 0, nfs_nfsiodnew_tq, NULL);
 
 	nfs_pbuf_freecnt = nswbuf / 2 + 1;
 
 	return (0);
 }
 
 /*
  * Undo nfs_init(): stop the nfsiod threads, wait for the last one to
  * exit, then tear down the nfsnode table and the nfsmount zone.
  * Always returns 0.
  */
 int
 nfs_uninit(struct vfsconf *vfsp)
 {
 	int i;
 
 	/*
 	 * Tell all nfsiod processes to exit. Clear nfs_iodmax, and wakeup
 	 * any sleeping nfsiods so they check nfs_iodmax and exit.
 	 * Drain nfsiodnew task before we wait for them to finish.
 	 */
 	mtx_lock(&nfs_iod_mtx);
 	nfs_iodmax = 0;
 	mtx_unlock(&nfs_iod_mtx);
 	/* The drain runs with nfs_iod_mtx released. */
 	taskqueue_drain(taskqueue_thread, &nfs_nfsiodnew_task);
 	mtx_lock(&nfs_iod_mtx);
 	for (i = 0; i < nfs_numasync; i++)
 		if (nfs_iodwant[i] == NFSIOD_AVAILABLE)
 			wakeup(&nfs_iodwant[i]);
 	/* The last nfsiod to exit will wake us up when nfs_numasync hits 0 */
 	while (nfs_numasync)
 		msleep(&nfs_numasync, &nfs_iod_mtx, PWAIT, "ioddie", 0);
 	mtx_unlock(&nfs_iod_mtx);
 	nfs_nhuninit();
 	uma_zdestroy(nfsmount_zone);
 	return (0);
 }
 
 /*
  * Take the directory cookie lock: sleep on n_flag until NDIRCOOKIELK
  * is clear, then set it.
  */
 void
 nfs_dircookie_lock(struct nfsnode *np)
 {
 	mtx_lock(&np->n_mtx);
 	for (;;) {
 		if ((np->n_flag & NDIRCOOKIELK) == 0)
 			break;
 		(void) msleep(&np->n_flag, &np->n_mtx, PZERO, "nfsdirlk", 0);
 	}
 	np->n_flag |= NDIRCOOKIELK;
 	mtx_unlock(&np->n_mtx);
 }
 
 /*
  * Release the directory cookie lock: clear NDIRCOOKIELK and wake up
  * the threads sleeping on n_flag in nfs_dircookie_lock().
  */
 void 
 nfs_dircookie_unlock(struct nfsnode *np)
 {
 	mtx_lock(&np->n_mtx);
 	np->n_flag &= ~NDIRCOOKIELK;
 	wakeup(&np->n_flag);
 	mtx_unlock(&np->n_mtx);
 }
 
 /*
  * Make sure the vnode lock is held exclusively, upgrading a shared
  * lock when necessary.  Returns the lock state found on entry so that
  * nfs_downgrade_vnlock() can restore it.
  */
 int
 nfs_upgrade_vnlock(struct vnode *vp)
 {
 	int prev_lock;
 
 	ASSERT_VOP_LOCKED(vp, "nfs_upgrade_vnlock");
 	prev_lock = VOP_ISLOCKED(vp);
 	if (prev_lock == LK_EXCLUSIVE)
 		return (prev_lock);
 
 	KASSERT(prev_lock == LK_SHARED,
 	    ("nfs_upgrade_vnlock: wrong old_lock %d", prev_lock));
 	/* Upgrade to exclusive lock, this might block */
 	vn_lock(vp, LK_UPGRADE | LK_RETRY);
 	return (prev_lock);
 }
 
 /*
  * Downgrade the vnode lock unless old_lock, the state reported by
  * nfs_upgrade_vnlock(), was already exclusive.
  */
 void
 nfs_downgrade_vnlock(struct vnode *vp, int old_lock)
 {
 	if (old_lock == LK_EXCLUSIVE)
 		return;
 
 	KASSERT(old_lock == LK_SHARED, ("wrong old_lock %d", old_lock));
 	/* Downgrade from exclusive lock. */
 	vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
 }
 
 /*
  * printf()-style output through vprintf(), made with the Giant mutex
  * held for the duration of the call.
  */
 void
 nfs_printf(const char *fmt, ...)
 {
 	va_list ap;
 
 	mtx_lock(&Giant);
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	mtx_unlock(&Giant);
 }
 
 /*
  * Attribute cache routines.
  * nfs_loadattrcache() - loads or updates the cache contents from attributes
  *	that are on the mbuf list
  * nfs_getattrcache() - returns valid attributes if found in cache, returns
  *	error otherwise
  */
 
 /*
  * Load the attribute cache (that lives in the nfsnode entry) with
  * the values on the mbuf list and,
  * if vaper is not NULL,
  *    copy the attributes to *vaper.
  * When dontshrink is set, a smaller size reported for a regular file
  * is not applied and the cached attributes are marked stale instead.
  * Returns 0, or EBADRPC when the attributes cannot be dissected from
  * the mbuf list.
  */
 int
 nfs_loadattrcache(struct vnode **vpp, struct mbuf **mdp, caddr_t *dposp,
 		  struct vattr *vaper, int dontshrink)
 {
 	struct vnode *vp = *vpp;
 	struct vattr *vap;
 	struct nfs_fattr *fp;
 	struct nfsnode *np = NULL;
 	int32_t t1;
 	caddr_t cp2;
 	int rdev;
 	struct mbuf *md;
 	enum vtype vtyp;
 	u_short vmode;
 	struct timespec mtime, mtime_save;
 	int v3 = NFS_ISV3(vp);
 	int error = 0;
 	u_quad_t nsize;
 	int setnsize;
 
 	md = *mdp;
 	/* Bytes left in the current mbuf past the dissect position. */
 	t1 = (mtod(md, caddr_t) + md->m_len) - *dposp;
 	cp2 = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, M_WAITOK);
 	if (cp2 == NULL) {
 		error = EBADRPC;
 		goto out;
 	}
 	fp = (struct nfs_fattr *)cp2;
 	if (v3) {
 		vtyp = nfsv3tov_type(fp->fa_type);
 		vmode = fxdr_unsigned(u_short, fp->fa_mode);
 		rdev = makedev(fxdr_unsigned(int, fp->fa3_rdev.specdata1),
 			fxdr_unsigned(int, fp->fa3_rdev.specdata2));
 		fxdr_nfsv3time(&fp->fa3_mtime, &mtime);
 	} else {
 		vtyp = nfsv2tov_type(fp->fa_type);
 		vmode = fxdr_unsigned(u_short, fp->fa_mode);
 		/*
 		 * XXX
 		 *
 		 * The duplicate information returned in fa_type and fa_mode
 		 * is an ambiguity in the NFS version 2 protocol.
 		 *
 		 * VREG should be taken literally as a regular file.  If a
 		 * server intends to return some type information differently
 		 * in the upper bits of the mode field (e.g. for sockets, or
 		 * FIFOs), NFSv2 mandates fa_type to be VNON.  Anyway, we
 		 * leave the examination of the mode bits even in the VREG
 		 * case to avoid breakage for bogus servers, but we make sure
 		 * that there are actually type bits set in the upper part of
 		 * fa_mode (and failing that, trust the va_type field).
 		 *
 		 * NFSv3 cleared the issue, and requires fa_mode to not
 		 * contain any type information (while also introducing
 		 * sockets and FIFOs for fa_type).
 		 */
 		if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0))
 			vtyp = IFTOVT(vmode);
 		rdev = fxdr_unsigned(int32_t, fp->fa2_rdev);
 		fxdr_nfsv2time(&fp->fa2_mtime, &mtime);
 
 		/*
 		 * Really ugly NFSv2 kludge.
 		 */
 		if (vtyp == VCHR && rdev == 0xffffffff)
 			vtyp = VFIFO;
 	}
 
 	/*
 	 * If v_type == VNON it is a new node, so fill in the v_type,
 	 * n_mtime fields. Check to see if it represents a special
 	 * device, and if so, check for a possible alias. Once the
 	 * correct vnode has been obtained, fill in the rest of the
 	 * information.
 	 */
 	np = VTONFS(vp);
 	mtx_lock(&np->n_mtx);
 	if (vp->v_type != vtyp) {
 		vp->v_type = vtyp;
 		if (vp->v_type == VFIFO)
 			vp->v_op = &nfs_fifoops;
 		np->n_mtime = mtime;
 	}
 	vap = &np->n_vattr;
 	vap->va_type = vtyp;
 	vap->va_mode = (vmode & 07777);
 	vap->va_rdev = rdev;
 	/* Keep the previous mtime for the backwards check further down. */
 	mtime_save = vap->va_mtime;
 	vap->va_mtime = mtime;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	if (v3) {
 		vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink);
 		vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid);
 		vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid);
 		vap->va_size = fxdr_hyper(&fp->fa3_size);
 		vap->va_blocksize = NFS_FABLKSIZE;
 		vap->va_bytes = fxdr_hyper(&fp->fa3_used);
 		vap->va_fileid = fxdr_unsigned(int32_t,
 		    fp->fa3_fileid.nfsuquad[1]);
 		fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime);
 		fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime);
 		vap->va_flags = 0;
 		vap->va_filerev = 0;
 	} else {
 		vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink);
 		vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid);
 		vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid);
 		vap->va_size = fxdr_unsigned(u_int32_t, fp->fa2_size);
 		vap->va_blocksize = fxdr_unsigned(int32_t, fp->fa2_blocksize);
 		vap->va_bytes = (u_quad_t)fxdr_unsigned(int32_t, fp->fa2_blocks)
 		    * NFS_FABLKSIZE;
 		vap->va_fileid = fxdr_unsigned(int32_t, fp->fa2_fileid);
 		fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime);
 		vap->va_flags = 0;
 		vap->va_ctime.tv_sec = fxdr_unsigned(u_int32_t,
 		    fp->fa2_ctime.nfsv2_sec);
 		vap->va_ctime.tv_nsec = 0;
 		vap->va_gen = fxdr_unsigned(u_int32_t, fp->fa2_ctime.nfsv2_usec);
 		vap->va_filerev = 0;
 	}
 	np->n_attrstamp = time_second;
 	/* setnsize/nsize carry a deferred vnode_pager_setsize() call. */
 	setnsize = 0;
 	nsize = 0;
 	if (vap->va_size != np->n_size) {
 		if (vap->va_type == VREG) {
 			if (dontshrink && vap->va_size < np->n_size) {
 				/*
 				 * We've been told not to shrink the file;
 				 * zero np->n_attrstamp to indicate that
 				 * the attributes are stale.
 				 */
 				vap->va_size = np->n_size;
 				np->n_attrstamp = 0;
 				KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 				vnode_pager_setsize(vp, np->n_size);
 			} else if (np->n_flag & NMODIFIED) {
 				/*
 				 * We've modified the file: Use the larger
 				 * of our size, and the server's size.
 				 */
 				if (vap->va_size < np->n_size) {
 					vap->va_size = np->n_size;
 				} else {
 					np->n_size = vap->va_size;
 					np->n_flag |= NSIZECHANGED;
 				}
 				vnode_pager_setsize(vp, np->n_size);
 			} else if (vap->va_size < np->n_size) {
 				/*
 				 * When shrinking the size, the call to
 				 * vnode_pager_setsize() cannot be done
 				 * with the mutex held, so delay it until
 				 * after the mtx_unlock call.
 				 */
 				nsize = np->n_size = vap->va_size;
 				np->n_flag |= NSIZECHANGED;
 				setnsize = 1;
 			} else {
 				np->n_size = vap->va_size;
 				np->n_flag |= NSIZECHANGED;
 				vnode_pager_setsize(vp, np->n_size);
 			}
 		} else {
 			np->n_size = vap->va_size;
 		}
 	}
 	/*
 	 * The following checks are added to prevent a race between (say)
 	 * a READDIR+ and a WRITE.
 	 * READDIR+, WRITE requests sent out.
 	 * READDIR+ resp, WRITE resp received on client.
 	 * However, the WRITE resp was handled before the READDIR+ resp
 	 * causing the post op attrs from the write to be loaded first
 	 * and the attrs from the READDIR+ to be loaded later. If this
 	 * happens, we have stale attrs loaded into the attrcache.
 	 * We detect this by checking for the mtime moving back. We
 	 * invalidate the attrcache when this happens.
 	 */
 	if (timespeccmp(&mtime_save, &vap->va_mtime, >)) {
 		/* mtime went backwards */
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	}
 	if (vaper != NULL) {
 		bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap));
 		/* Report the locally recorded access/update times instead. */
 		if (np->n_flag & NCHG) {
 			if (np->n_flag & NACC)
 				vaper->va_atime = np->n_atim;
 			if (np->n_flag & NUPD)
 				vaper->va_mtime = np->n_mtim;
 		}
 	}
 
 #ifdef KDTRACE_HOOKS
 	/* Only report a load when the attributes were left valid. */
 	if (np->n_attrstamp != 0)
 		KDTRACE_NFS_ATTRCACHE_LOAD_DONE(vp, &np->n_vattr, 0);
 #endif
 	mtx_unlock(&np->n_mtx);
 	if (setnsize)
 		vnode_pager_setsize(vp, nsize);
 out:
 #ifdef KDTRACE_HOOKS
 	if (error)
 		KDTRACE_NFS_ATTRCACHE_LOAD_DONE(vp, NULL, error);
 #endif
 	return (error);
 }
 
 /*
  * Attribute cache debugging knob, only built when NFS_ACDEBUG is defined.
  * It is registered read/write as "acdebug" under the _vfs_oldnfs sysctl
  * node; nfs_getattrcache() prints progressively more detail when the
  * value is non-zero, greater than 1 and greater than 2.
  */
 #ifdef NFS_ACDEBUG
 #include <sys/sysctl.h>
 SYSCTL_DECL(_vfs_oldnfs);
 static int nfs_acdebug;
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, acdebug, CTLFLAG_RW, &nfs_acdebug, 0,
     "Toggle acdebug (attribute cache debug) flag");
 #endif
 
 /*
  * Check the time stamp
  * If the cache is valid, copy contents to *vap and return 0
  * otherwise return an error
  */
 int
 nfs_getattrcache(struct vnode *vp, struct vattr *vaper)
 {
 	struct nfsnode *np;
 	struct vattr *vap;
 	struct nfsmount *nmp;
 	int timeo;
 	
 	np = VTONFS(vp);
 	vap = &np->n_vattr;
 	nmp = VFSTONFS(vp->v_mount);
 #ifdef NFS_ACDEBUG
 	mtx_lock(&Giant);	/* nfs_printf() */
 #endif
 	mtx_lock(&np->n_mtx);
 	/* XXX n_mtime doesn't seem to be updated on a miss-and-reload */
 	timeo = (time_second - np->n_mtime.tv_sec) / 10;
 
 #ifdef NFS_ACDEBUG
 	if (nfs_acdebug>1)
 		nfs_printf("nfs_getattrcache: initial timeo = %d\n", timeo);
 #endif
 
 	if (vap->va_type == VDIR) {
 		if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acdirmin)
 			timeo = nmp->nm_acdirmin;
 		else if (timeo > nmp->nm_acdirmax)
 			timeo = nmp->nm_acdirmax;
 	} else {
 		if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acregmin)
 			timeo = nmp->nm_acregmin;
 		else if (timeo > nmp->nm_acregmax)
 			timeo = nmp->nm_acregmax;
 	}
 
 #ifdef NFS_ACDEBUG
 	if (nfs_acdebug > 2)
 		nfs_printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n",
 			   nmp->nm_acregmin, nmp->nm_acregmax,
 			   nmp->nm_acdirmin, nmp->nm_acdirmax);
 
 	if (nfs_acdebug)
 		nfs_printf("nfs_getattrcache: age = %d; final timeo = %d\n",
 			   (time_second - np->n_attrstamp), timeo);
 #endif
 
 	if ((time_second - np->n_attrstamp) >= timeo) {
 		nfsstats.attrcache_misses++;
 		mtx_unlock(&np->n_mtx);
 #ifdef NFS_ACDEBUG
 		mtx_unlock(&Giant);	/* nfs_printf() */
 #endif
 		KDTRACE_NFS_ATTRCACHE_GET_MISS(vp);
 		return (ENOENT);
 	}
 	nfsstats.attrcache_hits++;
 	if (vap->va_size != np->n_size) {
 		if (vap->va_type == VREG) {
 			if (np->n_flag & NMODIFIED) {
 				if (vap->va_size < np->n_size)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
 			} else {
 				np->n_size = vap->va_size;
 			}
 			vnode_pager_setsize(vp, np->n_size);
 		} else {
 			np->n_size = vap->va_size;
 		}
 	}
 	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
 	if (np->n_flag & NCHG) {
 		if (np->n_flag & NACC)
 			vaper->va_atime = np->n_atim;
 		if (np->n_flag & NUPD)
 			vaper->va_mtime = np->n_mtim;
 	}
 	mtx_unlock(&np->n_mtx);
 #ifdef NFS_ACDEBUG
 	mtx_unlock(&Giant);	/* nfs_printf() */
 #endif
 	KDTRACE_NFS_ATTRCACHE_GET_HIT(vp, vap);
 	return (0);
 }
 
 /*
  * Drop everything cached for an NFS vnode: its name cache entries, its
  * cached attributes and every slot of its access cache.  Called when an
  * NFS request on the node fails with a stale filehandle.
  */
 void
 nfs_purgecache(struct vnode *vp)
 {
 	struct nfsnode *node = VTONFS(vp);
 	int slot;
 
 	cache_purge(vp);
 
 	mtx_lock(&node->n_mtx);
 	/* Zero time stamps make the cached data look expired. */
 	node->n_attrstamp = 0;
 	KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	slot = 0;
 	while (slot < NFS_ACCESSCACHESIZE) {
 		node->n_accesscache[slot].stamp = 0;
 		slot++;
 	}
 	KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp);
 	mtx_unlock(&node->n_mtx);
 }
 
 /* All-zero cookie handed out for offsets inside the first directory block. */
 static nfsuint64 nfs_nullcookie = { { 0, 0 } };
 /*
  * This function finds the directory cookie that corresponds to the
  * logical byte offset given.
  *
  * The offset is converted to a directory block number (off /
  * NFS_DIRBLKSIZ).  Block 0 and negative offsets always map to the null
  * cookie.  Other blocks are looked up in the node's list of nfsdmap
  * structures, each of which holds NFSNUMCOOKIES cookie slots;
  * ndm_eocookie is the number of slots in use in a given nfsdmap.
  *
  * If "add" is non-zero, missing nfsdmap structures are allocated (with
  * M_WAITOK) and ndm_eocookie is extended so the slot exists.  If "add"
  * is zero and the slot has not been recorded, NULL is returned.
  *
  * NOTE(review): callers presumably hold the directory cookie lock (see
  * nfs_dircookie_lock() in nfs_invaldir()) -- confirm.
  */
 nfsuint64 *
 nfs_getcookie(struct nfsnode *np, off_t off, int add)
 {
 	struct nfsdmap *dp, *dp2;
 	int pos;
 	nfsuint64 *retval = NULL;
 	
 	pos = (uoff_t)off / NFS_DIRBLKSIZ;
 	if (pos == 0 || off < 0) {
 		KASSERT(!add, ("nfs getcookie add at <= 0"));
 		return (&nfs_nullcookie);
 	}
 	/* Block 1 is stored in slot 0 of the first nfsdmap. */
 	pos--;
 	dp = LIST_FIRST(&np->n_cookies);
 	if (!dp) {
 		/* No cookie map yet: create the first one only when adding. */
 		if (add) {
 			dp = malloc(sizeof (struct nfsdmap),
 				M_NFSDIROFF, M_WAITOK);
 			dp->ndm_eocookie = 0;
 			LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list);
 		} else
 			goto out;
 	}
 	/* Step through the list until pos indexes into the current nfsdmap. */
 	while (pos >= NFSNUMCOOKIES) {
 		pos -= NFSNUMCOOKIES;
 		if (LIST_NEXT(dp, ndm_list)) {
 			/*
 			 * On lookup, stop if the current map is not full and
 			 * the remaining position is at or past its end.
 			 */
 			if (!add && dp->ndm_eocookie < NFSNUMCOOKIES &&
 			    pos >= dp->ndm_eocookie)
 				goto out;
 			dp = LIST_NEXT(dp, ndm_list);
 		} else if (add) {
 			dp2 = malloc(sizeof (struct nfsdmap),
 				M_NFSDIROFF, M_WAITOK);
 			dp2->ndm_eocookie = 0;
 			LIST_INSERT_AFTER(dp, dp2, ndm_list);
 			dp = dp2;
 		} else
 			goto out;
 	}
 	if (pos >= dp->ndm_eocookie) {
 		/* Slot not in use yet: extend the map when adding, else fail. */
 		if (add)
 			dp->ndm_eocookie = pos + 1;
 		else
 			goto out;
 	}
 	retval = &dp->ndm_cookies[pos];
 out:
 	return (retval);
 }
 
 /*
  * Invalidate cached directory information, except for the actual directory
  * blocks (which are invalidated separately).
  * Done mainly to avoid the use of stale offset cookies.
  */
 void
 nfs_invaldir(struct vnode *vp)
 {
 	struct nfsnode *np = VTONFS(vp);
 
 	KASSERT(vp->v_type == VDIR, ("nfs: invaldir not dir"));
 	nfs_dircookie_lock(np);
 	np->n_direofoffset = 0;
 	np->n_cookieverf.nfsuquad[0] = 0;
 	np->n_cookieverf.nfsuquad[1] = 0;
 	if (LIST_FIRST(&np->n_cookies))
 		LIST_FIRST(&np->n_cookies)->ndm_eocookie = 0;
 	nfs_dircookie_unlock(np);
 }
 
 /*
  * The write verifier has changed (probably due to a server reboot), so all
  * B_NEEDCOMMIT blocks will have to be written again. Since they are on the
  * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
  * and B_CLUSTEROK flags.  Once done the new write verifier can be set for the
  * mount point.
  *
  * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data
  * writes are not clusterable.
  */
 void
 nfs_clearcommit(struct mount *mp)
 {
 	struct vnode *vp, *nvp;
 	struct buf *bp, *nbp;
 	struct bufobj *bo;
 
 	/*
 	 * Visit every vnode of the mount.
 	 * NOTE(review): the iterator appears to return each vnode with its
 	 * interlock held, which is why vholdl()/VI_UNLOCK() are used here --
 	 * confirm against MNT_VNODE_FOREACH_ALL.
 	 */
 	MNT_VNODE_FOREACH_ALL(vp, mp, nvp) {
 		bo = &vp->v_bufobj;
 		/* Hold the vnode across the buffer scan, then drop the interlock. */
 		vholdl(vp);
 		VI_UNLOCK(vp);
 		BO_LOCK(bo);
 		/*
 		 * For every unlocked dirty buffer that is both B_DELWRI and
 		 * B_NEEDCOMMIT, clear B_NEEDCOMMIT and B_CLUSTEROK so the
 		 * buffer is written again rather than just committed.
 		 */
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (!BUF_ISLOCKED(bp) &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
 				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 		BO_UNLOCK(bo);
 		vdrop(vp);
 	}
 }
 
 /*
  * Helper functions for former macros.  Some of these should be
  * moved to their callers.
  */
 
 /*
  * Decode an (optional) file handle followed by (optional) attributes
  * from the reply and turn them into a vnode.
  *
  * For NFSv3 a flag word says whether a handle follows; NFSv2 always has
  * one.  When a handle is present it is looked up with nfs_nget() on the
  * mount of "d" (LK_EXCLUSIVE) and the vnode is stored in *v.  For NFSv3 a
  * second flag word says whether attributes follow; they are loaded into
  * the attribute cache when a handle was obtained and skipped otherwise.
  *
  * On return *f is non-zero only if both a vnode and its attributes were
  * obtained.  Returns 0, EBADRPC on a short reply, or the error from
  * nfs_nget()/nfs_loadattrcache().  *v may already have been set when an
  * error is returned.
  */
 int
 nfsm_mtofh_xx(struct vnode *d, struct vnode **v, int v3, int *f,
     struct mbuf **md, caddr_t *dpos)
 {
 	struct nfsnode *newnp;
 	struct vnode *newvp;
 	nfsfh_t *fhp;
 	u_int32_t *wp;
 	int fhlen, rc;
 
 	/* Is a file handle present? */
 	if (!v3) {
 		*f = 1;
 	} else {
 		wp = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
 		if (wp == NULL)
 			return (EBADRPC);
 		*f = fxdr_unsigned(int, *wp);
 	}
 
 	if (*f != 0) {
 		rc = nfsm_getfh_xx(&fhp, &fhlen, v3, md, dpos);
 		if (rc != 0)
 			return (rc);
 		rc = nfs_nget(d->v_mount, fhp, fhlen, &newnp, LK_EXCLUSIVE);
 		if (rc != 0)
 			return (rc);
 		*v = NFSTOV(newnp);
 	}
 
 	/* NFSv3: are attributes present? */
 	if (v3) {
 		wp = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
 		if (wp == NULL)
 			return (EBADRPC);
 		if (*f != 0)
 			*f = fxdr_unsigned(int, *wp);
 		else if (fxdr_unsigned(int, *wp) != 0)
 			/* No vnode to attach them to: step over them. */
 			nfsm_adv_xx(NFSX_V3FATTR, md, dpos);
 	}
 
 	if (*f == 0)
 		return (0);
 
 	newvp = *v;
 	rc = nfs_loadattrcache(&newvp, md, dpos, NULL, 0);
 	if (rc != 0)
 		return (rc);
 	*v = newvp;
 	return (0);
 }
 
 /*
  * Locate a file handle in the reply.
  *
  * NFSv3 handles are preceded by a length word, which must be in the
  * range 1..NFSX_V3FHMAX; NFSv2 handles are always NFSX_V2FH bytes.  On
  * success *s holds the handle length and *f points at the handle bytes
  * returned by nfsm_dissect_xx().  Returns 0 or EBADRPC.
  */
 int
 nfsm_getfh_xx(nfsfh_t **f, int *s, int v3, struct mbuf **md, caddr_t *dpos)
 {
 	u_int32_t *lenp;
 
 	if (!v3) {
 		*s = NFSX_V2FH;
 	} else {
 		lenp = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
 		if (lenp == NULL)
 			return (EBADRPC);
 		*s = fxdr_unsigned(int, *lenp);
 		if (*s <= 0 || *s > NFSX_V3FHMAX)
 			return (EBADRPC);
 	}
 	/* The handle occupies its length rounded up by nfsm_rndup(). */
 	*f = nfsm_dissect_xx(nfsm_rndup(*s), md, dpos);
 	return (*f == NULL ? EBADRPC : 0);
 }
 
 
 /*
  * Load the attributes at the current reply position into the attribute
  * cache of *v (and into *va, which is passed through to
  * nfs_loadattrcache()).  *v is updated only on success.  Returns the
  * result of nfs_loadattrcache().
  */
 int
 nfsm_loadattr_xx(struct vnode **v, struct vattr *va, struct mbuf **md,
 		 caddr_t *dpos)
 {
 	struct vnode *vp;
 	int rc;
 
 	vp = *v;
 	rc = nfs_loadattrcache(&vp, md, dpos, va, 0);
 	if (rc == 0)
 		*v = vp;
 	return (rc);
 }
 
 /*
  * Decode optional post-operation attributes: a flag word followed, when
  * the flag is set, by the attributes themselves.
  *
  * *f is set to the flag value, and forced back to 0 if loading the
  * attributes fails.  *v is updated only after a successful load.
  * Returns 0, EBADRPC on a short reply, or the nfs_loadattrcache() error.
  */
 int
 nfsm_postop_attr_xx(struct vnode **v, int *f, struct vattr *va,
 		    struct mbuf **md, caddr_t *dpos)
 {
 	struct vnode *vp = *v;
 	u_int32_t *flagp;
 	int rc;
 
 	flagp = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
 	if (flagp == NULL)
 		return (EBADRPC);
 	*f = fxdr_unsigned(int, *flagp);
 	if (*f == 0)
 		return (0);
 
 	rc = nfs_loadattrcache(&vp, md, dpos, va, 1);
 	if (rc != 0) {
 		*f = 0;
 		return (rc);
 	}
 	*v = vp;
 	return (0);
 }
 
 /*
  * Decode NFSv3 weak cache consistency data: optional pre-operation
  * attributes followed by optional post-operation attributes.
  *
  * On entry a non-zero *f asks for the pre-op mtime to be compared with
  * the cached n_mtime; on return *f is then 1 only if pre-op attributes
  * were present and both the seconds and nanoseconds matched.  If *f was
  * zero on entry it is set to whether post-op attributes were loaded.
  * Returns 0, EBADRPC on a short reply, or the error from
  * nfsm_postop_attr_xx().
  */
 int
 nfsm_wcc_data_xx(struct vnode **v, int *f, struct mbuf **md, caddr_t *dpos)
 {
 	struct nfsnode *np;
 	u_int32_t *wp;
 	int gotattr, mtime_same, rc;
 
 	mtime_same = 0;
 	wp = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
 	if (wp == NULL)
 		return (EBADRPC);
 	if (*wp == nfs_true) {
 		/* Six words of pre-op attributes; words 2 and 3 are the mtime. */
 		wp = nfsm_dissect_xx(6 * NFSX_UNSIGNED, md, dpos);
 		if (wp == NULL)
 			return (EBADRPC);
 		np = VTONFS(*v);
 		mtx_lock(&np->n_mtx);
 		if (*f != 0 &&
 		    np->n_mtime.tv_sec == fxdr_unsigned(u_int32_t, wp[2]) &&
 		    np->n_mtime.tv_nsec == fxdr_unsigned(u_int32_t, wp[3]))
 			mtime_same = 1;
 		mtx_unlock(&np->n_mtx);
 	}
 
 	rc = nfsm_postop_attr_xx(v, &gotattr, NULL, md, dpos);
 	if (rc != 0)
 		return (rc);
 
 	*f = (*f != 0) ? mtime_same : gotattr;
 	return (0);
 }
 
 /*
  * Append the string "a" of length "s" to the request as a length word
  * followed by the bytes, padded to the size given by nfsm_rndup().
  *
  * Returns ENAMETOOLONG if s exceeds the limit m.  When the encoding fits
  * in the trailing space of the current mbuf it is built in place;
  * otherwise the work is handed to nfsm_strtmbuf() and its result is
  * returned.
  */
 int
 nfsm_strtom_xx(const char *a, int s, int m, struct mbuf **mb, caddr_t *bpos)
 {
 	u_int32_t *wp;
 	int xdrlen;
 
 	if (s > m)
 		return (ENAMETOOLONG);
 
 	xdrlen = nfsm_rndup(s) + NFSX_UNSIGNED;
 	if (xdrlen > M_TRAILINGSPACE(*mb))
 		return (nfsm_strtmbuf(mb, bpos, a, s));
 
 	wp = nfsm_build_xx(xdrlen, mb, bpos);
 	wp[0] = txdr_unsigned(s);
 	/* Zero the final word first so any pad bytes end up as zero. */
 	wp[(xdrlen >> 2) - 1] = 0;
 	bcopy(a, &wp[1], s);
 	return (0);
 }
 
 /*
  * Append the file handle of vnode v to the request being built.
  *
  * An NFSv3 handle is encoded as a length word followed by n_fhsize bytes
  * of handle, padded to the size given by nfsm_rndup().  An NFSv2 handle
  * is a fixed NFSX_V2FH bytes with no length word.
  *
  * Returns 0 on success or the error from nfsm_strtmbuf().
  */
 int
 nfsm_fhtom_xx(struct vnode *v, int v3, struct mbuf **mb, caddr_t *bpos)
 {
 	u_int32_t *tl;
 	int t1;
 	caddr_t cp;
 
 	if (v3) {
 		t1 = nfsm_rndup(VTONFS(v)->n_fhsize) + NFSX_UNSIGNED;
 		/*
 		 * An encoding that exactly fills the remaining space still
 		 * fits in the current mbuf, so test with "<=" just as
 		 * nfsm_strtom_xx() does instead of falling back to the
 		 * slower nfsm_strtmbuf() path.
 		 */
 		if (t1 <= M_TRAILINGSPACE(*mb)) {
 			tl = nfsm_build_xx(t1, mb, bpos);
 			*tl++ = txdr_unsigned(VTONFS(v)->n_fhsize);
 			/* Zero the final word first so pad bytes are zero. */
 			*(tl + ((t1 >> 2) - 2)) = 0;
 			bcopy(VTONFS(v)->n_fhp, tl, VTONFS(v)->n_fhsize);
 		} else {
 			t1 = nfsm_strtmbuf(mb, bpos,
 			    (const char *)VTONFS(v)->n_fhp,
 			    VTONFS(v)->n_fhsize);
 			if (t1 != 0)
 				return t1;
 		}
 	} else {
 		cp = nfsm_build_xx(NFSX_V2FH, mb, bpos);
 		bcopy(VTONFS(v)->n_fhp, cp, NFSX_V2FH);
 	}
 	return 0;
 }
 
 void
 nfsm_v3attrbuild_xx(struct vattr *va, int full, struct mbuf **mb,
     caddr_t *bpos)
 {
 	u_int32_t *tl;
 
 	if (va->va_mode != (mode_t)VNOVAL) {
 		tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos);
 		*tl++ = nfs_true;
 		*tl = txdr_unsigned(va->va_mode);
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = nfs_false;
 	}
 	if (full && va->va_uid != (uid_t)VNOVAL) {
 		tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos);
 		*tl++ = nfs_true;
 		*tl = txdr_unsigned(va->va_uid);
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = nfs_false;
 	}
 	if (full && va->va_gid != (gid_t)VNOVAL) {
 		tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos);
 		*tl++ = nfs_true;
 		*tl = txdr_unsigned(va->va_gid);
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = nfs_false;
 	}
 	if (full && va->va_size != VNOVAL) {
 		tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos);
 		*tl++ = nfs_true;
 		txdr_hyper(va->va_size, tl);
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = nfs_false;
 	}
 	if (va->va_atime.tv_sec != VNOVAL) {
 		if ((va->va_vaflags & VA_UTIMES_NULL) == 0) {
 			tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos);
 			*tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT);
 			txdr_nfsv3time(&va->va_atime, tl);
 		} else {
 			tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 			*tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER);
 		}
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE);
 	}
 	if (va->va_mtime.tv_sec != VNOVAL) {
 		if ((va->va_vaflags & VA_UTIMES_NULL) == 0) {
 			tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos);
 			*tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT);
 			txdr_nfsv3time(&va->va_mtime, tl);
 		} else {
 			tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 			*tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER);
 		}
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE);
 	}
 }
Index: head/sys/nfsclient/nfs_vnops.c
===================================================================
--- head/sys/nfsclient/nfs_vnops.c	(revision 258540)
+++ head/sys/nfsclient/nfs_vnops.c	(revision 258541)
@@ -1,3546 +1,3545 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)nfs_vnops.c	8.16 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * vnode op calls for Sun NFS version 2 and 3
  */
 
 #include "opt_inet.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/resourcevar.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/namei.h>
 #include <sys/socket.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/lockf.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/signalvar.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
 #include <nfs/nfsproto.h>
 #include <nfsclient/nfs.h>
 #include <nfsclient/nfsnode.h>
 #include <nfsclient/nfsmount.h>
 #include <nfs/nfs_kdtrace.h>
 #include <nfs/nfs_lock.h>
 #include <nfs/xdr_subs.h>
 #include <nfsclient/nfsm_subs.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 
 #include <machine/stdarg.h>
 
 /*
  * DTrace hook pointers and probe ids for the access cache (flush, get
  * hit/miss, load), only built with KDTRACE_HOOKS.
  * NOTE(review): presumably filled in by the DTrace NFS client provider
  * and consumed by the KDTRACE_NFS_ACCESSCACHE_* macros -- confirm in
  * nfs/nfs_kdtrace.h.
  */
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 dtrace_nfsclient_accesscache_flush_probe_func_t
     dtrace_nfsclient_accesscache_flush_done_probe;
 uint32_t nfsclient_accesscache_flush_done_id;
 
 dtrace_nfsclient_accesscache_get_probe_func_t
     dtrace_nfsclient_accesscache_get_hit_probe,
     dtrace_nfsclient_accesscache_get_miss_probe;
 uint32_t nfsclient_accesscache_get_hit_id;
 uint32_t nfsclient_accesscache_get_miss_id;
 
 dtrace_nfsclient_accesscache_load_probe_func_t
     dtrace_nfsclient_accesscache_load_done_probe;
 uint32_t nfsclient_accesscache_load_done_id;
 #endif /* KDTRACE_HOOKS */
 
 /* Boolean constants used by this file. */
 #define	TRUE	1
 #define	FALSE	0
 
 /*
  * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these
  * calls are not in getblk() and brelse() so that they would not be necessary
  * here.
  *
  * When B_VMIO is not defined, vfs_busy_pages() expands to nothing.
  */
 #ifndef B_VMIO
 #define vfs_busy_pages(bp, f)
 #endif
 
 /*
  * Forward declarations of the file-local vnode operations and helpers
  * referenced by the operation vectors below.
  */
 static vop_read_t	nfsfifo_read;
 static vop_write_t	nfsfifo_write;
 static vop_close_t	nfsfifo_close;
 static int	nfs_flush(struct vnode *, int, int);
 static int	nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *);
 static vop_lookup_t	nfs_lookup;
 static vop_create_t	nfs_create;
 static vop_mknod_t	nfs_mknod;
 static vop_open_t	nfs_open;
 static vop_close_t	nfs_close;
 static vop_access_t	nfs_access;
 static vop_getattr_t	nfs_getattr;
 static vop_setattr_t	nfs_setattr;
 static vop_read_t	nfs_read;
 static vop_fsync_t	nfs_fsync;
 static vop_remove_t	nfs_remove;
 static vop_link_t	nfs_link;
 static vop_rename_t	nfs_rename;
 static vop_mkdir_t	nfs_mkdir;
 static vop_rmdir_t	nfs_rmdir;
 static vop_symlink_t	nfs_symlink;
 static vop_readdir_t	nfs_readdir;
 static vop_strategy_t	nfs_strategy;
 static	int	nfs_lookitup(struct vnode *, const char *, int,
 		    struct ucred *, struct thread *, struct nfsnode **);
 static	int	nfs_sillyrename(struct vnode *, struct vnode *,
 		    struct componentname *);
 static vop_access_t	nfsspec_access;
 static vop_readlink_t	nfs_readlink;
 static vop_print_t	nfs_print;
 static vop_advlock_t	nfs_advlock;
 static vop_advlockasync_t nfs_advlockasync;
 
 /*
  * Global vfs data structures for nfs
  *
  * nfs_vnodeops is the vnode operation vector for NFS vnodes.  Operations
  * not listed here fall through to default_vnodeops via vop_default.
  */
 struct vop_vector nfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		nfs_access,
 	.vop_advlock =		nfs_advlock,
 	.vop_advlockasync =	nfs_advlockasync,
 	.vop_close =		nfs_close,
 	.vop_create =		nfs_create,
 	.vop_fsync =		nfs_fsync,
 	.vop_getattr =		nfs_getattr,
 	.vop_getpages =		nfs_getpages,
 	.vop_putpages =		nfs_putpages,
 	.vop_inactive =		nfs_inactive,
 	.vop_link =		nfs_link,
 	.vop_lookup =		nfs_lookup,
 	.vop_mkdir =		nfs_mkdir,
 	.vop_mknod =		nfs_mknod,
 	.vop_open =		nfs_open,
 	.vop_print =		nfs_print,
 	.vop_read =		nfs_read,
 	.vop_readdir =		nfs_readdir,
 	.vop_readlink =		nfs_readlink,
 	.vop_reclaim =		nfs_reclaim,
 	.vop_remove =		nfs_remove,
 	.vop_rename =		nfs_rename,
 	.vop_rmdir =		nfs_rmdir,
 	.vop_setattr =		nfs_setattr,
 	.vop_strategy =		nfs_strategy,
 	.vop_symlink =		nfs_symlink,
 	.vop_write =		nfs_write,
 };
 
 /*
  * Vnode operation vector for fifos that live on an NFS mount.  Operations
  * not listed here fall through to fifo_specops via vop_default; access
  * checks use nfsspec_access and read/write/close use the nfsfifo_*
  * wrappers.
  */
 struct vop_vector nfs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		nfsspec_access,
 	.vop_close =		nfsfifo_close,
 	.vop_fsync =		nfs_fsync,
 	.vop_getattr =		nfs_getattr,
 	.vop_inactive =		nfs_inactive,
 	.vop_print =		nfs_print,
 	.vop_read =		nfsfifo_read,
 	.vop_reclaim =		nfs_reclaim,
 	.vop_setattr =		nfs_setattr,
 	.vop_write =		nfsfifo_write,
 };
 
 /* Forward declarations of file-local helpers for mknod, remove and rename. */
 static int	nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp,
 			     struct componentname *cnp, struct vattr *vap);
 static int	nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
 			      struct ucred *cred, struct thread *td);
 static int	nfs_renamerpc(struct vnode *fdvp, const char *fnameptr,
 			      int fnamelen, struct vnode *tdvp,
 			      const char *tnameptr, int tnamelen,
 			      struct ucred *cred, struct thread *td);
 static int	nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
 			     struct sillyrename *sp);
 
 /*
  * Global variables
  *
  * nfs_iod_mtx is the global lock protecting the shared nfsiod state.
  * nfs_iodwant[] and nfs_iodmount[] have one entry per possible async
  * daemon (NFS_MAXASYNCDAEMON); nfs_numasync starts out at 0.
  * NOTE(review): nfs_numasync is presumably the number of running async
  * daemons -- confirm against the nfsiod code.
  */
 struct mtx 	nfs_iod_mtx;
 enum nfsiod_state nfs_iodwant[NFS_MAXASYNCDAEMON];
 struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON];
 int		 nfs_numasync = 0;
 /* Size of a struct dirent without its name buffer. */
 #define	DIRHDSIZ	(sizeof (struct dirent) - (MAXNAMLEN + 1))
 
 /* Tunables registered under the _vfs_oldnfs sysctl node. */
 SYSCTL_DECL(_vfs_oldnfs);
 
 /*
  * Lifetime of an access cache entry, compared against time_second in
  * nfs_access().  When it is not positive, nfs_access() asks the server
  * only for the access bits it needs rather than for all of them.
  */
 static int	nfsaccess_cache_timeout = NFS_MAXATTRTIMO;
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW,
 	   &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout");
 
 static int	nfs_prime_access_cache = 0;
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, prime_access_cache, CTLFLAG_RW,
 	   &nfs_prime_access_cache, 0,
 	   "Prime NFS ACCESS cache when fetching attributes");
 
 /* Selects the commit argument nfs_close() passes to nfs_flush() on NFSv3. */
 static int	nfsv3_commit_on_close = 0;
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, nfsv3_commit_on_close, CTLFLAG_RW,
 	   &nfsv3_commit_on_close, 0, "write+commit on close, else only write");
 
 /* When set, nfs_close() cleans the vnode's VM object pages on close. */
 static int	nfs_clean_pages_on_close = 1;
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
 	   &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");
 
 /* When set, nfs_open() honours O_DIRECT opens of regular files. */
 int nfs_directio_enable = 0;
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
 	   &nfs_directio_enable, 0, "Enable NFS directio");
 
 /*
  * This sysctl allows other processes to mmap a file that has been opened
  * O_DIRECT by a process.  In general, having processes mmap the file while
  * Direct IO is in progress can lead to Data Inconsistencies.  But, we allow
  * this by default to prevent DoS attacks - to prevent a malicious user from
  * opening up files O_DIRECT preventing other users from mmap'ing these
  * files.  "Protected" environments where stricter consistency guarantees are
  * required can disable this knob.  The process that opened the file O_DIRECT
  * cannot mmap() the file, because mmap'ed IO on an O_DIRECT open() is not
  * meaningful.
  */
 int nfs_directio_allow_mmap = 1;
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
 	   &nfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens");
 
 /* Disabled: read-only views of the access cache hit/miss counters. */
 #if 0
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, access_cache_hits, CTLFLAG_RD,
 	   &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count");
 
 SYSCTL_INT(_vfs_oldnfs, OID_AUTO, access_cache_misses, CTLFLAG_RD,
 	   &nfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count");
 #endif
 
 /* Every NFSv3 ACCESS permission bit. */
 #define	NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY		\
 			 | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE	\
 			 | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP)
 
 /*
  * SMP Locking Note :
  * The list of locks after the description of the lock is the ordering
  * of other locks acquired with the lock held.
  * np->n_mtx : Protects the fields in the nfsnode.
  *	VM Object Lock
  *	VI_MTX (acquired indirectly)
  * nmp->nm_mtx : Protects the fields in the nfsmount.
  *	rep->r_mtx
  * nfs_iod_mtx : Global lock, protects shared nfsiod state.
  * nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
  *	nmp->nm_mtx
  *	rep->r_mtx
  * rep->r_mtx : Protects the fields in an nfsreq.
  */
 
 /*
  * Perform an NFSv3 ACCESS RPC "over the wire" for vp, asking about the
  * permission bits in wmode, and record the answer in the node's access
  * cache under the uid of cred.
  *
  * On success the granted bits are also returned through *retmode when it
  * is not NULL.  Returns 0 or an error.
  *
  * NOTE(review): the nfsm_* macros used below are not visible here; they
  * presumably set "error" and jump to nfsmout on failure, and maintain
  * mrep/md/dpos -- confirm in nfsclient/nfsm_subs.h.
  */
 static int
 nfs3_access_otw(struct vnode *vp, int wmode, struct thread *td,
     struct ucred *cred, uint32_t *retmode)
 {
 	const int v3 = 1;
 	u_int32_t *tl;
 	int error = 0, attrflag, i, lrupos;
 
 	struct mbuf *mreq, *mrep, *md, *mb;
 	caddr_t bpos, dpos;
 	u_int32_t rmode;
 	struct nfsnode *np = VTONFS(vp);
 
 	nfsstats.rpccnt[NFSPROC_ACCESS]++;
 	/* Request body: the file handle followed by the access mask. */
 	mreq = m_get2(NFSX_FH(v3) + NFSX_UNSIGNED, M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, v3);
 	tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(wmode);
 	nfsm_request(vp, NFSPROC_ACCESS, td, cred);
 	nfsm_postop_attr(vp, attrflag);
 	if (!error) {
 		lrupos = 0;
 		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 		rmode = fxdr_unsigned(u_int32_t, *tl);
 		mtx_lock(&np->n_mtx);
 		/*
 		 * Update the entry for this uid if there is one; while
 		 * scanning, track the entry with the oldest stamp.
 		 */
 		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
 			if (np->n_accesscache[i].uid == cred->cr_uid) {
 				np->n_accesscache[i].mode = rmode;
 				np->n_accesscache[i].stamp = time_second;
 				break;
 			}
 			if (i > 0 && np->n_accesscache[i].stamp <
 			    np->n_accesscache[lrupos].stamp)
 				lrupos = i;
 		}
 		/* No entry for this uid: replace the oldest one. */
 		if (i == NFS_ACCESSCACHESIZE) {
 			np->n_accesscache[lrupos].uid = cred->cr_uid;
 			np->n_accesscache[lrupos].mode = rmode;
 			np->n_accesscache[lrupos].stamp = time_second;
 		}
 		mtx_unlock(&np->n_mtx);
 		if (retmode != NULL)
 			*retmode = rmode;
 		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, rmode, 0);
 	}
 	m_freem(mrep);
 nfsmout:
 #ifdef KDTRACE_HOOKS
 	if (error) {
 		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, 0,
 		    error);
 	}
 #endif
 	return (error);
 }
 
 /*
  * nfs access vnode op.
  * For nfs version 2, just return ok. File accesses may fail later.
  * For nfs version 3, use the access rpc to check accessibility. If file modes
  * are changed on the server, accesses might still fail later.
  *
  * Returns 0 if access is granted, EROFS for a write request on a regular
  * file, directory or symlink of a read-only mount, EACCES if the server
  * does not grant every requested bit, or the error from the underlying
  * RPC or nfsspec_access().
  */
 static int
 nfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int error = 0, i, gotahit;
 	u_int32_t mode, rmode, wmode;
 	int v3 = NFS_ISV3(vp);
 	struct nfsnode *np = VTONFS(vp);
 
 	/*
 	 * Disallow write attempts on filesystems mounted read-only;
 	 * unless the file is a socket, fifo, or a block or character
 	 * device resident on the filesystem.
 	 */
 	if ((ap->a_accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 	/*
 	 * For nfs v3, check to see if we have done this recently, and if
 	 * so return our cached result instead of making an ACCESS call.
 	 * If not, do an access rpc, otherwise you are stuck emulating
 	 * ufs_access() locally using the vattr. This may not be correct,
 	 * since the server may apply other access criteria such as
 	 * client uid-->server uid mapping that we do not know about.
 	 */
 	if (v3) {
 		/* Translate the VREAD/VWRITE/VEXEC request into ACCESS bits. */
 		if (ap->a_accmode & VREAD)
 			mode = NFSV3ACCESS_READ;
 		else
 			mode = 0;
 		if (vp->v_type != VDIR) {
 			if (ap->a_accmode & VWRITE)
 				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND);
 			if (ap->a_accmode & VEXEC)
 				mode |= NFSV3ACCESS_EXECUTE;
 		} else {
 			if (ap->a_accmode & VWRITE)
 				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND |
 					 NFSV3ACCESS_DELETE);
 			if (ap->a_accmode & VEXEC)
 				mode |= NFSV3ACCESS_LOOKUP;
 		}
 		/* XXX safety belt, only make blanket request if caching */
 		if (nfsaccess_cache_timeout > 0) {
 			wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY |
 				NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE |
 				NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP;
 		} else {
 			wmode = mode;
 		}
 
 		/*
 		 * Does our cached result allow us to give a definite yes to
 		 * this request?
 		 */
 		gotahit = 0;
 		mtx_lock(&np->n_mtx);
 		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
 			if (ap->a_cred->cr_uid == np->n_accesscache[i].uid) {
 				/*
 				 * Only an unexpired entry that grants every
 				 * requested bit counts as a hit; the scan
 				 * stops at the first entry for this uid
 				 * either way.
 				 */
 				if (time_second < (np->n_accesscache[i].stamp +
 				    nfsaccess_cache_timeout) &&
 				    (np->n_accesscache[i].mode & mode) == mode) {
 					nfsstats.accesscache_hits++;
 					gotahit = 1;
 				}
 				break;
 			}
 		}
 		mtx_unlock(&np->n_mtx);
 #ifdef KDTRACE_HOOKS
 		if (gotahit)
 			KDTRACE_NFS_ACCESSCACHE_GET_HIT(vp,
 			    ap->a_cred->cr_uid, mode);
 		else
 			KDTRACE_NFS_ACCESSCACHE_GET_MISS(vp,
 			    ap->a_cred->cr_uid, mode);
 #endif
 		if (gotahit == 0) {
 			/*
 			 * Either a no, or a don't know.  Go to the wire.
 			 */
 			nfsstats.accesscache_misses++;
 		        error = nfs3_access_otw(vp, wmode, ap->a_td, ap->a_cred,
 			    &rmode);
 			if (!error) {
 				if ((rmode & mode) != mode)
 					error = EACCES;
 			}
 		}
 		return (error);
 	} else {
 		if ((error = nfsspec_access(ap)) != 0) {
 			return (error);
 		}
 		/*
 		 * Attempt to prevent a mapped root from accessing a file
 		 * which it shouldn't.  We try to read a byte from the file
 		 * if the user is root and the file is not zero length.
 		 * After calling nfsspec_access, we should have the correct
 		 * file size cached.
 		 */
 		mtx_lock(&np->n_mtx);
 		if (ap->a_cred->cr_uid == 0 && (ap->a_accmode & VREAD)
 		    && VTONFS(vp)->n_size > 0) {
 			struct iovec aiov;
 			struct uio auio;
 			char buf[1];
 
 			/* The node mutex is dropped before issuing any RPC. */
 			mtx_unlock(&np->n_mtx);
 			/* Describe a one-byte kernel-space read at offset 0. */
 			aiov.iov_base = buf;
 			aiov.iov_len = 1;
 			auio.uio_iov = &aiov;
 			auio.uio_iovcnt = 1;
 			auio.uio_offset = 0;
 			auio.uio_resid = 1;
 			auio.uio_segflg = UIO_SYSSPACE;
 			auio.uio_rw = UIO_READ;
 			auio.uio_td = ap->a_td;
 
 			if (vp->v_type == VREG)
 				error = nfs_readrpc(vp, &auio, ap->a_cred);
 			else if (vp->v_type == VDIR) {
 				/* Directories are read a full block at a time. */
 				char* bp;
 				bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
 				aiov.iov_base = bp;
 				aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
 				error = nfs_readdirrpc(vp, &auio, ap->a_cred);
 				free(bp, M_TEMP);
 			} else if (vp->v_type == VLNK)
 				error = nfs_readlinkrpc(vp, &auio, ap->a_cred);
 			else
 				error = EACCES;
 		} else
 			mtx_unlock(&np->n_mtx);
 		return (error);
 	}
 }
 
 /*
  * NOTE(review): not referenced in the visible part of this file; by its
  * name presumably a switch for skipping over-the-wire GETATTRs -- confirm
  * against its users before relying on this.
  */
 int nfs_otw_getattr_avoid = 0;
 
 /*
  * nfs open vnode op
  * Check to see if the type is ok
  * and that deletion is not in progress.
  * For paged in text files, you will need to flush the page cache
  * if consistency is lost.
  *
  * Only regular files, directories and symlinks may be opened; anything
  * else gets EOPNOTSUPP.  Fresh attributes are fetched with VOP_GETATTR()
  * and cached buffers are invalidated if the node was modified locally or
  * the modification time differs from the one recorded in n_mtime.
  * Returns 0 or the error from nfs_vinvalbuf()/VOP_GETATTR().
  */
 /* ARGSUSED */
 static int
 nfs_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr vattr;
 	int error;
 	int fmode = ap->a_mode;
 	struct ucred *cred;
 
 	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Get a valid lease. If cached data is stale, flush it.
 	 *
 	 * The node mutex is dropped around every call that may issue an RPC
 	 * (nfs_vinvalbuf(), VOP_GETATTR()) and retaken afterwards; both
 	 * branches below finish with the mutex held.
 	 */
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NMODIFIED) {
 		mtx_unlock(&np->n_mtx);
 		error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		if (error == EINTR || error == EIO)
 			return (error);
 		mtx_lock(&np->n_mtx);
 		/* Force the following VOP_GETATTR() to miss the cache. */
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 		if (vp->v_type == VDIR)
 			np->n_direofoffset = 0;
 		mtx_unlock(&np->n_mtx);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 		if (error)
 			return (error);
 		mtx_lock(&np->n_mtx);
 		np->n_mtime = vattr.va_mtime;
 	} else {
 		mtx_unlock(&np->n_mtx);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 		if (error)
 			return (error);
 		mtx_lock(&np->n_mtx);
 		/* Modification time changed: cached data is stale. */
 		if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 			if (vp->v_type == VDIR)
 				np->n_direofoffset = 0;
 			mtx_unlock(&np->n_mtx);
 			error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 			if (error == EINTR || error == EIO) {
 				return (error);
 			}
 			mtx_lock(&np->n_mtx);
 			np->n_mtime = vattr.va_mtime;
 		}
 	}
 	/*
 	 * If the object has >= 1 O_DIRECT active opens, we disable caching.
 	 */
 	if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
 		if (np->n_directio_opens == 0) {
 			/* First O_DIRECT open: flush buffers, mark NNONCACHE. */
 			mtx_unlock(&np->n_mtx);
 			error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 			if (error)
 				return (error);
 			mtx_lock(&np->n_mtx);
 			np->n_flag |= NNONCACHE;
 		}
 		np->n_directio_opens++;
 	}
 
 	/*
 	 * If this is an open for writing, capture a reference to the
 	 * credentials, so they can be used by nfs_putpages(). Using
 	 * these write credentials is preferable to the credentials of
 	 * whatever thread happens to be doing the VOP_PUTPAGES() since
 	 * the write RPCs are less likely to fail with EACCES.
 	 */
 	if ((fmode & FWRITE) != 0) {
 		cred = np->n_writecred;
 		np->n_writecred = crhold(ap->a_cred);
 	} else
 		cred = NULL;
 	mtx_unlock(&np->n_mtx);
 	/* Release the previously saved write credential outside the mutex. */
 	if (cred != NULL)
 		crfree(cred);
 	vnode_create_vobject(vp, vattr.va_size, ap->a_td);
 	return (0);
 }
 
 /*
  * nfs close vnode op
  * What an NFS client should do upon close after writing is a debatable issue.
  * Most NFS clients push delayed writes to the server upon close, basically for
  * two reasons:
  * 1 - So that any write errors may be reported back to the client process
  *     doing the close system call. By far the two most likely errors are
  *     NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
  * 2 - To put a worst case upper bound on cache inconsistency between
  *     multiple clients for the file.
  * There is also a consistency problem for Version 2 of the protocol w.r.t.
  * not being able to tell if other clients are writing a file concurrently,
  * since there is no way of knowing if the changed modify time in the reply
  * is only due to the write for this client.
  * (NFS Version 3 provides weak cache consistency data in the reply that
  *  should be sufficient to detect and handle this case.)
  *
  * The current code does the following:
  * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
  * for NFS Version 3 - flush dirty buffers to the server but don't invalidate
  *                     or commit them (this satisfies 1 and 2 except for the
  *                     case where the server crashes after this close but
  *                     before the commit RPC, which is felt to be "good
  *                     enough". Changing the last argument to nfs_flush() to
  *                     a 1 would force a commit operation, if it is felt a
  *                     commit is necessary now.
  */
 /* ARGSUSED */
 static int
 nfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 	int fmode = ap->a_fflag;
 
 	if (vp->v_type == VREG) {
 	    /*
 	     * Examine and clean dirty pages, regardless of NMODIFIED.
 	     * This closes a major hole in close-to-open consistency.
 	     * We want to push out all dirty pages (and buffers) on
 	     * close, regardless of whether they were dirtied by
 	     * mmap'ed writes or via write().
 	     */
 	    if (nfs_clean_pages_on_close && vp->v_object) {
 		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(vp->v_object);
 	    }
 	    mtx_lock(&np->n_mtx);
 	    if (np->n_flag & NMODIFIED) {
 		mtx_unlock(&np->n_mtx);
 		if (NFS_ISV3(vp)) {
 		    /*
 		     * Under NFSv3 we have dirty buffers to dispose of.  We
 		     * must flush them to the NFS server.  We have the option
 		     * of waiting all the way through the commit rpc or just
 		     * waiting for the initial write.  The default is to only
 		     * wait through the initial write so the data is in the
 		     * server's cache, which is roughly similar to the state
 		     * a standard disk subsystem leaves the file in on close().
 		     *
 		     * We cannot clear the NMODIFIED bit in np->n_flag due to
 		     * potential races with other processes, and certainly
 		     * cannot clear it if we don't commit.
 		     */
 		    int cm = nfsv3_commit_on_close ? 1 : 0;
 		    error = nfs_flush(vp, MNT_WAIT, cm);
 		    /* np->n_flag &= ~NMODIFIED; */
 		} else
 		    error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		mtx_lock(&np->n_mtx);
 	    }
 	    if (np->n_flag & NWRITEERR) {
 		np->n_flag &= ~NWRITEERR;
 		error = np->n_error;
 	    }
 	    mtx_unlock(&np->n_mtx);
 	}
 	if (nfs_directio_enable)
 		KASSERT((np->n_directio_asyncwr == 0),
 			("nfs_close: dirty unflushed (%d) directio buffers\n",
 			 np->n_directio_asyncwr));
 	if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
 		mtx_lock(&np->n_mtx);
 		KASSERT((np->n_directio_opens > 0), 
 			("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
 		np->n_directio_opens--;
 		if (np->n_directio_opens == 0)
 			np->n_flag &= ~NNONCACHE;
 		mtx_unlock(&np->n_mtx);
 	}
 	return (error);
 }
 
 /*
  * nfs getattr call from vfs.
  *
  * Fills ap->a_vap with the attributes of ap->a_vp.  The attribute cache
  * is consulted first; on NFSv3 mounts with access-cache priming enabled an
  * ACCESS RPC is tried next (it may refresh the attribute cache as a side
  * effect), and only then is a GETATTR RPC sent.
  *
  * Returns 0 on success or an error from the RPC/reply processing.  On
  * failure *ap->a_vap is left untouched: the local vattr has not been
  * filled in on that path, so copying it out would hand uninitialized
  * stack contents to the caller.
  */
 static int
 nfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct thread *td = curthread;
 	struct vattr *vap = ap->a_vap;
 	struct vattr vattr;
 	caddr_t bpos, dpos;
 	int error = 0;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3 = NFS_ISV3(vp);
 
 	/*
 	 * Update local times for special files.
 	 */
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & (NACC | NUPD))
 		np->n_flag |= NCHG;
 	mtx_unlock(&np->n_mtx);
 	/*
 	 * First look in the cache.
 	 */
 	if (nfs_getattrcache(vp, &vattr) == 0)
 		goto nfsmout;
 	if (v3 && nfs_prime_access_cache && nfsaccess_cache_timeout > 0) {
 		nfsstats.accesscache_misses++;
 		nfs3_access_otw(vp, NFSV3ACCESS_ALL, td, ap->a_cred, NULL);
 		if (nfs_getattrcache(vp, &vattr) == 0)
 			goto nfsmout;
 	}
 	nfsstats.rpccnt[NFSPROC_GETATTR]++;
 	mreq = m_get2(NFSX_FH(v3), M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, v3);
 	nfsm_request(vp, NFSPROC_GETATTR, td, ap->a_cred);
 	if (!error) {
 		nfsm_loadattr(vp, &vattr);
 	}
 	m_freem(mrep);
 nfsmout:
 	/*
 	 * vattr is only valid when we got here with error == 0 (cache hit
 	 * or successful reply); do not publish it otherwise.
 	 */
 	if (error == 0) {
 		vap->va_type = vattr.va_type;
 		vap->va_mode = vattr.va_mode;
 		vap->va_nlink = vattr.va_nlink;
 		vap->va_uid = vattr.va_uid;
 		vap->va_gid = vattr.va_gid;
 		vap->va_fsid = vattr.va_fsid;
 		vap->va_fileid = vattr.va_fileid;
 		vap->va_size = vattr.va_size;
 		vap->va_blocksize = vattr.va_blocksize;
 		vap->va_atime = vattr.va_atime;
 		vap->va_mtime = vattr.va_mtime;
 		vap->va_ctime = vattr.va_ctime;
 		vap->va_gen = vattr.va_gen;
 		vap->va_flags = vattr.va_flags;
 		vap->va_rdev = vattr.va_rdev;
 		vap->va_bytes = vattr.va_bytes;
 		vap->va_filerev = vattr.va_filerev;
 	}
 
 	return (error);
 }
 
 /*
  * nfs setattr call.
  *
  * Applies the attribute changes requested in ap->a_vap to ap->a_vp.
  * A request to change flags fails with EOPNOTSUPP, and any change on a
  * read-only mount fails with EROFS.  A size change fails with EISDIR on
  * a directory and is ignored on devices, sockets and FIFOs; for every
  * other vnode type the cached size is updated through nfs_meta_setsize()
  * and dirty buffers are flushed or discarded first.  The SETATTR RPC
  * itself is issued by nfs_setattrrpc(); if it fails after a size change,
  * the saved size is written back to the nfsnode and the VM pager.
  * Returns 0 or an errno value.
  */
 static int
 nfs_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr *vap = ap->a_vap;
 	struct thread *td = curthread;
 	int error = 0;
 	u_quad_t tsize;
 
 	/*
 	 * tsize is otherwise only assigned on the size-change path below;
 	 * give it a defined value unless nolint is defined.
 	 */
 #ifndef nolint
 	tsize = (u_quad_t)0;
 #endif
 
 	/*
 	 * Setting of flags is not supported.
 	 */
 	if (vap->va_flags != VNOVAL)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Disallow write attempts if the filesystem is mounted read-only.
 	 * NOTE(review): the va_flags term below can never be true here,
 	 * because that case already returned EOPNOTSUPP above.
 	 */
   	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
 	    (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		error = EROFS;
 		goto out;
 	}
 	if (vap->va_size != VNOVAL) {
  		switch (vp->v_type) {
  		case VDIR:
  			return (EISDIR);
  		case VCHR:
  		case VBLK:
  		case VSOCK:
  		case VFIFO:
 			/*
 			 * The size of these vnode types is not changed:
 			 * succeed at once if nothing else was requested,
 			 * otherwise drop the size from the request.
 			 */
 			if (vap->va_mtime.tv_sec == VNOVAL &&
 			    vap->va_atime.tv_sec == VNOVAL &&
 			    vap->va_mode == (mode_t)VNOVAL &&
 			    vap->va_uid == (uid_t)VNOVAL &&
 			    vap->va_gid == (gid_t)VNOVAL)
 				return (0);		
  			vap->va_size = VNOVAL;
  			break;
  		default:
 			/*
 			 * Disallow write attempts if the filesystem is
 			 * mounted read-only.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			/*
 			 *  We run vnode_pager_setsize() early (why?),
 			 * we must set np->n_size now to avoid vinvalbuf
 			 * V_SAVE races that might setsize a lower
 			 * value.
 			 */
 			/* Remember the current size so it can be restored. */
 			mtx_lock(&np->n_mtx);
 			tsize = np->n_size;
 			mtx_unlock(&np->n_mtx);
 			error = nfs_meta_setsize(vp, ap->a_cred, td,
 			    vap->va_size);
 			mtx_lock(&np->n_mtx);
  			if (np->n_flag & NMODIFIED) {
 			    /*
 			     * NOTE(review): tsize is re-read here, after
 			     * nfs_meta_setsize(); per the comment further
 			     * down that call has already stored va_size in
 			     * n_size, so tsize may no longer be the
 			     * pre-change size -- confirm this is intended.
 			     */
 			    tsize = np->n_size;
 			    mtx_unlock(&np->n_mtx);
 			    /*
 			     * Truncating to zero discards the buffers;
 			     * any other size writes them back first.
 			     */
  			    if (vap->va_size == 0)
  				error = nfs_vinvalbuf(vp, 0, td, 1);
  			    else
  				error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
  			    if (error) {
 				vnode_pager_setsize(vp, tsize);
 				goto out;
 			    }
  			} else
 			    mtx_unlock(&np->n_mtx);
 			/*
 			 * np->n_size has already been set to vap->va_size
 			 * in nfs_meta_setsize(). We must set it again since
 			 * nfs_loadattrcache() could be called through
 			 * nfs_meta_setsize() and could modify np->n_size.
 			 */
 			mtx_lock(&np->n_mtx);
  			np->n_vattr.va_size = np->n_size = vap->va_size;
 			mtx_unlock(&np->n_mtx);
   		};
   	} else {
 		/*
 		 * No size change.  If a timestamp is being set on a regular
 		 * file with modified data, write that data back first; only
 		 * EINTR and EIO from the flush abort the operation.
 		 */
 		mtx_lock(&np->n_mtx);
 		if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && 
 		    (np->n_flag & NMODIFIED) && vp->v_type == VREG) {
 			mtx_unlock(&np->n_mtx);
 			if ((error = nfs_vinvalbuf(vp, V_SAVE, td, 1)) != 0 &&
 			    (error == EINTR || error == EIO))
 				return error;
 		} else
 			mtx_unlock(&np->n_mtx);
 	}
 	error = nfs_setattrrpc(vp, vap, ap->a_cred);
 	/* The RPC failed after a size change: put the saved size back. */
 	if (error && vap->va_size != VNOVAL) {
 		mtx_lock(&np->n_mtx);
 		np->n_size = np->n_vattr.va_size = tsize;
 		vnode_pager_setsize(vp, tsize);
 		mtx_unlock(&np->n_mtx);
 	}
 out:
 	return (error);
 }
 
 /*
  * Do an nfs setattr rpc.
  */
 static int
 nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred)
 {
 	struct nfsv2_sattr *sp;
 	struct nfsnode *np = VTONFS(vp);
 	caddr_t bpos, dpos;
 	u_int32_t *tl;
 	int error = 0, i, wccflag = NFSV3_WCCRATTR;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3 = NFS_ISV3(vp);
 
 	nfsstats.rpccnt[NFSPROC_SETATTR]++;
 	mreq = m_get2(NFSX_FH(v3) + NFSX_SATTR(v3), M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, v3);
 	if (v3) {
 		nfsm_v3attrbuild(vap, TRUE);
 		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 		*tl = nfs_false;
 	} else {
 		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
 		if (vap->va_mode == (mode_t)VNOVAL)
 			sp->sa_mode = nfs_xdrneg1;
 		else
 			sp->sa_mode = vtonfsv2_mode(vp->v_type, vap->va_mode);
 		if (vap->va_uid == (uid_t)VNOVAL)
 			sp->sa_uid = nfs_xdrneg1;
 		else
 			sp->sa_uid = txdr_unsigned(vap->va_uid);
 		if (vap->va_gid == (gid_t)VNOVAL)
 			sp->sa_gid = nfs_xdrneg1;
 		else
 			sp->sa_gid = txdr_unsigned(vap->va_gid);
 		sp->sa_size = txdr_unsigned(vap->va_size);
 		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 	}
 	nfsm_request(vp, NFSPROC_SETATTR, curthread, cred);
 	if (v3) {
 		mtx_lock(&np->n_mtx);
 		for (i = 0; i < NFS_ACCESSCACHESIZE; i++)
 			np->n_accesscache[i].stamp = 0;
 		mtx_unlock(&np->n_mtx);
 		KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp);
 		nfsm_wcc_data(vp, wccflag);
 	} else
 		nfsm_loadattr(vp, NULL);
 	m_freem(mrep);
 nfsmout:
 	return (error);
 }
 
 /*
  * nfs lookup call, one step at a time...
  * First look in cache
  * If not found, unlock the directory nfsnode and do the rpc
  *
  * Looks up the component described by ap->a_cnp in directory ap->a_dvp
  * and, on success, stores the resulting vnode in *ap->a_vpp and
  * returns 0.  Other results visible in this function:
  *   EROFS       - DELETE/RENAME of the last component, or CREATE/RENAME
  *                 of a missing last component, on a read-only mount
  *   ENOTDIR     - dvp is not a directory
  *   EISDIR      - RENAME whose target file handle equals dvp's
  *   EJUSTRETURN - last component not found during CREATE/RENAME
  *   ENOENT      - not found (a negative name cache entry may be added)
  * or any error from VOP_ACCESS(), cache_lookup(), the LOOKUP RPC or
  * nfs_nget().
  */
 static int
 nfs_lookup(struct vop_lookup_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct mount *mp = dvp->v_mount;
 	struct vattr dvattr, vattr;
 	struct timespec nctime;
 	int flags = cnp->cn_flags;
 	struct vnode *newvp;
 	struct nfsmount *nmp;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	long len;
 	nfsfh_t *fhp;
 	struct nfsnode *np, *newnp;
 	int error = 0, attrflag, dattrflag, fhsize, ltype, ncticks;
 	int v3 = NFS_ISV3(dvp);
 	struct thread *td = cnp->cn_thread;
 
 	*vpp = NULLVP;
 	if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 	nmp = VFSTONFS(mp);
 	np = VTONFS(dvp);
 	/* The caller needs search permission on the directory. */
 	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	/*
 	 * Name cache first.  As used below, -1 is a positive hit, ENOENT a
 	 * negative hit, any other positive value an error, and anything
 	 * else falls through to the LOOKUP RPC.
 	 */
 	error = cache_lookup(dvp, vpp, cnp, &nctime, &ncticks);
 	if (error > 0 && error != ENOENT)
 		return (error);
 	if (error == -1) {
 		/*
 		 * Lookups of "." are special and always return the
 		 * current directory.  cache_lookup() already handles
 		 * associated locking bookkeeping, etc.
 		 */
 		if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 			/* XXX: Is this really correct? */
 			if (cnp->cn_nameiop != LOOKUP &&
 			    (flags & ISLASTCN))
 				cnp->cn_flags |= SAVENAME;
 			return (0);
 		}
 
 		/*
 		 * We only accept a positive hit in the cache if the
 		 * change time of the file matches our cached copy.
 		 * Otherwise, we discard the cache entry and fallback
 		 * to doing a lookup RPC.  We also only trust cache
 		 * entries for less than nm_nametimeo seconds.
 		 *
 		 * To better handle stale file handles and attributes,
 		 * clear the attribute cache of this node if it is a
 		 * leaf component, part of an open() call, and not
 		 * locally modified before fetching the attributes.
 		 * This should allow stale file handles to be detected
 		 * here where we can fall back to a LOOKUP RPC to
 		 * recover rather than having nfs_open() detect the
 		 * stale file handle and failing open(2) with ESTALE.
 		 */
 		newvp = *vpp;
 		newnp = VTONFS(newvp);
 		if (!(nmp->nm_flag & NFSMNT_NOCTO) &&
 		    (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
 		    !(newnp->n_flag & NMODIFIED)) {
 			mtx_lock(&newnp->n_mtx);
 			newnp->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
 			mtx_unlock(&newnp->n_mtx);
 		}
 		if ((u_int)(ticks - ncticks) < (nmp->nm_nametimeo * hz) &&
 		    VOP_GETATTR(newvp, &vattr, cnp->cn_cred) == 0 &&
 		    timespeccmp(&vattr.va_ctime, &nctime, ==)) {
 			nfsstats.lookupcache_hits++;
 			if (cnp->cn_nameiop != LOOKUP &&
 			    (flags & ISLASTCN))
 				cnp->cn_flags |= SAVENAME;
 			return (0);
 		}
 		/* Stale positive entry: drop it and the vnode it handed us. */
 		cache_purge(newvp);
 		if (dvp != newvp)
 			vput(newvp);
 		else 
 			vrele(newvp);
 		*vpp = NULLVP;
 	} else if (error == ENOENT) {
 		if (dvp->v_iflag & VI_DOOMED)
 			return (ENOENT);
 		/*
 		 * We only accept a negative hit in the cache if the
 		 * modification time of the parent directory matches
 		 * the cached copy in the name cache entry.
 		 * Otherwise, we discard all of the negative cache
 		 * entries for this directory.  We also only trust
 		 * negative cache entries for up to nm_negnametimeo
 		 * seconds.
 		 */
 		if ((u_int)(ticks - ncticks) < (nmp->nm_negnametimeo * hz) &&
 		    VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0 &&
 		    timespeccmp(&vattr.va_mtime, &nctime, ==)) {
 			nfsstats.lookupcache_hits++;
 			return (ENOENT);
 		}
 		cache_purge_negative(dvp);
 	}
 
 	/* Cache miss (or rejected entry): ask the server. */
 	attrflag = dattrflag = 0;
 	error = 0;
 	newvp = NULLVP;
 	nfsstats.lookupcache_misses++;
 	nfsstats.rpccnt[NFSPROC_LOOKUP]++;
 	len = cnp->cn_namelen;
 	mreq = m_get2(NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len), M_WAITOK,
 	    MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
 	nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_thread, cnp->cn_cred);
 	if (error) {
 		/*
 		 * On NFSv3 pick up the directory's post-op attributes from
 		 * the error reply; the negative-entry code at nfsmout
 		 * compares against vattr when dattrflag is set.
 		 */
 		if (v3) {
 			nfsm_postop_attr_va(dvp, dattrflag, &vattr);
 			m_freem(mrep);
 		}
 		goto nfsmout;
 	}
 	nfsm_getfh(fhp, fhsize, v3);
 
 	/*
 	 * Handle RENAME case...
 	 */
 	if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) {
 		/* The returned handle is dvp's own: refuse with EISDIR. */
 		if (NFS_CMPFH(np, fhp, fhsize)) {
 			m_freem(mrep);
 			return (EISDIR);
 		}
 		error = nfs_nget(mp, fhp, fhsize, &np, LK_EXCLUSIVE);
 		if (error) {
 			m_freem(mrep);
 			return (error);
 		}
 		newvp = NFSTOV(np);
 		if (v3) {
 			nfsm_postop_attr(newvp, attrflag);
 			nfsm_postop_attr(dvp, attrflag);
 		} else
 			nfsm_loadattr(newvp, NULL);
 		*vpp = newvp;
 		m_freem(mrep);
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		/*
 		 * ".." lookup: dvp is unlocked while the parent is fetched
 		 * with nfs_nget() and relocked afterwards.  The mount is
 		 * busied first; if that cannot be done without sleeping,
 		 * dvp is unlocked around a sleeping vfs_busy() with a
 		 * mount reference held.
 		 * NOTE(review): presumably the busy is what keeps the
 		 * mount from going away while dvp is unlocked -- confirm.
 		 */
 		ltype = VOP_ISLOCKED(dvp);
 		error = vfs_busy(mp, MBF_NOWAIT);
 		if (error != 0) {
 			vfs_ref(mp);
 			VOP_UNLOCK(dvp, 0);
 			error = vfs_busy(mp, 0);
 			vn_lock(dvp, ltype | LK_RETRY);
 			vfs_rel(mp);
 			if (error == 0 && (dvp->v_iflag & VI_DOOMED)) {
 				vfs_unbusy(mp);
 				error = ENOENT;
 			}
 			if (error != 0) {
 				m_freem(mrep);
 				return (error);
 			}
 		}
 		VOP_UNLOCK(dvp, 0);
 		error = nfs_nget(mp, fhp, fhsize, &np, cnp->cn_lkflags);
 		if (error == 0)
 			newvp = NFSTOV(np);
 		vfs_unbusy(mp);
 		if (newvp != dvp)
 			vn_lock(dvp, ltype | LK_RETRY);
 		/* dvp was reclaimed while unlocked: release and fail. */
 		if (dvp->v_iflag & VI_DOOMED) {
 			if (error == 0) {
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 			}
 			error = ENOENT;
 		}
 		if (error) {
 			m_freem(mrep);
 			return (error);
 		}
 	} else if (NFS_CMPFH(np, fhp, fhsize)) {
 		/* The name resolves to the directory itself. */
 		VREF(dvp);
 		newvp = dvp;
 	} else {
 		error = nfs_nget(mp, fhp, fhsize, &np, cnp->cn_lkflags);
 		if (error) {
 			m_freem(mrep);
 			return (error);
 		}
 		newvp = NFSTOV(np);
 
 		/*
 		 * Flush the attribute cache when opening a leaf node
 		 * to ensure that fresh attributes are fetched in
 		 * nfs_open() if we are unable to fetch attributes
 		 * from the LOOKUP reply.
 		 */
 		if ((flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
 		    !(np->n_flag & NMODIFIED)) {
 			mtx_lock(&np->n_mtx);
 			np->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
 			mtx_unlock(&np->n_mtx);
 		}
 	}
 	if (v3) {
 		nfsm_postop_attr_va(newvp, attrflag, &vattr);
 		nfsm_postop_attr_va(dvp, dattrflag, &dvattr);
 	} else {
 		nfsm_loadattr(newvp, &vattr);
 		attrflag = 1;
 	}
 	if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
 		cnp->cn_flags |= SAVENAME;
 	/*
 	 * Enter a positive name cache entry only when the attributes it is
 	 * timestamped with were actually received; for a directory the
 	 * parent's attributes are required as well.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) &&
 	    (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN)) &&
 	    attrflag != 0 && (newvp->v_type != VDIR || dattrflag != 0))
 		cache_enter_time(dvp, newvp, cnp, &vattr.va_ctime,
 		    newvp->v_type != VDIR ? NULL : &dvattr.va_ctime);
 	*vpp = newvp;
 	m_freem(mrep);
 nfsmout:
 	if (error) {
 		if (newvp != NULLVP) {
 			vput(newvp);
 			*vpp = NULLVP;
 		}
 
 		if (error != ENOENT)
 			goto done;
 
 		/* The requested file was not found. */
 		if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) &&
 		    (flags & ISLASTCN)) {
 			/*
 			 * XXX: UFS does a full VOP_ACCESS(dvp,
 			 * VWRITE) here instead of just checking
 			 * MNT_RDONLY.
 			 */
 			if (mp->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 
 		if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE &&
 		    dattrflag) {
 			/*
 			 * Cache the modification time of the parent
 			 * directory from the post-op attributes in
 			 * the name cache entry.  The negative cache
 			 * entry will be ignored once the directory
 			 * has changed.  Don't bother adding the entry
 			 * if the directory has already changed.
 			 */
 			mtx_lock(&np->n_mtx);
 			if (timespeccmp(&np->n_vattr.va_mtime,
 			    &vattr.va_mtime, ==)) {
 				mtx_unlock(&np->n_mtx);
 				cache_enter_time(dvp, NULL, cnp,
 				    &vattr.va_mtime, NULL);
 			} else
 				mtx_unlock(&np->n_mtx);
 		}
 		return (ENOENT);
 	}
 done:
 	return (error);
 }
 
 /*
  * nfs read call.
  * Just call nfs_bioread() to do the work.
  */
 static int
 nfs_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	switch (vp->v_type) {
 	case VREG:
 		return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
 	case VDIR:
 		return (EISDIR);
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /*
  * nfs readlink call
  * Symbolic links are read through nfs_bioread() with no I/O flags; any
  * other vnode type is rejected with EINVAL.
  */
 static int
 nfs_readlink(struct vop_readlink_args *ap)
 {
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	if (vp->v_type == VLNK) {
 		return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
 	}
 	return (EINVAL);
 }
 
 /*
  * Do a readlink rpc.
  * Called by nfs_doio() from below the buffer cache.
  *
  * Sends READLINK for vp and copies the returned link text into uiop.
  * Returns 0 or the error from the request/reply processing.
  */
 int
 nfs_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	struct mbuf *mreq, *mrep, *md, *mb;
 	struct nfsnode *np;
 	caddr_t bpos, dpos;
 	int attrflag, error, len, v3;
 
 	error = 0;
 	v3 = NFS_ISV3(vp);
 
 	nfsstats.rpccnt[NFSPROC_READLINK]++;
 	mreq = m_get2(NFSX_FH(v3), M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, v3);
 	nfsm_request(vp, NFSPROC_READLINK, uiop->uio_td, cred);
 	if (v3) {
 		nfsm_postop_attr(vp, attrflag);
 	}
 	if (error == 0) {
 		nfsm_strsiz(len, NFS_MAXPATHLEN);
 		/*
 		 * When the reply reports exactly NFS_MAXPATHLEN bytes,
 		 * use the node's recorded size instead, provided it is
 		 * non-zero and smaller.
 		 */
 		if (len == NFS_MAXPATHLEN) {
 			np = VTONFS(vp);
 			mtx_lock(&np->n_mtx);
 			if (np->n_size != 0 && np->n_size < NFS_MAXPATHLEN) {
 				len = np->n_size;
 			}
 			mtx_unlock(&np->n_mtx);
 		}
 		nfsm_mtouio(uiop, len);
 	}
 	m_freem(mrep);
 nfsmout:
 	return (error);
 }
 
 /*
  * nfs read rpc call
  * Like the readlink rpc, called by nfs_doio() from below the buffer cache.
  *
  * Reads uiop->uio_resid bytes starting at uiop->uio_offset from vp, using
  * as many READ RPCs of at most nm_rsize bytes as needed, and copies the
  * data into uiop.  Returns EFBIG when the end of the requested range
  * exceeds nm_maxfilesize or wraps around, otherwise 0 or the error from
  * the request/reply processing.  The loop ends early at end of file.
  */
 int
 nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	u_int32_t *tl;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	struct nfsmount *nmp;
 	off_t end;
 	int error = 0, len, retlen, tsiz, eof, attrflag;
 	int v3 = NFS_ISV3(vp);
 	int rsize;
 
 	/* eof is only assigned on the NFSv3 path below. */
 #ifndef nolint
 	eof = 0;
 #endif
 	nmp = VFSTONFS(vp->v_mount);
 	tsiz = uiop->uio_resid;
 	mtx_lock(&nmp->nm_mtx);
 	end = uiop->uio_offset + tsiz;
 	/* "end < uio_offset" catches wrap-around of the sum. */
 	if (end > nmp->nm_maxfilesize || end < uiop->uio_offset) {
 		mtx_unlock(&nmp->nm_mtx);
 		return (EFBIG);
 	}
 	/* Snapshot the read size under nm_mtx; it is used unlocked below. */
 	rsize = nmp->nm_rsize;
 	mtx_unlock(&nmp->nm_mtx);
 	while (tsiz > 0) {
 		nfsstats.rpccnt[NFSPROC_READ]++;
 		len = (tsiz > rsize) ? rsize : tsiz;
 		mreq = m_get2(NFSX_FH(v3) + NFSX_UNSIGNED * 3, M_WAITOK,
 		    MT_DATA, 0);
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
 		nfsm_fhtom(vp, v3);
 		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED * 3);
 		if (v3) {
 			/* NFSv3 arguments: 64-bit offset, then count. */
 			txdr_hyper(uiop->uio_offset, tl);
 			*(tl + 2) = txdr_unsigned(len);
 		} else {
 			/* NFSv2 arguments: 32-bit offset, count, zero word. */
 			*tl++ = txdr_unsigned(uiop->uio_offset);
 			*tl++ = txdr_unsigned(len);
 			*tl = 0;
 		}
 		nfsm_request(vp, NFSPROC_READ, uiop->uio_td, cred);
 		if (v3) {
 			nfsm_postop_attr(vp, attrflag);
 			if (error) {
 				m_freem(mrep);
 				goto nfsmout;
 			}
 			/* Two words precede the data; the second is eof. */
 			tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
 			eof = fxdr_unsigned(int, *(tl + 1));
 		} else {
 			nfsm_loadattr(vp, NULL);
 		}
 		nfsm_strsiz(retlen, rsize);
 		nfsm_mtouio(uiop, retlen);
 		m_freem(mrep);
 		tsiz -= retlen;
 		/*
 		 * End of file: NFSv3 reports it explicitly (or returns no
 		 * data); for NFSv2 a short read is taken to mean the same.
 		 */
 		if (v3) {
 			if (eof || retlen == 0) {
 				tsiz = 0;
 			}
 		} else if (retlen < len) {
 			tsiz = 0;
 		}
 	}
 nfsmout:
 	return (error);
 }
 
 /*
  * nfs write call
  *
  * Writes the single-iovec uio to vp with as many WRITE RPCs of at most
  * nm_wsize bytes as needed.
  *
  * iomode:      in  - stability level sent with each NFSv3 write
  *              out - lowest commitment level returned by any of the
  *                    RPCs (forced to NFSV3WRITE_FILESYNC when
  *                    DOINGASYNC(vp) is true)
  * must_commit: set to 1 when the server's write verifier differs from
  *              the one recorded in the mount, 0 otherwise
  *
  * Returns EFBIG when the end of the range exceeds nm_maxfilesize or wraps
  * around, otherwise 0 or the error from the request/reply processing.
  * On error uio_resid is set to the number of bytes not yet accounted for.
  */
 int
 nfs_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
 	     int *iomode, int *must_commit)
 {
 	u_int32_t *tl;
 	int32_t backup;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	off_t end;
 	int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit;
 	int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC;
 	int wsize;
 	
 	/* The short-write back-up code below only adjusts uio_iov[0]. */
 	KASSERT(uiop->uio_iovcnt == 1, ("nfs: writerpc iovcnt > 1"));
 	*must_commit = 0;
 	tsiz = uiop->uio_resid;
 	mtx_lock(&nmp->nm_mtx);
 	end = uiop->uio_offset + tsiz;
 	/* "end < uio_offset" catches wrap-around of the sum. */
 	if (end > nmp->nm_maxfilesize || end < uiop->uio_offset) {
 		mtx_unlock(&nmp->nm_mtx);		
 		return (EFBIG);
 	}
 	/* Snapshot the write size under nm_mtx; it is used unlocked below. */
 	wsize = nmp->nm_wsize;
 	mtx_unlock(&nmp->nm_mtx);
 	while (tsiz > 0) {
 		nfsstats.rpccnt[NFSPROC_WRITE]++;
 		len = (tsiz > wsize) ? wsize : tsiz;
 		mreq = m_get2(NFSX_FH(v3) + 5 * NFSX_UNSIGNED, M_WAITOK,
 		    MT_DATA, 0);
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
 		nfsm_fhtom(vp, v3);
 		if (v3) {
 			/*
 			 * NFSv3 arguments: 64-bit offset, count, stability
 			 * level, then the length of the data that follows.
 			 */
 			tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
 			txdr_hyper(uiop->uio_offset, tl);
 			tl += 2;
 			*tl++ = txdr_unsigned(len);
 			*tl++ = txdr_unsigned(*iomode);
 			*tl = txdr_unsigned(len);
 		} else {
 			u_int32_t x;
 
 			tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED);
 			/* Set both "begin" and "current" to non-garbage. */
 			x = txdr_unsigned((u_int32_t)uiop->uio_offset);
 			*tl++ = x;	/* "begin offset" */
 			*tl++ = x;	/* "current offset" */
 			x = txdr_unsigned(len);
 			*tl++ = x;	/* total to this offset */
 			*tl = x;	/* size of this write */
 		}
 		nfsm_uiotom(uiop, len);
 		nfsm_request(vp, NFSPROC_WRITE, uiop->uio_td, cred);
 		if (v3) {
 			wccflag = NFSV3_WCCCHK;
 			nfsm_wcc_data(vp, wccflag);
 			if (!error) {
 				tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED
 					+ NFSX_V3WRITEVERF);
 				rlen = fxdr_unsigned(int, *tl++);
 				if (rlen == 0) {
 					/* No progress at all is an I/O error. */
 					error = NFSERR_IO;
 					m_freem(mrep);
 					break;
 				} else if (rlen < len) {
 					/*
 					 * Short write: rewind the uio by the
 					 * part the server did not take so it
 					 * is sent again by the next RPC.
 					 */
 					backup = len - rlen;
 					uiop->uio_iov->iov_base =
 					    (char *)uiop->uio_iov->iov_base -
 					    backup;
 					uiop->uio_iov->iov_len += backup;
 					uiop->uio_offset -= backup;
 					uiop->uio_resid += backup;
 					len = rlen;
 				}
 				commit = fxdr_unsigned(int, *tl++);
 
 				/*
 				 * Return the lowest commitment level
 				 * obtained by any of the RPCs.
 				 */
 				if (committed == NFSV3WRITE_FILESYNC)
 					committed = commit;
 				else if (committed == NFSV3WRITE_DATASYNC &&
 					commit == NFSV3WRITE_UNSTABLE)
 					committed = commit;
 				/*
 				 * Record the first write verifier seen on
 				 * this mount; a later mismatch is reported
 				 * through *must_commit and the new verifier
 				 * is remembered.
 				 */
 				mtx_lock(&nmp->nm_mtx);
 				if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
 				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
 					NFSX_V3WRITEVERF);
 				    nmp->nm_state |= NFSSTA_HASWRITEVERF;
 				} else if (bcmp((caddr_t)tl,
 				    (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) {
 				    *must_commit = 1;
 				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
 					NFSX_V3WRITEVERF);
 				}
 				mtx_unlock(&nmp->nm_mtx);
 			}
 		} else {
 			nfsm_loadattr(vp, NULL);
 		}
 		/* Adopt the cached attribute mtime as the node's n_mtime. */
 		if (wccflag) {
 			mtx_lock(&(VTONFS(vp))->n_mtx);
 			VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime;
 			mtx_unlock(&(VTONFS(vp))->n_mtx);
 		}
 		m_freem(mrep);
 		if (error)
 			break;
 		tsiz -= len;
 	}
 nfsmout:
 	if (DOINGASYNC(vp))
 		committed = NFSV3WRITE_FILESYNC;
 	*iomode = committed;
 	if (error)
 		uiop->uio_resid = tsiz;
 	return (error);
 }
 
 /*
  * nfs mknod rpc
  * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
  * mode set to specify the file type and the size field for rdev.
  *
  * Creates the special file named by cnp in directory dvp with the
  * attributes in vap.  Only VCHR, VBLK, VFIFO and VSOCK are accepted; any
  * other type returns EOPNOTSUPP.  On success the new vnode is stored in
  * *vpp.  Once the RPC has been attempted the directory is marked
  * NMODIFIED, and its attribute cache is invalidated when wccflag ends up
  * clear.  Returns 0 or an error.
  */
 static int
 nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct vattr *vap)
 {
 	struct nfsv2_sattr *sp;
 	u_int32_t *tl;
 	struct vnode *newvp = NULL;
 	struct nfsnode *np = NULL;
 	struct vattr vattr;
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	u_int32_t rdev;
 	int v3 = NFS_ISV3(dvp);
 
 	/* rdev is what the NFSv2 request carries in the size field. */
 	if (vap->va_type == VCHR || vap->va_type == VBLK)
 		rdev = txdr_unsigned(vap->va_rdev);
 	else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
 		rdev = nfs_xdrneg1;
 	else {
 		return (EOPNOTSUPP);
 	}
 	/*
 	 * NOTE(review): vattr is not read again in this function; the call
 	 * appears to serve only as a check that dvp's attributes can be
 	 * obtained -- confirm.
 	 */
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
 		return (error);
 	nfsstats.rpccnt[NFSPROC_MKNOD]++;
 	mreq = m_get2(NFSX_FH(v3) + 4 * NFSX_UNSIGNED +
 	    nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3), M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
 	if (v3) {
 		/* NFSv3: file type, attributes, then major/minor for devices. */
 		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 		*tl++ = vtonfsv3_type(vap->va_type);
 		nfsm_v3attrbuild(vap, FALSE);
 		if (vap->va_type == VCHR || vap->va_type == VBLK) {
 			tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = txdr_unsigned(major(vap->va_rdev));
 			*tl = txdr_unsigned(minor(vap->va_rdev));
 		}
 	} else {
 		/* NFSv2: type goes in the mode bits, rdev in the size field. */
 		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
 		sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
 		sp->sa_uid = nfs_xdrneg1;
 		sp->sa_gid = nfs_xdrneg1;
 		sp->sa_size = rdev;
 		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 	}
 	nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_thread, cnp->cn_cred);
 	if (!error) {
 		nfsm_mtofh(dvp, newvp, v3, gotvp);
 		/*
 		 * The reply did not yield a usable vnode: look the new
 		 * name up explicitly.
 		 */
 		if (!gotvp) {
 			if (newvp) {
 				vput(newvp);
 				newvp = NULL;
 			}
 			error = nfs_lookitup(dvp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
 			if (!error)
 				newvp = NFSTOV(np);
 		}
 	}
 	if (v3)
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
 	if (error) {
 		if (newvp)
 			vput(newvp);
 	} else {
 		*vpp = newvp;
 	}
 	/* The directory has changed; drop its cached attributes if needed. */
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag) {
 		VTONFS(dvp)->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	return (error);
 }
 
 /*
  * nfs mknod vop
  * just call nfs_mknodrpc() to do the work.
  */
 /* ARGSUSED */
 static int
 nfs_mknod(struct vop_mknod_args *ap)
 {
 	return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap));
 }
 
 /*
  * Counter used to build the verifier of NFSv3 exclusive creates; it is
  * incremented for every exclusive CREATE request built below.
  * NOTE(review): no lock protecting it is visible in nfs_create().
  */
 static u_long create_verf;
 /*
  * nfs file create call
  *
  * Creates the file named by ap->a_cnp in directory ap->a_dvp with the
  * attributes in ap->a_vap and stores the new vnode in *ap->a_vpp.
  * Sockets are handed to nfs_mknodrpc().  With VA_EXCLUSIVE on NFSv3 an
  * exclusive CREATE is attempted first; if the server answers
  * NFSERR_NOTSUPP the request is retried as an unchecked create.  After a
  * successful exclusive create the attributes are applied with a
  * separate SETATTR RPC.  Once the RPC has been attempted the directory
  * is marked NMODIFIED.  Returns 0 or an error.
  */
 static int
 nfs_create(struct vop_create_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsv2_sattr *sp;
 	u_int32_t *tl;
 	struct nfsnode *np = NULL;
 	struct vnode *newvp = NULL;
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0, fmode = 0;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	struct vattr vattr;
 	int v3 = NFS_ISV3(dvp);
 
 	/*
 	 * Oops, not for me..
 	 */
 	if (vap->va_type == VSOCK) {
 		error = nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap);
 		return (error);
 	}
 
 	/*
 	 * NOTE(review): vattr is not read again in this function; the call
 	 * appears to serve only as a check that dvp's attributes can be
 	 * obtained -- confirm.
 	 */
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0) {
 		return (error);
 	}
 	if (vap->va_vaflags & VA_EXCLUSIVE)
 		fmode |= O_EXCL;
 	/* Re-entered with O_EXCL cleared when exclusive create is refused. */
 again:
 	nfsstats.rpccnt[NFSPROC_CREATE]++;
 	mreq = m_get2(NFSX_FH(v3) + 2 * NFSX_UNSIGNED +
 	    nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3), M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
 	if (v3) {
 		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 		if (fmode & O_EXCL) {
 			/*
 			 * Exclusive create: send a two-word verifier instead
 			 * of attributes.  The first word is the address of
 			 * the first entry on the IPv4 interface address
 			 * list when INET is configured and the list is not
 			 * empty, otherwise the current create_verf; the
 			 * second word is the incremented create_verf.
 			 */
 			*tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE);
 			tl = nfsm_build(u_int32_t *, NFSX_V3CREATEVERF);
 #ifdef INET
 			CURVNET_SET(CRED_TO_VNET(cnp->cn_cred));
 			IN_IFADDR_RLOCK();
 			if (!TAILQ_EMPTY(&V_in_ifaddrhead))
 				*tl++ = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr.s_addr;
 			else
 #endif
 				*tl++ = create_verf;
 #ifdef INET
 			IN_IFADDR_RUNLOCK();
 			CURVNET_RESTORE();
 #endif
 			*tl = ++create_verf;
 		} else {
 			*tl = txdr_unsigned(NFSV3CREATE_UNCHECKED);
 			nfsm_v3attrbuild(vap, FALSE);
 		}
 	} else {
 		/* NFSv2: hand-built sattr with uid/gid unset and size 0. */
 		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
 		sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
 		sp->sa_uid = nfs_xdrneg1;
 		sp->sa_gid = nfs_xdrneg1;
 		sp->sa_size = 0;
 		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 	}
 	nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_thread, cnp->cn_cred);
 	if (!error) {
 		nfsm_mtofh(dvp, newvp, v3, gotvp);
 		/*
 		 * The reply did not yield a usable vnode: look the new
 		 * name up explicitly.
 		 */
 		if (!gotvp) {
 			if (newvp) {
 				vput(newvp);
 				newvp = NULL;
 			}
 			error = nfs_lookitup(dvp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
 			if (!error)
 				newvp = NFSTOV(np);
 		}
 	}
 	if (v3)
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
 	if (error) {
 		/* Server lacks exclusive create: retry as unchecked. */
 		if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) {
 			fmode &= ~O_EXCL;
 			goto again;
 		}
 		if (newvp)
 			vput(newvp);
 	} else if (v3 && (fmode & O_EXCL)) {
 		/*
 		 * We are normally called with only a partially initialized
 		 * VAP.  Since the NFSv3 spec says that server may use the
 		 * file attributes to store the verifier, the spec requires
 		 * us to do a SETATTR RPC. FreeBSD servers store the verifier
 		 * in atime, but we can't really assume that all servers will
 		 * so we ensure that our SETATTR sets both atime and mtime.
 		 */
 		if (vap->va_mtime.tv_sec == VNOVAL)
 			vfs_timestamp(&vap->va_mtime);
 		if (vap->va_atime.tv_sec == VNOVAL)
 			vap->va_atime = vap->va_mtime;
 		error = nfs_setattrrpc(newvp, vap, cnp->cn_cred);
 		if (error)
 			vput(newvp);
 	}
 	if (!error) {
 		*ap->a_vpp = newvp;
 	}
 	/* The directory has changed; drop its cached attributes if needed. */
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag) {
 		VTONFS(dvp)->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	return (error);
 }
 
 /*
  * nfs file remove call
  * To try and make nfs semantics closer to ufs semantics, a file that has
  * other processes using the vnode is renamed instead of removed and then
  * removed later on the last close.
  * - If v_usecount > 1
  *	  If a rename is not already in the works
  *	     call nfs_sillyrename() to set it up
  *     else
  *	  do the remove rpc
  */
 static int
 nfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 	struct vattr vattr;
 
 	KASSERT((cnp->cn_flags & HASBUF) != 0, ("nfs_remove: no name"));
 	KASSERT(vrefcnt(vp) > 0, ("nfs_remove: bad v_usecount"));
 	if (vp->v_type == VDIR)
 		error = EPERM;
 	else if (vrefcnt(vp) == 1 || (np->n_sillyrename &&
 	    !VOP_GETATTR(vp, &vattr, cnp->cn_cred) && vattr.va_nlink > 1)) {
 		/*
 		 * Purge the name cache so that the chance of a lookup for
 		 * the name succeeding while the remove is in progress is
 		 * minimized. Without node locking it can still happen, such
 		 * that an I/O op returns ESTALE, but since you get this if
 		 * another host removes the file..
 		 */
 		cache_purge(vp);
 		/*
 		 * throw away biocache buffers, mainly to avoid
 		 * unnecessary delayed writes later.
 		 */
 		error = nfs_vinvalbuf(vp, 0, cnp->cn_thread, 1);
 		/* Do the rpc */
 		if (error != EINTR && error != EIO)
 			error = nfs_removerpc(dvp, cnp->cn_nameptr,
 				cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread);
 		/*
 		 * Kludge City: If the first reply to the remove rpc is lost..
 		 *   the reply to the retransmitted request will be ENOENT
 		 *   since the file was in fact removed
 		 *   Therefore, we cheat and return success.
 		 */
 		if (error == ENOENT)
 			error = 0;
 	} else if (!np->n_sillyrename)
 		error = nfs_sillyrename(dvp, vp, cnp);
 	mtx_lock(&np->n_mtx);
 	np->n_attrstamp = 0;
 	mtx_unlock(&np->n_mtx);
 	KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	return (error);
 }
 
 /*
  * nfs file remove rpc called from nfs_inactive
  */
 int
 nfs_removeit(struct sillyrename *sp)
 {
 	/*
 	 * Make sure that the directory vnode is still valid.
 	 * XXX we should lock sp->s_dvp here.
 	 */
 	if (sp->s_dvp->v_type == VBAD)
 		return (0);
 	return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 		NULL));
 }
 
 /*
  * Nfs remove rpc, called from nfs_remove() and nfs_removeit().
  *
  * Builds a REMOVE request holding the file handle of directory dvp and the
  * name (namelen bytes), sends it on behalf of td/cred and, for NFSv3,
  * processes the wcc data returned for the directory.  Whatever the outcome,
  * the directory is flagged NMODIFIED and, if wccflag ended up zero, its
  * attribute cache timestamp is cleared.
  *
  * Returns the value left in "error" by the request macros (0 on success).
  *
  * NOTE(review): the nfsm_* macros use mb, bpos, md, dpos, mreq, mrep and
  * error implicitly and presumably branch to nfsmout on failure; confirm
  * against their definitions before reordering anything here.
  */
 static int
 nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
     struct ucred *cred, struct thread *td)
 {
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3 = NFS_ISV3(dvp);
 
 	nfsstats.rpccnt[NFSPROC_REMOVE]++;
 	/* Request: directory file handle + length word + padded name. */
 	mreq = m_get2(NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen),
 	    M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(name, namelen, NFS_MAXNAMLEN);
 	nfsm_request(dvp, NFSPROC_REMOVE, td, cred);
 	if (v3)
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
 	/* The directory changed (or may have); update it under its mutex. */
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag) {
 		/* wccflag is zero here: drop the cached attributes. */
 		VTONFS(dvp)->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	return (error);
 }
 
 /*
  * nfs file rename call
  *
  * Renames a_fvp (name a_fcnp in directory a_fdvp) to the name a_tcnp in
  * directory a_tdvp; a_tvp, when non-NULL, is the vnode currently found
  * under the target name.  Dirty data of both files is pushed to the server
  * first and the rename is skipped if that fails.  A target that is still
  * in use is sillyrenamed before the rename rpc is done.
  *
  * Every exit path goes through "out", which releases all four vnodes, so
  * they are released whether or not the rename succeeded.
  * Returns 0 or an errno; ENOENT is mapped to 0.
  */
 static int
 nfs_rename(struct vop_rename_args *ap)
 {
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	int error;
 
 	KASSERT((tcnp->cn_flags & HASBUF) != 0 &&
 	    (fcnp->cn_flags & HASBUF) != 0, ("nfs_rename: no name"));
 	/* Check for cross-device rename */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		goto out;
 	}
 
 	/* Source and target are the same vnode: nothing to do. */
 	if (fvp == tvp) {
 		nfs_printf("nfs_rename: fvp == tvp (can't happen)\n");
 		error = 0;
 		goto out;
 	}
 	/* fvp is locked only for the duration of its fsync below. */
 	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 		goto out;
 
 	/*
 	 * We have to flush B_DELWRI data prior to renaming
 	 * the file.  If we don't, the delayed-write buffers
 	 * can be flushed out later after the file has gone stale
 	 * under NFSV3.  NFSV2 does not have this problem because
 	 * ( as far as I can tell ) it flushes dirty buffers more
 	 * often.
 	 *
 	 * Skip the rename operation if the fsync fails, this can happen
 	 * due to the server's volume being full, when we pushed out data
 	 * that was written back to our cache earlier. Not checking for
 	 * this condition can result in potential (silent) data loss.
 	 */
 	error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread);
 	VOP_UNLOCK(fvp, 0);
 	if (!error && tvp)
 		error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread);
 	if (error)
 		goto out;
 
 	/*
 	 * If the tvp exists and is in use, sillyrename it before doing the
 	 * rename of the new file over it.
 	 * XXX Can't sillyrename a directory.
 	 *
 	 * On a successful sillyrename tvp is released here and cleared, so
 	 * the "out" code below does not release it a second time.
 	 */
 	if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename &&
 		tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
 		vput(tvp);
 		tvp = NULL;
 	}
 
 	error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen,
 		tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
 		tcnp->cn_thread);
 
 	/*
 	 * A directory was renamed: purge the name cache of the source
 	 * directory, and of the target directory too when the target was
 	 * itself a directory.  This is done regardless of the rpc result.
 	 */
 	if (fvp->v_type == VDIR) {
 		if (tvp != NULL && tvp->v_type == VDIR)
 			cache_purge(tdvp);
 		cache_purge(fdvp);
 	}
 
 out:
 	/*
 	 * Release the vnodes: tdvp and tvp are released with vput(), except
 	 * that tdvp only gets vrele() when it is the same vnode as tvp;
 	 * fdvp and fvp are released with vrele().
 	 */
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp)
 		vput(tvp);
 	vrele(fdvp);
 	vrele(fvp);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nfs file rename rpc called from nfs_remove() above
  */
 static int
 nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
     struct sillyrename *sp)
 {
 
 	return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, sdvp,
 	    sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_thread));
 }
 
 /*
  * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
  *
  * Builds a RENAME request (source directory handle + source name, target
  * directory handle + target name), sends it on behalf of td/cred and, for
  * NFSv3, processes the wcc data returned for both directories.  Whatever
  * the outcome, both directories are flagged NMODIFIED and a directory
  * whose wcc flag ended up zero has its attribute cache timestamp cleared.
  *
  * fdvp and tdvp may be the same vnode (nfs_renameit() passes the same
  * directory twice); the two mutexes are therefore taken one after the
  * other, never nested.
  *
  * Returns the value left in "error" by the request macros (0 on success).
  *
  * NOTE(review): the nfsm_* macros use mb, bpos, md, dpos, mreq, mrep and
  * error implicitly and presumably branch to nfsmout on failure; confirm
  * against their definitions before reordering anything here.
  */
 static int
 nfs_renamerpc(struct vnode *fdvp, const char *fnameptr, int fnamelen,
     struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred,
     struct thread *td)
 {
 	caddr_t bpos, dpos;
 	int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3 = NFS_ISV3(fdvp);
 
 	nfsstats.rpccnt[NFSPROC_RENAME]++;
 	mreq = m_get2((NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) +
 	    nfsm_rndup(tnamelen), M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(fdvp, v3);
 	nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN);
 	nfsm_fhtom(tdvp, v3);
 	nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN);
 	nfsm_request(fdvp, NFSPROC_RENAME, td, cred);
 	if (v3) {
 		nfsm_wcc_data(fdvp, fwccflag);
 		nfsm_wcc_data(tdvp, twccflag);
 	}
 	m_freem(mrep);
 nfsmout:
 	/*
 	 * Update each directory node under its own mutex.  The attribute
 	 * timestamp is cleared inside the locked region, the same way
 	 * nfs_removerpc() does it, rather than after the mutex was dropped.
 	 */
 	mtx_lock(&(VTONFS(fdvp))->n_mtx);
 	VTONFS(fdvp)->n_flag |= NMODIFIED;
 	if (!fwccflag) {
 		VTONFS(fdvp)->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(fdvp);
 	}
 	mtx_unlock(&(VTONFS(fdvp))->n_mtx);
 	mtx_lock(&(VTONFS(tdvp))->n_mtx);
 	VTONFS(tdvp)->n_flag |= NMODIFIED;
 	if (!twccflag) {
 		VTONFS(tdvp)->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
 	}
 	mtx_unlock(&(VTONFS(tdvp))->n_mtx);
 	return (error);
 }
 
 /*
  * nfs hard link create call
  *
  * Creates a link to ap->a_vp under the name ap->a_cnp in directory
  * ap->a_tdvp.  Returns EXDEV when the two vnodes are on different mounts,
  * otherwise the value left in "error" by the request macros (0 on
  * success).
  *
  * After the rpc the target directory is flagged NMODIFIED; the attribute
  * cache timestamp of the directory and of the file is cleared when the
  * corresponding flag (wccflag / attrflag) ended up zero.
  *
  * NOTE(review): the nfsm_* macros use mb, bpos, md, dpos, mreq, mrep and
  * error implicitly and presumably branch to nfsmout on failure; confirm
  * against their definitions before reordering anything here.
  */
 static int
 nfs_link(struct vop_link_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3;
 
 	if (vp->v_mount != tdvp->v_mount) {
 		return (EXDEV);
 	}
 
 	/*
 	 * Push all writes to the server, so that the attribute cache
 	 * doesn't get "out of sync" with the server.
 	 * XXX There should be a better way!
 	 * The result of the fsync is deliberately not examined.
 	 */
 	VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread);
 
 	v3 = NFS_ISV3(vp);
 	nfsstats.rpccnt[NFSPROC_LINK]++;
 	/* Request: file handle, directory handle, length word, name. */
 	mreq = m_get2(NFSX_FH(v3)*2 + NFSX_UNSIGNED +
 	    nfsm_rndup(cnp->cn_namelen), M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, v3);
 	nfsm_fhtom(tdvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
 	nfsm_request(vp, NFSPROC_LINK, cnp->cn_thread, cnp->cn_cred);
 	if (v3) {
 		nfsm_postop_attr(vp, attrflag);
 		nfsm_wcc_data(tdvp, wccflag);
 	}
 	m_freem(mrep);
 nfsmout:
 	/*
 	 * Update each node under its own mutex.  The attribute timestamps
 	 * are cleared inside the locked regions, the same way
 	 * nfs_removerpc() does it, rather than without the mutex held.
 	 */
 	mtx_lock(&(VTONFS(tdvp))->n_mtx);
 	VTONFS(tdvp)->n_flag |= NMODIFIED;
 	if (!wccflag) {
 		VTONFS(tdvp)->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
 	}
 	mtx_unlock(&(VTONFS(tdvp))->n_mtx);
 	if (!attrflag) {
 		mtx_lock(&(VTONFS(vp))->n_mtx);
 		VTONFS(vp)->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 		mtx_unlock(&(VTONFS(vp))->n_mtx);
 	}
 	return (error);
 }
 
 /*
  * nfs symbolic link create call
  */
 static int
 nfs_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsv2_sattr *sp;
 	caddr_t bpos, dpos;
 	int slen, error = 0, wccflag = NFSV3_WCCRATTR, gotvp;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	struct vnode *newvp = NULL;
 	int v3 = NFS_ISV3(dvp);
 
 	nfsstats.rpccnt[NFSPROC_SYMLINK]++;
 	slen = strlen(ap->a_target);
 	mreq = m_get2(NFSX_FH(v3) + 2*NFSX_UNSIGNED +
 	    nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3),
 	    M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
 	if (v3) {
 		nfsm_v3attrbuild(vap, FALSE);
 	}
 	nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN);
 	if (!v3) {
 		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
 		sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode);
 		sp->sa_uid = nfs_xdrneg1;
 		sp->sa_gid = nfs_xdrneg1;
 		sp->sa_size = nfs_xdrneg1;
 		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 	}
 
 	/*
 	 * Issue the NFS request and get the rpc response.
 	 *
 	 * Only NFSv3 responses returning an error of 0 actually return
 	 * a file handle that can be converted into newvp without having
 	 * to do an extra lookup rpc.
 	 */
 	nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_thread, cnp->cn_cred);
 	if (v3) {
 		if (error == 0)
 			nfsm_mtofh(dvp, newvp, v3, gotvp);
 		nfsm_wcc_data(dvp, wccflag);
 	}
 
 	/*
 	 * out code jumps -> here, mrep is also freed.
 	 */
 
 	m_freem(mrep);
 nfsmout:
 
 	/*
 	 * If we do not have an error and we could not extract the newvp from
 	 * the response due to the request being NFSv2, we have to do a
 	 * lookup in order to obtain a newvp to return.
 	 */
 	if (error == 0 && newvp == NULL) {
 		struct nfsnode *np = NULL;
 
 		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 		    cnp->cn_cred, cnp->cn_thread, &np);
 		if (!error)
 			newvp = NFSTOV(np);
 	}
 	if (error) {
 		if (newvp)
 			vput(newvp);
 	} else {
 		*ap->a_vpp = newvp;
 	}
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	if (!wccflag) {
 		VTONFS(dvp)->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	return (error);
 }
 
 /*
  * nfs make dir call
  */
 static int
 nfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsv2_sattr *sp;
 	int len;
 	struct nfsnode *np = NULL;
 	struct vnode *newvp = NULL;
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR;
 	int gotvp = 0;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	struct vattr vattr;
 	int v3 = NFS_ISV3(dvp);
 
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
 		return (error);
 	len = cnp->cn_namelen;
 	nfsstats.rpccnt[NFSPROC_MKDIR]++;
 	mreq = m_get2(NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) +
 	    NFSX_SATTR(v3), M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
 	if (v3) {
 		nfsm_v3attrbuild(vap, FALSE);
 	} else {
 		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
 		sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode);
 		sp->sa_uid = nfs_xdrneg1;
 		sp->sa_gid = nfs_xdrneg1;
 		sp->sa_size = nfs_xdrneg1;
 		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 	}
 	nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_thread, cnp->cn_cred);
 	if (!error)
 		nfsm_mtofh(dvp, newvp, v3, gotvp);
 	if (v3)
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	if (!wccflag) {
 		VTONFS(dvp)->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	if (error == 0 && newvp == NULL) {
 		error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred,
 			cnp->cn_thread, &np);
 		if (!error) {
 			newvp = NFSTOV(np);
 			if (newvp->v_type != VDIR)
 				error = EEXIST;
 		}
 	}
 	if (error) {
 		if (newvp)
 			vput(newvp);
 	} else
 		*ap->a_vpp = newvp;
 	return (error);
 }
 
 /*
  * nfs remove directory call
  *
  * Removes the directory ap->a_vp, named ap->a_cnp, from directory
  * ap->a_dvp.  Returns EINVAL when asked to remove a directory from itself
  * (dvp == vp), otherwise the value left in "error" by the request macros,
  * with ENOENT mapped to 0.  The name cache entries of both vnodes are
  * purged regardless of the rpc result.
  *
  * NOTE(review): the nfsm_* macros use mb, bpos, md, dpos, mreq, mrep and
  * error implicitly and presumably branch to nfsmout on failure; confirm
  * against their definitions before reordering anything here.
  */
 static int
 nfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3 = NFS_ISV3(dvp);
 
 	if (dvp == vp)
 		return (EINVAL);
 	nfsstats.rpccnt[NFSPROC_RMDIR]++;
 	/* Request: directory file handle + length word + padded name. */
 	mreq = m_get2(NFSX_FH(v3) + NFSX_UNSIGNED +
 	    nfsm_rndup(cnp->cn_namelen), M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
 	nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_thread, cnp->cn_cred);
 	if (v3)
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
 	/*
 	 * Update the parent directory node under its mutex.  The attribute
 	 * timestamp is cleared inside the locked region, the same way
 	 * nfs_removerpc() does it, rather than after the mutex was dropped.
 	 */
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag) {
 		VTONFS(dvp)->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	cache_purge(dvp);
 	cache_purge(vp);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nfs readdir call
  */
 static int
 nfs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct uio *uio = ap->a_uio;
 	int tresid, error = 0;
 	struct vattr vattr;
 	
 	if (vp->v_type != VDIR) 
 		return(EPERM);
 
 	/*
 	 * First, check for hit on the EOF offset cache
 	 */
 	if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
 	    (np->n_flag & NMODIFIED) == 0) {
 		if (VOP_GETATTR(vp, &vattr, ap->a_cred) == 0) {
 			mtx_lock(&np->n_mtx);
 			if (!NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 				mtx_unlock(&np->n_mtx);
 				nfsstats.direofcache_hits++;
 				goto out;
 			} else
 				mtx_unlock(&np->n_mtx);
 		}
 	}
 
 	/*
 	 * Call nfs_bioread() to do the real work.
 	 */
 	tresid = uio->uio_resid;
 	error = nfs_bioread(vp, uio, 0, ap->a_cred);
 
 	if (!error && uio->uio_resid == tresid) {
 		nfsstats.direofcache_misses++;
 	}
 out:
 	return (error);
 }
 
 /*
  * Readdir rpc call.
  * Called from below the buffer cache by nfs_doio().
  *
  * Fills the single iovec of uiop with struct dirent records read from
  * directory vp, starting at the cookie recorded for uiop->uio_offset.
  * The uio must describe exactly one iovec and its offset and resid must be
  * multiples of DIRBLKSIZ (asserted).  Records are laid out so that none
  * straddles a DIRBLKSIZ boundary: the last record of each block has its
  * d_reclen stretched to the end of the block.
  *
  * Returns NFSERR_BAD_COOKIE when no cookie is known for the offset,
  * EBADRPC when the server sends a name length outside 1..NFS_MAXNAMLEN,
  * otherwise the value left in "error" by the request macros (0 on
  * success).
  *
  * NOTE(review): the nfsm_* macros use mb, bpos, md, dpos, mreq, mrep and
  * error implicitly and presumably branch to nfsmout on failure; confirm
  * against their definitions before reordering anything here.
  */
 int
 nfs_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	int len, left;
 	struct dirent *dp = NULL;
 	u_int32_t *tl;
 	caddr_t cp;
 	nfsuint64 *cookiep;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	nfsuint64 cookie;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *dnp = VTONFS(vp);
 	u_quad_t fileno;
 	int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
 	int attrflag;
 	int v3 = NFS_ISV3(vp);
 
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
 	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirrpc bad uio"));
 
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
 	nfs_dircookie_lock(dnp);
 	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
 	if (cookiep) {
 		cookie = *cookiep;
 		nfs_dircookie_unlock(dnp);
 	} else {
 		nfs_dircookie_unlock(dnp);
 		return (NFSERR_BAD_COOKIE);
 	}
 
 	/*
 	 * Loop around doing readdir rpc's of size nm_readdirsize
 	 * truncated to a multiple of DIRBLKSIZ.
 	 * The stopping criteria is EOF or buffer full.
 	 */
 	while (more_dirs && bigenough) {
 		nfsstats.rpccnt[NFSPROC_READDIR]++;
 		mreq = m_get2(NFSX_FH(v3) + NFSX_READDIR(v3), M_WAITOK,
 		    MT_DATA, 0);
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
 		nfsm_fhtom(vp, v3);
 		/*
 		 * v3 arguments: two cookie words, two cookie verifier words
 		 * and the count; v2 arguments: one cookie word and the
 		 * count.  The count is stored after the if/else.
 		 */
 		if (v3) {
 			tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
 			*tl++ = cookie.nfsuquad[0];
 			*tl++ = cookie.nfsuquad[1];
 			mtx_lock(&dnp->n_mtx);
 			*tl++ = dnp->n_cookieverf.nfsuquad[0];
 			*tl++ = dnp->n_cookieverf.nfsuquad[1];
 			mtx_unlock(&dnp->n_mtx);
 		} else {
 			tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = cookie.nfsuquad[0];
 		}
 		*tl = txdr_unsigned(nmp->nm_readdirsize);
 		nfsm_request(vp, NFSPROC_READDIR, uiop->uio_td, cred);
 		if (v3) {
 			nfsm_postop_attr(vp, attrflag);
 			if (!error) {
 				/* Remember the cookie verifier from the reply. */
 				tl = nfsm_dissect(u_int32_t *,
 				    2 * NFSX_UNSIGNED);
 				mtx_lock(&dnp->n_mtx);
 				dnp->n_cookieverf.nfsuquad[0] = *tl++;
 				dnp->n_cookieverf.nfsuquad[1] = *tl;
 				mtx_unlock(&dnp->n_mtx);
 			} else {
 				m_freem(mrep);
 				goto nfsmout;
 			}
 		}
 		/* "An entry follows" flag for the first entry. */
 		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 		more_dirs = fxdr_unsigned(int, *tl);
 
 		/* loop thru the dir entries, doctoring them to 4bsd form */
 		while (more_dirs && bigenough) {
 			/* File number (two words for v3, one for v2), name length. */
 			if (v3) {
 				tl = nfsm_dissect(u_int32_t *,
 				    3 * NFSX_UNSIGNED);
 				fileno = fxdr_hyper(tl);
 				len = fxdr_unsigned(int, *(tl + 2));
 			} else {
 				tl = nfsm_dissect(u_int32_t *,
 				    2 * NFSX_UNSIGNED);
 				fileno = fxdr_unsigned(u_quad_t, *tl++);
 				len = fxdr_unsigned(int, *tl);
 			}
 			if (len <= 0 || len > NFS_MAXNAMLEN) {
 				error = EBADRPC;
 				m_freem(mrep);
 				goto nfsmout;
 			}
 			tlen = nfsm_rndup(len);
 			if (tlen == len)
 				tlen += 4;	/* To ensure null termination */
 			/*
 			 * If the record does not fit in what is left of the
 			 * current DIRBLKSIZ block, give the remainder to the
 			 * previous record and start a new block.
 			 */
 			left = DIRBLKSIZ - blksiz;
 			if ((tlen + DIRHDSIZ) > left) {
 				dp->d_reclen += left;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + left;
 				uiop->uio_iov->iov_len -= left;
 				uiop->uio_offset += left;
 				uiop->uio_resid -= left;
 				blksiz = 0;
 			}
 			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
 				bigenough = 0;
 			if (bigenough) {
 				/* Header first, then the name, then the padding. */
 				dp = (struct dirent *)uiop->uio_iov->iov_base;
 				dp->d_fileno = (int)fileno;
 				dp->d_namlen = len;
 				dp->d_reclen = tlen + DIRHDSIZ;
 				dp->d_type = DT_UNKNOWN;
 				blksiz += dp->d_reclen;
 				if (blksiz == DIRBLKSIZ)
 					blksiz = 0;
 				uiop->uio_offset += DIRHDSIZ;
 				uiop->uio_resid -= DIRHDSIZ;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
 				uiop->uio_iov->iov_len -= DIRHDSIZ;
 				nfsm_mtouio(uiop, len);
 				cp = uiop->uio_iov->iov_base;
 				tlen -= len;
 				*cp = '\0';	/* null terminate */
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + tlen;
 				uiop->uio_iov->iov_len -= tlen;
 				uiop->uio_offset += tlen;
 				uiop->uio_resid -= tlen;
 			} else
 				nfsm_adv(nfsm_rndup(len));
 			/*
 			 * Cookie of this entry followed by the "entry
 			 * follows" flag of the next one.  The cookie is only
 			 * remembered for entries that were actually stored.
 			 */
 			if (v3) {
 				tl = nfsm_dissect(u_int32_t *,
 				    3 * NFSX_UNSIGNED);
 			} else {
 				tl = nfsm_dissect(u_int32_t *,
 				    2 * NFSX_UNSIGNED);
 			}
 			if (bigenough) {
 				cookie.nfsuquad[0] = *tl++;
 				if (v3)
 					cookie.nfsuquad[1] = *tl++;
 			} else if (v3)
 				tl += 2;
 			else
 				tl++;
 			more_dirs = fxdr_unsigned(int, *tl);
 		}
 		/*
 		 * If at end of rpc data, get the eof boolean
 		 */
 		if (!more_dirs) {
 			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			more_dirs = (fxdr_unsigned(int, *tl) == 0);
 		}
 		m_freem(mrep);
 	}
 	/*
 	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
 	 * by increasing d_reclen for the last record.
 	 */
 	if (blksiz > 0) {
 		left = DIRBLKSIZ - blksiz;
 		dp->d_reclen += left;
 		uiop->uio_iov->iov_base =
 		    (char *)uiop->uio_iov->iov_base + left;
 		uiop->uio_iov->iov_len -= left;
 		uiop->uio_offset += left;
 		uiop->uio_resid -= left;
 	}
 
 	/*
 	 * We are now either at the end of the directory or have filled the
 	 * block.  At the end of the directory the EOF offset is recorded;
 	 * otherwise the cookie to continue from is stored for the new
 	 * offset.
 	 */
 	if (bigenough)
 		dnp->n_direofoffset = uiop->uio_offset;
 	else {
 		if (uiop->uio_resid > 0)
 			nfs_printf("EEK! readdirrpc resid > 0\n");
 		nfs_dircookie_lock(dnp);
 		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
 		*cookiep = cookie;
 		nfs_dircookie_unlock(dnp);
 	}
 nfsmout:
 	return (error);
 }
 
 /*
  * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc().
  *
  * Fills the single iovec of uiop with struct dirent records read from
  * directory vp, starting at the cookie recorded for uiop->uio_offset, in
  * the same block layout as nfs_readdirrpc().  In addition, for entries that
  * come with both attributes and a file handle, the vnode is obtained, its
  * attributes are loaded, d_type is filled in and the name may be entered
  * into the name cache.
  *
  * Returns NFSERR_BAD_COOKIE when no cookie is known for the offset,
  * EBADRPC when the server sends a name length outside 1..NFS_MAXNAMLEN,
  * otherwise the value left in "error" (0 on success).
  *
  * NOTE(review): the nfsm_* macros use mb, bpos, md, dpos, mreq, mrep and
  * error implicitly and presumably branch to nfsmout on failure; confirm
  * against their definitions before reordering anything here.
  */
 int
 nfs_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	int len, left;
 	struct dirent *dp;
 	u_int32_t *tl;
 	caddr_t cp;
 	struct vnode *newvp;
 	nfsuint64 *cookiep;
 	caddr_t bpos, dpos, dpossav1, dpossav2;
 	struct mbuf *mreq, *mrep, *md, *mb, *mdsav1, *mdsav2;
 	struct nameidata nami, *ndp = &nami;
 	struct componentname *cnp = &ndp->ni_cnd;
 	nfsuint64 cookie;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *dnp = VTONFS(vp), *np;
 	struct vattr vattr, dvattr;
 	nfsfh_t *fhp;
 	u_quad_t fileno;
 	int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i;
 	int attrflag, dattrflag, fhsize;
 
 #ifndef nolint
 	dp = NULL;
 #endif
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
 	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirplusrpc bad uio"));
 	ndp->ni_dvp = vp;
 	newvp = NULLVP;
 
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
 	nfs_dircookie_lock(dnp);
 	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
 	if (cookiep) {
 		cookie = *cookiep;
 		nfs_dircookie_unlock(dnp);
 	} else {
 		nfs_dircookie_unlock(dnp);
 		return (NFSERR_BAD_COOKIE);
 	}
 	/*
 	 * Loop around doing readdir rpc's of size nm_readdirsize
 	 * truncated to a multiple of DIRBLKSIZ.
 	 * The stopping criteria is EOF or buffer full.
 	 */
 	while (more_dirs && bigenough) {
 		nfsstats.rpccnt[NFSPROC_READDIRPLUS]++;
 		mreq = m_get2(NFSX_FH(1) + 6 * NFSX_UNSIGNED, M_WAITOK,
 		    MT_DATA, 0);
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
 		nfsm_fhtom(vp, 1);
 		/*
 		 * Arguments: two cookie words, two cookie verifier words,
 		 * nm_readdirsize and nm_rsize.
 		 */
  		tl = nfsm_build(u_int32_t *, 6 * NFSX_UNSIGNED);
 		*tl++ = cookie.nfsuquad[0];
 		*tl++ = cookie.nfsuquad[1];
 		mtx_lock(&dnp->n_mtx);
 		*tl++ = dnp->n_cookieverf.nfsuquad[0];
 		*tl++ = dnp->n_cookieverf.nfsuquad[1];
 		mtx_unlock(&dnp->n_mtx);
 		*tl++ = txdr_unsigned(nmp->nm_readdirsize);
 		*tl = txdr_unsigned(nmp->nm_rsize);
 		nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_td, cred);
 		nfsm_postop_attr_va(vp, dattrflag, &dvattr);
 		if (error) {
 			m_freem(mrep);
 			goto nfsmout;
 		}
 		/* Cookie verifier (two words) and first "entry follows" flag. */
 		tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
 		mtx_lock(&dnp->n_mtx);
 		dnp->n_cookieverf.nfsuquad[0] = *tl++;
 		dnp->n_cookieverf.nfsuquad[1] = *tl++;
 		mtx_unlock(&dnp->n_mtx);
 		more_dirs = fxdr_unsigned(int, *tl);
 
 		/* loop thru the dir entries, doctoring them to 4bsd form */
 		while (more_dirs && bigenough) {
 			tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
 			fileno = fxdr_hyper(tl);
 			len = fxdr_unsigned(int, *(tl + 2));
 			if (len <= 0 || len > NFS_MAXNAMLEN) {
 				error = EBADRPC;
 				m_freem(mrep);
 				goto nfsmout;
 			}
 			tlen = nfsm_rndup(len);
 			if (tlen == len)
 				tlen += 4;	/* To ensure null termination*/
 			/*
 			 * If the record does not fit in what is left of the
 			 * current DIRBLKSIZ block, give the remainder to the
 			 * previous record and start a new block.
 			 */
 			left = DIRBLKSIZ - blksiz;
 			if ((tlen + DIRHDSIZ) > left) {
 				dp->d_reclen += left;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + left;
 				uiop->uio_iov->iov_len -= left;
 				uiop->uio_offset += left;
 				uiop->uio_resid -= left;
 				blksiz = 0;
 			}
 			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
 				bigenough = 0;
 			if (bigenough) {
 				/*
 				 * Header first, then the name, then the
 				 * padding.  cnp is pointed at the name inside
 				 * the user buffer for the name cache entry
 				 * made further down.
 				 */
 				dp = (struct dirent *)uiop->uio_iov->iov_base;
 				dp->d_fileno = (int)fileno;
 				dp->d_namlen = len;
 				dp->d_reclen = tlen + DIRHDSIZ;
 				dp->d_type = DT_UNKNOWN;
 				blksiz += dp->d_reclen;
 				if (blksiz == DIRBLKSIZ)
 					blksiz = 0;
 				uiop->uio_offset += DIRHDSIZ;
 				uiop->uio_resid -= DIRHDSIZ;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
 				uiop->uio_iov->iov_len -= DIRHDSIZ;
 				cnp->cn_nameptr = uiop->uio_iov->iov_base;
 				cnp->cn_namelen = len;
 				nfsm_mtouio(uiop, len);
 				cp = uiop->uio_iov->iov_base;
 				tlen -= len;
 				*cp = '\0';
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + tlen;
 				uiop->uio_iov->iov_len -= tlen;
 				uiop->uio_offset += tlen;
 				uiop->uio_resid -= tlen;
 			} else
 				nfsm_adv(nfsm_rndup(len));
 			/*
 			 * Cookie (two words) and the "attributes follow"
 			 * flag.  The cookie is only remembered for entries
 			 * that were actually stored.
 			 */
 			tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
 			if (bigenough) {
 				cookie.nfsuquad[0] = *tl++;
 				cookie.nfsuquad[1] = *tl++;
 			} else
 				tl += 2;
 
 			/*
 			 * Since the attributes are before the file handle
 			 * (sigh), we must skip over the attributes and then
 			 * come back and get them.
 			 */
 			attrflag = fxdr_unsigned(int, *tl);
 			if (attrflag) {
 			    /* Remember where the attributes start. */
 			    dpossav1 = dpos;
 			    mdsav1 = md;
 			    nfsm_adv(NFSX_V3FATTR);
 			    /* doit: the "file handle follows" flag. */
 			    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			    doit = fxdr_unsigned(int, *tl);
 			    /*
 			     * Skip loading the attrs for "..". There's a
 			     * race between loading the attrs here and
 			     * lookups that look for the directory currently
 			     * being read (in the parent). We try to acquire
 			     * the exclusive lock on ".." here, owning the
 			     * lock on the directory being read. Lookup will
 			     * hold the lock on ".." and try to acquire the
 			     * lock on the directory being read.
 			     *
 			     * There are other ways of fixing this, one would
 			     * be to do a trylock on the ".." vnode and skip
 			     * loading the attrs on ".." if it happens to be
 			     * locked by another process. But skipping the
 			     * attrload on ".." seems the easiest option.
 			     *
 			     * NOTE(review): dp is only updated while
 			     * bigenough is set, so once the buffer is full
 			     * this test looks at the previously stored
 			     * entry -- confirm that is acceptable.
 			     * NOTE(review): this branch consumes a file
 			     * handle length word without looking at the
 			     * "file handle follows" flag just read into
 			     * doit -- confirm the server always sends a
 			     * handle for "..".
 			     */
  			    if (strcmp(dp->d_name, "..") == 0) {
  				    doit = 0;
  				    /*
  				     * We've already skipped over the attrs,
  				     * skip over the filehandle. And store d_type
  				     * as VDIR.
  				     */
  				    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
  				    i = fxdr_unsigned(int, *tl);
  				    nfsm_adv(nfsm_rndup(i));
  				    dp->d_type = IFTODT(VTTOIF(VDIR));
  			    }
 			    if (doit) {
 				/*
 				 * Get the vnode for the handle: the directory
 				 * itself gets an extra reference, anything
 				 * else comes from nfs_nget() exclusively
 				 * locked.  An nfs_nget() failure only turns
 				 * off the attribute load for this entry.
 				 */
 				nfsm_getfh(fhp, fhsize, 1);
 				if (NFS_CMPFH(dnp, fhp, fhsize)) {
 				    VREF(vp);
 				    newvp = vp;
 				    np = dnp;
 				} else {
 				    error = nfs_nget(vp->v_mount, fhp,
 					fhsize, &np, LK_EXCLUSIVE);
 				    if (error)
 					doit = 0;
 				    else
 					newvp = NFSTOV(np);
 				}
 			    }
 			    if (doit && bigenough) {
 				/*
 				 * Rewind to the saved attribute position,
 				 * load the attributes, then restore the
 				 * parse position.
 				 */
 				dpossav2 = dpos;
 				dpos = dpossav1;
 				mdsav2 = md;
 				md = mdsav1;
 				nfsm_loadattr(newvp, &vattr);
 				dpos = dpossav2;
 				md = mdsav2;
 				dp->d_type = IFTODT(VTTOIF(vattr.va_type));
 				ndp->ni_vp = newvp;
 				/*
 				 * Directories are only entered when the
 				 * directory attributes came back with the
 				 * reply (dattrflag), since their ctime is
 				 * passed along.
 				 */
 				if (newvp->v_type != VDIR || dattrflag != 0)
 				    cache_enter_time(ndp->ni_dvp, ndp->ni_vp,
 					cnp, &vattr.va_ctime,
 					newvp->v_type != VDIR ? NULL :
 					&dvattr.va_ctime);
 			    }
 			} else {
 			    /* Just skip over the file handle */
 			    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			    i = fxdr_unsigned(int, *tl);
 			    if (i) {
 				    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 				    fhsize = fxdr_unsigned(int, *tl);
 				    nfsm_adv(nfsm_rndup(fhsize));
 			    }
 			}
 			/* Done with this entry's vnode, if one was obtained. */
 			if (newvp != NULLVP) {
 			    if (newvp == vp)
 				vrele(newvp);
 			    else
 				vput(newvp);
 			    newvp = NULLVP;
 			}
 			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			more_dirs = fxdr_unsigned(int, *tl);
 		}
 		/*
 		 * If at end of rpc data, get the eof boolean
 		 */
 		if (!more_dirs) {
 			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			more_dirs = (fxdr_unsigned(int, *tl) == 0);
 		}
 		m_freem(mrep);
 	}
 	/*
 	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
 	 * by increasing d_reclen for the last record.
 	 */
 	if (blksiz > 0) {
 		left = DIRBLKSIZ - blksiz;
 		dp->d_reclen += left;
 		uiop->uio_iov->iov_base =
 		    (char *)uiop->uio_iov->iov_base + left;
 		uiop->uio_iov->iov_len -= left;
 		uiop->uio_offset += left;
 		uiop->uio_resid -= left;
 	}
 
 	/*
 	 * We are now either at the end of the directory or have filled the
 	 * block.  At the end of the directory the EOF offset is recorded;
 	 * otherwise the cookie to continue from is stored for the new
 	 * offset.
 	 */
 	if (bigenough)
 		dnp->n_direofoffset = uiop->uio_offset;
 	else {
 		if (uiop->uio_resid > 0)
 			nfs_printf("EEK! readdirplusrpc resid > 0\n");
 		nfs_dircookie_lock(dnp);
 		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
 		*cookiep = cookie;
 		nfs_dircookie_unlock(dnp);
 	}
 nfsmout:
 	/* Release a vnode left over from an entry that was cut short. */
 	if (newvp != NULLVP) {
 	        if (newvp == vp)
 			vrele(newvp);
 		else
 			vput(newvp);
 		newvp = NULLVP;
 	}
 	return (error);
 }
 
 /*
  * Silly rename. To make the NFS filesystem that is stateless look a little
  * more like the "ufs" a remove of an active vnode is translated to a rename
  * to a funny looking filename that is removed by nfs_inactive on the
  * nfsnode. There is the potential for another process on a different client
  * to create the same funny name between the nfs_lookitup() fails and the
  * nfs_rename() completes, but...
  */
 static int
 nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 	struct sillyrename *sp;
 	struct nfsnode *np;
 	int error;
 	short pid;
 	unsigned int lticks;
 
 	cache_purge(dvp);
 	np = VTONFS(vp);
 	KASSERT(vp->v_type != VDIR, ("nfs: sillyrename dir"));
 	sp = malloc(sizeof (struct sillyrename),
 		M_NFSREQ, M_WAITOK);
 	sp->s_cred = crhold(cnp->cn_cred);
 	sp->s_dvp = dvp;
 	sp->s_removeit = nfs_removeit;
 	VREF(dvp);
 
 	/* 
 	 * Fudge together a funny name.
 	 * Changing the format of the funny name to accomodate more 
 	 * sillynames per directory.
 	 * The name is now changed to .nfs.<ticks>.<pid>.4, where ticks is 
 	 * CPU ticks since boot.
 	 */
 	pid = cnp->cn_thread->td_proc->p_pid;
 	lticks = (unsigned int)ticks;
 	for ( ; ; ) {
 		sp->s_namlen = sprintf(sp->s_name, 
 				       ".nfs.%08x.%04x4.4", lticks, 
 				       pid);
 		if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 				 cnp->cn_thread, NULL))
 			break;
 		lticks++;
 	}
 	error = nfs_renameit(dvp, cnp, sp);
 	if (error)
 		goto bad;
 	error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 		cnp->cn_thread, &np);
 	np->n_sillyrename = sp;
 	return (0);
 bad:
 	vrele(sp->s_dvp);
 	crfree(sp->s_cred);
 	free((caddr_t)sp, M_NFSREQ);
 	return (error);
 }
 
 /*
  * Look up a file name and optionally either update the file handle or
  * allocate an nfsnode, depending on the value of npp.
  * npp == NULL	--> just do the lookup
  * *npp == NULL --> allocate a new nfsnode and make sure attributes are
  *			handled too
  * *npp != NULL --> update the file handle in the vnode
  *
  * dvp is the directory, name/len the component to look up, and td/cred
  * identify the caller for the rpc.  Returns 0 on success or an error; in
  * the "*npp == NULL" case the new nfsnode is stored in *npp on success
  * and any vnode obtained is released on failure.
  *
  * NOTE(review): the nfsm_* macros use mb, bpos, md, dpos, mreq, mrep and
  * error implicitly and presumably branch to nfsmout on failure; confirm
  * against their definitions before reordering anything here.
  */
 static int
 nfs_lookitup(struct vnode *dvp, const char *name, int len, struct ucred *cred,
     struct thread *td, struct nfsnode **npp)
 {
 	struct vnode *newvp = NULL;
 	struct nfsnode *np, *dnp = VTONFS(dvp);
 	caddr_t bpos, dpos;
 	int error = 0, fhlen, attrflag;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	nfsfh_t *nfhp;
 	int v3 = NFS_ISV3(dvp);
 
 	nfsstats.rpccnt[NFSPROC_LOOKUP]++;
 	/* Request: directory file handle + length word + padded name. */
 	mreq = m_get2(NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len),
 	    M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(name, len, NFS_MAXNAMLEN);
 	nfsm_request(dvp, NFSPROC_LOOKUP, td, cred);
 	if (npp && !error) {
 		nfsm_getfh(nfhp, fhlen, v3);
 		if (*npp) {
 		    /*
 		     * Update the handle of the caller's node in place,
 		     * switching between the embedded small buffer and a
 		     * separately allocated one when the size class changes.
 		     */
 		    np = *npp;
 		    if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) {
 			free((caddr_t)np->n_fhp, M_NFSBIGFH);
 			np->n_fhp = &np->n_fh;
 		    } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH)
 			np->n_fhp =(nfsfh_t *)malloc(fhlen, M_NFSBIGFH, M_WAITOK);
 		    bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen);
 		    np->n_fhsize = fhlen;
 		    newvp = NFSTOV(np);
 		} else if (NFS_CMPFH(dnp, nfhp, fhlen)) {
 		    /* The name refers to the directory itself. */
 		    VREF(dvp);
 		    newvp = dvp;
 		} else {
 		    error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE);
 		    if (error) {
 			m_freem(mrep);
 			return (error);
 		    }
 		    newvp = NFSTOV(np);
 		}
 		if (v3) {
 			nfsm_postop_attr(newvp, attrflag);
 			/*
 			 * A node obtained here that came back without
 			 * attributes is released and reported as ENOENT.
 			 */
 			if (!attrflag && *npp == NULL) {
 				m_freem(mrep);
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 				return (ENOENT);
 			}
 		} else
 			nfsm_loadattr(newvp, NULL);
 	}
 	m_freem(mrep);
 nfsmout:
 	/*
 	 * Only the "*npp == NULL" case hands a node back or has a vnode of
 	 * its own to release on error.
 	 */
 	if (npp && *npp == NULL) {
 		if (error) {
 			if (newvp) {
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 			}
 		} else
 			*npp = np;
 	}
 	return (error);
 }
 
 /*
  * Nfs Version 3 commit rpc
  *
  * Sends a COMMIT for cnt bytes starting at offset of file vp on behalf of
  * td/cred.  Returns 0 without doing an rpc when the mount has no write
  * verifier recorded (NFSSTA_HASWRITEVERF clear).  If the verifier in the
  * reply differs from the one stored in the mount, the stored one is
  * replaced and NFSERR_STALEWRITEVERF is returned; otherwise the value left
  * in "error" by the request macros is returned (0 on success).
  *
  * NOTE(review): the nfsm_* macros use mb, bpos, md, dpos, mreq, mrep and
  * error implicitly and presumably branch to nfsmout on failure; confirm
  * against their definitions before reordering anything here.
  */
 int
 nfs_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
 	   struct thread *td)
 {
 	u_int32_t *tl;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR;
 	struct mbuf *mreq, *mrep, *md, *mb;
 
 	/* The mount state is only examined under the mount mutex. */
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		return (0);
 	}
 	mtx_unlock(&nmp->nm_mtx);
 	nfsstats.rpccnt[NFSPROC_COMMIT]++;
 	mreq = m_get2(NFSX_FH(1), M_WAITOK, MT_DATA, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, 1);
 	/* Arguments: 64-bit offset (two words) followed by the count. */
 	tl = nfsm_build(u_int32_t *, 3 * NFSX_UNSIGNED);
 	txdr_hyper(offset, tl);
 	tl += 2;
 	*tl = txdr_unsigned(cnt);
 	nfsm_request(vp, NFSPROC_COMMIT, td, cred);
 	nfsm_wcc_data(vp, wccflag);
 	if (!error) {
 		/* Compare the returned write verifier with the stored one. */
 		tl = nfsm_dissect(u_int32_t *, NFSX_V3WRITEVERF);
 		if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl,
 			NFSX_V3WRITEVERF)) {
 			bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
 				NFSX_V3WRITEVERF);
 			error = NFSERR_STALEWRITEVERF;
 		}
 	}
 	m_freem(mrep);
 nfsmout:
 	return (error);
 }
 
 /*
  * Strategy routine.
  * For async requests when nfsiod(s) are running, queue the request by
  * calling nfs_asyncio(), otherwise just call nfs_doio() to do the
  * request.
  */
 static int
 nfs_strategy(struct vop_strategy_args *ap)
 {
 	struct buf *bp = ap->a_bp;
 	struct ucred *cr;
 
 	KASSERT(!(bp->b_flags & B_DONE),
 	    ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
 	BUF_ASSERT_HELD(bp);
 
 	if (bp->b_iocmd == BIO_READ)
 		cr = bp->b_rcred;
 	else
 		cr = bp->b_wcred;
 
 	/*
 	 * If the op is asynchronous and an i/o daemon is waiting
 	 * queue the request, wake it up and wait for completion
 	 * otherwise just do it ourselves.
 	 */
 	if ((bp->b_flags & B_ASYNC) == 0 ||
 	    nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, curthread))
 		(void)nfs_doio(ap->a_vp, bp, cr, curthread);
 	return (0);
 }
 
 /*
  * fsync vnode op. Just call nfs_flush() with commit == 1.
  */
 /* ARGSUSED */
 static int
 nfs_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int waitfor = ap->a_waitfor;
 
 	/* The trailing 1 is nfs_flush()'s "commit" argument. */
 	return (nfs_flush(vp, waitfor, 1));
 }
 
 /*
  * Flush all the blocks associated with a vnode.
  * 	Walk through the buffer pool and push any dirty pages
  *	associated with the vnode.
  */
 static int
 nfs_flush(struct vnode *vp, int waitfor, int commit)
 {
 	/*
 	 * vp      - vnode whose dirty buffers are pushed.
 	 * waitfor - when MNT_WAIT, sleep for contended buffers and wait for
 	 *           outstanding output and direct-I/O async writes to drain.
 	 * commit  - when non-zero (and the vnode is NFSv3), gather buffers
 	 *           flagged B_DELWRI | B_NEEDCOMMIT and send commit rpc(s)
 	 *           for them before writing the remaining dirty buffers.
 	 *
 	 * Returns 0, EINTR when interrupted, or the write error latched in
 	 * the nfsnode (np->n_error when NWRITEERR is set).
 	 */
 	struct nfsnode *np = VTONFS(vp);
 	struct buf *bp;
 	int i;
 	struct buf *nbp;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
 	int passone = 1;
 	u_quad_t off, endoff, toff;
 	struct ucred* wcred = NULL;
 	struct buf **bvec = NULL;
 	struct bufobj *bo;
 	struct thread *td = curthread;
 #ifndef NFS_COMMITBVECSIZ
 #define NFS_COMMITBVECSIZ	20
 #endif
 	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
 	int bvecsize = 0, bveccount;
 
 	if (nmp->nm_flag & NFSMNT_INT)
 		slpflag = PCATCH;
 	if (!commit)
 		passone = 0;
 	bo = &vp->v_bufobj;
 	/*
 	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
 	 * server, but has not been committed to stable storage on the server
 	 * yet. On the first pass, the byte range is worked out and the commit
 	 * rpc is done. On the second pass, nfs_writebp() is called to do the
 	 * job.
 	 */
 again:
 	off = (u_quad_t)-1;
 	endoff = 0;
 	bvecpos = 0;
 	if (NFS_ISV3(vp) && commit) {
 		/* Drop a heap vector left over from a previous pass. */
 		if (bvec != NULL && bvec != bvec_on_stack)
 			free(bvec, M_TEMP);
 		/*
 		 * Count up how many buffers waiting for a commit.
 		 */
 		bveccount = 0;
 		BO_LOCK(bo);
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (!BUF_ISLOCKED(bp) &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
 				bveccount++;
 		}
 		/*
 		 * Allocate space to remember the list of bufs to commit.  It is
 		 * important to use M_NOWAIT here to avoid a race with nfs_write.
 		 * If we can't get memory (for whatever reason), we will end up
 		 * committing the buffers one-by-one in the loop below.
 		 */
 		if (bveccount > NFS_COMMITBVECSIZ) {
 			/*
 			 * Release the vnode interlock to avoid a lock
 			 * order reversal.
 			 */
 			BO_UNLOCK(bo);
 			bvec = (struct buf **)
 				malloc(bveccount * sizeof(struct buf *),
 				       M_TEMP, M_NOWAIT);
 			BO_LOCK(bo);
 			if (bvec == NULL) {
 				bvec = bvec_on_stack;
 				bvecsize = NFS_COMMITBVECSIZ;
 			} else
 				bvecsize = bveccount;
 		} else {
 			bvec = bvec_on_stack;
 			bvecsize = NFS_COMMITBVECSIZ;
 		}
 		/*
 		 * Collect up to bvecsize lockable buffers that still need a
 		 * commit and track the byte range [off, endoff) they cover.
 		 */
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bvecpos >= bvecsize)
 				break;
 			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
 			    (B_DELWRI | B_NEEDCOMMIT)) {
 				BUF_UNLOCK(bp);
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			BO_UNLOCK(bo);
 			bremfree(bp);
 			/*
 			 * Work out if all buffers are using the same cred
 			 * so we can deal with them all with one commit.
 			 *
 			 * NOTE: we are not clearing B_DONE here, so we have
 			 * to do it later on in this routine if we intend to
 			 * initiate I/O on the bp.
 			 *
 			 * Note: to avoid loopback deadlocks, we do not
 			 * assign b_runningbufspace.
 			 */
 			if (wcred == NULL)
 				wcred = bp->b_wcred;
 			else if (wcred != bp->b_wcred)
 				wcred = NOCRED;
 			vfs_busy_pages(bp, 1);
 
 			BO_LOCK(bo);
 			/*
 			 * bp is protected by being locked, but nbp is not
 			 * and vfs_busy_pages() may sleep.  We have to
 			 * recalculate nbp.
 			 */
 			nbp = TAILQ_NEXT(bp, b_bobufs);
 
 			/*
 			 * A list of these buffers is kept so that the
 			 * second loop knows which buffers have actually
 			 * been committed. This is necessary, since there
 			 * may be a race between the commit rpc and new
 			 * uncommitted writes on the file.
 			 */
 			bvec[bvecpos++] = bp;
 			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 				bp->b_dirtyoff;
 			if (toff < off)
 				off = toff;
 			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
 			if (toff > endoff)
 				endoff = toff;
 		}
 		BO_UNLOCK(bo);
 	}
 	if (bvecpos > 0) {
 		/*
 		 * Commit data on the server, as required.
 		 * If all bufs are using the same wcred, then use that with
 		 * one call for all of them, otherwise commit each one
 		 * separately.
 		 */
 		if (wcred != NOCRED)
 			retv = nfs_commit(vp, off, (int)(endoff - off),
 					  wcred, td);
 		else {
 			retv = 0;
 			for (i = 0; i < bvecpos; i++) {
 				/*
 				 * NB: this "off" shadows the function-level
 				 * "off"; it is the per-buffer start offset.
 				 */
 				off_t off, size;
 				bp = bvec[i];
 				off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 					bp->b_dirtyoff;
 				size = (u_quad_t)(bp->b_dirtyend
 						  - bp->b_dirtyoff);
 				retv = nfs_commit(vp, off, (int)size,
 						  bp->b_wcred, td);
 				if (retv) break;
 			}
 		}
 
 		if (retv == NFSERR_STALEWRITEVERF)
 			nfs_clearcommit(vp->v_mount);
 
 		/*
 		 * Now, either mark the blocks I/O done or mark the
 		 * blocks dirty, depending on whether the commit
 		 * succeeded.
 		 */
 		for (i = 0; i < bvecpos; i++) {
 			bp = bvec[i];
 			bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			if (retv) {
 				/*
 				 * Error, leave B_DELWRI intact
 				 */
 				vfs_unbusy_pages(bp);
 				brelse(bp);
 			} else {
 				/*
 				 * Success, remove B_DELWRI ( bundirty() ).
 				 *
 				 * b_dirtyoff/b_dirtyend seem to be NFS
 				 * specific.  We should probably move that
 				 * into bundirty(). XXX
 				 */
 				bufobj_wref(bo);
 				bp->b_flags |= B_ASYNC;
 				bundirty(bp);
 				bp->b_flags &= ~B_DONE;
 				bp->b_ioflags &= ~BIO_ERROR;
 				bp->b_dirtyoff = bp->b_dirtyend = 0;
 				bufdone(bp);
 			}
 		}
 	}
 
 	/*
 	 * Start/do any write(s) that are required.
 	 */
 loop:
 	BO_LOCK(bo);
 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 			/*
 			 * Buffer is busy.  Only sleep for it when the caller
 			 * asked to wait and this is not the first pass.
 			 */
 			if (waitfor != MNT_WAIT || passone)
 				continue;
 
 			error = BUF_TIMELOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo), "nfsfsync", slpflag, slptimeo);
 			if (error == 0) {
 				BUF_UNLOCK(bp);
 				goto loop;
 			}
 			if (error == ENOLCK) {
 				error = 0;
 				goto loop;
 			}
 			if (nfs_sigintr(nmp, td)) {
 				error = EINTR;
 				goto done;
 			}
 			/*
 			 * After one interruptible sleep, switch to
 			 * uninterruptible sleeps with a 2 second timeout.
 			 */
 			if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			}
 			goto loop;
 		}
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("nfs_fsync: not dirty");
 		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		BO_UNLOCK(bo);
 		bremfree(bp);
 		/* NB: both branches below set the same flag. */
 		if (passone || !commit)
 		    bp->b_flags |= B_ASYNC;
 		else
 		    bp->b_flags |= B_ASYNC;
 		bwrite(bp);
 		if (nfs_sigintr(nmp, td)) {
 			error = EINTR;
 			goto done;
 		}
 		/* The list may have changed while unlocked; rescan. */
 		goto loop;
 	}
 	if (passone) {
 		passone = 0;
 		BO_UNLOCK(bo);
 		goto again;
 	}
 	if (waitfor == MNT_WAIT) {
 		while (bo->bo_numoutput) {
 			error = bufobj_wwait(bo, slpflag, slptimeo);
 			if (error) {
 			    BO_UNLOCK(bo);
 			    error = nfs_sigintr(nmp, td);
 			    if (error)
 				goto done;
 			    if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			    }
 			    BO_LOCK(bo);
 			}
 		}
 		if (bo->bo_dirty.bv_cnt != 0 && commit) {
 			BO_UNLOCK(bo);
 			goto loop;
 		}
 		/*
 		 * Wait for all the async IO requests to drain
 		 */
 		BO_UNLOCK(bo);
 		mtx_lock(&np->n_mtx);
 		while (np->n_directio_asyncwr > 0) {
 			np->n_flag |= NFSYNCWAIT;
 			error = nfs_msleep(td, (caddr_t)&np->n_directio_asyncwr,
 					   &np->n_mtx, slpflag | (PRIBIO + 1), 
 					   "nfsfsync", 0);
 			if (error) {
 				if (nfs_sigintr(nmp, td)) {
 					mtx_unlock(&np->n_mtx);
 					error = EINTR;	
 					goto done;
 				}
 			}
 		}
 		mtx_unlock(&np->n_mtx);
 	} else
 		BO_UNLOCK(bo);
 	/* Report (and clear) any write error latched in the nfsnode. */
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NWRITEERR) {
 		error = np->n_error;
 		np->n_flag &= ~NWRITEERR;
 	}
   	if (commit && bo->bo_dirty.bv_cnt == 0 &&
 	    bo->bo_numoutput == 0 && np->n_directio_asyncwr == 0)
   		np->n_flag &= ~NMODIFIED;
 	mtx_unlock(&np->n_mtx);
 done:
 	if (bvec != NULL && bvec != bvec_on_stack)
 		free(bvec, M_TEMP);
 	return (error);
 }
 
 /*
  * NFS advisory byte-level locks.
  */
 static int
 nfs_advlock(struct vop_advlock_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	u_quad_t size;
 	int error;
 
 	error = vn_lock(vp, LK_SHARED);
 	if (error)
 		return (error);
 	if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
 		size = VTONFS(vp)->n_size;
 		VOP_UNLOCK(vp, 0);
 		error = lf_advlock(ap, &(vp->v_lockf), size);
 	} else {
 		if (nfs_advlock_p)
 			error = nfs_advlock_p(ap);
 		else
 			error = ENOLCK;
 	}
 
 	return (error);
 }
 
 /*
  * NFS advisory byte-level locks (asynchronous variant).
  */
 static int
 nfs_advlockasync(struct vop_advlockasync_args *ap)
 {
 	struct vnode *vp;
 	u_quad_t size;
 	int error;
 
 	vp = ap->a_vp;
 	error = vn_lock(vp, LK_SHARED);
 	if (error != 0)
 		return (error);
 
 	/* Only mounts flagged NFSMNT_NOLOCKD support this operation. */
 	if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) == 0) {
 		VOP_UNLOCK(vp, 0);
 		return (EOPNOTSUPP);
 	}
 
 	/* Read the cached size while locked, then hand off to lf_advlockasync(). */
 	size = VTONFS(vp)->n_size;
 	VOP_UNLOCK(vp, 0);
 	error = lf_advlockasync(ap, &(vp->v_lockf), size);
 	return (error);
 }
 
 /*
  * Print out the contents of an nfsnode.
  */
 static int
 nfs_print(struct vop_print_args *ap)
 {
 	struct vnode *vp;
 	struct nfsnode *np;
 
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 
 	/* File id and filesystem id from the cached attributes. */
 	nfs_printf("\tfileid %ld fsid 0x%x",
 	    np->n_vattr.va_fileid, np->n_vattr.va_fsid);
 	/* Fifos append their own details before the newline. */
 	if (vp->v_type == VFIFO) {
 		fifo_printinfo(vp);
 	}
 	printf("\n");
 
 	return (0);
 }
 
 /*
  * This is the "real" nfs::bwrite(struct buf*).
  * We set B_CACHE if this is a VMIO buffer.
  */
 int
 nfs_writebp(struct buf *bp, int force __unused, struct thread *td)
 {
 	/*
 	 * Write out one held buffer through bstrategy().
 	 *
 	 * bp    - buffer to write; must be held by the caller.
 	 * force - unused.
 	 * td    - not referenced in this body.
 	 *
 	 * Returns 0 for B_INVAL buffers (which are just released) and for
 	 * asynchronous writes; for synchronous writes returns the result of
 	 * bufwait().
 	 */
 	int s;
 	/* Snapshot of the flags on entry; decides sync vs. async below. */
 	int oldflags = bp->b_flags;
 #if 0
 	int retv = 1;
 	off_t off;
 #endif
 
 	BUF_ASSERT_HELD(bp);
 
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return(0);
 	}
 
 	/* NB: B_CACHE is set unconditionally here. */
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * Undirty the bp.  We will redirty it later if the I/O fails.
 	 */
 
 	s = splbio();
 	bundirty(bp);
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_iocmd = BIO_WRITE;
 
 	/* Account for the write in flight on the bufobj and in rusage. */
 	bufobj_wref(bp->b_bufobj);
 	curthread->td_ru.ru_oublock++;
 	splx(s);
 
 	/*
 	 * Note: to avoid loopback deadlocks, we do not
 	 * assign b_runningbufspace.
 	 */
 	vfs_busy_pages(bp, 1);
 
 	BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 
 	/* Synchronous write: wait for completion and release the buffer. */
 	if( (oldflags & B_ASYNC) == 0) {
 		int rtval = bufwait(bp);
 
 		if (oldflags & B_DELWRI) {
 			s = splbio();
 			reassignbuf(bp);
 			splx(s);
 		}
 		brelse(bp);
 		return (rtval);
 	}
 
 	return (0);
 }
 
 /*
  * nfs special file access vnode op.
  * Essentially just get vattr and then imitate iaccess() since the device is
  * local to the client.
  */
 static int
 nfsspec_access(struct vop_access_args *ap)
 {
 	struct vattr *vap;
 	struct ucred *cred = ap->a_cred;
 	struct vnode *vp = ap->a_vp;
 	accmode_t accmode = ap->a_accmode;
 	struct vattr vattr;
 	int error;
 
 	/*
 	 * Disallow write attempts on filesystems mounted read-only;
 	 * unless the file is a socket, fifo, or a block or character
 	 * device resident on the filesystem.
 	 */
 	if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 	vap = &vattr;
 	error = VOP_GETATTR(vp, vap, cred);
 	if (error)
 		goto out;
 	error  = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
 			 accmode, cred, NULL);
 out:
 	return error;
 }
 
 /*
  * Read wrapper for fifos.
  */
 static int
 nfsfifo_read(struct vop_read_args *ap)
 {
 	struct nfsnode *np;
 
 	np = VTONFS(ap->a_vp);
 
 	/* Flag the node as accessed and stamp the access time. */
 	mtx_lock(&np->n_mtx);
 	np->n_flag |= NACC;
 	vfs_timestamp(&np->n_atim);
 	mtx_unlock(&np->n_mtx);
 
 	/* The read itself is done by the generic fifo code. */
 	return (fifo_specops.vop_read(ap));
 }
 
 /*
  * Write wrapper for fifos.
  */
 static int
 nfsfifo_write(struct vop_write_args *ap)
 {
 	struct nfsnode *np = VTONFS(ap->a_vp);
 
 	/*
 	 * Set update flag.
 	 */
 	mtx_lock(&np->n_mtx);
 	np->n_flag |= NUPD;
 	vfs_timestamp(&np->n_mtim);
 	mtx_unlock(&np->n_mtx);
 	return(fifo_specops.vop_write(ap));
 }
 
 /*
  * Close wrapper for fifos.
  *
  * Update the times on the nfsnode then do fifo close.
  */
 static int
 nfsfifo_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr vattr;
 	struct timespec ts;
 
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & (NACC | NUPD)) {
 		vfs_timestamp(&ts);
 		if (np->n_flag & NACC)
 			np->n_atim = ts;
 		if (np->n_flag & NUPD)
 			np->n_mtim = ts;
 		np->n_flag |= NCHG;
 		if (vrefcnt(vp) == 1 &&
 		    (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 			VATTR_NULL(&vattr);
 			if (np->n_flag & NACC)
 				vattr.va_atime = np->n_atim;
 			if (np->n_flag & NUPD)
 				vattr.va_mtime = np->n_mtim;
 			mtx_unlock(&np->n_mtx);
 			(void)VOP_SETATTR(vp, &vattr, ap->a_cred);
 			goto out;
 		}
 	}
 	mtx_unlock(&np->n_mtx);
 out:
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Just call nfs_writebp() with the force argument set to 1.
  *
  * NOTE: B_DONE may or may not be set in a_bp on call.
  */
 static int
 nfs_bwrite(struct buf *bp)
 {
 
 	return (nfs_writebp(bp, 1, curthread));
 }
 
 /*
  * Buffer operations vector for NFS.  Only the write hook is NFS specific
  * (nfs_bwrite); the remaining entries point at the buf* routines named
  * below.
  */
 struct buf_ops buf_ops_nfs = {
 	.bop_name	=	"buf_ops_nfs",
 	.bop_write	=	nfs_bwrite,	/* ends up in nfs_writebp() */
 	.bop_strategy	=	bufstrategy,
 	.bop_sync	=	bufsync,
 	.bop_bdflush	=	bufbdflush,
 };
Index: head/sys/opencrypto/crypto.c
===================================================================
--- head/sys/opencrypto/crypto.c	(revision 258540)
+++ head/sys/opencrypto/crypto.c	(revision 258541)
@@ -1,1576 +1,1575 @@
 /*-
  * Copyright (c) 2002-2006 Sam Leffler.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Cryptographic Subsystem.
  *
  * This code is derived from the OpenBSD Cryptographic Framework (OCF)
  * that has the copyright shown below.  Very little of the original
  * code remains.
  */
 
 /*-
  * The author of this code is Angelos D. Keromytis (angelos@cis.upenn.edu)
  *
  * This code was written by Angelos D. Keromytis in Athens, Greece, in
  * February 2000. Network Security Technologies Inc. (NSTI) kindly
  * supported the development of this code.
  *
  * Copyright (c) 2000, 2001 Angelos D. Keromytis
  *
  * Permission to use, copy, and modify this software with or without fee
  * is hereby granted, provided that this entire notice is included in
  * all source code copies of any software which is or includes a copy or
  * modification of this software.
  *
  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
  * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
  * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
  * PURPOSE.
  */
 
 #define	CRYPTO_TIMING				/* enable timing support */
 
 #include "opt_ddb.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 
 #include <ddb/ddb.h>
 
 #include <vm/uma.h>
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/xform.h>			/* XXX for M_XDATA */
 
 #include <sys/kobj.h>
 #include <sys/bus.h>
 #include "cryptodev_if.h"
 
 #if defined(__i386__) || defined(__amd64__)
 #include <machine/pcb.h>
 #endif
 
 SDT_PROVIDER_DEFINE(opencrypto);
 
 /*
  * Crypto drivers register themselves by allocating a slot in the
  * crypto_drivers table with crypto_get_driverid() and then registering
  * each algorithm they support with crypto_register() and crypto_kregister().
  */
 static	struct mtx crypto_drivers_mtx;		/* lock on driver table */
 #define	CRYPTO_DRIVER_LOCK()	mtx_lock(&crypto_drivers_mtx)
 #define	CRYPTO_DRIVER_UNLOCK()	mtx_unlock(&crypto_drivers_mtx)
 #define	CRYPTO_DRIVER_ASSERT()	mtx_assert(&crypto_drivers_mtx, MA_OWNED)
 
 /*
  * Crypto device/driver capabilities structure.
  *
  * Synchronization:
  * (d) - protected by CRYPTO_DRIVER_LOCK()
  * (q) - protected by CRYPTO_Q_LOCK()
  * Not tagged fields are read-only.
  */
 struct cryptocap {
 	device_t	cc_dev;			/* (d) device/driver */
 	u_int32_t	cc_sessions;		/* (d) # of sessions */
 	u_int32_t	cc_koperations;		/* (d) # of asym operations */
 	/*
 	 * Largest possible operator length (in bits) for each type of
 	 * encryption algorithm. XXX not used
 	 */
 	u_int16_t	cc_max_op_len[CRYPTO_ALGORITHM_MAX + 1];
 	/* Per-algorithm flags, indexed by algorithm id; 0 == unsupported. */
 	u_int8_t	cc_alg[CRYPTO_ALGORITHM_MAX + 1];
 	/* Per-key-algorithm flags, indexed by key algorithm id. */
 	u_int8_t	cc_kalg[CRK_ALGORITHM_MAX + 1];
 
 	int		cc_flags;		/* (d) flags */
 #define CRYPTOCAP_F_CLEANUP	0x80000000	/* needs resource cleanup */
 	int		cc_qblocked;		/* (q) symmetric q blocked */
 	int		cc_kqblocked;		/* (q) asymmetric q blocked */
 };
 static	struct cryptocap *crypto_drivers = NULL;
 static	int crypto_drivers_num = 0;
 
 /*
  * There are two queues for crypto requests; one for symmetric (e.g.
  * cipher) operations and one for asymmetric (e.g. MOD) operations.
  * A single mutex is used to lock access to both queues.  We could
  * have one per-queue but having one simplifies handling of block/unblock
  * operations.
  */
 static	int crp_sleep = 0;
 static	TAILQ_HEAD(,cryptop) crp_q;		/* request queues */
 static	TAILQ_HEAD(,cryptkop) crp_kq;
 static	struct mtx crypto_q_mtx;
 #define	CRYPTO_Q_LOCK()		mtx_lock(&crypto_q_mtx)
 #define	CRYPTO_Q_UNLOCK()	mtx_unlock(&crypto_q_mtx)
 
 /*
  * There are two queues for processing completed crypto requests; one
  * for the symmetric and one for the asymmetric ops.  We only need one
  * but have two to avoid type futzing (cryptop vs. cryptkop).  A single
  * mutex is used to lock access to both queues.  Note that this lock
  * must be separate from the lock on request queues to ensure driver
  * callbacks don't generate lock order reversals.
  */
 static	TAILQ_HEAD(,cryptop) crp_ret_q;		/* callback queues */
 static	TAILQ_HEAD(,cryptkop) crp_ret_kq;
 static	struct mtx crypto_ret_q_mtx;
 #define	CRYPTO_RETQ_LOCK()	mtx_lock(&crypto_ret_q_mtx)
 #define	CRYPTO_RETQ_UNLOCK()	mtx_unlock(&crypto_ret_q_mtx)
 #define	CRYPTO_RETQ_EMPTY()	(TAILQ_EMPTY(&crp_ret_q) && TAILQ_EMPTY(&crp_ret_kq))
 
 static	uma_zone_t cryptop_zone;
 static	uma_zone_t cryptodesc_zone;
 
 int	crypto_userasymcrypto = 1;	/* userland may do asym crypto reqs */
 SYSCTL_INT(_kern, OID_AUTO, userasymcrypto, CTLFLAG_RW,
 	   &crypto_userasymcrypto, 0,
 	   "Enable/disable user-mode access to asymmetric crypto support");
 int	crypto_devallowsoft = 0;	/* only use hardware crypto for asym */
 SYSCTL_INT(_kern, OID_AUTO, cryptodevallowsoft, CTLFLAG_RW,
 	   &crypto_devallowsoft, 0,
 	   "Enable/disable use of software asym crypto support");
 
 MALLOC_DEFINE(M_CRYPTO_DATA, "crypto", "crypto session records");
 
 static	void crypto_proc(void);
 static	struct proc *cryptoproc;
 static	void crypto_ret_proc(void);
 static	struct proc *cryptoretproc;
 static	void crypto_destroy(void);
 static	int crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint);
 static	int crypto_kinvoke(struct cryptkop *krp, int flags);
 
 static	struct cryptostats cryptostats;
 SYSCTL_STRUCT(_kern, OID_AUTO, crypto_stats, CTLFLAG_RW, &cryptostats,
 	    cryptostats, "Crypto system statistics");
 
 #ifdef CRYPTO_TIMING
 static	int crypto_timing = 0;
 SYSCTL_INT(_debug, OID_AUTO, crypto_timing, CTLFLAG_RW,
 	   &crypto_timing, 0, "Enable/disable crypto timing support");
 #endif
 
 /*
  * Set up the crypto framework: locks, request and return queues, the
  * UMA zones for cryptop/cryptodesc, the initial driver table and the two
  * worker processes ("crypto" and "crypto returns").
  *
  * Returns 0 on success.  On any failure the partially built state is torn
  * down with crypto_destroy() and an errno value is returned (ENOMEM for
  * allocation failures, otherwise the kproc_create() error).
  */
 static int
 crypto_init(void)
 {
 	int error;
 
 	mtx_init(&crypto_drivers_mtx, "crypto", "crypto driver table",
 		MTX_DEF|MTX_QUIET);
 
 	TAILQ_INIT(&crp_q);
 	TAILQ_INIT(&crp_kq);
 	mtx_init(&crypto_q_mtx, "crypto", "crypto op queues", MTX_DEF);
 
 	TAILQ_INIT(&crp_ret_q);
 	TAILQ_INIT(&crp_ret_kq);
 	mtx_init(&crypto_ret_q_mtx, "crypto", "crypto return queues", MTX_DEF);
 
 	cryptop_zone = uma_zcreate("cryptop", sizeof (struct cryptop),
 				    0, 0, 0, 0,
 				    UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cryptodesc_zone = uma_zcreate("cryptodesc", sizeof (struct cryptodesc),
 				    0, 0, 0, 0,
 				    UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	if (cryptodesc_zone == NULL || cryptop_zone == NULL) {
 		printf("crypto_init: cannot setup crypto zones\n");
 		error = ENOMEM;
 		goto bad;
 	}
 
 	/* Start with CRYPTO_DRIVERS_INITIAL zeroed driver slots. */
 	crypto_drivers_num = CRYPTO_DRIVERS_INITIAL;
 	crypto_drivers = malloc(crypto_drivers_num *
 	    sizeof(struct cryptocap), M_CRYPTO_DATA, M_NOWAIT | M_ZERO);
 	if (crypto_drivers == NULL) {
 		printf("crypto_init: cannot setup crypto drivers\n");
 		error = ENOMEM;
 		goto bad;
 	}
 
 	error = kproc_create((void (*)(void *)) crypto_proc, NULL,
 		    &cryptoproc, 0, 0, "crypto");
 	if (error) {
 		/* Terminate the message so it does not run into the next one. */
 		printf("crypto_init: cannot start crypto thread; error %d\n",
 			error);
 		goto bad;
 	}
 
 	error = kproc_create((void (*)(void *)) crypto_ret_proc, NULL,
 		    &cryptoretproc, 0, 0, "crypto returns");
 	if (error) {
 		printf("crypto_init: cannot start cryptoret thread; error %d\n",
 			error);
 		goto bad;
 	}
 	return 0;
 bad:
 	crypto_destroy();
 	return error;
 }
 
 /*
  * Signal a crypto thread to terminate.  We use the driver
  * table lock to synchronize the sleep/wakeups so that we
  * are sure the threads have terminated before we release
  * the data structures they use.  See crypto_finis below
  * for the other half of this song-and-dance.
  */
 static void
 crypto_terminate(struct proc **pp, void *q)
 {
 	/*
 	 * pp - address of the worker's proc pointer; cleared here.
 	 * q  - wait channel passed to wakeup_one().
 	 *
 	 * Must be called with the driver table lock held; it is dropped
 	 * while sleeping and re-acquired before returning.
 	 */
 	struct proc *p;
 
 	mtx_assert(&crypto_drivers_mtx, MA_OWNED);
 	p = *pp;
 	*pp = NULL;
 	if (p) {
 		wakeup_one(q);
 		PROC_LOCK(p);		/* NB: insure we don't miss wakeup */
 		CRYPTO_DRIVER_UNLOCK();	/* let crypto_finis progress */
 		/* Sleep on the proc itself until woken. */
 		msleep(p, &p->p_mtx, PWAIT, "crypto_destroy", 0);
 		PROC_UNLOCK(p);
 		CRYPTO_DRIVER_LOCK();
 	}
 }
 
 /*
  * Tear down everything crypto_init() set up: stop the worker processes,
  * free the driver table, destroy the UMA zones and finally the mutexes.
  * Each resource is checked for NULL first, so this can run on a partially
  * initialized state.
  */
 static void
 crypto_destroy(void)
 {
 	/*
 	 * Terminate any crypto threads.
 	 */
 	CRYPTO_DRIVER_LOCK();
 	crypto_terminate(&cryptoproc, &crp_q);
 	crypto_terminate(&cryptoretproc, &crp_ret_q);
 	CRYPTO_DRIVER_UNLOCK();
 
 	/* XXX flush queues??? */
 
 	/* 
 	 * Reclaim dynamically allocated resources.
 	 */
 	if (crypto_drivers != NULL)
 		free(crypto_drivers, M_CRYPTO_DATA);
 
 	if (cryptodesc_zone != NULL)
 		uma_zdestroy(cryptodesc_zone);
 	if (cryptop_zone != NULL)
 		uma_zdestroy(cryptop_zone);
 	/* The locks go last, after everything they protect is gone. */
 	mtx_destroy(&crypto_q_mtx);
 	mtx_destroy(&crypto_ret_q_mtx);
 	mtx_destroy(&crypto_drivers_mtx);
 }
 
 /*
  * Map a driver id onto its slot in the driver table.  Returns NULL when
  * the table has not been allocated or the id is past the end of it.
  */
 static struct cryptocap *
 crypto_checkdriver(u_int32_t hid)
 {
 	if (crypto_drivers == NULL || hid >= crypto_drivers_num)
 		return (NULL);
 	return (&crypto_drivers[hid]);
 }
 
 /*
  * Compare a driver's list of supported algorithms against another
  * list; return non-zero if all algorithms are supported.
  */
 static int
 driver_suitable(const struct cryptocap *cap, const struct cryptoini *cri)
 {
 	const struct cryptoini *cr;
 
 	/* See if all the algorithms are supported. */
 	for (cr = cri; cr; cr = cr->cri_next)
 		if (cap->cc_alg[cr->cri_alg] == 0)
 			return 0;
 	return 1;
 }
 
 /*
  * Select a driver for a new session that supports the specified
  * algorithms and, optionally, is constrained according to the flags.
  * The algorithm we use here is pretty stupid; just use the
  * first driver that supports all the algorithms we need. If there
  * are multiple drivers we choose the driver with the fewest active
  * sessions.  We prefer hardware-backed drivers to software ones.
  *
  * XXX We need more smarts here (in real life too, but that's
  * XXX another story altogether).
  */
 static struct cryptocap *
 crypto_select_driver(const struct cryptoini *cri, int flags)
 {
 	struct cryptocap *cand, *chosen;
 	int want, idx;
 
 	CRYPTO_DRIVER_ASSERT();
 
 	/*
 	 * Hardware drivers are tried first when the caller allows them;
 	 * otherwise the search starts (and ends) with software drivers.
 	 */
 	want = (flags & CRYPTOCAP_F_HARDWARE) ?
 	    CRYPTOCAP_F_HARDWARE : CRYPTOCAP_F_SOFTWARE;
 	chosen = NULL;
 
 	for (;;) {
 		for (idx = 0; idx < crypto_drivers_num; idx++) {
 			cand = &crypto_drivers[idx];
 
 			/* Skip empty slots and drivers that are going away. */
 			if (cand->cc_dev == NULL)
 				continue;
 			if ((cand->cc_flags & CRYPTOCAP_F_CLEANUP) != 0)
 				continue;
 			/* Skip drivers of the wrong class for this pass. */
 			if ((cand->cc_flags & want) == 0)
 				continue;
 			/* Skip drivers missing any requested algorithm. */
 			if (!driver_suitable(cand, cri))
 				continue;
 
 			/* Keep the candidate with the fewest sessions. */
 			if (chosen == NULL ||
 			    cand->cc_sessions < chosen->cc_sessions)
 				chosen = cand;
 		}
 
 		if (chosen != NULL)
 			break;
 		/* Fall back to software only after a failed hardware pass. */
 		if (want != CRYPTOCAP_F_HARDWARE ||
 		    (flags & CRYPTOCAP_F_SOFTWARE) == 0)
 			break;
 		want = CRYPTOCAP_F_SOFTWARE;
 	}
 	return (chosen);
 }
 
 /*
  * Create a new session.  The crid argument specifies a crypto
  * driver to use or constraints on a driver to select (hardware
  * only, software only, either).  Whatever driver is selected
  * must be capable of the requested crypto algorithms.
  */
 int
 crypto_newsession(u_int64_t *sid, struct cryptoini *cri, int crid)
 {
 	struct cryptocap *cap;
 	u_int32_t hid, lid;
 	u_int64_t id;
 	int err;
 
 	CRYPTO_DRIVER_LOCK();
 
 	if ((crid & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) != 0) {
 		/*
 		 * crid carries class constraints, not a driver id; pick a
 		 * driver.  A NULL result means no single driver can do
 		 * everything that was asked for.
 		 * XXX Fix this. We need to inject a "virtual" session
 		 * XXX layer right about here.
 		 */
 		cap = crypto_select_driver(cri, crid);
 	} else {
 		/* crid names a specific driver; make sure it is capable. */
 		cap = crypto_checkdriver(crid);
 		if (cap != NULL && !driver_suitable(cap, cri))
 			cap = NULL;
 	}
 
 	if (cap == NULL) {
 		CRYPTO_DRIVER_UNLOCK();
 		return (EINVAL);
 	}
 
 	/* Let the driver set up the session; lid starts as the driver id. */
 	hid = cap - crypto_drivers;
 	lid = hid;
 	err = CRYPTODEV_NEWSESSION(cap->cc_dev, &lid, cri);
 	if (err == 0) {
 		/*
 		 * Session id layout: top 8 bits of the driver flags and the
 		 * low 24 bits of the driver id in the upper word, the value
 		 * left in lid by the driver in the lower word.
 		 */
 		id = (cap->cc_flags & 0xff000000) | (hid & 0x00ffffff);
 		id <<= 32;
 		id |= (lid & 0xffffffff);
 		*sid = id;
 		cap->cc_sessions++;
 	}
 
 	CRYPTO_DRIVER_UNLOCK();
 	return (err);
 }
 
 /*
  * Wipe a driver slot once nothing refers to it any more.  The driver
  * table lock must be held.
  */
 static void
 crypto_remove(struct cryptocap *cap)
 {
 
 	CRYPTO_DRIVER_ASSERT();
 	/* Still in use: leave the slot alone. */
 	if (cap->cc_sessions != 0 || cap->cc_koperations != 0)
 		return;
 	bzero(cap, sizeof(*cap));
 }
 
 /*
  * Delete an existing session (or a reserved session on an unregistered
  * driver).
  */
 int
 crypto_freesession(u_int64_t sid)
 {
 	struct cryptocap *cap;
 	u_int32_t hid;
 	int err;
 
 	CRYPTO_DRIVER_LOCK();
 
 	if (crypto_drivers == NULL) {
 		/* No driver table at all. */
 		err = EINVAL;
 	} else {
 		/* The driver id is encoded in the session id. */
 		hid = CRYPTO_SESID2HID(sid);
 		if (hid >= crypto_drivers_num) {
 			err = ENOENT;
 		} else {
 			cap = &crypto_drivers[hid];
 
 			/* Drop the session count, never below zero. */
 			if (cap->cc_sessions != 0)
 				cap->cc_sessions--;
 
 			/* Call the driver cleanup routine, if available. */
 			err = CRYPTODEV_FREESESSION(cap->cc_dev, sid);
 
 			/* A driver pending cleanup may now be removable. */
 			if ((cap->cc_flags & CRYPTOCAP_F_CLEANUP) != 0)
 				crypto_remove(cap);
 		}
 	}
 
 	CRYPTO_DRIVER_UNLOCK();
 	return (err);
 }
 
 /*
  * Return an unused driver id.  Used by drivers prior to registering
  * support for the algorithms they handle.
  */
 int32_t
 crypto_get_driverid(device_t dev, int flags)
 {
 	/*
 	 * dev   - device registering as a crypto driver.
 	 * flags - must include CRYPTOCAP_F_HARDWARE and/or
 	 *         CRYPTOCAP_F_SOFTWARE; stored as the slot's cc_flags.
 	 *
 	 * Returns the index of the slot assigned to dev, or -1 when no
 	 * class flag was given or the driver table could not be grown.
 	 */
 	struct cryptocap *newdrv;
 	int i;
 
 	if ((flags & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) {
 		printf("%s: no flags specified when registering driver\n",
 		    device_get_nameunit(dev));
 		return -1;
 	}
 
 	CRYPTO_DRIVER_LOCK();
 
 	/* Look for a slot that is neither in use nor awaiting cleanup. */
 	for (i = 0; i < crypto_drivers_num; i++) {
 		if (crypto_drivers[i].cc_dev == NULL &&
 		    (crypto_drivers[i].cc_flags & CRYPTOCAP_F_CLEANUP) == 0) {
 			break;
 		}
 	}
 
 	/* Out of entries, allocate some more. */
 	if (i == crypto_drivers_num) {
 		/* Be careful about wrap-around. */
 		/*
 		 * NOTE(review): crypto_drivers_num is a signed int, so this
 		 * test relies on the multiplication wrapping -- confirm that
 		 * is acceptable for the compilers in use.
 		 */
 		if (2 * crypto_drivers_num <= crypto_drivers_num) {
 			CRYPTO_DRIVER_UNLOCK();
 			printf("crypto: driver count wraparound!\n");
 			return -1;
 		}
 
 		/* Double the table; the new half comes back zeroed. */
 		newdrv = malloc(2 * crypto_drivers_num *
 		    sizeof(struct cryptocap), M_CRYPTO_DATA, M_NOWAIT|M_ZERO);
 		if (newdrv == NULL) {
 			CRYPTO_DRIVER_UNLOCK();
 			printf("crypto: no space to expand driver table!\n");
 			return -1;
 		}
 
 		bcopy(crypto_drivers, newdrv,
 		    crypto_drivers_num * sizeof(struct cryptocap));
 
 		crypto_drivers_num *= 2;
 
 		free(crypto_drivers, M_CRYPTO_DATA);
 		crypto_drivers = newdrv;
 	}
 
 	/* NB: state is zero'd on free */
 	crypto_drivers[i].cc_sessions = 1;	/* Mark */
 	crypto_drivers[i].cc_dev = dev;
 	crypto_drivers[i].cc_flags = flags;
 	if (bootverbose)
 		printf("crypto: assign %s driver id %u, flags %u\n",
 		    device_get_nameunit(dev), i, flags);
 
 	CRYPTO_DRIVER_UNLOCK();
 
 	return i;
 }
 
 /*
  * Lookup a driver by name.  We match against the full device
  * name and unit, and against just the name.  The latter gives
  * us a simple wildcarding by device name.  On success return the
  * driver/hardware identifier; otherwise return -1.
  */
 int
 crypto_find_driver(const char *match)
 {
 	/*
 	 * match - prefix to compare against each registered driver's
 	 *         "name+unit" string and its bare name (only the first
 	 *         strlen(match) characters are compared).
 	 *
 	 * Returns the index of the first matching driver, or -1 if none
 	 * matches.  Slots that are empty or awaiting cleanup are ignored.
 	 */
 	int i, len = strlen(match);
 	int found = -1;
 
 	CRYPTO_DRIVER_LOCK();
 	for (i = 0; i < crypto_drivers_num; i++) {
 		device_t dev = crypto_drivers[i].cc_dev;
 		if (dev == NULL ||
 		    (crypto_drivers[i].cc_flags & CRYPTOCAP_F_CLEANUP))
 			continue;
 		if (strncmp(match, device_get_nameunit(dev), len) == 0 ||
 		    strncmp(match, device_get_name(dev), len) == 0) {
 			/*
 			 * Record the hit while the table lock is held, so
 			 * the result does not depend on crypto_drivers_num
 			 * being re-read after the lock is dropped (the
 			 * table can be grown by another registration).
 			 */
 			found = i;
 			break;
 		}
 	}
 	CRYPTO_DRIVER_UNLOCK();
 	return found;
 }
 
 /*
  * Return the device_t for the specified driver or NULL
  * if the driver identifier is invalid.
  */
 device_t
 crypto_find_device_byhid(int hid)
 {
 	struct cryptocap *cap;
 
 	/* An id that does not map to a table slot yields NULL. */
 	cap = crypto_checkdriver(hid);
 	if (cap == NULL)
 		return (NULL);
 	return (cap->cc_dev);
 }
 
 /*
  * Return the device/driver capabilities.
  */
 int
 crypto_getcaps(int hid)
 {
 	struct cryptocap *cap;
 
 	/* An id that does not map to a table slot reports no flags. */
 	cap = crypto_checkdriver(hid);
 	if (cap == NULL)
 		return (0);
 	return (cap->cc_flags);
 }
 
 /*
  * Register support for a key-related algorithm.  This routine
  * is called once for each algorithm supported by a driver.
  */
 int
 crypto_kregister(u_int32_t driverid, int kalg, u_int32_t flags)
 {
 	struct cryptocap *cap;
 	int err;
 
 	CRYPTO_DRIVER_LOCK();
 
 	cap = crypto_checkdriver(driverid);
 	if (cap != NULL &&
 	    (CRK_ALGORITM_MIN <= kalg && kalg <= CRK_ALGORITHM_MAX)) {
 		/*
 		 * XXX Do some performance testing to determine placing.
 		 * XXX We probably need an auxiliary data structure that
 		 * XXX describes relative performances.
 		 */
 
 		cap->cc_kalg[kalg] = flags | CRYPTO_ALG_FLAG_SUPPORTED;
 		if (bootverbose)
 			printf("crypto: %s registers key alg %u flags %u\n"
 				, device_get_nameunit(cap->cc_dev)
 				, kalg
 				, flags
 			);
 		err = 0;
 	} else
 		err = EINVAL;
 
 	CRYPTO_DRIVER_UNLOCK();
 	return err;
 }
 
 /*
  * Register support for a non-key-related algorithm.  A driver calls
  * this once for every such algorithm it supports, passing the flags
  * and maximum operation length to record for it.  Returns 0 on
  * success, or EINVAL when the driver id is not recognized or the
  * algorithm is out of range.
  */
 int
 crypto_register(u_int32_t driverid, int alg, u_int16_t maxoplen,
     u_int32_t flags)
 {
 	struct cryptocap *drv;
 	int rc = EINVAL;
 
 	CRYPTO_DRIVER_LOCK();
 
 	drv = crypto_checkdriver(driverid);
 	if (drv == NULL)
 		goto out;
 	/* NB: algorithms are in the range [1..max] */
 	if (alg < CRYPTO_ALGORITHM_MIN || alg > CRYPTO_ALGORITHM_MAX)
 		goto out;
 
 	/*
 	 * XXX Do some performance testing to determine placing.
 	 * XXX We probably need an auxiliary data structure that
 	 * XXX describes relative performances.
 	 */
 	drv->cc_alg[alg] = flags | CRYPTO_ALG_FLAG_SUPPORTED;
 	drv->cc_max_op_len[alg] = maxoplen;
 	if (bootverbose)
 		printf("crypto: %s registers alg %u flags %u maxoplen %u\n",
 		    device_get_nameunit(drv->cc_dev), alg, flags, maxoplen);
 	drv->cc_sessions = 0;		/* Unmark */
 	rc = 0;
 out:
 	CRYPTO_DRIVER_UNLOCK();
 	return rc;
 }
 
 /*
  * Wipe a driver's table entry.  When sessions or key operations are
  * still outstanding, the counts are preserved and the entry is
  * flagged CRYPTOCAP_F_CLEANUP instead of being left fully zeroed.
  * The driver table lock must be held.
  */
 static void
 driver_finis(struct cryptocap *cap)
 {
 	u_int32_t live_ses, live_kops;
 
 	CRYPTO_DRIVER_ASSERT();
 
 	/* Remember the outstanding work before the entry is cleared. */
 	live_ses = cap->cc_sessions;
 	live_kops = cap->cc_koperations;
 	bzero(cap, sizeof(*cap));
 
 	if (live_ses == 0 && live_kops == 0)
 		return;
 
 	/* Something is still pending: just mark the entry as invalid. */
 	cap->cc_flags |= CRYPTOCAP_F_CLEANUP;
 	cap->cc_sessions = live_ses;
 	cap->cc_koperations = live_kops;
 }
 
 /*
  * Unregister one algorithm of a crypto driver.  Once the driver has
  * no algorithm left it is torn down via driver_finis(); if sessions
  * are still pending, enough state is kept so that later calls using
  * those sessions detect the unregistration and reroute requests.
  * Returns 0 on success, or EINVAL when the driver id is not
  * recognized, the algorithm is out of range, or it was not registered.
  */
 int
 crypto_unregister(u_int32_t driverid, int alg)
 {
 	struct cryptocap *drv;
 	int n, rc = EINVAL;
 
 	CRYPTO_DRIVER_LOCK();
 	drv = crypto_checkdriver(driverid);
 	if (drv == NULL ||
 	    alg < CRYPTO_ALGORITHM_MIN || alg > CRYPTO_ALGORITHM_MAX ||
 	    drv->cc_alg[alg] == 0)
 		goto out;
 
 	drv->cc_alg[alg] = 0;
 	drv->cc_max_op_len[alg] = 0;
 
 	/* Look for any algorithm that is still registered. */
 	n = 1;
 	while (n <= CRYPTO_ALGORITHM_MAX && drv->cc_alg[n] == 0)
 		n++;
 
 	/* Ran off the end: that was the last algorithm. */
 	if (n > CRYPTO_ALGORITHM_MAX)
 		driver_finis(drv);
 	rc = 0;
 out:
 	CRYPTO_DRIVER_UNLOCK();
 
 	return rc;
 }
 
 /*
  * Unregister every algorithm of a crypto driver in one step.
  * If sessions are still pending, driver_finis() keeps enough state
  * around so that later calls using those sessions detect that the
  * driver has been unregistered and reroute requests.
  * Returns 0 on success or EINVAL when the driver id is not recognized.
  */
 int
 crypto_unregister_all(u_int32_t driverid)
 {
 	struct cryptocap *drv;
 	int rc;
 
 	CRYPTO_DRIVER_LOCK();
 	drv = crypto_checkdriver(driverid);
 	rc = (drv != NULL) ? 0 : EINVAL;
 	if (drv != NULL)
 		driver_finis(drv);
 	CRYPTO_DRIVER_UNLOCK();
 
 	return rc;
 }
 
 /*
  * Clear blockage on a driver.  The what parameter is a mask saying
  * whether the driver is now ready for cryptop's (CRYPTO_SYMQ) and/or
  * cryptkop's (CRYPTO_ASYMQ).  The crypto thread is woken if it is
  * sleeping.  Returns 0 on success or EINVAL when the driver id is
  * not recognized.
  */
 int
 crypto_unblock(u_int32_t driverid, int what)
 {
 	struct cryptocap *drv;
 
 	CRYPTO_Q_LOCK();
 	drv = crypto_checkdriver(driverid);
 	if (drv == NULL) {
 		CRYPTO_Q_UNLOCK();
 		return EINVAL;
 	}
 
 	if (what & CRYPTO_SYMQ)
 		drv->cc_qblocked = 0;
 	if (what & CRYPTO_ASYMQ)
 		drv->cc_kqblocked = 0;
 	if (crp_sleep)
 		wakeup_one(&crp_q);
 	CRYPTO_Q_UNLOCK();
 
 	return 0;
 }
 
 /*
  * Submit a crypto request.  Unless the request is flagged
  * CRYPTO_F_BATCH, it is first handed straight to the driver named by
  * its session id (provided that driver is not blocked) and the
  * driver's result is returned.  Batched requests, requests for a
  * blocked driver, and requests the driver answered with ERESTART are
  * appended to the queue processed by the kernel thread, in which
  * case 0 is returned.
  */
 int
 crypto_dispatch(struct cryptop *crp)
 {
 	struct cryptocap *cap;
 	u_int32_t hid;
 	int result;
 
 	cryptostats.cs_ops++;
 
 #ifdef CRYPTO_TIMING
 	if (crypto_timing)
 		binuptime(&crp->crp_tstamp);
 #endif
 
 	hid = CRYPTO_SESID2HID(crp->crp_sid);
 
 	if ((crp->crp_flags & CRYPTO_F_BATCH) == 0) {
 		/*
 		 * Caller marked the request to be processed
 		 * immediately; dispatch it directly to the
 		 * driver unless the driver is currently blocked.
 		 */
 		cap = crypto_checkdriver(hid);
 		/* Driver cannot disappear when there is an active session. */
 		KASSERT(cap != NULL, ("%s: Driver disappeared.", __func__));
 		if (!cap->cc_qblocked) {
 			result = crypto_invoke(cap, crp, 0);
 			if (result != ERESTART)
 				return (result);
 			/*
 			 * The driver ran out of resources, put the request on
 			 * the queue.
 			 */
 		}
 	}
 	/* Queue the request and wake the crypto thread if it is asleep. */
 	CRYPTO_Q_LOCK();
 	TAILQ_INSERT_TAIL(&crp_q, crp, crp_next);
 	if (crp_sleep)
 		wakeup_one(&crp_q);
 	CRYPTO_Q_UNLOCK();
 	return 0;
 }
 
 /*
  * Submit an asymmetric crypto request.  The request is first tried
  * directly via crypto_kinvoke(); only when that reports ERESTART is
  * it appended to the key-op queue for the kernel thread, in which
  * case 0 is returned.  Any other result is returned as is.
  */
 int
 crypto_kdispatch(struct cryptkop *krp)
 {
 	int rc;
 
 	cryptostats.cs_kops++;
 
 	rc = crypto_kinvoke(krp, krp->krp_crid);
 	if (rc != ERESTART)
 		return rc;
 
 	/* Park the request and wake the crypto thread if it is asleep. */
 	CRYPTO_Q_LOCK();
 	TAILQ_INSERT_TAIL(&crp_kq, krp, krp_next);
 	if (crp_sleep)
 		wakeup_one(&crp_q);
 	CRYPTO_Q_UNLOCK();
 	return 0;
 }
 
 /*
  * Tell whether a driver has the key operation requested by krp
  * marked as supported.  Returns 1 if so, 0 otherwise.
  */
 static __inline int
 kdriver_suitable(const struct cryptocap *cap, const struct cryptkop *krp)
 {
 	if (cap->cc_kalg[krp->krp_op] & CRYPTO_ALG_FLAG_SUPPORTED)
 		return 1;
 	return 0;
 }
 
 /*
  * Select a driver for an asym operation.  The driver must support
  * the requested key operation, and the caller constrains which
  * class of device may be picked through the flags parameter.
  * Among the suitable drivers of one class, the one with the fewest
  * active key operations wins (the earliest table entry on a tie).
  * Hardware-backed drivers are preferred: software drivers are only
  * examined when no hardware driver qualified and the flags permit
  * software.  Returns NULL when nothing qualifies.  The driver table
  * lock must be held.
  */
 static struct cryptocap *
 crypto_select_kdriver(const struct cryptkop *krp, int flags)
 {
 	struct cryptocap *cand, *winner;
 	int wanted, idx;
 
 	CRYPTO_DRIVER_ASSERT();
 
 	/* Look first for hardware crypto devices if permitted. */
 	wanted = (flags & CRYPTOCAP_F_HARDWARE) ?
 	    CRYPTOCAP_F_HARDWARE : CRYPTOCAP_F_SOFTWARE;
 	winner = NULL;
 	for (;;) {
 		for (idx = 0; idx < crypto_drivers_num; idx++) {
 			cand = &crypto_drivers[idx];
 
 			/* Skip entries that are not initialized, ... */
 			if (cand->cc_dev == NULL)
 				continue;
 			/* ... in the process of going away, ... */
 			if (cand->cc_flags & CRYPTOCAP_F_CLEANUP)
 				continue;
 			/* ... of the wrong class for this pass, ... */
 			if ((cand->cc_flags & wanted) == 0)
 				continue;
 			/* ... or lacking the requested operation. */
 			if (!kdriver_suitable(cand, krp))
 				continue;
 
 			if (winner == NULL ||
 			    cand->cc_koperations < winner->cc_koperations)
 				winner = cand;
 		}
 		if (winner != NULL)
 			break;
 		/* Make a second, software pass only if that is allowed. */
 		if (wanted != CRYPTOCAP_F_HARDWARE ||
 		    (flags & CRYPTOCAP_F_SOFTWARE) == 0)
 			break;
 		wanted = CRYPTOCAP_F_SOFTWARE;
 	}
 	return winner;
 }
 
 /*
  * Dispatch an asymmetric crypto request.
  *
  * crid either names a specific driver (neither CRYPTOCAP_F_HARDWARE
  * nor CRYPTOCAP_F_SOFTWARE set) or carries those flags to ask for a
  * driver to be selected.  Returns ERESTART only when the driver's
  * KPROCESS method returned ERESTART; in every other case 0 is
  * returned and any error is reported through krp->krp_status and
  * crypto_kdone().
  */
 static int
 crypto_kinvoke(struct cryptkop *krp, int crid)
 {
 	struct cryptocap *cap = NULL;
 	int error;
 
 	KASSERT(krp != NULL, ("%s: krp == NULL", __func__));
 	KASSERT(krp->krp_callback != NULL,
 	    ("%s: krp->crp_callback == NULL", __func__));
 
 	CRYPTO_DRIVER_LOCK();
 	if ((crid & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) {
 		/* A specific driver was requested by identifier. */
 		cap = crypto_checkdriver(crid);
 		if (cap != NULL) {
 			/*
 			 * Driver present, it must support the necessary
 			 * algorithm and, if s/w drivers are excluded,
 			 * it must be registered as hardware-backed.
 			 */
 			if (!kdriver_suitable(cap, krp) ||
 			    (!crypto_devallowsoft &&
 			     (cap->cc_flags & CRYPTOCAP_F_HARDWARE) == 0))
 				cap = NULL;
 		}
 	} else {
 		/*
 		 * No requested driver; select based on crid flags.
 		 */
 		if (!crypto_devallowsoft)	/* NB: disallow s/w drivers */
 			crid &= ~CRYPTOCAP_F_SOFTWARE;
 		cap = crypto_select_kdriver(krp, crid);
 	}
 	if (cap != NULL && !cap->cc_kqblocked) {
 		/* Record the chosen driver's table index in the request. */
 		krp->krp_hid = cap - crypto_drivers;
 		cap->cc_koperations++;
 		/* The driver method is called with the table lock dropped. */
 		CRYPTO_DRIVER_UNLOCK();
 		error = CRYPTODEV_KPROCESS(cap->cc_dev, krp, 0);
 		CRYPTO_DRIVER_LOCK();
 		if (error == ERESTART) {
 			/* Not accepted; undo the accounting and let the caller requeue. */
 			cap->cc_koperations--;
 			CRYPTO_DRIVER_UNLOCK();
 			return (error);
 		}
 	} else {
 		/*
 		 * NB: cap is !NULL if device is blocked; in
 		 *     that case return ERESTART so the operation
 		 *     is resubmitted if possible.
 		 *
 		 * NOTE(review): as written, this ERESTART is not returned
 		 * to the caller; it is stored in krp_status below and the
 		 * function returns 0 -- confirm this is intended.
 		 */
 		error = (cap == NULL) ? ENODEV : ERESTART;
 	}
 	CRYPTO_DRIVER_UNLOCK();
 
 	if (error) {
 		/* Complete the request with the error recorded in it. */
 		krp->krp_status = error;
 		crypto_kdone(krp);
 	}
 	return 0;
 }
 
 #ifdef CRYPTO_TIMING
 static void
 crypto_tstat(struct cryptotstat *ts, struct bintime *bt)
 {
 	struct bintime now, delta;
 	struct timespec t;
 	uint64_t u;
 
 	binuptime(&now);
 	u = now.frac;
 	delta.frac = now.frac - bt->frac;
 	delta.sec = now.sec - bt->sec;
 	if (u < delta.frac)
 		delta.sec--;
 	bintime2timespec(&delta, &t);
 	timespecadd(&ts->acc, &t);
 	if (timespeccmp(&t, &ts->min, <))
 		ts->min = t;
 	if (timespeccmp(&t, &ts->max, >))
 		ts->max = t;
 	ts->count++;
 
 	*bt = now;
 }
 #endif
 
 /*
  * Dispatch a crypto request to the appropriate crypto devices.
  *
  * If the driver is flagged CRYPTOCAP_F_CLEANUP, the request's session
  * is freed, a replacement session is attempted, and the request is
  * completed with crp_etype set to EAGAIN; 0 is returned.  Otherwise
  * the result of the driver's PROCESS method is returned, with hint
  * passed through to it.
  */
 static int
 crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint)
 {
 
 	KASSERT(crp != NULL, ("%s: crp == NULL", __func__));
 	KASSERT(crp->crp_callback != NULL,
 	    ("%s: crp->crp_callback == NULL", __func__));
 	KASSERT(crp->crp_desc != NULL, ("%s: crp->crp_desc == NULL", __func__));
 
 #ifdef CRYPTO_TIMING
 	if (crypto_timing)
 		crypto_tstat(&cryptostats.cs_invoke, &crp->crp_tstamp);
 #endif
 	if (cap->cc_flags & CRYPTOCAP_F_CLEANUP) {
 		struct cryptodesc *crd;
 		u_int64_t nid;
 
 		/*
 		 * Driver has unregistered; migrate the session and return
 		 * an error to the caller so they'll resubmit the op.
 		 *
 		 * XXX: What if there are more already queued requests for this
 		 *      session?
 		 */
 		crypto_freesession(crp->crp_sid);
 
 		/*
 		 * Chain the per-descriptor init records together so the
 		 * whole list can be handed to crypto_newsession().
 		 */
 		for (crd = crp->crp_desc; crd->crd_next; crd = crd->crd_next)
 			crd->CRD_INI.cri_next = &(crd->crd_next->CRD_INI);
 
 		/* XXX propagate flags from initial session? */
 		/* On failure the request keeps its old session id. */
 		if (crypto_newsession(&nid, &(crp->crp_desc->CRD_INI),
 		    CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE) == 0)
 			crp->crp_sid = nid;
 
 		crp->crp_etype = EAGAIN;
 		crypto_done(crp);
 		return 0;
 	} else {
 		/*
 		 * Invoke the driver to process the request.
 		 */
 		return CRYPTODEV_PROCESS(cap->cc_dev, crp, hint);
 	}
 }
 
 /*
  * Release a crypto request along with every descriptor chained to
  * it.  A NULL request is ignored.
  */
 void
 crypto_freereq(struct cryptop *crp)
 {
 	struct cryptodesc *desc;
 
 	if (crp == NULL)
 		return;
 
 #ifdef DIAGNOSTIC
 	{
 		struct cryptop *queued;
 
 		/* The request must not still sit on the dispatch queue ... */
 		CRYPTO_Q_LOCK();
 		TAILQ_FOREACH(queued, &crp_q, crp_next) {
 			KASSERT(queued != crp,
 			    ("Freeing cryptop from the crypto queue (%p).",
 			    crp));
 		}
 		CRYPTO_Q_UNLOCK();
 		/* ... nor on the return queue. */
 		CRYPTO_RETQ_LOCK();
 		TAILQ_FOREACH(queued, &crp_ret_q, crp_next) {
 			KASSERT(queued != crp,
 			    ("Freeing cryptop from the return queue (%p).",
 			    crp));
 		}
 		CRYPTO_RETQ_UNLOCK();
 	}
 #endif
 
 	/* Pop descriptors off the head of the chain one at a time. */
 	for (desc = crp->crp_desc; desc != NULL; desc = crp->crp_desc) {
 		crp->crp_desc = desc->crd_next;
 		uma_zfree(cryptodesc_zone, desc);
 	}
 	uma_zfree(cryptop_zone, crp);
 }
 
 /*
  * Acquire a crypto request with num zero-filled descriptors chained
  * to it.  Allocation does not sleep (M_NOWAIT).  Returns the request,
  * or NULL if any allocation fails, in which case everything allocated
  * so far is released again.  A num of zero or less yields a request
  * with no descriptors.
  */
 struct cryptop *
 crypto_getreq(int num)
 {
 	struct cryptodesc *crd;
 	struct cryptop *crp;
 
 	crp = uma_zalloc(cryptop_zone, M_NOWAIT|M_ZERO);
 	if (crp == NULL)
 		return NULL;
 
 	/*
 	 * Test for "> 0" rather than non-zero so that a negative count
 	 * cannot turn into a runaway allocation loop.
 	 */
 	while (num-- > 0) {
 		crd = uma_zalloc(cryptodesc_zone, M_NOWAIT|M_ZERO);
 		if (crd == NULL) {
 			crypto_freereq(crp);
 			return NULL;
 		}
 
 		/* Push the new descriptor onto the head of the chain. */
 		crd->crd_next = crp->crp_desc;
 		crp->crp_desc = crd;
 	}
 	return crp;
 }
 
 /*
  * Invoke the callback on behalf of the driver.
  *
  * Marks the request CRYPTO_F_DONE (asserting it was not already
  * done), counts it as an error if crp_etype is set, and then either
  * runs crp_callback directly or appends the request to the return
  * queue.
  */
 void
 crypto_done(struct cryptop *crp)
 {
 	KASSERT((crp->crp_flags & CRYPTO_F_DONE) == 0,
 		("crypto_done: op already done, flags 0x%x", crp->crp_flags));
 	crp->crp_flags |= CRYPTO_F_DONE;
 	if (crp->crp_etype != 0)
 		cryptostats.cs_errs++;
 #ifdef CRYPTO_TIMING
 	if (crypto_timing)
 		crypto_tstat(&cryptostats.cs_done, &crp->crp_tstamp);
 #endif
 	/*
 	 * CBIMM means unconditionally do the callback immediately;
 	 * CBIFSYNC means do the callback immediately only if the
 	 * operation was done synchronously.  Both are used to avoid
 	 * doing extraneous context switches; the latter is mostly
 	 * used with the software crypto driver.
 	 */
 	if ((crp->crp_flags & CRYPTO_F_CBIMM) ||
 	    ((crp->crp_flags & CRYPTO_F_CBIFSYNC) &&
 	     (CRYPTO_SESID2CAPS(crp->crp_sid) & CRYPTOCAP_F_SYNC))) {
 		/*
 		 * Do the callback directly.  This is ok when the
 		 * callback routine does very little (e.g. the
 		 * /dev/crypto callback method just does a wakeup).
 		 */
 #ifdef CRYPTO_TIMING
 		if (crypto_timing) {
 			/*
 			 * NB: We must copy the timestamp before
 			 * doing the callback as the cryptop is
 			 * likely to be reclaimed.
 			 */
 			struct bintime t = crp->crp_tstamp;
 			crypto_tstat(&cryptostats.cs_cb, &t);
 			crp->crp_callback(crp);
 			crypto_tstat(&cryptostats.cs_finis, &t);
 		} else
 #endif
 			crp->crp_callback(crp);
 	} else {
 		/*
 		 * Normal case; queue the callback for the thread.
 		 * The wakeup is only issued when the return queues were
 		 * empty beforehand.
 		 */
 		CRYPTO_RETQ_LOCK();
 		if (CRYPTO_RETQ_EMPTY())
 			wakeup_one(&crp_ret_q);	/* shared wait channel */
 		TAILQ_INSERT_TAIL(&crp_ret_q, crp, crp_next);
 		CRYPTO_RETQ_UNLOCK();
 	}
 }
 
 /*
  * Invoke the callback on behalf of the driver (key operations).
  *
  * Counts the request as an error if krp_status is set, drops the
  * in-flight key operation count of the driver recorded in krp_hid,
  * and appends the request to the key return queue; the callback
  * itself is run later by the return thread.
  */
 void
 crypto_kdone(struct cryptkop *krp)
 {
 	struct cryptocap *cap;
 
 	if (krp->krp_status != 0)
 		cryptostats.cs_kerrs++;
 	CRYPTO_DRIVER_LOCK();
 	/* XXX: What if driver is loaded in the meantime? */
 	/* krp_hid values beyond the table are simply not accounted. */
 	if (krp->krp_hid < crypto_drivers_num) {
 		cap = &crypto_drivers[krp->krp_hid];
 		cap->cc_koperations--;
 		KASSERT(cap->cc_koperations >= 0, ("cc_koperations < 0"));
 		/* Driver was unregistered while this operation was in flight. */
 		if (cap->cc_flags & CRYPTOCAP_F_CLEANUP)
 			crypto_remove(cap);
 	}
 	CRYPTO_DRIVER_UNLOCK();
 	CRYPTO_RETQ_LOCK();
 	if (CRYPTO_RETQ_EMPTY())
 		wakeup_one(&crp_ret_q);		/* shared wait channel */
 	TAILQ_INSERT_TAIL(&crp_ret_kq, krp, krp_next);
 	CRYPTO_RETQ_UNLOCK();
 }
 
 /*
  * Report which key-related algorithms are available.  On return
  * *featp holds a mask with bit (1 << kalg) set for every key
  * algorithm that at least one driver has registered.  Software
  * drivers are left out when crypto_devallowsoft is zero.
  * Always returns 0.
  */
 int
 crypto_getfeat(int *featp)
 {
 	int hid, kalg, feat = 0;
 
 	CRYPTO_DRIVER_LOCK();
 	for (hid = 0; hid < crypto_drivers_num; hid++) {
 		const struct cryptocap *cap = &crypto_drivers[hid];
 
 		if ((cap->cc_flags & CRYPTOCAP_F_SOFTWARE) &&
 		    !crypto_devallowsoft) {
 			continue;
 		}
 		/*
 		 * CRK_ALGORITHM_MAX is itself a valid algorithm (see the
 		 * range check in crypto_kregister), so the scan must
 		 * include it.
 		 */
 		for (kalg = 0; kalg <= CRK_ALGORITHM_MAX; kalg++)
 			if (cap->cc_kalg[kalg] & CRYPTO_ALG_FLAG_SUPPORTED)
 				feat |=  1 << kalg;
 	}
 	CRYPTO_DRIVER_UNLOCK();
 	*featp = feat;
 	return (0);
 }
 
 /*
  * Terminate a thread at module unload.  The process that
  * initiated this is waiting for us to signal that we're gone;
  * wake it up and exit.  We use the driver table lock to ensure
  * we don't do the wakeup before they're waiting.  There is no
  * race here because the waiter sleeps on the proc lock for the
  * thread so it gets notified at the right time because of an
  * extra wakeup that's done in exit1().
  *
  * chan is the wait channel to wake.
  */
 static void
 crypto_finis(void *chan)
 {
 	CRYPTO_DRIVER_LOCK();
 	wakeup_one(chan);
 	CRYPTO_DRIVER_UNLOCK();
 	kproc_exit(0);
 }
 
 /*
  * Crypto thread, dispatches crypto requests.
  *
  * Runs with the queue lock held, except while asleep in msleep().
  * Each pass submits at most one symmetric request from crp_q and at
  * most one key request from crp_kq.  The loop ends, and the thread
  * exits through crypto_finis(), once it wakes up and finds
  * cryptoproc set to NULL.
  */
 static void
 crypto_proc(void)
 {
 	struct cryptop *crp, *submit;
 	struct cryptkop *krp;
 	struct cryptocap *cap;
 	u_int32_t hid;
 	int result, hint;
 
 #if defined(__i386__) || defined(__amd64__)
 	fpu_kern_thread(FPU_KERN_NORMAL);
 #endif
 
 	CRYPTO_Q_LOCK();
 	for (;;) {
 		/*
 		 * Find the first element in the queue that can be
 		 * processed and look-ahead to see if multiple ops
 		 * are ready for the same driver.
 		 */
 		submit = NULL;
 		hint = 0;
 		TAILQ_FOREACH(crp, &crp_q, crp_next) {
 			hid = CRYPTO_SESID2HID(crp->crp_sid);
 			cap = crypto_checkdriver(hid);
 			/*
 			 * Driver cannot disappear when there is an active
 			 * session.
 			 */
 			KASSERT(cap != NULL, ("%s:%u Driver disappeared.",
 			    __func__, __LINE__));
 			if (cap == NULL || cap->cc_dev == NULL) {
 				/* Op needs to be migrated, process it. */
 				if (submit == NULL)
 					submit = crp;
 				break;
 			}
 			if (!cap->cc_qblocked) {
 				if (submit != NULL) {
 					/*
 					 * We stop on finding another op,
 					 * regardless whether its for the same
 					 * driver or not.  We could keep
 					 * searching the queue but it might be
 					 * better to just use a per-driver
 					 * queue instead.
 					 */
 					if (CRYPTO_SESID2HID(submit->crp_sid) == hid)
 						hint = CRYPTO_HINT_MORE;
 					break;
 				} else {
 					submit = crp;
 					if ((submit->crp_flags & CRYPTO_F_BATCH) == 0)
 						break;
 					/* keep scanning for more are q'd */
 				}
 			}
 		}
 		if (submit != NULL) {
 			/* Dequeue first; the request is re-inserted on ERESTART. */
 			TAILQ_REMOVE(&crp_q, submit, crp_next);
 			hid = CRYPTO_SESID2HID(submit->crp_sid);
 			cap = crypto_checkdriver(hid);
 			KASSERT(cap != NULL, ("%s:%u Driver disappeared.",
 			    __func__, __LINE__));
 			result = crypto_invoke(cap, submit, hint);
 			if (result == ERESTART) {
 				/*
 				 * The driver ran out of resources, mark the
 				 * driver ``blocked'' for cryptop's and put
 				 * the request back in the queue.  It would
 				 * best to put the request back where we got
 				 * it but that's hard so for now we put it
 				 * at the front.  This should be ok; putting
 				 * it at the end does not work.
 				 */
 				/* XXX validate sid again? */
 				crypto_drivers[CRYPTO_SESID2HID(submit->crp_sid)].cc_qblocked = 1;
 				TAILQ_INSERT_HEAD(&crp_q, submit, crp_next);
 				cryptostats.cs_blocks++;
 			}
 		}
 
 		/* As above, but for key ops */
 		TAILQ_FOREACH(krp, &crp_kq, krp_next) {
 			cap = crypto_checkdriver(krp->krp_hid);
 			if (cap == NULL || cap->cc_dev == NULL) {
 				/*
 				 * Operation needs to be migrated, invalidate
 				 * the assigned device so it will reselect a
 				 * new one below.  Propagate the original
 				 * crid selection flags if supplied.
 				 */
 				krp->krp_hid = krp->krp_crid &
 				    (CRYPTOCAP_F_SOFTWARE|CRYPTOCAP_F_HARDWARE);
 				if (krp->krp_hid == 0)
 					krp->krp_hid =
 				    CRYPTOCAP_F_SOFTWARE|CRYPTOCAP_F_HARDWARE;
 				break;
 			}
 			if (!cap->cc_kqblocked)
 				break;
 		}
 		/* krp is NULL here if the scan above ran off the end of the queue. */
 		if (krp != NULL) {
 			TAILQ_REMOVE(&crp_kq, krp, krp_next);
 			result = crypto_kinvoke(krp, krp->krp_hid);
 			if (result == ERESTART) {
 				/*
 				 * The driver ran out of resources, mark the
 				 * driver ``blocked'' for cryptkop's and put
 				 * the request back in the queue.  It would
 				 * best to put the request back where we got
 				 * it but that's hard so for now we put it
 				 * at the front.  This should be ok; putting
 				 * it at the end does not work.
 				 */
 				/* XXX validate sid again? */
 				crypto_drivers[krp->krp_hid].cc_kqblocked = 1;
 				TAILQ_INSERT_HEAD(&crp_kq, krp, krp_next);
 				cryptostats.cs_kblocks++;
 			}
 		}
 
 		if (submit == NULL && krp == NULL) {
 			/*
 			 * Nothing more to be processed.  Sleep until we're
 			 * woken because there are more ops to process.
 			 * This happens either by submission or by a driver
 			 * becoming unblocked and notifying us through
 			 * crypto_unblock.  Note that when we wakeup we
 			 * start processing each queue again from the
 			 * front. It's not clear that it's important to
 			 * preserve this ordering since ops may finish
 			 * out of order if dispatched to different devices
 			 * and some become blocked while others do not.
 			 */
 			crp_sleep = 1;
 			msleep(&crp_q, &crypto_q_mtx, PWAIT, "crypto_wait", 0);
 			crp_sleep = 0;
 			/* A NULL cryptoproc after wakeup means we should exit. */
 			if (cryptoproc == NULL)
 				break;
 			cryptostats.cs_intrs++;
 		}
 	}
 	CRYPTO_Q_UNLOCK();
 
 	crypto_finis(&crp_q);
 }
 
 /*
  * Crypto returns thread, does callbacks for processed crypto requests.
  * Callbacks are done here, rather than in the crypto drivers, because
  * callbacks typically are expensive and would slow interrupt handling.
  *
  * Each pass takes at most one request from each of the two return
  * queues and runs their callbacks with the return queue lock dropped.
  * The loop ends, and the thread exits through crypto_finis(), once it
  * wakes up and finds cryptoretproc set to NULL.
  */
 static void
 crypto_ret_proc(void)
 {
 	struct cryptop *crpt;
 	struct cryptkop *krpt;
 
 	CRYPTO_RETQ_LOCK();
 	for (;;) {
 		/* Harvest return q's for completed ops */
 		crpt = TAILQ_FIRST(&crp_ret_q);
 		if (crpt != NULL)
 			TAILQ_REMOVE(&crp_ret_q, crpt, crp_next);
 
 		krpt = TAILQ_FIRST(&crp_ret_kq);
 		if (krpt != NULL)
 			TAILQ_REMOVE(&crp_ret_kq, krpt, krp_next);
 
 		if (crpt != NULL || krpt != NULL) {
 			CRYPTO_RETQ_UNLOCK();
 			/*
 			 * Run callbacks unlocked.
 			 */
 			if (crpt != NULL) {
 #ifdef CRYPTO_TIMING
 				if (crypto_timing) {
 					/*
 					 * NB: We must copy the timestamp before
 					 * doing the callback as the cryptop is
 					 * likely to be reclaimed.
 					 */
 					struct bintime t = crpt->crp_tstamp;
 					crypto_tstat(&cryptostats.cs_cb, &t);
 					crpt->crp_callback(crpt);
 					crypto_tstat(&cryptostats.cs_finis, &t);
 				} else
 #endif
 					crpt->crp_callback(crpt);
 			}
 			if (krpt != NULL)
 				krpt->krp_callback(krpt);
 			/* Retake the lock before looking at the queues again. */
 			CRYPTO_RETQ_LOCK();
 		} else {
 			/*
 			 * Nothing more to be processed.  Sleep until we're
 			 * woken because there are more returns to process.
 			 */
 			msleep(&crp_ret_q, &crypto_ret_q_mtx, PWAIT,
 				"crypto_ret_wait", 0);
 			/* A NULL cryptoretproc after wakeup means we should exit. */
 			if (cryptoretproc == NULL)
 				break;
 			cryptostats.cs_rets++;
 		}
 	}
 	CRYPTO_RETQ_UNLOCK();
 
 	crypto_finis(&crp_ret_q);
 }
 
 #ifdef DDB
 /*
  * DDB helper: print one line per driver table entry that has a
  * device attached, preceded by a column header.
  */
 static void
 db_show_drivers(void)
 {
 	const struct cryptocap *drv;
 	int idx;
 
 	db_printf("%12s %4s %4s %8s %2s %2s\n",
 	    "Device", "Ses", "Kops", "Flags", "QB", "KB");
 	for (idx = 0; idx < crypto_drivers_num; idx++) {
 		drv = &crypto_drivers[idx];
 		if (drv->cc_dev != NULL) {
 			db_printf("%-12s %4u %4u %08x %2u %2u\n",
 			    device_get_nameunit(drv->cc_dev),
 			    drv->cc_sessions,
 			    drv->cc_koperations,
 			    drv->cc_flags,
 			    drv->cc_qblocked,
 			    drv->cc_kqblocked);
 		}
 	}
 }
 
 DB_SHOW_COMMAND(crypto, db_show_crypto)
 {
 	struct cryptop *crp;
 
 	db_show_drivers();
 	db_printf("\n");
 
 	db_printf("%4s %8s %4s %4s %4s %4s %8s %8s\n",
 	    "HID", "Caps", "Ilen", "Olen", "Etype", "Flags",
 	    "Desc", "Callback");
 	TAILQ_FOREACH(crp, &crp_q, crp_next) {
 		db_printf("%4u %08x %4u %4u %4u %04x %8p %8p\n"
 		    , (int) CRYPTO_SESID2HID(crp->crp_sid)
 		    , (int) CRYPTO_SESID2CAPS(crp->crp_sid)
 		    , crp->crp_ilen, crp->crp_olen
 		    , crp->crp_etype
 		    , crp->crp_flags
 		    , crp->crp_desc
 		    , crp->crp_callback
 		);
 	}
 	if (!TAILQ_EMPTY(&crp_ret_q)) {
 		db_printf("\n%4s %4s %4s %8s\n",
 		    "HID", "Etype", "Flags", "Callback");
 		TAILQ_FOREACH(crp, &crp_ret_q, crp_next) {
 			db_printf("%4u %4u %04x %8p\n"
 			    , (int) CRYPTO_SESID2HID(crp->crp_sid)
 			    , crp->crp_etype
 			    , crp->crp_flags
 			    , crp->crp_callback
 			);
 		}
 	}
 }
 
 /*
  * DDB "show kcrypto": dump the driver table, then the queued key
  * requests, then (if there are any) the key requests waiting on
  * the key return queue.
  */
 DB_SHOW_COMMAND(kcrypto, db_show_kcrypto)
 {
 	struct cryptkop *krp;
 
 	db_show_drivers();
 	db_printf("\n");
 
 	db_printf("%4s %5s %4s %4s %8s %4s %8s\n",
 	    "Op", "Status", "#IP", "#OP", "CRID", "HID", "Callback");
 	TAILQ_FOREACH(krp, &crp_kq, krp_next) {
 		db_printf("%4u %5u %4u %4u %08x %4u %8p\n"
 		    , krp->krp_op
 		    , krp->krp_status
 		    , krp->krp_iparams, krp->krp_oparams
 		    , krp->krp_crid, krp->krp_hid
 		    , krp->krp_callback
 		);
 	}
 	/*
 	 * Test the key return queue, which is the one walked below,
 	 * rather than the symmetric return queue.
 	 */
 	if (!TAILQ_EMPTY(&crp_ret_kq)) {
 		db_printf("%4s %5s %8s %4s %8s\n",
 		    "Op", "Status", "CRID", "HID", "Callback");
 		TAILQ_FOREACH(krp, &crp_ret_kq, krp_next) {
 			db_printf("%4u %5u %08x %4u %8p\n"
 			    , krp->krp_op
 			    , krp->krp_status
 			    , krp->krp_crid, krp->krp_hid
 			    , krp->krp_callback
 			);
 		}
 	}
 }
 #endif
 
 int crypto_modevent(module_t mod, int type, void *unused);
 
 /*
  * Module event handler, used for both static and dynamic loading.
  * It is not hooked up through the usual MODULE_DECLARE mechanism;
  * the cryptosoft driver lists it as a dependency instead, which
  * guarantees proper ordering of calls on module load/unload.
  *
  * MOD_LOAD returns the result of crypto_init(), MOD_UNLOAD tears
  * the framework down and returns 0, and any other event yields
  * EINVAL.
  */
 int
 crypto_modevent(module_t mod, int type, void *unused)
 {
 	int rc;
 
 	if (type == MOD_LOAD) {
 		rc = crypto_init();
 		if (rc == 0 && bootverbose)
 			printf("crypto: <crypto core>\n");
 		return rc;
 	}
 	if (type == MOD_UNLOAD) {
 		/*XXX disallow if active sessions */
 		crypto_destroy();
 		return 0;
 	}
 	return EINVAL;
 }
 MODULE_VERSION(crypto, 1);
 MODULE_DEPEND(crypto, zlib, 1, 1, 1);
Index: head/sys/opencrypto/deflate.c
===================================================================
--- head/sys/opencrypto/deflate.c	(revision 258540)
+++ head/sys/opencrypto/deflate.c	(revision 258541)
@@ -1,265 +1,263 @@
 /* $OpenBSD: deflate.c,v 1.3 2001/08/20 02:45:22 hugh Exp $ */
 
 /*-
  * Copyright (c) 2001 Jean-Jacques Bernard-Gundol (jj@wabbitt.org)
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *   notice, this list of conditions and the following disclaimer in the
  *   documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *   derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * This file contains a wrapper around the deflate algo compression
  * functions using the zlib library (see net/zlib.{c,h})
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <net/zlib.h>
 
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/deflate.h>
 
 SDT_PROVIDER_DECLARE(opencrypto);
 SDT_PROBE_DEFINE2(opencrypto, deflate, deflate_global, entry, entry,
     "int", "u_int32_t");
 SDT_PROBE_DEFINE5(opencrypto, deflate, deflate_global, bad, bad,
     "int", "int", "int", "int", "int");
 SDT_PROBE_DEFINE5(opencrypto, deflate, deflate_global, iter, iter,
     "int", "int", "int", "int", "int");
 SDT_PROBE_DEFINE2(opencrypto, deflate, deflate_global, return, return,
     "int", "u_int32_t");
 
 int window_inflate = -1 * MAX_WBITS;
 int window_deflate = -12;
 
 /*
  * Compress or decompress a block of data with the deflate algorithm.
  *
  * data/size describe the input.  decomp selects the direction:
  * 0 compresses, non-zero decompresses.  The output is produced into
  * a chain of temporary buffers and then gathered into one freshly
  * allocated M_CRYPTO_DATA buffer stored in *out.
  *
  * Returns the number of output bytes.  On any failure 0 is returned
  * and *out is set to NULL.  All allocations are M_NOWAIT.
  */
 u_int32_t
 deflate_global(data, size, decomp, out)
 	u_int8_t *data;
 	u_int32_t size;
 	int decomp;
 	u_int8_t **out;
 {
 	/* decomp indicates whether we compress (0) or decompress (1) */
 
 	z_stream zbuf;
 	u_int8_t *output;
 	u_int32_t count, result;
 	int error, i;
 	struct deflate_buf *bufh, *bufp;
 
 	SDT_PROBE2(opencrypto, deflate, deflate_global, entry, decomp, size);
 
 	bufh = bufp = NULL;
 	if (!decomp) {
 		i = 1;
 	} else {
 		/*
 		 * Choose a buffer with 4x the size of the input buffer
 		 * for the size of the output buffer in the case of
 		 * decompression. If it's not sufficient, it will need to be
 		 * updated while the decompression is going on.
 		 */
 		i = 4;
 	}
 	/*
 	 * Make sure we do have enough output space.  Repeated calls to
 	 * deflate need at least 6 bytes of output buffer space to avoid
 	 * repeated markers.  We will always provide at least 16 bytes.
 	 */
 	while ((size * i) < 16)
 		i++;
 
 	bufh = bufp = malloc(sizeof(*bufp) + (size_t)(size * i),
 	    M_CRYPTO_DATA, M_NOWAIT);
 	if (bufp == NULL) {
 		SDT_PROBE3(opencrypto, deflate, deflate_global, bad,
 		    decomp, 0, __LINE__);
 		goto bad2;
 	}
 	bufp->next = NULL;
 	bufp->size = size * i;
 
 	bzero(&zbuf, sizeof(z_stream));
 	zbuf.zalloc = z_alloc;
 	zbuf.zfree = z_free;
 	zbuf.opaque = Z_NULL;
 	zbuf.next_in = data;	/* Data that is going to be processed. */
 	zbuf.avail_in = size;	/* Total length of data to be processed. */
 	zbuf.next_out = bufp->data;
 	zbuf.avail_out = bufp->size;
 
 	error = decomp ? inflateInit2(&zbuf, window_inflate) :
 	    deflateInit2(&zbuf, Z_DEFAULT_COMPRESSION, Z_METHOD,
 		    window_deflate, Z_MEMLEVEL, Z_DEFAULT_STRATEGY);
 	if (error != Z_OK) {
 		SDT_PROBE3(opencrypto, deflate, deflate_global, bad,
 		    decomp, error, __LINE__);
 		goto bad;
 	}
 
 	for (;;) {
 		error = decomp ? inflate(&zbuf, Z_SYNC_FLUSH) :
 				 deflate(&zbuf, Z_FINISH);
 		if (error != Z_OK && error != Z_STREAM_END) {
 			/*
 			 * Unfortunately we are limited to 5 arguments,
 			 * thus use two probes.
 			 */
 			SDT_PROBE5(opencrypto, deflate, deflate_global, bad,
 			    decomp, error, __LINE__,
 			    zbuf.avail_in, zbuf.avail_out);
 			SDT_PROBE5(opencrypto, deflate, deflate_global, bad,
 			    decomp, error, __LINE__,
 			    zbuf.state->dummy, zbuf.total_out);
 			goto bad;
 		}
 		SDT_PROBE5(opencrypto, deflate, deflate_global, iter,
 		    decomp, error, __LINE__,
 		    zbuf.avail_in, zbuf.avail_out);
 		SDT_PROBE5(opencrypto, deflate, deflate_global, iter,
 		    decomp, error, __LINE__,
 		    zbuf.state->dummy, zbuf.total_out);
 		if (decomp && zbuf.avail_in == 0 && error == Z_STREAM_END) {
 			/* Done. */
 			break;
 		} else if (!decomp && error == Z_STREAM_END) {
 			/* Done. */
 			break;
 		} else if (zbuf.avail_out == 0) {
 			struct deflate_buf *p;
 
 			/* We need more output space for another iteration. */
 			p = malloc(sizeof(*p) + (size_t)(size * i),
 			    M_CRYPTO_DATA, M_NOWAIT);
 			if (p == NULL) {
 				SDT_PROBE3(opencrypto, deflate, deflate_global,
 				    bad, decomp, 0, __LINE__);
 				goto bad;
 			}
 			p->next = NULL;
 			p->size = size * i;
 			bufp->next = p;
 			bufp = p;
 			zbuf.next_out = bufp->data;
 			zbuf.avail_out = bufp->size;
 		} else {
 			/* Unexpected result. */
 			/*
 			 * Unfortunately we are limited to 5 arguments,
 			 * thus, again, use two probes.
 			 */
 			SDT_PROBE5(opencrypto, deflate, deflate_global, bad,
 			    decomp, error, __LINE__,
 			    zbuf.avail_in, zbuf.avail_out);
 			SDT_PROBE5(opencrypto, deflate, deflate_global, bad,
 			    decomp, error, __LINE__,
 			    zbuf.state->dummy, zbuf.total_out);
 			goto bad;
 		}
 	}
 
 	result = count = zbuf.total_out;
 
 	*out = malloc(result, M_CRYPTO_DATA, M_NOWAIT);
 	if (*out == NULL) {
 		SDT_PROBE3(opencrypto, deflate, deflate_global, bad,
 		    decomp, 0, __LINE__);
 		goto bad;
 	}
 	if (decomp)
 		inflateEnd(&zbuf);
 	else
 		deflateEnd(&zbuf);
 	output = *out;
 	/*
 	 * Gather the chained buffers into the result and release them.
 	 * Every buffer in the chain is visited and freed, including any
 	 * that follow the point where count reaches zero (e.g. a spare
 	 * buffer that was allocated just before the stream ended), so
 	 * none of them is leaked.
 	 */
 	for (bufp = bufh; bufp != NULL; ) {
 		struct deflate_buf *p;
 		u_int32_t n;
 
 		/* This buffer contributes at most what is still owed. */
 		n = count > bufp->size ? bufp->size : count;
 		bcopy(bufp->data, *out, n);
 		*out += n;
 		count -= n;
 		p = bufp;
 		bufp = bufp->next;
 		free(p, M_CRYPTO_DATA);
 	}
 	*out = output;
 	SDT_PROBE2(opencrypto, deflate, deflate_global, return, decomp, result);
 	return result;
 
 bad:
 	/* Tear down the stream and drop every temporary buffer. */
 	if (decomp)
 		inflateEnd(&zbuf);
 	else
 		deflateEnd(&zbuf);
 	for (bufp = bufh; bufp != NULL; ) {
 		struct deflate_buf *p;
 
 		p = bufp;
 		bufp = bufp->next;
 		free(p, M_CRYPTO_DATA);
 	}
 bad2:
 	*out = NULL;
 	return 0;
 }
 
 void *
 z_alloc(nil, type, size)
 	void *nil;
 	u_int type, size;
 {
 	void *ptr;
 
 	ptr = malloc(type *size, M_CRYPTO_DATA, M_NOWAIT);
 	return ptr;
 }
 
 /*
  * Release hook installed as zbuf.zfree in deflate_global(): returns
  * ptr to the M_CRYPTO_DATA malloc type.  The nil argument is unused.
  */
 void
 z_free(nil, ptr)
 	void *nil, *ptr;
 {
 	free(ptr, M_CRYPTO_DATA);
 }
Index: head/sys/pc98/pc98/machdep.c
===================================================================
--- head/sys/pc98/pc98/machdep.c	(revision 258540)
+++ head/sys/pc98/pc98/machdep.c	(revision 258541)
@@ -1,3002 +1,3001 @@
 /*-
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_apic.h"
 #include "opt_atalk.h"
 #include "opt_atpic.h"
 #include "opt_compat.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_ipx.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_npx.h"
 #include "opt_perfmon.h"
-#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <pc98/pc98/pc98_machdep.h>
 
 #include <net/netisr.h>
 
 #include <machine/bootinfo.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/mp_watchdog.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/pcb_ext.h>
 #include <machine/proc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #include <machine/vm86.h>
 #ifdef PERFMON
 #include <machine/perfmon.h>
 #endif
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 
 #ifdef DEV_APIC
 #include <machine/apicvar.h>
 #endif
 
 #ifdef DEV_ISA
 #include <x86/isa/icu.h>
 #endif
 
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 extern void init386(int first);
 extern void dblfault_handler(void);
 
 extern void printcpuinfo(void);	/* XXX header file */
 extern void finishidentcpu(void);
 extern void panicifcpuunsupported(void);
 
 /*
  * Checks applied to user-supplied register state before it is loaded.
  *
  * CS_SECURE:  true when ISPL(cs) equals SEL_UPL (presumably: the selector
  *             requests user privilege level -- confirm against ISPL).
  * EFL_SECURE: true when the new flags `ef' differ from the old flags `oef'
  *             only in bits that belong to PSL_USERCHANGE.
  */
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 
 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 #define CPU_ENABLE_SSE
 #endif
 
 static void cpu_startup(void *);
 static void fpstate_drop(struct thread *td);
 static void get_fpcontext(struct thread *td, mcontext_t *mcp);
 static int  set_fpcontext(struct thread *td, const mcontext_t *mcp);
 #ifdef CPU_ENABLE_SSE
 static void set_fpregs_xmm(struct save87 *, struct savexmm *);
 static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
 #endif /* CPU_ENABLE_SSE */
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 int	need_pre_dma_flush;	/* If 1, use wbinvd before DMA transfer. */
 int	need_post_dma_flush;	/* If 1, use invd after DMA transfer. */
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 int	_udatasel, _ucodesel;
 u_int	basemem;
 
 static int	ispc98 = 1;
 SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, "");
 
 int cold = 1;
 
 #ifdef COMPAT_43
 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 #ifdef COMPAT_FREEBSD4
 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 
 long Maxmem = 0;
 long realmem = 0;
 
 /*
  * The number of PHYSMAP entries must be one less than the number of
  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  * physical address that is accessible by ISA DMA is split into two
  * PHYSSEG entries.
  */
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 
 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
 
 /* must be 2 less so 0 0 can signal end of chunks */
 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
 
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 struct pcpu __pcpu[MAXCPU];
 
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
 static void
 cpu_startup(dummy)
 	void *dummy;
 {
 	uintmax_t memsize;
 
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 	panicifcpuunsupported();
 #ifdef PERFMON
 	perfmon_init();
 #endif
 	realmem = Maxmem;
 
 	/*
 	 * Display physical memory.
 	 */
 	memsize = ptoa((uintmax_t)Maxmem);
 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
 
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)cnt.v_free_count),
 	    ptoa((uintmax_t)cnt.v_free_count) / 1048576);
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 	cpu_setregs();
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by kcall
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  */
 #ifdef COMPAT_43
 /*
  * COMPAT_43 signal delivery: build a struct osigframe for signal `ksi' on
  * the current thread's user stack (or on its alternate signal stack) and
  * redirect the trapframe so the thread resumes in the old-style signal
  * trampoline.
  *
  * catcher: user handler to record in the frame.
  * ksi:     signal being delivered (number, code, address).
  * mask:    signal mask to save in the frame (converted with SIG2OSIG).
  *
  * The process lock and the p_sigacts mutex must be held on entry (both are
  * asserted).  They are dropped while the frame is assembled and copied out
  * and are reacquired before returning.  If the copyout fails,
  * sigexit(td, SIGILL) is called.
  */
 static void
 osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct osigframe sf, *fp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		fp = (struct osigframe *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct osigframe));
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		fp = (struct osigframe *)regs->tf_esp - 1;
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
 	bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
 		sf.sf_siginfo.si_signo = sig;
 		sf.sf_siginfo.si_code = ksi->ksi_code;
 		sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
 		sf.sf_addr = 0;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_arg2 = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* psp is not read again until relocked; drop both locks for copyout. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* Save most if not all of trap frame. */
 	sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
 	sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
 	sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
 	sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
 	sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
 	sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
 	sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
 	sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
 	sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
 	sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
 	sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
 	sf.sf_siginfo.si_sc.sc_gs = rgs();
 	sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
 
 	/* Build the signal context to be used by osigreturn(). */
 	sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
 	SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
 	sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
 	sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
 	sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
 	sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
 	sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
 	sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
 		sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
 		sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
 		sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
 
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_siginfo.si_sc.sc_ps =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/* See sendsig() for comments. */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, fp, sizeof(*fp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Point the thread at the new frame and at the old-style sigcode. */
 	regs->tf_esp = (int)fp;
 	if (p->p_sysent->sv_sigcode_base != 0) {
 		regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
 		    szosigcode;
 	} else {
 		/* a.out sysentvec does not use shared page */
 		regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode;
 	}
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	load_gs(_udatasel);
 	regs->tf_ss = _udatasel;
 	/* Reacquire the locks that were held on entry. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_FREEBSD4
 /*
  * COMPAT_FREEBSD4 signal delivery: build a struct sigframe4 for signal
  * `ksi' on the current thread's user stack (or on its alternate signal
  * stack) and redirect the trapframe to the freebsd4 signal trampoline.
  *
  * catcher: user handler to record in the frame.
  * ksi:     signal being delivered (number, code, address).
  * mask:    signal mask to save in the frame's ucontext.
  *
  * The process lock and the p_sigacts mutex must be held on entry (both are
  * asserted).  They are dropped while the frame is copied out and are
  * reacquired before returning.  If the copyout fails,
  * sigexit(td, SIGILL) is called.
  */
 static void
 freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe4 sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_gs = rgs();
 	/* The trapframe is copied wholesale into the mcontext, from mc_fs on. */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
 	bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
 	bzero(sf.sf_uc.uc_mcontext.__spare__,
 	    sizeof(sf.sf_uc.uc_mcontext.__spare__));
 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe4));
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		sfp = (struct sigframe4 *)regs->tf_esp - 1;
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (register_t)&sfp->sf_si;
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si.si_signo = sig;
 		sf.sf_si.si_code = ksi->ksi_code;
 		sf.sf_si.si_addr = ksi->ksi_addr;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* psp is not read again until relocked; drop both locks for copyout. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
 
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_uc.uc_mcontext.mc_eflags =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/*
 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
 		 * syscalls made by the signal handler.  This just avoids
 		 * wasting time for our lazy fixup of such faults.  PSL_NT
 		 * does nothing in vm86 mode, but vm86 programs can set it
 		 * almost legitimately in probes for old cpu types.
 		 */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Point the thread at the new frame and at the freebsd4 sigcode. */
 	regs->tf_esp = (int)sfp;
 	regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
 	    szfreebsd4_sigcode;
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	/* Reacquire the locks that were held on entry. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * Deliver signal `ksi' to the current thread: build a struct sigframe on
  * the user stack (or on the alternate signal stack), 16-byte aligned, and
  * redirect the trapframe to the signal trampoline at sv_sigcode_base.
  *
  * catcher: user handler to record in the frame.
  * ksi:     signal being delivered.
  * mask:    signal mask to save in the frame's ucontext.
  *
  * Signals flagged in ps_freebsd4 or ps_osigset are handed to
  * freebsd4_sendsig() or osendsig() when the matching COMPAT option is
  * compiled in.
  *
  * The process lock and the p_sigacts mutex must be held on entry (both are
  * asserted).  They are dropped while the frame is copied out and are
  * reacquired before returning.  If the copyout fails,
  * sigexit(td, SIGILL) is called.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	struct segment_descriptor *sdp;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 #ifdef COMPAT_FREEBSD4
 	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
 		freebsd4_sendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 #ifdef COMPAT_43
 	if (SIGISMEMBER(psp->ps_osigset, sig)) {
 		osendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_gs = rgs();
 	/* The trapframe is copied wholesale into the mcontext, from mc_fs on. */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext);
 	fpstate_drop(td);
 	/*
 	 * Unconditionally fill the fsbase and gsbase into the mcontext.
 	 */
 	sdp = &td->td_pcb->pcb_fsd;
 	sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
 	    sdp->sd_lobase;
 	sdp = &td->td_pcb->pcb_gsd;
 	sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
 	    sdp->sd_lobase;
 	sf.sf_uc.uc_mcontext.mc_flags = 0;
 	bzero(sf.sf_uc.uc_mcontext.mc_spare2,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sp = td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe);
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		sp = (char *)regs->tf_esp - sizeof(struct sigframe);
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (register_t)&sfp->sf_si;
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* psp is not read again until relocked; drop both locks for copyout. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
 
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_uc.uc_mcontext.mc_eflags =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/*
 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
 		 * syscalls made by the signal handler.  This just avoids
 		 * wasting time for our lazy fixup of such faults.  PSL_NT
 		 * does nothing in vm86 mode, but vm86 programs can set it
 		 * almost legitimately in probes for old cpu types.
 		 */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Point the thread at the new frame and at the signal trampoline. */
 	regs->tf_esp = (int)sfp;
 	regs->tf_eip = p->p_sysent->sv_sigcode_base;
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	/* Reacquire the locks that were held on entry. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * MPSAFE
  */
 #ifdef COMPAT_43
 /*
  * COMPAT_43 sigreturn: restore registers, the on-stack flag and the signal
  * mask from the struct osigcontext at uap->sigcntxp.
  *
  * Returns EJUSTRETURN on success, the copyin() error if the context cannot
  * be read, or EINVAL when vm86 mode is requested without an initialised
  * vm86 area, when the new eflags fail EFL_SECURE, or when the new %cs
  * fails CS_SECURE (SIGBUS is also posted in that last case).
  */
 int
 osigreturn(td, uap)
 	struct thread *td;
 	struct osigreturn_args /* {
 		struct osigcontext *sigcntxp;
 	} */ *uap;
 {
 	struct osigcontext sc;
 	struct trapframe *regs;
 	struct osigcontext *scp;
 	int eflags, error;
 	ksiginfo_t ksi;
 
 	regs = td->td_frame;
 	/* Work on a kernel copy so later checks and uses see the same data. */
 	error = copyin(uap->sigcntxp, &sc, sizeof(sc));
 	if (error != 0)
 		return (error);
 	scp = &sc;
 	eflags = scp->sc_ps;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 
 		/* Only the user-changeable flag bits are taken from the context. */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		tf->tf_vm86_ds = scp->sc_ds;
 		tf->tf_vm86_es = scp->sc_es;
 		tf->tf_vm86_fs = scp->sc_fs;
 		tf->tf_vm86_gs = scp->sc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		if (!CS_SECURE(scp->sc_cs)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 		regs->tf_ds = scp->sc_ds;
 		regs->tf_es = scp->sc_es;
 		regs->tf_fs = scp->sc_fs;
 	}
 
 	/* Restore remaining registers. */
 	regs->tf_eax = scp->sc_eax;
 	regs->tf_ebx = scp->sc_ebx;
 	regs->tf_ecx = scp->sc_ecx;
 	regs->tf_edx = scp->sc_edx;
 	regs->tf_esi = scp->sc_esi;
 	regs->tf_edi = scp->sc_edi;
 	regs->tf_cs = scp->sc_cs;
 	regs->tf_ss = scp->sc_ss;
 	regs->tf_isp = scp->sc_isp;
 	regs->tf_ebp = scp->sc_fp;
 	regs->tf_esp = scp->sc_sp;
 	regs->tf_eip = scp->sc_pc;
 	regs->tf_eflags = eflags;
 
 #if defined(COMPAT_43)
 	if (scp->sc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 	kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
 	    SIGPROCMASK_OLD);
 	return (EJUSTRETURN);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_FREEBSD4
 /*
  * MPSAFE
  */
 /*
  * COMPAT_FREEBSD4 sigreturn: restore the trapframe, the on-stack flag and
  * the signal mask from the struct ucontext4 at uap->sigcntxp.
  *
  * Returns EJUSTRETURN on success, the copyin() error if the context cannot
  * be read, or EINVAL when vm86 mode is requested without an initialised
  * vm86 area, when the new eflags fail EFL_SECURE, or when the new %cs
  * fails CS_SECURE (SIGBUS is also posted in that last case).
  */
 int
 freebsd4_sigreturn(td, uap)
 	struct thread *td;
 	struct freebsd4_sigreturn_args /* {
 		const ucontext4 *sigcntxp;
 	} */ *uap;
 {
 	struct ucontext4 uc;
 	struct trapframe *regs;
 	struct ucontext4 *ucp;
 	int cs, eflags, error;
 	ksiginfo_t ksi;
 
 	/* Work on a kernel copy so later checks and uses see the same data. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 		/* Only the user-changeable flag bits are taken from the context. */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		/*
 		 * Load the whole trapframe from the context, then overwrite
 		 * eflags with the filtered value and move the segment values
 		 * into the vm86 slots.
 		 */
 		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 		tf->tf_eflags = eflags;
 		tf->tf_vm86_ds = tf->tf_ds;
 		tf->tf_vm86_es = tf->tf_es;
 		tf->tf_vm86_fs = tf->tf_fs;
 		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
 			uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, eflags);
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, cs);
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
 		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 	}
 
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * MPSAFE
  */
 /*
  * sigreturn system call: restore the trapframe, floating-point context,
  * on-stack flag and signal mask from the ucontext_t at uap->sigcntxp.
  *
  * Returns EJUSTRETURN on success, the copyin() error if the context cannot
  * be read, the set_fpcontext() error if the FP state is rejected, or
  * EINVAL when vm86 mode is requested without an initialised vm86 area,
  * when the new eflags fail EFL_SECURE, or when the new %cs fails CS_SECURE
  * (SIGBUS is also posted in that last case).
  */
 int
 sys_sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
 	struct trapframe *regs;
 	ucontext_t *ucp;
 	int cs, eflags, error, ret;
 	ksiginfo_t ksi;
 
 	/* Work on a kernel copy so later checks and uses see the same data. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 
 		/* Only the user-changeable flag bits are taken from the context. */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		/*
 		 * Load the whole trapframe from the context, then overwrite
 		 * eflags with the filtered value and move the segment values
 		 * into the vm86 slots.
 		 */
 		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 		tf->tf_eflags = eflags;
 		tf->tf_vm86_ds = tf->tf_ds;
 		tf->tf_vm86_es = tf->tf_es;
 		tf->tf_vm86_fs = tf->tf_fs;
 		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
 			uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, eflags);
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
 			    td->td_proc->p_pid, td->td_name, cs);
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
 		/* FP state is validated and loaded before the trapframe is touched. */
 		ret = set_fpcontext(td, &ucp->uc_mcontext);
 		if (ret != 0)
 			return (ret);
 		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 	}
 
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 
 /*
  * Machine-dependent part of boot().
  *
  * Nothing needs doing here at present; work may be moved back here from
  * boot() in the future.  The `howto' argument is accepted and ignored.
  */
 void
 cpu_boot(int howto)
 {
 
 	(void)howto;
 }
 
 /*
  * Hook for flushing the D-cache after non-DMA I/O so that the I-cache can
  * later be made coherent.  No action is taken in this implementation;
  * both arguments are ignored.
  */
 void
 cpu_flush_dcache(void *ptr, size_t len)
 {
 
 	/* Not applicable */
 	(void)ptr;
 	(void)len;
 }
 
 /*
  * Estimate the clock frequency of CPU `cpu_id' and store it in *rate.
  *
  * The TSC is sampled around a DELAY(1000) with interrupts disabled and the
  * difference is multiplied by 1000.  Returns 0 on success, EINVAL when the
  * CPU id is unknown or `rate' is NULL, and EOPNOTSUPP when the CPU lacks
  * CPUID_TSC.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	uint64_t before, after, elapsed;
 	register_t saved_intr;
 
 	if (pcpu_find(cpu_id) == NULL)
 		return (EINVAL);
 	if (rate == NULL)
 		return (EINVAL);
 	if (!(cpu_feature & CPUID_TSC))
 		return (EOPNOTSUPP);
 
 #ifdef SMP
 	/* With more than one CPU, bind to the requested one first. */
 	if (smp_cpus > 1) {
 		thread_lock(curthread);
 		sched_bind(curthread, cpu_id);
 		thread_unlock(curthread);
 	}
 #endif
 
 	/* Time a short, fixed delay with interrupts off. */
 	saved_intr = intr_disable();
 	before = rdtsc();
 	DELAY(1000);
 	after = rdtsc();
 	intr_restore(saved_intr);
 	elapsed = after - before;
 	*rate = elapsed * 1000;
 
 #ifdef SMP
 	/* Undo the binding made above. */
 	if (smp_cpus > 1) {
 		thread_lock(curthread);
 		sched_unbind(curthread);
 		thread_unlock(curthread);
 	}
 #endif
 
 	return (0);
 }
 
 
 /*
  * Park the CPU for good: issue halt() over and over and never return.
  */
 void
 cpu_halt(void)
 {
 
 	while (1) {
 		halt();
 	}
 }
 
 /*
  * Knob exposed as the machdep.idle_mwait loader tunable and read/write
  * sysctl; enabled (1) by default.
  */
 static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
 TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
     0, "Use MONITOR/MWAIT for short idle");
 
 /*
  * Values the idle routines store in the per-CPU monitorbuf word to record
  * what the CPU is currently doing.
  */
 #define	STATE_RUNNING	0x0
 #define	STATE_MWAIT	0x1
 #define	STATE_SLEEPING	0x2
 
 /*
  * Idle method that halts the CPU.  Marks this CPU's monitorbuf as
  * STATE_SLEEPING for the duration and STATE_RUNNING on return.  The sbt
  * argument is not used here.
  */
 static void
 cpu_idle_hlt(sbintime_t sbt)
 {
 	int *state;
 
 	state = (int *)PCPU_PTR(monitorbuf);
 	*state = STATE_SLEEPING;
 
 	/*
 	 * Since we may be in a critical section from cpu_idle(), if
 	 * an interrupt fires during that critical section we may have
 	 * a pending preemption.  If the CPU halts, then that thread
 	 * may not execute until a later interrupt awakens the CPU.
 	 * To handle this race, check for a runnable thread after
 	 * disabling interrupts and immediately return if one is
 	 * found.  Also, we must absolutely guarantee that hlt is
 	 * the next instruction after sti.  This ensures that any
 	 * interrupt that fires after the call to disable_intr() will
 	 * immediately awaken the CPU from hlt.  Finally, please note
 	 * that on x86 this works fine because interrupts are enabled only
 	 * after the instruction following sti takes place, while IF is set
 	 * to 1 immediately, allowing the hlt instruction to acknowledge the
 	 * interrupt.
 	 */
 	disable_intr();
 	if (sched_runnable())
 		enable_intr();
 	else
 		__asm __volatile("sti; hlt");
 	*state = STATE_RUNNING;
 }
 
 /*
  * MWAIT cpu power states.  Lower 4 bits are sub-states.
  * Only MWAIT_C1 is used by cpu_idle_mwait() below.
  */
 #define	MWAIT_C0	0xf0
 #define	MWAIT_C1	0x00
 #define	MWAIT_C2	0x10
 #define	MWAIT_C3	0x20
 #define	MWAIT_C4	0x30
 
 /*
  * Idle method based on MONITOR/MWAIT.  Marks this CPU's monitorbuf as
  * STATE_MWAIT while waiting and STATE_RUNNING on return.  The sbt
  * argument is not used here.
  */
 static void
 cpu_idle_mwait(sbintime_t sbt)
 {
 	int *state;
 
 	state = (int *)PCPU_PTR(monitorbuf);
 	*state = STATE_MWAIT;
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable()) {
 		enable_intr();
 		*state = STATE_RUNNING;
 		return;
 	}
 	/*
 	 * cpu_monitor() is handed the state word (presumably arming MONITOR
 	 * on it -- confirm against its definition).  The word is re-checked
 	 * afterwards because cpu_idle_wakeup() may already have changed it
 	 * from STATE_MWAIT to STATE_RUNNING, in which case mwait is skipped.
 	 */
 	cpu_monitor(state, 0, 0);
 	if (*state == STATE_MWAIT)
 		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
 	else
 		enable_intr();
 	*state = STATE_RUNNING;
 }
 
 /*
  * Idle method that busy-waits.  Marks this CPU's monitorbuf as
  * STATE_RUNNING, then spins for at most 1000 rounds, returning early as
  * soon as sched_runnable() reports work.  The sbt argument is not used.
  */
 static void
 cpu_idle_spin(sbintime_t sbt)
 {
 	int spins;
 
 	*(int *)PCPU_PTR(monitorbuf) = STATE_RUNNING;
 
 	/*
 	 * Polling sched_runnable() is racy, but because we poll in a loop a
 	 * single miss costs very little, and polling is far better than not
 	 * checking at all.
 	 */
 	spins = 1000;
 	while (spins-- > 0 && !sched_runnable())
 		cpu_spinwait();
 }
 
 /* Currently selected main idle method; cpu_idle_hlt by default. */
 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
 
 /*
  * Idle the current CPU.
  *
  * When busy is non-zero and MONITOR/MWAIT is both supported and enabled
  * via idle_mwait, the mwait method is used directly.  Otherwise the
  * selected cpu_idle_fn is called; when busy is zero that call is made
  * inside a critical section with the timers switched to idle mode and
  * back to active mode afterwards.
  */
 void
 cpu_idle(int busy)
 {
 	sbintime_t sbt;
 
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu);
 #if defined(MP_WATCHDOG)
 	ap_watchdog(PCPU_GET(cpuid));
 #endif
 	if (busy && (cpu_feature2 & CPUID2_MON) && idle_mwait) {
 		/* Busy and mwait usable: take the fast method. */
 		cpu_idle_mwait(busy);
 	} else if (busy) {
 		/* Busy without mwait: main idle method, no timer switch. */
 		cpu_idle_fn(-1);
 	} else {
 		/* We have time: run the main method with timers idle. */
 		critical_enter();
 		sbt = cpu_idleclock();
 		cpu_idle_fn(sbt);
 		cpu_activeclock();
 		critical_exit();
 	}
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu);
 }
 
 /*
  * Try to wake the given CPU without an IPI by inspecting its monitorbuf
  * state word.  Returns 0 when the CPU is in STATE_SLEEPING and 1
  * otherwise; a CPU found in STATE_MWAIT has its state word rewritten to
  * STATE_RUNNING first.
  */
 int
 cpu_idle_wakeup(int cpu)
 {
 	struct pcpu *target;
 	int *monitor_state;
 
 	target = pcpu_find(cpu);
 	monitor_state = (int *)target->pc_monitorbuf;
 
 	/*
 	 * No atomics are needed here: losing the race only costs an
 	 * unnecessary IPI.
 	 */
 	if (*monitor_state == STATE_SLEEPING)
 		return (0);
 	if (*monitor_state == STATE_MWAIT)
 		*monitor_state = STATE_RUNNING;
 	return (1);
 }
 
 /*
  * Table of selectable idle methods, terminated by a NULL/NULL entry.
  * id_fn is the idle routine and id_name the name exposed through the
  * machdep.idle and machdep.idle_available sysctls.
  *
  * Ordered by speed/power consumption.
  */
 struct {
 	void	*id_fn;
 	char	*id_name;
 } idle_tbl[] = {
 	{ cpu_idle_spin, "spin" },
 	{ cpu_idle_mwait, "mwait" },
 	{ cpu_idle_hlt, "hlt" },
 	{ NULL, NULL }
 };
 
 /*
  * Sysctl handler for machdep.idle_available: reports the names in
  * idle_tbl as a ", "-separated string, leaving out any name containing
  * "mwait" when CPUID2_MON is not set in cpu_feature2.
  */
 static int
 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
 {
 	char *list, *cursor;
 	const char *sep;
 	int idx, rc;
 
 	list = malloc(256, M_TEMP, M_WAITOK);
 	cursor = list;
 	for (idx = 0; idle_tbl[idx].id_name != NULL; idx++) {
 		/* Hide mwait-based methods when the CPU lacks MONITOR. */
 		if ((cpu_feature2 & CPUID2_MON) == 0 &&
 		    strstr(idle_tbl[idx].id_name, "mwait") != NULL)
 			continue;
 		/* Every name but the first is preceded by a separator. */
 		sep = (cursor == list) ? "" : ", ";
 		cursor += sprintf(cursor, "%s%s", sep, idle_tbl[idx].id_name);
 	}
 	rc = sysctl_handle_string(oidp, list, 0, req);
 	free(list, M_TEMP);
 	return (rc);
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
     0, 0, idle_sysctl_available, "A", "list of available idle functions");
 
 /*
  * Sysctl handler for machdep.idle.
  *
  * Reports the idle_tbl name matching cpu_idle_fn ("unknown" if none
  * matches).  When a new value is supplied, cpu_idle_fn is switched to the
  * table entry with that exact name; names containing "mwait" are not
  * selectable when CPUID2_MON is clear.  Returns EINVAL if no entry
  * qualifies.
  */
 static int
 idle_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	char name[16];
 	char *current;
 	int idx, rc;
 
 	/* Look up the name of the method currently in use. */
 	current = "unknown";
 	for (idx = 0; idle_tbl[idx].id_name != NULL; idx++) {
 		if (idle_tbl[idx].id_fn != cpu_idle_fn)
 			continue;
 		current = idle_tbl[idx].id_name;
 		break;
 	}
 	strncpy(name, current, sizeof(name));
 
 	rc = sysctl_handle_string(oidp, name, sizeof(name), req);
 	if (rc != 0)
 		return (rc);
 	if (req->newptr == NULL)
 		return (0);
 
 	/* A new name was written: find it among the usable methods. */
 	for (idx = 0; idle_tbl[idx].id_name != NULL; idx++) {
 		if (strcmp(idle_tbl[idx].id_name, name) != 0)
 			continue;
 		if ((cpu_feature2 & CPUID2_MON) == 0 &&
 		    strstr(idle_tbl[idx].id_name, "mwait") != NULL)
 			continue;
 		cpu_idle_fn = idle_tbl[idx].id_fn;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
     idle_sysctl, "A", "currently selected idle function");
 
 /*
  * Reset registers to default values on exec.
  *
  * td is the thread being exec'd, imgp supplies the entry address and
  * ps_strings value of the new image, and stack is the initial user stack
  * pointer.  The trapframe is zeroed and refilled, any per-process LDT is
  * released, debug register state is cleared and the FP state is dropped.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *regs = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 
 	/* Reset pc->pcb_gs and %gs before possibly invalidating it. */
 	pcb->pcb_gs = _udatasel;
 	load_gs(_udatasel);
 
 	/*
 	 * dt_lock is taken here and explicitly released only on the
 	 * no-LDT path.  NOTE(review): presumably user_ldt_free() drops
 	 * dt_lock itself -- confirm against its definition.
 	 */
 	mtx_lock_spin(&dt_lock);
 	if (td->td_proc->p_md.md_ldt)
 		user_ldt_free(td);
 	else
 		mtx_unlock_spin(&dt_lock);
   
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_eip = imgp->entry_addr;
 	regs->tf_esp = stack;
 	/*
 	 * NOTE(review): regs was zeroed just above, so
 	 * (regs->tf_eflags & PSL_T) evaluates to 0 here.  Confirm whether
 	 * preserving the trace flag across exec was intended.
 	 */
 	regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
 	regs->tf_ss = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_cs = _ucodesel;
 
 	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
 	regs->tf_ebx = imgp->ps_strings;
 
         /*
          * Reset the hardware debug registers if they were in use.
          * They won't have any meaning for the newly exec'd process.  
          */
         if (pcb->pcb_flags & PCB_DBREGS) {
                 pcb->pcb_dr0 = 0;
                 pcb->pcb_dr1 = 0;
                 pcb->pcb_dr2 = 0;
                 pcb->pcb_dr3 = 0;
                 pcb->pcb_dr6 = 0;
                 pcb->pcb_dr7 = 0;
                 if (pcb == curpcb) {
 		        /*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
 			 * the next process we switch to.
 			 */
 		        reset_dbregs();
                 }
                 pcb->pcb_flags &= ~PCB_DBREGS;
         }
 
 	/*
 	 * Initialize the math emulator (if any) for the current process.
 	 * Actually, just clear the bit that says that the emulator has
 	 * been initialized.  Initialization is delayed until the process
 	 * traps to the emulator (if it is done at all) mainly because
 	 * emulators don't provide an entry point for initialization.
 	 */
 	td->td_pcb->pcb_flags &= ~FP_SOFTFP;
 	pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;
 
 	/*
 	 * Drop the FP state if we hold it, so that the process gets a
 	 * clean FP state if it uses the FPU again.
 	 */
 	fpstate_drop(td);
 
 	/*
 	 * XXX - Linux emulator
 	 * Make sure edx is 0x0 on entry. Linux binaries depend
 	 * on it.
 	 */
 	td->td_retval[1] = 0;
 }
 
 /*
  * Set up %cr0 and %gs for this CPU: OR the CR0_MP, CR0_NE, CR0_TS,
  * CR0_WP and CR0_AM bits into the current %cr0 value and load %gs with
  * _udatasel.
  */
 void
 cpu_setregs(void)
 {
 	unsigned int ctl;
 
 	/*
 	 * CR0_MP, CR0_NE and CR0_TS are for NPX (FPU) support.
 	 *
 	 * All ESC (i.e., NPX) instructions and all WAIT instructions are to
 	 * trap.  CR0_MP has to be set and CR0_TS used to control the trap,
 	 * since CR0_EM alone does not make WAIT instructions trap.
 	 * Trapping WAIT matters: without it the "wait" variants of no-wait
 	 * control instructions would degenerate to the "no-wait" variants
 	 * after FP context switches while working correctly otherwise.  It
 	 * matters most when there is no NPX at all, where the "wait"
 	 * variants would otherwise always degenerate.
 	 *
 	 * CR0_NE is set to get correct error reporting on 486DX's; setting
 	 * it should fail or do nothing on lesser processors.
 	 */
 	ctl = rcr0() | CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
 	load_cr0(ctl);
 	load_gs(_udatasel);
 }
 
 /* Exported read-only through the machdep.guessed_bootdev sysctl. */
 u_long bootdev;		/* not a struct cdev *- encoding is different */
 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
 	CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
 
 /*
  * Initialize 386 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 
 /* Selector of the default LDT; set and loaded with lldt() in init386(). */
 int _default_ldt;
 
 union descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
 union descriptor ldt[NLDT];		/* local descriptor table */
 static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 struct region_descriptor r_gdt, r_idt;	/* table descriptors */
 struct mtx dt_lock;			/* lock for GDT and LDT */
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 extern int has_f00f_bug;
 #endif
 
 /*
  * TSS and stack for the double-fault task: the GPANIC_SEL entry of
  * gdt_segs points at dblfault_tss, and init386() fills it in.
  */
 static struct i386tss dblfault_tss;
 static char dblfault_stack[PAGE_SIZE];
 
 extern  vm_offset_t	proc0kstack;
 
 
 /*
  * software prototypes -- in more palatable form.
  *
  * Each entry is the template for the GDT slot named in the comment above
  * it.  init386() adjusts the limits of several entries and the bases of
  * GPRIV_SEL and GPROC0_SEL, then converts the table into gdt[] with
  * ssdtosd().
  *
  * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
  * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GPRIV_SEL	1 SMP Per-Processor Private Data Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUFS_SEL	2 %fs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUGS_SEL	3 %gs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GCODE_SEL	4 Code Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GDATA_SEL	5 Data Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUCODE_SEL	6 Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUDATA_SEL	7 Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
 {	.ssd_base = 0x400,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
 {
 	.ssd_base = 0x0,
 	.ssd_limit = sizeof(struct i386tss)-1,
 	.ssd_type = SDT_SYS386TSS,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GLDT_SEL	10 LDT Descriptor */
 {	.ssd_base = (int) ldt,
 	.ssd_limit = sizeof(ldt)-1,
 	.ssd_type = SDT_SYSLDT,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	11 User LDT Descriptor per process */
 {	.ssd_base = (int) ldt,
 	.ssd_limit = (512 * sizeof(union descriptor)-1),
 	.ssd_type = SDT_SYSLDT,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GPANIC_SEL	12 Panic Tss Descriptor */
 {	.ssd_base = (int) &dblfault_tss,
 	.ssd_limit = sizeof(struct i386tss)-1,
 	.ssd_type = SDT_SYS386TSS,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
 {	.ssd_base = 0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = 0,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GNDIS_SEL	18 NDIS Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 };
 
 /*
  * Templates for the default LDT.  init386() sets the limits of the
  * LUCODE_SEL and LUDATA_SEL entries and converts every entry into ldt[]
  * with ssdtosd().
  */
 static struct soft_segment_descriptor ldt_segs[] = {
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 	/* Null Descriptor - overwritten by call gate */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 	/* Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_xx = 0, .ssd_xx1 = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 };
 
 /*
  * Fill in interrupt descriptor table entry idx.
  *
  *	idx	index of the entry within idt
  *	func	handler entry point; its address is split across the
  *		gd_looffset and gd_hioffset fields
  *	typ	gate type, stored in gd_type
  *	dpl	descriptor privilege level, stored in gd_dpl
  *	selec	segment selector, stored in gd_selector
  *
  * The entry is marked present and its gd_stkcpy and gd_xx fields are
  * cleared.
  */
 void
 setidt(int idx, inthand_t *func, int typ, int dpl, int selec)
 {
 	struct gate_descriptor *ip;
 	uintptr_t offset;
 
 	/*
 	 * Work on the address as an unsigned value so that taking the
 	 * upper half never right-shifts a negative int.
 	 */
 	offset = (uintptr_t)func;
 
 	ip = idt + idx;
 	ip->gd_looffset = offset;
 	ip->gd_selector = selec;
 	ip->gd_stkcpy = 0;
 	ip->gd_xx = 0;
 	ip->gd_type = typ;
 	ip->gd_dpl = dpl;
 	ip->gd_p = 1;
 	ip->gd_hioffset = offset >> 16;
 }
 
 /*
  * Low-level entry points, defined elsewhere, that init386() installs in
  * the IDT with setidt() (lcall_syscall is used for the LDT call gate).
  */
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm),
 #ifdef KDTRACE_HOOKS
 	IDTVEC(dtrace_ret),
 #endif
 	IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
 
 #ifdef DDB
 /*
  * Display the index and function name of any IDT entries that don't use
  * the default 'rsvd' entry point.  Output stops early once db_pager_quit
  * is set.
  */
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *ip;
 	int idx;
 	uintptr_t func;
 
 	ip = idt;
 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
 		/*
 		 * Reassemble the handler address from its two halves.  The
 		 * high half is widened to uintptr_t before shifting so that
 		 * a set top bit is never shifted into the sign bit of an
 		 * int.
 		 */
 		func = ((uintptr_t)ip->gd_hioffset << 16) | ip->gd_looffset;
 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
 			db_printf("%3d\t", idx);
 			db_printsym(func, DB_STGY_PROC);
 			db_printf("\n");
 		}
 		ip++;
 	}
 }
 
 /* Show privileged registers. */
 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 {
 	uint64_t idtr, gdtr;
 
 	idtr = ridt();
 	db_printf("idtr\t0x%08x/%04x\n",
 	    (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
 	gdtr = rgdt();
 	db_printf("gdtr\t0x%08x/%04x\n",
 	    (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
 	db_printf("ldtr\t0x%04x\n", rldt());
 	db_printf("tr\t0x%04x\n", rtr());
 	db_printf("cr0\t0x%08x\n", rcr0());
 	db_printf("cr2\t0x%08x\n", rcr2());
 	db_printf("cr3\t0x%08x\n", rcr3());
 	db_printf("cr4\t0x%08x\n", rcr4());
 }
 #endif
 
 /*
  * Convert a hardware segment descriptor (sd) into its software form
  * (ssd).  The base and limit are reassembled from their split fields;
  * type, dpl, present, def32 and granularity are copied as they are.
  * Other fields of *ssd are left untouched.
  */
 void
 sdtossd(struct segment_descriptor *sd, struct soft_segment_descriptor *ssd)
 {
 	/*
 	 * Widen the high parts to unsigned before shifting so that a set
 	 * top bit is never shifted into the sign bit of an int.
 	 */
 	ssd->ssd_base  = ((unsigned)sd->sd_hibase << 24) | sd->sd_lobase;
 	ssd->ssd_limit = ((unsigned)sd->sd_hilimit << 16) | sd->sd_lolimit;
 	ssd->ssd_type  = sd->sd_type;
 	ssd->ssd_dpl   = sd->sd_dpl;
 	ssd->ssd_p     = sd->sd_p;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran  = sd->sd_gran;
 }
 
 /*
  * Clamp basemem to 640K and remap the region between the end of base
  * memory and ISA_HOLE_START, both in the kernel map and in the vm86
  * page table.
  */
 static void
 basemem_setup(void)
 {
 	vm_paddr_t hole_pa;
 	pt_entry_t *vm86_pt;
 	int pg;
 
 	if (basemem > 640) {
 		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
 			basemem);
 		basemem = 640;
 	}
 
 	/*
 	 * XXX When base memory ends below 640K there is a `hole' between
 	 * its end and the start of ISA memory, which may be empty or may
 	 * hold BIOS code or data.  It is mapped read/write here so that the
 	 * BIOS can write to it.  (Everything from 0 up to the physical end
 	 * of the kernel starts out mapped read-only and is then remapped
 	 * piecemeal; whatever is not remapped stays as read-only holes the
 	 * kernel does not use.  Base memory lies below the physical end of
 	 * the kernel and is such a read-only hole right now; the part from
 	 * PAGE_SIZE to (trunc_page(biosbasemem * 1024) - 1) is remapped
 	 * and used by the kernel later.)
 	 *
 	 * This resembles what pmap_mapdev does, but as nothing has to be
 	 * allocated the mapping is simply changed.
 	 */
 	hole_pa = trunc_page(basemem * 1024);
 	while (hole_pa < ISA_HOLE_START) {
 		pmap_kenter(KERNBASE + hole_pa, hole_pa);
 		hole_pa += PAGE_SIZE;
 	}
 
 	/*
 	 * Enter the pages between basemem and ISA_HOLE_START, if any, r/w
 	 * into the vm86 page table as well, so vm86 can scribble on them
 	 * through the vm86 map too.  XXX: why 2 ways for this and only 1
 	 * way for page 0, at least as initialized here?
 	 */
 	vm86_pt = (pt_entry_t *)vm86paddr;
 	pg = basemem / 4;
 	while (pg < 160) {
 		vm86_pt[pg] = (pg << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
 		pg++;
 	}
 }
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  * The dump_avail array is filled in alongside, and the message buffer is
  * carved out of the end of the last phys_avail chunk and mapped.
  *
  * If we cannot accurately determine the physical memory map, then use
  * value from the 0xE801 call, and failing that, the RTC.
  * NOTE(review): the code below takes its sizes from pc98_getmemsize();
  * the 0xE801/RTC remark may not describe this implementation -- confirm.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  * first is the physical address just past the kernel; pages in
  * [KERNLOAD, first) are kept out of phys_avail.
  *
  * XXX first should be vm_paddr_t.
  */
 static void
 getmemsize(int first)
 {
 	int off, physmap_idx, pa_indx, da_indx;
 	u_long physmem_tunable, memtest;
 	vm_paddr_t physmap[PHYSMAP_SIZE];
 	pt_entry_t *pte;
 	quad_t dcons_addr, dcons_size;
 	int i;
 	int pg_n;
 	u_int extmem;
 	u_int under16;
 	vm_paddr_t pa;
 
 	bzero(physmap, sizeof(physmap));
 
 	/* XXX - some of EPSON machines can't use PG_N */
 	pg_n = PG_N;
 	if (pc98_machine_type & M_EPSON_PC98) {
 		switch (epson_machine_id) {
 #ifdef WB_CACHE
 		default:
 #endif
 		case EPSON_PC486_HX:
 		case EPSON_PC486_HG:
 		case EPSON_PC486_HA:
 			pg_n = 0;
 			break;
 		}
 	}
 
 	under16 = pc98_getmemsize(&basemem, &extmem);
 	basemem_setup();
 
 	/*
 	 * physmap[0..1] describes base memory; physmap[2..3] describes
 	 * extended memory starting at 1MB.  basemem and extmem are in KB.
 	 */
 	physmap[0] = 0;
 	physmap[1] = basemem * 1024;
 	physmap_idx = 2;
 	physmap[physmap_idx] = 0x100000;
 	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
 
 	/*
 	 * Now, physmap contains a map of physical memory.
 	 */
 
 #ifdef SMP
 	/* make hole for AP bootstrap code */
 	physmap[1] = mp_bootaddress(physmap[1]);
 #endif
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this 
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 #ifdef MAXMEM
 	Maxmem = MAXMEM / 4;
 #endif
 
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	/*
 	 * By default keep the memtest enabled.  Use a general name so that
 	 * one could eventually do more with the code than just disable it.
 	 */
 	memtest = 1;
 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/*
 	 * If Maxmem has been increased beyond what the system has detected,
 	 * extend the last memory segment to the new limit.
 	 */ 
 	if (atop(physmap[physmap_idx + 1]) < Maxmem)
 		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
 
 	/*
 	 * We need to divide chunk if Maxmem is larger than 16MB and
 	 * under 16MB area is not full of memory.
 	 * (1) system area (15-16MB region) is cut off
 	 * (2) extended memory is only over 16MB area (ex. Melco "HYPERMEMORY")
 	 */
 	if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) {
 		/* 15M - 16M region is cut off, so need to divide chunk */
 		physmap[physmap_idx + 1] = under16 * 1024;
 		physmap_idx += 2;
 		physmap[physmap_idx] = 0x1000000;
 		physmap[physmap_idx + 1] = physmap[2] + extmem * 1024;
 	}
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 */
 	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	/* CMAP1/CADDR1: the pte and address used to map each page under test. */
 	pte = CMAP1;
 
 	/*
 	 * Get dcons buffer address
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 *
 	 * For each page: pages belonging to the kernel or the dcons buffer
 	 * skip straight to do_dump_avail (recorded in dump_avail only);
 	 * pages that fail the memory test are recorded in neither array;
 	 * all other pages are added to phys_avail and then dump_avail.
 	 */
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR1;
 
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 */
 			if (pa >= KERNLOAD && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 			if (memtest == 0)
 				goto skip_memtest;
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | pg_n;
 			invltlb();
 
 			/* Save the first word so it can be restored below. */
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 skip_memtest:
 			/*
 			 * Adjust array of valid/good pages.
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			/* Same extend-or-start-new-chunk logic for dump_avail. */
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa;	/* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			/* phys_avail overflowed: stop scanning this chunk. */
 			if (full)
 				break;
 		}
 	}
 	/* Tear down the temporary test mapping. */
 	*pte = 0;
 	invltlb();
 	
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(msgbufsize);
 
 	/* Map the message buffer. */
 	for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
 		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
 		    off);
 
 	PT_UPDATES_FLUSH();
 }
 
 void
 init386(first)
 	int first;
 {
 	struct gate_descriptor *gdp;
 	int gsel_tss, metadata_missing, x, pa;
 	size_t kstack0_sz;
 	struct pcpu *pc;
 
 	thread0.td_kstack = proc0kstack;
 	thread0.td_kstack_pages = KSTACK_PAGES;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
 
 	/*
  	 * This may be done better later if it gets more high level
  	 * components in it. If so just link td->td_proc here.
 	 */
 	proc_linkup0(&proc0, &thread0);
 
 	/*
 	 * Initialize DMAC
 	 */
 	pc98_init_dmac();
 
 	metadata_missing = 0;
 	if (bootinfo.bi_modulep) {
 		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
 		preload_bootstrap_relocate(KERNBASE);
 	} else {
 		metadata_missing = 1;
 	}
 	if (envmode == 1)
 		kern_envp = static_env;
 	else if (bootinfo.bi_envp)
 		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	/*
 	 * Make gdt memory segments.  All segments cover the full 4GB
 	 * of address space and permissions are enforced at page level.
 	 */
 	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
 
 	pc = &__pcpu[0];
 	gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
 	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
 
 	for (x = 0; x < NGDT; x++)
 		ssdtosd(&gdt_segs[x], &gdt[x].sd);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base =  (int) gdt;
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
 	lgdt(&r_gdt);
 
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
 		pmap_kenter(pa + KERNBASE, pa);
 	dpcpu_init((void *)(first + KERNBASE), 0);
 	first += DPCPU_SIZE;
 	PCPU_SET(prvspace, pc);
 	PCPU_SET(curthread, &thread0);
 	PCPU_SET(curpcb, thread0.td_pcb);
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
 
 	/* make ldt memory segments */
 	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
 	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
 	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
 		ssdtosd(&ldt_segs[x], &ldt[x].sd);
 
 	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
 	lldt(_default_ldt);
 	PCPU_SET(currentldt, _default_ldt);
 
 	/* exceptions */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
 		    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DE, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
  	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYS386IGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL
 	    , GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DF, 0,  SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
 	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_PF, &IDTVEC(page),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
  	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #ifdef KDTRACE_HOOKS
 	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #endif
 
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (int) idt;
 	lidt(&r_idt);
 
 	/*
 	 * Initialize the i8254 before the console so that console
 	 * initialization can use DELAY().
 	 */
 	i8254_init();
 
 	/*
 	 * Initialize the console before we print anything out.
 	 */
 	cninit();
 
 	if (metadata_missing)
 		printf("WARNING: loader(8) metadata is missing!\n");
 
 #ifdef DEV_ISA
 #ifdef DEV_ATPIC
 	atpic_startup();
 #else
 	/* Reset and mask the atpics and leave them shut down. */
 	atpic_reset();
 
 	/*
 	 * Point the ICU spurious interrupt vectors at the APIC spurious
 	 * interrupt handler.
 	 */
 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #endif
 #endif
 
 #ifdef DDB
 	ksym_start = bootinfo.bi_symtab;
 	ksym_end = bootinfo.bi_esymtab;
 #endif
 
 	kdb_init();
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 #endif
 
 	finishidentcpu();	/* Final stage of CPU initialization */
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	initializecpu();	/* Initialize CPU registers */
 
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
 	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
 	    kstack0_sz - sizeof(struct pcb) - 16);
 	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
 	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
 	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
 	ltr(gsel_tss);
 
 	/* pointer to selector slot for %fs/%gs */
 	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
 
 	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
 	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
 	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
 	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
 	dblfault_tss.tss_cr3 = (int)IdlePTD;
 	dblfault_tss.tss_eip = (int)dblfault_handler;
 	dblfault_tss.tss_eflags = PSL_KERNEL;
 	dblfault_tss.tss_ds = dblfault_tss.tss_es =
 	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
 	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
 	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
 	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
 
 	vm86_initialize();
 	getmemsize(first);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 	msgbufinit(msgbufp, msgbufsize);
 
 	/* make a call gate to reenter kernel with */
 	gdp = &ldt[LSYS5CALLS_SEL].gd;
 
 	x = (int) &IDTVEC(lcall_syscall);
 	gdp->gd_looffset = x;
 	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
 	gdp->gd_stkcpy = 1;
 	gdp->gd_type = SDT_SYS386CGT;
 	gdp->gd_dpl = SEL_UPL;
 	gdp->gd_p = 1;
 	gdp->gd_hioffset = x >> 16;
 
 	/* XXX does this work? */
 	/* XXX yes! */
 	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
 	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
 
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
 	thread0.td_pcb->pcb_ext = 0;
 	thread0.td_frame = &proc0_tf;
 }
 
 /*
  * Per-CPU data initialization entry point.  The body is empty here: no
  * argument is read and nothing is modified.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 
 }
 
 /*
  * Begin a spinlock section for the current thread.
  *
  * On the outermost entry intr_disable() is called and the value it returns
  * is stashed in the thread's MD area together with a nesting count of 1.
  * Nested entries only increment the count.  Every entry, nested or not,
  * finishes by calling critical_enter().
  */
 void
 spinlock_enter(void)
 {
 	struct thread *self;
 	register_t saved;
 
 	self = curthread;
 	if (self->td_md.md_spinlock_count != 0) {
 		/* Already inside a spinlock section: just nest deeper. */
 		self->td_md.md_spinlock_count++;
 	} else {
 		/* Outermost entry: keep what intr_disable() handed back. */
 		saved = intr_disable();
 		self->td_md.md_spinlock_count = 1;
 		self->td_md.md_saved_flags = saved;
 	}
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section for the current thread.
  *
  * critical_exit() is called first.  The nesting count is then decremented
  * and, when it reaches zero, the flags recorded by the outermost
  * spinlock_enter() are handed to intr_restore().
  */
 void
 spinlock_exit(void)
 {
 	struct thread *self;
 	register_t saved;
 
 	self = curthread;
 	critical_exit();
 
 	/* Read the saved flags before the count is dropped. */
 	saved = self->td_md.md_saved_flags;
 	self->td_md.md_spinlock_count--;
 	if (self->td_md.md_spinlock_count != 0)
 		return;
 	intr_restore(saved);
 }
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 static void f00f_hack(void *unused);
 SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
 
 /*
  * Relocate the IDT on CPUs flagged with has_f00f_bug.
  *
  * A zeroed two-page region is allocated and the IDT is copied into it so
  * that entries 0-6 occupy the very end of the first page and entry 7
  * starts the second page.  The CPU is pointed at the copy with lidt(), the
  * global idt pointer is updated, and the first page is then restricted to
  * VM_PROT_READ.  Does nothing when has_f00f_bug is clear.  The argument is
  * unused.
  */
 static void
 f00f_hack(void *unused)
 {
 	struct gate_descriptor *new_idt;
 	vm_offset_t tmp;
 
 	if (!has_f00f_bug)
 		return;
 
 	GIANT_REQUIRED;
 
 	printf("Intel Pentium detected, installing workaround for F00F bug\n");
 
 	tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO);
 	if (tmp == 0)
 		panic("kmem_malloc returned 0");
 
 	/* Put the problematic entry (#6) at the end of the lower page. */
 	new_idt = (struct gate_descriptor*)
 	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
 	bcopy(idt, new_idt, sizeof(idt0));
 
 	/* Switch the CPU to the copy before publishing it through idt. */
 	r_idt.rd_base = (u_int)new_idt;
 	lidt(&r_idt);
 	idt = new_idt;
 
 	/* Only the first page (entries 0-6) is made read-only. */
 	pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
 }
 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
 
 /*
  * Build a minimal PCB out of a trapframe.  kdb_trap() calls this so that a
  * backtrace can begin at the function that was running when the debugger
  * was entered: the register context lives in the trapframe, but the trace
  * is driven from a PCB.  Only the fields needed for a backtrace are filled
  * in.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	pcb->pcb_eip = tf->tf_eip;
 	pcb->pcb_ebp = tf->tf_ebp;
 	pcb->pcb_ebx = tf->tf_ebx;
 	pcb->pcb_esi = tf->tf_esi;
 	pcb->pcb_edi = tf->tf_edi;
 
 	/*
 	 * When ISPL(tf->tf_cs) is non-zero use the %esp stored in the frame;
 	 * otherwise compute the stack pointer from the frame's own address.
 	 */
 	if (ISPL(tf->tf_cs))
 		pcb->pcb_esp = tf->tf_esp;
 	else
 		pcb->pcb_esp = (int)(tf + 1) - 8;
 }
 
 /*
  * Store addr as the saved instruction pointer (tf_eip) in the thread's
  * trapframe.  Always returns 0.
  */
 int
 ptrace_set_pc(struct thread *td, u_long addr)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	frame->tf_eip = addr;
 	return (0);
 }
 
 int
 ptrace_single_step(struct thread *td)
 {
 	td->td_frame->tf_eflags |= PSL_T;
 	return (0);
 }
 
 int
 ptrace_clear_single_step(struct thread *td)
 {
 	td->td_frame->tf_eflags &= ~PSL_T;
 	return (0);
 }
 
 /*
  * Fill *regs with the thread's general register state.  r_gs is taken
  * from the PCB; everything else is delegated to fill_frame_regs() on the
  * thread's trapframe, whose return value is passed through.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 
 	regs->r_gs = td->td_pcb->pcb_gs;
 	return (fill_frame_regs(td->td_frame, regs));
 }
 
 /*
  * Copy every register held in trapframe *tp into the matching field of
  * *regs.  r_gs is not touched here.  Always returns 0.
  */
 int
 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 {
 
 	/* Segment registers. */
 	regs->r_cs = tp->tf_cs;
 	regs->r_ss = tp->tf_ss;
 	regs->r_ds = tp->tf_ds;
 	regs->r_es = tp->tf_es;
 	regs->r_fs = tp->tf_fs;
 
 	/* General-purpose registers. */
 	regs->r_eax = tp->tf_eax;
 	regs->r_ebx = tp->tf_ebx;
 	regs->r_ecx = tp->tf_ecx;
 	regs->r_edx = tp->tf_edx;
 	regs->r_esi = tp->tf_esi;
 	regs->r_edi = tp->tf_edi;
 	regs->r_ebp = tp->tf_ebp;
 	regs->r_esp = tp->tf_esp;
 
 	/* Instruction pointer and flags. */
 	regs->r_eip = tp->tf_eip;
 	regs->r_eflags = tp->tf_eflags;
 
 	return (0);
 }
 
 /*
  * Install the register values in *regs into the thread.
  *
  * The request is rejected with EINVAL, before anything is modified, if
  * either EFL_SECURE() fails for the new versus current eflags or
  * CS_SECURE() fails for the new %cs.  Otherwise every trapframe register is
  * overwritten, pcb_gs is set from r_gs, and 0 is returned.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	if (!EFL_SECURE(regs->r_eflags, frame->tf_eflags))
 		return (EINVAL);
 	if (!CS_SECURE(regs->r_cs))
 		return (EINVAL);
 
 	/* Segment registers; %gs is kept in the PCB. */
 	frame->tf_cs = regs->r_cs;
 	frame->tf_ss = regs->r_ss;
 	frame->tf_ds = regs->r_ds;
 	frame->tf_es = regs->r_es;
 	frame->tf_fs = regs->r_fs;
 	td->td_pcb->pcb_gs = regs->r_gs;
 
 	/* General-purpose registers. */
 	frame->tf_eax = regs->r_eax;
 	frame->tf_ebx = regs->r_ebx;
 	frame->tf_ecx = regs->r_ecx;
 	frame->tf_edx = regs->r_edx;
 	frame->tf_esi = regs->r_esi;
 	frame->tf_edi = regs->r_edi;
 	frame->tf_ebp = regs->r_ebp;
 	frame->tf_esp = regs->r_esp;
 
 	/* Instruction pointer and flags. */
 	frame->tf_eip = regs->r_eip;
 	frame->tf_eflags = regs->r_eflags;
 
 	return (0);
 }
 
 #ifdef CPU_ENABLE_SSE
 /*
  * Translate the x87 portion of a struct savexmm into a struct save87.
  *
  * The destination is zeroed first; then the control/status environment
  * words and the eight FP accumulators are copied across.
  *
  * sv_xmm: source save area (read only here).
  * sv_87:  destination save area, fully overwritten.
  */
 static void
 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
 {
 	struct env87 *penv_87 = &sv_87->sv_env;
 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
 	int i;
 
 	bzero(sv_87, sizeof(*sv_87));
 
 	/* FPU control/status */
 	penv_87->en_cw = penv_xmm->en_cw;
 	penv_87->en_sw = penv_xmm->en_sw;
 	penv_87->en_tw = penv_xmm->en_tw;
 	penv_87->en_fip = penv_xmm->en_fip;
 	penv_87->en_fcs = penv_xmm->en_fcs;
 	penv_87->en_opcode = penv_xmm->en_opcode;
 	penv_87->en_foo = penv_xmm->en_foo;
 	penv_87->en_fos = penv_xmm->en_fos;
 
 	/* FPU registers */
 	for (i = 0; i < 8; ++i)
 		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
 }
 
 /*
  * Copy the x87 state held in a struct save87 into the corresponding
  * fields of a struct savexmm.
  *
  * Only the control/status environment words and the eight FP accumulators
  * are written; the rest of *sv_xmm is left as it was.
  *
  * sv_87:  source save area (read only here).
  * sv_xmm: destination save area, partially updated.
  */
 static void
 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
 {
 	struct env87 *penv_87 = &sv_87->sv_env;
 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
 	int i;
 
 	/* FPU control/status */
 	penv_xmm->en_cw = penv_87->en_cw;
 	penv_xmm->en_sw = penv_87->en_sw;
 	penv_xmm->en_tw = penv_87->en_tw;
 	penv_xmm->en_fip = penv_87->en_fip;
 	penv_xmm->en_fcs = penv_87->en_fcs;
 	penv_xmm->en_opcode = penv_87->en_opcode;
 	penv_xmm->en_foo = penv_87->en_foo;
 	penv_xmm->en_fos = penv_87->en_fos;
 
 	/* FPU registers */
 	for (i = 0; i < 8; ++i)
 		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
 }
 #endif /* CPU_ENABLE_SSE */
 
 /*
  * Copy the thread's user FPU state out to *fpregs.
  *
  * The thread must be the current thread, be suspended, or belong to a
  * process that should stop (asserted below).  With DEV_NPX, npxgetregs()
  * is called first; without it *fpregs is zeroed.  The state is then taken
  * from the PCB's pcb_user_save area: converted by fill_fpregs_xmm() when
  * cpu_fxsr is set (CPU_ENABLE_SSE kernels only), otherwise copied verbatim
  * from the sv_87 member.  Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 	    P_SHOULDSTOP(td->td_proc),
 	    ("not suspended thread %p", td));
 #ifdef DEV_NPX
 	npxgetregs(td);
 #else
 	bzero(fpregs, sizeof(*fpregs));
 #endif
 #ifdef CPU_ENABLE_SSE
 	if (cpu_fxsr)
 		fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm,
 		    (struct save87 *)fpregs);
 	else
 #endif /* CPU_ENABLE_SSE */
 		/*
 		 * With CPU_ENABLE_SSE this is the body of the "else" above;
 		 * without it this copy is unconditional.
 		 */
 		bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs,
 		    sizeof(*fpregs));
 	return (0);
 }
 
 /*
  * Install the FPU state in *fpregs as the thread's user FPU state.
  *
  * The state is written to the PCB's pcb_user_save area: through
  * set_fpregs_xmm() when cpu_fxsr is set (CPU_ENABLE_SSE kernels only),
  * otherwise copied verbatim into the sv_87 member.  With DEV_NPX,
  * npxuserinited() is called afterwards.  Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 #ifdef CPU_ENABLE_SSE
 	if (cpu_fxsr)
 		set_fpregs_xmm((struct save87 *)fpregs,
 		    &td->td_pcb->pcb_user_save.sv_xmm);
 	else
 #endif /* CPU_ENABLE_SSE */
 		/*
 		 * With CPU_ENABLE_SSE this is the body of the "else" above;
 		 * without it this copy is unconditional.
 		 */
 		bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87,
 		    sizeof(*fpregs));
 #ifdef DEV_NPX
 	npxuserinited(td);
 #endif
 	return (0);
 }
 
 /*
  * Capture the thread's machine context into *mcp.
  *
  * Registers come from the thread's trapframe, except %gs which is read
  * from the PCB.  When GET_MC_CLEAR_RET is set in flags, mc_eax and mc_edx
  * are stored as 0 and PSL_C is cleared in mc_eflags.  FPU state is added
  * by get_fpcontext(), and the %fs/%gs bases are rebuilt from the PCB's
  * segment descriptors.  Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct segment_descriptor *desc;
 	struct trapframe *frame;
 	struct pcb *pcb;
 
 	frame = td->td_frame;
 	pcb = td->td_pcb;
 
 	/* sigonstack() is called with curthread's process lock held. */
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(frame->tf_esp);
 	PROC_UNLOCK(curthread->td_proc);
 
 	/* Segment registers. */
 	mcp->mc_gs = pcb->pcb_gs;
 	mcp->mc_fs = frame->tf_fs;
 	mcp->mc_es = frame->tf_es;
 	mcp->mc_ds = frame->tf_ds;
 	mcp->mc_cs = frame->tf_cs;
 	mcp->mc_ss = frame->tf_ss;
 
 	/* Index, base and stack registers. */
 	mcp->mc_edi = frame->tf_edi;
 	mcp->mc_esi = frame->tf_esi;
 	mcp->mc_ebp = frame->tf_ebp;
 	mcp->mc_isp = frame->tf_isp;
 	mcp->mc_ebx = frame->tf_ebx;
 	mcp->mc_ecx = frame->tf_ecx;
 	mcp->mc_eip = frame->tf_eip;
 	mcp->mc_esp = frame->tf_esp;
 
 	/* Return-value registers and flags depend on GET_MC_CLEAR_RET. */
 	mcp->mc_eflags = frame->tf_eflags;
 	if ((flags & GET_MC_CLEAR_RET) == 0) {
 		mcp->mc_eax = frame->tf_eax;
 		mcp->mc_edx = frame->tf_edx;
 	} else {
 		mcp->mc_eax = 0;
 		mcp->mc_edx = 0;
 		mcp->mc_eflags &= ~PSL_C;
 	}
 
 	mcp->mc_len = sizeof(*mcp);
 	get_fpcontext(td, mcp);
 
 	/* Reassemble the split descriptor base fields. */
 	desc = &pcb->pcb_fsd;
 	mcp->mc_fsbase = desc->sd_hibase << 24 | desc->sd_lobase;
 	desc = &pcb->pcb_gsd;
 	mcp->mc_gsbase = desc->sd_hibase << 24 | desc->sd_lobase;
 
 	mcp->mc_flags = 0;
 	bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  */
 int
 set_mcontext(struct thread *td, const mcontext_t *mcp)
 {
 	struct trapframe *tp;
 	int eflags, ret;
 
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 	eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
 	    (tp->tf_eflags & ~PSL_USERCHANGE);
 	if ((ret = set_fpcontext(td, mcp)) == 0) {
 		tp->tf_fs = mcp->mc_fs;
 		tp->tf_es = mcp->mc_es;
 		tp->tf_ds = mcp->mc_ds;
 		tp->tf_edi = mcp->mc_edi;
 		tp->tf_esi = mcp->mc_esi;
 		tp->tf_ebp = mcp->mc_ebp;
 		tp->tf_ebx = mcp->mc_ebx;
 		tp->tf_edx = mcp->mc_edx;
 		tp->tf_ecx = mcp->mc_ecx;
 		tp->tf_eax = mcp->mc_eax;
 		tp->tf_eip = mcp->mc_eip;
 		tp->tf_eflags = eflags;
 		tp->tf_esp = mcp->mc_esp;
 		tp->tf_ss = mcp->mc_ss;
 		td->td_pcb->pcb_gs = mcp->mc_gs;
 		ret = 0;
 	}
 	return (ret);
 }
 
 /*
  * Fill in the FPU part of *mcp (mc_ownedfp, mc_fpstate, mc_fpformat).
  *
  * With DEV_NPX the ownership value comes from npxgetregs(), the state is
  * copied from the PCB's pcb_user_save area and the format from
  * npxformat().  Without DEV_NPX the state is zeroed and the context is
  * marked _MC_FPOWNED_NONE / _MC_FPFMT_NODEV.
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp)
 {
 
 #ifdef DEV_NPX
 	mcp->mc_ownedfp = npxgetregs(td);
 	bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate[0],
 	    sizeof(mcp->mc_fpstate));
 	mcp->mc_fpformat = npxformat();
 #else
 	bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
 	mcp->mc_ownedfp = _MC_FPOWNED_NONE;
 	mcp->mc_fpformat = _MC_FPFMT_NODEV;
 #endif
 }
 
 /*
  * Apply the FPU part of *mcp to the thread.
  *
  * Returns 0 without doing anything for _MC_FPFMT_NODEV.  Returns EINVAL
  * for any format other than _MC_FPFMT_387 / _MC_FPFMT_XMM, or for an
  * mc_ownedfp value other than NONE, FPU or PCB.  _MC_FPOWNED_NONE drops
  * the thread's FPU state; FPU/PCB load mc_fpstate via npxsetregs() (only
  * when DEV_NPX is configured).
  */
 static int
 set_fpcontext(struct thread *td, const mcontext_t *mcp)
 {
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	if (mcp->mc_fpformat != _MC_FPFMT_387 &&
 	    mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 
 	if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		return (0);
 	}
 	if (mcp->mc_ownedfp != _MC_FPOWNED_FPU &&
 	    mcp->mc_ownedfp != _MC_FPOWNED_PCB)
 		return (EINVAL);
 
 #ifdef DEV_NPX
 #ifdef CPU_ENABLE_SSE
 	/*
 	 * Mask en_mxcsr with cpu_mxcsr_mask before loading.  Note that this
 	 * writes into *mcp through a cast that drops its const qualifier.
 	 */
 	if (cpu_fxsr)
 		((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env.
 		    en_mxcsr &= cpu_mxcsr_mask;
 #endif
 	npxsetregs(td, (union savefpu *)&mcp->mc_fpstate);
 #endif
 	return (0);
 }
 
 /*
  * Discard the thread's user FPU state.
  *
  * Asserts PCB_USER_FPU(td->td_pcb).  Inside a critical section, npxdrop()
  * is called if td is the per-CPU fpcurthread (DEV_NPX kernels only), and
  * the PCB_NPXINITDONE and PCB_NPXUSERINITDONE flags are then cleared.
  *
  * NOTE(review): the flags are cleared on curthread's PCB rather than on
  * td's; presumably every caller passes td == curthread -- confirm.
  */
 static void
 fpstate_drop(struct thread *td)
 {
 
 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 	critical_enter();
 #ifdef DEV_NPX
 	if (PCPU_GET(fpcurthread) == td)
 		npxdrop();
 #endif
 	/*
 	 * XXX force a full drop of the npx.  The above only drops it if we
 	 * owned it.  npxgetregs() has the same bug in the !cpu_fxsr case.
 	 *
 	 * XXX I don't much like npxgetregs()'s semantics of doing a full
 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
 	 * sendsig() is the only caller of npxgetregs()... perhaps we just
 	 * have too many layers.
 	 */
 	curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
 	    PCB_NPXUSERINITDONE);
 	critical_exit();
 }
 
 /*
  * Report debug register contents in *dbregs.
  *
  * With td == NULL the eight hardware debug registers are read directly.
  * Otherwise dr0-dr3, dr6 and dr7 are taken from the thread's PCB and
  * dr4/dr5 are reported as 0.  Always returns 0.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 
 	if (td != NULL) {
 		/* Saved per-thread values. */
 		pcb = td->td_pcb;
 		dbregs->dr[0] = pcb->pcb_dr0;
 		dbregs->dr[1] = pcb->pcb_dr1;
 		dbregs->dr[2] = pcb->pcb_dr2;
 		dbregs->dr[3] = pcb->pcb_dr3;
 		dbregs->dr[4] = 0;
 		dbregs->dr[5] = 0;
 		dbregs->dr[6] = pcb->pcb_dr6;
 		dbregs->dr[7] = pcb->pcb_dr7;
 		return (0);
 	}
 
 	/* No thread given: read the live registers. */
 	dbregs->dr[0] = rdr0();
 	dbregs->dr[1] = rdr1();
 	dbregs->dr[2] = rdr2();
 	dbregs->dr[3] = rdr3();
 	dbregs->dr[4] = rdr4();
 	dbregs->dr[5] = rdr5();
 	dbregs->dr[6] = rdr6();
 	dbregs->dr[7] = rdr7();
 	return (0);
 }
 
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr4(dbregs->dr[4]);
 		load_dr5(dbregs->dr[5]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 	} else {
 		/*
 		 * Don't let an illegal value for dr7 get set.	Specifically,
 		 * check for undefined settings.  Setting these bit patterns
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP.
 		 */
 		for (i = 0; i < 4; i++) {
 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 			if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 		}
 		
 		pcb = td->td_pcb;
 		
 		/*
 		 * Don't let a process set a breakpoint that is not within the
 		 * process's address space.  If a process could do this, it
 		 * could halt the system by setting a breakpoint in the kernel
 		 * (if ddb was enabled).  Thus, we need to check to make sure
 		 * that no breakpoints are being enabled for addresses outside
 		 * process's address space.
 		 *
 		 * XXX - what about when the watched area of the user's
 		 * address space is written into from within the kernel
 		 * ... wouldn't that still cause a breakpoint to be generated
 		 * from within kernel mode?
 		 */
 
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 
 		pcb->pcb_dr0 = dbregs->dr[0];
 		pcb->pcb_dr1 = dbregs->dr[1];
 		pcb->pcb_dr2 = dbregs->dr[2];
 		pcb->pcb_dr3 = dbregs->dr[3];
 		pcb->pcb_dr6 = dbregs->dr[6];
 		pcb->pcb_dr7 = dbregs->dr[7];
 
 		pcb->pcb_flags |= PCB_DBREGS;
 	}
 
 	return (0);
 }
 
 /*
  * Return > 0 if a hardware breakpoint has been hit, and the
  * breakpoint was in user space.  Return 0, otherwise.
  */
 int
 user_dbreg_trap(void)
 {
         u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
         u_int32_t bp;       /* breakpoint bits extracted from dr6 */
         int nbp;            /* number of breakpoints that triggered */
         caddr_t addr[4];    /* breakpoint addresses */
         int i;
         
         dr7 = rdr7();
         if ((dr7 & 0x000000ff) == 0) {
                 /*
                  * all GE and LE bits in the dr7 register are zero,
                  * thus the trap couldn't have been caused by the
                  * hardware debug registers
                  */
                 return 0;
         }
 
         nbp = 0;
         dr6 = rdr6();
         bp = dr6 & 0x0000000f;
 
         if (!bp) {
                 /*
                  * None of the breakpoint bits are set meaning this
                  * trap was not caused by any of the debug registers
                  */
                 return 0;
         }
 
         /*
          * at least one of the breakpoints were hit, check to see
          * which ones and if any of them are user space addresses
          */
 
         if (bp & 0x01) {
                 addr[nbp++] = (caddr_t)rdr0();
         }
         if (bp & 0x02) {
                 addr[nbp++] = (caddr_t)rdr1();
         }
         if (bp & 0x04) {
                 addr[nbp++] = (caddr_t)rdr2();
         }
         if (bp & 0x08) {
                 addr[nbp++] = (caddr_t)rdr3();
         }
 
         for (i = 0; i < nbp; i++) {
                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
                         /*
                          * addr[i] is in user space
                          */
                         return nbp;
                 }
         }
 
         /*
          * None of the breakpoints are in user space.
          */
         return 0;
 }
 
 #ifdef KDB
 
 /*
  * Provide inb() and outb() as functions.  They are normally only available as
  * inline functions, thus cannot be called from the debugger.
  */
 
 /* silence compiler warnings */
 u_char inb_(u_short);
 void outb_(u_short, u_char);
 
 /* Out-of-line wrapper: return whatever inb() reads from the given port. */
 u_char
 inb_(u_short port)
 {
 	u_char value;
 
 	value = inb(port);
 	return (value);
 }
 
 /* Out-of-line wrapper: pass port and data straight through to outb(). */
 void
 outb_(u_short port, u_char data)
 {
 	outb(port, data);
 }
 
 #endif /* KDB */
Index: head/sys/powerpc/aim/locore32.S
===================================================================
--- head/sys/powerpc/aim/locore32.S	(revision 258540)
+++ head/sys/powerpc/aim/locore32.S	(revision 258541)
@@ -1,213 +1,211 @@
 /* $FreeBSD$ */
 /* $NetBSD: locore.S,v 1.24 2000/05/31 05:09:17 thorpej Exp $ */
 
 /*-
  * Copyright (C) 2001 Benno Rice
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Copyright (C) 1995, 1996 Wolfgang Solfrank.
  * Copyright (C) 1995, 1996 TooLs GmbH.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "assym.s"
 
 #include <sys/syscall.h>
 
 #include <machine/trap.h>
 #include <machine/param.h>
 #include <machine/spr.h>
 #include <machine/asm.h>
 #include "opt_platform.h"
 
-#include "opt_kdtrace.h"
-
 /* Locate the per-CPU data structure */
 #define GET_CPUINFO(r)  \
         mfsprg0  r
 
 /*
  * Compiled KERNBASE location and the kernel load address
  */
         .globl  kernbase
         .set    kernbase, KERNBASE
 
 #define	TMPSTKSZ	8192		/* 8K temporary stack */
 
 /*
  * Globals
  *
  * tmpstk is the TMPSTKSZ-byte area that __start points the stack pointer
  * into.  intrnames and intrcnt are reserved blocks; sintrnames and sintrcnt
  * each hold the byte size of the block that precedes them.
  */
 	.data
 	.align	4
 GLOBAL(tmpstk)
 	.space	TMPSTKSZ
 GLOBAL(esym)
 	.long	0			/* end of symbol table */
 
 #define	INTRCNT_COUNT	256		/* max(HROWPIC_IRQMAX,OPENPIC_IRQMAX) */
 GLOBAL(intrnames)
 	.space	INTRCNT_COUNT * (MAXCOMLEN + 1) * 2
 GLOBAL(sintrnames)
 	.long	INTRCNT_COUNT * (MAXCOMLEN + 1) * 2
 
 	.align 4
 GLOBAL(intrcnt)
 	.space	INTRCNT_COUNT * 4 * 2
 GLOBAL(sintrcnt)
 	.long	INTRCNT_COUNT * 4 * 2
 
 	.text
 	.globl	btext
 btext:
 
 /*
  * This symbol is here for the benefit of kvm_mkdb, and is supposed to
  * mark the start of kernel text.
  */
 	.globl	kernel_text
 kernel_text:
 
 /*
  * Startup entry.  Note, this must be the first thing in the text
  * segment!
  *
  * On entry r5 is saved as the Open Firmware client interface handler and
  * r6/r7 are carried through to powerpc_init() as the argument pointer and
  * length.  The value powerpc_init() returns in r3 becomes the stack
  * pointer for mi_startup().
  */
 	.text
 	.globl	__start
 __start:
 	/*
 	 * Flush the data cache block and invalidate the instruction cache
 	 * block for 0x100 consecutive 0x20-byte steps starting at address 0.
 	 */
 	li	8,0
 	li	9,0x100
 	mtctr	9
 1:
 	dcbf	0,8
 	icbi	0,8
 	addi	8,8,0x20
 	bdnz	1b
 	sync
 	isync
 #ifdef WII
 #include <powerpc/wii/locore.S>
 #endif
 	/* Zero bss, in case we were started by something unhelpful */
 	li	0,0
 	lis	8,_edata@ha
 	addi	8,8,_edata@l
 	lis	9,_end@ha
 	addi	9,9,_end@l
 2:	stw     0,0(8)			/* one word per pass, _edata up to _end */
 	addi	8,8,4
 	cmplw	8,9
 	blt	2b
 	
 	/* Save the argument pointer and length */
 	mr	20,6
 	mr	21,7
 
 	lis	8,openfirmware_entry@ha
 	stw	5,openfirmware_entry@l(8) /* save client interface handler */
 
 	/* Point the stack at the top of tmpstk, 16 bytes below its end. */
 	lis	1,(tmpstk+TMPSTKSZ-16)@ha
 	addi	1,1,(tmpstk+TMPSTKSZ-16)@l
 
 	/* ofmsr[0] = MSR; stwu also leaves r9 = &ofmsr for the stores below */
 	mfmsr	0
 	lis	9,ofmsr@ha
 	stwu	0,ofmsr@l(9)
 
 	mfsprg0	0			/* save SPRG0-3 */
 	stw	0,4(9)			/* ofmsr[1] = sprg0 */
 	mfsprg1 0
 	stw	0,8(9)			/* ofmsr[2] = sprg1 */
 	mfsprg2 0
 	stw	0,12(9)			/* ofmsr[3] = sprg2 */
 	mfsprg3 0
 	stw	0,16(9)			/* ofmsr[4] = sprg3 */
 	
 	bl	OF_initial_setup
 
 	/* powerpc_init() arguments: r3 = kernel_text, r4 = r5 = end + r3 */
 	lis	3,kernel_text@ha
 	addi	3,3,kernel_text@l
 
 	lis	4,end@ha
 	addi	4,4,end@l
 	add	4,4,3
 	mr	5,4
 
 	/* Restore the argument pointer and length */
 	mr	6,20
 	mr	7,21
 
 	bl	powerpc_init
 	mr	%r1, %r3		/* returned value becomes the stack pointer */
 	li	%r3, 0
 	stw	%r3, 0(%r1)		/* store 0 at the new stack pointer */
 	bl	mi_startup
 	b	OF_exit
 
 /*
  * int setfault()
  *
  * Similar to setjmp to setup for handling faults on accesses to user memory.
  * Any routine using this may only call bcopy, either the form below,
  * or the (currently used) C code optimized, so it doesn't use any non-volatile
  * registers.
  *
  * r3 points at the save buffer.  It is recorded in the current thread's
  * pcb at PCB_ONFAULT and filled as: offset 0 = LR, 4 = r1, 8 = r2,
  * 12 onward = r12 (holding CR) through r31.  Returns 0 in r3.
  */
 	.globl	setfault
 setfault:
 	mflr	0
 	mfcr	12
 	mfsprg	4,0		/* r4 is overwritten by the load below */
 	lwz	4,TD_PCB(2)	/* curthread = r2 */
 	stw	3,PCB_ONFAULT(4)
 	stw	0,0(3)
 	stw	1,4(3)
 	stw	2,8(3)
 	stmw	12,12(3)	/* stores r12 through r31 */
 	xor	3,3,3		/* return 0 */
 	blr
 
 #include <powerpc/aim/trap_subr32.S>
Index: head/sys/powerpc/aim/locore64.S
===================================================================
--- head/sys/powerpc/aim/locore64.S	(revision 258540)
+++ head/sys/powerpc/aim/locore64.S	(revision 258541)
@@ -1,242 +1,240 @@
 /* $FreeBSD$ */
 /* $NetBSD: locore.S,v 1.24 2000/05/31 05:09:17 thorpej Exp $ */
 
 /*-
  * Copyright (C) 2001 Benno Rice
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Copyright (C) 1995, 1996 Wolfgang Solfrank.
  * Copyright (C) 1995, 1996 TooLs GmbH.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "assym.s"
 
 #include <sys/syscall.h>
 
 #include <machine/trap.h>
 #include <machine/param.h>
 #include <machine/spr.h>
 #include <machine/asm.h>
 
-#include "opt_kdtrace.h"
-
 /* Locate the per-CPU data structure */
 #define GET_CPUINFO(r)  \
         mfsprg0  r
 
 /*
  * Compiled KERNBASE location and the kernel load address
  */
         .globl  kernbase
         .set    kernbase, KERNBASE
 
 #define	TMPSTKSZ	16384		/* 16K temporary stack */
 
 /*
  * Globals
  *
  * tmpstk is the TMPSTKSZ-byte area that __start points the stack pointer
  * into.  intrnames and intrcnt are reserved blocks; sintrnames and sintrcnt
  * each hold, as a 64-bit value, the byte size of the block that precedes
  * them.
  */
 	.data
 	.align	4
 GLOBAL(tmpstk)
 	.space	TMPSTKSZ
 GLOBAL(esym)
 	.llong	0			/* end of symbol table */
 
 #define	INTRCNT_COUNT	256		/* max(HROWPIC_IRQMAX,OPENPIC_IRQMAX) */
 GLOBAL(intrnames)
 	.space	INTRCNT_COUNT * (MAXCOMLEN + 1) * 2
 GLOBAL(sintrnames)
 	.quad	INTRCNT_COUNT * (MAXCOMLEN + 1) * 2
 
 	.align 4
 GLOBAL(intrcnt)
 	.space	INTRCNT_COUNT * 4 * 2
 GLOBAL(sintrcnt)
 	.quad	INTRCNT_COUNT * 4 * 2
 
 	.text
 	.globl	btext
 btext:
 
 /*
  * This symbol is here for the benefit of kvm_mkdb, and is supposed to
  * mark the start of kernel text.
  */
 	.globl	kernel_text
 kernel_text:
 
 /*
  * Startup entry.  Note, this must be the first thing in the text
  * segment!
  *
  * On entry r5 is saved as the Open Firmware client interface handler and
  * r6/r7 are carried through to powerpc_init() as the argument pointer and
  * length.  The value powerpc_init() returns in r3 becomes the stack
  * pointer for mi_startup().
  */
 	.text
 ASENTRY_NOPROF(__start)
 	/*
 	 * Flush the data cache block and invalidate the instruction cache
 	 * block for 0x100 consecutive 0x20-byte steps starting at address 0.
 	 */
 	li	8,0
 	li	9,0x100
 	mtctr	9
 1:
 	dcbf	0,8
 	icbi	0,8
 	addi	8,8,0x20
 	bdnz	1b
 	sync
 	isync
 
 	/* Save the argument pointer and length */
 	mr	20,6
 	mr	21,7
 
 	lis	8,openfirmware_entry@ha
 	std	5,openfirmware_entry@l(8) /* save client interface handler */
 
 	/* Set up the stack pointer */
 	lis	1,(tmpstk+TMPSTKSZ-48)@ha
 	addi	1,1,(tmpstk+TMPSTKSZ-48)@l
 
 	/* Set up the TOC pointer */
 	lis	2,tocbase@ha
 	ld	2,tocbase@l(2)
 
 	/* ofmsr[0] = MSR; stdu also leaves r9 = &ofmsr for the stores below */
 	mfmsr	0
 	lis	9,ofmsr@ha
 	stdu	0,ofmsr@l(9)
 
 	mfsprg0	0			/* save SPRG0-3 */
 	std	0,8(9)			/* ofmsr[1] = sprg0 */
 	mfsprg1 0
 	std	0,16(9)			/* ofmsr[2] = sprg1 */
 	mfsprg2 0
 	std	0,24(9)			/* ofmsr[3] = sprg2 */
 	mfsprg3 0
 	std	0,32(9)			/* ofmsr[4] = sprg3 */
 
 	/* Switch to 64-bit mode */
 	mfmsr	9
 	li	8,1
 	insrdi	9,8,1,0			/* set bit 0 of the MSR image */
 	mtmsrd	9
 	
 	bl	OF_initial_setup
 	nop
 
 	/* powerpc_init() arguments: r3 = kernbase, r4 = r5 = end + r3 */
 	lis	3,kernbase@ha
 	addi	3,3,kernbase@l
 
 	lis	4,end@ha
 	addi	4,4,end@l
 	add	4,4,3
 	mr	5,4
 
 	/* Restore the argument pointer and length */
 	mr	6,20
 	mr	7,21
 
 	bl	powerpc_init
 	nop
 	mr	%r1, %r3		/* returned value becomes the stack pointer */
 	li	%r3, 0
 	std	%r3, 0(%r1)		/* store 0 at the new stack pointer */
 	bl	mi_startup
 	nop
 	b	OF_exit
 	nop
 
 /*
  * PPC64 ABI TOC base
  *
  * One 64-bit word holding .TOC.@tocbase; __start loads it into r2.
  */
 
         .align  3
 	.globl	tocbase
 tocbase:
         .llong  .TOC.@tocbase
 
 /*
  * int setfault()
  *
  * Similar to setjmp to setup for handling faults on accesses to user memory.
  * Any routine using this may only call bcopy, either the form below,
  * or the (currently used) C code optimized, so it doesn't use any non-volatile
  * registers.
  *
  * r3 points at the save buffer.  It is recorded in the current thread's
  * pcb at PCB_ONFAULT and filled as: offset 0 = LR, 8 = r1, 16 = r2,
  * 24 onward = r12 (holding CR) through r31, eight bytes each.  Returns 0
  * in r3.
  */
 ASENTRY_NOPROF(setfault)
 	mflr	0
 	mfcr	12
 	mfsprg	4,0		/* r4 is overwritten by the load below */
 	ld	4,TD_PCB(13)	/* curthread = r13 */
 	std	3,PCB_ONFAULT(4)
 	std	0,0(3)
 	std	1,8(3)
 	std	2,16(3)
 
 	std	%r12,24(%r3)	/* Save the non-volatile GP regs. */
 	std	%r13,24+1*8(%r3)	
 	std	%r14,24+2*8(%r3)	
 	std	%r15,24+3*8(%r3)	
 	std	%r16,24+4*8(%r3)	
 	std	%r17,24+5*8(%r3)	
 	std	%r18,24+6*8(%r3)	
 	std	%r19,24+7*8(%r3)	
 	std	%r20,24+8*8(%r3)	
 	std	%r21,24+9*8(%r3)	
 	std	%r22,24+10*8(%r3)	
 	std	%r23,24+11*8(%r3)	
 	std	%r24,24+12*8(%r3)	
 	std	%r25,24+13*8(%r3)	
 	std	%r26,24+14*8(%r3)	
 	std	%r27,24+15*8(%r3)	
 	std	%r28,24+16*8(%r3)
 	std	%r29,24+17*8(%r3)	
 	std	%r30,24+18*8(%r3)	
 	std	%r31,24+19*8(%r3)	
 
 	xor	3,3,3		/* return 0 */
 	blr
 
 #include <powerpc/aim/trap_subr64.S>
Index: head/sys/powerpc/aim/trap.c
===================================================================
--- head/sys/powerpc/aim/trap.c	(revision 258540)
+++ head/sys/powerpc/aim/trap.c	(revision 258541)
@@ -1,797 +1,795 @@
 /*-
  * Copyright (C) 1995, 1996 Wolfgang Solfrank.
  * Copyright (C) 1995, 1996 TooLs GmbH.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $NetBSD: trap.c,v 1.58 2002/03/04 04:07:35 dbj Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
-
 #include <sys/param.h>
 #include <sys/kdb.h>
 #include <sys/proc.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/pioctl.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 #include <sys/signalvar.h>
 #include <sys/vmmeter.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
 #include <machine/_inttypes.h>
 #include <machine/altivec.h>
 #include <machine/cpu.h>
 #include <machine/db_machdep.h>
 #include <machine/fpu.h>
 #include <machine/frame.h>
 #include <machine/pcb.h>
 #include <machine/pmap.h>
 #include <machine/psl.h>
 #include <machine/trap.h>
 #include <machine/spr.h>
 #include <machine/sr.h>
 
 static void	trap_fatal(struct trapframe *frame);
 static void	printtrap(u_int vector, struct trapframe *frame, int isfatal,
 		    int user);
 static int	trap_pfault(struct trapframe *frame, int user);
 static int	fix_unaligned(struct thread *td, struct trapframe *frame);
 static int	handle_onfault(struct trapframe *frame);
 static void	syscall(struct trapframe *frame);
 
 #ifdef __powerpc64__
        void	handle_kernel_slb_spill(int, register_t, register_t);
 static int	handle_user_slb_spill(pmap_t pm, vm_offset_t addr);
 extern int	n_slbs;
 #endif
 
 int	setfault(faultbuf);		/* defined in locore.S */
 
 /* Why are these not defined in a header? */
 int	badaddr(void *, size_t);
 int	badaddr_read(void *, size_t, int *);
 
 /* One row of the exception-name table: a vector number and its label. */
 struct powerpc_exception {
 	u_int	vector;		/* exception vector value, e.g. 0x0300 */
 	char	*name;		/* printable name; NULL in the terminator row */
 };
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 /*
  * This is a hook which is initialised by the dtrace module
  * to handle traps which might occur during DTrace probe
  * execution.
  */
 dtrace_trap_func_t	dtrace_trap_func;
 
 dtrace_doubletrap_func_t	dtrace_doubletrap_func;
 
 /*
  * This is a hook which is initialised by the systrace module
  * when it is loaded. This keeps the DTrace syscall provider
  * implementation opaque. 
  */
 systrace_probe_func_t	systrace_probe_func;
 
 /*
  * These hooks are necessary for the pid and usdt providers.
  */
 dtrace_pid_probe_ptr_t		dtrace_pid_probe_ptr;
 dtrace_return_probe_ptr_t	dtrace_return_probe_ptr;
 int (*dtrace_invop_jump_addr)(struct trapframe *);
 #endif
 
 /*
  * Exception vector -> name table.  The last row (vector 0x3000, NULL name)
  * is a terminator rather than a real entry; table walkers stop when they
  * reach that vector value.
  */
 static struct powerpc_exception powerpc_exceptions[] = {
 	{ 0x0100, "system reset" },
 	{ 0x0200, "machine check" },
 	{ 0x0300, "data storage interrupt" },
 	{ 0x0380, "data segment exception" },
 	{ 0x0400, "instruction storage interrupt" },
 	{ 0x0480, "instruction segment exception" },
 	{ 0x0500, "external interrupt" },
 	{ 0x0600, "alignment" },
 	{ 0x0700, "program" },
 	{ 0x0800, "floating-point unavailable" },
 	{ 0x0900, "decrementer" },
 	{ 0x0c00, "system call" },
 	{ 0x0d00, "trace" },
 	{ 0x0e00, "floating-point assist" },
 	{ 0x0f00, "performance monitoring" },
 	{ 0x0f20, "altivec unavailable" },
 	{ 0x1000, "instruction tlb miss" },
 	{ 0x1100, "data load tlb miss" },
 	{ 0x1200, "data store tlb miss" },
 	{ 0x1300, "instruction breakpoint" },
 	{ 0x1400, "system management" },
 	{ 0x1600, "altivec assist" },
 	{ 0x1700, "thermal management" },
 	{ 0x2000, "run mode/trace" },
 	{ 0x3000, NULL }	/* terminator */
 };
 
 /*
  * Translate an exception vector into its printable name.  The table is
  * scanned up to (not including) the 0x3000 terminator row; a vector with
  * no matching row yields "unknown".
  */
 static const char *
 trapname(u_int vector)
 {
 	struct powerpc_exception *entry = powerpc_exceptions;
 
 	while (entry->vector != 0x3000) {
 		if (entry->vector == vector)
 			return (entry->name);
 		entry++;
 	}
 
 	return ("unknown");
 }
 
 /*
  * C-level trap dispatcher.
  *
  * `frame` is the saved register state of the interrupted context.  The
  * exception number comes from frame->exc and the user/kernel distinction
  * from the PSL_PR bit of frame->srr1.
  *
  * User-mode traps either get handled in place (syscall, FPU/Altivec enable,
  * SLB spill, page fault, alignment fixup, DTrace pid probe) or produce a
  * signal number in `sig`, which is delivered through trapsignal() before
  * userret().  An unrecognised user trap goes to trap_fatal().
  *
  * Kernel-mode traps return directly when they were handled; anything that
  * falls out of the switch ends in trap_fatal().
  */
 void
 trap(struct trapframe *frame)
 {
 	struct thread	*td;
 	struct proc	*p;
 #ifdef KDTRACE_HOOKS
 	uint32_t inst;
 #endif
 	int		sig, type, user;
 	u_int		ucode;
 	ksiginfo_t	ksi;
 
 	PCPU_INC(cnt.v_trap);
 
 	td = curthread;
 	p = td->td_proc;
 
 	type = ucode = frame->exc;
 	sig = 0;
 	user = frame->srr1 & PSL_PR;
 
 	CTR3(KTR_TRAP, "trap: %s type=%s (%s)", td->td_name,
 	    trapname(type), user ? "user" : "kernel");
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * A trap can occur while DTrace executes a probe. Before
 	 * executing the probe, DTrace blocks re-scheduling and sets
 	 * a flag in its per-cpu flags to indicate that it doesn't
 	 * want to fault. On returning from the probe, the no-fault
 	 * flag is cleared and finally re-scheduling is enabled.
 	 *
 	 * If the DTrace kernel module has registered a trap handler,
 	 * call it and if it returns non-zero, assume that it has
 	 * handled the trap and modified the trap frame so that this
 	 * function can return normally.
 	 */
 	if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
 		return;
 #endif
 
 	if (user) {
 		td->td_pticks = 0;
 		td->td_frame = frame;
 		if (td->td_ucred != p->p_ucred)
 			cred_update_thread(td);
 
 		/* User Mode Traps */
 		switch (type) {
 		case EXC_RUNMODETRC:
 		case EXC_TRC:
 			/* Single-step completed: clear the step bit. */
 			frame->srr1 &= ~PSL_SE;
 			sig = SIGTRAP;
 			break;
 
 #ifdef __powerpc64__
 		case EXC_ISE:
 		case EXC_DSE:
 			/* Faulting address is srr0 for ISE, DAR for DSE. */
 			if (handle_user_slb_spill(&p->p_vmspace->vm_pmap,
 			    (type == EXC_ISE) ? frame->srr0 :
 			    frame->cpu.aim.dar) != 0)
 				sig = SIGSEGV;
 			break;
 #endif
 		case EXC_DSI:
 		case EXC_ISI:
 			sig = trap_pfault(frame, 1);
 			break;
 
 		case EXC_SC:
 			syscall(frame);
 			break;
 
 		case EXC_FPU:
 			KASSERT((td->td_pcb->pcb_flags & PCB_FPU) != PCB_FPU,
 			    ("FPU already enabled for thread"));
 			enable_fpu(td);
 			break;
 
 		case EXC_VEC:
 			KASSERT((td->td_pcb->pcb_flags & PCB_VEC) != PCB_VEC,
 			    ("Altivec already enabled for thread"));
 			enable_vec(td);
 			break;
 
 		case EXC_VECAST_G4:
 		case EXC_VECAST_G5:
 			/*
 			 * We get a VPU assist exception for IEEE mode
 			 * vector operations on denormalized floats.
 			 * Emulating this is a giant pain, so for now,
 			 * just switch off IEEE mode and treat them as
 			 * zero.
 			 */
 
 			save_vec(td);
 			td->td_pcb->pcb_vec.vscr |= ALTIVEC_VSCR_NJ;
 			enable_vec(td);
 			break;
 
 		case EXC_ALI:
 			/* On a successful fixup, step past the instruction. */
 			if (fix_unaligned(td, frame) != 0)
 				sig = SIGBUS;
 			else
 				frame->srr0 += 4;
 			break;
 
 		case EXC_PGM:
 			/* Identify the trap reason */
 			if (frame->srr1 & EXC_PGM_TRAP) {
 #ifdef KDTRACE_HOOKS
 				inst = fuword32((const void *)frame->srr0);
 				if (inst == 0x0FFFDDDD && dtrace_pid_probe_ptr != NULL) {
 					struct reg regs;
 					fill_regs(td, &regs);
 					(*dtrace_pid_probe_ptr)(&regs);
 					break;
 				}
 #endif
 				sig = SIGTRAP;
 			} else {
 				sig = ppc_instr_emulate(frame, td->td_pcb);
 			}
 			break;
 
 		default:
 			trap_fatal(frame);
 		}
 	} else {
 		/* Kernel Mode Traps */
 
 		KASSERT(cold || td->td_ucred != NULL,
 		    ("kernel trap doesn't have ucred"));
 		switch (type) {
 #ifdef KDTRACE_HOOKS
 		case EXC_PGM:
 			if (frame->srr1 & EXC_PGM_TRAP) {
 				/*
 				 * Compare exactly one 32-bit instruction
 				 * word.  Reading a uintptr_t here fetches
 				 * 64 bits on powerpc64, which can never
 				 * equal the 32-bit trap encoding.
 				 */
 				if (*(uint32_t *)frame->srr0 == 0x7c810808) {
 					if (dtrace_invop_jump_addr != NULL) {
 						dtrace_invop_jump_addr(frame);
 						return;
 					}
 				}
 			}
 			break;
 #endif
 #ifdef __powerpc64__
 		case EXC_DSE:
 			/*
 			 * Segment fault on the user-access window: reload
 			 * the thread's user segment entry and retry.
 			 */
 			if ((frame->cpu.aim.dar & SEGMENT_MASK) == USER_ADDR) {
 				__asm __volatile ("slbmte %0, %1" ::
 					"r"(td->td_pcb->pcb_cpu.aim.usr_vsid),
 					"r"(USER_SLB_SLBE));
 				return;
 			}
 			break;
 #endif
 		case EXC_DSI:
 			if (trap_pfault(frame, 0) == 0)
 				return;
 			break;
 		case EXC_MCHK:
 			if (handle_onfault(frame))
 				return;
 			break;
 		default:
 			break;
 		}
 		/* Nothing above recovered the kernel trap. */
 		trap_fatal(frame);
 	}
 
 	if (sig != 0) {
 		if (p->p_sysent->sv_transtrap != NULL)
 			sig = (p->p_sysent->sv_transtrap)(sig, type);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = sig;
 		ksi.ksi_code = (int) ucode; /* XXX, not POSIX */
 		/* ksi.ksi_addr = ? */
 		ksi.ksi_trapno = type;
 		trapsignal(td, &ksi);
 	}
 
 	userret(td, frame);
 }
 
 /*
  * Last-resort handler for a trap nobody recovered.  Prints the trap
  * description, offers the trap to the kernel debugger when KDB is built in
  * (returning if the debugger claims it), and otherwise panics.
  */
 static void
 trap_fatal(struct trapframe *frame)
 {
 	int from_user;
 
 	from_user = (frame->srr1 & PSL_PR);
 	printtrap(frame->exc, frame, 1, from_user);
 #ifdef KDB
 	if (debugger_on_panic || kdb_active) {
 		if (kdb_trap(frame->exc, 0, frame))
 			return;
 	}
 #endif
 	panic("%s trap", trapname(frame->exc));
 }
 
 /*
  * Print a human-readable description of a trap to the console.
  *
  * vector  - exception number, also used to select the extra address lines
  * frame   - saved register state the values are read from
  * isfatal - non-zero prints "fatal", zero prints "handled"
  * user    - non-zero prints "user", zero prints "kernel"
  */
 static void
 printtrap(u_int vector, struct trapframe *frame, int isfatal, int user)
 {
 	const char *severity, *origin;
 
 	severity = isfatal ? "fatal" : "handled";
 	origin = user ? "user" : "kernel";
 
 	printf("\n");
 	printf("%s %s trap:\n", severity, origin);
 	printf("\n");
 	printf("   exception       = 0x%x (%s)\n", vector, trapname(vector));
 
 	/* Data faults report DAR/DSISR, instruction faults report SRR0. */
 	if (vector == EXC_DSE || vector == EXC_DSI) {
 		printf("   virtual address = 0x%" PRIxPTR "\n",
 		    frame->cpu.aim.dar);
 		printf("   dsisr           = 0x%" PRIxPTR "\n",
 		    frame->cpu.aim.dsisr);
 	} else if (vector == EXC_ISE || vector == EXC_ISI) {
 		printf("   virtual address = 0x%" PRIxPTR "\n", frame->srr0);
 	}
 
 	printf("   srr0            = 0x%" PRIxPTR "\n", frame->srr0);
 	printf("   srr1            = 0x%" PRIxPTR "\n", frame->srr1);
 	printf("   lr              = 0x%" PRIxPTR "\n", frame->lr);
 	printf("   curthread       = %p\n", curthread);
 	if (curthread != NULL)
 		printf("          pid = %d, comm = %s\n",
 		    curthread->td_proc->p_pid, curthread->td_name);
 	printf("\n");
 }
 
 /*
  * Handles a fatal fault when we have onfault state to recover.  Returns
  * non-zero if there was onfault recovery state available.
  */
 static int
 handle_onfault(struct trapframe *frame)
 {
 	struct		thread *td;
 	faultbuf	*fb;
 
 	td = curthread;
 	fb = td->td_pcb->pcb_onfault;
 	if (fb != NULL) {
 		frame->srr0 = (*fb)[0];
 		frame->fixreg[1] = (*fb)[1];
 		frame->fixreg[2] = (*fb)[2];
 		frame->fixreg[3] = 1;
 		frame->cr = (*fb)[3];
 		bcopy(&(*fb)[4], &frame->fixreg[13],
 		    19 * sizeof(register_t));
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Decode the system call number and arguments for thread `td` into `sa`.
  *
  * The number is taken from r0 of the thread's trap frame; the first
  * arguments come from the frame registers starting at FIRSTARG (NARGREG of
  * them).  For SYS_syscall / SYS___syscall the real number is itself the
  * first argument, which reduces the register arguments available.  Any
  * arguments beyond those are fetched with copyin() from the location
  * MOREARGS() derives from r1.
  *
  * Returns 0 on success or the copyin() error.  On success td_retval[] is
  * primed (0 and the second argument register).
  */
 int
 cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
 {
 	struct proc *p;
 	struct trapframe *frame;
 	caddr_t	params;
 	size_t argsz;
 	int error, n, i;
 
 	p = td->td_proc;
 	frame = td->td_frame;
 
 	sa->code = frame->fixreg[0];
 	params = (caddr_t)(frame->fixreg + FIRSTARG);
 	n = NARGREG;
 
 	if (sa->code == SYS_syscall) {
 		/*
 		 * code is first argument,
 		 * followed by actual args.
 		 */
 		sa->code = *(register_t *) params;
 		params += sizeof(register_t);
 		n -= 1;
 	} else if (sa->code == SYS___syscall) {
 		/*
 		 * Like syscall, but code is a quad,
 		 * so as to maintain quad alignment
 		 * for the rest of the args.
 		 */
 		if (SV_PROC_FLAG(p, SV_ILP32)) {
 			params += sizeof(register_t);
 			sa->code = *(register_t *) params;
 			params += sizeof(register_t);
 			n -= 2;
 		} else {
 			sa->code = *(register_t *) params;
 			params += sizeof(register_t);
 			n -= 1;
 		}
 	}
 
 	if (p->p_sysent->sv_mask)
 		sa->code &= p->p_sysent->sv_mask;
 	/* Out-of-range numbers are routed to table entry 0. */
 	if (sa->code >= p->p_sysent->sv_size)
 		sa->callp = &p->p_sysent->sv_table[0];
 	else
 		sa->callp = &p->p_sysent->sv_table[sa->code];
 
 	sa->narg = sa->callp->sy_narg;
 
 	if (SV_PROC_FLAG(p, SV_ILP32)) {
 		argsz = sizeof(uint32_t);
 
 		/* 32-bit process: keep only the low word of each register. */
 		for (i = 0; i < n; i++)
 			sa->args[i] = ((u_register_t *)(params))[i] &
 			    0xffffffff;
 	} else {
 		argsz = sizeof(uint64_t);
 
 		for (i = 0; i < n; i++)
 			sa->args[i] = ((u_register_t *)(params))[i];
 	}
 
 	if (sa->narg > n)
 		error = copyin(MOREARGS(frame->fixreg[1]), sa->args + n,
 			       (sa->narg - n) * argsz);
 	else
 		error = 0;
 
 #ifdef __powerpc64__
 	if (SV_PROC_FLAG(p, SV_ILP32) && sa->narg > n) {
 		/*
 		 * Expand the size of arguments copied from the stack.
 		 *
 		 * copyin() packed (narg - n) 32-bit words at &args[n];
 		 * widen them in place, highest index first so no packed
 		 * word is overwritten before it is read.  The last valid
 		 * slot is narg - 1: starting at narg would read a word
 		 * that was never copied in and store past the final
 		 * argument.
 		 */
 		for (i = sa->narg - 1; i >= n; i--)
 			sa->args[i] = ((uint32_t *)(&sa->args[n]))[i-n];
 	}
 #endif
 
 	if (error == 0) {
 		td->td_retval[0] = 0;
 		td->td_retval[1] = frame->fixreg[FIRSTARG + 1];
 	}
 	return (error);
 }
 
 #include "../../kern/subr_syscall.c"
 
 /*
  * System call entry from trap(): records the frame on the current thread,
  * (on powerpc64) reloads the user SLB entry, then runs the generic
  * syscallenter()/syscallret() pair.
  */
 void
 syscall(struct trapframe *frame)
 {
 	struct thread *td;
 	struct syscall_args sa;
 	int error;
 
 	td = curthread;
 	td->td_frame = frame;
 
 #ifdef __powerpc64__
 	/*
 	 * Speculatively restore last user SLB segment, which we know is
 	 * invalid already, since we are likely to do copyin()/copyout().
 	 */
 	__asm __volatile ("slbmte %0, %1; isync" ::
             "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE));
 #endif
 
 	/* `sa` is filled in by syscallenter() and consumed by syscallret(). */
 	error = syscallenter(td, &sa);
 	syscallret(td, error, &sa);
 }
 
 #ifdef __powerpc64__
 /* Handle kernel SLB faults -- runs in real mode, all seat belts off */
 /*
  * Install an entry for a faulting kernel address in this CPU's SLB cache
  * (the array returned by PCPU_GET(slb)).
  *
  * type - exception number; EXC_ISE selects srr0 as the faulting address,
  *        anything else selects dar
  * dar  - data address of the fault
  * srr0 - instruction address of the fault
  *
  * Only the in-memory cache is updated here; per the note at the end of the
  * function, the trap handler reloads the hardware from it on exit.
  */
 void
 handle_kernel_slb_spill(int type, register_t dar, register_t srr0)
 {
 	struct slb *slbcache;
 	uint64_t slbe, slbv;
 	uint64_t esid, addr;
 	int i;
 
 	addr = (type == EXC_ISE) ? srr0 : dar;
 	slbcache = PCPU_GET(slb);
 	esid = (uintptr_t)addr >> ADDR_SR_SHFT;
 	slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID;
 	
 	/* See if the hardware flushed this somehow (can happen in LPARs) */
 	/* A cached slbe carries its own slot index in the low bits. */
 	for (i = 0; i < n_slbs; i++)
 		if (slbcache[i].slbe == (slbe | (uint64_t)i))
 			return;
 
 	/* Not in the map, needs to actually be added */
 	slbv = kernel_va_to_slbv(addr);
 	if (slbcache[USER_SLB_SLOT].slbe == 0) {
 		/* Look for an unused (not SLBE_VALID) non-user slot first. */
 		for (i = 0; i < n_slbs; i++) {
 			if (i == USER_SLB_SLOT)
 				continue;
 			if (!(slbcache[i].slbe & SLBE_VALID))
 				goto fillkernslb;
 		}
 
 		/*
 		 * Reaching here means the scan found no free slot.
 		 * NOTE(review): setting the user slot's slbe to 1 makes the
 		 * "== 0" test above fail on later calls, presumably to skip
 		 * the scan once the cache is full -- confirm.
 		 */
 		if (i == n_slbs)
 			slbcache[USER_SLB_SLOT].slbe = 1;
 	}
 
 	/* Sacrifice a random SLB entry that is not the user entry */
 	i = mftb() % n_slbs;
 	if (i == USER_SLB_SLOT)
 		i = (i+1) % n_slbs;
 
 fillkernslb:
 	/* Write new entry */
 	slbcache[i].slbv = slbv;
 	slbcache[i].slbe = slbe | (uint64_t)i;
 
 	/* Trap handler will restore from cache on exit */
 }
 
 /*
  * Make sure pmap `pm` has an SLB entry covering user address `addr`.
  * With the pmap locked: if no entry exists for the address, a VSID is
  * allocated for its segment; if one exists but is not currently in the
  * pmap's pm_slb[] array, it is inserted.  Always returns 0.
  */
 static int 
 handle_user_slb_spill(pmap_t pm, vm_offset_t addr)
 {
 	struct slb *entry;
 	uint64_t seg;
 	int idx, cached;
 
 	seg = (uintptr_t)addr >> ADDR_SR_SHFT;
 
 	PMAP_LOCK(pm);
 	entry = user_va_to_slb_entry(pm, addr);
 
 	if (entry != NULL) {
 		/*
 		 * Check that another CPU has not already mapped this.
 		 * XXX: Per-thread SLB caches would be better.
 		 */
 		cached = 0;
 		for (idx = 0; idx < pm->pm_slb_len && !cached; idx++)
 			cached = (pm->pm_slb[idx] == entry);
 
 		if (!cached)
 			slb_insert_user(pm, entry);
 	} else {
 		/* allocate_vsid auto-spills it */
 		(void)allocate_user_vsid(pm, seg, 0);
 	}
 	PMAP_UNLOCK(pm);
 
 	return (0);
 }
 #endif
 
 /*
  * Resolve a page fault (EXC_ISI or a data fault) through vm_fault().
  *
  * frame - trap frame; supplies the faulting address and access type
  * user  - non-zero when the fault was taken in user mode
  *
  * Returns 0 when the fault was resolved, or when a kernel-mode fault was
  * recovered through handle_onfault(); otherwise returns SIGSEGV.
  */
 static int
 trap_pfault(struct trapframe *frame, int user)
 {
 	vm_offset_t	eva, va;
 	struct		thread *td;
 	struct		proc *p;
 	vm_map_t	map;
 	vm_prot_t	ftype;
 	int		rv, hold_proc;
 	register_t	user_sr;
 
 	td = curthread;
 	p = td->td_proc;
 
 	/* Work out the faulting address and the kind of access. */
 	if (frame->exc != EXC_ISI) {
 		eva = frame->cpu.aim.dar;
 		ftype = (frame->cpu.aim.dsisr & DSISR_STORE) ?
 		    VM_PROT_WRITE : VM_PROT_READ;
 	} else {
 		eva = frame->srr0;
 		ftype = VM_PROT_EXECUTE;
 		if (frame->srr1 & SRR1_ISI_PFAULT)
 			ftype |= VM_PROT_READ;
 	}
 
 	/* Pick the address space the fault belongs to. */
 	if (user) {
 		map = &p->p_vmspace->vm_map;
 	} else if ((eva >> ADDR_SR_SHFT) != (USER_ADDR >> ADDR_SR_SHFT)) {
 		map = kernel_map;
 	} else {
 		/*
 		 * Kernel access through the USER_ADDR segment: translate
 		 * the address back using the segment recorded in the PCB.
 		 */
 		if (p->p_vmspace == NULL)
 			return (SIGSEGV);
 
 		map = &p->p_vmspace->vm_map;
 
 		user_sr = td->td_pcb->pcb_cpu.aim.usr_segm;
 		eva &= ADDR_PIDX | ADDR_POFF;
 		eva |= user_sr << ADDR_SR_SHFT;
 	}
 	va = trunc_page(eva);
 
 	/*
 	 * For a process map, keep swapout from messing with us during this
 	 * critical time.  Don't have to worry about process locking or
 	 * stacks in the kernel.
 	 */
 	hold_proc = (map != kernel_map);
 	if (hold_proc) {
 		PROC_LOCK(p);
 		++p->p_lock;
 		PROC_UNLOCK(p);
 	}
 
 	/* Fault in the page: */
 	rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 
 	if (hold_proc) {
 		PROC_LOCK(p);
 		--p->p_lock;
 		PROC_UNLOCK(p);
 		/*
 		 * XXXDTRACE: add dtrace_doubletrap_func here?
 		 */
 	}
 
 	if (rv == KERN_SUCCESS || (!user && handle_onfault(frame)))
 		return (0);
 
 	return (SIGSEGV);
 }
 
 /*
  * Probe `addr` with a read of `size` bytes, discarding the value read.
  * Thin wrapper: returns whatever badaddr_read() returns when called with a
  * NULL result pointer.
  */
 int
 badaddr(void *addr, size_t size)
 {
 	return (badaddr_read(addr, size, NULL));
 }
 
 /*
  * Try to read `size` bytes (1, 2 or 4) from `addr` under a setfault()
  * fault buffer.
  *
  * Returns 0 if the read completed, storing the value through `rptr` when
  * it is non-NULL.  Returns 1 if setfault() returned non-zero, i.e. control
  * came back through the fault buffer.  Any other `size` panics.
  */
 int
 badaddr_read(void *addr, size_t size, int *rptr)
 {
 	struct thread	*td;
 	faultbuf	env;
 	int		x;
 
 	/* Get rid of any stale machine checks that have been waiting.  */
 	__asm __volatile ("sync; isync");
 
 	td = curthread;
 
 	/* Non-zero here means we arrived via the fault buffer. */
 	if (setfault(env)) {
 		td->td_pcb->pcb_onfault = 0;
 		__asm __volatile ("sync");
 		return 1;
 	}
 
 	__asm __volatile ("sync");
 
 	/* volatile forces the access to actually be performed. */
 	switch (size) {
 	case 1:
 		x = *(volatile int8_t *)addr;
 		break;
 	case 2:
 		x = *(volatile int16_t *)addr;
 		break;
 	case 4:
 		x = *(volatile int32_t *)addr;
 		break;
 	default:
 		panic("badaddr: invalid size (%zd)", size);
 	}
 
 	/* Make sure we took the machine check, if we caused one. */
 	__asm __volatile ("sync; isync");
 
 	/* Read succeeded: unregister the fault buffer. */
 	td->td_pcb->pcb_onfault = 0;
 	__asm __volatile ("sync");	/* To be sure. */
 
 	/* Use the value to avoid reorder. */
 	if (rptr)
 		*rptr = x;
 
 	return (0);
 }
 
 /*
  * For now, this only deals with the particular unaligned access case
  * that gcc tends to generate.  Eventually it should handle all of the
  * possibilities that can happen on a 32-bit PowerPC in big-endian mode.
  */
 
 static int
 fix_unaligned(struct thread *td, struct trapframe *frame)
 {
 	struct thread	*fputhread;
 	int		indicator, reg;
 	double		*fpr;
 
 	indicator = EXC_ALI_OPCODE_INDICATOR(frame->cpu.aim.dsisr);
 
 	switch (indicator) {
 	case EXC_ALI_LFD:
 	case EXC_ALI_STFD:
 		reg = EXC_ALI_RST(frame->cpu.aim.dsisr);
 		fpr = &td->td_pcb->pcb_fpu.fpr[reg];
 		fputhread = PCPU_GET(fputhread);
 
 		/* Juggle the FPU to ensure that we've initialized
 		 * the FPRs, and that their current state is in
 		 * the PCB.
 		 */
 		if (fputhread != td) {
 			if (fputhread)
 				save_fpu(fputhread);
 			enable_fpu(td);
 		}
 		save_fpu(td);
 
 		if (indicator == EXC_ALI_LFD) {
 			if (copyin((void *)frame->cpu.aim.dar, fpr,
 			    sizeof(double)) != 0)
 				return -1;
 			enable_fpu(td);
 		} else {
 			if (copyout(fpr, (void *)frame->cpu.aim.dar,
 			    sizeof(double)) != 0)
 				return -1;
 		}
 		return 0;
 		break;
 	}
 
 	return -1;
 }
 
Index: head/sys/security/mac/mac_audit.c
===================================================================
--- head/sys/security/mac/mac_audit.c	(revision 258540)
+++ head/sys/security/mac/mac_audit.c	(revision 258541)
@@ -1,145 +1,143 @@
 /*-
  * Copyright (c) 1999-2002, 2009 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2004 Networks Associates Technology, Inc.
  * Copyright (c) 2006 SPARTA, Inc.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc. 
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
-
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/queue.h>
 #include <sys/sdt.h>
 #include <sys/vnode.h>
 
 #include <security/audit/audit.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 MAC_CHECK_PROBE_DEFINE2(cred_check_setaudit, "struct ucred *",
     "struct auditinfo *");
 
 /*
  * Runs the cred_check_setaudit MAC check for (cred, ai): the entry point is
  * dispatched through MAC_POLICY_CHECK_NOSLEEP(), the result is handed to
  * MAC_CHECK_PROBE2() together with the arguments, and `error` is returned.
  * `error` is never assigned directly in this body, so the policy-check
  * macro is presumed to set it (macro definitions are not visible here).
  */
 int
 mac_cred_check_setaudit(struct ucred *cred, struct auditinfo *ai)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_setaudit, cred, ai);
 	MAC_CHECK_PROBE2(cred_check_setaudit, error, cred, ai);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(cred_check_setaudit_addr, "struct ucred *",
     "struct auditinfo_addr *");
 
 /*
  * Runs the cred_check_setaudit_addr MAC check for (cred, aia) through
  * MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments via
  * MAC_CHECK_PROBE2(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_setaudit_addr(struct ucred *cred, struct auditinfo_addr *aia)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_setaudit_addr, cred, aia);
 	MAC_CHECK_PROBE2(cred_check_setaudit_addr, error, cred, aia);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(cred_check_setauid, "struct ucred *", "uid_t");
 
 /*
  * Runs the cred_check_setauid MAC check for (cred, auid) through
  * MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments via
  * MAC_CHECK_PROBE2(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_setauid(struct ucred *cred, uid_t auid)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_setauid, cred, auid);
 	MAC_CHECK_PROBE2(cred_check_setauid, error, cred, auid);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(system_check_audit, "struct ucred *", "void *",
     "int");
 
 /*
  * Runs the system_check_audit MAC check for (cred, record, length) through
  * MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments via
  * MAC_CHECK_PROBE3(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_system_check_audit(struct ucred *cred, void *record, int length)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(system_check_audit, cred, record, length);
 	MAC_CHECK_PROBE3(system_check_audit, error, cred, record, length);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(system_check_auditctl, "struct ucred *",
     "struct vnode *");
 
 /*
  * Runs the system_check_auditctl MAC check for `cred` on vnode `vp`.
  * The vnode lock is asserted first; the vnode's label (or NULL when `vp`
  * is NULL) is passed to the policy check along with the vnode.  Unlike the
  * sibling wrappers this one uses MAC_POLICY_CHECK() rather than the
  * _NOSLEEP form.  Returns `error`, presumed to be set by that macro.
  */
 int
 mac_system_check_auditctl(struct ucred *cred, struct vnode *vp)
 {
 	struct label *vnode_label;
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_system_check_auditctl");
 
 	if (vp != NULL)
 		vnode_label = vp->v_label;
 	else
 		vnode_label = NULL;
 	MAC_POLICY_CHECK(system_check_auditctl, cred, vp, vnode_label);
 	MAC_CHECK_PROBE2(system_check_auditctl, error, cred, vp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(system_check_auditon, "struct ucred *", "int");
 
 /*
  * Runs the system_check_auditon MAC check for (cred, cmd) through
  * MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments via
  * MAC_CHECK_PROBE2(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_system_check_auditon(struct ucred *cred, int cmd)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(system_check_auditon, cred, cmd);
 	MAC_CHECK_PROBE2(system_check_auditon, error, cred, cmd);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_cred.c
===================================================================
--- head/sys/security/mac/mac_cred.c	(revision 258540)
+++ head/sys/security/mac/mac_cred.c	(revision 258541)
@@ -1,353 +1,352 @@
 /*-
  * Copyright (c) 1999-2002, 2008-2009 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2003 Networks Associates Technology, Inc.
  * Copyright (c) 2005 Samy Al Bahra
  * Copyright (c) 2006 SPARTA, Inc.
  * Copyright (c) 2008 Apple Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc. 
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/condvar.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mac.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Allocate a label from the label zone (M_WAITOK), pass it to the
  * cred_init_label policy entry point via MAC_POLICY_PERFORM(), and return
  * it to the caller.
  */
 struct label *
 mac_cred_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(cred_init_label, label);
 	return (label);
 }
 
 /*
  * Set up cred->cr_label: a freshly allocated label when the
  * MPC_OBJECT_CRED bit is set in mac_labeled, NULL otherwise.
  */
 void
 mac_cred_init(struct ucred *cred)
 {
 
 	cred->cr_label = (mac_labeled & MPC_OBJECT_CRED) ?
 	    mac_cred_label_alloc() : NULL;
 }
 
 /*
  * Release a credential label: run the cred_destroy_label policy entry
  * point on it via MAC_POLICY_PERFORM_NOSLEEP(), then return it to the
  * label zone.
  */
 void
 mac_cred_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(cred_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Free the label attached to `cred`, if there is one, and clear the
  * pointer.  A credential without a label is left untouched.
  */
 void
 mac_cred_destroy(struct ucred *cred)
 {
 
 	if (cred->cr_label == NULL)
 		return;
 
 	mac_cred_label_free(cred->cr_label);
 	cred->cr_label = NULL;
 }
 
 /*
  * When a thread becomes an NFS server daemon, its credential may need to be
  * updated to reflect this so that policies can recognize when file system
  * operations originate from the network.
  *
  * At some point, it would be desirable if the credential used for each NFS
  * RPC could be set based on the RPC context (i.e., source system, etc) to
  * provide more fine-grained access control.
  */
 void
 mac_cred_associate_nfsd(struct ucred *cred)
 {
 
 	/* Dispatch the cred_associate_nfsd entry point with `cred`. */
 	MAC_POLICY_PERFORM_NOSLEEP(cred_associate_nfsd, cred);
 }
 
 /*
  * Initialize MAC label for the first kernel process, from which other kernel
  * processes and threads are spawned.
  */
 void
 mac_cred_create_swapper(struct ucred *cred)
 {
 
 	/* Dispatch the cred_create_swapper entry point with `cred`. */
 	MAC_POLICY_PERFORM_NOSLEEP(cred_create_swapper, cred);
 }
 
 /*
  * Initialize MAC label for the first userland process, from which other
  * userland processes and threads are spawned.
  */
 void
 mac_cred_create_init(struct ucred *cred)
 {
 
 	/* Dispatch the cred_create_init entry point with `cred`. */
 	MAC_POLICY_PERFORM_NOSLEEP(cred_create_init, cred);
 }
 
 /*
  * Externalize a credential label through MAC_POLICY_EXTERNALIZE(), passing
  * the label, the `elements` string and the output buffer with its length.
  * Returns `error`, which is not assigned directly in this body and is
  * therefore presumed to be set by the macro (definition not visible here).
  */
 int
 mac_cred_externalize_label(struct label *label, char *elements,
     char *outbuf, size_t outbuflen)
 {
 	int error;
 
 	MAC_POLICY_EXTERNALIZE(cred, label, elements, outbuf, outbuflen);
 
 	return (error);
 }
 
 /*
  * Internalize `string` into a credential label through
  * MAC_POLICY_INTERNALIZE().  Returns `error`, which is not assigned
  * directly in this body and is therefore presumed to be set by the macro
  * (definition not visible here).
  */
 int
 mac_cred_internalize_label(struct label *label, char *string)
 {
 	int error;
 
 	MAC_POLICY_INTERNALIZE(cred, label, string);
 
 	return (error);
 }
 
 /*
  * When a new process is created, its label must be initialized.  Generally,
  * this involves inheritance from the parent process, modulo possible deltas.
  * This function allows that processing to take place.
  */
 void
 mac_cred_copy(struct ucred *src, struct ucred *dest)
 {
 
 	/* Hand both credentials' labels to the cred_copy_label entry point. */
 	MAC_POLICY_PERFORM_NOSLEEP(cred_copy_label, src->cr_label,
 	    dest->cr_label);
 }
 
 /*
  * When the subject's label changes, it may require revocation of privilege
  * to mapped objects.  This can't be done on-the-fly later with a unified
  * buffer cache.
  */
 void
 mac_cred_relabel(struct ucred *cred, struct label *newlabel)
 {
 
 	/* Dispatch the cred_relabel entry point with (cred, newlabel). */
 	MAC_POLICY_PERFORM_NOSLEEP(cred_relabel, cred, newlabel);
 }
 
 MAC_CHECK_PROBE_DEFINE2(cred_check_relabel, "struct ucred *",
     "struct label *");
 
 /*
  * Runs the cred_check_relabel MAC check for (cred, newlabel) through
  * MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments via
  * MAC_CHECK_PROBE2(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_relabel(struct ucred *cred, struct label *newlabel)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_relabel, cred, newlabel);
 	MAC_CHECK_PROBE2(cred_check_relabel, error, cred, newlabel);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(cred_check_setuid, "struct ucred *", "uid_t");
 
 /*
  * Runs the cred_check_setuid MAC check for (cred, uid) through
  * MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments via
  * MAC_CHECK_PROBE2(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_setuid(struct ucred *cred, uid_t uid)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_setuid, cred, uid);
 	MAC_CHECK_PROBE2(cred_check_setuid, error, cred, uid);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(cred_check_seteuid, "struct ucred *", "uid_t");
 
 /*
  * Runs the cred_check_seteuid MAC check for (cred, euid) through
  * MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments via
  * MAC_CHECK_PROBE2(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_seteuid(struct ucred *cred, uid_t euid)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_seteuid, cred, euid);
 	MAC_CHECK_PROBE2(cred_check_seteuid, error, cred, euid);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(cred_check_setgid, "struct ucred *", "gid_t");
 
 /*
  * Runs the cred_check_setgid MAC check for (cred, gid) through
  * MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments via
  * MAC_CHECK_PROBE2(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_setgid(struct ucred *cred, gid_t gid)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_setgid, cred, gid);
 	MAC_CHECK_PROBE2(cred_check_setgid, error, cred, gid);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(cred_check_setegid, "struct ucred *", "gid_t");
 
 /*
  * Runs the cred_check_setegid MAC check for (cred, egid) through
  * MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments via
  * MAC_CHECK_PROBE2(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_setegid(struct ucred *cred, gid_t egid)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_setegid, cred, egid);
 	MAC_CHECK_PROBE2(cred_check_setegid, error, cred, egid);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(cred_check_setgroups, "struct ucred *", "int",
     "gid_t *");
 
 /*
  * Runs the cred_check_setgroups MAC check for (cred, ngroups, gidset)
  * through MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments
  * via MAC_CHECK_PROBE3(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_setgroups(struct ucred *cred, int ngroups, gid_t *gidset)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_setgroups, cred, ngroups, gidset);
 	MAC_CHECK_PROBE3(cred_check_setgroups, error, cred, ngroups, gidset);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(cred_check_setreuid, "struct ucred *", "uid_t",
     "uid_t");
 
 /*
  * Runs the cred_check_setreuid MAC check for (cred, ruid, euid) through
  * MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments via
  * MAC_CHECK_PROBE3(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_setreuid(struct ucred *cred, uid_t ruid, uid_t euid)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_setreuid, cred, ruid, euid);
 	MAC_CHECK_PROBE3(cred_check_setreuid, error, cred, ruid, euid);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(cred_check_setregid, "struct ucred *", "gid_t",
     "gid_t");
 
 /*
  * Runs the cred_check_setregid MAC check for (cred, rgid, egid) through
  * MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments via
  * MAC_CHECK_PROBE3(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_setregid(struct ucred *cred, gid_t rgid, gid_t egid)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_setregid, cred, rgid, egid);
 	MAC_CHECK_PROBE3(cred_check_setregid, error, cred, rgid, egid);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(cred_check_setresuid, "struct ucred *", "uid_t",
     "uid_t", "uid_t");
 
 /*
  * Runs the cred_check_setresuid MAC check for (cred, ruid, euid, suid)
  * through MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments
  * via MAC_CHECK_PROBE4(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_setresuid(struct ucred *cred, uid_t ruid, uid_t euid,
     uid_t suid)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_setresuid, cred, ruid, euid, suid);
 	MAC_CHECK_PROBE4(cred_check_setresuid, error, cred, ruid, euid,
 	    suid);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(cred_check_setresgid, "struct ucred *", "gid_t",
     "gid_t", "gid_t");
 
 /*
  * Runs the cred_check_setresgid MAC check for (cred, rgid, egid, sgid)
  * through MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments
  * via MAC_CHECK_PROBE4(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_setresgid(struct ucred *cred, gid_t rgid, gid_t egid,
     gid_t sgid)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_setresgid, cred, rgid, egid, sgid);
 	MAC_CHECK_PROBE4(cred_check_setresgid, error, cred, rgid, egid,
 	    sgid);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(cred_check_visible, "struct ucred *",
     "struct ucred *");
 
 /*
  * Runs the cred_check_visible MAC check for the credential pair (cr1, cr2)
  * through MAC_POLICY_CHECK_NOSLEEP(), reports `error` and the arguments
  * via MAC_CHECK_PROBE2(), and returns `error` (presumed to be set by the
  * policy-check macro; its definition is not visible here).
  */
 int
 mac_cred_check_visible(struct ucred *cr1, struct ucred *cr2)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(cred_check_visible, cr1, cr2);
 	MAC_CHECK_PROBE2(cred_check_visible, error, cr1, cr2);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_framework.c
===================================================================
--- head/sys/security/mac/mac_framework.c	(revision 258540)
+++ head/sys/security/mac/mac_framework.c	(revision 258541)
@@ -1,598 +1,597 @@
 /*-
  * Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2005 Networks Associates Technology, Inc.
  * Copyright (c) 2005-2006 SPARTA, Inc.
  * Copyright (c) 2008-2009 Apple Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract 
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*-
  * Framework for extensible kernel access control.  This file contains core
  * kernel infrastructure for the TrustedBSD MAC Framework, including policy
  * registration, versioning, locking, error composition operator, and system
  * calls.
  *
  * The MAC Framework implements three programming interfaces:
  *
  * - The kernel MAC interface, defined in mac_framework.h, and invoked
  *   throughout the kernel to request security decisions, notify of security
  *   related events, etc.
  *
  * - The MAC policy module interface, defined in mac_policy.h, which is
  *   implemented by MAC policy modules and invoked by the MAC Framework to
  *   forward kernel security requests and notifications to policy modules.
  *
  * - The user MAC API, defined in mac.h, which allows user programs to query
  *   and set label state on objects.
  *
  * The majority of the MAC Framework implementation may be found in
  * src/sys/security/mac.  Sample policy modules may be found in
  * src/sys/security/mac_*.
  */
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/condvar.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/module.h>
 #include <sys/rmlock.h>
 #include <sys/sdt.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * DTrace SDT providers for MAC.
  */
 SDT_PROVIDER_DEFINE(mac);
 SDT_PROVIDER_DEFINE(mac_framework);
 
 SDT_PROBE_DEFINE2(mac, kernel, policy, modevent, modevent, "int",
     "struct mac_policy_conf *");
 SDT_PROBE_DEFINE1(mac, kernel, policy, register, register,
     "struct mac_policy_conf *");
 SDT_PROBE_DEFINE1(mac, kernel, policy, unregister, unregister,
     "struct mac_policy_conf *");
 
 /*
  * Root sysctl node for all MAC and MAC policy controls.
  */
 SYSCTL_NODE(_security, OID_AUTO, mac, CTLFLAG_RW, 0,
     "TrustedBSD MAC policy controls");
 
 /*
  * Declare that the kernel provides MAC support, version 3 (FreeBSD 7.x).
  * This permits modules to refuse to be loaded if the necessary support isn't
  * present, even if it's pre-boot.
  */
 MODULE_VERSION(kernel_mac_support, MAC_VERSION);
 
 static unsigned int	mac_version = MAC_VERSION;
 SYSCTL_UINT(_security_mac, OID_AUTO, version, CTLFLAG_RD, &mac_version, 0,
     "");
 
 /*
  * Labels consist of an indexed set of "slots", which are allocated to
  * policies as required.  The MAC Framework maintains a bitmask of slots
  * allocated so
  * far to prevent reuse.  Slots cannot be reused, as the MAC Framework
  * guarantees that newly allocated slots in labels will be NULL unless
  * otherwise initialized, and because we do not have a mechanism to garbage
  * collect slots on policy unload.  As labeled policies tend to be statically
  * loaded during boot, and not frequently unloaded and reloaded, this is not
  * generally an issue.
  */
 #if MAC_MAX_SLOTS > 32
 #error "MAC_MAX_SLOTS too large"
 #endif
 
 static unsigned int mac_max_slots = MAC_MAX_SLOTS;
 /*
  * Bitmask of slots still available: bit N set means slot N is free.  The
  * mask is computed in 64-bit unsigned arithmetic so that the permitted
  * maximum of 32 slots does not shift a 32-bit int by its full width, which
  * would be undefined behaviour.
  */
 static unsigned int mac_slot_offsets_free =
     (unsigned int)((1ULL << MAC_MAX_SLOTS) - 1);
 SYSCTL_UINT(_security_mac, OID_AUTO, max_slots, CTLFLAG_RD, &mac_max_slots,
     0, "");
 
 /*
  * Has the kernel started generating labeled objects yet?  All read/write
  * access to this variable is serialized during the boot process.  Following
  * the end of serialization, we don't update this flag; no locking.
  */
 static int	mac_late = 0;
 
 /*
  * Each policy declares a mask of object types requiring labels to be
  * allocated for them.  For convenience, we combine and cache the bitwise or
  * of the per-policy object flags to track whether we will allocate a label
  * for an object type at run-time.
  */
 uint64_t	mac_labeled;
 SYSCTL_UQUAD(_security_mac, OID_AUTO, labeled, CTLFLAG_RD, &mac_labeled, 0,
     "Mask of object types being labeled");
 
 MALLOC_DEFINE(M_MACTEMP, "mactemp", "MAC temporary label storage");
 
 /*
  * MAC policy modules are placed in one of two lists: mac_static_policy_list,
  * for policies that are loaded early and cannot be unloaded, and
  * mac_policy_list, which holds policies either loaded later in the boot
  * cycle or that may be unloaded.  The static policy list does not require
  * locks to iterate over, but the dynamic list requires synchronization.
  * Support for dynamic policy loading can be compiled out using the
  * MAC_STATIC kernel option.
  *
  * The dynamic policy list is protected by two locks: modifying the list
  * requires both locks to be held exclusively.  One of the locks,
  * mac_policy_rm, is acquired over policy entry points that will never sleep;
  * the other, mac_policy_sx, is acquired over policy entry points that may
  * sleep.  The former category will be used when kernel locks may be held
  * over calls to the MAC Framework, during network processing in ithreads,
  * etc.  The latter will tend to involve potentially blocking memory
  * allocations, extended attribute I/O, etc.
  */
 #ifndef MAC_STATIC
 static struct rmlock mac_policy_rm;	/* Non-sleeping entry points. */
 static struct sx mac_policy_sx;		/* Sleeping entry points. */
 #endif
 
 struct mac_policy_list_head mac_policy_list;
 struct mac_policy_list_head mac_static_policy_list;
 u_int mac_policy_count;			/* Registered policy count. */
 
 static void	mac_policy_xlock(void);
 static void	mac_policy_xlock_assert(void);
 static void	mac_policy_xunlock(void);
 
 /*
  * Read-lock mac_policy_rm, recording the acquisition in the caller's
  * tracker.  Nothing is locked until mac_late has been set, and the whole
  * body is compiled out under MAC_STATIC.
  */
 void
 mac_policy_slock_nosleep(struct rm_priotracker *tracker)
 {
 
 #ifndef MAC_STATIC
 	if (mac_late)
 		rm_rlock(&mac_policy_rm, tracker);
 #endif
 }
 
 /*
  * Take a shared hold on mac_policy_sx.  The WITNESS_WARN check runs
  * unconditionally; the lock itself is only taken once mac_late is set and
  * only when MAC_STATIC is not defined.
  */
 void
 mac_policy_slock_sleep(void)
 {
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "mac_policy_slock_sleep");
 
 #ifndef MAC_STATIC
 	if (mac_late)
 		sx_slock(&mac_policy_sx);
 #endif
 }
 
 /*
  * Release the read lock on mac_policy_rm described by tracker.  Mirrors
  * mac_policy_slock_nosleep(): a no-op until mac_late is set and under
  * MAC_STATIC.
  */
 void
 mac_policy_sunlock_nosleep(struct rm_priotracker *tracker)
 {
 
 #ifndef MAC_STATIC
 	if (mac_late)
 		rm_runlock(&mac_policy_rm, tracker);
 #endif
 }
 
 /*
  * Drop the shared hold on mac_policy_sx.  Mirrors
  * mac_policy_slock_sleep(): a no-op until mac_late is set and under
  * MAC_STATIC.
  */
 void
 mac_policy_sunlock_sleep(void)
 {
 
 #ifndef MAC_STATIC
 	if (mac_late)
 		sx_sunlock(&mac_policy_sx);
 #endif
 }
 
 /*
  * Acquire both policy locks exclusively: mac_policy_sx first, then
  * mac_policy_rm.  The WITNESS_WARN check always runs; the locks are only
  * taken once mac_late is set and when MAC_STATIC is not defined.
  */
 static void
 mac_policy_xlock(void)
 {
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "mac_policy_xlock()");
 
 #ifndef MAC_STATIC
 	if (mac_late) {
 		sx_xlock(&mac_policy_sx);
 		rm_wlock(&mac_policy_rm);
 	}
 #endif
 }
 
 /*
  * Release both policy locks in the reverse of the order used by
  * mac_policy_xlock(): mac_policy_rm first, then mac_policy_sx.  A no-op
  * until mac_late is set and under MAC_STATIC.
  */
 static void
 mac_policy_xunlock(void)
 {
 
 #ifndef MAC_STATIC
 	if (mac_late) {
 		rm_wunlock(&mac_policy_rm);
 		sx_xunlock(&mac_policy_sx);
 	}
 #endif
 }
 
 /*
  * Assert that mac_policy_sx is held exclusively.  Only the sx lock is
  * checked; the matching assertion on mac_policy_rm remains disabled.  A
  * no-op until mac_late is set and under MAC_STATIC.
  */
 static void
 mac_policy_xlock_assert(void)
 {
 
 #ifndef MAC_STATIC
 	if (mac_late) {
 		/* XXXRW: rm_assert(&mac_policy_rm, RA_WLOCKED); */
 		sx_assert(&mac_policy_sx, SA_XLOCKED);
 	}
 #endif
 }
 
 /*
  * Initialize the MAC subsystem, including appropriate SMP locks.
  *
  * Both policy lists start out empty, the label zone is set up via
  * mac_labelzone_init(), and -- unless MAC_STATIC is defined -- the two
  * policy locks are initialized.  Both locks are created with their
  * NOWITNESS flag; mac_policy_rm is additionally created with RM_RECURSE.
  * Registered as a SYSINIT at SI_SUB_MAC at the end of this file.
  */
 static void
 mac_init(void)
 {
 
 	LIST_INIT(&mac_static_policy_list);
 	LIST_INIT(&mac_policy_list);
 	mac_labelzone_init();
 
 #ifndef MAC_STATIC
 	rm_init_flags(&mac_policy_rm, "mac_policy_rm", RM_NOWITNESS |
 	    RM_RECURSE);
 	sx_init_flags(&mac_policy_sx, "mac_policy_sx", SX_NOWITNESS);
 #endif
 }
 
 /*
  * For the purposes of modules that want to know if they were loaded "early",
  * set the mac_late flag once we've processed modules either linked into the
  * kernel, or loaded before the kernel startup.
  *
  * From this point on the mac_policy_*lock*() helpers in this file take the
  * real locks, and mac_policy_register() places new policies on the dynamic
  * list.  Registered as a SYSINIT at SI_SUB_MAC_LATE.
  */
 static void
 mac_late_init(void)
 {
 
 	mac_late = 1;
 }
 
 /*
  * Build the mask of object types a policy wants labeled.  Each object type
  * is represented by one MPC_OBJECT_* bit, which is set when the policy
  * supplies a non-NULL init-label method for that type.
  */
 static uint64_t
 mac_policy_getlabeled(struct mac_policy_conf *mpc)
 {
 	uint64_t mask = 0;
 
 	/* Set 'flag' in the mask if the policy implements mpo_<method>. */
 #define	MPC_FLAG(method, flag) do {					\
 	if (mpc->mpc_ops->mpo_ ## method != NULL)			\
 		mask |= (flag);						\
 } while (0)
 
 	MPC_FLAG(cred_init_label, MPC_OBJECT_CRED);
 	MPC_FLAG(proc_init_label, MPC_OBJECT_PROC);
 	MPC_FLAG(vnode_init_label, MPC_OBJECT_VNODE);
 	MPC_FLAG(inpcb_init_label, MPC_OBJECT_INPCB);
 	MPC_FLAG(socket_init_label, MPC_OBJECT_SOCKET);
 	MPC_FLAG(devfs_init_label, MPC_OBJECT_DEVFS);
 	MPC_FLAG(mbuf_init_label, MPC_OBJECT_MBUF);
 	MPC_FLAG(ipq_init_label, MPC_OBJECT_IPQ);
 	MPC_FLAG(ifnet_init_label, MPC_OBJECT_IFNET);
 	MPC_FLAG(bpfdesc_init_label, MPC_OBJECT_BPFDESC);
 	MPC_FLAG(pipe_init_label, MPC_OBJECT_PIPE);
 	MPC_FLAG(mount_init_label, MPC_OBJECT_MOUNT);
 	MPC_FLAG(posixsem_init_label, MPC_OBJECT_POSIXSEM);
 	MPC_FLAG(posixshm_init_label, MPC_OBJECT_POSIXSHM);
 	MPC_FLAG(sysvmsg_init_label, MPC_OBJECT_SYSVMSG);
 	MPC_FLAG(sysvmsq_init_label, MPC_OBJECT_SYSVMSQ);
 	MPC_FLAG(sysvsem_init_label, MPC_OBJECT_SYSVSEM);
 	MPC_FLAG(sysvshm_init_label, MPC_OBJECT_SYSVSHM);
 	MPC_FLAG(syncache_init_label, MPC_OBJECT_SYNCACHE);
 	MPC_FLAG(ip6q_init_label, MPC_OBJECT_IP6Q);
 
 #undef MPC_FLAG
 	return (mask);
 }
 
 /*
  * When policies are loaded or unloaded, walk the list of registered policies
  * and build mac_labeled, a bitmask representing the union of all objects
  * requiring labels across all policies.  mac_policy_count is recomputed in
  * the same pass as the total number of entries on both policy lists.
  *
  * The caller must hold the policy lock exclusively (asserted below).  Both
  * globals are reset to zero and rebuilt in place.
  */
 static void
 mac_policy_update(void)
 {
 	struct mac_policy_conf *mpc;
 
 	mac_policy_xlock_assert();
 
 	mac_labeled = 0;
 	mac_policy_count = 0;
 	LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) {
 		mac_labeled |= mac_policy_getlabeled(mpc);
 		mac_policy_count++;
 	}
 	LIST_FOREACH(mpc, &mac_policy_list, mpc_list) {
 		mac_labeled |= mac_policy_getlabeled(mpc);
 		mac_policy_count++;
 	}
 }
 
 /*
  * Register a policy with the framework.
  *
  * The policy is placed on the static list when it is loaded before
  * mac_late is set and is not flagged MPC_LOADTIME_FLAG_UNLOADOK; otherwise
  * it goes on the dynamic list.  If the policy asks for a label slot
  * (mpc_field_off != NULL), the lowest free slot is claimed and its index
  * stored through mpc_field_off.
  *
  * Returns 0 on success, EEXIST if a policy with the same name is already
  * on the target list, or ENOMEM if no label slot is free.
  */
 static int
 mac_policy_register(struct mac_policy_conf *mpc)
 {
 	struct mac_policy_list_head *list;
 	struct mac_policy_conf *tmpc;
 	int error, slot, static_entry;
 
 	error = 0;
 
 	/*
 	 * We don't technically need exclusive access while !mac_late, but
 	 * hold it for assertion consistency.
 	 */
 	mac_policy_xlock();
 
 	/*
 	 * If the module can potentially be unloaded, or we're loading late,
 	 * we have to stick it in the non-static list and pay an extra
 	 * performance overhead.  Otherwise, we can pay a light locking cost
 	 * and stick it in the static list.
 	 */
 	static_entry = (!mac_late &&
 	    !(mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_UNLOADOK));
 	list = static_entry ? &mac_static_policy_list : &mac_policy_list;
 
 	/* Refuse a second policy with the same name on the target list. */
 	LIST_FOREACH(tmpc, list, mpc_list) {
 		if (strcmp(tmpc->mpc_name, mpc->mpc_name) == 0) {
 			error = EEXIST;
 			goto out;
 		}
 	}
 	if (mpc->mpc_field_off != NULL) {
 		slot = ffs(mac_slot_offsets_free);
 		if (slot == 0) {
 			error = ENOMEM;
 			goto out;
 		}
 		slot--;
 		/*
 		 * Use an unsigned one so that claiming slot 31 does not
 		 * shift into the sign bit of an int.
 		 */
 		mac_slot_offsets_free &= ~(1U << slot);
 		*mpc->mpc_field_off = slot;
 	}
 	mpc->mpc_runtime_flags |= MPC_RUNTIME_FLAG_REGISTERED;
 
 	/*
 	 * If we're loading a MAC module after the framework has initialized,
 	 * it has to go into the dynamic list.  If we're loading it before
 	 * we've finished initializing, it can go into the static list with
 	 * weaker locking requirements.
 	 */
 	LIST_INSERT_HEAD(list, mpc, mpc_list);
 
 	/*
 	 * Per-policy initialization.  Currently, this takes place under the
 	 * exclusive lock, so policies must not sleep in their init method.
 	 * In the future, we may want to separate "init" from "start", with
 	 * "init" occurring without the lock held.  Likewise, on tear-down,
 	 * breaking out "stop" from "destroy".
 	 */
 	if (mpc->mpc_ops->mpo_init != NULL)
 		(*(mpc->mpc_ops->mpo_init))(mpc);
 	mac_policy_update();
 
 	SDT_PROBE(mac, kernel, policy, register, mpc, 0, 0, 0, 0);
 	printf("Security policy loaded: %s (%s)\n", mpc->mpc_fullname,
 	    mpc->mpc_name);
 
 out:
 	mac_policy_xunlock();
 	return (error);
 }
 
 /*
  * Remove a previously registered policy.
  *
  * Returns 0 if the policy was removed or had never been registered, and
  * EBUSY if the policy does not carry MPC_LOADTIME_FLAG_UNLOADOK.  The
  * policy's mpo_destroy method, if any, runs with the exclusive policy lock
  * held; the probe and the console message are emitted after the lock has
  * been dropped.
  */
 static int
 mac_policy_unregister(struct mac_policy_conf *mpc)
 {
 
 	/*
 	 * If we fail the load, we may get a request to unload.  Check to see
 	 * if we did the run-time registration, and if not, silently succeed.
 	 */
 	mac_policy_xlock();
 	if ((mpc->mpc_runtime_flags & MPC_RUNTIME_FLAG_REGISTERED) == 0) {
 		mac_policy_xunlock();
 		return (0);
 	}
 #if 0
 	/*
 	 * Don't allow unloading modules with private data.
 	 */
 	if (mpc->mpc_field_off != NULL) {
 		mac_policy_xunlock();
 		return (EBUSY);
 	}
 #endif
 	/*
 	 * Only allow the unload to proceed if the module is unloadable by
 	 * its own definition.
 	 */
 	if ((mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_UNLOADOK) == 0) {
 		mac_policy_xunlock();
 		return (EBUSY);
 	}
 	if (mpc->mpc_ops->mpo_destroy != NULL)
 		(*(mpc->mpc_ops->mpo_destroy))(mpc);
 
 	/* Unlink, clear the registered flag, then refresh the cached totals. */
 	LIST_REMOVE(mpc, mpc_list);
 	mpc->mpc_runtime_flags &= ~MPC_RUNTIME_FLAG_REGISTERED;
 	mac_policy_update();
 	mac_policy_xunlock();
 
 	SDT_PROBE(mac, kernel, policy, unregister, mpc, 0, 0, 0, 0);
 	printf("Security policy unload: %s (%s)\n", mpc->mpc_fullname,
 	    mpc->mpc_name);
 
 	return (0);
 }
 
 /*
  * Module event handler shared by MAC policy modules; 'data' carries the
  * policy's struct mac_policy_conf.
  *
  * MOD_LOAD registers the policy, unless it is flagged
  * MPC_LOADTIME_FLAG_NOTLATE and mac_late is already set (EBUSY).
  * MOD_UNLOAD unregisters a policy that was registered and is a successful
  * no-op otherwise.  Any other event yields EOPNOTSUPP.  With MAC_STATIC,
  * every event after mac_late is set is refused with EBUSY.
  */
 int
 mac_policy_modevent(module_t mod, int type, void *data)
 {
 	struct mac_policy_conf *mpc = data;
 
 #ifdef MAC_STATIC
 	if (mac_late) {
 		printf("mac_policy_modevent: MAC_STATIC and late\n");
 		return (EBUSY);
 	}
 #endif
 
 	SDT_PROBE(mac, kernel, policy, modevent, type, mpc, 0, 0, 0);
 
 	if (type == MOD_LOAD) {
 		if ((mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_NOTLATE) != 0 &&
 		    mac_late != 0) {
 			printf("mac_policy_modevent: can't load %s policy "
 			    "after booting\n", mpc->mpc_name);
 			return (EBUSY);
 		}
 		return (mac_policy_register(mpc));
 	}
 
 	if (type == MOD_UNLOAD) {
 		/* Don't unregister the module if it was never registered. */
 		if ((mpc->mpc_runtime_flags & MPC_RUNTIME_FLAG_REGISTERED)
 		    == 0)
 			return (0);
 		return (mac_policy_unregister(mpc));
 	}
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Combine two error values into one by precedence.  The first entry of the
  * table below that matches either argument wins; if neither argument is in
  * the table, a non-zero error1 is preferred, and error2 is returned
  * otherwise.
  */
 int
 mac_error_select(int error1, int error2)
 {
 	/* Highest precedence first. */
 	static const int precedence[] = {
 		EDEADLK,	/* Decision-making errors take top priority. */
 		EINVAL,		/* Invalid arguments, reported where possible. */
 		ESRCH,		/* "Visibility" of processes... */
 		ENOENT,		/* ...and of files. */
 		EACCES,		/* DAC/MAC protections. */
 		EPERM		/* Privilege. */
 	};
 	unsigned int i;
 
 	for (i = 0; i < sizeof(precedence) / sizeof(precedence[0]); i++) {
 		if (error1 == precedence[i] || error2 == precedence[i])
 			return (precedence[i]);
 	}
 
 	/* Precedence goes to error over success; otherwise, arbitrary. */
 	return (error1 != 0 ? error1 : error2);
 }
 
 /*
  * Sanity-check a struct mac: the declared buffer length must lie within
  * [0, MAC_MAX_LABEL_BUF_LEN].  Returns 0 when it does, EINVAL otherwise.
  */
 int
 mac_check_structmac_consistent(struct mac *mac)
 {
 
 	if (mac->m_buflen >= 0 &&
 	    mac->m_buflen <= MAC_MAX_LABEL_BUF_LEN)
 		return (0);
 
 	return (EINVAL);
 }
 
 SYSINIT(mac, SI_SUB_MAC, SI_ORDER_FIRST, mac_init, NULL);
 SYSINIT(mac_late, SI_SUB_MAC_LATE, SI_ORDER_FIRST, mac_late_init, NULL);
Index: head/sys/security/mac/mac_inet.c
===================================================================
--- head/sys/security/mac/mac_inet.c	(revision 258540)
+++ head/sys/security/mac/mac_inet.c	(revision 258541)
@@ -1,508 +1,507 @@
 /*-
  * Copyright (c) 1999-2002, 2007, 2009 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2004 Networks Associates Technology, Inc.
  * Copyright (c) 2006 SPARTA, Inc.
  * Copyright (c) 2008 Apple Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc. 
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Allocate a label for an inpcb and run the inpcb_init_label entry point
  * on it.  The sleepable dispatch macro is used only when the caller passed
  * M_WAITOK.  error is not assigned explicitly; it is presumably written by
  * the MAC_POLICY_CHECK* macros (confirm in mac_internal.h).
  *
  * Returns the label, or NULL if the zone allocation failed or error was
  * set, in which case the partly initialized label is destroyed and freed.
  */
 static struct label *
 mac_inpcb_label_alloc(int flag)
 {
 	struct label *label;
 	int error;
 
 	label = mac_labelzone_alloc(flag);
 	if (label == NULL)
 		return (NULL);
 	if (flag & M_WAITOK)
 		MAC_POLICY_CHECK(inpcb_init_label, label, flag);
 	else
 		MAC_POLICY_CHECK_NOSLEEP(inpcb_init_label, label, flag);
 	if (error) {
 		MAC_POLICY_PERFORM_NOSLEEP(inpcb_destroy_label, label);
 		mac_labelzone_free(label);
 		return (NULL);
 	}
 	return (label);
 }
 
 /*
  * Attach a MAC label to a new inpcb.  When no policy labels inpcbs,
  * inp_label is simply set to NULL.  Returns 0 on success and ENOMEM if a
  * label was required but could not be allocated.
  */
 int
 mac_inpcb_init(struct inpcb *inp, int flag)
 {
 
 	if ((mac_labeled & MPC_OBJECT_INPCB) == 0) {
 		inp->inp_label = NULL;
 		return (0);
 	}
 
 	inp->inp_label = mac_inpcb_label_alloc(flag);
 	return (inp->inp_label != NULL ? 0 : ENOMEM);
 }
 
 /*
  * Allocate a label for an IP reassembly queue and run the ipq_init_label
  * entry point on it.  The sleepable dispatch macro is used only when the
  * caller passed M_WAITOK.  error is not assigned explicitly; it is
  * presumably written by the MAC_POLICY_CHECK* macros (confirm in
  * mac_internal.h).
  *
  * Returns the label, or NULL if the zone allocation failed or error was
  * set, in which case the partly initialized label is destroyed and freed.
  */
 static struct label *
 mac_ipq_label_alloc(int flag)
 {
 	struct label *label;
 	int error;
 
 	label = mac_labelzone_alloc(flag);
 	if (label == NULL)
 		return (NULL);
 
 	if (flag & M_WAITOK)
 		MAC_POLICY_CHECK(ipq_init_label, label, flag);
 	else
 		MAC_POLICY_CHECK_NOSLEEP(ipq_init_label, label, flag);
 	if (error) {
 		MAC_POLICY_PERFORM_NOSLEEP(ipq_destroy_label, label);
 		mac_labelzone_free(label);
 		return (NULL);
 	}
 	return (label);
 }
 
 /*
  * Attach a MAC label to a new IP reassembly queue.  When no policy labels
  * ipqs, ipq_label is simply set to NULL.  Returns 0 on success and ENOMEM
  * if a label was required but could not be allocated.
  */
 int
 mac_ipq_init(struct ipq *q, int flag)
 {
 
 	if ((mac_labeled & MPC_OBJECT_IPQ) == 0) {
 		q->ipq_label = NULL;
 		return (0);
 	}
 
 	q->ipq_label = mac_ipq_label_alloc(flag);
 	return (q->ipq_label != NULL ? 0 : ENOMEM);
 }
 
 /*
  * Tear down an inpcb label: dispatch the inpcb_destroy_label entry point
  * on it, then return the label to the label zone.
  */
 static void
 mac_inpcb_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(inpcb_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release the MAC label attached to an inpcb, if there is one, and clear
  * the pointer afterwards.
  */
 void
 mac_inpcb_destroy(struct inpcb *inp)
 {
 
 	if (inp->inp_label == NULL)
 		return;
 
 	mac_inpcb_label_free(inp->inp_label);
 	inp->inp_label = NULL;
 }
 
 /*
  * Tear down an ipq label: dispatch the ipq_destroy_label entry point on
  * it, then return the label to the label zone.
  */
 static void
 mac_ipq_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(ipq_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release the MAC label attached to an IP reassembly queue, if there is
  * one, and clear the pointer afterwards.
  */
 void
 mac_ipq_destroy(struct ipq *q)
 {
 
 	if (q->ipq_label == NULL)
 		return;
 
 	mac_ipq_label_free(q->ipq_label);
 	q->ipq_label = NULL;
 }
 
 /*
  * Dispatch the inpcb_create entry point with the socket, the socket's
  * label, the inpcb and the inpcb's label.
  */
 void
 mac_inpcb_create(struct socket *so, struct inpcb *inp)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(inpcb_create, so, so->so_label, inp,
 	    inp->inp_label);
 }
 
 /*
  * Dispatch the ipq_reassemble entry point with the queue, its label, the
  * mbuf m and the label obtained from mac_mbuf_to_label(m).  Returns
  * immediately when no policies are registered.
  */
 void
 mac_ipq_reassemble(struct ipq *q, struct mbuf *m)
 {
 	struct label *label;
 
 	if (mac_policy_count == 0)
 		return;
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(ipq_reassemble, q, q->ipq_label, m,
 	    label);
 }
 
 /*
  * Dispatch the netinet_fragment entry point with the mbuf m, the mbuf
  * frag, and the labels obtained for each from mac_mbuf_to_label().
  * Returns immediately when no policies are registered.
  */
 void
 mac_netinet_fragment(struct mbuf *m, struct mbuf *frag)
 {
 	struct label *mlabel, *fraglabel;
 
 	if (mac_policy_count == 0)
 		return;
 
 	mlabel = mac_mbuf_to_label(m);
 	fraglabel = mac_mbuf_to_label(frag);
 
 	MAC_POLICY_PERFORM_NOSLEEP(netinet_fragment, m, mlabel, frag,
 	    fraglabel);
 }
 
 /*
  * Dispatch the ipq_create entry point with the mbuf m, the label obtained
  * from mac_mbuf_to_label(m), the queue and the queue's label.  Returns
  * immediately when no policies are registered.
  */
 void
 mac_ipq_create(struct mbuf *m, struct ipq *q)
 {
 	struct label *label;
 
 	if (mac_policy_count == 0)
 		return;
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(ipq_create, m, label, q, q->ipq_label);
 }
 
 /*
  * Dispatch the inpcb_create_mbuf entry point with the inpcb, its label,
  * the mbuf m and the label obtained from mac_mbuf_to_label(m).  The inpcb
  * lock must be held, and the assertion runs before the no-policy early
  * return.
  */
 void
 mac_inpcb_create_mbuf(struct inpcb *inp, struct mbuf *m)
 {
 	struct label *mlabel;
 
 	INP_LOCK_ASSERT(inp);
 
 	if (mac_policy_count == 0)
 		return;
 
 	mlabel = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(inpcb_create_mbuf, inp, inp->inp_label, m,
 	    mlabel);
 }
 
 /*
  * Dispatch the ipq_match entry point for the mbuf m and the queue q.
  * Returns 1 when no policies are registered.  Otherwise result starts at 1
  * and is presumably combined with each policy's answer using "&&" inside
  * MAC_POLICY_BOOLEAN_NOSLEEP, which is handed that operator but never
  * names result here -- confirm in mac_internal.h.
  */
 int
 mac_ipq_match(struct mbuf *m, struct ipq *q)
 {
 	struct label *label;
 	int result;
 
 	if (mac_policy_count == 0)
 		return (1);
 
 	label = mac_mbuf_to_label(m);
 
 	result = 1;
 	MAC_POLICY_BOOLEAN_NOSLEEP(ipq_match, &&, m, label, q, q->ipq_label);
 
 	return (result);
 }
 
 /*
  * Dispatch the netinet_arp_send entry point with the interface, its
  * label, the mbuf m and the label obtained from mac_mbuf_to_label(m).
  * The dispatch is bracketed by MAC_IFNET_LOCK/MAC_IFNET_UNLOCK on the
  * interface.  Returns immediately when no policies are registered.
  */
 void
 mac_netinet_arp_send(struct ifnet *ifp, struct mbuf *m)
 {
 	struct label *mlabel;
 
 	if (mac_policy_count == 0)
 		return;
 
 	mlabel = mac_mbuf_to_label(m);
 
 	MAC_IFNET_LOCK(ifp);
 	MAC_POLICY_PERFORM_NOSLEEP(netinet_arp_send, ifp, ifp->if_label, m,
 	    mlabel);
 	MAC_IFNET_UNLOCK(ifp);
 }
 
 /*
  * Dispatch the netinet_icmp_reply entry point with the mbufs mrecv and
  * msend and the labels obtained for each from mac_mbuf_to_label().
  * Returns immediately when no policies are registered.
  */
 void
 mac_netinet_icmp_reply(struct mbuf *mrecv, struct mbuf *msend)
 {
 	struct label *mrecvlabel, *msendlabel;
 
 	if (mac_policy_count == 0)
 		return;
 
 	mrecvlabel = mac_mbuf_to_label(mrecv);
 	msendlabel = mac_mbuf_to_label(msend);
 
 	MAC_POLICY_PERFORM_NOSLEEP(netinet_icmp_reply, mrecv, mrecvlabel,
 	    msend, msendlabel);
 }
 
 /*
  * Dispatch the netinet_icmp_replyinplace entry point with the mbuf m and
  * the label obtained from mac_mbuf_to_label(m).  Returns immediately when
  * no policies are registered.
  */
 void
 mac_netinet_icmp_replyinplace(struct mbuf *m)
 {
 	struct label *label;
 
 	if (mac_policy_count == 0)
 		return;
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(netinet_icmp_replyinplace, m, label);
 }
 
 /*
  * Dispatch the netinet_igmp_send entry point with the interface, its
  * label, the mbuf m and the label obtained from mac_mbuf_to_label(m).
  * The dispatch is bracketed by MAC_IFNET_LOCK/MAC_IFNET_UNLOCK on the
  * interface.  Returns immediately when no policies are registered.
  */
 void
 mac_netinet_igmp_send(struct ifnet *ifp, struct mbuf *m)
 {
 	struct label *mlabel;
 
 	if (mac_policy_count == 0)
 		return;
 
 	mlabel = mac_mbuf_to_label(m);
 
 	MAC_IFNET_LOCK(ifp);
 	MAC_POLICY_PERFORM_NOSLEEP(netinet_igmp_send, ifp, ifp->if_label, m,
 	    mlabel);
 	MAC_IFNET_UNLOCK(ifp);
 }
 
 /*
  * Dispatch the netinet_tcp_reply entry point with the mbuf m and the
  * label obtained from mac_mbuf_to_label(m).  Returns immediately when no
  * policies are registered.
  */
 void
 mac_netinet_tcp_reply(struct mbuf *m)
 {
 	struct label *label;
 
 	if (mac_policy_count == 0)
 		return;
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(netinet_tcp_reply, m, label);
 }
 
 /*
  * Dispatch the ipq_update entry point with the mbuf m, the label obtained
  * from mac_mbuf_to_label(m), the queue and the queue's label.  Returns
  * immediately when no policies are registered.
  */
 void
 mac_ipq_update(struct mbuf *m, struct ipq *q)
 {
 	struct label *label;
 
 	if (mac_policy_count == 0)
 		return;
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(ipq_update, m, label, q, q->ipq_label);
 }
 
 /*
  * MAC check dispatched as inpcb_check_deliver for the mbuf m and the
  * inpcb inp.  m must carry a packet header (asserted).  Returns 0 without
  * dispatching when no policies are registered.  Otherwise the return value
  * is whatever MAC_POLICY_CHECK_NOSLEEP presumably leaves in error (it is
  * never assigned explicitly here); that value is also reported through the
  * inpcb_check_deliver probe.
  */
 MAC_CHECK_PROBE_DEFINE2(inpcb_check_deliver, "struct inpcb *",
     "struct mbuf *");
 
 int
 mac_inpcb_check_deliver(struct inpcb *inp, struct mbuf *m)
 {
 	struct label *label;
 	int error;
 
 	M_ASSERTPKTHDR(m);
 
 	if (mac_policy_count == 0)
 		return (0);
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_CHECK_NOSLEEP(inpcb_check_deliver, inp, inp->inp_label, m,
 	    label);
 	MAC_CHECK_PROBE2(inpcb_check_deliver, error, inp, m);
 
 	return (error);
 }
 
 /*
  * MAC check dispatched as inpcb_check_visible for the credential cred
  * and the inpcb inp; the inpcb lock must be held (asserted).  The return
  * value is whatever MAC_POLICY_CHECK_NOSLEEP presumably leaves in error
  * (it is never assigned explicitly here); that value is also reported
  * through the inpcb_check_visible probe.  Unlike the mbuf-based checks in
  * this file, there is no mac_policy_count early return here.
  */
 MAC_CHECK_PROBE_DEFINE2(inpcb_check_visible, "struct ucred *",
     "struct inpcb *");
 
 int
 mac_inpcb_check_visible(struct ucred *cred, struct inpcb *inp)
 {
 	int error;
 
 	INP_LOCK_ASSERT(inp);
 
 	MAC_POLICY_CHECK_NOSLEEP(inpcb_check_visible, cred, inp,
 	    inp->inp_label);
 	MAC_CHECK_PROBE2(inpcb_check_visible, error, cred, inp);
 
 	return (error);
 }
 
 /*
  * Dispatch the inpcb_sosetlabel entry point with the socket, the socket's
  * label, the inpcb and the inpcb's label.  The caller must hold the inpcb
  * write lock and the socket lock (both asserted).
  */
 void
 mac_inpcb_sosetlabel(struct socket *so, struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	SOCK_LOCK_ASSERT(so);
 
 	MAC_POLICY_PERFORM_NOSLEEP(inpcb_sosetlabel, so, so->so_label, inp,
 	    inp->inp_label);
 }
 
 /*
  * Dispatch the netinet_firewall_reply entry point with the mbufs mrecv
  * and msend and the labels obtained for each from mac_mbuf_to_label().
  * Both mbufs must carry packet headers; the assertions run before the
  * no-policy early return.
  */
 void
 mac_netinet_firewall_reply(struct mbuf *mrecv, struct mbuf *msend)
 {
 	struct label *mrecvlabel, *msendlabel;
 
 	M_ASSERTPKTHDR(mrecv);
 	M_ASSERTPKTHDR(msend);
 
 	if (mac_policy_count == 0)
 		return;
 
 	mrecvlabel = mac_mbuf_to_label(mrecv);
 	msendlabel = mac_mbuf_to_label(msend);
 
 	MAC_POLICY_PERFORM_NOSLEEP(netinet_firewall_reply, mrecv, mrecvlabel,
 	    msend, msendlabel);
 }
 
 /*
  * Dispatch the netinet_firewall_send entry point with the mbuf m and the
  * label obtained from mac_mbuf_to_label(m).  m must carry a packet header;
  * the assertion runs before the no-policy early return.
  */
 void
 mac_netinet_firewall_send(struct mbuf *m)
 {
 	struct label *label;
 
 	M_ASSERTPKTHDR(m);
 
 	if (mac_policy_count == 0)
 		return;
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(netinet_firewall_send, m, label);
 }
 
 /*
  * These functions really should be referencing the syncache structure
  * instead of the label.  However, due to some of the complexities associated
  * with exposing this syncache structure we operate directly on its label
  * pointer.  This should be OK since we aren't making any access control
  * decisions within this code directly, we are merely allocating and copying
  * label storage so we can properly initialize mbuf labels for any packets
  * the syncache code might create.
  */
 
 /*
  * Destroy and free the syncache label referenced by *label, if any, and
  * reset *label to NULL so that a repeated call is harmless.
  */
 void
 mac_syncache_destroy(struct label **label)
 {
 
 	if (*label != NULL) {
 		MAC_POLICY_PERFORM_NOSLEEP(syncache_destroy_label, *label);
 		mac_labelzone_free(*label);
 		*label = NULL;
 	}
 }
 
 /*
  * Allocate and initialize a syncache label, storing it through label.
  *
  * When no policy labels syncache entries, *label is set to NULL and 0 is
  * returned.  Otherwise the label is allocated with M_NOWAIT; ENOMEM is
  * returned if that allocation fails, and the error left by the
  * syncache_init_label dispatch is returned if initialization fails.  On
  * every failure path *label ends up NULL, so the caller is never left
  * holding a pointer to freed storage and a later mac_syncache_destroy()
  * is harmless.
  */
 int
 mac_syncache_init(struct label **label)
 {
 	int error;
 
 	if (mac_labeled & MPC_OBJECT_SYNCACHE) {
 		*label = mac_labelzone_alloc(M_NOWAIT);
 		if (*label == NULL)
 			return (ENOMEM);
 		/*
 		 * Since we are holding the inpcb locks the policy can not
 		 * allocate policy specific label storage using M_WAITOK.  So
 		 * we need to do a MAC_CHECK instead of the typical
 		 * MAC_PERFORM so we can propagate allocation failures back
 		 * to the syncache code.
 		 */
 		MAC_POLICY_CHECK_NOSLEEP(syncache_init_label, *label,
 		    M_NOWAIT);
 		if (error) {
 			MAC_POLICY_PERFORM_NOSLEEP(syncache_destroy_label,
 			    *label);
 			mac_labelzone_free(*label);
 			/* Do not hand a freed label back to the caller. */
 			*label = NULL;
 		}
 		return (error);
 	} else
 		*label = NULL;
 	return (0);
 }
 
 /*
  * Dispatch the syncache_create entry point with the syncache label and
  * the inpcb.  The caller must hold the inpcb write lock (asserted).
  */
 void
 mac_syncache_create(struct label *label, struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 
 	MAC_POLICY_PERFORM_NOSLEEP(syncache_create, label, inp);
 }
 
 /*
  * Dispatch the syncache_create_mbuf entry point with the syncache label,
  * the mbuf m and the label obtained from mac_mbuf_to_label(m).  m must
  * carry a packet header; the assertion runs before the no-policy early
  * return.
  */
 void
 mac_syncache_create_mbuf(struct label *sc_label, struct mbuf *m)
 {
 	struct label *mlabel;
 
 	M_ASSERTPKTHDR(m);
 
 	if (mac_policy_count == 0)
 		return;
 
 	mlabel = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(syncache_create_mbuf, sc_label, m,
 	    mlabel);
 }
Index: head/sys/security/mac/mac_net.c
===================================================================
--- head/sys/security/mac/mac_net.c	(revision 258540)
+++ head/sys/security/mac/mac_net.c	(revision 258541)
@@ -1,502 +1,501 @@
 /*-
  * Copyright (c) 1999-2002, 2009 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2004 Networks Associates Technology, Inc.
  * Copyright (c) 2006 SPARTA, Inc.
  * Copyright (c) 2008 Apple Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc. 
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mac.h>
 #include <sys/priv.h>
 #include <sys/sbuf.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/bpfdesc.h>
 #include <net/if.h>
 #include <net/if_var.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * XXXRW: struct ifnet locking is incomplete in the network code, so we use
  * our own global mutex for struct ifnet.  Non-ideal, but should help in the
  * SMP environment.
  *
  * NOTE(review): presumably this is the mutex behind the MAC_IFNET_LOCK() /
  * MAC_IFNET_UNLOCK() calls used throughout this file -- confirm in
  * mac_internal.h.
  */
 struct mtx mac_ifnet_mtx;
 MTX_SYSINIT(mac_ifnet_mtx, &mac_ifnet_mtx, "mac_ifnet", MTX_DEF);
 
 /*
  * Look up the MAC label carried by an mbuf by searching for its
  * PACKET_TAG_MACLABEL tag.  Returns NULL for a NULL mbuf or when no such tag
  * is attached; depending on the value of mac_labelmbufs a label may
  * legitimately be absent, so policies must cope with an mbuf that has no
  * label storage if they do not enforce early loading.
  */
 struct label *
 mac_mbuf_to_label(struct mbuf *m)
 {
 	struct m_tag *mtag;
 
 	if (m == NULL)
 		return (NULL);
 
 	mtag = m_tag_find(m, PACKET_TAG_MACLABEL, NULL);
 
 	/* The label is stored immediately behind the tag header. */
 	return (mtag != NULL ? (struct label *)(mtag + 1) : NULL);
 }
 
 /*
  * Allocate a label from the label zone with M_WAITOK and run the
  * bpfdesc_init_label policy entry point on it.  Returns the new label.
  */
 static struct label *
 mac_bpfdesc_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(bpfdesc_init_label, label);
 	return (label);
 }
 
 /*
  * Attach label storage to a BPF descriptor, or set bd_label to NULL when
  * MPC_OBJECT_BPFDESC is not set in mac_labeled.
  */
 void
 mac_bpfdesc_init(struct bpf_d *d)
 {
 
 	d->bd_label = (mac_labeled & MPC_OBJECT_BPFDESC) != 0 ?
 	    mac_bpfdesc_label_alloc() : NULL;
 }
 
 /*
  * Allocate a label from the label zone with M_WAITOK and run the
  * ifnet_init_label policy entry point on it.  Returns the new label.
  */
 static struct label *
 mac_ifnet_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(ifnet_init_label, label);
 	return (label);
 }
 
 /*
  * Attach label storage to a network interface, or set if_label to NULL when
  * MPC_OBJECT_IFNET is not set in mac_labeled.
  */
 void
 mac_ifnet_init(struct ifnet *ifp)
 {
 
 	ifp->if_label = (mac_labeled & MPC_OBJECT_IFNET) != 0 ?
 	    mac_ifnet_label_alloc() : NULL;
 }
 
 /*
  * Initialize the label stored directly behind an mbuf tag header.
  *
  * `flag` is forwarded to the mbuf_init_label policy entry point; when it
  * contains M_WAITOK the MAC_POLICY_CHECK variant is used, otherwise
  * MAC_POLICY_CHECK_NOSLEEP.  Returns the value the check macro leaves in
  * `error`; on a non-zero result the label is torn down again first.
  */
 int
 mac_mbuf_tag_init(struct m_tag *tag, int flag)
 {
 	struct label *label;
 	int error;
 
 	/* The label lives immediately after the tag header. */
 	label = (struct label *) (tag + 1);
 	mac_init_label(label);
 
 	if (flag & M_WAITOK)
 		MAC_POLICY_CHECK(mbuf_init_label, label, flag);
 	else
 		MAC_POLICY_CHECK_NOSLEEP(mbuf_init_label, label, flag);
 	if (error) {
 		/* Undo whatever the policies managed to set up. */
 		MAC_POLICY_PERFORM_NOSLEEP(mbuf_destroy_label, label);
 		mac_destroy_label(label);
 	}
 	return (error);
 }
 
 int
 mac_mbuf_init(struct mbuf *m, int flag)
 {
 	struct m_tag *tag;
 	int error;
 
 	M_ASSERTPKTHDR(m);
 
 	if (mac_labeled & MPC_OBJECT_MBUF) {
 		tag = m_tag_get(PACKET_TAG_MACLABEL, sizeof(struct label),
 		    flag);
 		if (tag == NULL)
 			return (ENOMEM);
 		error = mac_mbuf_tag_init(tag, flag);
 		if (error) {
 			m_tag_free(tag);
 			return (error);
 		}
 		m_tag_prepend(m, tag);
 	}
 	return (0);
 }
 
 /*
  * Run the bpfdesc_destroy_label policy entry point on `label`, then return
  * its storage to the label zone.
  */
 static void
 mac_bpfdesc_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(bpfdesc_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release the label attached to a BPF descriptor, if any, and clear
  * bd_label.
  */
 void
 mac_bpfdesc_destroy(struct bpf_d *d)
 {
 
 	if (d->bd_label == NULL)
 		return;
 
 	mac_bpfdesc_label_free(d->bd_label);
 	d->bd_label = NULL;
 }
 
 /*
  * Run the ifnet_destroy_label policy entry point on `label`, then return
  * its storage to the label zone.
  */
 static void
 mac_ifnet_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(ifnet_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release the label attached to a network interface, if any, and clear
  * if_label.
  */
 void
 mac_ifnet_destroy(struct ifnet *ifp)
 {
 
 	if (ifp->if_label == NULL)
 		return;
 
 	mac_ifnet_label_free(ifp->if_label);
 	ifp->if_label = NULL;
 }
 
 /*
  * Tear down the label stored behind an mbuf tag header: run the
  * mbuf_destroy_label policy entry point, then mac_destroy_label().  The tag
  * itself is not freed here.
  */
 void
 mac_mbuf_tag_destroy(struct m_tag *tag)
 {
 	struct label *label;
 
 	label = (struct label *)(tag+1);
 
 	MAC_POLICY_PERFORM_NOSLEEP(mbuf_destroy_label, label);
 	mac_destroy_label(label);
 }
 
 /*
  * mac_mbuf_tag_copy is called when an mbuf header is duplicated, in which
  * case the labels must also be duplicated.  Both labels are taken from the
  * memory directly behind the respective tag headers.
  */
 void
 mac_mbuf_tag_copy(struct m_tag *src, struct m_tag *dest)
 {
 	struct label *src_label, *dest_label;
 
 	src_label = (struct label *)(src+1);
 	dest_label = (struct label *)(dest+1);
 
 	/*
 	 * mac_mbuf_tag_init() is called on the target tag in m_tag_copy(),
 	 * so we don't need to call it here.
 	 */
 	MAC_POLICY_PERFORM_NOSLEEP(mbuf_copy_label, src_label, dest_label);
 }
 
 /*
  * Copy the MAC label of `m_from` onto `m_to` through the mbuf_copy_label
  * policy entry point.  Either label may be NULL here, since
  * mac_mbuf_to_label() returns NULL for an mbuf without a label tag.
  */
 void
 mac_mbuf_copy(struct mbuf *m_from, struct mbuf *m_to)
 {
 	struct label *src_label, *dest_label;
 
 	/* Skip the label lookups entirely when mac_policy_count is zero. */
 	if (mac_policy_count == 0)
 		return;
 
 	src_label = mac_mbuf_to_label(m_from);
 	dest_label = mac_mbuf_to_label(m_to);
 
 	MAC_POLICY_PERFORM_NOSLEEP(mbuf_copy_label, src_label, dest_label);
 }
 
 /*
  * Copy ifnet label `src` into `dest` through the ifnet_copy_label policy
  * entry point.
  */
 static void
 mac_ifnet_copy_label(struct label *src, struct label *dest)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(ifnet_copy_label, src, dest);
 }
 
 /*
  * Convert an ifnet label to its text form via MAC_POLICY_EXTERNALIZE.
  * NOTE(review): judging by the argument names and by mac_ifnet_ioctl_get(),
  * `elements` names what to render and the text goes to `outbuf` (capacity
  * `outbuflen`) -- confirm against the macro.  Returns the value the macro
  * leaves in `error`.
  */
 static int
 mac_ifnet_externalize_label(struct label *label, char *elements,
     char *outbuf, size_t outbuflen)
 {
 	int error;
 
 	MAC_POLICY_EXTERNALIZE(ifnet, label, elements, outbuf, outbuflen);
 
 	return (error);
 }
 
 /*
  * Fill in ifnet label `label` from the text in `string` via
  * MAC_POLICY_INTERNALIZE.  Returns the value the macro leaves in `error`.
  */
 static int
 mac_ifnet_internalize_label(struct label *label, char *string)
 {
 	int error;
 
 	MAC_POLICY_INTERNALIZE(ifnet, label, string);
 
 	return (error);
 }
 
 /*
  * Run the ifnet_create policy entry point on an interface and its label,
  * with the MAC ifnet lock held around the call.
  */
 void
 mac_ifnet_create(struct ifnet *ifp)
 {
 
 	/* Avoid taking the lock at all when mac_policy_count is zero. */
 	if (mac_policy_count == 0)
 		return;
 
 	MAC_IFNET_LOCK(ifp);
 	MAC_POLICY_PERFORM_NOSLEEP(ifnet_create, ifp, ifp->if_label);
 	MAC_IFNET_UNLOCK(ifp);
 }
 
 /*
  * Run the bpfdesc_create policy entry point with the credential `cred`, the
  * BPF descriptor and the descriptor's label.
  */
 void
 mac_bpfdesc_create(struct ucred *cred, struct bpf_d *d)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(bpfdesc_create, cred, d, d->bd_label);
 }
 
 /*
  * Run the bpfdesc_create_mbuf policy entry point with the BPF descriptor,
  * its label, the mbuf and the mbuf's label.  The caller must hold the
  * descriptor lock (asserted).
  */
 void
 mac_bpfdesc_create_mbuf(struct bpf_d *d, struct mbuf *m)
 {
 	struct label *label;
 
 	/* Assume reader lock is enough. */
 	BPFD_LOCK_ASSERT(d);
 
 	if (mac_policy_count == 0)
 		return;
 
 	/* NULL when the mbuf carries no label tag. */
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(bpfdesc_create_mbuf, d, d->bd_label, m,
 	    label);
 }
 
 /*
  * Run the ifnet_create_mbuf policy entry point with the interface, its
  * label, the mbuf and the mbuf's label, holding the MAC ifnet lock around
  * the call.
  */
 void
 mac_ifnet_create_mbuf(struct ifnet *ifp, struct mbuf *m)
 {
 	struct label *label;
 
 	if (mac_policy_count == 0)
 		return;
 
 	/* Looked up before the lock is taken; NULL if there is no tag. */
 	label = mac_mbuf_to_label(m);
 
 	MAC_IFNET_LOCK(ifp);
 	MAC_POLICY_PERFORM_NOSLEEP(ifnet_create_mbuf, ifp, ifp->if_label, m,
 	    label);
 	MAC_IFNET_UNLOCK(ifp);
 }
 
 MAC_CHECK_PROBE_DEFINE2(bpfdesc_check_receive, "struct bpf_d *",
     "struct ifnet *");
 
 /*
  * Run the bpfdesc_check_receive policy check for BPF descriptor `d` against
  * interface `ifp`, passing both labels.  The caller must hold the
  * descriptor lock (asserted).  Returns 0 immediately when mac_policy_count
  * is zero; otherwise returns the value the check macro leaves in `error`,
  * which is also reported through the bpfdesc_check_receive probe.
  */
 int
 mac_bpfdesc_check_receive(struct bpf_d *d, struct ifnet *ifp)
 {
 	int error;
 
 	/* Assume reader lock is enough. */
 	BPFD_LOCK_ASSERT(d);
 
 	if (mac_policy_count == 0)
 		return (0);
 
 	/* Both the check and the probe run with the ifnet lock held. */
 	MAC_IFNET_LOCK(ifp);
 	MAC_POLICY_CHECK_NOSLEEP(bpfdesc_check_receive, d, d->bd_label, ifp,
 	    ifp->if_label);
 	MAC_CHECK_PROBE2(bpfdesc_check_receive, error, d, ifp);
 	MAC_IFNET_UNLOCK(ifp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(ifnet_check_transmit, "struct ifnet *",
     "struct mbuf *");
 
 /*
  * Run the ifnet_check_transmit policy check for sending packet-header mbuf
  * `m` on interface `ifp`, passing both labels.  Returns 0 immediately when
  * mac_policy_count is zero; otherwise returns the value the check macro
  * leaves in `error`, which is also reported through the
  * ifnet_check_transmit probe.
  */
 int
 mac_ifnet_check_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct label *label;
 	int error;
 
 	M_ASSERTPKTHDR(m);
 
 	if (mac_policy_count == 0)
 		return (0);
 
 	/* NULL when the mbuf carries no label tag. */
 	label = mac_mbuf_to_label(m);
 
 	MAC_IFNET_LOCK(ifp);
 	MAC_POLICY_CHECK_NOSLEEP(ifnet_check_transmit, ifp, ifp->if_label, m,
 	    label);
 	MAC_CHECK_PROBE2(ifnet_check_transmit, error, ifp, m);
 	MAC_IFNET_UNLOCK(ifp);
 
 	return (error);
 }
 
 /*
  * Copy the text form of an interface's MAC label out to user space.
  *
  * ifr->ifr_ifru.ifru_data points at a user struct mac whose m_string holds
  * the element list on input and receives the externalized label on output;
  * m_buflen bounds both.  Returns EINVAL when MPC_OBJECT_IFNET is not set in
  * mac_labeled, otherwise 0 or the first error from copyin(),
  * mac_check_structmac_consistent(), copyinstr(), the externalize step or
  * copyout().  `cred` is not used here.
  */
 int
 mac_ifnet_ioctl_get(struct ucred *cred, struct ifreq *ifr,
     struct ifnet *ifp)
 {
 	char *elements, *buffer;
 	struct label *intlabel;
 	struct mac mac;
 	int error;
 
 	if (!(mac_labeled & MPC_OBJECT_IFNET))
 		return (EINVAL);
 
 	error = copyin(ifr->ifr_ifru.ifru_data, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		return (error);
 	}
 
 	/* Zeroed so that strlen(buffer) below is always well defined. */
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	/*
 	 * Snapshot the interface label into a private copy under the lock;
 	 * the externalize step then runs on the copy with the lock dropped.
 	 */
 	intlabel = mac_ifnet_label_alloc();
 	MAC_IFNET_LOCK(ifp);
 	mac_ifnet_copy_label(ifp->if_label, intlabel);
 	MAC_IFNET_UNLOCK(ifp);
 	error = mac_ifnet_externalize_label(intlabel, elements, buffer,
 	    mac.m_buflen);
 	mac_ifnet_label_free(intlabel);
 	if (error == 0)
 		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
 
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 
 	return (error);
 }
 
 /*
  * Replace an interface's MAC label with one supplied from user space.
  *
  * ifr->ifr_ifru.ifru_data points at a user struct mac whose m_string holds
  * the text form of the new label.  Returns EINVAL when MPC_OBJECT_IFNET is
  * not set in mac_labeled; otherwise the first error from copyin(),
  * mac_check_structmac_consistent(), copyinstr(), the internalize step,
  * priv_check_cred() or the ifnet_check_relabel check, or 0 once the
  * ifnet_relabel entry point has run.
  */
 int
 mac_ifnet_ioctl_set(struct ucred *cred, struct ifreq *ifr, struct ifnet *ifp)
 {
 	struct label *intlabel;
 	struct mac mac;
 	char *buffer;
 	int error;
 
 	if ((mac_labeled & MPC_OBJECT_IFNET) == 0)
 		return (EINVAL);
 
 	/* Fetch and validate the user's struct mac. */
 	error = copyin(ifr->ifr_ifru.ifru_data, &mac, sizeof(mac));
 	if (error == 0)
 		error = mac_check_structmac_consistent(&mac);
 	if (error != 0)
 		return (error);
 
 	/* Pull in the label text. */
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
 	if (error != 0) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	/* Parse the text into a scratch label; the text is then done with. */
 	intlabel = mac_ifnet_label_alloc();
 	error = mac_ifnet_internalize_label(intlabel, buffer);
 	free(buffer, M_MACTEMP);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * XXX: Note that this is a redundant privilege check, since policies
 	 * impose this check themselves if required by the policy.
 	 * Eventually, this should go away.
 	 */
 	error = priv_check_cred(cred, PRIV_NET_SETIFMAC, 0);
 	if (error != 0)
 		goto out;
 
 	/* Check and relabel atomically with respect to the ifnet lock. */
 	MAC_IFNET_LOCK(ifp);
 	MAC_POLICY_CHECK_NOSLEEP(ifnet_check_relabel, cred, ifp,
 	    ifp->if_label, intlabel);
 	if (error == 0) {
 		MAC_POLICY_PERFORM_NOSLEEP(ifnet_relabel, cred, ifp,
 		    ifp->if_label, intlabel);
 	}
 	MAC_IFNET_UNLOCK(ifp);
 
 out:
 	/* The scratch label is released on every path past its allocation. */
 	mac_ifnet_label_free(intlabel);
 	return (error);
 }
Index: head/sys/security/mac/mac_pipe.c
===================================================================
--- head/sys/security/mac/mac_pipe.c	(revision 258540)
+++ head/sys/security/mac/mac_pipe.c	(revision 258541)
@@ -1,260 +1,259 @@
 /*-
  * Copyright (c) 2002-2003 Networks Associates Technology, Inc.
  * Copyright (c) 2006 SPARTA, Inc.
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc. 
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/pipe.h>
 #include <sys/sysctl.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Allocate a label from the label zone with M_WAITOK and run the
  * pipe_init_label policy entry point on it.  Returns the new label.
  */
 struct label *
 mac_pipe_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(pipe_init_label, label);
 	return (label);
 }
 
 /*
  * Attach label storage to a pipe pair, or set pp_label to NULL when
  * MPC_OBJECT_PIPE is not set in mac_labeled.
  */
 void
 mac_pipe_init(struct pipepair *pp)
 {
 
 	pp->pp_label = (mac_labeled & MPC_OBJECT_PIPE) != 0 ?
 	    mac_pipe_label_alloc() : NULL;
 }
 
 /*
  * Run the pipe_destroy_label policy entry point on `label`, then return its
  * storage to the label zone.
  */
 void
 mac_pipe_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(pipe_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release the label attached to a pipe pair, if any, and clear pp_label.
  */
 void
 mac_pipe_destroy(struct pipepair *pp)
 {
 
 	if (pp->pp_label == NULL)
 		return;
 
 	mac_pipe_label_free(pp->pp_label);
 	pp->pp_label = NULL;
 }
 
 /*
  * Copy pipe label `src` into `dest` through the pipe_copy_label policy
  * entry point.
  */
 void
 mac_pipe_copy_label(struct label *src, struct label *dest)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(pipe_copy_label, src, dest);
 }
 
 /*
  * Convert a pipe label to its text form via MAC_POLICY_EXTERNALIZE.
  * NOTE(review): judging by the argument names, `elements` names what to
  * render and the text goes to `outbuf` (capacity `outbuflen`) -- confirm
  * against the macro.  Returns the value the macro leaves in `error`.
  */
 int
 mac_pipe_externalize_label(struct label *label, char *elements,
     char *outbuf, size_t outbuflen)
 {
 	int error;
 
 	MAC_POLICY_EXTERNALIZE(pipe, label, elements, outbuf, outbuflen);
 
 	return (error);
 }
 
 /*
  * Fill in pipe label `label` from the text in `string` via
  * MAC_POLICY_INTERNALIZE.  Returns the value the macro leaves in `error`.
  */
 int
 mac_pipe_internalize_label(struct label *label, char *string)
 {
 	int error;
 
 	MAC_POLICY_INTERNALIZE(pipe, label, string);
 
 	return (error);
 }
 
 /*
  * Run the pipe_create policy entry point with the credential `cred`, the
  * pipe pair and the pipe pair's label.
  */
 void
 mac_pipe_create(struct ucred *cred, struct pipepair *pp)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(pipe_create, cred, pp, pp->pp_label);
 }
 
 /*
  * Run the pipe_relabel policy entry point with the pipe pair's current
  * label and `newlabel`.  No permission check is made here; within this file
  * mac_pipe_label_set() performs one before calling this function.
  */
 static void
 mac_pipe_relabel(struct ucred *cred, struct pipepair *pp,
     struct label *newlabel)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(pipe_relabel, cred, pp, pp->pp_label,
 	    newlabel);
 }
 
 MAC_CHECK_PROBE_DEFINE4(pipe_check_ioctl, "struct ucred *",
     "struct pipepair *", "unsigned long", "void *");
 
 /*
  * Run the pipe_check_ioctl policy check for `cred` issuing ioctl `cmd` with
  * argument `data` on pipe pair `pp`.  The caller must hold pp->pp_mtx
  * (asserted).  Returns the value the check macro leaves in `error`, which
  * is also reported through the pipe_check_ioctl probe.
  */
 int
 mac_pipe_check_ioctl(struct ucred *cred, struct pipepair *pp,
     unsigned long cmd, void *data)
 {
 	int error;
 
 	mtx_assert(&pp->pp_mtx, MA_OWNED);
 
 	MAC_POLICY_CHECK_NOSLEEP(pipe_check_ioctl, cred, pp, pp->pp_label,
 	    cmd, data);
 	MAC_CHECK_PROBE4(pipe_check_ioctl, error, cred, pp, cmd, data);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(pipe_check_poll, "struct ucred *",
     "struct pipepair *");
 
 /*
  * Run the pipe_check_poll policy check for `cred` on pipe pair `pp`.  The
  * caller must hold pp->pp_mtx (asserted).  Returns the value the check
  * macro leaves in `error`, which is also reported through the
  * pipe_check_poll probe.
  */
 int
 mac_pipe_check_poll(struct ucred *cred, struct pipepair *pp)
 {
 	int error;
 
 	mtx_assert(&pp->pp_mtx, MA_OWNED);
 
 	MAC_POLICY_CHECK_NOSLEEP(pipe_check_poll, cred, pp, pp->pp_label);
 	MAC_CHECK_PROBE2(pipe_check_poll, error, cred, pp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(pipe_check_read, "struct ucred *",
     "struct pipepair *");
 
 /*
  * Run the pipe_check_read policy check for `cred` on pipe pair `pp`.  The
  * caller must hold pp->pp_mtx (asserted).  Returns the value the check
  * macro leaves in `error`, which is also reported through the
  * pipe_check_read probe.
  */
 int
 mac_pipe_check_read(struct ucred *cred, struct pipepair *pp)
 {
 	int error;
 
 	mtx_assert(&pp->pp_mtx, MA_OWNED);
 
 	MAC_POLICY_CHECK_NOSLEEP(pipe_check_read, cred, pp, pp->pp_label);
 	MAC_CHECK_PROBE2(pipe_check_read, error, cred, pp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(pipe_check_relabel, "struct ucred *",
     "struct pipepair *", "struct label *");
 
 /*
  * Run the pipe_check_relabel policy check for `cred` replacing the label of
  * pipe pair `pp` with `newlabel`.  The caller must hold pp->pp_mtx
  * (asserted).  Returns the value the check macro leaves in `error`, which
  * is also reported through the pipe_check_relabel probe; a non-zero result
  * makes mac_pipe_label_set() skip the relabel.
  */
 static int
 mac_pipe_check_relabel(struct ucred *cred, struct pipepair *pp,
     struct label *newlabel)
 {
 	int error;
 
 	mtx_assert(&pp->pp_mtx, MA_OWNED);
 
 	MAC_POLICY_CHECK_NOSLEEP(pipe_check_relabel, cred, pp, pp->pp_label,
 	    newlabel);
 	MAC_CHECK_PROBE3(pipe_check_relabel, error, cred, pp, newlabel);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(pipe_check_stat, "struct ucred *",
     "struct pipepair *");
 
 /*
  * Run the pipe_check_stat policy check for `cred` on pipe pair `pp`.  The
  * caller must hold pp->pp_mtx (asserted).  Returns the value the check
  * macro leaves in `error`, which is also reported through the
  * pipe_check_stat probe.
  */
 int
 mac_pipe_check_stat(struct ucred *cred, struct pipepair *pp)
 {
 	int error;
 
 	mtx_assert(&pp->pp_mtx, MA_OWNED);
 
 	MAC_POLICY_CHECK_NOSLEEP(pipe_check_stat, cred, pp, pp->pp_label);
 	MAC_CHECK_PROBE2(pipe_check_stat, error, cred, pp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(pipe_check_write, "struct ucred *",
     "struct pipepair *");
 
 /*
  * Run the pipe_check_write policy check for `cred` on pipe pair `pp`.  The
  * caller must hold pp->pp_mtx (asserted).  Returns the value the check
  * macro leaves in `error`, which is also reported through the
  * pipe_check_write probe.
  */
 int
 mac_pipe_check_write(struct ucred *cred, struct pipepair *pp)
 {
 	int error;
 
 	mtx_assert(&pp->pp_mtx, MA_OWNED);
 
 	MAC_POLICY_CHECK_NOSLEEP(pipe_check_write, cred, pp, pp->pp_label);
 	MAC_CHECK_PROBE2(pipe_check_write, error, cred, pp);
 
 	return (error);
 }
 
 /*
  * Relabel a pipe pair on behalf of `cred`: run the relabel check and, only
  * if it returns 0, apply `label`.  The caller must hold pp->pp_mtx
  * (asserted).  Returns the check result, i.e. 0 when the relabel was
  * performed.
  */
 int
 mac_pipe_label_set(struct ucred *cred, struct pipepair *pp,
     struct label *label)
 {
 	int error;
 
 	mtx_assert(&pp->pp_mtx, MA_OWNED);
 
 	error = mac_pipe_check_relabel(cred, pp, label);
 	if (error == 0)
 		mac_pipe_relabel(cred, pp, label);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_posix_sem.c
===================================================================
--- head/sys/security/mac/mac_posix_sem.c	(revision 258540)
+++ head/sys/security/mac/mac_posix_sem.c	(revision 258541)
@@ -1,232 +1,231 @@
 /*-
  * Copyright (c) 2003-2006 SPARTA, Inc.
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc. 
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 #include "opt_posix.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/ksem.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Allocate a label from the label zone with M_WAITOK and run the
  * posixsem_init_label policy entry point on it.  Returns the new label.
  */
 static struct label *
 mac_posixsem_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(posixsem_init_label, label);
 	return (label);
 }
 
 /*
  * Attach label storage to a POSIX semaphore, or set ks_label to NULL when
  * MPC_OBJECT_POSIXSEM is not set in mac_labeled.
  */
 void
 mac_posixsem_init(struct ksem *ks)
 {
 
 	ks->ks_label = (mac_labeled & MPC_OBJECT_POSIXSEM) != 0 ?
 	    mac_posixsem_label_alloc() : NULL;
 }
 
 /*
  * Run the posixsem_destroy_label policy entry point on `label`, then return
  * its storage to the label zone.
  */
 static void
 mac_posixsem_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(posixsem_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release the label attached to a POSIX semaphore, if any, and clear
  * ks_label.
  */
 void
 mac_posixsem_destroy(struct ksem *ks)
 {
 
 	if (ks->ks_label == NULL)
 		return;
 
 	mac_posixsem_label_free(ks->ks_label);
 	ks->ks_label = NULL;
 }
 
 /*
  * Run the posixsem_create policy entry point with the credential `cred`,
  * the semaphore and the semaphore's label.
  */
 void
 mac_posixsem_create(struct ucred *cred, struct ksem *ks)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(posixsem_create, cred, ks, ks->ks_label);
 }
 
 MAC_CHECK_PROBE_DEFINE2(posixsem_check_open, "struct ucred *",
     "struct ksem *");
 
 /*
  * Run the posixsem_check_open policy check for `cred` on semaphore `ks`.
  * Returns the value the check macro leaves in `error`, which is also
  * reported through the posixsem_check_open probe.
  */
 int
 mac_posixsem_check_open(struct ucred *cred, struct ksem *ks)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixsem_check_open, cred, ks,
 	    ks->ks_label);
 	MAC_CHECK_PROBE2(posixsem_check_open, error, cred, ks);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(posixsem_check_getvalue, "struct ucred *",
     "struct ucred *", "struct ksem *");
 
 /*
  * Run the posixsem_check_getvalue policy check on semaphore `ks`, passing
  * both `active_cred` and `file_cred` to the policies.  Returns the value
  * the check macro leaves in `error`, which is also reported through the
  * posixsem_check_getvalue probe.
  */
 int
 mac_posixsem_check_getvalue(struct ucred *active_cred, struct ucred *file_cred,
     struct ksem *ks)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixsem_check_getvalue, active_cred,
 	    file_cred, ks, ks->ks_label);
 	MAC_CHECK_PROBE3(posixsem_check_getvalue, error, active_cred,
 	    file_cred, ks);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(posixsem_check_post, "struct ucred *",
     "struct ucred *", "struct ksem *");
 
 /*
  * Run the posixsem_check_post policy check on semaphore `ks`, passing both
  * `active_cred` and `file_cred` to the policies.  Returns the value the
  * check macro leaves in `error`, which is also reported through the
  * posixsem_check_post probe.
  */
 int
 mac_posixsem_check_post(struct ucred *active_cred, struct ucred *file_cred,
     struct ksem *ks)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixsem_check_post, active_cred, file_cred,
 	    ks, ks->ks_label);
 	MAC_CHECK_PROBE3(posixsem_check_post, error, active_cred, file_cred,
 	    ks);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(posixsem_check_stat, "struct ucred *",
     "struct ucred *", "struct ksem *");
 
 /*
  * Run the posixsem_check_stat policy check on semaphore `ks`, passing both
  * `active_cred` and `file_cred` to the policies.  Returns the value the
  * check macro leaves in `error`, which is also reported through the
  * posixsem_check_stat probe.
  */
 int
 mac_posixsem_check_stat(struct ucred *active_cred, struct ucred *file_cred,
     struct ksem *ks)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixsem_check_stat, active_cred, file_cred,
 	    ks, ks->ks_label);
 	MAC_CHECK_PROBE3(posixsem_check_stat, error, active_cred, file_cred,
 	    ks);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(posixsem_check_unlink, "struct ucred *",
     "struct ksem *");
 
 /*
  * Run the posixsem_check_unlink policy check for `cred` on semaphore `ks`.
  * Returns the value the check macro leaves in `error`, which is also
  * reported through the posixsem_check_unlink probe.
  */
 int
 mac_posixsem_check_unlink(struct ucred *cred, struct ksem *ks)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixsem_check_unlink, cred, ks,
 	    ks->ks_label);
 	MAC_CHECK_PROBE2(posixsem_check_unlink, error, cred, ks);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(posixsem_check_wait, "struct ucred *",
     "struct ucred *", "struct ksem *");
 
 /*
  * Run the posixsem_check_wait policy check on semaphore `ks`, passing both
  * `active_cred` and `file_cred` to the policies.  Returns the value the
  * check macro leaves in `error`, which is also reported through the
  * posixsem_check_wait probe.
  */
 int
 mac_posixsem_check_wait(struct ucred *active_cred, struct ucred *file_cred,
     struct ksem *ks)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixsem_check_wait, active_cred, file_cred,
 	    ks, ks->ks_label);
 	MAC_CHECK_PROBE3(posixsem_check_wait, error, active_cred, file_cred,
 	    ks);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(posixsem_check_setmode, "struct ucred *",
     "struct ksem *", "mode_t");
 
 /*
  * Run the posixsem_check_setmode policy check for `cred` setting `mode` on
  * semaphore `ks`.  Returns the value the check macro leaves in `error`,
  * which is also reported through the posixsem_check_setmode probe.
  */
 int
 mac_posixsem_check_setmode(struct ucred *cred, struct ksem *ks, mode_t mode)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixsem_check_setmode, cred, ks,
 	    ks->ks_label, mode);
 	MAC_CHECK_PROBE3(posixsem_check_setmode, error, cred, ks, mode);
 
 	return (error);
 }
 
 /*
  * The second probe argument is the semaphore passed to
  * mac_posixsem_check_setowner(), so its type string must name
  * struct ksem, matching the other posixsem probes in this file.
  */
 MAC_CHECK_PROBE_DEFINE4(posixsem_check_setowner, "struct ucred *",
     "struct ksem *", "uid_t", "gid_t");
 
 /*
  * Run the posixsem_check_setowner policy check for `cred` changing the
  * owner of semaphore `ks` to `uid`/`gid`.  Returns the value the check
  * macro leaves in `error`, which is also reported through the
  * posixsem_check_setowner probe.
  */
 int
 mac_posixsem_check_setowner(struct ucred *cred, struct ksem *ks, uid_t uid,
     gid_t gid)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixsem_check_setowner, cred, ks,
 	    ks->ks_label, uid, gid);
 	MAC_CHECK_PROBE4(posixsem_check_setowner, error, cred, ks,
 	    uid, gid);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_posix_shm.c
===================================================================
--- head/sys/security/mac/mac_posix_shm.c	(revision 258540)
+++ head/sys/security/mac/mac_posix_shm.c	(revision 258541)
@@ -1,264 +1,263 @@
 /*-
  * Copyright (c) 2003-2006 SPARTA, Inc.
  * Copyright (c) 2009-2011 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc. 
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/mman.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Allocate a label from the label zone with M_WAITOK and run the
  * posixshm_init_label policy entry point on it.  Returns the new label.
  */
 static struct label *
 mac_posixshm_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(posixshm_init_label, label);
 	return (label);
 }
 
 /*
  * Attach label storage to a POSIX shared memory object, or set shm_label to
  * NULL when MPC_OBJECT_POSIXSHM is not set in mac_labeled.
  */
 void
 mac_posixshm_init(struct shmfd *shmfd)
 {
 
 	shmfd->shm_label = (mac_labeled & MPC_OBJECT_POSIXSHM) != 0 ?
 	    mac_posixshm_label_alloc() : NULL;
 }
 
 /*
  * Run the posixshm_destroy_label policy entry point on `label`, then return
  * its storage to the label zone.
  */
 static void
 mac_posixshm_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(posixshm_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release the label attached to a POSIX shared memory object, if any, and
  * clear shm_label.
  */
 void
 mac_posixshm_destroy(struct shmfd *shmfd)
 {
 
 	if (shmfd->shm_label == NULL)
 		return;
 
 	mac_posixshm_label_free(shmfd->shm_label);
 	shmfd->shm_label = NULL;
 }
 
 /*
  * Run the posixshm_create policy entry point with the credential `cred`,
  * the shared memory object and the object's label.
  */
 void
 mac_posixshm_create(struct ucred *cred, struct shmfd *shmfd)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(posixshm_create, cred, shmfd,
 	    shmfd->shm_label);
 }
 
 MAC_CHECK_PROBE_DEFINE2(posixshm_check_create, "struct ucred *",
     "const char *");
 
 /*
  * Run the posixshm_check_create policy check for `cred` and `path`.  No
  * object or label is passed to this check.  Returns the value the check
  * macro leaves in `error`, which is also reported through the
  * posixshm_check_create probe.
  */
 int
 mac_posixshm_check_create(struct ucred *cred, const char *path)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixshm_check_create, cred, path);
 	MAC_CHECK_PROBE2(posixshm_check_create, error, cred, path);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(posixshm_check_mmap, "struct ucred *",
     "struct shmfd *", "int", "int");
 
 /*
  * Run the posixshm_check_mmap policy check for `cred` mapping `shmfd` with
  * the given `prot` and `flags`.  Returns the value the check macro leaves
  * in `error`, which is also reported through the posixshm_check_mmap probe.
  */
 int
 mac_posixshm_check_mmap(struct ucred *cred, struct shmfd *shmfd, int prot,
     int flags)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixshm_check_mmap, cred, shmfd,
 	    shmfd->shm_label, prot, flags);
 	MAC_CHECK_PROBE4(posixshm_check_mmap, error, cred, shmfd, prot,
 	    flags);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(posixshm_check_open, "struct ucred *",
     "struct shmfd *", "accmode_t");
 
 /*
  * Run the posixshm_check_open policy check for `cred` opening `shmfd` with
  * access mode `accmode`.  Returns the value the check macro leaves in
  * `error`, which is also reported through the posixshm_check_open probe.
  */
 int
 mac_posixshm_check_open(struct ucred *cred, struct shmfd *shmfd,
     accmode_t accmode)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixshm_check_open, cred, shmfd,
 	    shmfd->shm_label, accmode);
 	MAC_CHECK_PROBE3(posixshm_check_open, error, cred, shmfd, accmode);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(posixshm_check_stat, "struct ucred *",
     "struct ucred *", "struct shmfd *");
 
 /*
  * Run the posixshm_check_stat policy check on `shmfd`, passing both
  * `active_cred` and `file_cred` to the policies.  Returns the value the
  * check macro leaves in `error`, which is also reported through the
  * posixshm_check_stat probe.
  */
 int
 mac_posixshm_check_stat(struct ucred *active_cred, struct ucred *file_cred,
     struct shmfd *shmfd)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixshm_check_stat, active_cred, file_cred,
 	    shmfd, shmfd->shm_label);
 	MAC_CHECK_PROBE3(posixshm_check_stat, error, active_cred, file_cred,
 	    shmfd);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(posixshm_check_truncate, "struct ucred *",
     "struct ucred *", "struct shmfd *");
 
 /*
  * Run the posixshm_check_truncate policy check on `shmfd`, passing both
  * `active_cred` and `file_cred` to the policies.  Returns the value the
  * check macro leaves in `error`, which is also reported through the
  * posixshm_check_truncate probe.
  */
 int
 mac_posixshm_check_truncate(struct ucred *active_cred, struct ucred *file_cred,
     struct shmfd *shmfd)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixshm_check_truncate, active_cred,
 	    file_cred, shmfd, shmfd->shm_label);
 	MAC_CHECK_PROBE3(posixshm_check_truncate, error, active_cred,
 	    file_cred, shmfd);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(posixshm_check_unlink, "struct ucred *",
     "struct shmfd *");
 
 /*
  * Run the posixshm_check_unlink policy check for `cred` on `shmfd`.
  * Returns the value the check macro leaves in `error`, which is also
  * reported through the posixshm_check_unlink probe.
  */
 int
 mac_posixshm_check_unlink(struct ucred *cred, struct shmfd *shmfd)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixshm_check_unlink, cred, shmfd,
 	    shmfd->shm_label);
 	MAC_CHECK_PROBE2(posixshm_check_unlink, error, cred, shmfd);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(posixshm_check_setmode, "struct ucred *",
     "struct shmfd *", "mode_t");
 
 /*
  * Run the posixshm_check_setmode policy check for `cred` setting `mode` on
  * `shmfd`.  Returns the value the check macro leaves in `error`, which is
  * also reported through the posixshm_check_setmode probe.
  */
 int
 mac_posixshm_check_setmode(struct ucred *cred, struct shmfd *shmfd, mode_t mode)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixshm_check_setmode, cred, shmfd,
 	    shmfd->shm_label, mode);
 	MAC_CHECK_PROBE3(posixshm_check_setmode, error, cred, shmfd, mode);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(posixshm_check_setowner, "struct ucred *",
     "struct shmfd *", "uid_t", "gid_t");
 
 /*
  * Run the posixshm_check_setowner policy check for `cred` changing the
  * owner of `shmfd` to `uid`/`gid`.  Returns the value the check macro
  * leaves in `error`, which is also reported through the
  * posixshm_check_setowner probe.
  */
 int
 mac_posixshm_check_setowner(struct ucred *cred, struct shmfd *shmfd, uid_t uid,
     gid_t gid)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixshm_check_setowner, cred, shmfd,
 	    shmfd->shm_label, uid, gid);
 	MAC_CHECK_PROBE4(posixshm_check_setowner, error, cred, shmfd,
 	    uid, gid);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(posixshm_check_read, "struct ucred *",
     "struct ucred *", "struct shmfd *");
 
 /*
  * Run the posixshm_check_read policy check on `shmfd`, passing both
  * `active_cred` and `file_cred` to the policies.  Returns the value the
  * check macro leaves in `error`, which is also reported through the
  * posixshm_check_read probe.
  */
 int
 mac_posixshm_check_read(struct ucred *active_cred, struct ucred *file_cred,
     struct shmfd *shmfd)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixshm_check_read, active_cred,
 	    file_cred, shmfd, shmfd->shm_label);
 	MAC_CHECK_PROBE3(posixshm_check_read, error, active_cred,
 	    file_cred, shmfd);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(posixshm_check_write, "struct ucred *",
     "struct ucred *", "struct shmfd *");
 
 /*
  * Run the posixshm_check_write policy check on `shmfd`, passing both
  * `active_cred` and `file_cred` to the policies.  Returns the value the
  * check macro leaves in `error`, which is also reported through the
  * posixshm_check_write probe.
  */
 int
 mac_posixshm_check_write(struct ucred *active_cred, struct ucred *file_cred,
     struct shmfd *shmfd)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(posixshm_check_write, active_cred,
 	    file_cred, shmfd, shmfd->shm_label);
 	MAC_CHECK_PROBE3(posixshm_check_write, error, active_cred,
 	    file_cred, shmfd);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_priv.c
===================================================================
--- head/sys/security/mac/mac_priv.c	(revision 258540)
+++ head/sys/security/mac/mac_priv.c	(revision 258541)
@@ -1,96 +1,95 @@
 /*-
  * Copyright (c) 2006 nCircle Network Security, Inc.
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson for the TrustedBSD
  * Project under contract to nCircle Network Security, Inc.
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY,
  * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * MAC checks for system privileges.
  */
 
 #include "sys/cdefs.h"
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/sdt.h>
 #include <sys/module.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * The MAC Framework interacts with kernel privilege checks in two ways: it
  * may restrict the granting of privilege to a subject, and it may grant
  * additional privileges to the subject.  Policies may implement none, one,
  * or both of these entry points.  Restriction of privilege by any policy
  * always overrides granting of privilege by any policy or other privilege
  * mechanism.  See kern_priv.c:priv_check_cred() for details of the
  * composition.
  */
 
 MAC_CHECK_PROBE_DEFINE2(priv_check, "struct ucred *", "int");
 
 /*
  * Restrict access to a privilege for a credential.  Return failure if any
  * policy denies access.
  *
  * Runs the priv_check policy check with cred and priv, fires the matching
  * probe with the outcome, and returns that outcome.  error is not assigned
  * directly here; it is presumably set by the MAC_POLICY_CHECK_NOSLEEP
  * expansion (defined elsewhere).
  */
 int
 mac_priv_check(struct ucred *cred, int priv)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(priv_check, cred, priv);
 	MAC_CHECK_PROBE2(priv_check, error, cred, priv);
 
 	return (error);
 }
 
 MAC_GRANT_PROBE_DEFINE2(priv_grant, "struct ucred *", "int");
 
 /*
  * Grant access to a privilege for a credential.  Return success if any
  * policy grants access.
  *
  * Runs the priv_grant policy entry point through the GRANT (rather than
  * CHECK) composition macro, fires the matching grant probe with the outcome,
  * and returns that outcome.  error is not assigned directly here; it is
  * presumably set by the MAC_POLICY_GRANT_NOSLEEP expansion (defined
  * elsewhere).
  */
 int
 mac_priv_grant(struct ucred *cred, int priv)
 {
 	int error;
 
 	MAC_POLICY_GRANT_NOSLEEP(priv_grant, cred, priv);
 	MAC_GRANT_PROBE2(priv_grant, error, cred, priv);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_process.c
===================================================================
--- head/sys/security/mac/mac_process.c	(revision 258540)
+++ head/sys/security/mac/mac_process.c	(revision 258541)
@@ -1,432 +1,431 @@
 /*-
  * Copyright (c) 1999-2002, 2008-2009 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2003 Networks Associates Technology, Inc.
  * Copyright (c) 2006 SPARTA, Inc.
  * Copyright (c) 2008 Apple Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/condvar.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mac.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Read-write integer knob "mmap_revocation" under the _security_mac sysctl
  * node, default 1.  When it is zero, mac_proc_vm_revoke_recurse() returns
  * immediately without examining the map.
  */
 static int	mac_mmap_revocation = 1;
 SYSCTL_INT(_security_mac, OID_AUTO, mmap_revocation, CTLFLAG_RW,
     &mac_mmap_revocation, 0, "Revoke mmap access to files on subject "
     "relabel");
 
 /*
  * Read-write integer knob "mmap_revocation_via_cow" under the _security_mac
  * sysctl node, default 0.  When it is zero, revoking write access in
  * mac_proc_vm_revoke_recurse() clears VM_PROT_WRITE from the map entry.
  */
 static int	mac_mmap_revocation_via_cow = 0;
 SYSCTL_INT(_security_mac, OID_AUTO, mmap_revocation_via_cow, CTLFLAG_RW,
     &mac_mmap_revocation_via_cow, 0, "Revoke mmap access to files via "
     "copy-on-write semantics, or by removing all write access");
 
 /* Forward declaration: the function is used before it is defined. */
 static void	mac_proc_vm_revoke_recurse(struct thread *td,
 		    struct ucred *cred, struct vm_map *map);
 
 /*
  * Allocate a label from the label zone with M_WAITOK and hand it to the
  * proc_init_label policy entry point before returning it.
  */
 static struct label *
 mac_proc_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(proc_init_label, label);
 	return (label);
 }
 
 /*
  * Set up p->p_label for a process: a freshly allocated label when the
  * MPC_OBJECT_PROC bit is set in mac_labeled, NULL otherwise.
  */
 void
 mac_proc_init(struct proc *p)
 {
 
 	p->p_label = (mac_labeled & MPC_OBJECT_PROC) ?
 	    mac_proc_label_alloc() : NULL;
 }
 
 /*
  * Hand the label to the proc_destroy_label policy entry point, then return
  * it to the label zone.
  */
 static void
 mac_proc_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(proc_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release the process label, if one is attached, and clear the pointer so
  * the label cannot be freed twice.
  */
 void
 mac_proc_destroy(struct proc *p)
 {
 
 	if (p->p_label == NULL)
 		return;
 
 	mac_proc_label_free(p->p_label);
 	p->p_label = NULL;
 }
 
 /*
  * Invoke the thread_userret policy entry point for td via
  * MAC_POLICY_PERFORM.
  */
 void
 mac_thread_userret(struct thread *td)
 {
 
 	MAC_POLICY_PERFORM(thread_userret, td);
 }
 
 /*
  * Copy in the optional user-supplied struct mac at mac_p, internalize its
  * label string into a newly allocated credential label, and store that
  * label in imgp->execlabel.
  *
  * Returns 0 when mac_p is NULL (nothing to do) or on success, EINVAL when
  * the MPC_OBJECT_CRED bit is not set in mac_labeled, and otherwise the
  * error from copyin(), mac_check_structmac_consistent(), copyinstr() or
  * mac_cred_internalize_label().  On failure imgp->execlabel is not written.
  */
 int
 mac_execve_enter(struct image_params *imgp, struct mac *mac_p)
 {
 	struct label *label;
 	struct mac mac;
 	char *buffer;
 	int error;
 
 	if (mac_p == NULL)
 		return (0);
 	if ((mac_labeled & MPC_OBJECT_CRED) == 0)
 		return (EINVAL);
 
 	/* Fetch and validate the user's struct mac. */
 	error = copyin(mac_p, &mac, sizeof(mac));
 	if (error == 0)
 		error = mac_check_structmac_consistent(&mac);
 	if (error != 0)
 		return (error);
 
 	/* Pull in the label string itself. */
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
 	if (error != 0) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	/* The string is only needed for the internalize step. */
 	label = mac_cred_label_alloc();
 	error = mac_cred_internalize_label(label, buffer);
 	free(buffer, M_MACTEMP);
 	if (error == 0) {
 		imgp->execlabel = label;
 		return (0);
 	}
 
 	mac_cred_label_free(label);
 	return (error);
 }
 
 /*
  * Free the credential label stored in imgp->execlabel, if any, and clear
  * the pointer.
  */
 void
 mac_execve_exit(struct image_params *imgp)
 {
 	if (imgp->execlabel == NULL)
 		return;
 
 	mac_cred_label_free(imgp->execlabel);
 	imgp->execlabel = NULL;
 }
 
 /*
  * Produce a private copy of the interpreter vnode's label in
  * *interpvplabel.  When the MPC_OBJECT_VNODE bit is not set in mac_labeled
  * no label is allocated and *interpvplabel is set to NULL.
  */
 void
 mac_execve_interpreter_enter(struct vnode *interpvp,
     struct label **interpvplabel)
 {
 
 	if ((mac_labeled & MPC_OBJECT_VNODE) == 0) {
 		*interpvplabel = NULL;
 		return;
 	}
 
 	*interpvplabel = mac_vnode_label_alloc();
 	mac_vnode_copy_label(interpvp->v_label, *interpvplabel);
 }
 
 /*
  * Free a vnode label handed out by mac_execve_interpreter_enter(); a NULL
  * label is ignored.
  */
 void
 mac_execve_interpreter_exit(struct label *interpvplabel)
 {
 
 	if (interpvplabel == NULL)
 		return;
 
 	mac_vnode_label_free(interpvplabel);
 }
 
 /*
  * When relabeling a process, call out to the policies for the maximum
  * permission allowed for each object type we know about in its memory space,
  * and revoke access (in the least surprising ways we know) when necessary.
  * The process lock is not held here.
  *
  * The process credential is referenced under the process lock, used for the
  * walk over the process's top-level vm_map, and released afterwards.
  */
 void
 mac_proc_vm_revoke(struct thread *td)
 {
 	struct ucred *cred;
 
 	/* Take our own reference so the credential outlives the unlock. */
 	PROC_LOCK(td->td_proc);
 	cred = crhold(td->td_proc->p_ucred);
 	PROC_UNLOCK(td->td_proc);
 
 	/* XXX freeze all other threads */
 	mac_proc_vm_revoke_recurse(td, cred,
 	    &td->td_proc->p_vmspace->vm_map);
 	/* XXX allow other threads to continue */
 
 	crfree(cred);
 }
 
 /*
  * Render the read/write/execute bits of prot (masked with VM_PROT_ALL) as a
  * three-character "rwx"-style string.  Any masked value that matches none
  * of the combinations below, including zero, yields "---".
  */
 static __inline const char *
 prot2str(vm_prot_t prot)
 {
 
 	if ((prot & VM_PROT_ALL) == VM_PROT_READ)
 		return ("r--");
 	if ((prot & VM_PROT_ALL) == (VM_PROT_READ | VM_PROT_WRITE))
 		return ("rw-");
 	if ((prot & VM_PROT_ALL) == (VM_PROT_READ | VM_PROT_EXECUTE))
 		return ("r-x");
 	if ((prot & VM_PROT_ALL) ==
 	    (VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE))
 		return ("rwx");
 	if ((prot & VM_PROT_ALL) == VM_PROT_WRITE)
 		return ("-w-");
 	if ((prot & VM_PROT_ALL) == VM_PROT_EXECUTE)
 		return ("--x");
 	if ((prot & VM_PROT_ALL) == (VM_PROT_WRITE | VM_PROT_EXECUTE))
 		return ("-wx");
 	return ("---");
 }
 
 /*
  * Walk every entry of map (recursing into sub-maps) and, for entries whose
  * deepest backing object is of type OBJT_VNODE, ask
  * mac_vnode_check_mmap_downgrade() which of the entry's max_protection bits
  * cred may keep.  Bits that are no longer allowed are removed from the
  * entry; when they are currently in use the pmap protections are tightened
  * as well.  Does nothing when mac_mmap_revocation is zero.  The map is
  * locked for the duration of the walk.
  */
 static void
 mac_proc_vm_revoke_recurse(struct thread *td, struct ucred *cred,
     struct vm_map *map)
 {
 	vm_map_entry_t vme;
 	int result;
 	vm_prot_t revokeperms;
 	vm_object_t backing_object, object;
 	vm_ooffset_t offset;
 	struct vnode *vp;
 	struct mount *mp;
 
 	if (!mac_mmap_revocation)
 		return;
 
 	vm_map_lock(map);
 	for (vme = map->header.next; vme != &map->header; vme = vme->next) {
 		if (vme->eflags & MAP_ENTRY_IS_SUB_MAP) {
 			mac_proc_vm_revoke_recurse(td, cred,
 			    vme->object.sub_map);
 			continue;
 		}
 		/*
 		 * Skip over entries that obviously are not shared.
 		 */
 		if (vme->eflags & (MAP_ENTRY_COW | MAP_ENTRY_NOSYNC) ||
 		    !vme->max_protection)
 			continue;
 		/*
 		 * Drill down to the deepest backing object.
 		 */
 		offset = vme->offset;
 		object = vme->object.vm_object;
 		if (object == NULL)
 			continue;
 		/*
 		 * Hand-over-hand read locking: the next object in the chain
 		 * is locked before the current one is released, and offset
 		 * accumulates each level's backing_object_offset.
 		 */
 		VM_OBJECT_RLOCK(object);
 		while ((backing_object = object->backing_object) != NULL) {
 			VM_OBJECT_RLOCK(backing_object);
 			offset += object->backing_object_offset;
 			VM_OBJECT_RUNLOCK(object);
 			object = backing_object;
 		}
 		VM_OBJECT_RUNLOCK(object);
 		/*
 		 * At the moment, vm_maps and objects aren't considered by
 		 * the MAC system, so only things with backing by a normal
 		 * object (read: vnodes) are checked.
 		 *
 		 * NOTE(review): object->type and object->handle are read
 		 * after the object lock has been dropped -- confirm this is
 		 * safe here.
 		 */
 		if (object->type != OBJT_VNODE)
 			continue;
 		vp = (struct vnode *)object->handle;
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		/* result starts as the current maximum and is passed by address. */
 		result = vme->max_protection;
 		mac_vnode_check_mmap_downgrade(cred, vp, &result);
 		VOP_UNLOCK(vp, 0);
 		/*
 		 * Find out what maximum protection we may be allowing now
 		 * but a policy needs to get removed.
 		 */
 		revokeperms = vme->max_protection & ~result;
 		if (!revokeperms)
 			continue;
 		printf("pid %ld: revoking %s perms from %#lx:%ld "
 		    "(max %s/cur %s)\n", (long)td->td_proc->p_pid,
 		    prot2str(revokeperms), (u_long)vme->start,
 		    (long)(vme->end - vme->start),
 		    prot2str(vme->max_protection), prot2str(vme->protection));
 		/*
 		 * This is the really simple case: if a map has more
 		 * max_protection than is allowed, but it's not being
 		 * actually used (that is, the current protection is still
 		 * allowed), we can just wipe it out and do nothing more.
 		 */
 		if ((vme->protection & revokeperms) == 0) {
 			/*
 			 * revokeperms is a subset of max_protection, so the
 			 * subtraction only clears those bits.
 			 */
 			vme->max_protection -= revokeperms;
 		} else {
 			if (revokeperms & VM_PROT_WRITE) {
 				/*
 				 * In the more complicated case, flush out all
 				 * pending changes to the object then turn it
 				 * copy-on-write.
 				 */
 				vm_object_reference(object);
 				(void) vn_start_write(vp, &mp, V_WAIT);
 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 				VM_OBJECT_WLOCK(object);
 				vm_object_page_clean(object, offset, offset +
 				    vme->end - vme->start, OBJPC_SYNC);
 				VM_OBJECT_WUNLOCK(object);
 				VOP_UNLOCK(vp, 0);
 				vn_finished_write(mp);
 				vm_object_deallocate(object);
 				/*
 				 * Why bother if there's no read permissions
 				 * anymore?  For the rest, we need to leave
 				 * the write permissions on for COW, or
 				 * remove them entirely if configured to.
 				 *
 				 * NOTE(review): the second "if" below is a
 				 * separate statement, not an "else if", so
 				 * the COW flags are set whenever read access
 				 * is kept, regardless of
 				 * mac_mmap_revocation_via_cow.  Confirm
 				 * whether an "else" was intended.
 				 */
 				if (!mac_mmap_revocation_via_cow) {
 					vme->max_protection &= ~VM_PROT_WRITE;
 					vme->protection &= ~VM_PROT_WRITE;
 				} if ((revokeperms & VM_PROT_READ) == 0)
 					vme->eflags |= MAP_ENTRY_COW |
 					    MAP_ENTRY_NEEDS_COPY;
 			}
 			if (revokeperms & VM_PROT_EXECUTE) {
 				vme->max_protection &= ~VM_PROT_EXECUTE;
 				vme->protection &= ~VM_PROT_EXECUTE;
 			}
 			/* Losing read access drops every permission. */
 			if (revokeperms & VM_PROT_READ) {
 				vme->max_protection = 0;
 				vme->protection = 0;
 			}
 			/* Push the reduced protection down to the pmap. */
 			pmap_protect(map->pmap, vme->start, vme->end,
 			    vme->protection & ~revokeperms);
 			vm_map_simplify_entry(map, vme);
 		}
 	}
 	vm_map_unlock(map);
 }
 
 MAC_CHECK_PROBE_DEFINE2(proc_check_debug, "struct ucred *", "struct proc *");
 
 /*
  * Run the proc_check_debug policy check with cred and p, fire the matching
  * probe with the outcome, and return that outcome.  Asserts that the lock
  * of p is owned on entry.  error is not assigned directly here; it is
  * presumably set by the MAC_POLICY_CHECK_NOSLEEP expansion (defined
  * elsewhere).
  */
 int
 mac_proc_check_debug(struct ucred *cred, struct proc *p)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_POLICY_CHECK_NOSLEEP(proc_check_debug, cred, p);
 	MAC_CHECK_PROBE2(proc_check_debug, error, cred, p);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(proc_check_sched, "struct ucred *", "struct proc *");
 
 /*
  * Run the proc_check_sched policy check with cred and p, fire the matching
  * probe with the outcome, and return that outcome.  Asserts that the lock
  * of p is owned on entry.  error is not assigned directly here; it is
  * presumably set by the MAC_POLICY_CHECK_NOSLEEP expansion (defined
  * elsewhere).
  */
 int
 mac_proc_check_sched(struct ucred *cred, struct proc *p)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_POLICY_CHECK_NOSLEEP(proc_check_sched, cred, p);
 	MAC_CHECK_PROBE2(proc_check_sched, error, cred, p);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(proc_check_signal, "struct ucred *", "struct proc *",
     "int");
 
 /*
  * Run the proc_check_signal policy check with cred, p and signum, fire the
  * matching probe with the outcome, and return that outcome.  Asserts that
  * the lock of p is owned on entry.  error is not assigned directly here; it
  * is presumably set by the MAC_POLICY_CHECK_NOSLEEP expansion (defined
  * elsewhere).
  */
 int
 mac_proc_check_signal(struct ucred *cred, struct proc *p, int signum)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_POLICY_CHECK_NOSLEEP(proc_check_signal, cred, p, signum);
 	MAC_CHECK_PROBE3(proc_check_signal, error, cred, p, signum);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(proc_check_wait, "struct ucred *", "struct proc *");
 
 /*
  * Run the proc_check_wait policy check with cred and p, fire the matching
  * probe with the outcome, and return that outcome.  Asserts that the lock
  * of p is owned on entry.  error is not assigned directly here; it is
  * presumably set by the MAC_POLICY_CHECK_NOSLEEP expansion (defined
  * elsewhere).
  */
 int
 mac_proc_check_wait(struct ucred *cred, struct proc *p)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_POLICY_CHECK_NOSLEEP(proc_check_wait, cred, p);
 	MAC_CHECK_PROBE2(proc_check_wait, error, cred, p);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_socket.c
===================================================================
--- head/sys/security/mac/mac_socket.c	(revision 258540)
+++ head/sys/security/mac/mac_socket.c	(revision 258541)
@@ -1,634 +1,633 @@
 /*-
  * Copyright (c) 1999-2002, 2009 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2005 Networks Associates Technology, Inc.
  * Copyright (c) 2005-2006 SPARTA, Inc.
  * Copyright (c) 2008 Apple Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by McAfee
  * Research, the Technology Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mac.h>
 #include <sys/sbuf.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/bpfdesc.h>
 #include <net/if.h>
 #include <net/if_var.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Currently, sockets hold two labels: the label of the socket itself, and a
  * peer label, which may be used by policies to hold a copy of the label of
  * any remote endpoint.
  *
  * Possibly, this peer label should be maintained at the protocol layer
  * (inpcb, unpcb, etc), as this would allow protocol-aware code to maintain
  * the label consistently.  For example, it might be copied live from a
  * remote socket for UNIX domain sockets rather than keeping a local copy on
  * this endpoint, but be cached and updated based on packets received for
  * TCP/IP.
  *
  * Unlike with many other object types, the lock protecting MAC labels on
  * sockets (the socket lock) is not frequently held at the points in code
  * where socket-related checks are called.  The MAC Framework acquires the
  * lock over some entry points in order to enforce atomicity (such as label
  * copies) but in other cases the policy modules will have to acquire the
  * lock themselves if they use labels.  This approach (a) avoids lock
  * acquisitions when policies don't require labels and (b) solves a number of
  * potential lock order issues when multiple sockets are used in the same
  * entry point.
  */
 
 /*
  * Allocate a label from the label zone using flag and run the
  * socket_init_label policy entry point on it: through MAC_POLICY_CHECK when
  * flag has M_WAITOK set, through MAC_POLICY_CHECK_NOSLEEP otherwise.
  *
  * Returns the label, or NULL when the zone allocation fails or error is
  * non-zero after the policy call (error is presumably set by those macros,
  * which are defined elsewhere).  In the latter case the label is passed to
  * socket_destroy_label and freed before returning.
  */
 struct label *
 mac_socket_label_alloc(int flag)
 {
 	struct label *label;
 	int error;
 
 	label = mac_labelzone_alloc(flag);
 	if (label == NULL)
 		return (NULL);
 
 	if (flag & M_WAITOK)
 		MAC_POLICY_CHECK(socket_init_label, label, flag);
 	else
 		MAC_POLICY_CHECK_NOSLEEP(socket_init_label, label, flag);
 	if (error) {
 		MAC_POLICY_PERFORM_NOSLEEP(socket_destroy_label, label);
 		mac_labelzone_free(label);
 		return (NULL);
 	}
 	return (label);
 }
 
 /*
  * Allocate a label from the label zone using flag and run the
  * socketpeer_init_label policy entry point on it: through MAC_POLICY_CHECK
  * when flag has M_WAITOK set, through MAC_POLICY_CHECK_NOSLEEP otherwise.
  *
  * Returns the label, or NULL when the zone allocation fails or error is
  * non-zero after the policy call (error is presumably set by those macros,
  * which are defined elsewhere).  In the latter case the label is passed to
  * socketpeer_destroy_label and freed before returning.
  */
 static struct label *
 mac_socketpeer_label_alloc(int flag)
 {
 	struct label *label;
 	int error;
 
 	label = mac_labelzone_alloc(flag);
 	if (label == NULL)
 		return (NULL);
 
 	if (flag & M_WAITOK)
 		MAC_POLICY_CHECK(socketpeer_init_label, label, flag);
 	else
 		MAC_POLICY_CHECK_NOSLEEP(socketpeer_init_label, label, flag);
 	if (error) {
 		MAC_POLICY_PERFORM_NOSLEEP(socketpeer_destroy_label, label);
 		mac_labelzone_free(label);
 		return (NULL);
 	}
 	return (label);
 }
 
 /*
  * Attach the socket label and the peer label to so.
  *
  * When the MPC_OBJECT_SOCKET bit is not set in mac_labeled both pointers
  * are set to NULL and 0 is returned.  Otherwise both labels are allocated
  * with flag; if either allocation fails ENOMEM is returned, and a socket
  * label that was already allocated is freed and its pointer cleared.
  */
 int
 mac_socket_init(struct socket *so, int flag)
 {
 
 	if ((mac_labeled & MPC_OBJECT_SOCKET) == 0) {
 		so->so_label = NULL;
 		so->so_peerlabel = NULL;
 		return (0);
 	}
 
 	so->so_label = mac_socket_label_alloc(flag);
 	if (so->so_label == NULL)
 		return (ENOMEM);
 
 	so->so_peerlabel = mac_socketpeer_label_alloc(flag);
 	if (so->so_peerlabel != NULL)
 		return (0);
 
 	/* No peer label: back out the socket label allocated above. */
 	mac_socket_label_free(so->so_label);
 	so->so_label = NULL;
 	return (ENOMEM);
 }
 
 /*
  * Hand the label to the socket_destroy_label policy entry point, then
  * return it to the label zone.
  */
 void
 mac_socket_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Hand the label to the socketpeer_destroy_label policy entry point, then
  * return it to the label zone.
  */
 static void
 mac_socketpeer_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(socketpeer_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release both labels of so and clear the pointers.  Only so->so_label is
  * tested: when it is NULL nothing is done, otherwise the peer label is
  * freed along with it.
  */
 void
 mac_socket_destroy(struct socket *so)
 {
 
 	if (so->so_label == NULL)
 		return;
 
 	mac_socket_label_free(so->so_label);
 	so->so_label = NULL;
 	mac_socketpeer_label_free(so->so_peerlabel);
 	so->so_peerlabel = NULL;
 }
 
 /*
  * Invoke the socket_copy_label policy entry point with src and dest.
  */
 void
 mac_socket_copy_label(struct label *src, struct label *dest)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_copy_label, src, dest);
 }
 
 /*
  * Externalize label for the "socket" object type through
  * MAC_POLICY_EXTERNALIZE, passing the requested elements and the output
  * buffer outbuf of outbuflen bytes.  Returns error, which is not assigned
  * directly here and is presumably set by the macro (defined elsewhere).
  */
 int
 mac_socket_externalize_label(struct label *label, char *elements,
     char *outbuf, size_t outbuflen)
 {
 	int error;
 
 	MAC_POLICY_EXTERNALIZE(socket, label, elements, outbuf, outbuflen);
 
 	return (error);
 }
 
 /*
  * Externalize label for the "socketpeer" object type through
  * MAC_POLICY_EXTERNALIZE, passing the requested elements and the output
  * buffer outbuf of outbuflen bytes.  Returns error, which is not assigned
  * directly here and is presumably set by the macro (defined elsewhere).
  */
 static int
 mac_socketpeer_externalize_label(struct label *label, char *elements,
     char *outbuf, size_t outbuflen)
 {
 	int error;
 
 	MAC_POLICY_EXTERNALIZE(socketpeer, label, elements, outbuf,
 	    outbuflen);
 
 	return (error);
 }
 
 /*
  * Internalize string into label for the "socket" object type through
  * MAC_POLICY_INTERNALIZE.  Returns error, which is not assigned directly
  * here and is presumably set by the macro (defined elsewhere).
  */
 int
 mac_socket_internalize_label(struct label *label, char *string)
 {
 	int error;
 
 	MAC_POLICY_INTERNALIZE(socket, label, string);
 
 	return (error);
 }
 
 /*
  * Invoke the socket_create policy entry point with cred, so and the
  * socket's label.
  */
 void
 mac_socket_create(struct ucred *cred, struct socket *so)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_create, cred, so, so->so_label);
 }
 
 /*
  * Invoke the socket_newconn policy entry point with oldso and newso, each
  * accompanied by its own socket label.
  */
 void
 mac_socket_newconn(struct socket *oldso, struct socket *newso)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_newconn, oldso, oldso->so_label,
 	    newso, newso->so_label);
 }
 
 /*
  * Invoke the socket_relabel policy entry point with cred, so, the socket's
  * current label and newlabel.  Asserts the socket lock on entry.
  */
 static void
 mac_socket_relabel(struct ucred *cred, struct socket *so,
     struct label *newlabel)
 {
 
 	SOCK_LOCK_ASSERT(so);
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_relabel, cred, so, so->so_label,
 	    newlabel);
 }
 
 /*
  * Invoke the socketpeer_set_from_mbuf policy entry point with m, the label
  * obtained from m via mac_mbuf_to_label(), so and the socket's peer label.
  * Returns early, without looking up the mbuf label, when mac_policy_count
  * is zero.
  */
 void
 mac_socketpeer_set_from_mbuf(struct mbuf *m, struct socket *so)
 {
 	struct label *label;
 
 	if (mac_policy_count == 0)
 		return;
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(socketpeer_set_from_mbuf, m, label, so,
 	    so->so_peerlabel);
 }
 
 /*
  * Invoke the socketpeer_set_from_socket policy entry point with oldso and
  * its socket label, and newso and its peer label.  Returns early when
  * mac_policy_count is zero.
  */
 void
 mac_socketpeer_set_from_socket(struct socket *oldso, struct socket *newso)
 {
 	
 	if (mac_policy_count == 0)
 		return;
 
 	MAC_POLICY_PERFORM_NOSLEEP(socketpeer_set_from_socket, oldso,
 	    oldso->so_label, newso, newso->so_peerlabel);
 }
 
 /*
  * Invoke the socket_create_mbuf policy entry point with so, the socket's
  * label, m and the label obtained from m via mac_mbuf_to_label().  Returns
  * early, without looking up the mbuf label, when mac_policy_count is zero.
  */
 void
 mac_socket_create_mbuf(struct socket *so, struct mbuf *m)
 {
 	struct label *label;
 
 	if (mac_policy_count == 0)
 		return;
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_create_mbuf, so, so->so_label, m,
 	    label);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_accept, "struct ucred *",
     "struct socket *");
 
 /*
  * Run the socket_check_accept policy check with cred, so and the socket's
  * label, fire the matching probe with the outcome, and return that outcome.
  * error is not assigned directly here; it is presumably set by the
  * MAC_POLICY_CHECK_NOSLEEP expansion (defined elsewhere).
  */
 int
 mac_socket_check_accept(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_accept, cred, so,
 	    so->so_label);
 	MAC_CHECK_PROBE2(socket_check_accept, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(socket_check_bind, "struct ucred *",
     "struct socket *", "struct sockaddr *");
 
 /*
  * Run the socket_check_bind policy check with cred, so, the socket's label
  * and the address sa, fire the matching probe with the outcome, and return
  * that outcome.  error is not assigned directly here; it is presumably set
  * by the MAC_POLICY_CHECK_NOSLEEP expansion (defined elsewhere).
  */
 int
 mac_socket_check_bind(struct ucred *cred, struct socket *so,
     struct sockaddr *sa)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_bind, cred, so, so->so_label,
 	    sa);
 	MAC_CHECK_PROBE3(socket_check_bind, error, cred, so, sa);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(socket_check_connect, "struct ucred *",
     "struct socket *", "struct sockaddr *");
 
 /*
  * Run the socket_check_connect policy check with cred, so, the socket's
  * label and the address sa, fire the matching probe with the outcome, and
  * return that outcome.  error is not assigned directly here; it is
  * presumably set by the MAC_POLICY_CHECK_NOSLEEP expansion (defined
  * elsewhere).
  */
 int
 mac_socket_check_connect(struct ucred *cred, struct socket *so,
     struct sockaddr *sa)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_connect, cred, so,
 	    so->so_label, sa);
 	MAC_CHECK_PROBE3(socket_check_connect, error, cred, so, sa);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(socket_check_create, "struct ucred *", "int", "int",
     "int");
 
 /*
  * Run the socket_check_create policy check with cred and the requested
  * domain, type and proto, fire the matching probe with the outcome, and
  * return that outcome.  No socket or label is involved in this check.
  * error is not assigned directly here; it is presumably set by the
  * MAC_POLICY_CHECK_NOSLEEP expansion (defined elsewhere).
  */
 int
 mac_socket_check_create(struct ucred *cred, int domain, int type, int proto)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_create, cred, domain, type,
 	    proto);
 	MAC_CHECK_PROBE4(socket_check_create, error, cred, domain, type,
 	    proto);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_deliver, "struct socket *",
     "struct mbuf *");
 
 /*
  * Run the socket_check_deliver policy check with so, the socket's label, m
  * and the label obtained from m via mac_mbuf_to_label(), fire the matching
  * probe with the outcome, and return that outcome.  Returns 0 immediately,
  * without firing the probe, when mac_policy_count is zero.  error is not
  * assigned directly here; it is presumably set by the
  * MAC_POLICY_CHECK_NOSLEEP expansion (defined elsewhere).
  */
 int
 mac_socket_check_deliver(struct socket *so, struct mbuf *m)
 {
 	struct label *label;
 	int error;
 
 	if (mac_policy_count == 0)
 		return (0);
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_deliver, so, so->so_label, m,
 	    label);
 	MAC_CHECK_PROBE2(socket_check_deliver, error, so, m);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_listen, "struct ucred *",
     "struct socket *");
 
 /*
  * Run the socket_check_listen policy check with cred, so and the socket's
  * label, fire the matching probe with the outcome, and return that outcome.
  * error is not assigned directly here; it is presumably set by the
  * MAC_POLICY_CHECK_NOSLEEP expansion (defined elsewhere).
  */
 int
 mac_socket_check_listen(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_listen, cred, so,
 	    so->so_label);
 	MAC_CHECK_PROBE2(socket_check_listen, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_poll, "struct ucred *",
     "struct socket *");
 
 /*
  * Run the socket_check_poll policy check with cred, so and the socket's
  * label, fire the matching probe with the outcome, and return that outcome.
  * error is not assigned directly here; it is presumably set by the
  * MAC_POLICY_CHECK_NOSLEEP expansion (defined elsewhere).
  */
 int
 mac_socket_check_poll(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_poll, cred, so, so->so_label);
 	MAC_CHECK_PROBE2(socket_check_poll, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_receive, "struct ucred *",
     "struct socket *");
 
 /*
  * Run the socket_check_receive policy check with cred, so and the socket's
  * label, fire the matching probe with the outcome, and return that outcome.
  * error is not assigned directly here; it is presumably set by the
  * MAC_POLICY_CHECK_NOSLEEP expansion (defined elsewhere).
  */
 int
 mac_socket_check_receive(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_receive, cred, so,
 	    so->so_label);
 	MAC_CHECK_PROBE2(socket_check_receive, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(socket_check_relabel, "struct ucred *",
     "struct socket *", "struct label *");
 
 /*
  * Run the socket_check_relabel policy check with cred, so, the socket's
  * current label and newlabel, fire the matching probe with the outcome, and
  * return that outcome.  Asserts the socket lock on entry.  error is not
  * assigned directly here; it is presumably set by the
  * MAC_POLICY_CHECK_NOSLEEP expansion (defined elsewhere).
  */
 static int
 mac_socket_check_relabel(struct ucred *cred, struct socket *so,
     struct label *newlabel)
 {
 	int error;
 
 	SOCK_LOCK_ASSERT(so);
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_relabel, cred, so,
 	    so->so_label, newlabel);
 	MAC_CHECK_PROBE3(socket_check_relabel, error, cred, so, newlabel);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_send, "struct ucred *",
     "struct socket *");
 
 /*
  * Run the socket_check_send policy check with cred, so and the socket's
  * label, fire the matching probe with the outcome, and return that outcome.
  * error is not assigned directly here; it is presumably set by the
  * MAC_POLICY_CHECK_NOSLEEP expansion (defined elsewhere).
  */
 int
 mac_socket_check_send(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_send, cred, so, so->so_label);
 	MAC_CHECK_PROBE2(socket_check_send, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_stat, "struct ucred *",
     "struct socket *");
 
 /*
  * Run the socket_check_stat policy check with cred, so and the socket's
  * label, fire the matching probe with the outcome, and return that outcome.
  * error is not assigned directly here; it is presumably set by the
  * MAC_POLICY_CHECK_NOSLEEP expansion (defined elsewhere).
  */
 int
 mac_socket_check_stat(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_stat, cred, so, so->so_label);
 	MAC_CHECK_PROBE2(socket_check_stat, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_visible, "struct ucred *",
     "struct socket *");
 
 /*
  * Run the socket_check_visible policy check with cred, so and the socket's
  * label, fire the matching probe with the outcome, and return that outcome.
  * error is not assigned directly here; it is presumably set by the
  * MAC_POLICY_CHECK_NOSLEEP expansion (defined elsewhere).
  */
 int
 mac_socket_check_visible(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_visible, cred, so,
 	    so->so_label);
 	MAC_CHECK_PROBE2(socket_check_visible, error, cred, so);
 
 	return (error);
 }
 
 /*
  * Check whether cred may relabel so to label and, if so, apply the new
  * label; both steps run under the socket lock.  Returns the error from the
  * relabel check, or 0 once the label has been applied and the protocol (if
  * it provides pru_sosetlabel) has been notified.
  */
 int
 mac_socket_label_set(struct ucred *cred, struct socket *so,
     struct label *label)
 {
 	int error;
 
 	/*
 	 * The check and the relabel happen atomically with respect to the
 	 * socket lock.  The lock is dropped before calling into the protocol
 	 * because the pcb code needs the pcb lock, which precedes the socket
 	 * lock in the lock order.  A race in that window is harmless: the
 	 * inpcb is merely refreshed twice, still consistently, since the
 	 * inpcb code takes the socket lock itself before refreshing.
 	 */
 	SOCK_LOCK(so);
 	error = mac_socket_check_relabel(cred, so, label);
 	if (error == 0)
 		mac_socket_relabel(cred, so, label);
 	SOCK_UNLOCK(so);
 
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Let the protocol know about the change if it asked to be told,
 	 * e.g. so it can propagate the new label to a cached pcb label.
 	 * The socket lock is no longer held at this point.
 	 */
 	if (so->so_proto->pr_usrreqs->pru_sosetlabel != NULL)
 		(so->so_proto->pr_usrreqs->pru_sosetlabel)(so);
 
 	return (0);
 }
 
 /*
  * Set the label of so from the textual label described by mac.
  *
  * mac itself is dereferenced directly; only the string it points at
  * (mac->m_string, mac->m_buflen bytes) is copied in with copyinstr().
  * Returns EINVAL when the MPC_OBJECT_SOCKET bit is not set in mac_labeled,
  * otherwise the first error from validation, copy-in, internalization or
  * mac_socket_label_set(), or 0 on success.  The temporary label is always
  * freed before returning.
  */
 int
 mac_setsockopt_label(struct ucred *cred, struct socket *so, struct mac *mac)
 {
 	struct label *intlabel;
 	char *buffer;
 	int error;
 
 	if ((mac_labeled & MPC_OBJECT_SOCKET) == 0)
 		return (EINVAL);
 
 	error = mac_check_structmac_consistent(mac);
 	if (error != 0)
 		return (error);
 
 	buffer = malloc(mac->m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac->m_string, buffer, mac->m_buflen, NULL);
 	if (error != 0) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	/* Parse the string into a scratch label, then try to install it. */
 	intlabel = mac_socket_label_alloc(M_WAITOK);
 	error = mac_socket_internalize_label(intlabel, buffer);
 	free(buffer, M_MACTEMP);
 	if (error == 0)
 		error = mac_socket_label_set(cred, so, intlabel);
 
 	mac_socket_label_free(intlabel);
 	return (error);
 }
 
 /*
  * Report the label of so in textual form.
  *
  * The element list is read from mac->m_string (mac->m_buflen bytes at most)
  * and the externalized label is written back to the same user buffer,
  * including its terminating NUL.  The socket's label is copied into a
  * scratch label under the socket lock and externalized from that copy
  * after the lock is dropped.  Returns EINVAL when the MPC_OBJECT_SOCKET bit
  * is not set in mac_labeled, otherwise the first error encountered, or 0.
  */
 int
 mac_getsockopt_label(struct ucred *cred, struct socket *so, struct mac *mac)
 {
 	char *buffer, *elements;
 	struct label *intlabel;
 	int error;
 
 	if ((mac_labeled & MPC_OBJECT_SOCKET) == 0)
 		return (EINVAL);
 
 	error = mac_check_structmac_consistent(mac);
 	if (error != 0)
 		return (error);
 
 	elements = malloc(mac->m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac->m_string, elements, mac->m_buflen, NULL);
 	if (error != 0)
 		goto free_elements;
 
 	/* Zeroed so that strlen() below always finds a terminator. */
 	buffer = malloc(mac->m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 
 	/* Snapshot the label under the lock; externalize the snapshot. */
 	intlabel = mac_socket_label_alloc(M_WAITOK);
 	SOCK_LOCK(so);
 	mac_socket_copy_label(so->so_label, intlabel);
 	SOCK_UNLOCK(so);
 	error = mac_socket_externalize_label(intlabel, elements, buffer,
 	    mac->m_buflen);
 	mac_socket_label_free(intlabel);
 
 	if (error == 0)
 		error = copyout(buffer, mac->m_string, strlen(buffer)+1);
 
 	free(buffer, M_MACTEMP);
 free_elements:
 	free(elements, M_MACTEMP);
 
 	return (error);
 }
 
 /*
  * Report the peer label of so in textual form.
  *
  * The element list is read from mac->m_string (mac->m_buflen bytes at most)
  * and the externalized label is written back to the same user buffer,
  * including its terminating NUL.  The peer label is copied into a scratch
  * label under the socket lock and externalized from that copy after the
  * lock is dropped.  Returns EINVAL when the MPC_OBJECT_SOCKET bit is not
  * set in mac_labeled, otherwise the first error encountered, or 0.
  *
  * NOTE(review): the scratch label is allocated, filled and freed with the
  * mac_socket_* helpers (socket_init_label / socket_copy_label /
  * socket_destroy_label) even though it holds a peer label and is
  * externalized as "socketpeer" -- confirm this is intended.
  */
 int
 mac_getsockopt_peerlabel(struct ucred *cred, struct socket *so,
     struct mac *mac)
 {
 	char *elements, *buffer;
 	struct label *intlabel;
 	int error;
 
 	if ((mac_labeled & MPC_OBJECT_SOCKET) == 0)
 		return (EINVAL);
 
 	error = mac_check_structmac_consistent(mac);
 	if (error != 0)
 		return (error);
 
 	elements = malloc(mac->m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac->m_string, elements, mac->m_buflen, NULL);
 	if (error != 0)
 		goto free_elements;
 
 	/* Zeroed so that strlen() below always finds a terminator. */
 	buffer = malloc(mac->m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 
 	/* Snapshot the label under the lock; externalize the snapshot. */
 	intlabel = mac_socket_label_alloc(M_WAITOK);
 	SOCK_LOCK(so);
 	mac_socket_copy_label(so->so_peerlabel, intlabel);
 	SOCK_UNLOCK(so);
 	error = mac_socketpeer_externalize_label(intlabel, elements, buffer,
 	    mac->m_buflen);
 	mac_socket_label_free(intlabel);
 
 	if (error == 0)
 		error = copyout(buffer, mac->m_string, strlen(buffer)+1);
 
 	free(buffer, M_MACTEMP);
 free_elements:
 	free(elements, M_MACTEMP);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_system.c
===================================================================
--- head/sys/security/mac/mac_system.c	(revision 258540)
+++ head/sys/security/mac/mac_system.c	(revision 258541)
@@ -1,237 +1,236 @@
 /*-
  * Copyright (c) 2002-2003 Networks Associates Technology, Inc.
  * Copyright (c) 2006 SPARTA, Inc.
  * Copyright (c) 2007, 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * Portions of this software were developed by Robert Watson for the
  * TrustedBSD Project.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * MAC Framework entry points relating to overall operation of system,
  * including global services such as the kernel environment and loadable
  * modules.
  *
  * System checks often align with existing privilege checks, but provide
  * additional security context that may be relevant to policies, such as the
  * specific object being operated on.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/sysctl.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 MAC_CHECK_PROBE_DEFINE1(kenv_check_dump, "struct ucred *");
 
 /*
  * Entry point for the kenv_check_dump MAC check.  Runs the policy check for
  * cred through MAC_POLICY_CHECK_NOSLEEP (the macro, defined elsewhere, is
  * expected to set 'error'), hands the result to the matching probe macro,
  * and returns it.
  */
 int
 mac_kenv_check_dump(struct ucred *cred)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(kenv_check_dump, cred);
 	MAC_CHECK_PROBE1(kenv_check_dump, error, cred);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(kenv_check_get, "struct ucred *", "char *");
 
 /*
  * Entry point for the kenv_check_get MAC check on the variable 'name'.
  * 'error' is expected to be set by MAC_POLICY_CHECK_NOSLEEP (defined
  * elsewhere); it is then reported to the probe and returned.
  */
 int
 mac_kenv_check_get(struct ucred *cred, char *name)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(kenv_check_get, cred, name);
 	MAC_CHECK_PROBE2(kenv_check_get, error, cred, name);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(kenv_check_set, "struct ucred *", "char *",
     "char *");
 
 /*
  * Entry point for the kenv_check_set MAC check; both 'name' and 'value' are
  * passed to the policies and to the probe.  'error' is expected to be set
  * by MAC_POLICY_CHECK_NOSLEEP (defined elsewhere) and is returned.
  */
 int
 mac_kenv_check_set(struct ucred *cred, char *name, char *value)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(kenv_check_set, cred, name, value);
 	MAC_CHECK_PROBE3(kenv_check_set, error, cred, name, value);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(kenv_check_unset, "struct ucred *", "char *");
 
 /*
  * Entry point for the kenv_check_unset MAC check on the variable 'name'.
  * 'error' is expected to be set by MAC_POLICY_CHECK_NOSLEEP (defined
  * elsewhere); it is then reported to the probe and returned.
  */
 int
 mac_kenv_check_unset(struct ucred *cred, char *name)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(kenv_check_unset, cred, name);
 	MAC_CHECK_PROBE2(kenv_check_unset, error, cred, name);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(kld_check_load, "struct ucred *", "struct vnode *");
 
 /*
  * Entry point for the kld_check_load MAC check on vnode vp.  The vnode lock
  * is asserted on entry; vp must not be NULL since vp->v_label is
  * dereferenced unconditionally.  Uses MAC_POLICY_CHECK rather than the
  * _NOSLEEP variant; 'error' is expected to be set by that macro.
  */
 int
 mac_kld_check_load(struct ucred *cred, struct vnode *vp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_kld_check_load");
 
 	MAC_POLICY_CHECK(kld_check_load, cred, vp, vp->v_label);
 	MAC_CHECK_PROBE2(kld_check_load, error, cred, vp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE1(kld_check_stat, "struct ucred *");
 
 /*
  * Entry point for the kld_check_stat MAC check.  'error' is expected to be
  * set by MAC_POLICY_CHECK_NOSLEEP (defined elsewhere); it is then reported
  * to the probe and returned.
  */
 int
 mac_kld_check_stat(struct ucred *cred)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(kld_check_stat, cred);
 	MAC_CHECK_PROBE1(kld_check_stat, error, cred);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(system_check_acct, "struct ucred *",
     "struct vnode *");
 
 /*
  * Entry point for the system_check_acct MAC check.  Unlike the other vnode
  * checks in this file, vp may be NULL: the lock assertion is skipped and a
  * NULL label is handed to the policies in that case.  'error' is expected
  * to be set by MAC_POLICY_CHECK (defined elsewhere).
  */
 int
 mac_system_check_acct(struct ucred *cred, struct vnode *vp)
 {
 	int error;
 
 	if (vp != NULL) {
 		ASSERT_VOP_LOCKED(vp, "mac_system_check_acct");
 	}
 
 	MAC_POLICY_CHECK(system_check_acct, cred, vp,
 	    vp != NULL ? vp->v_label : NULL);
 	MAC_CHECK_PROBE2(system_check_acct, error, cred, vp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(system_check_reboot, "struct ucred *", "int");
 
 /*
  * Entry point for the system_check_reboot MAC check; 'howto' is forwarded
  * unchanged to the policies and to the probe.  'error' is expected to be
  * set by MAC_POLICY_CHECK_NOSLEEP (defined elsewhere) and is returned.
  */
 int
 mac_system_check_reboot(struct ucred *cred, int howto)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(system_check_reboot, cred, howto);
 	MAC_CHECK_PROBE2(system_check_reboot, error, cred, howto);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(system_check_swapon, "struct ucred *",
     "struct vnode *");
 
 /*
  * Entry point for the system_check_swapon MAC check on vnode vp.  The vnode
  * lock is asserted on entry and vp->v_label is dereferenced
  * unconditionally, so vp must not be NULL.  'error' is expected to be set
  * by MAC_POLICY_CHECK (defined elsewhere).
  */
 int
 mac_system_check_swapon(struct ucred *cred, struct vnode *vp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_system_check_swapon");
 
 	MAC_POLICY_CHECK(system_check_swapon, cred, vp, vp->v_label);
 	MAC_CHECK_PROBE2(system_check_swapon, error, cred, vp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(system_check_swapoff, "struct ucred *",
     "struct vnode *");
 
 /*
  * Entry point for the system_check_swapoff MAC check on vnode vp.  The
  * vnode lock is asserted on entry and vp->v_label is dereferenced
  * unconditionally, so vp must not be NULL.  'error' is expected to be set
  * by MAC_POLICY_CHECK (defined elsewhere).
  */
 int
 mac_system_check_swapoff(struct ucred *cred, struct vnode *vp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_system_check_swapoff");
 
 	MAC_POLICY_CHECK(system_check_swapoff, cred, vp, vp->v_label);
 	MAC_CHECK_PROBE2(system_check_swapoff, error, cred, vp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(system_check_sysctl, "struct ucred *",
     "struct sysctl_oid *", "struct sysctl_req *");
 
 /*
  * Entry point for the system_check_sysctl MAC check.  The policies receive
  * cred, oidp, arg1, arg2 and req; the probe only receives cred, oidp and
  * req.  'error' is expected to be set by MAC_POLICY_CHECK_NOSLEEP (defined
  * elsewhere) and is returned.
  */
 int
 mac_system_check_sysctl(struct ucred *cred, struct sysctl_oid *oidp,
     void *arg1, int arg2, struct sysctl_req *req)
 {
 	int error;
 
 	/*
 	 * XXXMAC: We would very much like to assert the SYSCTL_LOCK here,
 	 * but since it's not exported from kern_sysctl.c, we can't.
 	 */
 	MAC_POLICY_CHECK_NOSLEEP(system_check_sysctl, cred, oidp, arg1, arg2,
 	    req);
 	MAC_CHECK_PROBE3(system_check_sysctl, error, cred, oidp, req);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_sysv_msg.c
===================================================================
--- head/sys/security/mac/mac_sysv_msg.c	(revision 258540)
+++ head/sys/security/mac/mac_sysv_msg.c	(revision 258541)
@@ -1,278 +1,277 @@
 /*-
  * Copyright (c) 2003-2004 Networks Associates Technology, Inc.
  * Copyright (c) 2006 SPARTA, Inc.
  * Copyright (c) 2008 Apple Inc.
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/sysctl.h>
 #include <sys/msg.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Allocate a label with mac_labelzone_alloc(M_WAITOK), run the
  * sysvmsg_init_label policy operation on it via MAC_POLICY_PERFORM, and
  * return it to the caller.
  */
 static struct label *
 mac_sysv_msgmsg_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(sysvmsg_init_label, label);
 	return (label);
 }
 
 /*
  * Set up msgptr->label: a newly allocated label when MPC_OBJECT_SYSVMSG is
  * set in mac_labeled, NULL otherwise.
  */
 void
 mac_sysvmsg_init(struct msg *msgptr)
 {
 
 	msgptr->label = (mac_labeled & MPC_OBJECT_SYSVMSG) != 0 ?
 	    mac_sysv_msgmsg_label_alloc() : NULL;
 }
 
 /*
  * Allocate a label with mac_labelzone_alloc(M_WAITOK), run the
  * sysvmsq_init_label policy operation on it via MAC_POLICY_PERFORM, and
  * return it to the caller.
  */
 static struct label *
 mac_sysv_msgqueue_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(sysvmsq_init_label, label);
 	return (label);
 }
 
 /*
  * Set up msqkptr->label: a newly allocated label when MPC_OBJECT_SYSVMSQ is
  * set in mac_labeled, NULL otherwise.
  */
 void
 mac_sysvmsq_init(struct msqid_kernel *msqkptr)
 {
 
 	msqkptr->label = (mac_labeled & MPC_OBJECT_SYSVMSQ) != 0 ?
 	    mac_sysv_msgqueue_label_alloc() : NULL;
 }
 
 /*
  * Run the sysvmsg_destroy_label policy operation on the label, then hand
  * the label back with mac_labelzone_free().
  */
 static void
 mac_sysv_msgmsg_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvmsg_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release msgptr->label, if one is attached, and reset the pointer to NULL.
  * A message without a label is left untouched.
  */
 void
 mac_sysvmsg_destroy(struct msg *msgptr)
 {
 
 	if (msgptr->label == NULL)
 		return;
 
 	mac_sysv_msgmsg_label_free(msgptr->label);
 	msgptr->label = NULL;
 }
 
 /*
  * Run the sysvmsq_destroy_label policy operation on the label, then hand
  * the label back with mac_labelzone_free().
  */
 static void
 mac_sysv_msgqueue_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvmsq_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release msqkptr->label, if one is attached, and reset the pointer to
  * NULL.  A queue without a label is left untouched.
  */
 void
 mac_sysvmsq_destroy(struct msqid_kernel *msqkptr)
 {
 
 	if (msqkptr->label == NULL)
 		return;
 
 	mac_sysv_msgqueue_label_free(msqkptr->label);
 	msqkptr->label = NULL;
 }
 
 /*
  * Run the sysvmsg_create policy operation, passing cred, the queue and its
  * label, and the message and its label.  Both msqkptr and msgptr are
  * dereferenced and must not be NULL.
  */
 void
 mac_sysvmsg_create(struct ucred *cred, struct msqid_kernel *msqkptr,
     struct msg *msgptr)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvmsg_create, cred, msqkptr,
 	    msqkptr->label, msgptr, msgptr->label);
 }
 
 /*
  * Run the sysvmsq_create policy operation, passing cred, the queue and the
  * queue's label.
  */
 void
 mac_sysvmsq_create(struct ucred *cred, struct msqid_kernel *msqkptr)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvmsq_create, cred, msqkptr,
 	    msqkptr->label);
 }
 
 /*
  * Run the sysvmsg_cleanup policy operation on the message's label.  The
  * label itself is not freed here.
  */
 void
 mac_sysvmsg_cleanup(struct msg *msgptr)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvmsg_cleanup, msgptr->label);
 }
 
 /*
  * Run the sysvmsq_cleanup policy operation on the queue's label.  The label
  * itself is not freed here.
  */
 void
 mac_sysvmsq_cleanup(struct msqid_kernel *msqkptr)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvmsq_cleanup, msqkptr->label);
 }
 
 MAC_CHECK_PROBE_DEFINE3(sysvmsq_check_msgmsq, "struct ucred *",
     "struct msg *", "struct msqid_kernel *");
 
 /*
  * Entry point for the sysvmsq_check_msgmsq MAC check, which involves both a
  * message and a queue; each is passed to the policies together with its
  * label.  'error' is expected to be set by MAC_POLICY_CHECK_NOSLEEP
  * (defined elsewhere); it is then reported to the probe and returned.
  */
 int
 mac_sysvmsq_check_msgmsq(struct ucred *cred, struct msg *msgptr,
 	struct msqid_kernel *msqkptr)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvmsq_check_msgmsq, cred, msgptr,
 	    msgptr->label, msqkptr, msqkptr->label);
 	MAC_CHECK_PROBE3(sysvmsq_check_msgmsq, error, cred, msgptr, msqkptr);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(sysvmsq_check_msgrcv, "struct ucred *",
     "struct msg *");
 
 /*
  * Entry point for the sysvmsq_check_msgrcv MAC check on a single message.
  * 'error' is expected to be set by MAC_POLICY_CHECK_NOSLEEP (defined
  * elsewhere); it is then reported to the probe and returned.
  */
 int
 mac_sysvmsq_check_msgrcv(struct ucred *cred, struct msg *msgptr)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvmsq_check_msgrcv, cred, msgptr,
 	    msgptr->label);
 	MAC_CHECK_PROBE2(sysvmsq_check_msgrcv, error, cred, msgptr);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(sysvmsq_check_msgrmid, "struct ucred *",
     "struct msg *");
 
 /*
  * Entry point for the sysvmsq_check_msgrmid MAC check on a single message.
  * 'error' is expected to be set by MAC_POLICY_CHECK_NOSLEEP (defined
  * elsewhere); it is then reported to the probe and returned.
  */
 int
 mac_sysvmsq_check_msgrmid(struct ucred *cred, struct msg *msgptr)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvmsq_check_msgrmid, cred, msgptr,
 	    msgptr->label);
 	MAC_CHECK_PROBE2(sysvmsq_check_msgrmid, error, cred, msgptr);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(sysvmsq_check_msqget, "struct ucred *",
     "struct msqid_kernel *");
 
 /*
  * Entry point for the sysvmsq_check_msqget MAC check on a message queue.
  * 'error' is expected to be set by MAC_POLICY_CHECK_NOSLEEP (defined
  * elsewhere); it is then reported to the probe and returned.
  */
 int
 mac_sysvmsq_check_msqget(struct ucred *cred, struct msqid_kernel *msqkptr)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvmsq_check_msqget, cred, msqkptr,
 	    msqkptr->label);
 	MAC_CHECK_PROBE2(sysvmsq_check_msqget, error, cred, msqkptr);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(sysvmsq_check_msqsnd, "struct ucred *",
     "struct msqid_kernel *");
 
 /*
  * Entry point for the sysvmsq_check_msqsnd MAC check on a message queue.
  * 'error' is expected to be set by MAC_POLICY_CHECK_NOSLEEP (defined
  * elsewhere); it is then reported to the probe and returned.
  */
 int
 mac_sysvmsq_check_msqsnd(struct ucred *cred, struct msqid_kernel *msqkptr)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvmsq_check_msqsnd, cred, msqkptr,
 	    msqkptr->label);
 	MAC_CHECK_PROBE2(sysvmsq_check_msqsnd, error, cred, msqkptr);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(sysvmsq_check_msqrcv, "struct ucred *",
     "struct msqid_kernel *");
 
 /*
  * Entry point for the sysvmsq_check_msqrcv MAC check on a message queue.
  * 'error' is expected to be set by MAC_POLICY_CHECK_NOSLEEP (defined
  * elsewhere); it is then reported to the probe and returned.
  */
 int
 mac_sysvmsq_check_msqrcv(struct ucred *cred, struct msqid_kernel *msqkptr)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvmsq_check_msqrcv, cred, msqkptr,
 	    msqkptr->label);
 	MAC_CHECK_PROBE2(sysvmsq_check_msqrcv, error, cred, msqkptr);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(sysvmsq_check_msqctl, "struct ucred *",
     "struct msqid_kernel *", "int");
 
 /*
  * Entry point for the sysvmsq_check_msqctl MAC check; 'cmd' is forwarded
  * unchanged to the policies and to the probe.  'error' is expected to be
  * set by MAC_POLICY_CHECK_NOSLEEP (defined elsewhere) and is returned.
  */
 int
 mac_sysvmsq_check_msqctl(struct ucred *cred, struct msqid_kernel *msqkptr,
     int cmd)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvmsq_check_msqctl, cred, msqkptr,
 	    msqkptr->label, cmd);
 	MAC_CHECK_PROBE3(sysvmsq_check_msqctl, error, cred, msqkptr, cmd);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_sysv_sem.c
===================================================================
--- head/sys/security/mac/mac_sysv_sem.c	(revision 258540)
+++ head/sys/security/mac/mac_sysv_sem.c	(revision 258541)
@@ -1,163 +1,162 @@
 /*-
  * Copyright (c) 2003-2004 Networks Associates Technology, Inc.
  * Copyright (c) 2006 SPARTA, Inc.
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/sem.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Allocate a label with mac_labelzone_alloc(M_WAITOK), run the
  * sysvsem_init_label policy operation on it via MAC_POLICY_PERFORM, and
  * return it to the caller.
  */
 static struct label *
 mac_sysv_sem_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(sysvsem_init_label, label);
 	return (label);
 }
 
 /*
  * Set up semakptr->label: a newly allocated label when MPC_OBJECT_SYSVSEM
  * is set in mac_labeled, NULL otherwise.
  */
 void
 mac_sysvsem_init(struct semid_kernel *semakptr)
 {
 
 	semakptr->label = (mac_labeled & MPC_OBJECT_SYSVSEM) != 0 ?
 	    mac_sysv_sem_label_alloc() : NULL;
 }
 
 /*
  * Run the sysvsem_destroy_label policy operation on the label, then hand
  * the label back with mac_labelzone_free().
  */
 static void
 mac_sysv_sem_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvsem_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release semakptr->label, if one is attached, and reset the pointer to
  * NULL.  A semaphore set without a label is left untouched.
  */
 void
 mac_sysvsem_destroy(struct semid_kernel *semakptr)
 {
 
 	if (semakptr->label == NULL)
 		return;
 
 	mac_sysv_sem_label_free(semakptr->label);
 	semakptr->label = NULL;
 }
 
 /*
  * Run the sysvsem_create policy operation, passing cred, semakptr and
  * semakptr's label.
  */
 void
 mac_sysvsem_create(struct ucred *cred, struct semid_kernel *semakptr)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvsem_create, cred, semakptr,
 	    semakptr->label);
 }
 
 /*
  * Run the sysvsem_cleanup policy operation on semakptr's label.  The label
  * itself is not freed here.
  */
 void
 mac_sysvsem_cleanup(struct semid_kernel *semakptr)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvsem_cleanup, semakptr->label);
 }
 
 MAC_CHECK_PROBE_DEFINE3(sysvsem_check_semctl, "struct ucred *",
     "struct semid_kernel *", "int");
 
 /*
  * Entry point for the sysvsem_check_semctl MAC check; 'cmd' is forwarded
  * unchanged to the policies and to the probe.  'error' is expected to be
  * set by MAC_POLICY_CHECK_NOSLEEP (defined elsewhere) and is returned.
  */
 int
 mac_sysvsem_check_semctl(struct ucred *cred, struct semid_kernel *semakptr,
     int cmd)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvsem_check_semctl, cred, semakptr,
 	    semakptr->label, cmd);
 	MAC_CHECK_PROBE3(sysvsem_check_semctl, error, cred, semakptr, cmd);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(sysvsem_check_semget, "struct ucred *",
     "struct semid_kernel *");
 
 int
 mac_sysvsem_check_semget(struct ucred *cred, struct semid_kernel *semakptr)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvsem_check_semget, cred, semakptr,
 	    semakptr->label);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(sysvsem_check_semop, "struct ucred *",
     "struct semid_kernel *", "size_t");
 
 /*
  * Entry point for the sysvsem_check_semop MAC check; 'accesstype' is
  * forwarded unchanged to the policies and to the probe.  'error' is
  * expected to be set by MAC_POLICY_CHECK_NOSLEEP (defined elsewhere) and is
  * returned.
  */
 int
 mac_sysvsem_check_semop(struct ucred *cred, struct semid_kernel *semakptr,
     size_t accesstype)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvsem_check_semop, cred, semakptr,
 	    semakptr->label, accesstype);
 	MAC_CHECK_PROBE3(sysvsem_check_semop, error, cred, semakptr,
 	    accesstype);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_sysv_shm.c
===================================================================
--- head/sys/security/mac/mac_sysv_shm.c	(revision 258540)
+++ head/sys/security/mac/mac_sysv_shm.c	(revision 258541)
@@ -1,181 +1,180 @@
 /*-
  * Copyright (c) 2003-2004 Networks Associates Technology, Inc.
  * Copyright (c) 2006 SPARTA, Inc.
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/shm.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Allocate a label with mac_labelzone_alloc(M_WAITOK), run the
  * sysvshm_init_label policy operation on it via MAC_POLICY_PERFORM, and
  * return it to the caller.
  */
 static struct label *
 mac_sysv_shm_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(sysvshm_init_label, label);
 	return (label);
 }
 
 /*
  * Set up shmsegptr->label: a newly allocated label when MPC_OBJECT_SYSVSHM
  * is set in mac_labeled, NULL otherwise.
  */
 void
 mac_sysvshm_init(struct shmid_kernel *shmsegptr)
 {
 
 	shmsegptr->label = (mac_labeled & MPC_OBJECT_SYSVSHM) != 0 ?
 	    mac_sysv_shm_label_alloc() : NULL;
 }
 
 /*
  * Run the sysvshm_destroy_label policy operation on the label, then hand
  * the label back with mac_labelzone_free().
  */
 static void
 mac_sysv_shm_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvshm_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release shmsegptr->label, if one is attached, and reset the pointer to
  * NULL.  A segment without a label is left untouched.
  */
 void
 mac_sysvshm_destroy(struct shmid_kernel *shmsegptr)
 {
 
 	if (shmsegptr->label == NULL)
 		return;
 
 	mac_sysv_shm_label_free(shmsegptr->label);
 	shmsegptr->label = NULL;
 }
 
 /*
  * Run the sysvshm_create policy operation, passing cred, shmsegptr and
  * shmsegptr's label.
  */
 void
 mac_sysvshm_create(struct ucred *cred, struct shmid_kernel *shmsegptr)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvshm_create, cred, shmsegptr,
 	    shmsegptr->label);
 }
 
 /*
  * Run the sysvshm_cleanup policy operation on shmsegptr's label.  The label
  * itself is not freed here.
  */
 void
 mac_sysvshm_cleanup(struct shmid_kernel *shmsegptr)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(sysvshm_cleanup, shmsegptr->label);
 }
 
 MAC_CHECK_PROBE_DEFINE3(sysvshm_check_shmat, "struct ucred *",
     "struct shmid_kernel *", "int");
 
 /*
  * Entry point for the sysvshm_check_shmat MAC check; 'shmflg' is forwarded
  * unchanged to the policies and to the probe.  'error' is expected to be
  * set by MAC_POLICY_CHECK_NOSLEEP (defined elsewhere) and is returned.
  */
 int
 mac_sysvshm_check_shmat(struct ucred *cred, struct shmid_kernel *shmsegptr,
     int shmflg)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvshm_check_shmat, cred, shmsegptr,
 	    shmsegptr->label, shmflg);
 	MAC_CHECK_PROBE3(sysvshm_check_shmat, error, cred, shmsegptr,
 	    shmflg);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(sysvshm_check_shmctl, "struct ucred *",
     "struct shmid_kernel *", "int");
 
 /*
  * Entry point for the sysvshm_check_shmctl MAC check; 'cmd' is forwarded
  * unchanged to the policies and to the probe.  'error' is expected to be
  * set by MAC_POLICY_CHECK_NOSLEEP (defined elsewhere) and is returned.
  */
 int
 mac_sysvshm_check_shmctl(struct ucred *cred, struct shmid_kernel *shmsegptr,
     int cmd)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvshm_check_shmctl, cred, shmsegptr,
 	    shmsegptr->label, cmd);
 	MAC_CHECK_PROBE3(sysvshm_check_shmctl, error, cred, shmsegptr, cmd);
 
 	return (error);
 }
 
 /*
  * The second probe argument is the shmid_kernel pointer passed below, so
  * its declared type string matches the other sysvshm probes.
  */
 MAC_CHECK_PROBE_DEFINE2(sysvshm_check_shmdt, "struct ucred *",
     "struct shmid_kernel *");
 
 /*
  * Entry point for the sysvshm_check_shmdt MAC check.  'error' is expected
  * to be set by MAC_POLICY_CHECK_NOSLEEP (defined elsewhere); it is then
  * reported to the probe and returned.
  */
 int
 mac_sysvshm_check_shmdt(struct ucred *cred, struct shmid_kernel *shmsegptr)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvshm_check_shmdt, cred, shmsegptr,
 	    shmsegptr->label);
 	MAC_CHECK_PROBE2(sysvshm_check_shmdt, error, cred, shmsegptr);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(sysvshm_check_shmget, "struct ucred *",
     "struct shmid_kernel *", "int");
 
 /*
  * Entry point for the sysvshm_check_shmget MAC check; 'shmflg' is forwarded
  * unchanged to the policies and to the probe.  'error' is expected to be
  * set by MAC_POLICY_CHECK_NOSLEEP (defined elsewhere) and is returned.
  */
 int
 mac_sysvshm_check_shmget(struct ucred *cred, struct shmid_kernel *shmsegptr,
     int shmflg)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(sysvshm_check_shmget, cred, shmsegptr,
 	    shmsegptr->label, shmflg);
 	MAC_CHECK_PROBE3(sysvshm_check_shmget, error, cred, shmsegptr,
 	    shmflg);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_vfs.c
===================================================================
--- head/sys/security/mac/mac_vfs.c	(revision 258540)
+++ head/sys/security/mac/mac_vfs.c	(revision 258541)
@@ -1,1071 +1,1070 @@
 /*-
  * Copyright (c) 1999-2002, 2009 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2005 McAfee, Inc.
  * Copyright (c) 2005-2006 SPARTA, Inc.
  * Copyright (c) 2008 Apple Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by McAfee
  * Research, the Security Research Division of McAfee, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc. 
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_kdtrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/condvar.h>
 #include <sys/extattr.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 
 #include <fs/devfs/devfs.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Warn about EA transactions only the first time they happen.  No locking on
  * this variable.
  */
 static int	ea_warn_once = 0;
 
 static int	mac_vnode_setlabel_extattr(struct ucred *cred,
 		    struct vnode *vp, struct label *intlabel);
 
 /*
  * Allocate a label with mac_labelzone_alloc(M_WAITOK), run the
  * devfs_init_label policy operation on it via MAC_POLICY_PERFORM, and
  * return it to the caller.
  */
 static struct label *
 mac_devfs_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(devfs_init_label, label);
 	return (label);
 }
 
 /*
  * Set up de->de_label: a newly allocated label when MPC_OBJECT_DEVFS is set
  * in mac_labeled, NULL otherwise.
  */
 void
 mac_devfs_init(struct devfs_dirent *de)
 {
 
 	de->de_label = (mac_labeled & MPC_OBJECT_DEVFS) != 0 ?
 	    mac_devfs_label_alloc() : NULL;
 }
 
 /*
  * Allocate a label with mac_labelzone_alloc(M_WAITOK), run the
  * mount_init_label policy operation on it via MAC_POLICY_PERFORM, and
  * return it to the caller.
  */
 static struct label *
 mac_mount_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(mount_init_label, label);
 	return (label);
 }
 
 /*
  * Set up mp->mnt_label: a newly allocated label when MPC_OBJECT_MOUNT is
  * set in mac_labeled, NULL otherwise.
  */
 void
 mac_mount_init(struct mount *mp)
 {
 
 	mp->mnt_label = (mac_labeled & MPC_OBJECT_MOUNT) != 0 ?
 	    mac_mount_label_alloc() : NULL;
 }
 
 /*
  * Allocate a label with mac_labelzone_alloc(M_WAITOK), run the
  * vnode_init_label policy operation on it via MAC_POLICY_PERFORM, and
  * return it to the caller.  Unlike the devfs and mount allocators in this
  * file, this function is not static.
  */
 struct label *
 mac_vnode_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_POLICY_PERFORM(vnode_init_label, label);
 	return (label);
 }
 
 /*
  * Set up vp->v_label: a newly allocated label when MPC_OBJECT_VNODE is set
  * in mac_labeled, NULL otherwise.
  */
 void
 mac_vnode_init(struct vnode *vp)
 {
 
 	vp->v_label = (mac_labeled & MPC_OBJECT_VNODE) != 0 ?
 	    mac_vnode_label_alloc() : NULL;
 }
 
 /*
  * Run the devfs_destroy_label policy operation on the label, then hand the
  * label back with mac_labelzone_free().
  */
 static void
 mac_devfs_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(devfs_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release de->de_label, if one is attached, and reset the pointer to NULL.
  * An entry without a label is left untouched.
  */
 void
 mac_devfs_destroy(struct devfs_dirent *de)
 {
 
 	if (de->de_label == NULL)
 		return;
 
 	mac_devfs_label_free(de->de_label);
 	de->de_label = NULL;
 }
 
 /*
  * Run the mount_destroy_label policy operation on the label, then hand the
  * label back with mac_labelzone_free().
  */
 static void
 mac_mount_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(mount_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release mp->mnt_label, if one is attached, and reset the pointer to NULL.
  * A mount without a label is left untouched.
  */
 void
 mac_mount_destroy(struct mount *mp)
 {
 
 	if (mp->mnt_label == NULL)
 		return;
 
 	mac_mount_label_free(mp->mnt_label);
 	mp->mnt_label = NULL;
 }
 
 /*
  * Run the vnode_destroy_label policy operation on the label, then hand the
  * label back with mac_labelzone_free().  Unlike the devfs and mount
  * variants in this file, this function is not static.
  */
 void
 mac_vnode_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(vnode_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 /*
  * Release vp->v_label, if one is attached, and reset the pointer to NULL.
  * A vnode without a label is left untouched.
  */
 void
 mac_vnode_destroy(struct vnode *vp)
 {
 
 	if (vp->v_label == NULL)
 		return;
 
 	mac_vnode_label_free(vp->v_label);
 	vp->v_label = NULL;
 }
 
 /*
  * Run the vnode_copy_label policy operation with (src, dest) in that
  * argument order.
  */
 void
 mac_vnode_copy_label(struct label *src, struct label *dest)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(vnode_copy_label, src, dest);
 }
 
 /*
  * Externalize a vnode label into outbuf (of size outbuflen) for the
  * requested elements, using the MAC_POLICY_EXTERNALIZE macro.  'error' is
  * expected to be set by that macro (defined elsewhere) and is returned.
  */
 int
 mac_vnode_externalize_label(struct label *label, char *elements,
     char *outbuf, size_t outbuflen)
 {
 	int error;
 
 	MAC_POLICY_EXTERNALIZE(vnode, label, elements, outbuf, outbuflen);
 
 	return (error);
 }
 
 /*
  * Internalize 'string' into a vnode label using the MAC_POLICY_INTERNALIZE
  * macro.  'error' is expected to be set by that macro (defined elsewhere)
  * and is returned.
  */
 int
 mac_vnode_internalize_label(struct label *label, char *string)
 {
 	int error;
 
 	MAC_POLICY_INTERNALIZE(vnode, label, string);
 
 	return (error);
 }
 
 /*
  * Run the devfs_update policy operation, passing the mount, the devfs
  * entry with its label, and the vnode with its label.  Both de and vp are
  * dereferenced and must not be NULL.
  */
 void
 mac_devfs_update(struct mount *mp, struct devfs_dirent *de, struct vnode *vp)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(devfs_update, mp, de, de->de_label, vp,
 	    vp->v_label);
 }
 
 /*
  * Run the devfs_vnode_associate policy operation, passing the mount, the
  * devfs entry and the vnode, each together with its label.  mp, de and vp
  * are all dereferenced and must not be NULL.
  */
 void
 mac_devfs_vnode_associate(struct mount *mp, struct devfs_dirent *de,
     struct vnode *vp)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(devfs_vnode_associate, mp, mp->mnt_label,
 	    de, de->de_label, vp, vp->v_label);
 }
 
 /*
  * Run the vnode_associate_extattr policy operation for vp on mount mp and
  * return its result.  The vnode lock is asserted on entry.  Although this
  * is not an access-control check, it uses MAC_POLICY_CHECK, which is
  * expected to set 'error' (macro defined elsewhere).
  */
 int
 mac_vnode_associate_extattr(struct mount *mp, struct vnode *vp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_associate_extattr");
 
 	MAC_POLICY_CHECK(vnode_associate_extattr, mp, mp->mnt_label, vp,
 	    vp->v_label);
 
 	return (error);
 }
 
 /*
  * Run the vnode_associate_singlelabel policy operation, passing the mount
  * and the vnode, each together with its label.
  */
 void
 mac_vnode_associate_singlelabel(struct mount *mp, struct vnode *vp)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(vnode_associate_singlelabel, mp,
 	    mp->mnt_label, vp, vp->v_label);
 }
 
 /*
  * Functions implementing extended-attribute backed labels for file systems
  * that support it.
  *
  * Where possible, we use EA transactions to make writes to multiple
  * attributes across different policies mutually atomic.  We allow work to
  * continue on file systems not supporting EA transactions, but generate a
  * printf warning.
  */
 /*
  * Run the vnode_create_extattr policy operation for a new vnode vp created
  * in directory dvp, bracketed by VOP_OPENEXTATTR/VOP_CLOSEEXTATTR.  Both
  * vnodes are asserted to be locked.  Returns 0 on success or the first
  * error encountered.
  */
 int
 mac_vnode_create_extattr(struct ucred *cred, struct mount *mp,
     struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(dvp, "mac_vnode_create_extattr");
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_create_extattr");
 
 	/*
 	 * EOPNOTSUPP from the open is tolerated (with a one-time warning);
 	 * any other failure aborts the operation.
 	 */
 	error = VOP_OPENEXTATTR(vp, cred, curthread);
 	if (error == EOPNOTSUPP) {
 		if (ea_warn_once == 0) {
 			printf("Warning: transactions not supported "
 			    "in EA write.\n");
 			ea_warn_once = 1;
 		}
 	} else if (error)
 		return (error);
 
 	/*
 	 * NOTE(review): 'error' may still hold EOPNOTSUPP here, so the test
 	 * below relies on MAC_POLICY_CHECK (defined elsewhere) assigning
 	 * 'error' afresh -- confirm against the macro definition.
 	 */
 	MAC_POLICY_CHECK(vnode_create_extattr, cred, mp, mp->mnt_label, dvp,
 	    dvp->v_label, vp, vp->v_label, cnp);
 
 	/*
 	 * On policy failure close with 0 as the second argument (presumably
 	 * "do not commit" -- verify against VOP_CLOSEEXTATTR) and ignore the
 	 * close result so the policy error is what the caller sees.
 	 */
 	if (error) {
 		VOP_CLOSEEXTATTR(vp, 0, NOCRED, curthread);
 		return (error);
 	}
 
 	/* EOPNOTSUPP from the final close is treated as success. */
 	error = VOP_CLOSEEXTATTR(vp, 1, NOCRED, curthread);
 	if (error == EOPNOTSUPP)
 		error = 0;
 
 	return (error);
 }
 
 static int
 mac_vnode_setlabel_extattr(struct ucred *cred, struct vnode *vp,
     struct label *intlabel)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_setlabel_extattr");
 
 	error = VOP_OPENEXTATTR(vp, cred, curthread);
 	if (error == EOPNOTSUPP) {
 		if (ea_warn_once == 0) {
 			printf("Warning: transactions not supported "
 			    "in EA write.\n");
 			ea_warn_once = 1;
 		}
 	} else if (error)
 		return (error);
 
 	MAC_POLICY_CHECK(vnode_setlabel_extattr, cred, vp, vp->v_label,
 	    intlabel);
 
 	if (error) {
 		VOP_CLOSEEXTATTR(vp, 0, NOCRED, curthread);
 		return (error);
 	}
 
 	error = VOP_CLOSEEXTATTR(vp, 1, NOCRED, curthread);
 	if (error == EOPNOTSUPP)
 		error = 0;
 
 	return (error);
 }
 
 void
 mac_vnode_execve_transition(struct ucred *old, struct ucred *new,
     struct vnode *vp, struct label *interpvplabel, struct image_params *imgp)
 {
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_execve_transition");
 
 	MAC_POLICY_PERFORM(vnode_execve_transition, old, new, vp,
 	    vp->v_label, interpvplabel, imgp, imgp->execlabel);
 }
 
 int
 mac_vnode_execve_will_transition(struct ucred *old, struct vnode *vp,
     struct label *interpvplabel, struct image_params *imgp)
 {
 	int result;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_execve_will_transition");
 
 	result = 0;
 	/* No sleeping since the process lock will be held by the caller. */
 	MAC_POLICY_BOOLEAN_NOSLEEP(vnode_execve_will_transition, ||, old, vp,
 	    vp->v_label, interpvplabel, imgp, imgp->execlabel);
 
 	return (result);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_access, "struct ucred *",
     "struct vnode *", "accmode_t");
 
 int
 mac_vnode_check_access(struct ucred *cred, struct vnode *vp, accmode_t accmode)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_access");
 
 	MAC_POLICY_CHECK(vnode_check_access, cred, vp, vp->v_label, accmode);
 	MAC_CHECK_PROBE3(vnode_check_access, error, cred, vp, accmode);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(vnode_check_chdir, "struct ucred *",
     "struct vnode *");
 
 int
 mac_vnode_check_chdir(struct ucred *cred, struct vnode *dvp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(dvp, "mac_vnode_check_chdir");
 
 	MAC_POLICY_CHECK(vnode_check_chdir, cred, dvp, dvp->v_label);
 	MAC_CHECK_PROBE2(vnode_check_chdir, error, cred, dvp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(vnode_check_chroot, "struct ucred *",
     "struct vnode *");
 
 int
 mac_vnode_check_chroot(struct ucred *cred, struct vnode *dvp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(dvp, "mac_vnode_check_chroot");
 
 	MAC_POLICY_CHECK(vnode_check_chroot, cred, dvp, dvp->v_label);
 	MAC_CHECK_PROBE2(vnode_check_chroot, error, cred, dvp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_create, "struct ucred *",
     "struct vnode *", "struct componentname *", "struct vattr *");
 
 int
 mac_vnode_check_create(struct ucred *cred, struct vnode *dvp,
     struct componentname *cnp, struct vattr *vap)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(dvp, "mac_vnode_check_create");
 
 	MAC_POLICY_CHECK(vnode_check_create, cred, dvp, dvp->v_label, cnp,
 	    vap);
 	MAC_CHECK_PROBE4(vnode_check_create, error, cred, dvp, cnp, vap);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_deleteacl, "struct ucred *",
     "struct vnode *", "acl_type_t");
 
 int
 mac_vnode_check_deleteacl(struct ucred *cred, struct vnode *vp,
     acl_type_t type)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_deleteacl");
 
 	MAC_POLICY_CHECK(vnode_check_deleteacl, cred, vp, vp->v_label, type);
 	MAC_CHECK_PROBE3(vnode_check_deleteacl, error, cred, vp, type);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_deleteextattr, "struct ucred *",
     "struct vnode *", "int", "const char *");
 
 int
 mac_vnode_check_deleteextattr(struct ucred *cred, struct vnode *vp,
     int attrnamespace, const char *name)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_deleteextattr");
 
 	MAC_POLICY_CHECK(vnode_check_deleteextattr, cred, vp, vp->v_label,
 	    attrnamespace, name);
 	MAC_CHECK_PROBE4(vnode_check_deleteextattr, error, cred, vp,
 	    attrnamespace, name);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_exec, "struct ucred *", "struct vnode *",
     "struct image_params *");
 
 int
 mac_vnode_check_exec(struct ucred *cred, struct vnode *vp,
     struct image_params *imgp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_exec");
 
 	MAC_POLICY_CHECK(vnode_check_exec, cred, vp, vp->v_label, imgp,
 	    imgp->execlabel);
 	MAC_CHECK_PROBE3(vnode_check_exec, error, cred, vp, imgp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_getacl, "struct ucred *",
     "struct vnode *", "acl_type_t");
 
 int
 mac_vnode_check_getacl(struct ucred *cred, struct vnode *vp, acl_type_t type)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_getacl");
 
 	MAC_POLICY_CHECK(vnode_check_getacl, cred, vp, vp->v_label, type);
 	MAC_CHECK_PROBE3(vnode_check_getacl, error, cred, vp, type);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_getextattr, "struct ucred *",
     "struct vnode *", "int", "const char *");
 
 int
 mac_vnode_check_getextattr(struct ucred *cred, struct vnode *vp,
     int attrnamespace, const char *name)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_getextattr");
 
 	MAC_POLICY_CHECK(vnode_check_getextattr, cred, vp, vp->v_label,
 	    attrnamespace, name);
 	MAC_CHECK_PROBE4(vnode_check_getextattr, error, cred, vp,
 	    attrnamespace, name);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_link, "struct ucred *", "struct vnode *",
     "struct vnode *", "struct componentname *");
 
 int
 mac_vnode_check_link(struct ucred *cred, struct vnode *dvp,
     struct vnode *vp, struct componentname *cnp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(dvp, "mac_vnode_check_link");
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_link");
 
 	MAC_POLICY_CHECK(vnode_check_link, cred, dvp, dvp->v_label, vp,
 	    vp->v_label, cnp);
 	MAC_CHECK_PROBE4(vnode_check_link, error, cred, dvp, vp, cnp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_listextattr, "struct ucred *",
     "struct vnode *", "int");
 
 int
 mac_vnode_check_listextattr(struct ucred *cred, struct vnode *vp,
     int attrnamespace)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_listextattr");
 
 	MAC_POLICY_CHECK(vnode_check_listextattr, cred, vp, vp->v_label,
 	    attrnamespace);
 	MAC_CHECK_PROBE3(vnode_check_listextattr, error, cred, vp,
 	    attrnamespace);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_lookup, "struct ucred *",
     "struct vnode *", "struct componentname *");
 
 int
 mac_vnode_check_lookup(struct ucred *cred, struct vnode *dvp,
     struct componentname *cnp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(dvp, "mac_vnode_check_lookup");
 
 	MAC_POLICY_CHECK(vnode_check_lookup, cred, dvp, dvp->v_label, cnp);
 	MAC_CHECK_PROBE3(vnode_check_lookup, error, cred, dvp, cnp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_mmap, "struct ucred *", "struct vnode *",
     "int", "int");
 
 int
 mac_vnode_check_mmap(struct ucred *cred, struct vnode *vp, int prot,
     int flags)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_mmap");
 
 	MAC_POLICY_CHECK(vnode_check_mmap, cred, vp, vp->v_label, prot, flags);
 	MAC_CHECK_PROBE4(vnode_check_mmap, error, cred, vp, prot, flags);
 
 	return (error);
 }
 
 void
 mac_vnode_check_mmap_downgrade(struct ucred *cred, struct vnode *vp,
     int *prot)
 {
 	int result = *prot;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_mmap_downgrade");
 
 	MAC_POLICY_PERFORM(vnode_check_mmap_downgrade, cred, vp, vp->v_label,
 	    &result);
 
 	*prot = result;
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_mprotect, "struct ucred *",
     "struct vnode *", "int");
 
 int
 mac_vnode_check_mprotect(struct ucred *cred, struct vnode *vp, int prot)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_mprotect");
 
 	MAC_POLICY_CHECK(vnode_check_mprotect, cred, vp, vp->v_label, prot);
 	MAC_CHECK_PROBE3(vnode_check_mprotect, error, cred, vp, prot);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_open, "struct ucred *", "struct vnode *",
     "accmode_t");
 
 int
 mac_vnode_check_open(struct ucred *cred, struct vnode *vp, accmode_t accmode)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_open");
 
 	MAC_POLICY_CHECK(vnode_check_open, cred, vp, vp->v_label, accmode);
 	MAC_CHECK_PROBE3(vnode_check_open, error, cred, vp, accmode);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_poll, "struct ucred *", "struct ucred *",
     "struct vnode *");
 
 int
 mac_vnode_check_poll(struct ucred *active_cred, struct ucred *file_cred,
     struct vnode *vp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_poll");
 
 	MAC_POLICY_CHECK(vnode_check_poll, active_cred, file_cred, vp,
 	    vp->v_label);
 	MAC_CHECK_PROBE3(vnode_check_poll, error, active_cred, file_cred,
 	    vp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_read, "struct ucred *", "struct ucred *",
     "struct vnode *");
 
 int
 mac_vnode_check_read(struct ucred *active_cred, struct ucred *file_cred,
     struct vnode *vp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_read");
 
 	MAC_POLICY_CHECK(vnode_check_read, active_cred, file_cred, vp,
 	    vp->v_label);
 	MAC_CHECK_PROBE3(vnode_check_read, error, active_cred, file_cred,
 	    vp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(vnode_check_readdir, "struct ucred *",
     "struct vnode *");
 
 int
 mac_vnode_check_readdir(struct ucred *cred, struct vnode *dvp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(dvp, "mac_vnode_check_readdir");
 
 	MAC_POLICY_CHECK(vnode_check_readdir, cred, dvp, dvp->v_label);
 	MAC_CHECK_PROBE2(vnode_check_readdir, error, cred, dvp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(vnode_check_readlink, "struct ucred *",
     "struct vnode *");
 
 int
 mac_vnode_check_readlink(struct ucred *cred, struct vnode *vp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_readlink");
 
 	MAC_POLICY_CHECK(vnode_check_readlink, cred, vp, vp->v_label);
 	MAC_CHECK_PROBE2(vnode_check_readlink, error, cred, vp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_relabel, "struct ucred *",
     "struct vnode *", "struct label *");
 
 static int
 mac_vnode_check_relabel(struct ucred *cred, struct vnode *vp,
     struct label *newlabel)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_relabel");
 
 	MAC_POLICY_CHECK(vnode_check_relabel, cred, vp, vp->v_label, newlabel);
 	MAC_CHECK_PROBE3(vnode_check_relabel, error, cred, vp, newlabel);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_rename_from, "struct ucred *",
     "struct vnode *", "struct vnode *", "struct componentname *");
 
 int
 mac_vnode_check_rename_from(struct ucred *cred, struct vnode *dvp,
     struct vnode *vp, struct componentname *cnp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(dvp, "mac_vnode_check_rename_from");
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_rename_from");
 
 	MAC_POLICY_CHECK(vnode_check_rename_from, cred, dvp, dvp->v_label, vp,
 	    vp->v_label, cnp);
 	MAC_CHECK_PROBE4(vnode_check_rename_from, error, cred, dvp, vp, cnp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_rename_to, "struct ucred *",
     "struct vnode *", "struct vnode *", "struct componentname *");
 
 int
 mac_vnode_check_rename_to(struct ucred *cred, struct vnode *dvp,
     struct vnode *vp, int samedir, struct componentname *cnp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(dvp, "mac_vnode_check_rename_to");
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_rename_to");
 
 	MAC_POLICY_CHECK(vnode_check_rename_to, cred, dvp, dvp->v_label, vp,
 	    vp != NULL ? vp->v_label : NULL, samedir, cnp);
 	MAC_CHECK_PROBE4(vnode_check_rename_to, error, cred, dvp, vp, cnp);
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(vnode_check_revoke, "struct ucred *",
     "struct vnode *");
 
 int
 mac_vnode_check_revoke(struct ucred *cred, struct vnode *vp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_revoke");
 
 	MAC_POLICY_CHECK(vnode_check_revoke, cred, vp, vp->v_label);
 	MAC_CHECK_PROBE2(vnode_check_revoke, error, cred, vp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_setacl, "struct ucred *",
     "struct vnode *", "acl_type_t", "struct acl *");
 
 int
 mac_vnode_check_setacl(struct ucred *cred, struct vnode *vp, acl_type_t type,
     struct acl *acl)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_setacl");
 
 	MAC_POLICY_CHECK(vnode_check_setacl, cred, vp, vp->v_label, type, acl);
 	MAC_CHECK_PROBE4(vnode_check_setacl, error, cred, vp, type, acl);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_setextattr, "struct ucred *",
     "struct vnode *", "int", "const char *");
 
 int
 mac_vnode_check_setextattr(struct ucred *cred, struct vnode *vp,
     int attrnamespace, const char *name)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_setextattr");
 
 	MAC_POLICY_CHECK(vnode_check_setextattr, cred, vp, vp->v_label,
 	    attrnamespace, name);
 	MAC_CHECK_PROBE4(vnode_check_setextattr, error, cred, vp,
 	    attrnamespace, name);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_setflags, "struct ucred *",
     "struct vnode *", "u_long");
 
 int
 mac_vnode_check_setflags(struct ucred *cred, struct vnode *vp, u_long flags)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_setflags");
 
 	MAC_POLICY_CHECK(vnode_check_setflags, cred, vp, vp->v_label, flags);
 	MAC_CHECK_PROBE3(vnode_check_setflags, error, cred, vp, flags);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_setmode, "struct ucred *",
     "struct vnode *", "mode_t");
 
 int
 mac_vnode_check_setmode(struct ucred *cred, struct vnode *vp, mode_t mode)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_setmode");
 
 	MAC_POLICY_CHECK(vnode_check_setmode, cred, vp, vp->v_label, mode);
 	MAC_CHECK_PROBE3(vnode_check_setmode, error, cred, vp, mode);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_setowner, "struct ucred *",
     "struct vnode *", "uid_t", "gid_t");
 
 int
 mac_vnode_check_setowner(struct ucred *cred, struct vnode *vp, uid_t uid,
     gid_t gid)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_setowner");
 
 	MAC_POLICY_CHECK(vnode_check_setowner, cred, vp, vp->v_label, uid, gid);
 	MAC_CHECK_PROBE4(vnode_check_setowner, error, cred, vp, uid, gid);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_setutimes, "struct ucred *",
     "struct vnode *", "struct timespec *", "struct timespec *");
 
 int
 mac_vnode_check_setutimes(struct ucred *cred, struct vnode *vp,
     struct timespec atime, struct timespec mtime)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_setutimes");
 
 	MAC_POLICY_CHECK(vnode_check_setutimes, cred, vp, vp->v_label, atime,
 	    mtime);
 	MAC_CHECK_PROBE4(vnode_check_setutimes, error, cred, vp, &atime,
 	    &mtime);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_stat, "struct ucred *", "struct ucred *",
     "struct vnode *");
 
 int
 mac_vnode_check_stat(struct ucred *active_cred, struct ucred *file_cred,
     struct vnode *vp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_stat");
 
 	MAC_POLICY_CHECK(vnode_check_stat, active_cred, file_cred, vp,
 	    vp->v_label);
 	MAC_CHECK_PROBE3(vnode_check_stat, error, active_cred, file_cred,
 	    vp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(vnode_check_unlink, "struct ucred *",
     "struct vnode *", "struct vnode *", "struct componentname *");
 
 int
 mac_vnode_check_unlink(struct ucred *cred, struct vnode *dvp,
     struct vnode *vp, struct componentname *cnp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(dvp, "mac_vnode_check_unlink");
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_unlink");
 
 	MAC_POLICY_CHECK(vnode_check_unlink, cred, dvp, dvp->v_label, vp,
 	    vp->v_label, cnp);
 	MAC_CHECK_PROBE4(vnode_check_unlink, error, cred, dvp, vp, cnp);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(vnode_check_write, "struct ucred *",
     "struct ucred *", "struct vnode *");
 
 int
 mac_vnode_check_write(struct ucred *active_cred, struct ucred *file_cred,
     struct vnode *vp)
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "mac_vnode_check_write");
 
 	MAC_POLICY_CHECK(vnode_check_write, active_cred, file_cred, vp,
 	    vp->v_label);
 	MAC_CHECK_PROBE3(vnode_check_write, error, active_cred, file_cred,
 	    vp);
 
 	return (error);
 }
 
 void
 mac_vnode_relabel(struct ucred *cred, struct vnode *vp,
     struct label *newlabel)
 {
 
 	MAC_POLICY_PERFORM(vnode_relabel, cred, vp, vp->v_label, newlabel);
 }
 
 void
 mac_mount_create(struct ucred *cred, struct mount *mp)
 {
 
 	MAC_POLICY_PERFORM(mount_create, cred, mp, mp->mnt_label);
 }
 
 MAC_CHECK_PROBE_DEFINE2(mount_check_stat, "struct ucred *",
     "struct mount *");
 
 int
 mac_mount_check_stat(struct ucred *cred, struct mount *mount)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(mount_check_stat, cred, mount, mount->mnt_label);
 	MAC_CHECK_PROBE2(mount_check_stat, error, cred, mount);
 
 	return (error);
 }
 
 void
 mac_devfs_create_device(struct ucred *cred, struct mount *mp,
     struct cdev *dev, struct devfs_dirent *de)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(devfs_create_device, cred, mp, dev, de,
 	    de->de_label);
 }
 
 void
 mac_devfs_create_symlink(struct ucred *cred, struct mount *mp,
     struct devfs_dirent *dd, struct devfs_dirent *de)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(devfs_create_symlink, cred, mp, dd,
 	    dd->de_label, de, de->de_label);
 }
 
 void
 mac_devfs_create_directory(struct mount *mp, char *dirname, int dirnamelen,
     struct devfs_dirent *de)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(devfs_create_directory, mp, dirname,
 	    dirnamelen, de, de->de_label);
 }
 
 /*
  * Implementation of VOP_SETLABEL() that relies on extended attributes to
  * store label data.  Can be referenced by filesystems supporting extended
  * attributes.
  */
 int
 vop_stdsetlabel_ea(struct vop_setlabel_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct label *intlabel = ap->a_label;
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "vop_stdsetlabel_ea");
 
 	if ((vp->v_mount->mnt_flag & MNT_MULTILABEL) == 0)
 		return (EOPNOTSUPP);
 
 	error = mac_vnode_setlabel_extattr(ap->a_cred, vp, intlabel);
 	if (error)
 		return (error);
 
 	mac_vnode_relabel(ap->a_cred, vp, intlabel);
 
 	return (0);
 }
 
 int
 vn_setlabel(struct vnode *vp, struct label *intlabel, struct ucred *cred)
 {
 	int error;
 
 	if (vp->v_mount == NULL) {
 		/* printf("vn_setlabel: null v_mount\n"); */
 		if (vp->v_type != VNON)
 			printf("vn_setlabel: null v_mount with non-VNON\n");
 		return (EBADF);
 	}
 
 	if ((vp->v_mount->mnt_flag & MNT_MULTILABEL) == 0)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Multi-phase commit.  First check the policies to confirm the
 	 * change is OK.  Then commit via the filesystem.  Finally, update
 	 * the actual vnode label.
 	 *
 	 * Question: maybe the filesystem should update the vnode at the end
 	 * as part of VOP_SETLABEL()?
 	 */
 	error = mac_vnode_check_relabel(cred, vp, intlabel);
 	if (error)
 		return (error);
 
 	/*
 	 * VADMIN provides the opportunity for the filesystem to make
 	 * decisions about who is and is not able to modify labels and
 	 * protections on files.  This might not be right.  We can't assume
 	 * VOP_SETLABEL() will do it, because we might implement that as part
 	 * of vop_stdsetlabel_ea().
 	 */
 	error = VOP_ACCESS(vp, VADMIN, cred, curthread);
 	if (error)
 		return (error);
 
 	error = VOP_SETLABEL(vp, intlabel, cred, curthread);
 	if (error)
 		return (error);
 
 	return (0);
 }
Index: head/sys/sys/mutex.h
===================================================================
--- head/sys/sys/mutex.h	(revision 258540)
+++ head/sys/sys/mutex.h	(revision 258541)
@@ -1,466 +1,469 @@
 /*-
  * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  *    promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from BSDI $Id: mutex.h,v 2.7.2.35 2000/04/27 03:10:26 cp Exp $
  * $FreeBSD$
  */
 
 #ifndef _SYS_MUTEX_H_
 #define _SYS_MUTEX_H_
 
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
 #ifdef _KERNEL
 #include <sys/pcpu.h>
 #include <sys/lock_profile.h>
 #include <sys/lockstat.h>
 #include <machine/atomic.h>
 #include <machine/cpufunc.h>
 
 /*
  * Mutex types and options passed to mtx_init().  MTX_QUIET and MTX_DUPOK
  * can also be passed in.
  */
 #define	MTX_DEF		0x00000000	/* DEFAULT (sleep) lock */ 
 #define MTX_SPIN	0x00000001	/* Spin lock (disables interrupts) */
 #define MTX_RECURSE	0x00000004	/* Option: lock allowed to recurse */
 #define	MTX_NOWITNESS	0x00000008	/* Don't do any witness checking. */
 #define MTX_NOPROFILE   0x00000020	/* Don't profile this lock */
 
 /*
  * Option flags passed to certain lock/unlock routines, through the use
  * of corresponding mtx_{lock,unlock}_flags() interface macros.
  */
 #define	MTX_QUIET	LOP_QUIET	/* Don't log a mutex event */
 #define	MTX_DUPOK	LOP_DUPOK	/* Don't log a duplicate acquire */
 
 /*
  * State bits kept in mutex->mtx_lock, for the DEFAULT lock type. None of this,
  * with the exception of MTX_UNOWNED, applies to spin locks.
  */
 #define	MTX_RECURSED	0x00000001	/* lock recursed (for MTX_DEF only) */
 #define	MTX_CONTESTED	0x00000002	/* lock contested (for MTX_DEF only) */
 #define MTX_UNOWNED	0x00000004	/* Cookie for free mutex */
 #define	MTX_FLAGMASK	(MTX_RECURSED | MTX_CONTESTED | MTX_UNOWNED)
 
 /*
  * Value stored in mutex->mtx_lock to denote a destroyed mutex.
  */
 #define	MTX_DESTROYED	(MTX_CONTESTED | MTX_UNOWNED)
 
 /*
  * Prototypes
  *
  * NOTE: Functions prepended with `_' (underscore) are exported to other parts
  *	 of the kernel via macros, thus allowing us to use the cpp LOCK_FILE
  *	 and LOCK_LINE or for hiding the lock cookie crunching to the
  *	 consumers. These functions should not be called directly by any
  *	 code using the API. Their macros cover their functionality.
  *	 Functions with a `_' suffix are the entrypoint for the common
  *	 KPI covering both compat shims and fast path case.  These can be
  *	 used by consumers willing to pass options, file and line
  *	 informations, in an option-independent way.
  *
  * [See below for descriptions]
  *
  */
 void	_mtx_init(volatile uintptr_t *c, const char *name, const char *type,
 	    int opts);
 void	_mtx_destroy(volatile uintptr_t *c);
 void	mtx_sysinit(void *arg);
 int	_mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file,
 	    int line);
 void	mutex_init(void);
 void	__mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
 	    const char *file, int line);
 void	__mtx_unlock_sleep(volatile uintptr_t *c, int opts, const char *file,
 	    int line);
 #ifdef SMP
 void	_mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t tid, int opts,
 	    const char *file, int line);
 #endif
 void	__mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file,
 	    int line);
 void	__mtx_unlock_flags(volatile uintptr_t *c, int opts, const char *file,
 	    int line);
 void	__mtx_lock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
 	     int line);
 void	__mtx_unlock_spin_flags(volatile uintptr_t *c, int opts,
 	    const char *file, int line);
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 void	__mtx_assert(const volatile uintptr_t *c, int what, const char *file,
 	    int line);
 #endif
 void	thread_lock_flags_(struct thread *, int, const char *, int);
 
 #define	thread_lock(tdp)						\
 	thread_lock_flags_((tdp), 0, __FILE__, __LINE__)
 #define	thread_lock_flags(tdp, opt)					\
 	thread_lock_flags_((tdp), (opt), __FILE__, __LINE__)
 #define	thread_unlock(tdp)						\
        mtx_unlock_spin((tdp)->td_lock)
 
 /*
  * Top-level macros to provide lock cookie once the actual mtx is passed.
  * They will also prevent passing a malformed object to the mtx KPI by
  * failing compilation as the mtx_lock reserved member will not be found.
  */
 #define	mtx_init(m, n, t, o)						\
 	_mtx_init(&(m)->mtx_lock, n, t, o)
 #define	mtx_destroy(m)							\
 	_mtx_destroy(&(m)->mtx_lock)
 #define	mtx_trylock_flags_(m, o, f, l)					\
 	_mtx_trylock_flags_(&(m)->mtx_lock, o, f, l)
 #define	_mtx_lock_sleep(m, t, o, f, l)					\
 	__mtx_lock_sleep(&(m)->mtx_lock, t, o, f, l)
 #define	_mtx_unlock_sleep(m, o, f, l)					\
 	__mtx_unlock_sleep(&(m)->mtx_lock, o, f, l)
 #ifdef SMP
 #define	_mtx_lock_spin(m, t, o, f, l)					\
 	_mtx_lock_spin_cookie(&(m)->mtx_lock, t, o, f, l)
 #endif
 #define	_mtx_lock_flags(m, o, f, l)					\
 	__mtx_lock_flags(&(m)->mtx_lock, o, f, l)
 #define	_mtx_unlock_flags(m, o, f, l)					\
 	__mtx_unlock_flags(&(m)->mtx_lock, o, f, l)
 #define	_mtx_lock_spin_flags(m, o, f, l)				\
 	__mtx_lock_spin_flags(&(m)->mtx_lock, o, f, l)
 #define	_mtx_unlock_spin_flags(m, o, f, l)				\
 	__mtx_unlock_spin_flags(&(m)->mtx_lock, o, f, l)
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 #define	_mtx_assert(m, w, f, l)						\
 	__mtx_assert(&(m)->mtx_lock, w, f, l)
 #endif
 
 #define	mtx_recurse	lock_object.lo_data
 
 /* Very simple operations on mtx_lock. */
 
 /* Try to obtain mtx_lock once. */
 #define _mtx_obtain_lock(mp, tid)					\
 	atomic_cmpset_acq_ptr(&(mp)->mtx_lock, MTX_UNOWNED, (tid))
 
 /* Try to release mtx_lock if it is unrecursed and uncontested. */
 #define _mtx_release_lock(mp, tid)					\
 	atomic_cmpset_rel_ptr(&(mp)->mtx_lock, (tid), MTX_UNOWNED)
 
 /* Release mtx_lock quickly, assuming we own it. */
 #define _mtx_release_lock_quick(mp)					\
 	atomic_store_rel_ptr(&(mp)->mtx_lock, MTX_UNOWNED)
 
 /*
  * Full lock operations that are suitable to be inlined in non-debug
  * kernels.  If the lock cannot be acquired or released trivially then
  * the work is deferred to another function.
  */
 
 /* Lock a normal mutex. */
 #define __mtx_lock(mp, tid, opts, file, line) do {			\
 	uintptr_t _tid = (uintptr_t)(tid);				\
 									\
 	if (!_mtx_obtain_lock((mp), _tid))				\
 		_mtx_lock_sleep((mp), _tid, (opts), (file), (line));	\
 	else								\
               	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_LOCK_ACQUIRE, \
 		    mp, 0, 0, (file), (line));				\
 } while (0)
 
 /*
  * Lock a spin mutex.  For spinlocks, we handle recursion inline (it
  * turns out that function calls can be significantly expensive on
  * some architectures).  Since spin locks are not _too_ common,
  * inlining this code is not too big a deal.
  */
 #ifdef SMP
 #define __mtx_lock_spin(mp, tid, opts, file, line) do {			\
 	uintptr_t _tid = (uintptr_t)(tid);				\
 									\
 	spinlock_enter();						\
 	if (!_mtx_obtain_lock((mp), _tid)) {				\
 		if ((mp)->mtx_lock == _tid)				\
 			(mp)->mtx_recurse++;				\
 		else							\
 			_mtx_lock_spin((mp), _tid, (opts), (file), (line)); \
 	} else 								\
               	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_SPIN_LOCK_ACQUIRE, \
 		    mp, 0, 0, (file), (line));				\
 } while (0)
 #else /* SMP */
 #define __mtx_lock_spin(mp, tid, opts, file, line) do {			\
 	uintptr_t _tid = (uintptr_t)(tid);				\
 									\
 	spinlock_enter();						\
 	if ((mp)->mtx_lock == _tid)					\
 		(mp)->mtx_recurse++;					\
 	else {								\
 		KASSERT((mp)->mtx_lock == MTX_UNOWNED, ("corrupt spinlock")); \
 		(mp)->mtx_lock = _tid;					\
 	}								\
 } while (0)
 #endif /* SMP */
 
 /* Unlock a normal mutex. */
 #define __mtx_unlock(mp, tid, opts, file, line) do {			\
 	uintptr_t _tid = (uintptr_t)(tid);				\
 									\
+	if ((mp)->mtx_recurse == 0)					\
+		 LOCKSTAT_PROFILE_RELEASE_LOCK(LS_MTX_UNLOCK_RELEASE,	\
+		    (mp));						\
 	if (!_mtx_release_lock((mp), _tid))				\
 		_mtx_unlock_sleep((mp), (opts), (file), (line));	\
 } while (0)
 
 /*
  * Unlock a spin mutex.  For spinlocks, we can handle everything
  * inline, as it's pretty simple and a function call would be too
  * expensive (at least on some architectures).  Since spin locks are
  * not _too_ common, inlining this code is not too big a deal.
  *
  * Since we always perform a spinlock_enter() when attempting to acquire a
  * spin lock, we need to always perform a matching spinlock_exit() when
  * releasing a spin lock.  This includes the recursion cases.
  */
 #ifdef SMP
 #define __mtx_unlock_spin(mp) do {					\
 	if (mtx_recursed((mp)))						\
 		(mp)->mtx_recurse--;					\
 	else {								\
 		LOCKSTAT_PROFILE_RELEASE_LOCK(LS_MTX_SPIN_UNLOCK_RELEASE, \
 			mp);						\
 		_mtx_release_lock_quick((mp));				\
 	}                                                               \
 	spinlock_exit();				                \
 } while (0)
 #else /* SMP */
 #define __mtx_unlock_spin(mp) do {					\
 	if (mtx_recursed((mp)))						\
 		(mp)->mtx_recurse--;					\
 	else {								\
 		LOCKSTAT_PROFILE_RELEASE_LOCK(LS_MTX_SPIN_UNLOCK_RELEASE, \
 			mp);						\
 		(mp)->mtx_lock = MTX_UNOWNED;				\
 	}                                                               \
 	spinlock_exit();						\
 } while (0)
 #endif /* SMP */
 
 /*
  * Exported lock manipulation interface.
  *
  * mtx_lock(m) locks MTX_DEF mutex `m'
  *
  * mtx_lock_spin(m) locks MTX_SPIN mutex `m'
  *
  * mtx_unlock(m) unlocks MTX_DEF mutex `m'
  *
  * mtx_unlock_spin(m) unlocks MTX_SPIN mutex `m'
  *
  * mtx_lock_spin_flags(m, opts) and mtx_lock_flags(m, opts) locks mutex `m'
  *     and passes option flags `opts' to the "hard" function, if required.
  *     With these routines, it is possible to pass flags such as MTX_QUIET
  *     to the appropriate lock manipulation routines.
  *
  * mtx_trylock(m) attempts to acquire MTX_DEF mutex `m' but doesn't sleep if
  *     it cannot. Rather, it returns 0 on failure and non-zero on success.
  *     It does NOT handle recursion as we assume that if a caller is properly
  *     using this part of the interface, he will know that the lock in question
  *     is _not_ recursed.
  *
  * mtx_trylock_flags(m, opts) is used the same way as mtx_trylock() but accepts
  *     relevant option flags `opts.'
  *
  * mtx_initialized(m) returns non-zero if the lock `m' has been initialized.
  *
  * mtx_owned(m) returns non-zero if the current thread owns the lock `m'
  *
  * mtx_recursed(m) returns non-zero if the lock `m' is presently recursed.
  */ 
 /* Flag-less convenience forms: each forwards to its *_flags variant with 0. */
 #define mtx_lock(m)		mtx_lock_flags((m), 0)
 #define mtx_lock_spin(m)	mtx_lock_spin_flags((m), 0)
 #define mtx_trylock(m)		mtx_trylock_flags((m), 0)
 #define mtx_unlock(m)		mtx_unlock_flags((m), 0)
 #define mtx_unlock_spin(m)	mtx_unlock_spin_flags((m), 0)
 
 /* Opaque here: only a forward declaration is provided by this header. */
 struct mtx_pool;
 
 /* Mutex pool routines; only the prototypes are visible in this header. */
 struct mtx_pool *mtx_pool_create(const char *mtx_name, int pool_size, int opts);
 void mtx_pool_destroy(struct mtx_pool **poolp);
 struct mtx *mtx_pool_find(struct mtx_pool *pool, void *ptr);
 struct mtx *mtx_pool_alloc(struct mtx_pool *pool);
 /*
  * Lock or unlock the mutex that mtx_pool_find() returns for (pool, ptr).
  * Each macro performs its own mtx_pool_find() lookup.
  */
 #define mtx_pool_lock(pool, ptr)					\
 	mtx_lock(mtx_pool_find((pool), (ptr)))
 #define mtx_pool_lock_spin(pool, ptr)					\
 	mtx_lock_spin(mtx_pool_find((pool), (ptr)))
 #define mtx_pool_unlock(pool, ptr)					\
 	mtx_unlock(mtx_pool_find((pool), (ptr)))
 #define mtx_pool_unlock_spin(pool, ptr)					\
 	mtx_unlock_spin(mtx_pool_find((pool), (ptr)))
 
 /*
  * mtxpool_lockbuilder is a pool of sleep locks that is not witness
  * checked and should only be used for building higher level locks.
  *
  * mtxpool_sleep is a general purpose pool of sleep mutexes.
  */
 extern struct mtx_pool *mtxpool_lockbuilder;
 extern struct mtx_pool *mtxpool_sleep;
 
 #ifndef LOCK_DEBUG
 #error LOCK_DEBUG not defined, include <sys/lock.h> before <sys/mutex.h>
 #endif
 /*
  * Select the implementation behind the *_flags_() entry points.
  *
  * With LOCK_DEBUG > 0 or MUTEX_NOINLINE defined, every operation goes
  * through the out-of-line _mtx_*_flags() functions.  Otherwise the inline
  * __mtx_*() macros are expanded with curthread as the owner; the inline
  * spin unlock takes only the mutex, so opts/file/line are dropped there.
  */
 #if LOCK_DEBUG > 0 || defined(MUTEX_NOINLINE)
 #define	mtx_lock_flags_(m, opts, file, line)				\
 	_mtx_lock_flags((m), (opts), (file), (line))
 #define	mtx_unlock_flags_(m, opts, file, line)				\
 	_mtx_unlock_flags((m), (opts), (file), (line))
 #define	mtx_lock_spin_flags_(m, opts, file, line)			\
 	_mtx_lock_spin_flags((m), (opts), (file), (line))
 #define	mtx_unlock_spin_flags_(m, opts, file, line)			\
 	_mtx_unlock_spin_flags((m), (opts), (file), (line))
 #else	/* LOCK_DEBUG == 0 && !MUTEX_NOINLINE */
 #define	mtx_lock_flags_(m, opts, file, line)				\
 	__mtx_lock((m), curthread, (opts), (file), (line))
 #define	mtx_unlock_flags_(m, opts, file, line)				\
 	__mtx_unlock((m), curthread, (opts), (file), (line))
 #define	mtx_lock_spin_flags_(m, opts, file, line)			\
 	__mtx_lock_spin((m), curthread, (opts), (file), (line))
 #define	mtx_unlock_spin_flags_(m, opts, file, line)			\
 	__mtx_unlock_spin((m))
 #endif	/* LOCK_DEBUG > 0 || MUTEX_NOINLINE */
 
 /*
  * mtx_assert_() forwards to _mtx_assert() only when INVARIANTS is defined;
  * otherwise it is the no-op expression (void)0.  GIANT_REQUIRED asserts that
  * Giant is held (MA_OWNED) at the point of use, and expands to nothing
  * without INVARIANTS.
  */
 #ifdef INVARIANTS
 #define	mtx_assert_(m, what, file, line)				\
 	_mtx_assert((m), (what), (file), (line))
 
 #define GIANT_REQUIRED	mtx_assert_(&Giant, MA_OWNED, __FILE__, __LINE__)
 
 #else	/* INVARIANTS */
 #define mtx_assert_(m, what, file, line)	(void)0
 #define GIANT_REQUIRED
 #endif	/* INVARIANTS */
 
 /*
  * Caller-facing forms.  The lock operations supply LOCK_FILE/LOCK_LINE to
  * the corresponding *_flags_() macro; mtx_assert() passes __FILE__ and
  * __LINE__ directly.
  */
 #define	mtx_lock_flags(m, opts)						\
 	mtx_lock_flags_((m), (opts), LOCK_FILE, LOCK_LINE)
 #define	mtx_unlock_flags(m, opts)					\
 	mtx_unlock_flags_((m), (opts), LOCK_FILE, LOCK_LINE)
 #define	mtx_lock_spin_flags(m, opts)					\
 	mtx_lock_spin_flags_((m), (opts), LOCK_FILE, LOCK_LINE)
 #define	mtx_unlock_spin_flags(m, opts)					\
 	mtx_unlock_spin_flags_((m), (opts), LOCK_FILE, LOCK_LINE)
 #define mtx_trylock_flags(m, opts)					\
 	mtx_trylock_flags_((m), (opts), LOCK_FILE, LOCK_LINE)
 #define	mtx_assert(m, what)						\
 	mtx_assert_((m), (what), __FILE__, __LINE__)
 
 /*
  * Call _sleep() on `chan' with the mutex's lock_object as the interlock.
  * The trailing arguments are tick_sbt * (timo), 0 and C_HARDCLOCK, so
  * `timo' is presumably a tick count -- confirm against _sleep().
  */
 #define	mtx_sleep(chan, mtx, pri, wmesg, timo)				\
 	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg),		\
 	    tick_sbt * (timo), 0, C_HARDCLOCK)
 
 /* Forwards to lock_initalized() (spelled that way) on the lock_object. */
 #define	mtx_initialized(m)	lock_initalized(&(m)->lock_object)
 
 /* True when mtx_lock, with the MTX_FLAGMASK bits cleared, equals curthread. */
 #define mtx_owned(m)	(((m)->mtx_lock & ~MTX_FLAGMASK) == (uintptr_t)curthread)
 
 /* True when the recursion count is non-zero. */
 #define mtx_recursed(m)	((m)->mtx_recurse != 0)
 
 /* The name stored in the embedded lock_object. */
 #define mtx_name(m)	((m)->lock_object.lo_name)
 
 /*
  * Global locks.
  */
 extern struct mtx Giant;
 extern struct mtx blocked_lock;
 
 /*
  * Giant lock manipulation macros.
  *
  * DROP_GIANT() opens a `do {' block, declares the local counter _giantcnt
  * and, if the current thread owns Giant, saves the WITNESS state and
  * unlocks Giant repeatedly while it is still owned and the scheduler is
  * not stopped, counting each unlock.
  *
  * PICKUP_GIANT() expands PARTIAL_PICKUP_GIANT() and then closes the block
  * with `} while (0)'.  DROP_GIANT() is therefore not a complete statement
  * on its own: it must be paired with PICKUP_GIANT() in the same scope.
  *
  * PARTIAL_PICKUP_GIANT() asserts that Giant is not owned, re-acquires it
  * _giantcnt times and restores the WITNESS state; it does not close the
  * block and is only usable between DROP_GIANT() and PICKUP_GIANT().
  *
  * The #ifndef is to allow lint-like tools to redefine DROP_GIANT.
  */
 #ifndef DROP_GIANT
 #define DROP_GIANT()							\
 do {									\
 	int _giantcnt = 0;						\
 	WITNESS_SAVE_DECL(Giant);					\
 									\
 	if (mtx_owned(&Giant)) {					\
 		WITNESS_SAVE(&Giant.lock_object, Giant);		\
 		for (_giantcnt = 0; mtx_owned(&Giant) &&		\
 		    !SCHEDULER_STOPPED(); _giantcnt++)			\
 			mtx_unlock(&Giant);				\
 	}
 
 #define PICKUP_GIANT()							\
 	PARTIAL_PICKUP_GIANT();						\
 } while (0)
 
 #define PARTIAL_PICKUP_GIANT()						\
 	mtx_assert(&Giant, MA_NOTOWNED);				\
 	if (_giantcnt > 0) {						\
 		while (_giantcnt--)					\
 			mtx_lock(&Giant);				\
 		WITNESS_RESTORE(&Giant.lock_object, Giant);		\
 	}
 #endif
 
 /* Argument bundle that MTX_SYSINIT() hands to mtx_sysinit(). */
 struct mtx_args {
 	void		*ma_mtx;	/* first initializer: (mtx) */
 	const char 	*ma_desc;	/* second initializer: (desc) */
 	int		 ma_opts;	/* third initializer: (opts) */
 };
 
 /*
  * Define a static struct mtx_args named <name>_args and register two hooks
  * at SI_SUB_LOCK / SI_ORDER_MIDDLE: a SYSINIT that calls mtx_sysinit() with
  * that structure, and a SYSUNINIT that calls _mtx_destroy() with the address
  * of the mutex's mtx_lock member.
  */
 #define	MTX_SYSINIT(name, mtx, desc, opts)				\
 	static struct mtx_args name##_args = {				\
 		(mtx),							\
 		(desc),							\
 		(opts)							\
 	};								\
 	SYSINIT(name##_mtx_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    mtx_sysinit, &name##_args);					\
 	SYSUNINIT(name##_mtx_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    _mtx_destroy, __DEVOLATILE(void *, &(mtx)->mtx_lock))
 
 /*
  * The INVARIANTS-enabled mtx_assert() functionality.
  *
  * The constants need to be defined for INVARIANT_SUPPORT infrastructure
  * support as _mtx_assert() itself uses them and the latter implies that
  * _mtx_assert() must build.
  */
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 #define MA_OWNED	LA_XLOCKED
 #define MA_NOTOWNED	LA_UNLOCKED
 #define MA_RECURSED	LA_RECURSED
 #define MA_NOTRECURSED	LA_NOTRECURSED
 #endif
 
 /*
  * Common lock type names.
  */
 #define	MTX_NETWORK_LOCK	"network driver"
 
 #endif	/* _KERNEL */
 #endif	/* _SYS_MUTEX_H_ */
Index: head/sys/sys/rwlock.h
===================================================================
--- head/sys/sys/rwlock.h	(revision 258540)
+++ head/sys/sys/rwlock.h	(revision 258541)
@@ -1,284 +1,288 @@
 /*-
  * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_RWLOCK_H_
 #define _SYS_RWLOCK_H_
 
 #include <sys/_lock.h>
 #include <sys/_rwlock.h>
 #include <sys/lock_profile.h>
 #include <sys/lockstat.h>
 
 #ifdef _KERNEL
 #include <sys/pcpu.h>
 #include <machine/atomic.h>
 #endif
 
 /*
  * The rw_lock field consists of several fields.  The low bit indicates
  * if the lock is locked with a read (shared) or write (exclusive) lock.
  * A value of 0 indicates a write lock, and a value of 1 indicates a read
  * lock.  Bit 1 is a boolean indicating if there are any threads waiting
  * for a read lock.  Bit 2 is a boolean indicating if there are any threads
  * waiting for a write lock.  The rest of the variable's definition is
  * dependent on the value of the first bit.  For a write lock, it is a
  * pointer to the thread holding the lock, similar to the mtx_lock field of
  * mutexes.  For read locks, it is a count of read locks that are held.
  *
  * When the lock is not locked by any thread, it is encoded as a read lock
  * with zero waiters.
  */
 
 /* Flag bits kept in the low four bits of rw_lock. */
 #define	RW_LOCK_READ		(1 << 0)	/* read-locked or unlocked */
 #define	RW_LOCK_READ_WAITERS	(1 << 1)	/* threads wait for a read lock */
 #define	RW_LOCK_WRITE_WAITERS	(1 << 2)	/* threads wait for a write lock */
 #define	RW_LOCK_WRITE_SPINNER	(1 << 3)	/* presumably: a writer is spinning -- confirm */
 #define	RW_LOCK_WAITERS		(RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)
 #define	RW_LOCK_FLAGMASK						\
 	(RW_LOCK_READ | RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER)
 
 /* Everything above the flag bits: owner thread pointer or reader count. */
 #define	RW_READERS_SHIFT	4
 #define	RW_OWNER(x)		((x) & ~RW_LOCK_FLAGMASK)
 #define	RW_READERS(x)		((RW_OWNER(x)) >> RW_READERS_SHIFT)
 #define	RW_READERS_LOCK(x)	(((x) << RW_READERS_SHIFT) | RW_LOCK_READ)
 #define	RW_ONE_READER		(1 << RW_READERS_SHIFT)
 
 /* Unlocked is a read lock with zero readers; destroyed is both waiter bits. */
 #define	RW_UNLOCKED		RW_READERS_LOCK(0)
 #define	RW_DESTROYED		(RW_LOCK_WRITE_WAITERS | RW_LOCK_READ_WAITERS)
 
 #ifdef _KERNEL
 
 #define	rw_recurse	lock_object.lo_data
 
 /* Very simple operations on rw_lock. */
 
 /*
  * Try to obtain a write lock once: a single atomic_cmpset_acq_ptr() that
  * replaces RW_UNLOCKED with `tid'.  Callers treat a zero result as failure.
  */
 #define	_rw_write_lock(rw, tid)						\
 	atomic_cmpset_acq_ptr(&(rw)->rw_lock, RW_UNLOCKED, (tid))
 
 /*
  * Release a write lock quickly if there are no waiters: a single
  * atomic_cmpset_rel_ptr() that replaces exactly `tid' with RW_UNLOCKED, so
  * it cannot succeed while any flag bit is set in the lock word.
  */
 #define	_rw_write_unlock(rw, tid)					\
 	atomic_cmpset_rel_ptr(&(rw)->rw_lock, (tid), RW_UNLOCKED)
 
 /*
  * Full lock operations that are suitable to be inlined in non-debug
  * kernels.  If the lock cannot be acquired or released trivially then
  * the work is deferred to another function.
  */
 
 /*
  * Acquire a write lock.  One compare-and-set attempt is made; on success
  * the LOCKSTAT acquire probe fires, otherwise the work is handed to
  * _rw_wlock_hard().
  */
 #define	__rw_wlock(rw, tid, file, line) do {				\
 	uintptr_t _tid = (uintptr_t)(tid);				\
 									\
 	if (_rw_write_lock((rw), _tid)) {				\
 		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_WLOCK_ACQUIRE, \
 		    rw, 0, 0, (file), (line));				\
 	} else {							\
 		_rw_wlock_hard((rw), _tid, (file), (line));		\
 	}								\
 } while (0)
 
 /* Release a write lock. */
 #define	__rw_wunlock(rw, tid, file, line) do {				\
 	uintptr_t _tid = (uintptr_t)(tid);				\
 									\
 	if ((rw)->rw_recurse)						\
 		(rw)->rw_recurse--;					\
-	else if (!_rw_write_unlock((rw), _tid))				\
-		_rw_wunlock_hard((rw), _tid, (file), (line));		\
+	else {								\
+		LOCKSTAT_PROFILE_RELEASE_LOCK(LS_RW_WUNLOCK_RELEASE,	\
+		    (rw));						\
+		if (!_rw_write_unlock((rw), _tid))			\
+			_rw_wunlock_hard((rw), _tid, (file), (line));	\
+	}								\
 } while (0)
 
 /*
  * Function prototypes.  Routines that start with _ are not part of the
  * external API and should not be called directly.  Wrapper macros should
  * be used instead.
  */
 void	_rw_init_flags(volatile uintptr_t *c, const char *name, int opts);
 void	_rw_destroy(volatile uintptr_t *c);
 void	rw_sysinit(void *arg);
 void	rw_sysinit_flags(void *arg);
 int	_rw_wowned(const volatile uintptr_t *c);
 void	_rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line);
 int	__rw_try_wlock(volatile uintptr_t *c, const char *file, int line);
 void	_rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line);
 void	__rw_rlock(volatile uintptr_t *c, const char *file, int line);
 int	__rw_try_rlock(volatile uintptr_t *c, const char *file, int line);
 void	_rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line);
 void	__rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
 	    int line);
 void	__rw_wunlock_hard(volatile uintptr_t *c, uintptr_t tid,
 	    const char *file, int line);
 int	__rw_try_upgrade(volatile uintptr_t *c, const char *file, int line);
 void	__rw_downgrade(volatile uintptr_t *c, const char *file, int line);
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 void	__rw_assert(const volatile uintptr_t *c, int what, const char *file,
 	    int line);
 #endif
 
 /*
  * Top-level macros to provide lock cookie once the actual rwlock is passed.
  * They will also prevent passing a malformed object to the rwlock KPI by
  * failing compilation as the rw_lock reserved member will not be found.
  */
 #define	rw_init(rw, n)							\
 	_rw_init_flags(&(rw)->rw_lock, n, 0)
 #define	rw_init_flags(rw, n, o)						\
 	_rw_init_flags(&(rw)->rw_lock, n, o)
 #define	rw_destroy(rw)							\
 	_rw_destroy(&(rw)->rw_lock)
 #define	rw_wowned(rw)							\
 	_rw_wowned(&(rw)->rw_lock)
 #define	_rw_wlock(rw, f, l)						\
 	_rw_wlock_cookie(&(rw)->rw_lock, f, l)
 #define	_rw_try_wlock(rw, f, l)						\
 	__rw_try_wlock(&(rw)->rw_lock, f, l)
 #define	_rw_wunlock(rw, f, l)						\
 	_rw_wunlock_cookie(&(rw)->rw_lock, f, l)
 #define	_rw_rlock(rw, f, l)						\
 	__rw_rlock(&(rw)->rw_lock, f, l)
 #define	_rw_try_rlock(rw, f, l)						\
 	__rw_try_rlock(&(rw)->rw_lock, f, l)
 #define	_rw_runlock(rw, f, l)						\
 	_rw_runlock_cookie(&(rw)->rw_lock, f, l)
 #define	_rw_wlock_hard(rw, t, f, l)					\
 	__rw_wlock_hard(&(rw)->rw_lock, t, f, l)
 #define	_rw_wunlock_hard(rw, t, f, l)					\
 	__rw_wunlock_hard(&(rw)->rw_lock, t, f, l)
 #define	_rw_try_upgrade(rw, f, l)					\
 	__rw_try_upgrade(&(rw)->rw_lock, f, l)
 #define	_rw_downgrade(rw, f, l)						\
 	__rw_downgrade(&(rw)->rw_lock, f, l)
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 #define	_rw_assert(rw, w, f, l)						\
 	__rw_assert(&(rw)->rw_lock, w, f, l)
 #endif
 
 
 /*
  * Public interface for lock operations.
  */
 
 #ifndef LOCK_DEBUG
 #error LOCK_DEBUG not defined, include <sys/lock.h> before <sys/rwlock.h>
 #endif
 /*
  * Write lock/unlock call the out-of-line _rw_wlock()/_rw_wunlock() when
  * LOCK_DEBUG > 0 or RWLOCK_NOINLINE is defined; otherwise they expand the
  * inline __rw_wlock()/__rw_wunlock() macros with curthread as the owner.
  */
 #if LOCK_DEBUG > 0 || defined(RWLOCK_NOINLINE)
 #define	rw_wlock(rw)		_rw_wlock((rw), LOCK_FILE, LOCK_LINE)
 #define	rw_wunlock(rw)		_rw_wunlock((rw), LOCK_FILE, LOCK_LINE)
 #else
 #define	rw_wlock(rw)							\
 	__rw_wlock((rw), curthread, LOCK_FILE, LOCK_LINE)
 #define	rw_wunlock(rw)							\
 	__rw_wunlock((rw), curthread, LOCK_FILE, LOCK_LINE)
 #endif
 /* The remaining operations always go through the out-of-line wrappers. */
 #define	rw_rlock(rw)		_rw_rlock((rw), LOCK_FILE, LOCK_LINE)
 #define	rw_runlock(rw)		_rw_runlock((rw), LOCK_FILE, LOCK_LINE)
 #define	rw_try_rlock(rw)	_rw_try_rlock((rw), LOCK_FILE, LOCK_LINE)
 #define	rw_try_upgrade(rw)	_rw_try_upgrade((rw), LOCK_FILE, LOCK_LINE)
 #define	rw_try_wlock(rw)	_rw_try_wlock((rw), LOCK_FILE, LOCK_LINE)
 #define	rw_downgrade(rw)	_rw_downgrade((rw), LOCK_FILE, LOCK_LINE)
 /*
  * Release the lock in whichever mode it is held: a write unlock when
  * rw_wowned() reports ownership, a read unlock otherwise.
  */
 #define	rw_unlock(rw)	do {						\
 	if (!rw_wowned(rw)) {						\
 		rw_runlock(rw);						\
 	} else {							\
 		rw_wunlock(rw);						\
 	}								\
 } while (0)
 /*
  * Call _sleep() on `chan' with the rwlock's lock_object as the interlock.
  * The trailing arguments are tick_sbt * (timo), 0 and C_HARDCLOCK, so
  * `timo' is presumably a tick count -- confirm against _sleep().
  */
 #define	rw_sleep(chan, rw, pri, wmesg, timo)				\
 	_sleep((chan), &(rw)->lock_object, (pri), (wmesg),		\
 	    tick_sbt * (timo), 0, C_HARDCLOCK)
 
 /* Forwards to lock_initalized() (spelled that way) on the lock_object. */
 #define	rw_initialized(rw)	lock_initalized(&(rw)->lock_object)
 
 /* Argument bundle that RW_SYSINIT() hands to rw_sysinit(). */
 struct rw_args {
 	void		*ra_rw;		/* first initializer: (rw) */
 	const char 	*ra_desc;	/* second initializer: (desc) */
 };
 
 /* Argument bundle that RW_SYSINIT_FLAGS() hands to rw_sysinit_flags(). */
 struct rw_args_flags {
 	void		*ra_rw;		/* first initializer: (rw) */
 	const char 	*ra_desc;	/* second initializer: (desc) */
 	int		ra_flags;	/* third initializer: (flags) */
 };
 
 /*
  * Define a static struct rw_args named <name>_args and register two hooks
  * at SI_SUB_LOCK / SI_ORDER_MIDDLE: a SYSINIT that calls rw_sysinit() with
  * that structure, and a SYSUNINIT that calls _rw_destroy() with the address
  * of the lock's rw_lock member.
  */
 #define	RW_SYSINIT(name, rw, desc)					\
 	static struct rw_args name##_args = {				\
 		(rw),							\
 		(desc),							\
 	};								\
 	SYSINIT(name##_rw_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    rw_sysinit, &name##_args);					\
 	SYSUNINIT(name##_rw_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    _rw_destroy, __DEVOLATILE(void *, &(rw)->rw_lock))
 
 
 /*
  * Same as RW_SYSINIT(), but the structure is a struct rw_args_flags that
  * also carries `flags', and the SYSINIT hook is rw_sysinit_flags().
  */
 #define	RW_SYSINIT_FLAGS(name, rw, desc, flags)				\
 	static struct rw_args_flags name##_args = {			\
 		(rw),							\
 		(desc),							\
 		(flags),						\
 	};								\
 	SYSINIT(name##_rw_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    rw_sysinit_flags, &name##_args);				\
 	SYSUNINIT(name##_rw_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    _rw_destroy, __DEVOLATILE(void *, &(rw)->rw_lock))
 
 /*
  * Options passed to rw_init_flags().
  */
 #define	RW_DUPOK	0x01
 #define	RW_NOPROFILE	0x02
 #define	RW_NOWITNESS	0x04
 #define	RW_QUIET	0x08
 #define	RW_RECURSE	0x10
 
 /*
  * The INVARIANTS-enabled rw_assert() functionality.
  *
  * The constants need to be defined for INVARIANT_SUPPORT infrastructure
  * support as _rw_assert() itself uses them and the latter implies that
  * _rw_assert() must build.
  */
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 #define	RA_LOCKED		LA_LOCKED
 #define	RA_RLOCKED		LA_SLOCKED
 #define	RA_WLOCKED		LA_XLOCKED
 #define	RA_UNLOCKED		LA_UNLOCKED
 #define	RA_RECURSED		LA_RECURSED
 #define	RA_NOTRECURSED		LA_NOTRECURSED
 #endif
 
 /*
  * rw_assert(rw, what) checks the lock state through _rw_assert() when
  * INVARIANTS is defined.  Without INVARIANTS it is the no-op expression
  * (void)0 rather than an empty expansion, so that it remains a valid
  * expression/statement wherever it is used.
  */
 #ifdef INVARIANTS
 #define	rw_assert(rw, what)	_rw_assert((rw), (what), LOCK_FILE, LOCK_LINE)
 #else
 #define	rw_assert(rw, what)	(void)0
 #endif
 
 #endif /* _KERNEL */
 #endif /* !_SYS_RWLOCK_H_ */
Index: head/sys/sys/sx.h
===================================================================
--- head/sys/sys/sx.h	(revision 258540)
+++ head/sys/sys/sx.h	(revision 258541)
@@ -1,315 +1,320 @@
 /*-
  * Copyright (c) 2007 Attilio Rao <attilio@freebsd.org>
  * Copyright (c) 2001 Jason Evans <jasone@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified other than the possible 
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef	_SYS_SX_H_
 #define	_SYS_SX_H_
 
 #include <sys/_lock.h>
 #include <sys/_sx.h>
 
 #ifdef	_KERNEL
 #include <sys/pcpu.h>
 #include <sys/lock_profile.h>
 #include <sys/lockstat.h>
 #include <machine/atomic.h>
 #endif
 
 /*
  * In general, the sx locks and rwlocks use very similar algorithms.
  * The main difference in the implementations is how threads are
  * blocked when a lock is unavailable.  For this, sx locks use sleep
  * queues which do not support priority propagation, and rwlocks use
  * turnstiles which do.
  *
  * The sx_lock field consists of several fields.  The low bit
  * indicates if the lock is locked with a shared or exclusive lock.  A
  * value of 0 indicates an exclusive lock, and a value of 1 indicates
  * a shared lock.  Bit 1 is a boolean indicating if there are any
  * threads waiting for a shared lock.  Bit 2 is a boolean indicating
  * if there are any threads waiting for an exclusive lock.  Bit 3 is a
  * boolean indicating if an exclusive lock is recursively held.  The
  * rest of the variable's definition is dependent on the value of the
  * first bit.  For an exclusive lock, it is a pointer to the thread
  * holding the lock, similar to the mtx_lock field of mutexes.  For
  * shared locks, it is a count of read locks that are held.
  *
  * When the lock is not locked by any thread, it is encoded as a
  * shared lock with zero waiters.
  */
 
 /* Flag bits kept in the low four bits of sx_lock. */
 #define	SX_LOCK_SHARED			(1 << 0)	/* shared-locked or unlocked */
 #define	SX_LOCK_SHARED_WAITERS		(1 << 1)	/* threads wait for a shared lock */
 #define	SX_LOCK_EXCLUSIVE_WAITERS	(1 << 2)	/* threads wait for an exclusive lock */
 #define	SX_LOCK_RECURSED		(1 << 3)	/* exclusive lock held recursively */
 #define	SX_LOCK_FLAGMASK						\
 	(SX_LOCK_SHARED | SX_LOCK_SHARED_WAITERS |			\
 	    SX_LOCK_EXCLUSIVE_WAITERS | SX_LOCK_RECURSED)
 
 /* Everything above the flag bits: owner thread pointer or sharer count. */
 #define	SX_SHARERS_SHIFT		4
 #define	SX_OWNER(x)			((x) & ~SX_LOCK_FLAGMASK)
 #define	SX_SHARERS(x)			((SX_OWNER(x)) >> SX_SHARERS_SHIFT)
 #define	SX_SHARERS_LOCK(x)						\
 	(((x) << SX_SHARERS_SHIFT) | SX_LOCK_SHARED)
 #define	SX_ONE_SHARER			(1 << SX_SHARERS_SHIFT)
 
 /* Unlocked is a shared lock with zero sharers; destroyed is both waiter bits. */
 #define	SX_LOCK_UNLOCKED		SX_SHARERS_LOCK(0)
 #define	SX_LOCK_DESTROYED						\
 	(SX_LOCK_EXCLUSIVE_WAITERS | SX_LOCK_SHARED_WAITERS)
 
 #ifdef _KERNEL
 
+#define	sx_recurse	lock_object.lo_data
+
 /*
  * Function prototypes.  Routines that start with an underscore are not part
  * of the public interface and are wrapped with a macro.
  */
 void	sx_sysinit(void *arg);
 #define	sx_init(sx, desc)	sx_init_flags((sx), (desc), 0)
 void	sx_init_flags(struct sx *sx, const char *description, int opts);
 void	sx_destroy(struct sx *sx);
 int	sx_try_slock_(struct sx *sx, const char *file, int line);
 int	sx_try_xlock_(struct sx *sx, const char *file, int line);
 int	sx_try_upgrade_(struct sx *sx, const char *file, int line);
 void	sx_downgrade_(struct sx *sx, const char *file, int line);
 int	_sx_slock(struct sx *sx, int opts, const char *file, int line);
 int	_sx_xlock(struct sx *sx, int opts, const char *file, int line);
 void	_sx_sunlock(struct sx *sx, const char *file, int line);
 void	_sx_xunlock(struct sx *sx, const char *file, int line);
 int	_sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts,
 	    const char *file, int line);
 int	_sx_slock_hard(struct sx *sx, int opts, const char *file, int line);
 void	_sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int
 	    line);
 void	_sx_sunlock_hard(struct sx *sx, const char *file, int line);
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 void	_sx_assert(const struct sx *sx, int what, const char *file, int line);
 #endif
 #ifdef DDB
 int	sx_chain(struct thread *td, struct thread **ownerp);
 #endif
 
 /* Argument bundle that SX_SYSINIT_FLAGS() hands to sx_sysinit(). */
 struct sx_args {
 	struct sx 	*sa_sx;		/* first initializer: (sxa) */
 	const char	*sa_desc;	/* second initializer: (desc) */
 	int		sa_flags;	/* third initializer: (flags) */
 };
 
 /*
  * Define a static struct sx_args named <name>_args and register two hooks
  * at SI_SUB_LOCK / SI_ORDER_MIDDLE: a SYSINIT that calls sx_sysinit() with
  * that structure, and a SYSUNINIT that calls sx_destroy() on (sxa).
  */
 #define	SX_SYSINIT_FLAGS(name, sxa, desc, flags)			\
 	static struct sx_args name##_args = {				\
 		(sxa),							\
 		(desc),							\
 		(flags)							\
 	};								\
 	SYSINIT(name##_sx_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    sx_sysinit, &name##_args);					\
 	SYSUNINIT(name##_sx_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    sx_destroy, (sxa))
 
 /* SX_SYSINIT_FLAGS() with flags fixed at 0. */
 #define	SX_SYSINIT(name, sxa, desc)	SX_SYSINIT_FLAGS(name, sxa, desc, 0)
 
 /*
  * Full lock operations that are suitable to be inlined in non-debug kernels.
  * If the lock can't be acquired or released trivially then the work is
  * deferred to 'tougher' functions.
  */
 
 /*
  * Acquire an exclusive lock.
  *
  * One compare-and-set from SX_LOCK_UNLOCKED to the caller's thread pointer
  * is attempted.  On success the LOCKSTAT acquire probe fires and 0 is
  * returned; otherwise the result of _sx_xlock_hard() is returned.
  */
 static __inline int
 __sx_xlock(struct sx *sx, struct thread *td, int opts, const char *file,
     int line)
 {
 	uintptr_t tid;
 
 	tid = (uintptr_t)td;
 	if (atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) {
 		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_XLOCK_ACQUIRE,
 		    sx, 0, 0, file, line);
 		return (0);
 	}
 
 	return (_sx_xlock_hard(sx, tid, opts, file, line));
 }
 
 /* Release an exclusive lock. */
 static __inline void
 __sx_xunlock(struct sx *sx, struct thread *td, const char *file, int line)
 {
 	uintptr_t tid = (uintptr_t)td;
 
+	if (sx->sx_recurse == 0)
+		LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_XUNLOCK_RELEASE, sx);
 	if (!atomic_cmpset_rel_ptr(&sx->sx_lock, tid, SX_LOCK_UNLOCKED))
 		_sx_xunlock_hard(sx, tid, file, line);
 }
 
 /*
  * Acquire a shared lock.
  *
  * The lock word is sampled once.  If it shows SX_LOCK_SHARED and a
  * compare-and-set adding one sharer succeeds, the LOCKSTAT acquire probe
  * fires and 0 is returned; in every other case the result of
  * _sx_slock_hard() is returned.
  */
 static __inline int
 __sx_slock(struct sx *sx, int opts, const char *file, int line)
 {
 	uintptr_t x;
 
 	x = sx->sx_lock;
 	if ((x & SX_LOCK_SHARED) != 0 &&
 	    atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) {
 		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_SLOCK_ACQUIRE, sx, 0,
 		    0, file, line);
 		return (0);
 	}
 
 	return (_sx_slock_hard(sx, opts, file, line));
 }
 
 /*
  * Release a shared lock.  We can just drop a single shared lock so
  * long as we aren't trying to drop the last shared lock when other
  * threads are waiting for an exclusive lock.  This takes advantage of
  * the fact that an unlocked lock is encoded as a shared lock with a
  * count of 0.
  */
 static __inline void
 __sx_sunlock(struct sx *sx, const char *file, int line)
 {
 	uintptr_t x = sx->sx_lock;
 
+	LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_SUNLOCK_RELEASE, sx);
 	if (x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS) ||
 	    !atomic_cmpset_rel_ptr(&sx->sx_lock, x, x - SX_ONE_SHARER))
 		_sx_sunlock_hard(sx, file, line);
 }
 
 /*
  * Public interface for lock operations.
  *
  * With LOCK_DEBUG > 0 or SX_NOINLINE defined, the *_() macros call the
  * out-of-line _sx_*() functions; otherwise they call the inline __sx_*()
  * functions, passing curthread for the exclusive operations.  The plain
  * lock forms pass 0 as the options and discard the result with a (void)
  * cast; the _sig_ forms pass SX_INTERRUPTIBLE and keep the return value.
  */
 #ifndef LOCK_DEBUG
 #error	"LOCK_DEBUG not defined, include <sys/lock.h> before <sys/sx.h>"
 #endif
 #if	(LOCK_DEBUG > 0) || defined(SX_NOINLINE)
 #define	sx_xlock_(sx, file, line)					\
 	(void)_sx_xlock((sx), 0, (file), (line))
 #define	sx_xlock_sig_(sx, file, line)					\
 	_sx_xlock((sx), SX_INTERRUPTIBLE, (file), (line))
 #define	sx_xunlock_(sx, file, line)					\
 	_sx_xunlock((sx), (file), (line))
 #define	sx_slock_(sx, file, line)					\
 	(void)_sx_slock((sx), 0, (file), (line))
 #define	sx_slock_sig_(sx, file, line)					\
 	_sx_slock((sx), SX_INTERRUPTIBLE, (file) , (line))
 #define	sx_sunlock_(sx, file, line)					\
 	_sx_sunlock((sx), (file), (line))
 #else
 #define	sx_xlock_(sx, file, line)					\
 	(void)__sx_xlock((sx), curthread, 0, (file), (line))
 #define	sx_xlock_sig_(sx, file, line)					\
 	__sx_xlock((sx), curthread, SX_INTERRUPTIBLE, (file), (line))
 #define	sx_xunlock_(sx, file, line)					\
 	__sx_xunlock((sx), curthread, (file), (line))
 #define	sx_slock_(sx, file, line)					\
 	(void)__sx_slock((sx), 0, (file), (line))
 #define	sx_slock_sig_(sx, file, line)					\
 	__sx_slock((sx), SX_INTERRUPTIBLE, (file), (line))
 #define	sx_sunlock_(sx, file, line)					\
 	__sx_sunlock((sx), (file), (line))
 #endif	/* LOCK_DEBUG > 0 || SX_NOINLINE */
 /* Try/upgrade/downgrade forms: supply LOCK_FILE/LOCK_LINE to the *_() function. */
 #define	sx_try_slock(sx)	sx_try_slock_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_try_xlock(sx)	sx_try_xlock_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_try_upgrade(sx)	sx_try_upgrade_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_downgrade(sx)	sx_downgrade_((sx), LOCK_FILE, LOCK_LINE)
 /* sx_assert_() is a real check only with INVARIANTS; otherwise (void)0. */
 #ifdef INVARIANTS
 #define	sx_assert_(sx, what, file, line)				\
 	_sx_assert((sx), (what), (file), (line))
 #else
 #define	sx_assert_(sx, what, file, line)	(void)0
 #endif
 
 /*
  * Caller-facing forms.  The lock operations supply LOCK_FILE/LOCK_LINE;
  * sx_assert() passes __FILE__ and __LINE__ directly.
  */
 #define	sx_xlock(sx)		sx_xlock_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_xlock_sig(sx)	sx_xlock_sig_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_xunlock(sx)		sx_xunlock_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_slock(sx)		sx_slock_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_slock_sig(sx)	sx_slock_sig_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_sunlock(sx)		sx_sunlock_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_assert(sx, what)	sx_assert_((sx), (what), __FILE__, __LINE__)
 
 /*
  * Return a pointer to the owning thread if the lock is exclusively
  * locked.  Evaluates to NULL when SX_LOCK_SHARED is set in the lock word.
  */
 #define	sx_xholder(sx)							\
 	((sx)->sx_lock & SX_LOCK_SHARED ? NULL :			\
 	(struct thread *)SX_OWNER((sx)->sx_lock))
 
 /*
  * True when the lock word equals curthread once every flag bit except
  * SX_LOCK_SHARED has been cleared.  Because SX_LOCK_SHARED is kept in the
  * comparison, a word with that bit set can only match if curthread itself
  * has that bit set.
  */
 #define	sx_xlocked(sx)							\
 	(((sx)->sx_lock & ~(SX_LOCK_FLAGMASK & ~SX_LOCK_SHARED)) ==	\
 	    (uintptr_t)curthread)
 
 /*
  * Release the lock in whichever mode it is held: an exclusive unlock when
  * sx_xlocked() is true for the current thread, a shared unlock otherwise.
  */
 #define	sx_unlock_(sx, file, line) do {					\
 	if (!sx_xlocked(sx)) {						\
 		sx_sunlock_(sx, file, line);				\
 	} else {							\
 		sx_xunlock_(sx, file, line);				\
 	}								\
 } while (0)
 
 /* sx_unlock_() with LOCK_FILE/LOCK_LINE supplied. */
 #define	sx_unlock(sx)	sx_unlock_((sx), LOCK_FILE, LOCK_LINE)
 
 /*
  * Call _sleep() on `chan' with the sx lock's lock_object as the interlock.
  * The trailing arguments are tick_sbt * (timo), 0 and C_HARDCLOCK, so
  * `timo' is presumably a tick count -- confirm against _sleep().
  */
 #define	sx_sleep(chan, sx, pri, wmesg, timo)				\
 	_sleep((chan), &(sx)->lock_object, (pri), (wmesg),		\
 	    tick_sbt * (timo), 0,  C_HARDCLOCK)
 
 /*
  * Options passed to sx_init_flags().
  */
 #define	SX_DUPOK		0x01
 #define	SX_NOPROFILE		0x02
 #define	SX_NOWITNESS		0x04
 #define	SX_QUIET		0x08
 #define	SX_NOADAPTIVE		0x10
 #define	SX_RECURSE		0x20
 
 /*
  * Options passed to sx_*lock_hard().
  */
 #define	SX_INTERRUPTIBLE	0x40
 
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 #define	SA_LOCKED		LA_LOCKED
 #define	SA_SLOCKED		LA_SLOCKED
 #define	SA_XLOCKED		LA_XLOCKED
 #define	SA_UNLOCKED		LA_UNLOCKED
 #define	SA_RECURSED		LA_RECURSED
 #define	SA_NOTRECURSED		LA_NOTRECURSED
 
 /* Backwards compatibility. */
 #define	SX_LOCKED		LA_LOCKED
 #define	SX_SLOCKED		LA_SLOCKED
 #define	SX_XLOCKED		LA_XLOCKED
 #define	SX_UNLOCKED		LA_UNLOCKED
 #define	SX_RECURSED		LA_RECURSED
 #define	SX_NOTRECURSED		LA_NOTRECURSED
 #endif
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_SX_H_ */
Index: head/sys/tools/vnode_if.awk
===================================================================
--- head/sys/tools/vnode_if.awk	(revision 258540)
+++ head/sys/tools/vnode_if.awk	(revision 258541)
@@ -1,435 +1,433 @@
 #!/usr/bin/awk -f
 
 #-
 # Copyright (c) 1992, 1993
 #	The Regents of the University of California.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 # 1. Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 # 4. Neither the name of the University nor the names of its contributors
 #    may be used to endorse or promote products derived from this software
 #    without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 
 #
 #	@(#)vnode_if.sh	8.1 (Berkeley) 6/10/93
 # $FreeBSD$
 #
 # Script to produce VFS front-end sugar.
 #
 # usage: vnode_if.awk <srcfile> [-c | -h | -p | -q]
 #	(where <srcfile> is currently /sys/kern/vnode_if.src)
 #	The source file must have a .src extension
 #
 
 # Print the one-line usage message and terminate with exit status 1.
 function usage()
 {
 	print "usage: vnode_if.awk <srcfile> [-c|-h|-p|-q]";
 	exit 1;
 }
 
 # Report a fatal error and terminate with exit status 1.
 #   msg  - printf format for the message text (callers pass one %s)
 #   what - value substituted into msg
 # The message is prefixed with "<srcfile>(<fnr>): " and written to
 # /dev/stderr.
 function die(msg, what)
 {
 	# Pass srcfile and fnr as arguments rather than as part of the
 	# format, so a '%' in the source file name is printed literally.
 	printf "%s(%s): ", srcfile, fnr > "/dev/stderr";
 	printf msg "\n", what > "/dev/stderr";
 	exit 1;
 }
 
 # Return the type text ready to be followed by an identifier: pointer
 # types (ending in "*") are returned unchanged, all others get one
 # trailing space.
 function t_spc(type)
 {
 	if (type ~ /\*$/)
 		return type;
 	return type " ";
 }
 
 # Output helpers: each one prints s to the file named by the matching
 # global (cfile, hfile, pfile or qfile).
 function printc(s)
 {
 	print s > cfile;
 }
 
 function printh(s)
 {
 	print s > hfile;
 }
 
 function printp(s)
 {
 	print s > pfile;
 }
 
 function printq(s)
 {
 	print s > qfile;
 }
 
 # Emit lock-state assertions for one argument of a vnode operation.
 #   name - operation name used to key lockdata
 #   arg  - argument name; "vpp" is dereferenced with a leading "*"
 #   pos  - lockdata column to consult (callers pass "Entry", "OK" or
 #          "Error")
 #   ind  - indentation prefix for the generated lines
 # Nothing is emitted when no lock state is recorded or when it is "-".
 function add_debug_code(name, arg, pos, ind)
 {
 	star = (arg == "vpp") ? "*" : "";
 	if (!lockdata[name, arg, pos] || lockdata[name, arg, pos] == "-")
 		return;
 
 	printc(ind"ASSERT_VI_UNLOCKED("star"a->a_"arg", \""uname"\");");
 	# Pick the assertion matching the recorded state: L, U or E.
 	# XXX More checks!
 	if (lockdata[name, arg, pos] == "E")
 		printc(ind"ASSERT_VOP_ELOCKED(" star "a->a_"arg", \""uname"\");");
 	else if (lockdata[name, arg, pos] == "U")
 		printc(ind"ASSERT_VOP_UNLOCKED(" star "a->a_"arg", \""uname"\");");
 	else if (lockdata[name, arg, pos] == "L")
 		printc(ind"ASSERT_VOP_LOCKED(" star "a->a_"arg", \""uname"\");");
 }
 
 # Emit a call to the "pre" hook registered for this operation, if any.
 # The hook receives the argument structure pointer a.
 function add_pre(name)
 {
 	if (!lockdata[name, "pre"])
 		return;
 	printc("\t"lockdata[name, "pre"]"(a);");
 }
 
 # Emit a call to the "post" hook registered for this operation, if any.
 # The hook receives the argument structure pointer a and the result rc.
 function add_post(name)
 {
 	if (!lockdata[name, "post"])
 		return;
 	printc("\t"lockdata[name, "post"]"(a, rc);");
 }
 
 # Return a VOPARG_OFFSETOF() expression for the first argument of the
 # current operation whose type is exactly "type", or "VDESC_NO_OFFSET"
 # when there is none.  Reads the globals name, numargs, types[] and
 # args[], and uses the global jj as its index.
 function find_arg_with_type (type)
 {
 	jj = 0;
 	while (jj < numargs) {
 		if (types[jj] == type)
 			return "VOPARG_OFFSETOF(struct " name "_args,a_" args[jj] ")";
 		jj++;
 	}
 
 	return "VDESC_NO_OFFSET";
 }
 
 BEGIN{
 
 # Process the command line
 # Every argument must be either a flag cluster built from the letters
 # c, h, p and q, or a path ending in ".src"; anything else prints the
 # usage text and exits.
 for (i = 1; i < ARGC; i++) {
 	arg = ARGV[i];
 	if (arg !~ /^-[chpq]+$/ && arg !~ /\.src$/)
 		usage();
 	# Each flag letter selects one generated file.
 	if (arg ~ /^-.*c/)
 		cfile = "vnode_if.c";
 	if (arg ~ /^-.*h/)
 		hfile = "vnode_if.h";
 	if (arg ~ /^-.*p/)
 		pfile = "vnode_if_newproto.h";
 	if (arg ~ /^-.*q/)
 		qfile = "vnode_if_typedef.h";
 	# The last .src argument wins.
 	if (arg ~ /\.src$/)
 		srcfile = arg;
 }
 # Presumably resets ARGC so awk does not try to read the arguments as
 # input files; the source is read explicitly with getline below.
 ARGC = 1;
 
 # Nothing to generate if no output was requested.
 if (!cfile && !hfile && !pfile && !qfile)
 	exit 0;
 
 # A source file is mandatory once an output has been selected.
 if (!srcfile)
 	usage();
 
 common_head = \
     "/*\n" \
     " * This file is produced automatically.\n" \
     " * Do not modify anything in here by hand.\n" \
     " *\n" \
     " * Created from $FreeBSD$\n" \
     " */\n" \
     "\n";
 
 if (pfile) {
 	printp(common_head)
 	printp("struct vop_vector {")
 	printp("\tstruct vop_vector\t*vop_default;")
 	printp("\tvop_bypass_t\t*vop_bypass;")
 }
 
 if (qfile) {
 	printq(common_head)
 }
 
 if (hfile) {
 	printh(common_head "extern struct vnodeop_desc vop_default_desc;");
 	printh("#include \"vnode_if_typedef.h\"")
 	printh("#include \"vnode_if_newproto.h\"")
 }
 
 if (cfile) {
 	printc(common_head \
-	    "#include \"opt_kdtrace.h\"\n" \
-	    "\n" \
 	    "#include <sys/param.h>\n" \
 	    "#include <sys/event.h>\n" \
 	    "#include <sys/kernel.h>\n" \
 	    "#include <sys/mount.h>\n" \
 	    "#include <sys/sdt.h>\n" \
 	    "#include <sys/signalvar.h>\n" \
 	    "#include <sys/systm.h>\n" \
 	    "#include <sys/vnode.h>\n" \
 	    "\n" \
 	    "SDT_PROVIDER_DECLARE(vfs);\n" \
 	    "\n" \
 	    "struct vnodeop_desc vop_default_desc = {\n" \
 	    "	\"default\",\n" \
 	    "	0,\n" \
 	    "	(vop_bypass_t *)vop_panic,\n" \
 	    "	NULL,\n" \
 	    "	VDESC_NO_OFFSET,\n" \
 	    "	VDESC_NO_OFFSET,\n" \
 	    "	VDESC_NO_OFFSET,\n" \
 	    "	VDESC_NO_OFFSET,\n" \
 	    "};\n");
 }
 
 while ((getline < srcfile) > 0) {
 	fnr++;
 	if (NF == 0)
 		continue;
 	if ($1 ~ /^%%/) {
 		if (NF != 6 ||
 		    $2 !~ /^[a-z_]+$/  ||  $3 !~ /^[a-z]+$/  ||
 		    $4 !~ /^.$/  ||  $5 !~ /^.$/  ||  $6 !~ /^.$/) {
 			die("Invalid %s construction", "%%");
 			continue;
 		}
 		lockdata["vop_" $2, $3, "Entry"] = $4;
 		lockdata["vop_" $2, $3, "OK"]    = $5;
 		lockdata["vop_" $2, $3, "Error"] = $6;			
 		continue;
 	}
 
 	if ($1 ~ /^%!/) {
 		if (NF != 4 ||
 		    ($3 != "pre" && $3 != "post")) {
 			die("Invalid %s construction", "%!");
 			continue;
 		}
 		lockdata["vop_" $2, $3] = $4;
 		continue;
 	}
 	if ($1 ~ /^#/)
 		continue;
 
 	# Get the function name.
 	name = $1;
 	uname = toupper(name);
 
 	# Get the function arguments.
 	for (numargs = 0; ; ++numargs) {
 		if ((getline < srcfile) <= 0) {
 			die("Unable to read through the arguments for \"%s\"",
 			    name);
 		}
 		fnr++;
 		if ($1 ~ /^\};/)
 			break;
 
 		# Delete comments, if any.
 		gsub (/\/\*.*\*\//, "");
 
 		# Condense whitespace and delete leading/trailing space.
 		gsub(/[[:space:]]+/, " ");
 		sub(/^ /, "");
 		sub(/ $/, "");
 
 		# Pick off direction.
 		if ($1 != "INOUT" && $1 != "IN" && $1 != "OUT")
 			die("No IN/OUT direction for \"%s\".", $0);
 		dirs[numargs] = $1;
 		sub(/^[A-Z]* /, "");
 
 		if ((reles[numargs] = $1) == "WILLRELE")
 			sub(/^[A-Z]* /, "");
 		else
 			reles[numargs] = "WONTRELE";
 
 		# kill trailing ;
 		if (sub(/;$/, "") < 1)
 			die("Missing end-of-line ; in \"%s\".", $0);
 
 		# pick off variable name
 		if ((argp = match($0, /[A-Za-z0-9_]+$/)) < 1)
 			die("Missing var name \"a_foo\" in \"%s\".", $0);
 		args[numargs] = substr($0, argp);
 		$0 = substr($0, 1, argp - 1);
 
 		# what is left must be type
 		# remove trailing space (if any)
 		sub(/ $/, "");
 		types[numargs] = $0;
 	}
 	if (numargs > 4)
 		ctrargs = 4;
 	else
 		ctrargs = numargs;
 	ctrstr = ctrargs "(KTR_VOP, \"VOP\", \"" uname "\", (uintptr_t)a,\n\t    "; 
 	ctrstr = ctrstr "\"" args[0] ":0x%jX\", (uintptr_t)a->a_" args[0];
 	for (i = 1; i < ctrargs; ++i)
 		ctrstr = ctrstr ", \"" args[i] ":0x%jX\", a->a_" args[i];
 	ctrstr = ctrstr ");";
 
 	if (pfile) {
 		printp("\t"name"_t\t*"name";")
 	}
 	if (qfile) {
 		printq("struct "name"_args;")
 		printq("typedef int "name"_t(struct "name"_args *);\n")
 	}
 
 	if (hfile) {
 		# Print out the vop_F_args structure.
 		printh("struct "name"_args {\n\tstruct vop_generic_args a_gen;");
 		for (i = 0; i < numargs; ++i)
 			printh("\t" t_spc(types[i]) "a_" args[i] ";");
 		printh("};");
 		printh("");
 
 		# Print out extern declaration.
 		printh("extern struct vnodeop_desc " name "_desc;");
 		printh("");
 
 		# Print out function prototypes.
 		printh("int " uname "_AP(struct " name "_args *);");
 		printh("int " uname "_APV(struct vop_vector *vop, struct " name "_args *);");
 		printh("");
 		printh("static __inline int " uname "(");
 		for (i = 0; i < numargs; ++i) {
 			printh("\t" t_spc(types[i]) args[i] \
 			    (i < numargs - 1 ? "," : ")"));
 		}
 		printh("{");
 		printh("\tstruct " name "_args a;");
 		printh("");
 		printh("\ta.a_gen.a_desc = &" name "_desc;");
 		for (i = 0; i < numargs; ++i)
 			printh("\ta.a_" args[i] " = " args[i] ";");
 		printh("\treturn (" uname "_APV("args[0]"->v_op, &a));");
 		printh("}");
 
 		printh("");
 	}
 
 	if (cfile) {
 		# Print out the vop_F_vp_offsets structure.  This all depends
 		# on naming conventions and nothing else.
 		printc("static int " name "_vp_offsets[] = {");
 		# as a side effect, figure out the releflags
 		releflags = "";
 		vpnum = 0;
 		for (i = 0; i < numargs; i++) {
 			if (types[i] == "struct vnode *") {
 				printc("\tVOPARG_OFFSETOF(struct " name \
 				    "_args,a_" args[i] "),");
 				if (reles[i] == "WILLRELE") {
 					releflags = releflags \
 					    "|VDESC_VP" vpnum "_WILLRELE";
 				}
 				vpnum++;
 			}
 		}
 
 		sub(/^\|/, "", releflags);
 		printc("\tVDESC_NO_OFFSET");
 		printc("};");
 
 		printc("\n");
 		printc("SDT_PROBE_DEFINE2(vfs, vop, " name ", entry, entry, \"struct vnode *\", \"struct " name "_args *\");\n");
 		printc("SDT_PROBE_DEFINE3(vfs, vop, " name ", return, return, \"struct vnode *\", \"struct " name "_args *\", \"int\");\n");
 
 		# Print out function.
 		printc("\nint\n" uname "_AP(struct " name "_args *a)");
 		printc("{");
 		printc("");
 		printc("\treturn(" uname "_APV(a->a_" args[0] "->v_op, a));");
 		printc("}");
 		printc("\nint\n" uname "_APV(struct vop_vector *vop, struct " name "_args *a)");
 		printc("{");
 		printc("\tint rc;");
 		printc("");
 		printc("\tVNASSERT(a->a_gen.a_desc == &" name "_desc, a->a_" args[0]",");
 		printc("\t    (\"Wrong a_desc in " name "(%p, %p)\", a->a_" args[0]", a));");
 		printc("\twhile(vop != NULL && \\");
 		printc("\t    vop->"name" == NULL && vop->vop_bypass == NULL)")
 		printc("\t\tvop = vop->vop_default;")
 		printc("\tVNASSERT(vop != NULL, a->a_" args[0]", (\"No "name"(%p, %p)\", a->a_" args[0]", a));")
 		printc("\tSDT_PROBE(vfs, vop, " name ", entry, a->a_" args[0] ", a, 0, 0, 0);\n");
 		for (i = 0; i < numargs; ++i)
 			add_debug_code(name, args[i], "Entry", "\t");
 		printc("\tKTR_START" ctrstr);
 		add_pre(name);
 		printc("\tVFS_PROLOGUE(a->a_" args[0]"->v_mount);")
 		printc("\tif (vop->"name" != NULL)")
 		printc("\t\trc = vop->"name"(a);")
 		printc("\telse")
 		printc("\t\trc = vop->vop_bypass(&a->a_gen);")
 		printc("\tVFS_EPILOGUE(a->a_" args[0]"->v_mount);")
 		printc("\tSDT_PROBE(vfs, vop, " name ", return, a->a_" args[0] ", a, rc, 0, 0);\n");
 		printc("\tif (rc == 0) {");
 		for (i = 0; i < numargs; ++i)
 			add_debug_code(name, args[i], "OK", "\t\t");
 		printc("\t} else {");
 		for (i = 0; i < numargs; ++i)
 			add_debug_code(name, args[i], "Error", "\t\t");
 		printc("\t}");
 		add_post(name);
 		printc("\tKTR_STOP" ctrstr);
 		printc("\treturn (rc);");
 		printc("}\n");
 
 		# Print out the vnodeop_desc structure.
 		printc("struct vnodeop_desc " name "_desc = {");
 		# printable name
 		printc("\t\"" name "\",");
 		# flags
 		vppwillrele = "";
 		for (i = 0; i < numargs; i++) {
 			if (types[i] == "struct vnode **" && \
 			    reles[i] == "WILLRELE") {
 				vppwillrele = "|VDESC_VPP_WILLRELE";
 			}
 		}
 
 		if (!releflags)
 			releflags = "0";
 		printc("\t" releflags vppwillrele ",");
 
 		# function to call
 		printc("\t(vop_bypass_t *)" uname "_AP,");
 		# vp offsets
 		printc("\t" name "_vp_offsets,");
 		# vpp (if any)
 		printc("\t" find_arg_with_type("struct vnode **") ",");
 		# cred (if any)
 		printc("\t" find_arg_with_type("struct ucred *") ",");
 		# thread (if any)
 		printc("\t" find_arg_with_type("struct thread *") ",");
 		# componentname
 		printc("\t" find_arg_with_type("struct componentname *") ",");
 		# transport layer information
 		printc("};\n");
 	}
 }
  
 # Close the "struct vop_vector {" definition that was opened in the
 # prototype file before the main loop.
 if (pfile)
 	printp("};")
 
 # Close every output that was selected (including the typedef file),
 # then the input.
 if (hfile)
 	close(hfile);
 if (cfile)
 	close(cfile);
 if (pfile)
 	close(pfile);
 if (qfile)
 	close(qfile);
 close(srcfile);
 
 exit 0;
 
 }
Index: head/sys/x86/x86/local_apic.c
===================================================================
--- head/sys/x86/x86/local_apic.c	(revision 258540)
+++ head/sys/x86/x86/local_apic.c	(revision 258541)
@@ -1,1524 +1,1523 @@
 /*-
  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
  * Copyright (c) 1996, by Steve Passe
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Local APIC support on Pentium and later processors.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_atpic.h"
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/timeet.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <x86/apicreg.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <machine/apicvar.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 
 #ifdef DDB
 #include <sys/interrupt.h>
 #include <ddb/ddb.h>
 #endif
 
 #ifdef __amd64__
 #define	SDT_APIC	SDT_SYSIGT
 #define	SDT_APICT	SDT_SYSIGT
 #define	GSEL_APIC	0
 #else
 #define	SDT_APIC	SDT_SYS386IGT
 #define	SDT_APICT	SDT_SYS386TGT
 #define	GSEL_APIC	GSEL(GCODE_SEL, SEL_KPL)
 #endif
 
 /* Sanity checks on IDT vectors. */
 CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS == APIC_TIMER_INT);
 CTASSERT(APIC_TIMER_INT < APIC_LOCAL_INTS);
 CTASSERT(APIC_LOCAL_INTS == 240);
 CTASSERT(IPI_STOP < APIC_SPURIOUS_INT);
 
 /* Magic IRQ values for the timer and syscalls. */
 #define	IRQ_TIMER	(NUM_IO_INTS + 1)
 #define	IRQ_SYSCALL	(NUM_IO_INTS + 2)
 #define	IRQ_DTRACE_RET	(NUM_IO_INTS + 3)
 #define	IRQ_EVTCHN	(NUM_IO_INTS + 4)
 
 /*
  * Support for local APICs.  Local APICs manage interrupts on each
  * individual processor as opposed to I/O APICs which receive interrupts
  * from I/O devices and then forward them on to the local APICs.
  *
  * Local APICs can also send interrupts to each other thus providing the
  * mechanism for IPIs.
  */
 
 /*
  * Software description of one local vector table (LVT) entry.  lvt_mode()
  * turns it into a register value.
  */
 struct lvt {
 	u_int lvt_edgetrigger:1;	/* 0: lvt_mode() sets APIC_LVT_TM */
 	u_int lvt_activehi:1;	/* 0: lvt_mode() sets APIC_LVT_IIPP_INTALO */
 	u_int lvt_masked:1;	/* non-zero: lvt_mode() sets APIC_LVT_M */
 	u_int lvt_active:1;	/* per-APIC entry is used instead of lvts[] */
 	u_int lvt_mode:16;	/* APIC_LVT_DM_* delivery mode */
 	u_int lvt_vector:8;	/* vector, used for APIC_LVT_DM_FIXED */
 };
 
 /*
  * Per-local-APIC software state.  The static lapics[] array is indexed by
  * APIC ID.
  */
 struct lapic {
 	struct lvt la_lvts[LVT_MAX + 1];	/* per-APIC LVT settings */
 	u_int la_id:8;		/* APIC ID, stored by lapic_create() */
 	u_int la_cluster:4;	/* stored by lapic_set_logical_id() */
 	u_int la_cluster_id:2;	/* stored by lapic_set_logical_id() */
 	u_int la_present:1;	/* set by lapic_create() */
 	u_long *la_timer_count;	/* filled in through intrcnt_add() */
 	u_long la_timer_period;	/* count handed to the timer routines */
 	u_int la_timer_mode;	/* 0 stopped, 1 periodic, 2 one-shot */
 	uint32_t lvt_timer_cache;	/* copy of what lapic_setup() wrote to lvt_timer */
 	/* Include IDT_SYSCALL to make indexing easier. */
 	/* Indexed by (vector - APIC_IO_INTS); lapic_create() fills in -1. */
 	int la_ioint_irqs[APIC_NUM_IOINTS + 1];
 } static lapics[MAX_APIC_ID + 1];
 
 /* Global defaults for local APIC LVT entries. */
 /*
  * Initializer columns follow struct lvt: edgetrigger, activehi, masked,
  * active, mode, vector.  lvt_mode() falls back to these entries whenever
  * the per-APIC entry is not marked lvt_active.
  */
 static struct lvt lvts[LVT_MAX + 1] = {
 	{ 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 },	/* LINT0: masked ExtINT */
 	{ 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 },	/* LINT1: NMI */
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT },	/* Timer */
 	{ 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT },	/* Error */
 	{ 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 },	/* PMC */
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT },	/* Thermal */
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT },	/* CMCI */
 };
 
 /*
  * Interrupt entry points, one per group of 32 vectors as noted beside
  * each entry; the first group has no handler.
  * NOTE(review): the code that indexes this table is outside this view --
  * presumably vector / 32; confirm.
  */
 static inthand_t *ioint_handlers[] = {
 	NULL,			/* 0 - 31 */
 	IDTVEC(apic_isr1),	/* 32 - 63 */
 	IDTVEC(apic_isr2),	/* 64 - 95 */
 	IDTVEC(apic_isr3),	/* 96 - 127 */
 	IDTVEC(apic_isr4),	/* 128 - 159 */
 	IDTVEC(apic_isr5),	/* 160 - 191 */
 	IDTVEC(apic_isr6),	/* 192 - 223 */
 	IDTVEC(apic_isr7),	/* 224 - 255 */
 };
 
 
 /*
  * APIC_TDCR_* constants for the divisors 1, 2, 4, ... 128 in ascending
  * order.
  * NOTE(review): presumably consumed by lapic_timer_set_divisor(), whose
  * body is outside this view; confirm.
  */
 static u_int32_t lapic_timer_divisors[] = {
 	APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
 	APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
 };
 
 extern inthand_t IDTVEC(rsvd);
 
 volatile lapic_t *lapic;
 vm_paddr_t lapic_paddr;
 static u_long lapic_timer_divisor;
 static struct eventtimer lapic_et;
 
 static void	lapic_enable(void);
 static void	lapic_resume(struct pic *pic, bool suspend_cancelled);
 static void	lapic_timer_oneshot(struct lapic *,
 		    u_int count, int enable_int);
 static void	lapic_timer_periodic(struct lapic *,
 		    u_int count, int enable_int);
 static void	lapic_timer_stop(struct lapic *);
 static void	lapic_timer_set_divisor(u_int divisor);
 static uint32_t	lvt_mode(struct lapic *la, u_int pin, uint32_t value);
 static int	lapic_et_start(struct eventtimer *et,
     sbintime_t first, sbintime_t period);
 static int	lapic_et_stop(struct eventtimer *et);
 
 struct pic lapic_pic = { .pic_resume = lapic_resume };
 
 /*
  * Build the register value for one LVT entry.
  *
  * la:    local APIC whose settings apply.  Its la_lvts[pin] entry is used
  *        when it is marked lvt_active, otherwise the global lvts[pin]
  *        default is used.
  * pin:   LVT index, at most LVT_MAX.
  * value: current register contents.  The mask, trigger mode, polarity,
  *        delivery mode and vector fields are replaced; all other bits are
  *        passed through.
  *
  * Returns the new register value.  Panics on an unsupported delivery mode.
  */
 static uint32_t
 lvt_mode(struct lapic *la, u_int pin, uint32_t value)
 {
 	struct lvt *lvt;
 
 	KASSERT(pin <= LVT_MAX, ("%s: pin %u out of range", __func__, pin));
 	if (la->la_lvts[pin].lvt_active)
 		lvt = &la->la_lvts[pin];
 	else
 		lvt = &lvts[pin];
 
 	value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM |
 	    APIC_LVT_VECTOR);
 	/* APIC_LVT_TM set selects level trigger; clear means edge. */
 	if (lvt->lvt_edgetrigger == 0)
 		value |= APIC_LVT_TM;
 	if (lvt->lvt_activehi == 0)
 		value |= APIC_LVT_IIPP_INTALO;
 	if (lvt->lvt_masked)
 		value |= APIC_LVT_M;
 	value |= lvt->lvt_mode;
 	switch (lvt->lvt_mode) {
 	case APIC_LVT_DM_NMI:
 	case APIC_LVT_DM_SMI:
 	case APIC_LVT_DM_INIT:
 	case APIC_LVT_DM_EXTINT:
 		if (!lvt->lvt_edgetrigger) {
 			printf("lapic%u: Forcing LINT%u to edge trigger\n",
 			    la->la_id, pin);
 			/*
 			 * APIC_LVT_TM was set above for the level-triggered
 			 * setting; clear it again so that edge trigger is
 			 * really what gets programmed.
 			 */
 			value &= ~APIC_LVT_TM;
 		}
 		/* Use a vector of 0. */
 		break;
 	case APIC_LVT_DM_FIXED:
 		value |= lvt->lvt_vector;
 		break;
 	default:
 		panic("bad APIC LVT delivery mode: %#x\n", value);
 	}
 	return (value);
 }
 
 /*
  * Map the local APIC and setup necessary interrupt vectors.
  *
  * addr is the physical address of the local APIC registers and must be
  * page aligned.  Besides mapping the registers and installing the IDT
  * entries, this enables the calling CPU's local APIC, records its APIC ID
  * in per-CPU data and registers the "LAPIC" event timer.
  */
 void
 lapic_init(vm_paddr_t addr)
 {
 	u_int regs[4];
 	int i, arat;
 
 	/* Map the local APIC and setup the spurious interrupt handler. */
 	KASSERT(trunc_page(addr) == addr,
 	    ("local APIC not aligned on a page boundary"));
 	lapic_paddr = addr;
 	lapic = pmap_mapdev(addr, sizeof(lapic_t));
 	setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL,
 	    GSEL_APIC);
 
 	/* Perform basic initialization of the BSP's local APIC. */
 	lapic_enable();
 
 	/* Set BSP's per-CPU local APIC ID. */
 	PCPU_SET(apic_id, lapic_id());
 
 	/* Local APIC timer interrupt. */
 	setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* Local APIC error interrupt. */
 	setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* XXX: Thermal interrupt */
 
 	/* Local APIC CMCI. */
 	setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);
 
 	/*
 	 * Register the event timer unless the "clock" resource of "apic"
 	 * unit 0 is present and set to 0.
 	 */
 	if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
 		arat = 0;
 		/* Intel CPUID 0x06 EAX[2] set if APIC timer runs in C3. */
 		if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_high >= 6) {
 			do_cpuid(0x06, regs);
 			if ((regs[0] & CPUTPM1_ARAT) != 0)
 				arat = 1;
 		}
 		bzero(&lapic_et, sizeof(lapic_et));
 		lapic_et.et_name = "LAPIC";
 		lapic_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT |
 		    ET_FLAGS_PERCPU;
 		lapic_et.et_quality = 600;
 		/* Without ARAT, flag the timer C3STOP and lower its quality. */
 		if (!arat) {
 			lapic_et.et_flags |= ET_FLAGS_C3STOP;
 			lapic_et.et_quality -= 200;
 		}
 		/* A frequency of 0 makes lapic_et_start() calibrate first. */
 		lapic_et.et_frequency = 0;
 		/* We don't know frequency yet, so trying to guess. */
 		lapic_et.et_min_period = 0x00001000LL;
 		lapic_et.et_max_period = SBT_1S;
 		lapic_et.et_start = lapic_et_start;
 		lapic_et.et_stop = lapic_et_stop;
 		lapic_et.et_priv = NULL;
 		et_register(&lapic_et);
 	}
 }
 
 /*
  * Create a local APIC instance.
  *
  * apic_id:  ID of the local APIC being registered.  IDs above MAX_APIC_ID
  *           are reported and ignored, except for the boot CPU, where this
  *           panics.
  * boot_cpu: non-zero when the APIC belongs to the boot processor.
  *
  * Fills in the lapics[apic_id] slot with the global LVT defaults and the
  * fixed vector-to-IRQ assignments, then (on SMP kernels) passes the CPU to
  * cpu_add().
  */
 void
 lapic_create(u_int apic_id, int boot_cpu)
 {
 	int i;
 
 	if (apic_id > MAX_APIC_ID) {
 		/* apic_id is unsigned; print it as such. */
 		printf("APIC: Ignoring local APIC with ID %u\n", apic_id);
 		if (boot_cpu)
 			panic("Can't ignore BSP");
 		return;
 	}
 	KASSERT(!lapics[apic_id].la_present, ("duplicate local APIC %u",
 	    apic_id));
 
 	/*
 	 * Assume no local LVT overrides and a cluster of 0 and
 	 * intra-cluster ID of 0.
 	 */
 	lapics[apic_id].la_present = 1;
 	lapics[apic_id].la_id = apic_id;
 	for (i = 0; i <= LVT_MAX; i++) {
 		lapics[apic_id].la_lvts[i] = lvts[i];
 		lapics[apic_id].la_lvts[i].lvt_active = 0;
 	}
 	/* Mark every vector slot free, then claim the reserved ones. */
 	for (i = 0; i <= APIC_NUM_IOINTS; i++)
 		lapics[apic_id].la_ioint_irqs[i] = -1;
 	lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL;
 	lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] =
 	    IRQ_TIMER;
 #ifdef KDTRACE_HOOKS
 	lapics[apic_id].la_ioint_irqs[IDT_DTRACE_RET - APIC_IO_INTS] =
 	    IRQ_DTRACE_RET;
 #endif
 #ifdef XENHVM
 	lapics[apic_id].la_ioint_irqs[IDT_EVTCHN - APIC_IO_INTS] = IRQ_EVTCHN;
 #endif
 
 
 #ifdef SMP
 	cpu_add(apic_id, boot_cpu);
 #endif
 }
 
 /*
  * Dump contents of local APIC registers
  *
  * str is printed after the CPU number as a label for the dump.  The PMC
  * and CMCI LVT registers are printed only when the version register's
  * maximum LVT field says they exist.
  */
 void
 lapic_dump(const char* str)
 {
 	uint32_t maxlvt;
 
 	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	printf("cpu%d %s:\n", PCPU_GET(cpuid), str);
 	printf("     ID: 0x%08x   VER: 0x%08x LDR: 0x%08x DFR: 0x%08x\n",
 	    lapic->id, lapic->version, lapic->ldr, lapic->dfr);
 	printf("  lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
 	    lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr);
 	printf("  timer: 0x%08x therm: 0x%08x err: 0x%08x",
 	    lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error);
 	if (maxlvt >= LVT_PMC)
 		printf(" pmc: 0x%08x", lapic->lvt_pcint);
 	printf("\n");
 	if (maxlvt >= LVT_CMCI)
 		printf("   cmci: 0x%08x\n", lapic->lvt_cmci);
 }
 
 /*
  * Program the calling CPU's local APIC from its lapics[] entry: TPR,
  * spurious vector, LINT0/1, PMC, timer, error and CMCI LVT entries.
  * Runs with interrupts disabled and restores the previous state on exit.
  *
  * boot: non-zero on first-time setup, in which case the per-CPU timer
  *       interrupt counter is also registered.
  */
 void
 lapic_setup(int boot)
 {
 	struct lapic *la;
 	u_int32_t maxlvt;
 	register_t saveintr;
 	char buf[MAXCOMLEN + 1];
 
 	la = &lapics[lapic_id()];
 	KASSERT(la->la_present, ("missing APIC structure"));
 	saveintr = intr_disable();
 	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 
 	/* Initialize the TPR to allow all interrupts. */
 	lapic_set_tpr(0);
 
 	/* Setup spurious vector and enable the local APIC. */
 	lapic_enable();
 
 	/* Program LINT[01] LVT entries. */
 	lapic->lvt_lint0 = lvt_mode(la, LVT_LINT0, lapic->lvt_lint0);
 	lapic->lvt_lint1 = lvt_mode(la, LVT_LINT1, lapic->lvt_lint1);
 
 	/* Program the PMC LVT entry if present. */
 	if (maxlvt >= LVT_PMC)
 		lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint);
 
 	/* Program timer LVT and setup handler. */
 	la->lvt_timer_cache = lapic->lvt_timer =
 	    lvt_mode(la, LVT_TIMER, lapic->lvt_timer);
 	if (boot) {
 		snprintf(buf, sizeof(buf), "cpu%d:timer", PCPU_GET(cpuid));
 		intrcnt_add(buf, &la->la_timer_count);
 	}
 
 	/* Setup the timer if configured. */
 	/* la_timer_mode: 1 restarts periodic mode, other non-zero one-shot. */
 	if (la->la_timer_mode != 0) {
 		KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor",
 		    lapic_id()));
 		lapic_timer_set_divisor(lapic_timer_divisor);
 		if (la->la_timer_mode == 1)
 			lapic_timer_periodic(la, la->la_timer_period, 1);
 		else
 			lapic_timer_oneshot(la, la->la_timer_period, 1);
 	}
 
 	/* Program error LVT and clear any existing errors. */
 	lapic->lvt_error = lvt_mode(la, LVT_ERROR, lapic->lvt_error);
 	lapic->esr = 0;
 
 	/* XXX: Thermal LVT */
 
 	/* Program the CMCI LVT entry if present. */
 	if (maxlvt >= LVT_CMCI)
 		lapic->lvt_cmci = lvt_mode(la, LVT_CMCI, lapic->lvt_cmci);
 	    
 	intr_restore(saveintr);
 }
 
 /*
  * Unmask the PMC LVT entry of the calling CPU's local APIC by clearing
  * APIC_LVT_M.  Does nothing in kernels built without HWPMC_HOOKS.
  */
 void
 lapic_reenable_pmc(void)
 {
 #ifdef HWPMC_HOOKS
 	uint32_t pcint;
 
 	pcint = lapic->lvt_pcint & ~APIC_LVT_M;
 	lapic->lvt_pcint = pcint;
 #endif
 }
 
 #ifdef HWPMC_HOOKS
 /*
  * Recompute and write the PMC LVT entry of the calling CPU's local APIC
  * from the current software settings.  The dummy argument is unused; the
  * signature matches what smp_rendezvous() is given by the callers below.
  */
 static void
 lapic_update_pmc(void *dummy)
 {
 	struct lapic *la;
 
 	la = &lapics[lapic_id()];
 	lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint);
 }
 #endif
 
 /*
  * Unmask the PMC LVT entry in the global defaults and reprogram it.
  *
  * Returns 1 on success.  Returns 0 when the kernel lacks HWPMC_HOOKS, the
  * local APIC is not mapped, or the APIC reports no PMC LVT entry.
  */
 int
 lapic_enable_pmc(void)
 {
 #ifdef HWPMC_HOOKS
 	u_int32_t maxlvt;
 
 	/* Fail if the local APIC is not present. */
 	if (lapic == NULL)
 		return (0);
 
 	/* Fail if the PMC LVT is not present. */
 	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	if (maxlvt < LVT_PMC)
 		return (0);
 
 	lvts[LVT_PMC].lvt_masked = 0;
 
 #ifdef SMP
 	/*
 	 * If hwpmc was loaded at boot time then the APs may not be
 	 * started yet.  In that case, don't forward the request to
 	 * them as they will program the lvt when they start.
 	 */
 	if (smp_started)
 		smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
 	else
 #endif
 		/* Only the calling CPU is updated on this path. */
 		lapic_update_pmc(NULL);
 	return (1);
 #else
 	return (0);
 #endif
 }
 
 /*
  * Mask the PMC LVT entry in the global defaults and reprogram it through
  * smp_rendezvous().  Does nothing when the kernel lacks HWPMC_HOOKS, the
  * local APIC is not mapped, or the APIC reports no PMC LVT entry.
  */
 void
 lapic_disable_pmc(void)
 {
 #ifdef HWPMC_HOOKS
 	u_int32_t maxlvt;
 
 	/* Fail if the local APIC is not present. */
 	if (lapic == NULL)
 		return;
 
 	/* Fail if the PMC LVT is not present. */
 	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	if (maxlvt < LVT_PMC)
 		return;
 
 	lvts[LVT_PMC].lvt_masked = 1;
 
 #ifdef SMP
 	/* The APs should always be started when hwpmc is unloaded. */
 	KASSERT(mp_ncpus == 1 || smp_started, ("hwpmc unloaded too early"));
 #endif
 	smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
 #endif
 }
 
 /*
  * Event timer start method for the local APIC timer on the calling CPU.
  *
  * et:     the event timer; a frequency of 0 triggers calibration first.
  * first:  delay used for one-shot mode (when period is 0).
  * period: non-zero selects periodic mode with this period.
  *
  * Always returns 0.  Panics if no divisor up to 128 lets the calibration
  * run finish without the counter running all the way down.
  */
 static int
 lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
 {
 	struct lapic *la;
 	u_long value;
 
 	la = &lapics[PCPU_GET(apic_id)];
 	if (et->et_frequency == 0) {
 		/* Start off with a divisor of 2 (power on reset default). */
 		lapic_timer_divisor = 2;
 		/* Try to calibrate the local APIC timer. */
 		/*
 		 * Each pass starts a one-shot countdown from the maximum
 		 * count with the interrupt disabled, waits DELAY(1000000)
 		 * and reads how far the counter got.  If it consumed the
 		 * whole range the divisor is doubled and the pass repeated.
 		 */
 		do {
 			lapic_timer_set_divisor(lapic_timer_divisor);
 			lapic_timer_oneshot(la, APIC_TIMER_MAX_COUNT, 0);
 			DELAY(1000000);
 			value = APIC_TIMER_MAX_COUNT - lapic->ccr_timer;
 			if (value != APIC_TIMER_MAX_COUNT)
 				break;
 			lapic_timer_divisor <<= 1;
 		} while (lapic_timer_divisor <= 128);
 		if (lapic_timer_divisor > 128)
 			panic("lapic: Divisor too big");
 		if (bootverbose)
 			printf("lapic: Divisor %lu, Frequency %lu Hz\n",
 			    lapic_timer_divisor, value);
 		/* The count elapsed during the delay becomes the frequency. */
 		et->et_frequency = value;
 		et->et_min_period = (0x00000002LLU << 32) / et->et_frequency;
 		et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency;
 	}
 	/* Timer was stopped: the divisor has to be programmed again. */
 	if (la->la_timer_mode == 0)
 		lapic_timer_set_divisor(lapic_timer_divisor);
 	/*
 	 * Scale the requested time by the frequency and drop the low 32
 	 * bits to obtain the timer count.
 	 */
 	if (period != 0) {
 		la->la_timer_mode = 1;
 		la->la_timer_period = ((uint32_t)et->et_frequency * period) >> 32;
 		lapic_timer_periodic(la, la->la_timer_period, 1);
 	} else {
 		la->la_timer_mode = 2;
 		la->la_timer_period = ((uint32_t)et->et_frequency * first) >> 32;
 		lapic_timer_oneshot(la, la->la_timer_period, 1);
 	}
 	return (0);
 }
 
 /*
  * Event timer stop method: mark the calling CPU's timer as stopped
  * (mode 0) and stop the hardware timer.  Always returns 0.
  */
 static int
 lapic_et_stop(struct eventtimer *et)
 {
 	struct lapic *self;
 
 	self = &lapics[PCPU_GET(apic_id)];
 	self->la_timer_mode = 0;
 	lapic_timer_stop(self);
 	return (0);
 }
 
 /*
  * Software disable the local APIC by clearing APIC_SVR_SWEN in the
  * spurious vector register.
  */
 void
 lapic_disable(void)
 {
 	uint32_t svr;
 
 	svr = lapic->svr & ~APIC_SVR_SWEN;
 	lapic->svr = svr;
 }
 
 /*
  * Enable the local APIC: replace the vector and focus fields of the
  * spurious vector register with APIC_SPURIOUS_INT and set the
  * APIC_SVR_FEN and APIC_SVR_SWEN bits.
  */
 static void
 lapic_enable(void)
 {
 	u_int32_t svr;
 
 	svr = lapic->svr & ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS);
 	lapic->svr = svr | (APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT);
 }
 
 /* Reset the local APIC on the BSP during resume. */
 /*
  * Installed as lapic_pic.pic_resume.  Both arguments are ignored; the
  * APIC is reprogrammed with lapic_setup(0), i.e. without the boot-time
  * registration of the timer interrupt counter.
  */
 static void
 lapic_resume(struct pic *pic, bool suspend_cancelled)
 {
 
 	lapic_setup(0);
 }
 
 /*
  * Return the APIC ID of the calling CPU's local APIC, read from the ID
  * register and shifted down by APIC_ID_SHIFT.  The APIC must already be
  * mapped.
  */
 int
 lapic_id(void)
 {
 
 	KASSERT(lapic != NULL, ("local APIC is not mapped"));
 	return (lapic->id >> APIC_ID_SHIFT);
 }
 
 /*
  * Report whether the given vector is pending in the calling CPU's local
  * APIC.  Returns non-zero if the vector's bit is set in the IRR, zero
  * otherwise.
  */
 int
 lapic_intr_pending(u_int vector)
 {
 	volatile u_int32_t *irr;
 
 	/*
 	 * The IRR registers are an array of 128-bit registers each of
 	 * which only describes 32 interrupts in the low 32 bits.  Thus,
 	 * we divide the vector by 32 to get the 128-bit index.  We then
 	 * multiply that index by 4 to get the equivalent index from
 	 * treating the IRR as an array of 32-bit registers.  Finally, we
 	 * modulus the vector by 32 to determine the individual bit to
 	 * test.
 	 */
 	irr = &lapic->irr0;
 	/* Unsigned constant: a shift count of 31 must not hit a sign bit. */
 	return (irr[(vector / 32) * 4] & (1u << (vector % 32)));
 }
 
 /*
  * Record the logical cluster and intra-cluster ID for an existing local
  * APIC.  Only the software state in lapics[] is updated here.
  */
 void
 lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
 {
 	struct lapic *entry;
 
 	entry = &lapics[apic_id];
 	KASSERT(entry->la_present, ("%s: APIC %u doesn't exist",
 	    __func__, apic_id));
 	KASSERT(cluster <= APIC_MAX_CLUSTER, ("%s: cluster %u too big",
 	    __func__, cluster));
 	KASSERT(cluster_id <= APIC_MAX_INTRACLUSTER_ID,
 	    ("%s: intra cluster id %u too big", __func__, cluster_id));
 	entry->la_cluster = cluster;
 	entry->la_cluster_id = cluster_id;
 }
 
 /*
  * Set the masked flag of an LVT entry.
  *
  * apic_id: APIC_ID_ALL updates the global default; any other value
  *          updates that APIC's own entry and marks it active.
  * pin:     LVT index; values above LVT_MAX yield EINVAL.
  * masked:  new value of the masked flag.
  *
  * Returns 0 on success.  Only software state changes here.
  */
 int
 lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked)
 {
 	struct lvt *entry;
 
 	if (pin > LVT_MAX)
 		return (EINVAL);
 	if (apic_id != APIC_ID_ALL) {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		entry = &lapics[apic_id].la_lvts[pin];
 		entry->lvt_active = 1;
 	} else
 		entry = &lvts[pin];
 	entry->lvt_masked = masked;
 	if (bootverbose) {
 		if (apic_id != APIC_ID_ALL)
 			printf("lapic%u:", apic_id);
 		else
 			printf("lapic:");
 		printf(" LINT%u %s\n", pin, masked ? "masked" : "unmasked");
 	}
 	return (0);
 }
 
 /*
  * Set the delivery mode of an LVT entry.
  *
  * apic_id: APIC_ID_ALL updates the global default; any other value
  *          updates that APIC's own entry and marks it active.
  * pin:     LVT index; values above LVT_MAX yield EINVAL.
  * mode:    APIC_LVT_DM_NMI, _SMI, _INIT or _EXTINT.  Any other value
  *          panics.
  *
  * Returns 0 on success.  Only software state changes here.
  */
 int
 lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode)
 {
 	struct lvt *lvt;
 
 	if (pin > LVT_MAX)
 		return (EINVAL);
 	if (apic_id == APIC_ID_ALL) {
 		lvt = &lvts[pin];
 		if (bootverbose)
 			printf("lapic:");
 	} else {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		lvt = &lapics[apic_id].la_lvts[pin];
 		lvt->lvt_active = 1;
 		if (bootverbose)
 			printf("lapic%u:", apic_id);
 	}
 	lvt->lvt_mode = mode;
 	/*
 	 * The supported modes are all stored as edge triggered and active
 	 * high; ExtINT is left masked, the others unmasked.
 	 */
 	switch (mode) {
 	case APIC_LVT_DM_NMI:
 	case APIC_LVT_DM_SMI:
 	case APIC_LVT_DM_INIT:
 	case APIC_LVT_DM_EXTINT:
 		lvt->lvt_edgetrigger = 1;
 		lvt->lvt_activehi = 1;
 		if (mode == APIC_LVT_DM_EXTINT)
 			lvt->lvt_masked = 1;
 		else
 			lvt->lvt_masked = 0;
 		break;
 	default:
 		panic("Unsupported delivery mode: 0x%x\n", mode);
 	}
 	/* Finish the line started by the "lapic...:" prefix above. */
 	if (bootverbose) {
 		printf(" Routing ");
 		switch (mode) {
 		case APIC_LVT_DM_NMI:
 			printf("NMI");
 			break;
 		case APIC_LVT_DM_SMI:
 			printf("SMI");
 			break;
 		case APIC_LVT_DM_INIT:
 			printf("INIT");
 			break;
 		case APIC_LVT_DM_EXTINT:
 			printf("ExtINT");
 			break;
 		}
 		printf(" -> LINT%u\n", pin);
 	}
 	return (0);
 }
 
 /*
  * Set the polarity of an LVT entry.
  *
  * apic_id: APIC_ID_ALL updates the global default; any other value
  *          updates that APIC's own entry and marks it active.
  * pin:     LVT index; values above LVT_MAX yield EINVAL.
  * pol:     INTR_POLARITY_HIGH selects active high, INTR_POLARITY_CONFORM
  *          yields EINVAL, anything else selects active low.
  *
  * Returns 0 on success.  Only software state changes here.
  */
 int
 lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol)
 {
 	struct lvt *entry;
 
 	if (pin > LVT_MAX)
 		return (EINVAL);
 	if (pol == INTR_POLARITY_CONFORM)
 		return (EINVAL);
 	if (apic_id != APIC_ID_ALL) {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		entry = &lapics[apic_id].la_lvts[pin];
 		entry->lvt_active = 1;
 	} else
 		entry = &lvts[pin];
 	entry->lvt_activehi = (pol == INTR_POLARITY_HIGH);
 	if (bootverbose) {
 		if (apic_id != APIC_ID_ALL)
 			printf("lapic%u:", apic_id);
 		else
 			printf("lapic:");
 		printf(" LINT%u polarity: %s\n", pin,
 		    pol == INTR_POLARITY_HIGH ? "high" : "low");
 	}
 	return (0);
 }
 
 /*
  * Record the trigger mode of LVT entry 'pin' for one local APIC, or in
  * the global default table when apic_id is APIC_ID_ALL.  Returns EINVAL
  * for a pin beyond LVT_MAX or for INTR_TRIGGER_CONFORM, 0 otherwise.
  */
 int
 lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger)
 {
 	struct lvt *entry;
 	int edge;
 
 	if (trigger == INTR_TRIGGER_CONFORM || pin > LVT_MAX)
 		return (EINVAL);
 
 	edge = (trigger == INTR_TRIGGER_EDGE);
 	if (apic_id != APIC_ID_ALL) {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		entry = &lapics[apic_id].la_lvts[pin];
 		entry->lvt_edgetrigger = edge;
 		entry->lvt_active = 1;
 		if (bootverbose)
 			printf("lapic%u:", apic_id);
 	} else {
 		lvts[pin].lvt_edgetrigger = edge;
 		if (bootverbose)
 			printf("lapic:");
 	}
 	if (bootverbose)
 		printf(" LINT%u trigger: %s\n", pin,
 		    edge ? "edge" : "level");
 	return (0);
 }
 
 /*
  * Program the task priority register of the current CPU with 'vector' so
  * that all interrupts below that vector are blocked.
  */
 void
 lapic_set_tpr(u_int vector)
 {
 #ifndef CHEAP_TPR
 	/* Keep every TPR bit that lies outside the priority field. */
 	lapic->tpr = (lapic->tpr & ~APIC_TPR_PRIO) | vector;
 #else
 	/* Cheap variant: overwrite the whole register. */
 	lapic->tpr = vector;
 #endif
 }
 
 /*
  * Signal end-of-interrupt by writing 0 to the local APIC's EOI register.
  */
 void
 lapic_eoi(void)
 {
 
 	lapic->eoi = 0;
 }
 
 void
 lapic_handle_intr(int vector, struct trapframe *frame)
 {
 	struct intsrc *isrc;
 
 	isrc = intr_lookup_source(apic_idt_to_irq(PCPU_GET(apic_id),
 	    vector));
 	intr_execute_handlers(isrc, frame);
 }
 
 /*
  * Handler for the local APIC timer interrupt.  It acknowledges the
  * interrupt, bumps the timer counter of the current CPU's local APIC
  * and, when the lapic event timer is active, runs its event callback
  * with 'frame' installed as the current thread's interrupt frame.
  */
 void
 lapic_handle_timer(struct trapframe *frame)
 {
 	struct lapic *la;
 	struct trapframe *oldframe;
 	struct thread *td;
 
 	/* Send EOI first thing. */
 	lapic_eoi();
 
 #if defined(SMP) && !defined(SCHED_ULE)
 	/*
 	 * Don't do any accounting for the disabled HTT cores, since it
 	 * will provide misleading numbers for the userland.
 	 *
 	 * No locking is necessary here, since even if we lose the race
 	 * when hlt_cpus_mask changes it is not a big deal, really.
 	 *
 	 * Don't do that for ULE, since ULE doesn't consider hlt_cpus_mask
 	 * and unlike other schedulers it actually schedules threads to
 	 * those CPUs.
 	 */
 	if (CPU_ISSET(PCPU_GET(cpuid), &hlt_cpus_mask))
 		return;
 #endif
 
 	/* Look up our local APIC structure for the tick counters. */
 	la = &lapics[PCPU_GET(apic_id)];
 	(*la->la_timer_count)++;
 	critical_enter();
 	if (lapic_et.et_active) {
 		/*
 		 * Run the event timer callback as a nested interrupt: bump
 		 * the nesting level and swap in 'frame', then restore both
 		 * once the callback returns.
 		 */
 		td = curthread;
 		td->td_intr_nesting_level++;
 		oldframe = td->td_intr_frame;
 		td->td_intr_frame = frame;
 		lapic_et.et_event_cb(&lapic_et, lapic_et.et_arg);
 		td->td_intr_frame = oldframe;
 		td->td_intr_nesting_level--;
 	}
 	critical_exit();
 }
 
 /*
  * Program the timer divide configuration register.  'divisor' must be a
  * power of two whose bit position has an entry in lapic_timer_divisors.
  */
 static void
 lapic_timer_set_divisor(u_int divisor)
 {
 	int bit;
 
 	KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor));
 	/* ffs() is 1-based, so the table index is one less. */
 	bit = ffs(divisor);
 	KASSERT(bit <= sizeof(lapic_timer_divisors) / sizeof(u_int32_t),
 	    ("lapic: invalid divisor %u", divisor));
 	lapic->dcr_timer = lapic_timer_divisors[bit - 1];
 }
 
 /*
  * Arm the local APIC timer in one-shot mode with the given initial
  * count.  The LVT value starts from la's cached timer LVT; the mask bit
  * is cleared only when 'enable_int' is non-zero.
  */
 static void
 lapic_timer_oneshot(struct lapic *la, u_int count, int enable_int)
 {
 	u_int32_t timer_lvt;
 
 	timer_lvt = (la->lvt_timer_cache & ~APIC_LVTT_TM) |
 	    APIC_LVTT_TM_ONE_SHOT;
 	if (enable_int)
 		timer_lvt &= ~APIC_LVT_M;
 	/* Program the mode first, then load the initial count. */
 	lapic->lvt_timer = timer_lvt;
 	lapic->icr_timer = count;
 }
 
 /*
  * Arm the local APIC timer in periodic mode with the given initial
  * count.  The LVT value starts from la's cached timer LVT; the mask bit
  * is cleared only when 'enable_int' is non-zero.
  */
 static void
 lapic_timer_periodic(struct lapic *la, u_int count, int enable_int)
 {
 	u_int32_t timer_lvt;
 
 	timer_lvt = (la->lvt_timer_cache & ~APIC_LVTT_TM) |
 	    APIC_LVTT_TM_PERIODIC;
 	if (enable_int)
 		timer_lvt &= ~APIC_LVT_M;
 	/* Program the mode first, then load the initial count. */
 	lapic->lvt_timer = timer_lvt;
 	lapic->icr_timer = count;
 }
 
 /*
  * Stop timer interrupts by writing la's cached timer LVT back with the
  * timer-mode bits cleared and the mask bit set.
  */
 static void
 lapic_timer_stop(struct lapic *la)
 {
 	u_int32_t timer_lvt;
 
 	timer_lvt = (la->lvt_timer_cache & ~APIC_LVTT_TM) | APIC_LVT_M;
 	lapic->lvt_timer = timer_lvt;
 }
 
 /*
  * Handler for the CMC interrupt: acknowledge it at the local APIC, then
  * hand off to cmc_intr().
  */
 void
 lapic_handle_cmc(void)
 {
 
 	lapic_eoi();
 	cmc_intr();
 }
 
 /*
  * Called from the mca_init() to activate the CMC interrupt if this CPU is
  * responsible for monitoring any MC banks for CMC events.  Since mca_init()
  * is called prior to lapic_setup() during boot, this just needs to unmask
  * this CPU's LVT_CMCI entry.
  */
 void
 lapic_enable_cmc(void)
 {
 	u_int apic_id;
 
 #ifdef DEV_ATPIC
 	if (lapic == NULL)
 		return;
 #endif
 	apic_id = PCPU_GET(apic_id);
 	KASSERT(lapics[apic_id].la_present,
 	    ("%s: missing APIC %u", __func__, apic_id));
 	lapics[apic_id].la_lvts[LVT_CMCI].lvt_masked = 0;
 	lapics[apic_id].la_lvts[LVT_CMCI].lvt_active = 1;
 	if (bootverbose)
 		printf("lapic%u: CMCI unmasked\n", apic_id);
 }
 
 /*
  * Handler for the local APIC error interrupt: latch and read the error
  * status register, report its value on the console and acknowledge the
  * interrupt.
  */
 void
 lapic_handle_error(void)
 {
 	u_int32_t esr;
 
 	/*
 	 * Read the contents of the error status register.  Write to
 	 * the register first before reading from it to force the APIC
 	 * to update its value to indicate any errors that have
 	 * occurred since the previous write to the register.
 	 */
 	lapic->esr = 0;
 	esr = lapic->esr;
 
 	printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr);
 	lapic_eoi();
 }
 
 /*
  * Map a local APIC ID to a CPU ID.  SMP kernels look it up in
  * apic_cpuids[]; other kernels always return 0.
  */
 u_int
 apic_cpuid(u_int apic_id)
 {
 	u_int cpu;
 
 #ifndef SMP
 	cpu = 0;
 #else
 	cpu = apic_cpuids[apic_id];
 #endif
 	return (cpu);
 }
 
 /*
  * Request a free IDT vector to be used by the specified IRQ on the given
  * local APIC.  Returns the allocated IDT vector, or 0 when every slot is
  * already taken.
  */
 u_int
 apic_alloc_vector(u_int apic_id, u_int irq)
 {
 	u_int allocated, slot;
 
 	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
 
 	/* Simple first-fit scan; a slot holding -1 is free. */
 	allocated = 0;
 	mtx_lock_spin(&icu_lock);
 	for (slot = 0; slot < APIC_NUM_IOINTS; slot++) {
 		if (lapics[apic_id].la_ioint_irqs[slot] == -1) {
 			lapics[apic_id].la_ioint_irqs[slot] = irq;
 			allocated = slot + APIC_IO_INTS;
 			break;
 		}
 	}
 	mtx_unlock_spin(&icu_lock);
 	return (allocated);
 }
 
 /*
  * Request 'count' free contiguous IDT vectors to be used by the 'count'
  * IRQs listed in 'irqs'.  'count' must be a power of two and the first
  * vector slot is aligned on a boundary of 'align' (itself a power of two
  * no smaller than 'count').  Returns the first IDT vector, or 0 if the
  * request cannot be satisfied.
  */
 u_int
 apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
 {
 	u_int base, i, len, slot;
 
 	KASSERT(powerof2(count), ("bad count"));
 	KASSERT(powerof2(align), ("bad align"));
 	KASSERT(align >= count, ("align < count"));
 #ifdef INVARIANTS
 	for (i = 0; i < count; i++)
 		KASSERT(irqs[i] < NUM_IO_INTS, ("Invalid IRQ %u at index %u",
 		    irqs[i], i));
 #endif
 
 	/*
 	 * First-fit search, as in apic_alloc_vector(): 'base' is the
 	 * start of the current run of free slots and 'len' its length.
 	 */
 	len = 0;
 	base = 0;
 	mtx_lock_spin(&icu_lock);
 	for (slot = 0; slot < APIC_NUM_IOINTS; slot++) {
 
 		/* A slot in use ends the current run. */
 		if (lapics[apic_id].la_ioint_irqs[slot] != -1) {
 			len = 0;
 			base = 0;
 			continue;
 		}
 
 		/* A new run may only begin on an aligned slot. */
 		if (len == 0) {
 			if ((slot & (align - 1)) != 0)
 				continue;
 			base = slot;
 		}
 
 		/* Extend the run; keep scanning until it is long enough. */
 		if (++len < count)
 			continue;
 
 		/* Run found: bind the IRQs and return the first vector. */
 		for (i = 0; i < count; i++)
 			lapics[apic_id].la_ioint_irqs[base + i] = irqs[i];
 		mtx_unlock_spin(&icu_lock);
 		return (base + APIC_IO_INTS);
 	}
 	mtx_unlock_spin(&icu_lock);
 	printf("APIC: Couldn't find APIC vectors for %u IRQs\n", count);
 	return (0);
 }
 
 /*
  * Enable a vector for a particular apic_id.  Since all lapics share idt
  * entries and ioint_handlers this enables the vector on all lapics.  lapics
  * which do not have the vector configured would report spurious interrupts
  * should it fire.
  *
  * The 'apic_id' argument is not referenced by the body.
  */
 void
 apic_enable_vector(u_int apic_id, u_int vector)
 {
 
 	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
 	KASSERT(ioint_handlers[vector / 32] != NULL,
 	    ("No ISR handler for vector %u", vector));
 #ifdef KDTRACE_HOOKS
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
 	/* One handler entry covers each group of 32 vectors. */
 	setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL,
 	    GSEL_APIC);
 }
 
 /*
  * Counterpart of apic_enable_vector().  At present this only sanity
  * checks 'vector'; resetting the IDT entry is compiled out unless
  * 'notyet' is defined.  The 'apic_id' argument is not referenced.
  */
 void
 apic_disable_vector(u_int apic_id, u_int vector)
 {
 
 	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
 #ifdef KDTRACE_HOOKS
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
 	KASSERT(ioint_handlers[vector / 32] != NULL,
 	    ("No ISR handler for vector %u", vector));
 #ifdef notyet
 	/*
 	 * We can not currently clear the idt entry because other cpus
 	 * may have a valid vector at this offset.
 	 */
 	setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
 #endif
 }
 
 /*
  * Release an APIC vector when it's no longer in use.  'irq' must be the
  * IRQ currently recorded for 'vector' on local APIC 'apic_id'; the slot
  * is reset to -1 (free) under icu_lock.
  */
 void
 apic_free_vector(u_int apic_id, u_int vector, u_int irq)
 {
 	struct thread *td;
 
 	KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
 	    vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
 	    ("Vector %u does not map to an IRQ line", vector));
 	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
 	KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] ==
 	    irq, ("IRQ mismatch"));
 #ifdef KDTRACE_HOOKS
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
 
 	/*
 	 * Bind us to the cpu that owned the vector before freeing it so
 	 * we don't lose an interrupt delivery race.
 	 */
 	td = curthread;
 	/* The bind/unbind steps are skipped while the system is rebooting. */
 	if (!rebooting) {
 		thread_lock(td);
 		if (sched_is_bound(td))
 			panic("apic_free_vector: Thread already bound.\n");
 		sched_bind(td, apic_cpuid(apic_id));
 		thread_unlock(td);
 	}
 	mtx_lock_spin(&icu_lock);
 	lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1;
 	mtx_unlock_spin(&icu_lock);
 	if (!rebooting) {
 		thread_lock(td);
 		sched_unbind(td);
 		thread_unlock(td);
 	}
 }
 
 /*
  * Map an IDT vector (APIC) to an IRQ (interrupt source) using the table
  * of the given local APIC.  A slot holding a negative value is reported
  * as IRQ 0.
  */
 u_int
 apic_idt_to_irq(u_int apic_id, u_int vector)
 {
 	int mapped;
 
 	KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
 	    vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
 	    ("Vector %u does not map to an IRQ line", vector));
 #ifdef KDTRACE_HOOKS
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
 	mapped = lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS];
 	return (mapped < 0 ? 0 : mapped);
 }
 
 #ifdef DDB
 /*
  * DDB command that dumps the IDT vector to IRQ mappings of every present
  * local APIC.  The "v" and "vv" modifiers request increasingly detailed
  * output for IRQs that have an interrupt source.
  */
 DB_SHOW_COMMAND(apic, db_show_apic)
 {
 	struct intsrc *src;
 	u_int apic_id, irq;
 	int slot, verbose;
 
 	if (strcmp(modif, "v") == 0)
 		verbose = 1;
 	else if (strcmp(modif, "vv") == 0)
 		verbose = 2;
 	else
 		verbose = 0;
 	for (apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) {
 		if (!lapics[apic_id].la_present)
 			continue;
 		db_printf("Interrupts bound to lapic %u\n", apic_id);
 		for (slot = 0;
 		    slot < APIC_NUM_IOINTS + 1 && !db_pager_quit; slot++) {
 			irq = lapics[apic_id].la_ioint_irqs[slot];
 
 			/* Skip free slots and the reserved pseudo IRQs. */
 			if (irq == -1 || irq == IRQ_SYSCALL)
 				continue;
 #ifdef KDTRACE_HOOKS
 			if (irq == IRQ_DTRACE_RET)
 				continue;
 #endif
 #ifdef XENHVM
 			if (irq == IRQ_EVTCHN)
 				continue;
 #endif
 			db_printf("vec 0x%2x -> ", slot + APIC_IO_INTS);
 			if (irq == IRQ_TIMER) {
 				db_printf("lapic timer\n");
 				continue;
 			}
 			if (irq >= NUM_IO_INTS) {
 				db_printf("IRQ %u ???\n", irq);
 				continue;
 			}
 			src = intr_lookup_source(irq);
 			if (src != NULL && verbose != 0)
 				db_dump_intr_event(src->is_event,
 				    verbose == 2);
 			else
 				db_printf("IRQ %u\n", irq);
 		}
 	}
 }
 
 /*
  * Print the bits that are set in the 32-bit word 'v'.  Bit i is shown as
  * the two-digit hex number 'base' + i.  The line is introduced once by
  * "prefix:" and ends with a newline; nothing at all is printed when no
  * bit is set.
  */
 static void
 dump_mask(const char *prefix, uint32_t v, int base)
 {
 	int i, first;
 
 	first = 1;
 	for (i = 0; i < 32; i++)
 		/*
 		 * Shift an unsigned one: shifting a signed int into the
 		 * sign bit (i == 31) is undefined behaviour.
 		 */
 		if (v & ((uint32_t)1 << i)) {
 			if (first) {
 				db_printf("%s:", prefix);
 				first = 0;
 			}
 			db_printf(" %02x", base + i);
 		}
 	if (!first)
 		db_printf("\n");
 }
 
 /*
  * Show info from the lapic regs for this CPU: the APIC ID, version,
  * max LVT, spurious vector register and TPR, followed by the set bits of
  * the ISR, TMR and IRR register banks.
  */
 DB_SHOW_COMMAND(lapic, db_show_lapic)
 {
 	uint32_t v;
 
 	db_printf("lapic ID = %d\n", lapic_id());
 	v = lapic->version;
 	db_printf("version  = %d.%d\n", (v & APIC_VER_VERSION) >> 4,
 	    v & 0xf);
 	db_printf("max LVT  = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT);
 	v = lapic->svr;
 	db_printf("SVR      = %02x (%s)\n", v & APIC_SVR_VECTOR,
 	    v & APIC_SVR_ENABLE ? "enabled" : "disabled");
 	db_printf("TPR      = %02x\n", lapic->tpr);
 
 /*
  * dump_field(isr, 3) prints register lapic->isr3 under the label "isr3",
  * numbering its bits from 3 * 32.
  */
 #define dump_field(prefix, index)					\
 	dump_mask(__XSTRING(prefix ## index), lapic->prefix ## index,	\
 	    index * 32)
 
 	db_printf("In-service Interrupts:\n");
 	dump_field(isr, 0);
 	dump_field(isr, 1);
 	dump_field(isr, 2);
 	dump_field(isr, 3);
 	dump_field(isr, 4);
 	dump_field(isr, 5);
 	dump_field(isr, 6);
 	dump_field(isr, 7);
 
 	db_printf("TMR Interrupts:\n");
 	dump_field(tmr, 0);
 	dump_field(tmr, 1);
 	dump_field(tmr, 2);
 	dump_field(tmr, 3);
 	dump_field(tmr, 4);
 	dump_field(tmr, 5);
 	dump_field(tmr, 6);
 	dump_field(tmr, 7);
 
 	db_printf("IRR Interrupts:\n");
 	dump_field(irr, 0);
 	dump_field(irr, 1);
 	dump_field(irr, 2);
 	dump_field(irr, 3);
 	dump_field(irr, 4);
 	dump_field(irr, 5);
 	dump_field(irr, 6);
 	dump_field(irr, 7);
 
 #undef dump_field
 }
 #endif
 
 /*
  * APIC probing support code.  This includes code to manage enumerators.
  */
 
 /* List of every enumerator passed to apic_register_enumerator(). */
 static SLIST_HEAD(, apic_enumerator) enumerators =
 	SLIST_HEAD_INITIALIZER(enumerators);
 /* Enumerator selected by apic_init(); NULL when none was usable. */
 static struct apic_enumerator *best_enum;
 
 /*
  * Add an enumerator to the head of the list probed by apic_init().
  * INVARIANTS kernels panic if the same enumerator is registered twice.
  */
 void
 apic_register_enumerator(struct apic_enumerator *enumerator)
 {
 #ifdef INVARIANTS
 	struct apic_enumerator *cur;
 
 	SLIST_FOREACH(cur, &enumerators, apic_next)
 		if (cur == enumerator)
 			panic("%s: Duplicate register of %s", __func__,
 			    enumerator->apic_name);
 #endif
 	SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next);
 }
 
 /*
  * Select the best registered enumerator and use it to probe the CPUs.
  * This has to run very, very early because certain subsystems want to
  * know how many CPUs we have extremely early on in the boot process.
  */
 static void
 apic_init(void *dummy __unused)
 {
 	struct apic_enumerator *candidate;
 #ifndef __amd64__
 	uint64_t msr;
 #endif
 	int score, top;
 
 	/* We only support built in local APICs. */
 	if ((cpu_feature & CPUID_APIC) == 0)
 		return;
 
 	/* Don't probe if APIC mode is disabled. */
 	if (resource_disabled("apic", 0))
 		return;
 
 	/*
 	 * Probe every enumerator.  A positive probe result disqualifies
 	 * it; among the rest the highest result wins.
 	 */
 	best_enum = NULL;
 	top = 0;
 	SLIST_FOREACH(candidate, &enumerators, apic_next) {
 		score = candidate->apic_probe();
 		if (score > 0)
 			continue;
 		if (best_enum != NULL && score <= top)
 			continue;
 		best_enum = candidate;
 		top = score;
 	}
 	if (best_enum == NULL) {
 		if (bootverbose)
 			printf("APIC: Could not find any APICs.\n");
 #ifndef DEV_ATPIC
 		panic("running without device atpic requires a local APIC");
 #endif
 		return;
 	}
 
 	if (bootverbose)
 		printf("APIC: Using the %s enumerator.\n",
 		    best_enum->apic_name);
 
 #ifndef __amd64__
 	/*
 	 * To work around an errata, we disable the local APIC on some
 	 * CPUs during early startup.  We need to turn the local APIC back
 	 * on on such CPUs now.
 	 */
 	if (cpu == CPU_686 && cpu_vendor_id == CPU_VENDOR_INTEL &&
 	    (cpu_id & 0xff0) == 0x610) {
 		msr = rdmsr(MSR_APICBASE);
 		wrmsr(MSR_APICBASE, msr | APICBASE_ENABLED);
 	}
 #endif
 
 	/* Probe the CPUs in the system. */
 	score = best_enum->apic_probe_cpus();
 	if (score != 0)
 		printf("%s: Failed to probe CPUs: returned %d\n",
 		    best_enum->apic_name, score);
 }
 SYSINIT(apic_init, SI_SUB_TUNABLES - 1, SI_ORDER_SECOND, apic_init, NULL);
 
 /*
  * Setup the local APIC through the selected enumerator.  We have to do
  * this prior to starting up the APs in the SMP case.  Does nothing when
  * no enumerator was selected.
  */
 static void
 apic_setup_local(void *dummy __unused)
 {
 	int error;
 
 	if (best_enum != NULL) {
 		/* Initialize the local APIC. */
 		error = best_enum->apic_setup_local();
 		if (error != 0)
 			printf(
 			    "%s: Failed to setup the local APIC: returned %d\n",
 			    best_enum->apic_name, error);
 	}
 }
 SYSINIT(apic_setup_local, SI_SUB_CPU, SI_ORDER_SECOND, apic_setup_local, NULL);
 
 /*
  * Setup the I/O APICs through the selected enumerator.  Except on XEN
  * kernels this also registers the local APIC as a PIC beforehand and,
  * afterwards, finishes the local APIC setup and calls msi_init().
  * Does nothing when no enumerator was selected.
  */
 static void
 apic_setup_io(void *dummy __unused)
 {
 	int retval;
 
 	if (best_enum == NULL)
 		return;
 
 	/*
 	 * Local APIC must be registered before other PICs and pseudo PICs
 	 * for proper suspend/resume order.
 	 */
 #ifndef XEN
 	intr_register_pic(&lapic_pic);
 #endif
 
 	retval = best_enum->apic_setup_io();
 	if (retval != 0)
 		printf("%s: Failed to setup I/O APICs: returned %d\n",
 		    best_enum->apic_name, retval);
 #ifdef XEN
 	/* XEN kernels stop here; the statements below are never reached. */
 	return;
 #endif
 	/*
 	 * Finish setting up the local APIC on the BSP once we know how to
 	 * properly program the LINT pins.
 	 */
 	lapic_setup(1);
 	if (bootverbose)
 		lapic_dump("BSP");
 
 	/* Enable the MSI "pic". */
 	msi_init();
 }
 SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_SECOND, apic_setup_io, NULL);
 
 #ifdef SMP
 /*
  * Inter Processor Interrupt functions.  The lapic_ipi_*() functions are
  * private to the MD code.  The public interface for the rest of the
  * kernel is defined in mp_machdep.c.
  */
 /*
  * Poll the ICR until its delivery status reads idle.  Returns 1 once it
  * is idle, or 0 if 'delay' polling iterations elapsed first.
  */
 int
 lapic_ipi_wait(int delay)
 {
 	int spins, step;
 
 	/*
 	 * Wait delay loops for IPI to be sent.  This is highly bogus
 	 * since this is sensitive to CPU clock speed.  If delay is
 	 * -1, we wait forever: the counter then never advances.
 	 */
 	step = 1;
 	if (delay == -1) {
 		step = 0;
 		delay = 1;
 	}
 	spins = 0;
 	while (spins < delay) {
 		if ((lapic->icr_lo & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE)
 			return (1);
 		ia32_pause();
 		spins += step;
 	}
 	return (0);
 }
 
 /*
  * Send an IPI by programming the interrupt command register directly.
  * 'icrlo' supplies the non-reserved bits of the low ICR word; 'dest' is
  * written to the destination field of the high word only when 'icrlo'
  * selects destination-field addressing.  The whole sequence runs with
  * interrupts disabled.
  */
 void
 lapic_ipi_raw(register_t icrlo, u_int dest)
 {
 	register_t value, saveintr;
 
 	/* XXX: Need more sanity checking of icrlo? */
 	KASSERT(lapic != NULL, ("%s called too early", __func__));
 	KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
 	    ("%s: invalid dest field", __func__));
 	KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0,
 	    ("%s: reserved bits set in ICR LO register", __func__));
 
 	/* Set destination in ICR HI register if it is being used. */
 	saveintr = intr_disable();
 	if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) {
 		value = lapic->icr_hi;
 		value &= ~APIC_ID_MASK;
 		value |= dest << APIC_ID_SHIFT;
 		lapic->icr_hi = value;
 	}
 
 	/* Program the contents of the IPI and dispatch it. */
 	value = lapic->icr_lo;
 	/* Carry over only the reserved bits of the current register value. */
 	value &= APIC_ICRLO_RESV_MASK;
 	value |= icrlo;
 	lapic->icr_lo = value;
 	intr_restore(saveintr);
 }
 
 /*
  * Iteration counts handed to lapic_ipi_wait() by lapic_ipi_vectored():
  * BEFORE_SPIN while waiting for an earlier IPI before sending, and
  * AFTER_SPIN (DETECT_DEADLOCK only) while waiting after sending.
  */
 #define	BEFORE_SPIN	1000000
 #ifdef DETECT_DEADLOCK
 #define	AFTER_SPIN	1000
 #endif
 
 /*
  * Send the IPI 'vector' to 'dest', which is either one of the
  * APIC_IPI_DEST_* shorthands or a destination value placed in the ICR
  * destination field.  Waits for a previously sent IPI to finish first;
  * if that wait times out the function panics, or returns without
  * sending when a panic is already in progress.
  */
 void
 lapic_ipi_vectored(u_int vector, int dest)
 {
 	register_t icrlo, destfield;
 
 	KASSERT((vector & ~APIC_VECTOR_MASK) == 0,
 	    ("%s: invalid vector %d", __func__, vector));
 
 	icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE;
 
 	/*
 	 * IPI_STOP_HARD is just a "fake" vector used to send a NMI.
 	 * Use special rules regard NMI if passed, otherwise specify
 	 * the vector.
 	 */
 	if (vector == IPI_STOP_HARD)
 		icrlo |= APIC_DELMODE_NMI | APIC_LEVEL_ASSERT;
 	else
 		icrlo |= vector | APIC_DELMODE_FIXED | APIC_LEVEL_DEASSERT;
 	destfield = 0;
 	switch (dest) {
 	case APIC_IPI_DEST_SELF:
 		icrlo |= APIC_DEST_SELF;
 		break;
 	case APIC_IPI_DEST_ALL:
 		icrlo |= APIC_DEST_ALLISELF;
 		break;
 	case APIC_IPI_DEST_OTHERS:
 		icrlo |= APIC_DEST_ALLESELF;
 		break;
 	default:
 		/* Not a shorthand: use 'dest' as the destination field. */
 		KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
 		    ("%s: invalid destination 0x%x", __func__, dest));
 		destfield = dest;
 	}
 
 	/* Wait for an earlier IPI to finish. */
 	if (!lapic_ipi_wait(BEFORE_SPIN)) {
 		if (panicstr != NULL)
 			return;
 		else
 			panic("APIC: Previous IPI is stuck");
 	}
 
 	lapic_ipi_raw(icrlo, destfield);
 
 #ifdef DETECT_DEADLOCK
 	/* Wait for IPI to be delivered. */
 	if (!lapic_ipi_wait(AFTER_SPIN)) {
 #ifdef needsattention
 		/*
 		 * XXX FIXME:
 		 *
 		 * The above function waits for the message to actually be
 		 * delivered.  It breaks out after an arbitrary timeout
 		 * since the message should eventually be delivered (at
 		 * least in theory) and that if it wasn't we would catch
 		 * the failure with the check above when the next IPI is
 		 * sent.
 		 *
 		 * We could skip this wait entirely, EXCEPT it probably
 		 * protects us from other routines that assume that the
 		 * message was delivered and acted upon when this function
 		 * returns.
 		 */
 		printf("APIC: IPI might be stuck\n");
 #else /* !needsattention */
 		/* Wait until message is sent without a timeout. */
 		while (lapic->icr_lo & APIC_DELSTAT_PEND)
 			ia32_pause();
 #endif /* needsattention */
 	}
 #endif /* DETECT_DEADLOCK */
 }
 #endif /* SMP */