Index: projects/runtime-coverage-v2/share/man/man4/ng_eiface.4
===================================================================
--- projects/runtime-coverage-v2/share/man/man4/ng_eiface.4	(revision 347598)
+++ projects/runtime-coverage-v2/share/man/man4/ng_eiface.4	(revision 347599)
@@ -1,122 +1,122 @@
 .\" Copyright (c) 2004 Gleb Smirnoff
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd February 3, 2005
+.Dd May 14, 2019
 .Dt NG_EIFACE 4
 .Os
 .Sh NAME
 .Nm ng_eiface
 .Nd "generic Ethernet interface netgraph node type"
 .Sh SYNOPSIS
 .In netgraph/ng_eiface.h
 .Sh DESCRIPTION
 The
 .Vt eiface
 netgraph node implements the generic Ethernet interface.
-When
+When an
 .Vt eiface
 node is created, a new interface appears which is accessible via
 .Xr ifconfig 8 .
 These interfaces are named
 .Dq Li ngeth0 ,
 .Dq Li ngeth1 ,
 etc.
 When a node is shut down, the corresponding interface is removed,
 and the interface name becomes available for reuse by future
 .Vt eiface
 nodes.
 New nodes always take the first unused interface.
 .Sh HOOKS
 An
 .Vt eiface
 node has a single hook named
 .Va ether ,
 which should be connected to the
 Ethernet downstream, for example, to the
 .Xr ng_vlan 4
 node.
 Packets transmitted via the interface flow out this hook.
 Similarly, packets received on the hook go to the protocol stack as
 packets received by any real Ethernet interface.
 .Sh CONTROL MESSAGES
 This node type supports the generic control messages, plus the following:
 .Bl -tag -width foo
 .It Dv NGM_EIFACE_SET Pq Ic set
 Set link-level address of the interface.
 Requires
 .Vt "struct ether_addr"
 as an argument.
 This message also has an
 .Tn ASCII
 version, called
 .Dq Li set ,
 which requires as an argument an
 .Tn ASCII
 string consisting of 6 colon-separated hex digits.
 .It Dv NGM_EIFACE_GET_IFNAME Pq Ic getifname
 Return the name of the associated interface as a
-.Dv NUL Ns -terminated
+.Dv NULL Ns -terminated
 .Tn ASCII
 string.
 .It Dv NGM_EIFACE_GET_IFADDRS
 Return the list of link-level addresses associated with the node.
 .El
 .Sh SHUTDOWN
 This node shuts down upon receipt of a
 .Dv NGM_SHUTDOWN
 control message.
 The associated interface is removed and its name becomes
 available for reuse by future
 .Vt eiface
 nodes.
 .Pp
 Unlike most other node types, an
 .Vt eiface
 node does
 .Em not
-go away when all hooks have been disconnected; rather, and explicit
+go away when all hooks have been disconnected; rather, an explicit
 .Dv NGM_SHUTDOWN
 control message is required.
 .Sh SEE ALSO
 .Xr netgraph 4 ,
 .Xr ng_ether 4 ,
 .Xr ng_iface 4 ,
 .Xr ng_vlan 4 ,
 .Xr ifconfig 8 ,
 .Xr ngctl 8
 .Sh HISTORY
 The
 .Vt eiface
 node type was implemented in
 .Fx 4.6 .
 .Sh AUTHORS
 .An -nosplit
 The
 .Vt eiface
 node type was written by
 .An Vitaly V Belekhov .
 This manual page was written by
 .An Gleb Smirnoff .
Index: projects/runtime-coverage-v2/sys/amd64/amd64/exception.S
===================================================================
--- projects/runtime-coverage-v2/sys/amd64/amd64/exception.S	(revision 347598)
+++ projects/runtime-coverage-v2/sys/amd64/amd64/exception.S	(revision 347599)
@@ -1,1324 +1,1326 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * Copyright (c) 2007-2018 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by A. Joseph Koshy under
  * sponsorship from the FreeBSD Foundation and Google, Inc.
  *
  * Portions of this software were developed by
  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_atpic.h"
 #include "opt_hwpmc_hooks.h"
 
 #include "assym.inc"
 
 #include <machine/psl.h>
 #include <machine/asmacros.h>
 #include <machine/trap.h>
 #include <machine/specialreg.h>
 
 #ifdef KDTRACE_HOOKS
 	.bss
 	.globl	dtrace_invop_jump_addr
 	.align	8
 	.type	dtrace_invop_jump_addr,@object
 	.size	dtrace_invop_jump_addr,8
 dtrace_invop_jump_addr:
 	.zero	8
 	.globl	dtrace_invop_calltrap_addr
 	.align	8
 	.type	dtrace_invop_calltrap_addr,@object
 	.size	dtrace_invop_calltrap_addr,8
 dtrace_invop_calltrap_addr:
 	.zero	8
 #endif
 	.text
 #ifdef HWPMC_HOOKS
 	ENTRY(start_exceptions)
 #endif
 
 /*****************************************************************************/
 /* Trap handling                                                             */
 /*****************************************************************************/
 /*
  * Trap and fault vector routines.
  *
  * All traps are 'interrupt gates', SDT_SYSIGT.  An interrupt gate pushes
  * state on the stack but also disables interrupts.  This is important for
  * us for the use of the swapgs instruction.  We cannot be interrupted
  * until the GS.base value is correct.  For most traps, we automatically
  * then enable interrupts if the interrupted context had them enabled.
  * This is equivalent to the i386 port's use of SDT_SYS386TGT.
  *
  * The cpu will push a certain amount of state onto the kernel stack for
  * the current process.  See amd64/include/frame.h.
  * This includes the current RFLAGS (status register, which includes
  * the interrupt disable state prior to the trap), the code segment register,
  * and the return instruction pointer are pushed by the cpu.  The cpu
  * will also push an 'error' code for certain traps.  We push a dummy
  * error code for those traps where the cpu doesn't in order to maintain
  * a consistent frame.  We also push a contrived 'trap number'.
  *
  * The CPU does not push the general registers, so we must do that, and we
  * must restore them prior to calling 'iret'.  The CPU adjusts %cs and %ss
  * but does not mess with %ds, %es, %gs or %fs.  We swap the %gs base for
  * for the kernel mode operation shortly, without changes to the selector
  * loaded.  Since superuser long mode works with any selectors loaded into
  * segment registers other then %cs, which makes them mostly unused in long
  * mode, and kernel does not reference %fs, leave them alone.  The segment
  * registers are reloaded on return to the usermode.
  */
 
 MCOUNT_LABEL(user)
 MCOUNT_LABEL(btrap)
 
 /* Traps that we leave interrupts disabled for. */
 	.macro	TRAP_NOEN	l, trapno
 	PTI_ENTRY	\l,X\l
 	.globl	X\l
 	.type	X\l,@function
 X\l:	subq $TF_RIP,%rsp
 	movl $\trapno,TF_TRAPNO(%rsp)
 	movq $0,TF_ADDR(%rsp)
 	movq $0,TF_ERR(%rsp)
 	jmp alltraps_noen
 	.endm
 
 	TRAP_NOEN	bpt, T_BPTFLT
 #ifdef KDTRACE_HOOKS
 	TRAP_NOEN	dtrace_ret, T_DTRACE_RET
 #endif
 
 /* Regular traps; The cpu does not supply tf_err for these. */
 	.macro	TRAP	l, trapno
 	PTI_ENTRY	\l,X\l
 	.globl	X\l
 	.type	X\l,@function
 X\l:
 	subq $TF_RIP,%rsp
 	movl $\trapno,TF_TRAPNO(%rsp)
 	movq $0,TF_ADDR(%rsp)
 	movq $0,TF_ERR(%rsp)
 	jmp alltraps
 	.endm
 
 	TRAP	div, T_DIVIDE
 	TRAP	ofl, T_OFLOW
 	TRAP	bnd, T_BOUND
 	TRAP	ill, T_PRIVINFLT
 	TRAP	dna, T_DNA
 	TRAP	fpusegm, T_FPOPFLT
 	TRAP	rsvd, T_RESERVED
 	TRAP	fpu, T_ARITHTRAP
 	TRAP	xmm, T_XMMFLT
 
 /* This group of traps have tf_err already pushed by the cpu. */
 	.macro	TRAP_ERR	l, trapno
 	PTI_ENTRY	\l,X\l,has_err=1
 	.globl	X\l
 	.type	X\l,@function
 X\l:
 	subq $TF_ERR,%rsp
 	movl $\trapno,TF_TRAPNO(%rsp)
 	movq $0,TF_ADDR(%rsp)
 	jmp alltraps
 	.endm
 
 	TRAP_ERR	tss, T_TSSFLT
 	TRAP_ERR	align, T_ALIGNFLT
 
 	/*
 	 * alltraps entry point.  Use swapgs if this is the first time in the
 	 * kernel from userland.  Reenable interrupts if they were enabled
 	 * before the trap.  This approximates SDT_SYS386TGT on the i386 port.
 	 */
 	SUPERALIGN_TEXT
 	.globl	alltraps
 	.type	alltraps,@function
 alltraps:
 	movq	%rdi,TF_RDI(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	1f		/* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
 1:	SAVE_SEGS
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	2f
 	call	handle_ibrs_entry
 2:	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	alltraps_pushregs_no_rax
 	sti
 alltraps_pushregs_no_rax:
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
 	movq	%r11,TF_R11(%rsp)
 	movq	%r12,TF_R12(%rsp)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	pushfq
 	andq	$~(PSL_D | PSL_AC),(%rsp)
 	popfq
 	FAKE_MCOUNT(TF_RIP(%rsp))
 #ifdef KDTRACE_HOOKS
 	/*
 	 * DTrace Function Boundary Trace (fbt) probes are triggered
 	 * by int3 (0xcc) which causes the #BP (T_BPTFLT) breakpoint
 	 * interrupt. For all other trap types, just handle them in
 	 * the usual way.
 	 */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jnz	calltrap		/* ignore userland traps */
 	cmpl	$T_BPTFLT,TF_TRAPNO(%rsp)
 	jne	calltrap
 
 	/* Check if there is no DTrace hook registered. */
 	cmpq	$0,dtrace_invop_jump_addr
 	je	calltrap
 
 	/*
 	 * Set our jump address for the jump back in the event that
 	 * the breakpoint wasn't caused by DTrace at all.
 	 */
 	movq	$calltrap,dtrace_invop_calltrap_addr(%rip)
 
 	/* Jump to the code hooked in by DTrace. */
 	jmpq	*dtrace_invop_jump_addr
 #endif
 	.globl	calltrap
 	.type	calltrap,@function
 calltrap:
 	movq	%rsp,%rdi
 	call	trap_check
 	MEXITCOUNT
 	jmp	doreti			/* Handle any pending ASTs */
 
 	/*
 	 * alltraps_noen entry point.  Unlike alltraps above, we want to
 	 * leave the interrupts disabled.  This corresponds to
 	 * SDT_SYS386IGT on the i386 port.
 	 */
 	SUPERALIGN_TEXT
 	.globl	alltraps_noen
 	.type	alltraps_noen,@function
 alltraps_noen:
 	movq	%rdi,TF_RDI(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	1f /* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
 1:	SAVE_SEGS
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	alltraps_pushregs_no_rax
 	call	handle_ibrs_entry
 	jmp	alltraps_pushregs_no_rax
 
 IDTVEC(dblfault)
 	subq	$TF_ERR,%rsp
 	movl	$T_DOUBLEFLT,TF_TRAPNO(%rsp)
 	movq	$0,TF_ADDR(%rsp)
 	movq	$0,TF_ERR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
 	movq	%r11,TF_R11(%rsp)
 	movq	%r12,TF_R12(%rsp)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
 	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	pushfq
 	andq	$~(PSL_D | PSL_AC),(%rsp)
 	popfq
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	1f			/* already running with kernel GS.base */
 	swapgs
 1:
 	movq	PCPU(KCR3),%rax
 	cmpq	$~0,%rax
 	je	2f
 	movq	%rax,%cr3
 2:	movq	%rsp,%rdi
 	call	dblfault_handler
 3:	hlt
 	jmp	3b
 
 	ALIGN_TEXT
 IDTVEC(page_pti)
 	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp)
 	jz	Xpage
 	swapgs
 	pushq	%rax
 	movq	%cr3,%rax
 	movq	%rax,PCPU(SAVED_UCR3)
 	cmpq	$~0,PCPU(UCR3)
 	jne	1f
 	popq	%rax
 	jmp	2f
 1:	pushq	%rdx
 	PTI_UUENTRY has_err=1
 2:	subq	$TF_ERR,%rsp
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	jmp	page_u
 IDTVEC(page)
 	subq	$TF_ERR,%rsp
 	movq	%rdi,TF_RDI(%rsp)	/* free up GP registers */
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	page_cr2		/* already running with kernel GS.base */
 	swapgs
 page_u:	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
 	movq	PCPU(SAVED_UCR3),%rax
 	movq	%rax,PCB_SAVED_UCR3(%rdi)
 	call	handle_ibrs_entry
 page_cr2:
 	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
 	movq	%rdi,TF_ADDR(%rsp)	/* enabling interrupts. */
 	SAVE_SEGS
 	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	alltraps_pushregs_no_rax
 	sti
 	jmp	alltraps_pushregs_no_rax
 
 	/*
 	 * We have to special-case this one.  If we get a trap in doreti() at
 	 * the iretq stage, we'll reenter with the wrong gs state.  We'll have
 	 * to do a special the swapgs in this case even coming from the kernel.
 	 * XXX linux has a trap handler for their equivalent of load_gs().
 	 *
 	 * On the stack, we have the hardware interrupt frame to return
 	 * to usermode (faulted) and another frame with error code, for
 	 * fault.  For PTI, copy both frames to the main thread stack.
 	 * Handle the potential 16-byte alignment adjustment incurred
 	 * during the second fault by copying both frames independently
 	 * while unwinding the stack in between.
 	 */
 	.macro PROTF_ENTRY name,trapno
 \name\()_pti_doreti:
 	swapgs
 	cmpq	$~0,PCPU(UCR3)
 	je	1f
 	pushq	%rax
 	pushq	%rdx
 	movq	PCPU(KCR3),%rax
 	movq	%rax,%cr3
 	movq	PCPU(RSP0),%rax
 	subq	$2*PTI_SIZE-3*8,%rax /* no err, %rax, %rdx in faulted frame */
 	MOVE_STACKS	(PTI_SIZE / 8)
 	addq	$PTI_SIZE,%rax
 	movq	PTI_RSP(%rsp),%rsp
 	MOVE_STACKS	(PTI_SIZE / 8 - 3)
 	subq	$PTI_SIZE,%rax
 	movq	%rax,%rsp
 	popq	%rdx
 	popq	%rax
 1:	swapgs
 	jmp	X\name
 IDTVEC(\name\()_pti)
 	cmpq	$doreti_iret,PTI_RIP-2*8(%rsp)
 	je	\name\()_pti_doreti
 	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */
 	jz	X\name
 	PTI_UENTRY has_err=1
 	swapgs
 IDTVEC(\name)
 	subq	$TF_ERR,%rsp
 	movl	$\trapno,TF_TRAPNO(%rsp)
 	jmp	prot_addrf
 	.endm
 
 	PROTF_ENTRY	missing, T_SEGNPFLT
 	PROTF_ENTRY	stk, T_STKFLT
 	PROTF_ENTRY	prot, T_PROTFLT
 
 prot_addrf:
 	movq	$0,TF_ADDR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	movw	%fs,TF_FS(%rsp)
 	movw	%gs,TF_GS(%rsp)
 	leaq	doreti_iret(%rip),%rdi
 	cmpq	%rdi,TF_RIP(%rsp)
 	je	5f			/* kernel but with user gsbase!! */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	6f			/* already running with kernel GS.base */
 	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
 	jz	2f
 	cmpw	$KUF32SEL,TF_FS(%rsp)
 	jne	1f
 	rdfsbase %rax
 1:	cmpw	$KUG32SEL,TF_GS(%rsp)
 	jne	2f
 	rdgsbase %rdx
 2:	swapgs
 	movq	PCPU(CURPCB),%rdi
 	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
 	jz	4f
 	cmpw	$KUF32SEL,TF_FS(%rsp)
 	jne	3f
 	movq	%rax,PCB_FSBASE(%rdi)
 3:	cmpw	$KUG32SEL,TF_GS(%rsp)
 	jne	4f
 	movq	%rdx,PCB_GSBASE(%rdi)
 4:	call	handle_ibrs_entry
 	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* always full iret from GPF */
 	movw	%es,TF_ES(%rsp)
 	movw	%ds,TF_DS(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	alltraps_pushregs_no_rax
 	sti
 	jmp	alltraps_pushregs_no_rax
 
 5:	swapgs
 6:	movq	PCPU(CURPCB),%rdi
 	jmp	4b
 
 /*
  * Fast syscall entry point.  We enter here with just our new %cs/%ss set,
  * and the new privilige level.  We are still running on the old user stack
  * pointer.  We have to juggle a few things around to find our stack etc.
  * swapgs gives us access to our PCPU space only.
  *
  * We do not support invoking this from a custom segment registers,
  * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT.
  */
 	SUPERALIGN_TEXT
 IDTVEC(fast_syscall_pti)
 	swapgs
 	movq	%rax,PCPU(SCRATCH_RAX)
 	cmpq	$~0,PCPU(UCR3)
 	je	fast_syscall_common
 	movq	PCPU(KCR3),%rax
 	movq	%rax,%cr3
 	jmp	fast_syscall_common
 	SUPERALIGN_TEXT
 IDTVEC(fast_syscall)
 	swapgs
 	movq	%rax,PCPU(SCRATCH_RAX)
 fast_syscall_common:
 	movq	%rsp,PCPU(SCRATCH_RSP)
 	movq	PCPU(RSP0),%rsp
 	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
 	subq	$TF_SIZE,%rsp
 	/* defer TF_RSP till we have a spare register */
 	movq	%r11,TF_RFLAGS(%rsp)
 	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
 	movq	PCPU(SCRATCH_RSP),%r11	/* %r11 already saved */
 	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
 	movq	PCPU(SCRATCH_RAX),%rax
 	/*
 	 * Save a few arg registers early to free them for use in
 	 * handle_ibrs_entry().  %r10 is especially tricky.  It is not an
 	 * arg register, but it holds the arg register %rcx.  Profiling
 	 * preserves %rcx, but may clobber %r10.  Profiling may also
 	 * clobber %r11, but %r11 (original %eflags) has been saved.
 	 */
 	movq	%rax,TF_RAX(%rsp)	/* syscall number */
 	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
 	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
 	SAVE_SEGS
 	call	handle_ibrs_entry
 	movq	PCPU(CURPCB),%r11
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r11)
 	sti
 	movq	$KUDSEL,TF_SS(%rsp)
 	movq	$KUCSEL,TF_CS(%rsp)
 	movq	$2,TF_ERR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
 	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
 	movq	%r8,TF_R8(%rsp)		/* arg 5 */
 	movq	%r9,TF_R9(%rsp)		/* arg 6 */
 	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
 	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
 	movq	%r12,TF_R12(%rsp)	/* C preserved */
 	movq	%r13,TF_R13(%rsp)	/* C preserved */
 	movq	%r14,TF_R14(%rsp)	/* C preserved */
 	movq	%r15,TF_R15(%rsp)	/* C preserved */
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	PCPU(CURTHREAD),%rdi
 	movq	%rsp,TD_FRAME(%rdi)
 	movl	TF_RFLAGS(%rsp),%esi
 	andl	$PSL_T,%esi
 	call	amd64_syscall
 1:	movq	PCPU(CURPCB),%rax
 	/* Disable interrupts before testing PCB_FULL_IRET. */
 	cli
 	testl	$PCB_FULL_IRET,PCB_FLAGS(%rax)
 	jnz	4f
 	/* Check for and handle AST's on return to userland. */
 	movq	PCPU(CURTHREAD),%rax
 	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
 	jne	3f
 	call	handle_ibrs_exit
+	callq	*mds_handler
 	/* Restore preserved registers. */
 	MEXITCOUNT
 	movq	TF_RDI(%rsp),%rdi	/* bonus; preserve arg 1 */
 	movq	TF_RSI(%rsp),%rsi	/* bonus: preserve arg 2 */
 	movq	TF_RDX(%rsp),%rdx	/* return value 2 */
 	movq	TF_RAX(%rsp),%rax	/* return value 1 */
 	movq	TF_RFLAGS(%rsp),%r11	/* original %rflags */
 	movq	TF_RIP(%rsp),%rcx	/* original %rip */
 	movq	TF_RSP(%rsp),%rsp	/* user stack pointer */
 	xorl	%r8d,%r8d		/* zero the rest of GPRs */
 	xorl	%r10d,%r10d
 	cmpq	$~0,PCPU(UCR3)
 	je	2f
 	movq	PCPU(UCR3),%r9
 	movq	%r9,%cr3
 2:	xorl	%r9d,%r9d
 	swapgs
 	sysretq
 
 3:	/* AST scheduled. */
 	sti
 	movq	%rsp,%rdi
 	call	ast
 	jmp	1b
 
 4:	/* Requested full context restore, use doreti for that. */
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Here for CYA insurance, in case a "syscall" instruction gets
  * issued from 32 bit compatibility mode. MSR_CSTAR has to point
  * to *something* if EFER_SCE is enabled.
  */
 IDTVEC(fast_syscall32)
 	sysret
 
 /*
  * DB# handler is very similar to NM#, because 'mov/pop %ss' delay
  * generation of exception until the next instruction is executed,
  * which might be a kernel entry.  So we must execute the handler
  * on IST stack and be ready for non-kernel GSBASE.
  */
 IDTVEC(dbg)
 	subq	$TF_RIP,%rsp
 	movl	$(T_TRCTRAP),TF_TRAPNO(%rsp)
 	movq	$0,TF_ADDR(%rsp)
 	movq	$0,TF_ERR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
 	movq	%r11,TF_R11(%rsp)
 	movq	%r12,TF_R12(%rsp)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
 	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	pushfq
 	andq	$~(PSL_D | PSL_AC),(%rsp)
 	popfq
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jnz	dbg_fromuserspace
 	/*
 	 * We've interrupted the kernel.  Preserve GS.base in %r12,
 	 * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d.
 	 */
 	movl	$MSR_GSBASE,%ecx
 	rdmsr
 	movq	%rax,%r12
 	shlq	$32,%rdx
 	orq	%rdx,%r12
 	/* Retrieve and load the canonical value for GS.base. */
 	movq	TF_SIZE(%rsp),%rdx
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
 	movq	%cr3,%r13
 	movq	PCPU(KCR3),%rax
 	cmpq	$~0,%rax
 	je	1f
 	movq	%rax,%cr3
 1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
 	je	2f
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	movl	%eax,%r14d
 	call	handle_ibrs_entry
 2:	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp,%rdi
 	call	trap
 	MEXITCOUNT
 	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
 	je	3f
 	movl	%r14d,%eax
 	xorl	%edx,%edx
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	wrmsr
 	/*
 	 * Put back the preserved MSR_GSBASE value.
 	 */
 3:	movl	$MSR_GSBASE,%ecx
 	movq	%r12,%rdx
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
 	movq	%r13,%cr3
 	RESTORE_REGS
 	addq	$TF_RIP,%rsp
 	jmp	doreti_iret
 dbg_fromuserspace:
 	/*
 	 * Switch to kernel GSBASE and kernel page table, and copy frame
 	 * from the IST stack to the normal kernel stack, since trap()
 	 * re-enables interrupts, and since we might trap on DB# while
 	 * in trap().
 	 */
 	swapgs
 	movq	PCPU(KCR3),%rax
 	cmpq	$~0,%rax
 	je	1f
 	movq	%rax,%cr3
 1:	movq	PCPU(RSP0),%rax
 	movl	$TF_SIZE,%ecx
 	subq	%rcx,%rax
 	movq	%rax,%rdi
 	movq	%rsp,%rsi
 	rep;movsb
 	movq	%rax,%rsp
 	call	handle_ibrs_entry
 	movq	PCPU(CURPCB),%rdi
 	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
 	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
 	jz	3f
 	cmpw	$KUF32SEL,TF_FS(%rsp)
 	jne	2f
 	rdfsbase %rax
 	movq	%rax,PCB_FSBASE(%rdi)
 2:	cmpw	$KUG32SEL,TF_GS(%rsp)
 	jne	3f
 	movl	$MSR_KGSBASE,%ecx
 	rdmsr
 	shlq	$32,%rdx
 	orq	%rdx,%rax
 	movq	%rax,PCB_GSBASE(%rdi)
 3:	jmp	calltrap
 
 /*
  * NMI handling is special.
  *
  * First, NMIs do not respect the state of the processor's RFLAGS.IF
  * bit.  The NMI handler may be entered at any time, including when
  * the processor is in a critical section with RFLAGS.IF == 0.
  * The processor's GS.base value could be invalid on entry to the
  * handler.
  *
  * Second, the processor treats NMIs specially, blocking further NMIs
  * until an 'iretq' instruction is executed.  We thus need to execute
  * the NMI handler with interrupts disabled, to prevent a nested interrupt
  * from executing an 'iretq' instruction and inadvertently taking the
  * processor out of NMI mode.
  *
  * Third, the NMI handler runs on its own stack (tss_ist2). The canonical
  * GS.base value for the processor is stored just above the bottom of its
  * NMI stack.  For NMIs taken from kernel mode, the current value in
  * the processor's GS.base is saved at entry to C-preserved register %r12,
  * the canonical value for GS.base is then loaded into the processor, and
  * the saved value is restored at exit time.  For NMIs taken from user mode,
  * the cheaper 'SWAPGS' instructions are used for swapping GS.base.
  */
 
 IDTVEC(nmi)
 	subq	$TF_RIP,%rsp
 	movl	$(T_NMI),TF_TRAPNO(%rsp)
 	movq	$0,TF_ADDR(%rsp)
 	movq	$0,TF_ERR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
 	movq	%r11,TF_R11(%rsp)
 	movq	%r12,TF_R12(%rsp)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
 	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	pushfq
 	andq	$~(PSL_D | PSL_AC),(%rsp)
 	popfq
 	xorl	%ebx,%ebx
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jnz	nmi_fromuserspace
 	/*
 	 * We've interrupted the kernel.  Preserve GS.base in %r12,
 	 * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d.
 	 */
 	movl	$MSR_GSBASE,%ecx
 	rdmsr
 	movq	%rax,%r12
 	shlq	$32,%rdx
 	orq	%rdx,%r12
 	/* Retrieve and load the canonical value for GS.base. */
 	movq	TF_SIZE(%rsp),%rdx
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
 	movq	%cr3,%r13
 	movq	PCPU(KCR3),%rax
 	cmpq	$~0,%rax
 	je	1f
 	movq	%rax,%cr3
 1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
 	je	nmi_calltrap
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	movl	%eax,%r14d
 	call	handle_ibrs_entry
 	jmp	nmi_calltrap
 nmi_fromuserspace:
 	incl	%ebx
 	swapgs
 	movq	%cr3,%r13
 	movq	PCPU(KCR3),%rax
 	cmpq	$~0,%rax
 	je	1f
 	movq	%rax,%cr3
 1:	call	handle_ibrs_entry
 	movq	PCPU(CURPCB),%rdi
 	testq	%rdi,%rdi
 	jz	3f
 	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
 	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
 	jz	3f
 	cmpw	$KUF32SEL,TF_FS(%rsp)
 	jne	2f
 	rdfsbase %rax
 	movq	%rax,PCB_FSBASE(%rdi)
 2:	cmpw	$KUG32SEL,TF_GS(%rsp)
 	jne	3f
 	movl	$MSR_KGSBASE,%ecx
 	rdmsr
 	shlq	$32,%rdx
 	orq	%rdx,%rax
 	movq	%rax,PCB_GSBASE(%rdi)
 3:
 /* Note: this label is also used by ddb and gdb: */
 nmi_calltrap:
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp,%rdi
 	call	trap
 	MEXITCOUNT
 #ifdef HWPMC_HOOKS
 	/*
 	 * Capture a userspace callchain if needed.
 	 *
 	 * - Check if the current trap was from user mode.
 	 * - Check if the current thread is valid.
 	 * - Check if the thread requires a user call chain to be
 	 *   captured.
 	 *
 	 * We are still in NMI mode at this point.
 	 */
 	testl	%ebx,%ebx
 	jz	nocallchain	/* not from userspace */
 	movq	PCPU(CURTHREAD),%rax
 	orq	%rax,%rax	/* curthread present? */
 	jz	nocallchain
 	/*
 	 * Move execution to the regular kernel stack, because we
 	 * committed to return through doreti.
 	 */
 	movq	%rsp,%rsi	/* source stack pointer */
 	movq	$TF_SIZE,%rcx
 	movq	PCPU(RSP0),%rdx
 	subq	%rcx,%rdx
 	movq	%rdx,%rdi	/* destination stack pointer */
 	shrq	$3,%rcx		/* trap frame size in long words */
 	pushfq
 	andq	$~(PSL_D | PSL_AC),(%rsp)
 	popfq
 	rep
 	movsq			/* copy trapframe */
 	movq	%rdx,%rsp	/* we are on the regular kstack */
 
 	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
 	jz	nocallchain
 	/*
 	 * A user callchain is to be captured, so:
 	 * - Take the processor out of "NMI" mode by faking an "iret",
 	 *   to allow for nested NMI interrupts.
 	 * - Enable interrupts, so that copyin() can work.
 	 */
 	movl	%ss,%eax
 	pushq	%rax		/* tf_ss */
 	pushq	%rdx		/* tf_rsp (on kernel stack) */
 	pushfq			/* tf_rflags */
 	movl	%cs,%eax
 	pushq	%rax		/* tf_cs */
 	pushq	$outofnmi	/* tf_rip */
 	iretq
 outofnmi:
 	/*
 	 * At this point the processor has exited NMI mode and is running
 	 * with interrupts turned off on the normal kernel stack.
 	 *
 	 * If a pending NMI gets recognized at or after this point, it
 	 * will cause a kernel callchain to be traced.
 	 *
 	 * We turn interrupts back on, and call the user callchain capture hook.
 	 */
 	movq	pmc_hook,%rax
 	orq	%rax,%rax
 	jz	nocallchain
 	movq	PCPU(CURTHREAD),%rdi		/* thread */
 	movq	$PMC_FN_USER_CALLCHAIN,%rsi	/* command */
 	movq	%rsp,%rdx			/* frame */
 	sti
 	call	*%rax
 	cli
 nocallchain:
 #endif
 	testl	%ebx,%ebx	/* %ebx == 0 => return to userland */
 	jnz	doreti_exit
 	/*
 	 * Restore speculation control MSR, if preserved.
 	 */
 	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
 	je	1f
 	movl	%r14d,%eax
 	xorl	%edx,%edx
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	wrmsr
 	/*
 	 * Put back the preserved MSR_GSBASE value.
 	 */
 1:	movl	$MSR_GSBASE,%ecx
 	movq	%r12,%rdx
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
 	cmpb	$0, nmi_flush_l1d_sw(%rip)
 	je	2f
 	call	flush_l1d_sw		/* bhyve L1TF assist */
 2:	movq	%r13,%cr3
 	RESTORE_REGS
 	addq	$TF_RIP,%rsp
 	jmp	doreti_iret
 
 /*
  * MC# handling is similar to NMI.
  *
  * As with NMIs, machine check exceptions do not respect RFLAGS.IF and
  * can occur at any time with a GS.base value that does not correspond
  * to the privilege level in CS.
  *
  * Machine checks are not unblocked by iretq, but it is best to run
  * the handler with interrupts disabled since the exception may have
  * interrupted a critical section.
  *
  * The MC# handler runs on its own stack (tss_ist3).  The canonical
  * GS.base value for the processor is stored just above the bottom of
  * its MC# stack.  For exceptions taken from kernel mode, the current
  * value in the processor's GS.base is saved at entry to C-preserved
  * register %r12, the canonical value for GS.base is then loaded into
  * the processor, and the saved value is restored at exit time.  For
  * exceptions taken from user mode, the cheaper 'SWAPGS' instructions
  * are used for swapping GS.base.
  */
 
 IDTVEC(mchk)
 	subq	$TF_RIP,%rsp
 	movl	$(T_MCHK),TF_TRAPNO(%rsp)
 	movq	$0,TF_ADDR(%rsp)
 	movq	$0,TF_ERR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
 	movq	%r11,TF_R11(%rsp)
 	movq	%r12,TF_R12(%rsp)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
 	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	pushfq
 	andq	$~(PSL_D | PSL_AC),(%rsp)
 	popfq
 	xorl	%ebx,%ebx
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jnz	mchk_fromuserspace
 	/*
 	 * We've interrupted the kernel.  Preserve GS.base in %r12,
 	 * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d.
 	 */
 	movl	$MSR_GSBASE,%ecx
 	rdmsr
 	movq	%rax,%r12
 	shlq	$32,%rdx
 	orq	%rdx,%r12
 	/* Retrieve and load the canonical value for GS.base. */
 	movq	TF_SIZE(%rsp),%rdx
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
 	movq	%cr3,%r13
 	movq	PCPU(KCR3),%rax
 	cmpq	$~0,%rax
 	je	1f
 	movq	%rax,%cr3
 1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
 	je	mchk_calltrap
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	movl	%eax,%r14d
 	call	handle_ibrs_entry
 	jmp	mchk_calltrap
 mchk_fromuserspace:
 	incl	%ebx
 	swapgs
 	movq	%cr3,%r13
 	movq	PCPU(KCR3),%rax
 	cmpq	$~0,%rax
 	je	1f
 	movq	%rax,%cr3
 1:	call	handle_ibrs_entry
 /* Note: this label is also used by ddb and gdb: */
 mchk_calltrap:
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp,%rdi
 	call	mca_intr
 	MEXITCOUNT
 	testl	%ebx,%ebx	/* %ebx == 0 => return to userland */
 	jnz	doreti_exit
 	/*
 	 * Restore speculation control MSR, if preserved.
 	 */
 	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
 	je	1f
 	movl	%r14d,%eax
 	xorl	%edx,%edx
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	wrmsr
 	/*
 	 * Put back the preserved MSR_GSBASE value.
 	 */
 1:	movl	$MSR_GSBASE,%ecx
 	movq	%r12,%rdx
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
 	movq	%r13,%cr3
 	RESTORE_REGS
 	addq	$TF_RIP,%rsp
 	jmp	doreti_iret
 
 ENTRY(fork_trampoline)
 	movq	%r12,%rdi		/* function */
 	movq	%rbx,%rsi		/* arg1 */
 	movq	%rsp,%rdx		/* trapframe pointer */
 	call	fork_exit
 	MEXITCOUNT
 	jmp	doreti			/* Handle any ASTs */
 
 /*
  * To efficiently implement classification of trap and interrupt handlers
  * for profiling, there must be only trap handlers between the labels btrap
  * and bintr, and only interrupt handlers between the labels bintr and
  * eintr.  This is implemented (partly) by including files that contain
  * some of the handlers.  Before including the files, set up a normal asm
  * environment so that the included files doen't need to know that they are
  * included.
  */
 
 #ifdef COMPAT_FREEBSD32
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 
 #include <amd64/ia32/ia32_exception.S>
 #endif
 
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 MCOUNT_LABEL(bintr)
 
 #include <amd64/amd64/apic_vector.S>
 
 #ifdef DEV_ATPIC
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 
 #include <amd64/amd64/atpic_vector.S>
 #endif
 
 	.text
 MCOUNT_LABEL(eintr)
 
 /*
  * void doreti(struct trapframe)
  *
  * Handle return from interrupts, traps and syscalls.
  */
 	.text
 	SUPERALIGN_TEXT
 	.type	doreti,@function
 	.globl	doreti
 doreti:
 	FAKE_MCOUNT($bintr)		/* init "from" bintr -> doreti */
 	/*
 	 * Check if ASTs can be handled now.
 	 */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* are we returning to user mode? */
 	jz	doreti_exit		/* can't handle ASTs now if not */
 
 doreti_ast:
 	/*
 	 * Check for ASTs atomically with returning.  Disabling CPU
 	 * interrupts provides sufficient locking even in the SMP case,
 	 * since we will be informed of any new ASTs by an IPI.
 	 */
 	cli
 	movq	PCPU(CURTHREAD),%rax
 	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
 	je	doreti_exit
 	sti
 	movq	%rsp,%rdi	/* pass a pointer to the trapframe */
 	call	ast
 	jmp	doreti_ast
 
 	/*
 	 * doreti_exit:	pop registers, iret.
 	 *
 	 *	The segment register pop is a special case, since it may
 	 *	fault if (for example) a sigreturn specifies bad segment
 	 *	registers.  The fault is handled in trap.c.
 	 */
 doreti_exit:
 	MEXITCOUNT
 	movq	PCPU(CURPCB),%r8
 
 	/*
 	 * Do not reload segment registers for kernel.
 	 * Since we do not reload segments registers with sane
 	 * values on kernel entry, descriptors referenced by
 	 * segments registers might be not valid.  This is fatal
 	 * for user mode, but is not a problem for the kernel.
 	 */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	ld_regs
 	testl	$PCB_FULL_IRET,PCB_FLAGS(%r8)
 	jz	ld_regs
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r8)
 	testl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	je	set_segs
 
 do_segs:
 	/* Restore %fs and fsbase */
 	movw	TF_FS(%rsp),%ax
 	.globl	ld_fs
 ld_fs:
 	movw	%ax,%fs
 	cmpw	$KUF32SEL,%ax
 	jne	1f
 	movl	$MSR_FSBASE,%ecx
 	movl	PCB_FSBASE(%r8),%eax
 	movl	PCB_FSBASE+4(%r8),%edx
 	.globl	ld_fsbase
 ld_fsbase:
 	wrmsr
 1:
 	/* Restore %gs and gsbase */
 	movw	TF_GS(%rsp),%si
 	pushfq
 	cli
 	movl	$MSR_GSBASE,%ecx
 	/* Save current kernel %gs base into %r12d:%r13d */
 	rdmsr
 	movl	%eax,%r12d
 	movl	%edx,%r13d
 	.globl	ld_gs
 ld_gs:
 	movw	%si,%gs
 	/* Save user %gs base into %r14d:%r15d */
 	rdmsr
 	movl	%eax,%r14d
 	movl	%edx,%r15d
 	/* Restore kernel %gs base */
 	movl	%r12d,%eax
 	movl	%r13d,%edx
 	wrmsr
 	popfq
 	/*
 	 * Restore user %gs base, either from PCB if used for TLS, or
 	 * from the previously saved msr read.
 	 */
 	movl	$MSR_KGSBASE,%ecx
 	cmpw	$KUG32SEL,%si
 	jne	1f
 	movl	PCB_GSBASE(%r8),%eax
 	movl	PCB_GSBASE+4(%r8),%edx
 	jmp	ld_gsbase
 1:
 	movl	%r14d,%eax
 	movl	%r15d,%edx
 	.globl	ld_gsbase
 ld_gsbase:
 	wrmsr	/* May trap if non-canonical, but only for TLS. */
 	.globl	ld_es
 ld_es:
 	movw	TF_ES(%rsp),%es
 	.globl	ld_ds
 ld_ds:
 	movw	TF_DS(%rsp),%ds
 ld_regs:
 	RESTORE_REGS
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	2f			/* keep running with kernel GS.base */
 	cli
 	call	handle_ibrs_exit_rs
+	callq	*mds_handler
 	cmpq	$~0,PCPU(UCR3)
 	je	1f
 	pushq	%rdx
 	movq	PCPU(PTI_RSP0),%rdx
 	subq	$PTI_SIZE,%rdx
 	movq	%rax,PTI_RAX(%rdx)
 	popq	%rax
 	movq	%rax,PTI_RDX(%rdx)
 	movq	TF_RIP(%rsp),%rax
 	movq	%rax,PTI_RIP(%rdx)
 	movq	TF_CS(%rsp),%rax
 	movq	%rax,PTI_CS(%rdx)
 	movq	TF_RFLAGS(%rsp),%rax
 	movq	%rax,PTI_RFLAGS(%rdx)
 	movq	TF_RSP(%rsp),%rax
 	movq	%rax,PTI_RSP(%rdx)
 	movq	TF_SS(%rsp),%rax
 	movq	%rax,PTI_SS(%rdx)
 	movq	PCPU(UCR3),%rax
 	swapgs
 	movq	%rdx,%rsp
 	movq	%rax,%cr3
 	popq	%rdx
 	popq	%rax
 	addq	$8,%rsp
 	jmp	doreti_iret
 1:	swapgs
 2:	addq	$TF_RIP,%rsp
 	.globl	doreti_iret
 doreti_iret:
 	iretq
 
 set_segs:
 	movw	$KUDSEL,%ax
 	movw	%ax,TF_DS(%rsp)
 	movw	%ax,TF_ES(%rsp)
 	movw	$KUF32SEL,TF_FS(%rsp)
 	movw	$KUG32SEL,TF_GS(%rsp)
 	jmp	do_segs
 
 	/*
 	 * doreti_iret_fault.  Alternative return code for
 	 * the case where we get a fault in the doreti_exit code
 	 * above.  trap() (amd64/amd64/trap.c) catches this specific
 	 * case, sends the process a signal and continues in the
 	 * corresponding place in the code below.
 	 */
 	ALIGN_TEXT
 	.globl	doreti_iret_fault
 doreti_iret_fault:
 	subq	$TF_RIP,%rsp		/* space including tf_err, tf_trapno */
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)
 	call	handle_ibrs_entry
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	1f
 	sti
 1:
 	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
 	movq	%r11,TF_R11(%rsp)
 	movq	%r12,TF_R12(%rsp)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	movq	$0,TF_ERR(%rsp)	/* XXX should be the error code */
 	movq	$0,TF_ADDR(%rsp)
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	jmp	calltrap
 
 	ALIGN_TEXT
 	.globl	ds_load_fault
 ds_load_fault:
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	1f
 	sti
 1:
 	movq	%rsp,%rdi
 	call	trap
 	movw	$KUDSEL,TF_DS(%rsp)
 	jmp	doreti
 
 	ALIGN_TEXT
 	.globl	es_load_fault
 es_load_fault:
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	1f
 	sti
 1:
 	movq	%rsp,%rdi
 	call	trap
 	movw	$KUDSEL,TF_ES(%rsp)
 	jmp	doreti
 
 	ALIGN_TEXT
 	.globl	fs_load_fault
 fs_load_fault:
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	1f
 	sti
 1:
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	movq	%rsp,%rdi
 	call	trap
 	movw	$KUF32SEL,TF_FS(%rsp)
 	jmp	doreti
 
 	ALIGN_TEXT
 	.globl	gs_load_fault
 gs_load_fault:
 	popfq
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	1f
 	sti
 1:
 	movq	%rsp,%rdi
 	call	trap
 	movw	$KUG32SEL,TF_GS(%rsp)
 	jmp	doreti
 
 	ALIGN_TEXT
 	.globl	fsbase_load_fault
 fsbase_load_fault:
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	1f
 	sti
 1:
 	movq	%rsp,%rdi
 	call	trap
 	movq	PCPU(CURTHREAD),%r8
 	movq	TD_PCB(%r8),%r8
 	movq	$0,PCB_FSBASE(%r8)
 	jmp	doreti
 
 	ALIGN_TEXT
 	.globl	gsbase_load_fault
 gsbase_load_fault:
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	1f
 	sti
 1:
 	movq	%rsp,%rdi
 	call	trap
 	movq	PCPU(CURTHREAD),%r8
 	movq	TD_PCB(%r8),%r8
 	movq	$0,PCB_GSBASE(%r8)
 	jmp	doreti
 
 #ifdef HWPMC_HOOKS
 	ENTRY(end_exceptions)
 #endif
Index: projects/runtime-coverage-v2/sys/amd64/amd64/genassym.c
===================================================================
--- projects/runtime-coverage-v2/sys/amd64/amd64/genassym.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/amd64/amd64/genassym.c	(revision 347599)
@@ -1,271 +1,274 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)genassym.c	5.11 (Berkeley) 5/10/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_kstack_pages.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/assym.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 #include <sys/errno.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/socket.h>
 #include <sys/resourcevar.h>
 #include <sys/ucontext.h>
 #include <machine/tss.h>
 #include <sys/vmmeter.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <sys/proc.h>
 #include <x86/apicreg.h>
 #include <machine/cpu.h>
 #include <machine/pcb.h>
 #include <machine/sigframe.h>
 #include <machine/proc.h>
 #include <machine/segments.h>
 #include <machine/efi.h>
 
 ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
 ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
 ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
 
 ASSYM(P_MD, offsetof(struct proc, p_md));
 ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt));
 ASSYM(MD_LDT_SD, offsetof(struct mdproc, md_ldt_sd));
 
 ASSYM(MD_EFIRT_TMP, offsetof(struct mdthread, md_efirt_tmp));
 
 ASSYM(TD_LOCK, offsetof(struct thread, td_lock));
 ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
 ASSYM(TD_PCB, offsetof(struct thread, td_pcb));
 ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags));
 ASSYM(TD_PROC, offsetof(struct thread, td_proc));
 ASSYM(TD_FRAME, offsetof(struct thread, td_frame));
 ASSYM(TD_MD, offsetof(struct thread, td_md));
 
 ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
 ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
 
 ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN);
 ASSYM(TDP_KTHREAD, TDP_KTHREAD);
 
 ASSYM(PAGE_SIZE, PAGE_SIZE);
 ASSYM(NPTEPG, NPTEPG);
 ASSYM(NPDEPG, NPDEPG);
 ASSYM(addr_PTmap, addr_PTmap);
 ASSYM(addr_PDmap, addr_PDmap);
 ASSYM(addr_PDPmap, addr_PDPmap);
 ASSYM(addr_PML4map, addr_PML4map);
 ASSYM(addr_PML4pml4e, addr_PML4pml4e);
 ASSYM(PDESIZE, sizeof(pd_entry_t));
 ASSYM(PTESIZE, sizeof(pt_entry_t));
 ASSYM(PAGE_SHIFT, PAGE_SHIFT);
 ASSYM(PAGE_MASK, PAGE_MASK);
 ASSYM(PDRSHIFT, PDRSHIFT);
 ASSYM(PDPSHIFT, PDPSHIFT);
 ASSYM(PML4SHIFT, PML4SHIFT);
 ASSYM(val_KPDPI, KPDPI);
 ASSYM(val_KPML4I, KPML4I);
 ASSYM(val_PML4PML4I, PML4PML4I);
 ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS);
 ASSYM(KERNBASE, KERNBASE);
 ASSYM(DMAP_MIN_ADDRESS, DMAP_MIN_ADDRESS);
 ASSYM(DMAP_MAX_ADDRESS, DMAP_MAX_ADDRESS);
 
 ASSYM(PCB_R15, offsetof(struct pcb, pcb_r15));
 ASSYM(PCB_R14, offsetof(struct pcb, pcb_r14));
 ASSYM(PCB_R13, offsetof(struct pcb, pcb_r13));
 ASSYM(PCB_R12, offsetof(struct pcb, pcb_r12));
 ASSYM(PCB_RBP, offsetof(struct pcb, pcb_rbp));
 ASSYM(PCB_RSP, offsetof(struct pcb, pcb_rsp));
 ASSYM(PCB_RBX, offsetof(struct pcb, pcb_rbx));
 ASSYM(PCB_RIP, offsetof(struct pcb, pcb_rip));
 ASSYM(PCB_FSBASE, offsetof(struct pcb, pcb_fsbase));
 ASSYM(PCB_GSBASE, offsetof(struct pcb, pcb_gsbase));
 ASSYM(PCB_KGSBASE, offsetof(struct pcb, pcb_kgsbase));
 ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0));
 ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2));
 ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3));
 ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4));
 ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0));
 ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1));
 ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2));
 ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3));
 ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6));
 ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7));
 ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt));
 ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt));
 ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt));
 ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr));
 ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
 ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
 ASSYM(PCB_SAVED_UCR3, offsetof(struct pcb, pcb_saved_ucr3));
 ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp));
 ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
 ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer));
 ASSYM(PCB_STAR, offsetof(struct pcb, pcb_star));
 ASSYM(PCB_LSTAR, offsetof(struct pcb, pcb_lstar));
 ASSYM(PCB_CSTAR, offsetof(struct pcb, pcb_cstar));
 ASSYM(PCB_SFMASK, offsetof(struct pcb, pcb_sfmask));
 ASSYM(PCB_SIZE, sizeof(struct pcb));
 ASSYM(PCB_FULL_IRET, PCB_FULL_IRET);
 ASSYM(PCB_DBREGS, PCB_DBREGS);
 ASSYM(PCB_32BIT, PCB_32BIT);
 
 ASSYM(TSS_RSP0, offsetof(struct amd64tss, tss_rsp0));
 
 ASSYM(TF_R15, offsetof(struct trapframe, tf_r15));
 ASSYM(TF_R14, offsetof(struct trapframe, tf_r14));
 ASSYM(TF_R13, offsetof(struct trapframe, tf_r13));
 ASSYM(TF_R12, offsetof(struct trapframe, tf_r12));
 ASSYM(TF_R11, offsetof(struct trapframe, tf_r11));
 ASSYM(TF_R10, offsetof(struct trapframe, tf_r10));
 ASSYM(TF_R9, offsetof(struct trapframe, tf_r9));
 ASSYM(TF_R8, offsetof(struct trapframe, tf_r8));
 ASSYM(TF_RDI, offsetof(struct trapframe, tf_rdi));
 ASSYM(TF_RSI, offsetof(struct trapframe, tf_rsi));
 ASSYM(TF_RBP, offsetof(struct trapframe, tf_rbp));
 ASSYM(TF_RBX, offsetof(struct trapframe, tf_rbx));
 ASSYM(TF_RDX, offsetof(struct trapframe, tf_rdx));
 ASSYM(TF_RCX, offsetof(struct trapframe, tf_rcx));
 ASSYM(TF_RAX, offsetof(struct trapframe, tf_rax));
 ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno));
 ASSYM(TF_ADDR, offsetof(struct trapframe, tf_addr));
 ASSYM(TF_ERR, offsetof(struct trapframe, tf_err));
 ASSYM(TF_RIP, offsetof(struct trapframe, tf_rip));
 ASSYM(TF_CS, offsetof(struct trapframe, tf_cs));
 ASSYM(TF_RFLAGS, offsetof(struct trapframe, tf_rflags));
 ASSYM(TF_RSP, offsetof(struct trapframe, tf_rsp));
 ASSYM(TF_SS, offsetof(struct trapframe, tf_ss));
 ASSYM(TF_DS, offsetof(struct trapframe, tf_ds));
 ASSYM(TF_ES, offsetof(struct trapframe, tf_es));
 ASSYM(TF_FS, offsetof(struct trapframe, tf_fs));
 ASSYM(TF_GS, offsetof(struct trapframe, tf_gs));
 ASSYM(TF_FLAGS, offsetof(struct trapframe, tf_flags));
 ASSYM(TF_SIZE, sizeof(struct trapframe));
 ASSYM(TF_HASSEGS, TF_HASSEGS);
 
 ASSYM(PTI_RDX, offsetof(struct pti_frame, pti_rdx));
 ASSYM(PTI_RAX, offsetof(struct pti_frame, pti_rax));
 ASSYM(PTI_ERR, offsetof(struct pti_frame, pti_err));
 ASSYM(PTI_RIP, offsetof(struct pti_frame, pti_rip));
 ASSYM(PTI_CS, offsetof(struct pti_frame, pti_cs));
 ASSYM(PTI_RFLAGS, offsetof(struct pti_frame, pti_rflags));
 ASSYM(PTI_RSP, offsetof(struct pti_frame, pti_rsp));
 ASSYM(PTI_SS, offsetof(struct pti_frame, pti_ss));
 ASSYM(PTI_SIZE, sizeof(struct pti_frame));
 
 ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler));
 ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc));
 ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags));
 ASSYM(ENOENT, ENOENT);
 ASSYM(EFAULT, EFAULT);
 ASSYM(ENAMETOOLONG, ENAMETOOLONG);
 ASSYM(MAXCOMLEN, MAXCOMLEN);
 ASSYM(MAXPATHLEN, MAXPATHLEN);
 ASSYM(PC_SIZEOF, sizeof(struct pcpu));
 ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace));
 ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
 ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread));
 ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread));
 ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb));
 ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
 ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp));
 ASSYM(PC_SCRATCH_RAX, offsetof(struct pcpu, pc_scratch_rax));
 ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
 ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp));
 ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0));
 ASSYM(PC_FS32P, offsetof(struct pcpu, pc_fs32p));
 ASSYM(PC_GS32P, offsetof(struct pcpu, pc_gs32p));
 ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt));
 ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
 ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
 ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
 ASSYM(PC_KCR3, offsetof(struct pcpu, pc_kcr3));
 ASSYM(PC_UCR3, offsetof(struct pcpu, pc_ucr3));
 ASSYM(PC_SAVED_UCR3, offsetof(struct pcpu, pc_saved_ucr3));
 ASSYM(PC_PTI_STACK, offsetof(struct pcpu, pc_pti_stack));
 ASSYM(PC_PTI_STACK_SZ, PC_PTI_STACK_SZ);
 ASSYM(PC_PTI_RSP0, offsetof(struct pcpu, pc_pti_rsp0));
 ASSYM(PC_IBPB_SET, offsetof(struct pcpu, pc_ibpb_set));
+ASSYM(PC_MDS_TMP, offsetof(struct pcpu, pc_mds_tmp));
+ASSYM(PC_MDS_BUF, offsetof(struct pcpu, pc_mds_buf));
+ASSYM(PC_MDS_BUF64, offsetof(struct pcpu, pc_mds_buf64));
  
 ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL);
 ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL);
 
 ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL));
 ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL));
 ASSYM(KUCSEL, GSEL(GUCODE_SEL, SEL_UPL));
 ASSYM(KUDSEL, GSEL(GUDATA_SEL, SEL_UPL));
 ASSYM(KUC32SEL, GSEL(GUCODE32_SEL, SEL_UPL));
 ASSYM(KUF32SEL, GSEL(GUFS32_SEL, SEL_UPL));
 ASSYM(KUG32SEL, GSEL(GUGS32_SEL, SEL_UPL));
 ASSYM(TSSSEL, GSEL(GPROC0_SEL, SEL_KPL));
 ASSYM(LDTSEL, GSEL(GUSERLDT_SEL, SEL_KPL));
 ASSYM(SEL_RPL_MASK, SEL_RPL_MASK);
 
 ASSYM(__FreeBSD_version, __FreeBSD_version);
 
 #ifdef	HWPMC_HOOKS
 ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN);
 #endif
 
 ASSYM(EC_EFI_STATUS, offsetof(struct efirt_callinfo, ec_efi_status));
 ASSYM(EC_FPTR, offsetof(struct efirt_callinfo, ec_fptr));
 ASSYM(EC_ARGCNT, offsetof(struct efirt_callinfo, ec_argcnt));
 ASSYM(EC_ARG1, offsetof(struct efirt_callinfo, ec_arg1));
 ASSYM(EC_ARG2, offsetof(struct efirt_callinfo, ec_arg2));
 ASSYM(EC_ARG3, offsetof(struct efirt_callinfo, ec_arg3));
 ASSYM(EC_ARG4, offsetof(struct efirt_callinfo, ec_arg4));
 ASSYM(EC_ARG5, offsetof(struct efirt_callinfo, ec_arg5));
 ASSYM(EC_RBX, offsetof(struct efirt_callinfo, ec_rbx));
 ASSYM(EC_RSP, offsetof(struct efirt_callinfo, ec_rsp));
 ASSYM(EC_RBP, offsetof(struct efirt_callinfo, ec_rbp));
 ASSYM(EC_R12, offsetof(struct efirt_callinfo, ec_r12));
 ASSYM(EC_R13, offsetof(struct efirt_callinfo, ec_r13));
 ASSYM(EC_R14, offsetof(struct efirt_callinfo, ec_r14));
 ASSYM(EC_R15, offsetof(struct efirt_callinfo, ec_r15));
Index: projects/runtime-coverage-v2/sys/amd64/amd64/initcpu.c
===================================================================
--- projects/runtime-coverage-v2/sys/amd64/amd64/initcpu.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/amd64/amd64/initcpu.c	(revision 347599)
@@ -1,305 +1,306 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) KATO Takenori, 1997, 1998.
  * 
  * All rights reserved.  Unpublished rights reserved under the copyright
  * laws of Japan.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_cpu.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/pcpu.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 static int	hw_instruction_sse;
 SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD,
     &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU");
 static int	lower_sharedpage_init;
 int		hw_lower_amd64_sharedpage;
 SYSCTL_INT(_hw, OID_AUTO, lower_amd64_sharedpage, CTLFLAG_RDTUN,
     &hw_lower_amd64_sharedpage, 0,
    "Lower sharedpage to work around Ryzen issue with executing code near the top of user memory");
 /*
  * -1: automatic (default)
  *  0: keep enable CLFLUSH
  *  1: force disable CLFLUSH
  */
 static int	hw_clflush_disable = -1;
 
 static void
 init_amd(void)
 {
 	uint64_t msr;
 
 	/*
 	 * Work around Erratum 721 for Family 10h and 12h processors.
 	 * These processors may incorrectly update the stack pointer
 	 * after a long series of push and/or near-call instructions,
 	 * or a long series of pop and/or near-return instructions.
 	 *
 	 * http://support.amd.com/us/Processor_TechDocs/41322_10h_Rev_Gd.pdf
 	 * http://support.amd.com/us/Processor_TechDocs/44739_12h_Rev_Gd.pdf
 	 *
 	 * Hypervisors do not provide access to the errata MSR,
 	 * causing #GP exception on attempt to apply the errata.  The
 	 * MSR write shall be done on host and persist globally
 	 * anyway, so do not try to do it when under virtualization.
 	 */
 	switch (CPUID_TO_FAMILY(cpu_id)) {
 	case 0x10:
 	case 0x12:
 		if ((cpu_feature2 & CPUID2_HV) == 0)
 			wrmsr(0xc0011029, rdmsr(0xc0011029) | 1);
 		break;
 	}
 
 	/*
 	 * BIOS may fail to set InitApicIdCpuIdLo to 1 as it should per BKDG.
 	 * So, do it here or otherwise some tools could be confused by
 	 * Initial Local APIC ID reported with CPUID Function 1 in EBX.
 	 */
 	if (CPUID_TO_FAMILY(cpu_id) == 0x10) {
 		if ((cpu_feature2 & CPUID2_HV) == 0) {
 			msr = rdmsr(MSR_NB_CFG1);
 			msr |= (uint64_t)1 << 54;
 			wrmsr(MSR_NB_CFG1, msr);
 		}
 	}
 
 	/*
 	 * BIOS may configure Family 10h processors to convert WC+ cache type
 	 * to CD.  That can hurt performance of guest VMs using nested paging.
 	 * The relevant MSR bit is not documented in the BKDG,
 	 * the fix is borrowed from Linux.
 	 */
 	if (CPUID_TO_FAMILY(cpu_id) == 0x10) {
 		if ((cpu_feature2 & CPUID2_HV) == 0) {
 			msr = rdmsr(0xc001102a);
 			msr &= ~((uint64_t)1 << 24);
 			wrmsr(0xc001102a, msr);
 		}
 	}
 
 	/*
 	 * Work around Erratum 793: Specific Combination of Writes to Write
 	 * Combined Memory Types and Locked Instructions May Cause Core Hang.
 	 * See Revision Guide for AMD Family 16h Models 00h-0Fh Processors,
 	 * revision 3.04 or later, publication 51810.
 	 */
 	if (CPUID_TO_FAMILY(cpu_id) == 0x16 && CPUID_TO_MODEL(cpu_id) <= 0xf) {
 		if ((cpu_feature2 & CPUID2_HV) == 0) {
 			msr = rdmsr(0xc0011020);
 			msr |= (uint64_t)1 << 15;
 			wrmsr(0xc0011020, msr);
 		}
 	}
 
 	/* Ryzen erratas. */
 	if (CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1 &&
 	    (cpu_feature2 & CPUID2_HV) == 0) {
 		/* 1021 */
 		msr = rdmsr(0xc0011029);
 		msr |= 0x2000;
 		wrmsr(0xc0011029, msr);
 
 		/* 1033 */
 		msr = rdmsr(0xc0011020);
 		msr |= 0x10;
 		wrmsr(0xc0011020, msr);
 
 		/* 1049 */
 		msr = rdmsr(0xc0011028);
 		msr |= 0x10;
 		wrmsr(0xc0011028, msr);
 
 		/* 1095 */
 		msr = rdmsr(0xc0011020);
 		msr |= 0x200000000000000;
 		wrmsr(0xc0011020, msr);
 	}
 
 	/*
 	 * Work around a problem on Ryzen that is triggered by executing
 	 * code near the top of user memory, in our case the signal
 	 * trampoline code in the shared page on amd64.
 	 *
 	 * This function is executed once for the BSP before tunables take
 	 * effect so the value determined here can be overridden by the
 	 * tunable.  This function is then executed again for each AP and
 	 * also on resume.  Set a flag the first time so that value set by
 	 * the tunable is not overwritten.
 	 *
 	 * The stepping and/or microcode versions should be checked after
 	 * this issue is fixed by AMD so that we don't use this mode if not
 	 * needed.
 	 */
 	if (lower_sharedpage_init == 0) {
 		lower_sharedpage_init = 1;
 		if (CPUID_TO_FAMILY(cpu_id) == 0x17) {
 			hw_lower_amd64_sharedpage = 1;
 		}
 	}
 }
 
 /*
  * Initialize special VIA features
  */
 static void
 init_via(void)
 {
 	u_int regs[4], val;
 
 	/*
 	 * Check extended CPUID for PadLock features.
 	 *
 	 * http://www.via.com.tw/en/downloads/whitepapers/initiatives/padlock/programming_guide.pdf
 	 */
 	do_cpuid(0xc0000000, regs);
 	if (regs[0] >= 0xc0000001) {
 		do_cpuid(0xc0000001, regs);
 		val = regs[3];
 	} else
 		return;
 
 	/* Enable RNG if present. */
 	if ((val & VIA_CPUID_HAS_RNG) != 0) {
 		via_feature_rng = VIA_HAS_RNG;
 		wrmsr(0x110B, rdmsr(0x110B) | VIA_CPUID_DO_RNG);
 	}
 
 	/* Enable PadLock if present. */
 	if ((val & VIA_CPUID_HAS_ACE) != 0)
 		via_feature_xcrypt |= VIA_HAS_AES;
 	if ((val & VIA_CPUID_HAS_ACE2) != 0)
 		via_feature_xcrypt |= VIA_HAS_AESCTR;
 	if ((val & VIA_CPUID_HAS_PHE) != 0)
 		via_feature_xcrypt |= VIA_HAS_SHA;
 	if ((val & VIA_CPUID_HAS_PMM) != 0)
 		via_feature_xcrypt |= VIA_HAS_MM;
 	if (via_feature_xcrypt != 0)
 		wrmsr(0x1107, rdmsr(0x1107) | (1 << 28));
 }
 
 /*
  * Initialize CPU control registers
  */
 void
 initializecpu(void)
 {
 	uint64_t msr;
 	uint32_t cr4;
 
 	cr4 = rcr4();
 	if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) {
 		cr4 |= CR4_FXSR | CR4_XMM;
 		cpu_fxsr = hw_instruction_sse = 1;
 	}
 	if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE)
 		cr4 |= CR4_FSGSBASE;
 
 	if (cpu_stdext_feature2 & CPUID_STDEXT2_PKU)
 		cr4 |= CR4_PKE;
 
 	/*
 	 * Postpone enabling the SMEP on the boot CPU until the page
 	 * tables are switched from the boot loader identity mapping
 	 * to the kernel tables.  The boot loader enables the U bit in
 	 * its tables.
 	 */
 	if (!IS_BSP()) {
 		if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
 			cr4 |= CR4_SMEP;
 		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
 			cr4 |= CR4_SMAP;
 	}
 	load_cr4(cr4);
 	if (IS_BSP() && (amd_feature & AMDID_NX) != 0) {
 		msr = rdmsr(MSR_EFER) | EFER_NXE;
 		wrmsr(MSR_EFER, msr);
 		pg_nx = PG_NX;
 	}
 	hw_ibrs_recalculate();
 	hw_ssb_recalculate(false);
 	amd64_syscall_ret_flush_l1d_recalc();
+	hw_mds_recalculate();
 	switch (cpu_vendor_id) {
 	case CPU_VENDOR_AMD:
 		init_amd();
 		break;
 	case CPU_VENDOR_CENTAUR:
 		init_via();
 		break;
 	}
 
 	if ((amd_feature & AMDID_RDTSCP) != 0 ||
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0)
 		wrmsr(MSR_TSC_AUX, PCPU_GET(cpuid));
 }
 
 void
 initializecpucache(void)
 {
 
 	/*
 	 * CPUID with %eax = 1, %ebx returns
 	 * Bits 15-8: CLFLUSH line size
 	 * 	(Value * 8 = cache line size in bytes)
 	 */
 	if ((cpu_feature & CPUID_CLFSH) != 0)
 		cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8;
 	/*
 	 * XXXKIB: (temporary) hack to work around traps generated
 	 * when CLFLUSHing APIC register window under virtualization
 	 * environments.  These environments tend to disable the
 	 * CPUID_SS feature even though the native CPU supports it.
 	 */
 	TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable);
 	if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1) {
 		cpu_feature &= ~CPUID_CLFSH;
 		cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT;
 	}
 
 	/*
 	 * The kernel's use of CLFLUSH{,OPT} can be disabled manually
 	 * by setting the hw.clflush_disable tunable.
 	 */
 	if (hw_clflush_disable == 1) {
 		cpu_feature &= ~CPUID_CLFSH;
 		cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT;
 	}
 }
Index: projects/runtime-coverage-v2/sys/amd64/amd64/machdep.c
===================================================================
--- projects/runtime-coverage-v2/sys/amd64/amd64/machdep.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/amd64/amd64/machdep.c	(revision 347599)
@@ -1,2740 +1,2741 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_atpic.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_pci.h"
 #include "opt_platform.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/efi.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 #include <vm/vm_phys.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <net/netisr.h>
 
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/mp_watchdog.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/proc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #include <machine/trap.h>
 #include <machine/tss.h>
 #include <x86/ucode.h>
 #include <x86/ifunc.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #ifdef FDT
 #include <x86/fdt.h>
 #endif
 
 #ifdef DEV_ATPIC
 #include <x86/isa/icu.h>
 #else
 #include <x86/apicvar.h>
 #endif
 
 #include <isa/isareg.h>
 #include <isa/rtc.h>
 #include <x86/init.h>
 
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 /*
  * The PTI trampoline stack needs enough space for a hardware trapframe and a
  * couple of scratch registers, as well as the trapframe left behind after an
  * iret fault.
  */
 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
     offsetof(struct pti_frame, pti_rip));
 
 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
 
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 
 static void cpu_startup(void *);
 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpusave, size_t xfpusave_len);
 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpustate, size_t xfpustate_len);
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /* Preload data parse function */
 static caddr_t native_parse_preload_data(u_int64_t);
 
 /* Native function to fetch and parse the e820 map */
 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
 
 /* Default init_ops implementation. */
 struct init_ops init_ops = {
 	.parse_preload_data =	native_parse_preload_data,
 	.early_clock_source_init =	i8254_init,
 	.early_delay =			i8254_delay,
 	.parse_memmap =			native_parse_memmap,
 #ifdef SMP
 	.mp_bootaddress =		mp_bootaddress,
 	.start_all_aps =		native_start_all_aps,
 #endif
 #ifdef DEV_PCI
 	.msi_init =			msi_init,
 #endif
 };
 
 /*
  * Physical address of the EFI System Table. Stashed from the metadata hints
  * passed into the kernel and used by the EFI code to call runtime services.
  */
 vm_paddr_t efi_systbl_phys;
 
 /* Intel ICH registers */
 #define ICH_PMBASE	0x400
 #define ICH_SMI_EN	ICH_PMBASE + 0x30
 
 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
 
 int cold = 1;
 
 long Maxmem = 0;
 long realmem = 0;
 
 /*
  * The number of PHYSMAP entries must be one less than the number of
  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  * physical address that is accessible by ISA DMA is split into two
  * PHYSSEG entries.
  */
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 
 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
 
 /* must be 2 less so 0 0 can signal end of chunks */
 #define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
 #define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
 
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 struct region_descriptor r_gdt, r_idt;
 
 struct pcpu __pcpu[MAXCPU];
 
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
 struct mtx dt_lock;	/* lock for GDT and LDT */
 
 void (*vmm_resume_p)(void);
 
 static void
 cpu_startup(dummy)
 	void *dummy;
 {
 	uintmax_t memsize;
 	char *sysenv;
 
 	/*
 	 * On MacBooks, we need to disallow the legacy USB circuit to
 	 * generate an SMI# because this can cause several problems,
 	 * namely: incorrect CPU frequency detection and failure to
 	 * start the APs.
 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
 	 * Enable register) of the Intel ICH LPC Interface Bridge. 
 	 */
 	sysenv = kern_getenv("smbios.system.product");
 	if (sysenv != NULL) {
 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
 			if (bootverbose)
 				printf("Disabling LEGACY_USB_EN bit on "
 				    "Intel ICH.\n");
 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
 		}
 		freeenv(sysenv);
 	}
 
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 
 	/*
 	 * Display physical memory if SMBIOS reports reasonable amount.
 	 */
 	memsize = 0;
 	sysenv = kern_getenv("smbios.memory.enabled");
 	if (sysenv != NULL) {
 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
 		freeenv(sysenv);
 	}
 	if (memsize < ptoa((uintmax_t)vm_free_count()))
 		memsize = ptoa((uintmax_t)Maxmem);
 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
 	realmem = atop(memsize);
 
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)vm_free_count()),
 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
 #ifdef DEV_PCI
 	if (bootverbose && intel_graphics_stolen_base != 0)
 		printf("intel stolen mem: base %#jx size %ju MB\n",
 		    (uintmax_t)intel_graphics_stolen_base,
 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
 #endif
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	cpu_setregs();
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by call
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct pcb *pcb;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	char *xfpusave;
 	size_t xfpusave_len;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	pcb = td->td_pcb;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 		xfpusave = __builtin_alloca(xfpusave_len);
 	} else {
 		xfpusave_len = 0;
 		xfpusave = NULL;
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
 	fpstate_drop(td);
 	update_pcb_bases(pcb);
 	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
 	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
 	bzero(sf.sf_uc.uc_mcontext.mc_spare,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		sp = (char *)regs->tf_rsp - 128;
 	if (xfpusave != NULL) {
 		sp -= xfpusave_len;
 		sp = (char *)((unsigned long)sp & ~0x3Ful);
 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
 	}
 	sp -= sizeof(struct sigframe);
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
 
 	/* Build the argument list for the signal handler. */
 	regs->tf_rdi = sig;			/* arg 1 in %rdi */
 	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    (xfpusave != NULL && copyout(xfpusave,
 	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
 	    != 0)) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	regs->tf_rsp = (long)sfp;
 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_ss = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * MPSAFE
  */
 int
 sys_sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
 	struct pcb *pcb;
 	struct proc *p;
 	struct trapframe *regs;
 	ucontext_t *ucp;
 	char *xfpustate;
 	size_t xfpustate_len;
 	long rflags;
 	int cs, error, ret;
 	ksiginfo_t ksi;
 
 	pcb = td->td_pcb;
 	p = td->td_proc;
 
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0) {
 		uprintf("pid %d (%s): sigreturn copyin failed\n",
 		    p->p_pid, td->td_name);
 		return (error);
 	}
 	ucp = &uc;
 	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
 		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
 		    td->td_name, ucp->uc_mcontext.mc_flags);
 		return (EINVAL);
 	}
 	regs = td->td_frame;
 	rflags = ucp->uc_mcontext.mc_rflags;
 	/*
 	 * Don't allow users to change privileged or reserved flags.
 	 */
 	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
 		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
 		    td->td_name, rflags);
 		return (EINVAL);
 	}
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 	cs = ucp->uc_mcontext.mc_cs;
 	if (!CS_SECURE(cs)) {
 		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
 		    td->td_name, cs);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 
 	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
 		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
 		if (xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu)) {
 			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
 			    p->p_pid, td->td_name, xfpustate_len);
 			return (EINVAL);
 		}
 		xfpustate = __builtin_alloca(xfpustate_len);
 		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
 		    xfpustate, xfpustate_len);
 		if (error != 0) {
 			uprintf(
 	"pid %d (%s): sigreturn copying xfpustate failed\n",
 			    p->p_pid, td->td_name);
 			return (error);
 		}
 	} else {
 		xfpustate = NULL;
 		xfpustate_len = 0;
 	}
 	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
 	if (ret != 0) {
 		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
 		    p->p_pid, td->td_name, ret);
 		return (ret);
 	}
 	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
 	update_pcb_bases(pcb);
 	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
 	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
 
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
  
 	return sys_sigreturn(td, (struct sigreturn_args *)uap);
 }
 #endif
 
 /*
  * Reset registers to default values on exec.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 {
 	struct trapframe *regs;
 	struct pcb *pcb;
 	register_t saved_rflags;
 
 	regs = td->td_frame;
 	pcb = td->td_pcb;
 
 	if (td->td_proc->p_md.md_ldt != NULL)
 		user_ldt_free(td);
 
 	update_pcb_bases(pcb);
 	pcb->pcb_fsbase = 0;
 	pcb->pcb_gsbase = 0;
 	clear_pcb_flags(pcb, PCB_32BIT);
 	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
 
 	saved_rflags = regs->tf_rflags & PSL_T;
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_rip = imgp->entry_addr;
 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
 	regs->tf_rdi = stack;		/* argv */
 	regs->tf_rflags = PSL_USER | saved_rflags;
 	regs->tf_ss = _udatasel;
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 
 	/*
 	 * Reset the hardware debug registers if they were in use.
 	 * They won't have any meaning for the newly exec'd process.
 	 */
 	if (pcb->pcb_flags & PCB_DBREGS) {
 		pcb->pcb_dr0 = 0;
 		pcb->pcb_dr1 = 0;
 		pcb->pcb_dr2 = 0;
 		pcb->pcb_dr3 = 0;
 		pcb->pcb_dr6 = 0;
 		pcb->pcb_dr7 = 0;
 		if (pcb == curpcb) {
 			/*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
 			 * the next process we switch to.
 			 */
 			reset_dbregs();
 		}
 		clear_pcb_flags(pcb, PCB_DBREGS);
 	}
 
 	/*
 	 * Drop the FP state if we hold it, so that the process gets a
 	 * clean FP state if it uses the FPU again.
 	 */
 	fpstate_drop(td);
 }
 
 void
 cpu_setregs(void)
 {
 	register_t cr0;
 
 	cr0 = rcr0();
 	/*
 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
 	 * BSP.  See the comments there about why we set them.
 	 */
 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
 	load_cr0(cr0);
 }
 
 /*
  * Initialize amd64 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 
 struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
 static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
 static char mce0_stack[PAGE_SIZE] __aligned(16);
 static char nmi0_stack[PAGE_SIZE] __aligned(16);
 static char dbg0_stack[PAGE_SIZE] __aligned(16);
 CTASSERT(sizeof(struct nmi_pcpu) == 16);
 
 struct amd64tss common_tss[MAXCPU];
 
 /*
  * Software prototypes -- in more palatable form.
  *
  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
  * slots as corresponding segments for i386 kernel.
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GNULL2_SEL	1 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUFS32_SEL	2 32 bit %gs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUGS32_SEL	3 32 bit %fs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GCODE_SEL	4 Code Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GDATA_SEL	5 Data Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
 	.ssd_type = SDT_SYSTSS,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* Actually, the TSS is a system descriptor which is double size */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	11 LDT Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	12 LDT Descriptor, double size */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 };
 
 void
 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
 {
 	struct gate_descriptor *ip;
 
 	ip = idt + idx;
 	ip->gd_looffset = (uintptr_t)func;
 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
 	ip->gd_ist = ist;
 	ip->gd_xx = 0;
 	ip->gd_type = typ;
 	ip->gd_dpl = dpl;
 	ip->gd_p = 1;
 	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
 }
 
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(dblfault),
 	IDTVEC(div_pti), IDTVEC(bpt_pti),
 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
 	IDTVEC(xmm_pti),
 #ifdef KDTRACE_HOOKS
 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
 #endif
 #ifdef XENHVM
 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
 #endif
 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
 	IDTVEC(fast_syscall_pti);
 
 #ifdef DDB
 /*
  * Display the index and function name of any IDT entries that don't use
  * the default 'rsvd' entry point.
  */
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *ip;
 	int idx;
 	uintptr_t func;
 
 	ip = idt;
 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
 			db_printf("%3d\t", idx);
 			db_printsym(func, DB_STGY_PROC);
 			db_printf("\n");
 		}
 		ip++;
 	}
 }
 
 /* Show privileged registers. */
 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 {
 	struct {
 		uint16_t limit;
 		uint64_t base;
 	} __packed idtr, gdtr;
 	uint16_t ldt, tr;
 
 	__asm __volatile("sidt %0" : "=m" (idtr));
 	db_printf("idtr\t0x%016lx/%04x\n",
 	    (u_long)idtr.base, (u_int)idtr.limit);
 	__asm __volatile("sgdt %0" : "=m" (gdtr));
 	db_printf("gdtr\t0x%016lx/%04x\n",
 	    (u_long)gdtr.base, (u_int)gdtr.limit);
 	__asm __volatile("sldt %0" : "=r" (ldt));
 	db_printf("ldtr\t0x%04x\n", ldt);
 	__asm __volatile("str %0" : "=r" (tr));
 	db_printf("tr\t0x%04x\n", tr);
 	db_printf("cr0\t0x%016lx\n", rcr0());
 	db_printf("cr2\t0x%016lx\n", rcr2());
 	db_printf("cr3\t0x%016lx\n", rcr3());
 	db_printf("cr4\t0x%016lx\n", rcr4());
 	if (rcr4() & CR4_XSAVE)
 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
 		db_printf("FEATURES_CTL\t%016lx\n",
 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
 }
 
 DB_SHOW_COMMAND(dbregs, db_show_dbregs)
 {
 
 	db_printf("dr0\t0x%016lx\n", rdr0());
 	db_printf("dr1\t0x%016lx\n", rdr1());
 	db_printf("dr2\t0x%016lx\n", rdr2());
 	db_printf("dr3\t0x%016lx\n", rdr3());
 	db_printf("dr6\t0x%016lx\n", rdr6());
 	db_printf("dr7\t0x%016lx\n", rdr7());	
 }
 #endif
 
 void
 sdtossd(sd, ssd)
 	struct user_segment_descriptor *sd;
 	struct soft_segment_descriptor *ssd;
 {
 
 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 	ssd->ssd_type  = sd->sd_type;
 	ssd->ssd_dpl   = sd->sd_dpl;
 	ssd->ssd_p     = sd->sd_p;
 	ssd->ssd_long  = sd->sd_long;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran  = sd->sd_gran;
 }
 
 void
 ssdtosd(ssd, sd)
 	struct soft_segment_descriptor *ssd;
 	struct user_segment_descriptor *sd;
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_long  = ssd->ssd_long;
 	sd->sd_def32 = ssd->ssd_def32;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 void
 ssdtosyssd(ssd, sd)
 	struct soft_segment_descriptor *ssd;
 	struct system_segment_descriptor *sd;
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
 #include <isa/isavar.h>
 #include <isa/isareg.h>
 /*
  * Return a bitmap of the current interrupt requests.  This is 8259-specific
  * and is only suitable for use at probe time.
  * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
  * It shouldn't be here.  There should probably be an APIC centric
  * implementation in the apic driver code, if at all.
  */
 intrmask_t
 isa_irq_pending(void)
 {
 	u_char irr1;
 	u_char irr2;
 
 	irr1 = inb(IO_ICU1);
 	irr2 = inb(IO_ICU2);
 	return ((irr2 << 8) | irr1);
 }
 #endif
 
 u_int basemem;
 
 static int
 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
     int *physmap_idxp)
 {
 	int i, insert_idx, physmap_idx;
 
 	physmap_idx = *physmap_idxp;
 
 	if (length == 0)
 		return (1);
 
 	/*
 	 * Find insertion point while checking for overlap.  Start off by
 	 * assuming the new entry will be added to the end.
 	 *
 	 * NB: physmap_idx points to the next free slot.
 	 */
 	insert_idx = physmap_idx;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (base < physmap[i + 1]) {
 			if (base + length <= physmap[i]) {
 				insert_idx = i;
 				break;
 			}
 			if (boothowto & RB_VERBOSE)
 				printf(
 		    "Overlapping memory regions, ignoring second region\n");
 			return (1);
 		}
 	}
 
 	/* See if we can prepend to the next entry. */
 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
 		physmap[insert_idx] = base;
 		return (1);
 	}
 
 	/* See if we can append to the previous entry. */
 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
 		physmap[insert_idx - 1] += length;
 		return (1);
 	}
 
 	physmap_idx += 2;
 	*physmap_idxp = physmap_idx;
 	if (physmap_idx == PHYSMAP_SIZE) {
 		printf(
 		"Too many segments in the physical address map, giving up\n");
 		return (0);
 	}
 
 	/*
 	 * Move the last 'N' entries down to make room for the new
 	 * entry if needed.
 	 */
 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
 		physmap[i] = physmap[i - 2];
 		physmap[i + 1] = physmap[i - 1];
 	}
 
 	/* Insert the new entry. */
 	physmap[insert_idx] = base;
 	physmap[insert_idx + 1] = base + length;
 	return (1);
 }
 
 void
 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
                       vm_paddr_t *physmap, int *physmap_idx)
 {
 	struct bios_smap *smap, *smapend;
 
 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 
 	for (smap = smapbase; smap < smapend; smap++) {
 		if (boothowto & RB_VERBOSE)
 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
 			    smap->type, smap->base, smap->length);
 
 		if (smap->type != SMAP_TYPE_MEMORY)
 			continue;
 
 		if (!add_physmap_entry(smap->base, smap->length, physmap,
 		    physmap_idx))
 			break;
 	}
 }
 
 static void
 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
     int *physmap_idx)
 {
 	struct efi_md *map, *p;
 	const char *type;
 	size_t efisz;
 	int ndesc, i;
 
 	static const char *types[] = {
 		"Reserved",
 		"LoaderCode",
 		"LoaderData",
 		"BootServicesCode",
 		"BootServicesData",
 		"RuntimeServicesCode",
 		"RuntimeServicesData",
 		"ConventionalMemory",
 		"UnusableMemory",
 		"ACPIReclaimMemory",
 		"ACPIMemoryNVS",
 		"MemoryMappedIO",
 		"MemoryMappedIOPortSpace",
 		"PalCode",
 		"PersistentMemory"
 	};
 
 	/*
 	 * Memory map data provided by UEFI via the GetMemoryMap
 	 * Boot Services API.
 	 */
 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
 
 	if (efihdr->descriptor_size == 0)
 		return;
 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
 
 	if (boothowto & RB_VERBOSE)
 		printf("%23s %12s %12s %8s %4s\n",
 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
 
 	for (i = 0, p = map; i < ndesc; i++,
 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
 		if (boothowto & RB_VERBOSE) {
 			if (p->md_type < nitems(types))
 				type = types[p->md_type];
 			else
 				type = "<INVALID>";
 			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
 			    p->md_virt, p->md_pages);
 			if (p->md_attr & EFI_MD_ATTR_UC)
 				printf("UC ");
 			if (p->md_attr & EFI_MD_ATTR_WC)
 				printf("WC ");
 			if (p->md_attr & EFI_MD_ATTR_WT)
 				printf("WT ");
 			if (p->md_attr & EFI_MD_ATTR_WB)
 				printf("WB ");
 			if (p->md_attr & EFI_MD_ATTR_UCE)
 				printf("UCE ");
 			if (p->md_attr & EFI_MD_ATTR_WP)
 				printf("WP ");
 			if (p->md_attr & EFI_MD_ATTR_RP)
 				printf("RP ");
 			if (p->md_attr & EFI_MD_ATTR_XP)
 				printf("XP ");
 			if (p->md_attr & EFI_MD_ATTR_NV)
 				printf("NV ");
 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
 				printf("MORE_RELIABLE ");
 			if (p->md_attr & EFI_MD_ATTR_RO)
 				printf("RO ");
 			if (p->md_attr & EFI_MD_ATTR_RT)
 				printf("RUNTIME");
 			printf("\n");
 		}
 
 		switch (p->md_type) {
 		case EFI_MD_TYPE_CODE:
 		case EFI_MD_TYPE_DATA:
 		case EFI_MD_TYPE_BS_CODE:
 		case EFI_MD_TYPE_BS_DATA:
 		case EFI_MD_TYPE_FREE:
 			/*
 			 * We're allowed to use any entry with these types.
 			 */
 			break;
 		default:
 			continue;
 		}
 
 		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
 		    physmap, physmap_idx))
 			break;
 	}
 }
 
 static char bootmethod[16] = "";
 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
     "System firmware boot method");
 
 static void
 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
 {
 	struct bios_smap *smap;
 	struct efi_map_header *efihdr;
 	u_int32_t size;
 
 	/*
 	 * Memory map from INT 15:E820.
 	 *
 	 * subr_module.c says:
 	 * "Consumer may safely assume that size value precedes data."
 	 * ie: an int32_t immediately precedes smap.
 	 */
 
 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	smap = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (efihdr == NULL && smap == NULL)
 		panic("No BIOS smap or EFI map info from loader!");
 
 	if (efihdr != NULL) {
 		add_efi_map_entries(efihdr, physmap, physmap_idx);
 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
 	} else {
 		size = *((u_int32_t *)smap - 1);
 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
 	}
 }
 
 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  * XXX first should be vm_paddr_t.
  */
 static void
 getmemsize(caddr_t kmdp, u_int64_t first)
 {
 	int i, physmap_idx, pa_indx, da_indx;
 	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
 	u_long physmem_start, physmem_tunable, memtest;
 	pt_entry_t *pte;
 	quad_t dcons_addr, dcons_size;
 	int page_counter;
 
 	/*
 	 * Tell the physical memory allocator about pages used to store
 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
 	 */
 	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));
 
 	bzero(physmap, sizeof(physmap));
 	physmap_idx = 0;
 
 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
 	physmap_idx -= 2;
 
 	/*
 	 * Find the 'base memory' segment for SMP
 	 */
 	basemem = 0;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (physmap[i] <= 0xA0000) {
 			basemem = physmap[i + 1] / 1024;
 			break;
 		}
 	}
 	if (basemem == 0 || basemem > 640) {
 		if (bootverbose)
 			printf(
 		"Memory map doesn't contain a basemem segment, faking it");
 		basemem = 640;
 	}
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 #ifdef MAXMEM
 	Maxmem = MAXMEM / 4;
 #endif
 
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	/*
 	 * The boot memory test is disabled by default, as it takes a
 	 * significant amount of time on large-memory systems, and is
 	 * unfriendly to virtual machines as it unnecessarily touches all
 	 * pages.
 	 *
 	 * A general name is used as the code may be extended to support
 	 * additional tests beyond the current "page present" test.
 	 */
 	memtest = 0;
 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 
 	/*
 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
 	 * in the system.
 	 */
 	if (Maxmem > atop(physmap[physmap_idx + 1]))
 		Maxmem = atop(physmap[physmap_idx + 1]);
 
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/*
 	 * Make hole for "AP -> long mode" bootstrap code.  The
 	 * mp_bootaddress vector is only available when the kernel
 	 * is configured to support APs and APs for the system start
 	 * in real mode mode (e.g. SMP bare metal).
 	 */
 	if (init_ops.mp_bootaddress)
 		init_ops.mp_bootaddress(physmap, &physmap_idx);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(&first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 *
 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
 	 * By default, mask off the first 16 pages unless we appear to be
 	 * running in a VM.
 	 */
 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
 	if (physmap[0] < physmem_start) {
 		if (physmem_start < PAGE_SIZE)
 			physmap[0] = PAGE_SIZE;
 		else if (physmem_start >= physmap[1])
 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
 		else
 			physmap[0] = round_page(physmem_start);
 	}
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	pte = CMAP1;
 
 	/*
 	 * Get dcons buffer address
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 */
 	page_counter = 0;
 	if (memtest != 0)
 		printf("Testing system memory");
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR1;
 
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 */
 			if (pa >= (vm_paddr_t)kernphys && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 			if (memtest == 0)
 				goto skip_memtest;
 
 			/*
 			 * Print a "." every GB to show we're making
 			 * progress.
 			 */
 			page_counter++;
 			if ((page_counter % PAGES_PER_GB) == 0)
 				printf(".");
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
 			invltlb();
 
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 skip_memtest:
 			/*
 			 * Adjust array of valid/good pages.
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa; /* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			if (full)
 				break;
 		}
 	}
 	*pte = 0;
 	invltlb();
 	if (memtest != 0)
 		printf("\n");
 
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(msgbufsize);
 
 	/* Map the message buffer. */
 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
 }
 
 static caddr_t
 native_parse_preload_data(u_int64_t modulep)
 {
 	caddr_t kmdp;
 	char *envp;
 #ifdef DDB
 	vm_offset_t ksym_start;
 	vm_offset_t ksym_end;
 #endif
 
 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 	preload_bootstrap_relocate(KERNBASE);
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
 	if (envp != NULL)
 		envp += KERNBASE;
 	init_static_kenv(envp, 0);
 #ifdef DDB
 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
 	db_fetch_ksymtab(ksym_start, ksym_end);
 #endif
 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
 
 	return (kmdp);
 }
 
 static void
 amd64_kdb_init(void)
 {
 	kdb_init();
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 #endif
 }
 
 /* Set up the fast syscall stuff */
 void
 amd64_conf_fast_syscall(void)
 {
 	uint64_t msr;
 
 	msr = rdmsr(MSR_EFER) | EFER_SCE;
 	wrmsr(MSR_EFER, msr);
 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
 	    (u_int64_t)IDTVEC(fast_syscall));
 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 	wrmsr(MSR_STAR, msr);
 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
 }
 
 u_int64_t
 hammer_time(u_int64_t modulep, u_int64_t physfree)
 {
 	caddr_t kmdp;
 	int gsel_tss, x;
 	struct pcpu *pc;
 	struct nmi_pcpu *np;
 	struct xstate_hdr *xhdr;
 	u_int64_t rsp0;
 	char *env;
 	size_t kstack0_sz;
 	int late_console;
 
 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
 
 	kmdp = init_ops.parse_preload_data(modulep);
 
 	physfree += ucode_load_bsp(physfree + KERNBASE);
 	physfree = roundup2(physfree, PAGE_SIZE);
 
 	identify_cpu1();
 	identify_hypervisor();
 	identify_cpu_fixup_bsp();
 	identify_cpu2();
 	initializecpucache();
 
 	/*
 	 * Check for pti, pcid, and invpcid before ifuncs are
 	 * resolved, to correctly select the implementation for
 	 * pmap_activate_sw_mode().
 	 */
 	pti = pti_get_default();
 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
 		invpcid_works = (cpu_stdext_feature &
 		    CPUID_STDEXT_INVPCID) != 0;
 	} else {
 		pmap_pcid_enabled = 0;
 	}
 
 	link_elf_ireloc(kmdp);
 
 	/*
 	 * This may be done better later if it gets more high level
 	 * components in it. If so just link td->td_proc here.
 	 */
 	proc_linkup0(&proc0, &thread0);
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	thread0.td_kstack = physfree + KERNBASE;
 	thread0.td_kstack_pages = kstack_pages;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	bzero((void *)thread0.td_kstack, kstack0_sz);
 	physfree += kstack0_sz;
 
 	/*
 	 * make gdt memory segments
 	 */
 	for (x = 0; x < NGDT; x++) {
 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
 			ssdtosd(&gdt_segs[x], &gdt[x]);
 	}
 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base =  (long) gdt;
 	lgdt(&r_gdt);
 	pc = &__pcpu[0];
 
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
 
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	dpcpu_init((void *)(physfree + KERNBASE), 0);
 	physfree += DPCPU_SIZE;
 	PCPU_SET(prvspace, pc);
 	PCPU_SET(curthread, &thread0);
 	/* Non-late cninit() and printf() can be moved up to here. */
 	PCPU_SET(tssp, &common_tss[0]);
 	PCPU_SET(commontssp, &common_tss[0]);
 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
 
 	/* exceptions */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
 		    SEL_KPL, 0);
 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
 	    SEL_UPL, 0);
 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
 	    SEL_UPL, 0);
 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
 	    SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
 	    SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
 	    SEL_KPL, 0);
 #ifdef KDTRACE_HOOKS
 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
 #endif
 #ifdef XENHVM
 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
 #endif
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (long) idt;
 	lidt(&r_idt);
 
 	/*
 	 * Initialize the clock before the console so that console
 	 * initialization can use DELAY().
 	 */
 	clock_init();
 
 	/*
 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
 	 * transition).
 	 * Once bootblocks have updated, we can test directly for
 	 * efi_systbl != NULL here...
 	 */
 	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
 	    != NULL)
 		vty_set_preferred(VTY_VT);
 
 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
 	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
 	    &syscall_ret_l1d_flush_mode);
+	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
 
 	finishidentcpu();	/* Final stage of CPU initialization */
 	initializecpu();	/* Initialize CPU registers */
 
 	/* doublefault stack space, runs on ist1 */
 	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
 
 	/*
 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
 	 * above the start of the ist2 stack.
 	 */
 	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
 	np->np_pcpu = (register_t) pc;
 	common_tss[0].tss_ist2 = (long) np;
 
 	/*
 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
 	 * above the start of the ist3 stack.
 	 */
 	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
 	np->np_pcpu = (register_t) pc;
 	common_tss[0].tss_ist3 = (long) np;
 
 	/*
 	 * DB# stack, runs on ist4.
 	 */
 	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
 	np->np_pcpu = (register_t) pc;
 	common_tss[0].tss_ist4 = (long) np;
 	
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	ltr(gsel_tss);
 
 	amd64_conf_fast_syscall();
 
 	/*
 	 * Temporary forge some valid pointer to PCB, for exception
 	 * handlers.  It is reinitialized properly below after FPU is
 	 * set up.  Also set up td_critnest to short-cut the page
 	 * fault handler.
 	 */
 	cpu_max_ext_state_size = sizeof(struct savefpu);
 	thread0.td_pcb = get_pcb_td(&thread0);
 	thread0.td_critnest = 1;
 
 	/*
 	 * The console and kdb should be initialized even earlier than here,
 	 * but some console drivers don't work until after getmemsize().
 	 * Default to late console initialization to support these drivers.
 	 * This loses mainly printf()s in getmemsize() and early debugging.
 	 */
 	late_console = 1;
 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
 	if (!late_console) {
 		cninit();
 		amd64_kdb_init();
 	}
 
 	getmemsize(kmdp, physfree);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 #ifdef DEV_PCI
         /* This call might adjust phys_avail[]. */
         pci_early_quirks();
 #endif
 
 	if (late_console)
 		cninit();
 
 #ifdef DEV_ISA
 #ifdef DEV_ATPIC
 	elcr_probe();
 	atpic_startup();
 #else
 	/* Reset and mask the atpics and leave them shut down. */
 	atpic_reset();
 
 	/*
 	 * Point the ICU spurious interrupt vectors at the APIC spurious
 	 * interrupt handler.
 	 */
 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 #endif
 #else
 #error "have you forgotten the isa device?";
 #endif
 
 	if (late_console)
 		amd64_kdb_init();
 
 	msgbufinit(msgbufp, msgbufsize);
 	fpuinit();
 
 	/*
 	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
 	 * area size.  Zero out the extended state header in fpu save
 	 * area.
 	 */
 	thread0.td_pcb = get_pcb_td(&thread0);
 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
 	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
 	if (use_xsave) {
 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
 		    1);
 		xhdr->xstate_bv = xsave_mask;
 	}
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	rsp0 = (vm_offset_t)thread0.td_pcb;
 	/* Ensure the stack is aligned to 16 bytes */
 	rsp0 &= ~0xFul;
 	common_tss[0].tss_rsp0 = rsp0;
 	PCPU_SET(rsp0, rsp0);
 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
 	PCPU_SET(curpcb, thread0.td_pcb);
 
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
 
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_frame = &proc0_tf;
 
         env = kern_getenv("kernelname");
 	if (env != NULL)
 		strlcpy(kernelname, env, sizeof(kernelname));
 
 	cpu_probe_amdc1e();
 
 #ifdef FDT
 	x86_init_fdt();
 #endif
 	thread0.td_critnest = 0;
 
 	TSEXIT();
 
 	/* Location of kernel stack for locore */
 	return ((u_int64_t)thread0.td_pcb);
 }
 
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 
 	pcpu->pc_acpi_id = 0xffffffff;
 }
 
 static int
 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct bios_smap *smapbase;
 	struct bios_smap_xattr smap;
 	caddr_t kmdp;
 	uint32_t *smapattr;
 	int count, error, i;
 
 	/* Retrieve the system memory map from the loader. */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase == NULL)
 		return (0);
 	smapattr = (uint32_t *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
 	error = 0;
 	for (i = 0; i < count; i++) {
 		smap.base = smapbase[i].base;
 		smap.length = smapbase[i].length;
 		smap.type = smapbase[i].type;
 		if (smapattr != NULL)
 			smap.xattr = smapattr[i];
 		else
 			smap.xattr = 0;
 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
 	}
 	return (error);
 }
 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
 
 static int
 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct efi_map_header *efihdr;
 	caddr_t kmdp;
 	uint32_t efisize;
 
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	if (efihdr == NULL)
 		return (0);
 	efisize = *((uint32_t *)efihdr - 1);
 	return (SYSCTL_OUT(req, efihdr, efisize));
 }
 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
 
 void
 spinlock_enter(void)
 {
 	struct thread *td;
 	register_t flags;
 
 	td = curthread;
 	if (td->td_md.md_spinlock_count == 0) {
 		flags = intr_disable();
 		td->td_md.md_spinlock_count = 1;
 		td->td_md.md_saved_flags = flags;
 		critical_enter();
 	} else
 		td->td_md.md_spinlock_count++;
 }
 
 void
 spinlock_exit(void)
 {
 	struct thread *td;
 	register_t flags;
 
 	td = curthread;
 	flags = td->td_md.md_saved_flags;
 	td->td_md.md_spinlock_count--;
 	if (td->td_md.md_spinlock_count == 0) {
 		critical_exit();
 		intr_restore(flags);
 	}
 }
 
 /*
  * Construct a PCB from a trapframe. This is called from kdb_trap() where
  * we want to start a backtrace from the function that caused us to enter
  * the debugger. We have the context in the trapframe, but base the trace
  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
  * enough for a backtrace.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	pcb->pcb_r12 = tf->tf_r12;
 	pcb->pcb_r13 = tf->tf_r13;
 	pcb->pcb_r14 = tf->tf_r14;
 	pcb->pcb_r15 = tf->tf_r15;
 	pcb->pcb_rbp = tf->tf_rbp;
 	pcb->pcb_rbx = tf->tf_rbx;
 	pcb->pcb_rip = tf->tf_rip;
 	pcb->pcb_rsp = tf->tf_rsp;
 }
 
 int
 ptrace_set_pc(struct thread *td, unsigned long addr)
 {
 
 	td->td_frame->tf_rip = addr;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 int
 ptrace_single_step(struct thread *td)
 {
 
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
 		td->td_frame->tf_rflags |= PSL_T;
 		td->td_dbgflags |= TDB_STEP;
 	}
 	return (0);
 }
 
 int
 ptrace_clear_single_step(struct thread *td)
 {
 
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	td->td_frame->tf_rflags &= ~PSL_T;
 	td->td_dbgflags &= ~TDB_STEP;
 	return (0);
 }
 
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *tp;
 
 	tp = td->td_frame;
 	return (fill_frame_regs(tp, regs));
 }
 
 int
 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 {
 
 	regs->r_r15 = tp->tf_r15;
 	regs->r_r14 = tp->tf_r14;
 	regs->r_r13 = tp->tf_r13;
 	regs->r_r12 = tp->tf_r12;
 	regs->r_r11 = tp->tf_r11;
 	regs->r_r10 = tp->tf_r10;
 	regs->r_r9  = tp->tf_r9;
 	regs->r_r8  = tp->tf_r8;
 	regs->r_rdi = tp->tf_rdi;
 	regs->r_rsi = tp->tf_rsi;
 	regs->r_rbp = tp->tf_rbp;
 	regs->r_rbx = tp->tf_rbx;
 	regs->r_rdx = tp->tf_rdx;
 	regs->r_rcx = tp->tf_rcx;
 	regs->r_rax = tp->tf_rax;
 	regs->r_rip = tp->tf_rip;
 	regs->r_cs = tp->tf_cs;
 	regs->r_rflags = tp->tf_rflags;
 	regs->r_rsp = tp->tf_rsp;
 	regs->r_ss = tp->tf_ss;
 	if (tp->tf_flags & TF_HASSEGS) {
 		regs->r_ds = tp->tf_ds;
 		regs->r_es = tp->tf_es;
 		regs->r_fs = tp->tf_fs;
 		regs->r_gs = tp->tf_gs;
 	} else {
 		regs->r_ds = 0;
 		regs->r_es = 0;
 		regs->r_fs = 0;
 		regs->r_gs = 0;
 	}
 	regs->r_err = 0;
 	regs->r_trapno = 0;
 	return (0);
 }
 
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *tp;
 	register_t rflags;
 
 	tp = td->td_frame;
 	rflags = regs->r_rflags & 0xffffffff;
 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
 		return (EINVAL);
 	tp->tf_r15 = regs->r_r15;
 	tp->tf_r14 = regs->r_r14;
 	tp->tf_r13 = regs->r_r13;
 	tp->tf_r12 = regs->r_r12;
 	tp->tf_r11 = regs->r_r11;
 	tp->tf_r10 = regs->r_r10;
 	tp->tf_r9  = regs->r_r9;
 	tp->tf_r8  = regs->r_r8;
 	tp->tf_rdi = regs->r_rdi;
 	tp->tf_rsi = regs->r_rsi;
 	tp->tf_rbp = regs->r_rbp;
 	tp->tf_rbx = regs->r_rbx;
 	tp->tf_rdx = regs->r_rdx;
 	tp->tf_rcx = regs->r_rcx;
 	tp->tf_rax = regs->r_rax;
 	tp->tf_rip = regs->r_rip;
 	tp->tf_cs = regs->r_cs;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = regs->r_rsp;
 	tp->tf_ss = regs->r_ss;
 	if (0) {	/* XXXKIB */
 		tp->tf_ds = regs->r_ds;
 		tp->tf_es = regs->r_es;
 		tp->tf_fs = regs->r_fs;
 		tp->tf_gs = regs->r_gs;
 		tp->tf_flags = TF_HASSEGS;
 	}
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 /* XXX check all this stuff! */
 /* externalize from sv_xmm */
 static void
 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 {
 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
 	int i;
 
 	/* pcb -> fpregs */
 	bzero(fpregs, sizeof(*fpregs));
 
 	/* FPU control/status */
 	penv_fpreg->en_cw = penv_xmm->en_cw;
 	penv_fpreg->en_sw = penv_xmm->en_sw;
 	penv_fpreg->en_tw = penv_xmm->en_tw;
 	penv_fpreg->en_opcode = penv_xmm->en_opcode;
 	penv_fpreg->en_rip = penv_xmm->en_rip;
 	penv_fpreg->en_rdp = penv_xmm->en_rdp;
 	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
 	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
 
 	/* FPU registers */
 	for (i = 0; i < 8; ++i)
 		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
 
 	/* SSE registers */
 	for (i = 0; i < 16; ++i)
 		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
 }
 
 /* internalize from fpregs into sv_xmm */
 static void
 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 {
 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 	int i;
 
 	/* fpregs -> pcb */
 	/* FPU control/status */
 	penv_xmm->en_cw = penv_fpreg->en_cw;
 	penv_xmm->en_sw = penv_fpreg->en_sw;
 	penv_xmm->en_tw = penv_fpreg->en_tw;
 	penv_xmm->en_opcode = penv_fpreg->en_opcode;
 	penv_xmm->en_rip = penv_fpreg->en_rip;
 	penv_xmm->en_rdp = penv_fpreg->en_rdp;
 	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
 	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
 
 	/* FPU registers */
 	for (i = 0; i < 8; ++i)
 		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
 
 	/* SSE registers */
 	for (i = 0; i < 16; ++i)
 		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
 }
 
 /* externalize from td->pcb */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 	    P_SHOULDSTOP(td->td_proc),
 	    ("not suspended thread %p", td));
 	fpugetregs(td);
 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
 	return (0);
 }
 
 /* internalize to td->pcb */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	critical_enter();
 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
 	fpuuserinited(td);
 	critical_exit();
 	return (0);
 }
 
 /*
  * Get machine context.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct pcb *pcb;
 	struct trapframe *tp;
 
 	pcb = td->td_pcb;
 	tp = td->td_frame;
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
 	PROC_UNLOCK(curthread->td_proc);
 	mcp->mc_r15 = tp->tf_r15;
 	mcp->mc_r14 = tp->tf_r14;
 	mcp->mc_r13 = tp->tf_r13;
 	mcp->mc_r12 = tp->tf_r12;
 	mcp->mc_r11 = tp->tf_r11;
 	mcp->mc_r10 = tp->tf_r10;
 	mcp->mc_r9  = tp->tf_r9;
 	mcp->mc_r8  = tp->tf_r8;
 	mcp->mc_rdi = tp->tf_rdi;
 	mcp->mc_rsi = tp->tf_rsi;
 	mcp->mc_rbp = tp->tf_rbp;
 	mcp->mc_rbx = tp->tf_rbx;
 	mcp->mc_rcx = tp->tf_rcx;
 	mcp->mc_rflags = tp->tf_rflags;
 	if (flags & GET_MC_CLEAR_RET) {
 		mcp->mc_rax = 0;
 		mcp->mc_rdx = 0;
 		mcp->mc_rflags &= ~PSL_C;
 	} else {
 		mcp->mc_rax = tp->tf_rax;
 		mcp->mc_rdx = tp->tf_rdx;
 	}
 	mcp->mc_rip = tp->tf_rip;
 	mcp->mc_cs = tp->tf_cs;
 	mcp->mc_rsp = tp->tf_rsp;
 	mcp->mc_ss = tp->tf_ss;
 	mcp->mc_ds = tp->tf_ds;
 	mcp->mc_es = tp->tf_es;
 	mcp->mc_fs = tp->tf_fs;
 	mcp->mc_gs = tp->tf_gs;
 	mcp->mc_flags = tp->tf_flags;
 	mcp->mc_len = sizeof(*mcp);
 	get_fpcontext(td, mcp, NULL, 0);
 	update_pcb_bases(pcb);
 	mcp->mc_fsbase = pcb->pcb_fsbase;
 	mcp->mc_gsbase = pcb->pcb_gsbase;
 	mcp->mc_xfpustate = 0;
 	mcp->mc_xfpustate_len = 0;
 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  */
 int
 set_mcontext(struct thread *td, mcontext_t *mcp)
 {
 	struct pcb *pcb;
 	struct trapframe *tp;
 	char *xfpustate;
 	long rflags;
 	int ret;
 
 	pcb = td->td_pcb;
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp) ||
 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
 		return (EINVAL);
 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
 	    (tp->tf_rflags & ~PSL_USERCHANGE);
 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu))
 			return (EINVAL);
 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
 		    mcp->mc_xfpustate_len);
 		if (ret != 0)
 			return (ret);
 	} else
 		xfpustate = NULL;
 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
 	if (ret != 0)
 		return (ret);
 	tp->tf_r15 = mcp->mc_r15;
 	tp->tf_r14 = mcp->mc_r14;
 	tp->tf_r13 = mcp->mc_r13;
 	tp->tf_r12 = mcp->mc_r12;
 	tp->tf_r11 = mcp->mc_r11;
 	tp->tf_r10 = mcp->mc_r10;
 	tp->tf_r9  = mcp->mc_r9;
 	tp->tf_r8  = mcp->mc_r8;
 	tp->tf_rdi = mcp->mc_rdi;
 	tp->tf_rsi = mcp->mc_rsi;
 	tp->tf_rbp = mcp->mc_rbp;
 	tp->tf_rbx = mcp->mc_rbx;
 	tp->tf_rdx = mcp->mc_rdx;
 	tp->tf_rcx = mcp->mc_rcx;
 	tp->tf_rax = mcp->mc_rax;
 	tp->tf_rip = mcp->mc_rip;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = mcp->mc_rsp;
 	tp->tf_ss = mcp->mc_ss;
 	tp->tf_flags = mcp->mc_flags;
 	if (tp->tf_flags & TF_HASSEGS) {
 		tp->tf_ds = mcp->mc_ds;
 		tp->tf_es = mcp->mc_es;
 		tp->tf_fs = mcp->mc_fs;
 		tp->tf_gs = mcp->mc_gs;
 	}
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 	if (mcp->mc_flags & _MC_HASBASES) {
 		pcb->pcb_fsbase = mcp->mc_fsbase;
 		pcb->pcb_gsbase = mcp->mc_gsbase;
 	}
 	return (0);
 }
 
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
     size_t xfpusave_len)
 {
 	size_t max_len, len;
 
 	mcp->mc_ownedfp = fpugetregs(td);
 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 	    sizeof(mcp->mc_fpstate));
 	mcp->mc_fpformat = fpuformat();
 	if (!use_xsave || xfpusave_len == 0)
 		return;
 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 	len = xfpusave_len;
 	if (len > max_len) {
 		len = max_len;
 		bzero(xfpusave + max_len, len - max_len);
 	}
 	mcp->mc_flags |= _MC_HASFPXSTATE;
 	mcp->mc_xfpustate_len = len;
 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 }
 
 static int
 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
     size_t xfpustate_len)
 {
 	int error;
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		error = 0;
 	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
 		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
 		    xfpustate, xfpustate_len);
 	} else
 		return (EINVAL);
 	return (error);
 }
 
 void
 fpstate_drop(struct thread *td)
 {
 
 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == td)
 		fpudrop();
 	/*
 	 * XXX force a full drop of the fpu.  The above only drops it if we
 	 * owned it.
 	 *
 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
 	 * have too many layers.
 	 */
 	clear_pcb_flags(curthread->td_pcb,
 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
 	critical_exit();
 }
 
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 
 	if (td == NULL) {
 		dbregs->dr[0] = rdr0();
 		dbregs->dr[1] = rdr1();
 		dbregs->dr[2] = rdr2();
 		dbregs->dr[3] = rdr3();
 		dbregs->dr[6] = rdr6();
 		dbregs->dr[7] = rdr7();
 	} else {
 		pcb = td->td_pcb;
 		dbregs->dr[0] = pcb->pcb_dr0;
 		dbregs->dr[1] = pcb->pcb_dr1;
 		dbregs->dr[2] = pcb->pcb_dr2;
 		dbregs->dr[3] = pcb->pcb_dr3;
 		dbregs->dr[6] = pcb->pcb_dr6;
 		dbregs->dr[7] = pcb->pcb_dr7;
 	}
 	dbregs->dr[4] = 0;
 	dbregs->dr[5] = 0;
 	dbregs->dr[8] = 0;
 	dbregs->dr[9] = 0;
 	dbregs->dr[10] = 0;
 	dbregs->dr[11] = 0;
 	dbregs->dr[12] = 0;
 	dbregs->dr[13] = 0;
 	dbregs->dr[14] = 0;
 	dbregs->dr[15] = 0;
 	return (0);
 }
 
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 	} else {
 		/*
 		 * Don't let an illegal value for dr7 get set.  Specifically,
 		 * check for undefined settings.  Setting these bit patterns
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP or a general protection fault right here.
 		 * Upper bits of dr6 and dr7 must not be set
 		 */
 		for (i = 0; i < 4; i++) {
 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 			if (td->td_frame->tf_cs == _ucode32sel &&
 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 				return (EINVAL);
 		}
 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 			return (EINVAL);
 
 		pcb = td->td_pcb;
 
 		/*
 		 * Don't let a process set a breakpoint that is not within the
 		 * process's address space.  If a process could do this, it
 		 * could halt the system by setting a breakpoint in the kernel
 		 * (if ddb was enabled).  Thus, we need to check to make sure
 		 * that no breakpoints are being enabled for addresses outside
 		 * process's address space.
 		 *
 		 * XXX - what about when the watched area of the user's
 		 * address space is written into from within the kernel
 		 * ... wouldn't that still cause a breakpoint to be generated
 		 * from within kernel mode?
 		 */
 
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 
 		pcb->pcb_dr0 = dbregs->dr[0];
 		pcb->pcb_dr1 = dbregs->dr[1];
 		pcb->pcb_dr2 = dbregs->dr[2];
 		pcb->pcb_dr3 = dbregs->dr[3];
 		pcb->pcb_dr6 = dbregs->dr[6];
 		pcb->pcb_dr7 = dbregs->dr[7];
 
 		set_pcb_flags(pcb, PCB_DBREGS);
 	}
 
 	return (0);
 }
 
 void
 reset_dbregs(void)
 {
 
 	load_dr7(0);	/* Turn off the control bits first */
 	load_dr0(0);
 	load_dr1(0);
 	load_dr2(0);
 	load_dr3(0);
 	load_dr6(0);
 }
 
 /*
  * Return > 0 if a hardware breakpoint has been hit, and the
  * breakpoint was in user space.  Return 0, otherwise.
  */
 int
 user_dbreg_trap(register_t dr6)
 {
         u_int64_t dr7;
         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
         int nbp;            /* number of breakpoints that triggered */
         caddr_t addr[4];    /* breakpoint addresses */
         int i;
 
         bp = dr6 & DBREG_DR6_BMASK;
         if (bp == 0) {
                 /*
                  * None of the breakpoint bits are set meaning this
                  * trap was not caused by any of the debug registers
                  */
                 return 0;
         }
 
         dr7 = rdr7();
         if ((dr7 & 0x000000ff) == 0) {
                 /*
                  * all GE and LE bits in the dr7 register are zero,
                  * thus the trap couldn't have been caused by the
                  * hardware debug registers
                  */
                 return 0;
         }
 
         nbp = 0;
 
         /*
          * at least one of the breakpoints were hit, check to see
          * which ones and if any of them are user space addresses
          */
 
         if (bp & 0x01) {
                 addr[nbp++] = (caddr_t)rdr0();
         }
         if (bp & 0x02) {
                 addr[nbp++] = (caddr_t)rdr1();
         }
         if (bp & 0x04) {
                 addr[nbp++] = (caddr_t)rdr2();
         }
         if (bp & 0x08) {
                 addr[nbp++] = (caddr_t)rdr3();
         }
 
         for (i = 0; i < nbp; i++) {
                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
                         /*
                          * addr[i] is in user space
                          */
                         return nbp;
                 }
         }
 
         /*
          * None of the breakpoints are in user space.
          */
         return 0;
 }
 
 /*
  * The pcb_flags is only modified by current thread, or by other threads
  * when current thread is stopped.  However, current thread may change it
  * from the interrupt context in cpu_switch(), or in the trap handler.
  * When we read-modify-write pcb_flags from C sources, compiler may generate
  * code that is not atomic regarding the interrupt handler.  If a trap or
  * interrupt happens and any flag is modified from the handler, it can be
  * clobbered with the cached value later.  Therefore, we implement setting
  * and clearing flags with single-instruction functions, which do not race
  * with possible modification of the flags from the trap or interrupt context,
  * because traps and interrupts are executed only on instruction boundary.
  */
 void
 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
 {
 
 	__asm __volatile("orl %1,%0"
 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
 	    : "cc", "memory");
 
 }
 
 /*
  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
  * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
  * pcb if user space modified the bases.  We must save on the context
  * switch or if the return to usermode happens through the doreti.
  *
  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
  * which have a consequence that the base MSRs must be saved each time
  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
  * context switches.
  */
 static void
 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
 {
 	register_t r;
 
 	if (curpcb == pcb &&
 	    (flags & PCB_FULL_IRET) != 0 &&
 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
 		r = intr_disable();
 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
 			if (rfs() == _ufssel)
 				pcb->pcb_fsbase = rdfsbase();
 			if (rgs() == _ugssel)
 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
 		}
 		set_pcb_flags_raw(pcb, flags);
 		intr_restore(r);
 	} else {
 		set_pcb_flags_raw(pcb, flags);
 	}
 }
 
 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int), static)
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
 }
 
 void
 clear_pcb_flags(struct pcb *pcb, const u_int flags)
 {
 
 	__asm __volatile("andl %1,%0"
 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
 	    : "cc", "memory");
 }
 
 #ifdef KDB
 
 /*
  * Provide inb() and outb() as functions.  They are normally only available as
  * inline functions, thus cannot be called from the debugger.
  */
 
 /* silence compiler warnings */
 u_char inb_(u_short);
 void outb_(u_short, u_char);
 
 u_char
 inb_(u_short port)
 {
 	return inb(port);
 }
 
 void
 outb_(u_short port, u_char data)
 {
 	outb(port, data);
 }
 
 #endif /* KDB */
 
 #undef memset
 #undef memmove
 #undef memcpy
 
 void	*memset_std(void *buf, int c, size_t len);
 void	*memset_erms(void *buf, int c, size_t len);
 DEFINE_IFUNC(, void *, memset, (void *, int, size_t), static)
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 	    memset_erms : memset_std);
 }
 
 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
     size_t), static)
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 	    memmove_erms : memmove_std);
 }
 
 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t),
     static)
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 	    memcpy_erms : memcpy_std);
 }
 
 void	pagezero_std(void *addr);
 void	pagezero_erms(void *addr);
 DEFINE_IFUNC(, void , pagezero, (void *), static)
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 	    pagezero_erms : pagezero_std);
 }
Index: projects/runtime-coverage-v2/sys/amd64/amd64/support.S
===================================================================
--- projects/runtime-coverage-v2/sys/amd64/amd64/support.S	(revision 347598)
+++ projects/runtime-coverage-v2/sys/amd64/amd64/support.S	(revision 347599)
@@ -1,1624 +1,1865 @@
 /*-
+ * Copyright (c) 2018-2019 The FreeBSD Foundation
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1993 The Regents of the University of California.
  * All rights reserved.
  *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_ddb.h"
 
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 #include <machine/pmap.h>
 
 #include "assym.inc"
 
 	.text
 
 /* Address: %rdi */
 ENTRY(pagezero_std)
 	PUSH_FRAME_POINTER
 	movl	$PAGE_SIZE/8,%ecx
 	xorl	%eax,%eax
 	rep
 	stosq
 	POP_FRAME_POINTER
 	ret
 END(pagezero_std)
 
 ENTRY(pagezero_erms)
 	PUSH_FRAME_POINTER
 	movl	$PAGE_SIZE,%ecx
 	xorl	%eax,%eax
 	rep
 	stosb
 	POP_FRAME_POINTER
 	ret
 END(pagezero_erms)
 
 /*
  * pagecopy(%rdi=from, %rsi=to)
  */
 ENTRY(pagecopy)
 	PUSH_FRAME_POINTER
 	movl	$PAGE_SIZE/8,%ecx
 	movq	%rdi,%r9
 	movq	%rsi,%rdi
 	movq	%r9,%rsi
 	rep
 	movsq
 	POP_FRAME_POINTER
 	ret
 END(pagecopy)
 
 /* Address: %rdi */
 ENTRY(sse2_pagezero)
 	PUSH_FRAME_POINTER
 	movq	$-PAGE_SIZE,%rdx
 	subq	%rdx,%rdi
 	xorl	%eax,%eax
 	jmp	1f
 	/*
 	 * The loop takes 29 bytes.  Ensure that it doesn't cross a 32-byte
 	 * cache line.
 	 */
 	.p2align 5,0x90
 1:
 	movnti	%rax,(%rdi,%rdx)
 	movnti	%rax,8(%rdi,%rdx)
 	movnti	%rax,16(%rdi,%rdx)
 	movnti	%rax,24(%rdi,%rdx)
 	addq	$32,%rdx
 	jne	1b
 	sfence
 	POP_FRAME_POINTER
 	ret
 END(sse2_pagezero)
 
 /*
  * memcmpy(b1, b2, len)
  *	   rdi,rsi,len
  */
 ENTRY(memcmp)
 	PUSH_FRAME_POINTER
 	cmpq	$16,%rdx
 	jae	5f
 1:
 	testq	%rdx,%rdx
 	je	3f
 	xorl	%ecx,%ecx
 2:
 	movzbl	(%rdi,%rcx,1),%eax
 	movzbl	(%rsi,%rcx,1),%r8d
 	cmpb	%r8b,%al
 	jne	4f
 	addq    $1,%rcx
 	cmpq    %rcx,%rdx
 	jz	3f
 	movzbl	(%rdi,%rcx,1),%eax
 	movzbl	(%rsi,%rcx,1),%r8d
 	cmpb	%r8b,%al
 	jne	4f
 	addq	$1,%rcx
 	cmpq	%rcx,%rdx
 	jz	3f
 	movzbl	(%rdi,%rcx,1),%eax
 	movzbl	(%rsi,%rcx,1),%r8d
 	cmpb	%r8b,%al
 	jne	4f
 	addq	$1,%rcx
 	cmpq	%rcx,%rdx
 	jz	3f
 	movzbl	(%rdi,%rcx,1),%eax
 	movzbl	(%rsi,%rcx,1),%r8d
 	cmpb	%r8b,%al
 	jne	4f
 	addq	$1,%rcx
 	cmpq	%rcx,%rdx
 	jne	2b
 3:
 	xorl	%eax,%eax
 	POP_FRAME_POINTER
 	ret
 4:
 	subl	%r8d,%eax
 	POP_FRAME_POINTER
 	ret
 5:
 	cmpq	$32,%rdx
 	jae	7f
 6:
 	/*
 	 * 8 bytes
 	 */
 	movq    (%rdi),%r8
 	movq    (%rsi),%r9
 	cmpq    %r8,%r9
 	jne	1b
 	leaq	8(%rdi),%rdi
 	leaq	8(%rsi),%rsi
 	subq	$8,%rdx
 	cmpq	$8,%rdx
 	jae	6b
 	jl	1b
 	jmp	3b
 7:
 	/*
 	 * 32 bytes
 	 */
 	movq    (%rsi),%r8
 	movq    8(%rsi),%r9
 	subq    (%rdi),%r8
 	subq    8(%rdi),%r9
 	or	%r8,%r9
 	jnz	1b
 
 	movq    16(%rsi),%r8
 	movq    24(%rsi),%r9
 	subq    16(%rdi),%r8
 	subq    24(%rdi),%r9
 	or	%r8,%r9
 	jnz	1b
 
 	leaq    32(%rdi),%rdi
 	leaq    32(%rsi),%rsi
 	subq    $32,%rdx
 	cmpq    $32,%rdx
 	jae	7b
 	jnz	1b
 	jmp	3b
 END(memcmp)
 
 /*
  * memmove(dst, src, cnt)
  *         rdi, rsi, rdx
  */
 
 /*
  * Register state at entry is supposed to be as follows:
  * rdi - destination
  * rsi - source
  * rdx - count
  *
  * The macro possibly clobbers the above and: rcx, r8, r9, r10
  * It does not clobber rax nor r11.
  */
 .macro MEMMOVE erms overlap begin end
 	\begin
 
 	/*
 	 * For sizes 0..32 all data is read before it is written, so there
 	 * is no correctness issue with direction of copying.
 	 */
 	cmpq	$32,%rcx
 	jbe	101632f
 
 .if \overlap == 1
 	movq	%rdi,%r8
 	subq	%rsi,%r8
 	cmpq	%rcx,%r8	/* overlapping && src < dst? */
 	jb	2f
 .endif
 
 	cmpq	$256,%rcx
 	ja	1256f
 
 103200:
 	movq	(%rsi),%rdx
 	movq	%rdx,(%rdi)
 	movq	8(%rsi),%rdx
 	movq	%rdx,8(%rdi)
 	movq	16(%rsi),%rdx
 	movq	%rdx,16(%rdi)
 	movq	24(%rsi),%rdx
 	movq	%rdx,24(%rdi)
 	leaq	32(%rsi),%rsi
 	leaq	32(%rdi),%rdi
 	subq	$32,%rcx
 	cmpq	$32,%rcx
 	jae	103200b
 	cmpb	$0,%cl
 	jne	101632f
 	\end
 	ret
 	ALIGN_TEXT
 101632:
 	cmpb	$16,%cl
 	jl	100816f
 	movq	(%rsi),%rdx
 	movq	8(%rsi),%r8
 	movq	-16(%rsi,%rcx),%r9
 	movq	-8(%rsi,%rcx),%r10
 	movq	%rdx,(%rdi)
 	movq	%r8,8(%rdi)
 	movq	%r9,-16(%rdi,%rcx)
 	movq	%r10,-8(%rdi,%rcx)
 	\end
 	ret
 	ALIGN_TEXT
 100816:
 	cmpb	$8,%cl
 	jl	100408f
 	movq	(%rsi),%rdx
 	movq	-8(%rsi,%rcx),%r8
 	movq	%rdx,(%rdi)
 	movq	%r8,-8(%rdi,%rcx,)
 	\end
 	ret
 	ALIGN_TEXT
 100408:
 	cmpb	$4,%cl
 	jl	100204f
 	movl	(%rsi),%edx
 	movl	-4(%rsi,%rcx),%r8d
 	movl	%edx,(%rdi)
 	movl	%r8d,-4(%rdi,%rcx)
 	\end
 	ret
 	ALIGN_TEXT
 100204:
 	cmpb	$2,%cl
 	jl	100001f
 	movzwl	(%rsi),%edx
 	movzwl	-2(%rsi,%rcx),%r8d
 	movw	%dx,(%rdi)
 	movw	%r8w,-2(%rdi,%rcx)
 	\end
 	ret
 	ALIGN_TEXT
 100001:
 	cmpb	$1,%cl
 	jl	100000f
 	movb	(%rsi),%dl
 	movb	%dl,(%rdi)
 100000:
 	\end
 	ret
 
 	ALIGN_TEXT
 1256:
 	testb	$15,%dil
 	jnz	100f
 .if \erms == 1
 	rep
 	movsb
 .else
 	shrq	$3,%rcx                         /* copy by 64-bit words */
 	rep
 	movsq
 	movq	%rdx,%rcx
 	andl	$7,%ecx                         /* any bytes left? */
 	jne	100408b
 .endif
 	\end
 	ret
 100:
 	movq	(%rsi),%r8
 	movq	8(%rsi),%r9
 	movq	%rdi,%r10
 	movq	%rdi,%rcx
 	andq	$15,%rcx
 	leaq	-16(%rdx,%rcx),%rdx
 	neg	%rcx
 	leaq	16(%rdi,%rcx),%rdi
 	leaq	16(%rsi,%rcx),%rsi
 	movq	%rdx,%rcx
 .if \erms == 1
 	rep
 	movsb
 	movq	%r8,(%r10)
 	movq	%r9,8(%r10)
 .else
 	shrq	$3,%rcx                         /* copy by 64-bit words */
 	rep
 	movsq
 	movq	%r8,(%r10)
 	movq	%r9,8(%r10)
 	movq	%rdx,%rcx
 	andl	$7,%ecx                         /* any bytes left? */
 	jne	100408b
 .endif
 	\end
 	ret
 
 .if \overlap == 1
 	/*
 	 * Copy backwards.
 	 */
         ALIGN_TEXT
 2:
 	cmpq	$256,%rcx
 	ja	2256f
 
 	leaq	-8(%rdi,%rcx),%rdi
 	leaq	-8(%rsi,%rcx),%rsi
 
 	cmpq	$32,%rcx
 	jb	2016f
 
 2032:
 	movq	(%rsi),%rdx
 	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
 	movq	-16(%rsi),%rdx
 	movq	%rdx,-16(%rdi)
 	movq	-24(%rsi),%rdx
 	movq	%rdx,-24(%rdi)
 	leaq	-32(%rsi),%rsi
 	leaq	-32(%rdi),%rdi
 	subq	$32,%rcx
 	cmpq	$32,%rcx
 	jae	2032b
 	cmpb	$0,%cl
 	jne	2016f
 	\end
 	ret
 	ALIGN_TEXT
 2016:
 	cmpb	$16,%cl
 	jl	2008f
 	movq	(%rsi),%rdx
 	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
 	subb	$16,%cl
 	jz	2000f
 	leaq	-16(%rsi),%rsi
 	leaq	-16(%rdi),%rdi
 2008:
 	cmpb	$8,%cl
 	jl	2004f
 	movq	(%rsi),%rdx
 	movq	%rdx,(%rdi)
 	subb	$8,%cl
 	jz	2000f
 	leaq	-8(%rsi),%rsi
 	leaq	-8(%rdi),%rdi
 2004:
 	cmpb	$4,%cl
 	jl	2002f
 	movl	4(%rsi),%edx
 	movl	%edx,4(%rdi)
 	subb	$4,%cl
 	jz	2000f
 	leaq	-4(%rsi),%rsi
 	leaq	-4(%rdi),%rdi
 2002:
 	cmpb	$2,%cl
 	jl	2001f
 	movw	6(%rsi),%dx
 	movw	%dx,6(%rdi)
 	subb	$2,%cl
 	jz	2000f
 	leaq	-2(%rsi),%rsi
 	leaq	-2(%rdi),%rdi
 2001:
 	cmpb	$1,%cl
 	jl	2000f
 	movb	7(%rsi),%dl
 	movb	%dl,7(%rdi)
 2000:
 	\end
 	ret
 	ALIGN_TEXT
 2256:
 	std
 .if \erms == 1
 	leaq	-1(%rdi,%rcx),%rdi
 	leaq	-1(%rsi,%rcx),%rsi
 	rep
 	movsb
 	cld
 .else
 	leaq	-8(%rdi,%rcx),%rdi
 	leaq	-8(%rsi,%rcx),%rsi
 	shrq	$3,%rcx
 	rep
 	movsq
 	cld
 	movq	%rdx,%rcx
 	andb	$7,%cl
 	jne	2004b
 .endif
 	\end
 	ret
 .endif
 .endm
 
 .macro MEMMOVE_BEGIN
 	PUSH_FRAME_POINTER
 	movq	%rdi,%rax
 	movq	%rdx,%rcx
 .endm
 
 .macro MEMMOVE_END
 	POP_FRAME_POINTER
 .endm
 
 ENTRY(memmove_std)
 	MEMMOVE erms=0 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memmove_std)
 
 ENTRY(memmove_erms)
 	MEMMOVE erms=1 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memmove_erms)
 
 /*
  * memcpy(dst, src, len)
  *        rdi, rsi, rdx
  *
  * Note: memcpy does not support overlapping copies
  */
 ENTRY(memcpy_std)
 	MEMMOVE erms=0 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memcpy_std)
 
 ENTRY(memcpy_erms)
 	MEMMOVE erms=1 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memcpy_erms)
 
 /*
  * memset(dst, c,   len)
  *        rdi, rsi, rdx
  */
 .macro MEMSET erms
 	PUSH_FRAME_POINTER
 	movq	%rdi,%rax
 	movq	%rdx,%rcx
 	movzbq	%sil,%r8
 	movabs	$0x0101010101010101,%r10
 	imulq	%r8,%r10
 
 	cmpq	$32,%rcx
 	jbe	101632f
 
 	cmpq	$256,%rcx
 	ja	1256f
 
 103200:
 	movq	%r10,(%rdi)
 	movq	%r10,8(%rdi)
 	movq	%r10,16(%rdi)
 	movq	%r10,24(%rdi)
 	leaq	32(%rdi),%rdi
 	subq	$32,%rcx
 	cmpq	$32,%rcx
 	ja	103200b
 	cmpb	$16,%cl
 	ja	201632f
 	movq	%r10,-16(%rdi,%rcx)
 	movq	%r10,-8(%rdi,%rcx)
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 101632:
 	cmpb	$16,%cl
 	jl	100816f
 201632:
 	movq	%r10,(%rdi)
 	movq	%r10,8(%rdi)
 	movq	%r10,-16(%rdi,%rcx)
 	movq	%r10,-8(%rdi,%rcx)
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 100816:
 	cmpb	$8,%cl
 	jl	100408f
 	movq	%r10,(%rdi)
 	movq	%r10,-8(%rdi,%rcx)
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 100408:
 	cmpb	$4,%cl
 	jl	100204f
 	movl	%r10d,(%rdi)
 	movl	%r10d,-4(%rdi,%rcx)
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 100204:
 	cmpb	$2,%cl
 	jl	100001f
 	movw	%r10w,(%rdi)
 	movw	%r10w,-2(%rdi,%rcx)
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 100001:
 	cmpb	$0,%cl
 	je	100000f
 	movb	%r10b,(%rdi)
 100000:
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 1256:
 	movq	%rdi,%r9
 	movq	%r10,%rax
 	testl	$15,%edi
 	jnz	3f
 1:
 .if \erms == 1
 	rep
 	stosb
 	movq	%r9,%rax
 .else
 	movq	%rcx,%rdx
 	shrq	$3,%rcx
 	rep
 	stosq
 	movq	%r9,%rax
 	andl	$7,%edx
 	jnz	2f
 	POP_FRAME_POINTER
 	ret
 2:
 	movq	%r10,-8(%rdi,%rdx)
 .endif
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 3:
 	movq	%r10,(%rdi)
 	movq	%r10,8(%rdi)
 	movq	%rdi,%r8
 	andq	$15,%r8
 	leaq	-16(%rcx,%r8),%rcx
 	neg	%r8
 	leaq	16(%rdi,%r8),%rdi
 	jmp	1b
 .endm
 
 ENTRY(memset_std)
 	MEMSET erms=0
 END(memset_std)
 
 ENTRY(memset_erms)
 	MEMSET erms=1
 END(memset_erms)
 
 /* fillw(pat, base, cnt) */
 /*       %rdi,%rsi, %rdx */
 ENTRY(fillw)
 	PUSH_FRAME_POINTER
 	movq	%rdi,%rax
 	movq	%rsi,%rdi
 	movq	%rdx,%rcx
 	rep
 	stosw
 	POP_FRAME_POINTER
 	ret
 END(fillw)
 
 /*****************************************************************************/
 /* copyout and fubyte family                                                 */
 /*****************************************************************************/
 /*
  * Access user memory from inside the kernel. These routines should be
  * the only places that do this.
  *
  * These routines set curpcb->pcb_onfault for the time they execute. When a
  * protection violation occurs inside the functions, the trap handler
  * returns to *curpcb->pcb_onfault instead of the function.
  */
 
 .macro SMAP_DISABLE smap
 .if	\smap
 	stac
 .endif
 .endm
 
 
 .macro SMAP_ENABLE smap
 .if	\smap
 	clac
 .endif
 .endm
 
 .macro COPYINOUT_BEGIN
 .endm
 
 .macro COPYINOUT_END
 	movq	%rax,PCB_ONFAULT(%r11)
 	POP_FRAME_POINTER
 .endm
 
 .macro COPYINOUT_SMAP_END
 	SMAP_ENABLE smap=1
 	COPYINOUT_END
 .endm
 
 /*
  * copyout(from_kernel, to_user, len)
  *         %rdi,        %rsi,    %rdx
  */
 .macro	COPYOUT smap erms
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r11
 	movq	$copy_fault,PCB_ONFAULT(%r11)
 
 	/*
 	 * Check explicitly for non-user addresses.  If 486 write protection
 	 * is being used, this check is essential because we are in kernel
 	 * mode so the h/w does not provide any protection against writing
 	 * kernel addresses.
 	 */
 
 	/*
 	 * First, prevent address wrapping.
 	 */
 	movq	%rsi,%rax
 	addq	%rdx,%rax
 	jc	copy_fault
 /*
  * XXX STOP USING VM_MAXUSER_ADDRESS.
  * It is an end address, not a max, so every time it is used correctly it
  * looks like there is an off by one error, and of course it caused an off
  * by one error in several places.
  */
 	movq	$VM_MAXUSER_ADDRESS,%rcx
 	cmpq	%rcx,%rax
 	ja	copy_fault
 
 	/*
 	 * Set return value to zero. Remaining failure mode goes through
 	 * copy_fault.
 	 */
 	xorl	%eax,%eax
 
 	/*
 	 * Set up arguments for MEMMOVE.
 	 */
 	movq	%rdi,%r8
 	movq	%rsi,%rdi
 	movq	%r8,%rsi
 	movq	%rdx,%rcx
 
 
 	SMAP_DISABLE \smap
 .if	\smap == 1
 	MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_SMAP_END
 .else
 	MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_END
 .endif
 	/* NOTREACHED */
 .endm
 
 ENTRY(copyout_nosmap_std)
 	COPYOUT smap=0 erms=0
 END(copyout_nosmap_std)
 
 ENTRY(copyout_smap_std)
 	COPYOUT smap=1 erms=0
 END(copyout_smap_std)
 
 ENTRY(copyout_nosmap_erms)
 	COPYOUT smap=0 erms=1
 END(copyout_nosmap_erms)
 
 ENTRY(copyout_smap_erms)
 	COPYOUT smap=1 erms=1
 END(copyout_smap_erms)
 
 /*
  * copyin(from_user, to_kernel, len)
  *        %rdi,      %rsi,      %rdx
  */
 .macro	COPYIN smap erms
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r11
 	movq	$copy_fault,PCB_ONFAULT(%r11)
 
 	/*
 	 * make sure address is valid
 	 */
 	movq	%rdi,%rax
 	addq	%rdx,%rax
 	jc	copy_fault
 	movq	$VM_MAXUSER_ADDRESS,%rcx
 	cmpq	%rcx,%rax
 	ja	copy_fault
 
 	xorl	%eax,%eax
 
 	movq	%rdi,%r8
 	movq	%rsi,%rdi
 	movq	%r8,%rsi
 	movq	%rdx,%rcx
 
 	SMAP_DISABLE \smap
 .if	\smap == 1
 	MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_SMAP_END
 .else
 	MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_END
 .endif
 	/* NOTREACHED */
 .endm
 
 ENTRY(copyin_nosmap_std)
 	COPYIN smap=0 erms=0
 END(copyin_nosmap_std)
 
 ENTRY(copyin_smap_std)
 	COPYIN smap=1 erms=0
 END(copyin_smap_std)
 
 ENTRY(copyin_nosmap_erms)
 	COPYIN smap=0 erms=1
 END(copyin_nosmap_erms)
 
 ENTRY(copyin_smap_erms)
 	COPYIN smap=1 erms=1
 END(copyin_smap_erms)
 
 	ALIGN_TEXT
 	/* Trap entry clears PSL.AC */
 copy_fault:
 	movq	$0,PCB_ONFAULT(%r11)
 	movl	$EFAULT,%eax
 	POP_FRAME_POINTER
 	ret
 
 /*
  * casueword32.  Compare and set user integer.  Returns -1 on fault,
  *        0 if access was successful.  Old value is written to *oldp.
  *        dst = %rdi, old = %esi, oldp = %rdx, new = %ecx
  */
 ENTRY(casueword32_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$fusufault,PCB_ONFAULT(%r8)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	movl	%esi,%eax			/* old */
 #ifdef SMP
 	lock
 #endif
 	cmpxchgl %ecx,(%rdi)			/* new = %ecx */
 
 	/*
 	 * The old value is in %eax.  If the store succeeded it will be the
 	 * value we expected (old) from before the store, otherwise it will
 	 * be the current value.  Save %eax into %esi to prepare the return
 	 * value.
 	 */
 	movl	%eax,%esi
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%r8)
 
 	/*
 	 * Access the oldp after the pcb_onfault is cleared, to correctly
 	 * catch corrupted pointer.
 	 */
 	movl	%esi,(%rdx)			/* oldp = %rdx */
 	POP_FRAME_POINTER
 	ret
 END(casueword32_nosmap)
 
 ENTRY(casueword32_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$fusufault,PCB_ONFAULT(%r8)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	movl	%esi,%eax			/* old */
 	stac
 #ifdef SMP
 	lock
 #endif
 	cmpxchgl %ecx,(%rdi)			/* new = %ecx */
 	clac
 
 	/*
 	 * The old value is in %eax.  If the store succeeded it will be the
 	 * value we expected (old) from before the store, otherwise it will
 	 * be the current value.  Save %eax into %esi to prepare the return
 	 * value.
 	 */
 	movl	%eax,%esi
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%r8)
 
 	/*
 	 * Access the oldp after the pcb_onfault is cleared, to correctly
 	 * catch corrupted pointer.
 	 */
 	movl	%esi,(%rdx)			/* oldp = %rdx */
 	POP_FRAME_POINTER
 	ret
 END(casueword32_smap)
 
 /*
  * casueword.  Compare and set user long.  Returns -1 on fault,
  *        0 if access was successful.  Old value is written to *oldp.
  *        dst = %rdi, old = %rsi, oldp = %rdx, new = %rcx
  */
 ENTRY(casueword_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$fusufault,PCB_ONFAULT(%r8)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	movq	%rsi,%rax			/* old */
 #ifdef SMP
 	lock
 #endif
 	cmpxchgq %rcx,(%rdi)			/* new = %rcx */
 
 	/*
 	 * The old value is in %rax.  If the store succeeded it will be the
 	 * value we expected (old) from before the store, otherwise it will
 	 * be the current value.
 	 */
 	movq	%rax,%rsi
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%r8)
 	movq	%rsi,(%rdx)
 	POP_FRAME_POINTER
 	ret
 END(casueword_nosmap)
 
 ENTRY(casueword_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$fusufault,PCB_ONFAULT(%r8)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	movq	%rsi,%rax			/* old */
 	stac
 #ifdef SMP
 	lock
 #endif
 	cmpxchgq %rcx,(%rdi)			/* new = %rcx */
 	clac
 
 	/*
 	 * The old value is in %rax.  If the store succeeded it will be the
 	 * value we expected (old) from before the store, otherwise it will
 	 * be the current value.
 	 */
 	movq	%rax,%rsi
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%r8)
 	movq	%rsi,(%rdx)
 	POP_FRAME_POINTER
 	ret
 END(casueword_smap)
 
 /*
  * Fetch (load) a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit
  * byte from user memory.
  * addr = %rdi, valp = %rsi
  */
 
 ENTRY(fueword_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-8,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	xorl	%eax,%eax
 	movq	(%rdi),%r11
 	movq	%rax,PCB_ONFAULT(%rcx)
 	movq	%r11,(%rsi)
 	POP_FRAME_POINTER
 	ret
 END(fueword_nosmap)
 
 ENTRY(fueword_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-8,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	xorl	%eax,%eax
 	stac
 	movq	(%rdi),%r11
 	clac
 	movq	%rax,PCB_ONFAULT(%rcx)
 	movq	%r11,(%rsi)
 	POP_FRAME_POINTER
 	ret
 END(fueword_smap)
 
 ENTRY(fueword32_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	xorl	%eax,%eax
 	movl	(%rdi),%r11d
 	movq	%rax,PCB_ONFAULT(%rcx)
 	movl	%r11d,(%rsi)
 	POP_FRAME_POINTER
 	ret
 END(fueword32_nosmap)
 
 ENTRY(fueword32_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	xorl	%eax,%eax
 	stac
 	movl	(%rdi),%r11d
 	clac
 	movq	%rax,PCB_ONFAULT(%rcx)
 	movl	%r11d,(%rsi)
 	POP_FRAME_POINTER
 	ret
 END(fueword32_smap)
 
 ENTRY(fuword16_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-2,%rax
 	cmpq	%rax,%rdi
 	ja	fusufault
 
 	movzwl	(%rdi),%eax
 	movq	$0,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(fuword16_nosmap)
 
 ENTRY(fuword16_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-2,%rax
 	cmpq	%rax,%rdi
 	ja	fusufault
 
 	stac
 	movzwl	(%rdi),%eax
 	clac
 	movq	$0,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(fuword16_smap)
 
 ENTRY(fubyte_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-1,%rax
 	cmpq	%rax,%rdi
 	ja	fusufault
 
 	movzbl	(%rdi),%eax
 	movq	$0,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(fubyte_nosmap)
 
 ENTRY(fubyte_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-1,%rax
 	cmpq	%rax,%rdi
 	ja	fusufault
 
 	stac
 	movzbl	(%rdi),%eax
 	clac
 	movq	$0,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(fubyte_smap)
 
 /*
  * Store a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit byte to
  * user memory.
  * addr = %rdi, value = %rsi
  */
 ENTRY(suword_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-8,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	movq	%rsi,(%rdi)
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword_nosmap)
 
 ENTRY(suword_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-8,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	stac
 	movq	%rsi,(%rdi)
 	clac
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword_smap)
 
 ENTRY(suword32_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	movl	%esi,(%rdi)
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword32_nosmap)
 
 ENTRY(suword32_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	stac
 	movl	%esi,(%rdi)
 	clac
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword32_smap)
 
 ENTRY(suword16_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-2,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	movw	%si,(%rdi)
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword16_nosmap)
 
 ENTRY(suword16_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-2,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	stac
 	movw	%si,(%rdi)
 	clac
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword16_smap)
 
 ENTRY(subyte_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-1,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	movl	%esi,%eax
 	movb	%al,(%rdi)
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(subyte_nosmap)
 
 ENTRY(subyte_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-1,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	movl	%esi,%eax
 	stac
 	movb	%al,(%rdi)
 	clac
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(subyte_smap)
 
 	ALIGN_TEXT
 	/* Fault entry clears PSL.AC */
 fusufault:
 	movq	PCPU(CURPCB),%rcx
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	decq	%rax
 	POP_FRAME_POINTER
 	ret
 
 /*
  * copyinstr(from, to, maxlen, int *lencopied)
  *           %rdi, %rsi, %rdx, %rcx
  *
  *	copy a string from 'from' to 'to', stop when a 0 character is reached.
  *	return ENAMETOOLONG if string is longer than maxlen, and
  *	EFAULT on protection violations. If lencopied is non-zero,
  *	return the actual length in *lencopied.
  */
 .macro COPYINSTR smap
 	PUSH_FRAME_POINTER
 	movq	%rdx,%r8			/* %r8 = maxlen */
 	movq	PCPU(CURPCB),%r9
 	movq	$cpystrflt,PCB_ONFAULT(%r9)
 
 	movq	$VM_MAXUSER_ADDRESS,%rax
 
 	/* make sure 'from' is within bounds */
 	subq	%rdi,%rax
 	jbe	cpystrflt
 
 	SMAP_DISABLE \smap
 
 	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
 	cmpq	%rdx,%rax
 	jb	8f
 1:
 	incq	%rdx
 2:
 	decq	%rdx
 .if \smap == 0
 	jz	copyinstr_toolong
 .else
 	jz	copyinstr_toolong_smap
 .endif
 
 	movb	(%rdi),%al
 	movb	%al,(%rsi)
 	incq	%rsi
 	incq	%rdi
 	testb	%al,%al
 	jnz	2b
 
 	SMAP_ENABLE \smap
 
 	/* Success -- 0 byte reached */
 	decq	%rdx
 	xorl	%eax,%eax
 
 	/* set *lencopied and return %eax */
 	movq	%rax,PCB_ONFAULT(%r9)
 
 	testq	%rcx,%rcx
 	jz	3f
 	subq	%rdx,%r8
 	movq	%r8,(%rcx)
 3:
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 8:
 	movq	%rax,%rdx
 	movq	%rax,%r8
 	jmp 1b
 
 .endm
 
 ENTRY(copyinstr_nosmap)
 	COPYINSTR smap=0
 END(copyinstr_nosmap)
 
 ENTRY(copyinstr_smap)
 	COPYINSTR smap=1
 END(copyinstr_smap)
 
 cpystrflt:
 	/* Fault entry clears PSL.AC */
 	movl	$EFAULT,%eax
 cpystrflt_x:
 	/* set *lencopied and return %eax */
 	movq	$0,PCB_ONFAULT(%r9)
 
 	testq	%rcx,%rcx
 	jz	1f
 	subq	%rdx,%r8
 	movq	%r8,(%rcx)
 1:
 	POP_FRAME_POINTER
 	ret
 
 copyinstr_toolong_smap:
 	clac
 copyinstr_toolong:
 	/* rdx is zero - return ENAMETOOLONG or EFAULT */
 	movq	$VM_MAXUSER_ADDRESS,%rax
 	cmpq	%rax,%rdi
 	jae	cpystrflt
 	movl	$ENAMETOOLONG,%eax
 	jmp	cpystrflt_x
 
 /*
  * copystr(from, to, maxlen, int *lencopied)
  *         %rdi, %rsi, %rdx, %rcx
  */
 ENTRY(copystr)
 	PUSH_FRAME_POINTER
 	movq	%rdx,%r8			/* %r8 = maxlen */
 
 	incq    %rdx
 1:
 	decq	%rdx
 	jz	4f
 	movb	(%rdi),%al
 	movb	%al,(%rsi)
 	incq	%rsi
 	incq	%rdi
 	testb	%al,%al
 	jnz	1b
 
 	/* Success -- 0 byte reached */
 	decq	%rdx
 	xorl	%eax,%eax
 2:
 	testq	%rcx,%rcx
 	jz      3f
 	/* set *lencopied and return %rax */
 	subq	%rdx,%r8
 	movq	%r8,(%rcx)
 3:
 	POP_FRAME_POINTER
 	ret
 4:
 	/* rdx is zero -- return ENAMETOOLONG */
 	movl    $ENAMETOOLONG,%eax
 	jmp	2b
 END(copystr)
 
 /*
  * Handling of special amd64 registers and descriptor tables etc
  */
 /* void lgdt(struct region_descriptor *rdp); */
 ENTRY(lgdt)
 	/* reload the descriptor table */
 	lgdt	(%rdi)
 
 	/* flush the prefetch q */
 	jmp	1f
 	nop
 1:
 	movl	$KDSEL,%eax
 	movl	%eax,%ds
 	movl	%eax,%es
 	movl	%eax,%fs	/* Beware, use wrmsr to set 64 bit base */
 	movl	%eax,%gs
 	movl	%eax,%ss
 
 	/* reload code selector by turning return into intersegmental return */
 	popq	%rax
 	pushq	$KCSEL
 	pushq	%rax
 	MEXITCOUNT
 	lretq
 END(lgdt)
 
 /*****************************************************************************/
 /* setjump, longjump                                                         */
 /*****************************************************************************/
 
 ENTRY(setjmp)
 	movq	%rbx,0(%rdi)			/* save rbx */
 	movq	%rsp,8(%rdi)			/* save rsp */
 	movq	%rbp,16(%rdi)			/* save rbp */
 	movq	%r12,24(%rdi)			/* save r12 */
 	movq	%r13,32(%rdi)			/* save r13 */
 	movq	%r14,40(%rdi)			/* save r14 */
 	movq	%r15,48(%rdi)			/* save r15 */
 	movq	0(%rsp),%rdx			/* get rta */
 	movq	%rdx,56(%rdi)			/* save rip */
 	xorl	%eax,%eax			/* return(0); */
 	ret
 END(setjmp)
 
 ENTRY(longjmp)
 	movq	0(%rdi),%rbx			/* restore rbx */
 	movq	8(%rdi),%rsp			/* restore rsp */
 	movq	16(%rdi),%rbp			/* restore rbp */
 	movq	24(%rdi),%r12			/* restore r12 */
 	movq	32(%rdi),%r13			/* restore r13 */
 	movq	40(%rdi),%r14			/* restore r14 */
 	movq	48(%rdi),%r15			/* restore r15 */
 	movq	56(%rdi),%rdx			/* get rta */
 	movq	%rdx,0(%rsp)			/* put in return frame */
 	xorl	%eax,%eax			/* return(1); */
 	incl	%eax
 	ret
 END(longjmp)
 
 /*
  * Support for reading MSRs in the safe manner.  (Instead of panic on #gp,
  * return an error.)
  */
 ENTRY(rdmsr_safe)
 /* int rdmsr_safe(u_int msr, uint64_t *data) */
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$msr_onfault,PCB_ONFAULT(%r8)
 	movl	%edi,%ecx
 	rdmsr			/* Read MSR pointed by %ecx. Returns
 				   hi byte in edx, lo in %eax */
 	salq	$32,%rdx	/* sign-shift %rdx left */
 	movl	%eax,%eax	/* zero-extend %eax -> %rax */
 	orq	%rdx,%rax
 	movq	%rax,(%rsi)
 	xorq	%rax,%rax
 	movq	%rax,PCB_ONFAULT(%r8)
 	POP_FRAME_POINTER
 	ret
 
 /*
  * Support for writing MSRs in the safe manner.  (Instead of panic on #gp,
  * return an error.)
  */
 ENTRY(wrmsr_safe)
 /* int wrmsr_safe(u_int msr, uint64_t data) */
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$msr_onfault,PCB_ONFAULT(%r8)
 	movl	%edi,%ecx
 	movl	%esi,%eax
 	sarq	$32,%rsi
 	movl	%esi,%edx
 	wrmsr			/* Write MSR pointed by %ecx. Accepts
 				   hi byte in edx, lo in %eax. */
 	xorq	%rax,%rax
 	movq	%rax,PCB_ONFAULT(%r8)
 	POP_FRAME_POINTER
 	ret
 
 /*
  * MSR operations fault handler
  */
 	ALIGN_TEXT
 msr_onfault:
 	movq	$0,PCB_ONFAULT(%r8)
 	movl	$EFAULT,%eax
 	POP_FRAME_POINTER
 	ret
 
 /*
  * void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3);
  * Invalidates address space addressed by ucr3, then returns to kcr3.
  * Done in assembler to ensure no other memory accesses happen while
  * on ucr3.
  */
 	ALIGN_TEXT
 ENTRY(pmap_pti_pcid_invalidate)
 	pushfq
 	cli
 	movq	%rdi,%cr3	/* to user page table */
 	movq	%rsi,%cr3	/* back to kernel */
 	popfq
 	retq
 
 /*
  * void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
  * Invalidates virtual address va in address space ucr3, then returns to kcr3.
  */
 	ALIGN_TEXT
 ENTRY(pmap_pti_pcid_invlpg)
 	pushfq
 	cli
 	movq	%rdi,%cr3	/* to user page table */
 	invlpg	(%rdx)
 	movq	%rsi,%cr3	/* back to kernel */
 	popfq
 	retq
 
 /*
  * void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
  *     vm_offset_t eva);
  * Invalidates virtual addresses between sva and eva in address space ucr3,
  * then returns to kcr3.
  */
 	ALIGN_TEXT
 ENTRY(pmap_pti_pcid_invlrng)
 	pushfq
 	cli
 	movq	%rdi,%cr3	/* to user page table */
 1:	invlpg	(%rdx)
 	addq	$PAGE_SIZE,%rdx
 	cmpq	%rdx,%rcx
 	ja	1b
 	movq	%rsi,%cr3	/* back to kernel */
 	popfq
 	retq
 
 	.altmacro
 	.macro	ibrs_seq_label l
 handle_ibrs_\l:
 	.endm
 	.macro	ibrs_call_label l
 	call	handle_ibrs_\l
 	.endm
 	.macro	ibrs_seq count
 	ll=1
 	.rept	\count
 	ibrs_call_label	%(ll)
 	nop
 	ibrs_seq_label %(ll)
 	addq	$8,%rsp
 	ll=ll+1
 	.endr
 	.endm
 
 /* all callers already saved %rax, %rdx, and %rcx */
 ENTRY(handle_ibrs_entry)
 	cmpb	$0,hw_ibrs_active(%rip)
 	je	1f
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	orl	$(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
 	orl	$(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32,%edx
 	wrmsr
 	movb	$1,PCPU(IBPB_SET)
 	testl	$CPUID_STDEXT_SMEP,cpu_stdext_feature(%rip)
 	jne	1f
 	ibrs_seq 32
 1:	ret
 END(handle_ibrs_entry)
 
 ENTRY(handle_ibrs_exit)
 	cmpb	$0,PCPU(IBPB_SET)
 	je	1f
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	andl	$~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
 	andl	$~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx
 	wrmsr
 	movb	$0,PCPU(IBPB_SET)
 1:	ret
 END(handle_ibrs_exit)
 
 /* registers-neutral version, but needs stack */
 ENTRY(handle_ibrs_exit_rs)
 	cmpb	$0,PCPU(IBPB_SET)
 	je	1f
 	pushq	%rax
 	pushq	%rdx
 	pushq	%rcx
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	andl	$~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
 	andl	$~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx
 	wrmsr
 	popq	%rcx
 	popq	%rdx
 	popq	%rax
 	movb	$0,PCPU(IBPB_SET)
 1:	ret
 END(handle_ibrs_exit_rs)
 
 	.noaltmacro
 
 /*
  * Flush L1D cache.  Load enough of the data from the kernel text
  * to flush existing L1D content.
  *
  * N.B. The function does not follow ABI calling conventions, it corrupts %rbx.
  * The vmm.ko caller expects that only %rax, %rdx, %rbx, %rcx, %r9, and %rflags
  * registers are clobbered.  The NMI handler caller only needs %r13 preserved.
  */
 ENTRY(flush_l1d_sw)
 #define	L1D_FLUSH_SIZE	(64 * 1024)
 	movq	$KERNBASE, %r9
 	movq	$-L1D_FLUSH_SIZE, %rcx
 	/*
 	 * pass 1: Preload TLB.
 	 * Kernel text is mapped using superpages.  TLB preload is
 	 * done for the benefit of older CPUs which split 2M page
 	 * into 4k TLB entries.
 	 */
 1:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
 	addq	$PAGE_SIZE, %rcx
 	jne	1b
 	xorl	%eax, %eax
 	cpuid
 	movq	$-L1D_FLUSH_SIZE, %rcx
 	/* pass 2: Read each cache line. */
 2:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
 	addq	$64, %rcx
 	jne	2b
 	lfence
 	ret
 #undef	L1D_FLUSH_SIZE
 END(flush_l1d_sw)
 
 ENTRY(flush_l1d_sw_abi)
 	pushq	%rbx
 	call	flush_l1d_sw
 	popq	%rbx
 	ret
 END(flush_l1d_sw_abi)
+
+ENTRY(mds_handler_void)
+	retq
+END(mds_handler_void)
+
+ENTRY(mds_handler_verw)
+	subq	$8, %rsp
+	movw	%ds, (%rsp)
+	verw	(%rsp)
+	addq	$8, %rsp
+	retq
+END(mds_handler_verw)
+
+ENTRY(mds_handler_ivb)
+	pushq	%rax
+	pushq	%rdx
+	pushq	%rcx
+
+	movq	%cr0, %rax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movq	PCPU(MDS_BUF), %rdx
+	movdqa	%xmm0, PCPU(MDS_TMP)
+	pxor	%xmm0, %xmm0
+
+	lfence
+	orpd	(%rdx), %xmm0
+	orpd	(%rdx), %xmm0
+	mfence
+	movl	$40, %ecx
+	addq	$16, %rdx
+2:	movntdq	%xmm0, (%rdx)
+	addq	$16, %rdx
+	decl	%ecx
+	jnz	2b
+	mfence
+
+	movdqa	PCPU(MDS_TMP),%xmm0
+	testb	$CR0_TS, %al
+	je	3f
+	movq	%rax, %cr0
+3:	popq	%rcx
+	popq	%rdx
+	popq	%rax
+	retq
+END(mds_handler_ivb)
+
+ENTRY(mds_handler_bdw)
+	pushq	%rax
+	pushq	%rbx
+	pushq	%rcx
+	pushq	%rdi
+	pushq	%rsi
+
+	movq	%cr0, %rax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movq	PCPU(MDS_BUF), %rbx
+	movdqa	%xmm0, PCPU(MDS_TMP)
+	pxor	%xmm0, %xmm0
+
+	movq	%rbx, %rdi
+	movq	%rbx, %rsi
+	movl	$40, %ecx
+2:	movntdq	%xmm0, (%rbx)
+	addq	$16, %rbx
+	decl	%ecx
+	jnz	2b
+	mfence
+	movl	$1536, %ecx
+	rep; movsb
+	lfence
+
+	movdqa	PCPU(MDS_TMP),%xmm0
+	testb	$CR0_TS, %al
+	je	3f
+	movq	%rax, %cr0
+3:	popq	%rsi
+	popq	%rdi
+	popq	%rcx
+	popq	%rbx
+	popq	%rax
+	retq
+END(mds_handler_bdw)
+
+ENTRY(mds_handler_skl_sse)
+	pushq	%rax
+	pushq	%rdx
+	pushq	%rcx
+	pushq	%rdi
+
+	movq	%cr0, %rax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movq	PCPU(MDS_BUF), %rdi
+	movq	PCPU(MDS_BUF64), %rdx
+	movdqa	%xmm0, PCPU(MDS_TMP)
+	pxor	%xmm0, %xmm0
+
+	lfence
+	orpd	(%rdx), %xmm0
+	orpd	(%rdx), %xmm0
+	xorl	%eax, %eax
+2:	clflushopt	5376(%rdi, %rax, 8)
+	addl	$8, %eax
+	cmpl	$8 * 12, %eax
+	jb	2b
+	sfence
+	movl	$6144, %ecx
+	xorl	%eax, %eax
+	rep; stosb
+	mfence
+
+	movdqa	PCPU(MDS_TMP), %xmm0
+	testb	$CR0_TS, %al
+	je	3f
+	movq	%rax, %cr0
+3:	popq	%rdi
+	popq	%rcx
+	popq	%rdx
+	popq	%rax
+	retq
+END(mds_handler_skl_sse)
+
+ENTRY(mds_handler_skl_avx)
+	pushq	%rax
+	pushq	%rdx
+	pushq	%rcx
+	pushq	%rdi
+
+	movq	%cr0, %rax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movq	PCPU(MDS_BUF), %rdi
+	movq	PCPU(MDS_BUF64), %rdx
+	vmovdqa	%ymm0, PCPU(MDS_TMP)
+	vpxor	%ymm0, %ymm0, %ymm0
+
+	lfence
+	vorpd	(%rdx), %ymm0, %ymm0
+	vorpd	(%rdx), %ymm0, %ymm0
+	xorl	%eax, %eax
+2:	clflushopt	5376(%rdi, %rax, 8)
+	addl	$8, %eax
+	cmpl	$8 * 12, %eax
+	jb	2b
+	sfence
+	movl	$6144, %ecx
+	xorl	%eax, %eax
+	rep; stosb
+	mfence
+
+	vmovdqa	PCPU(MDS_TMP), %ymm0
+	testb	$CR0_TS, %al
+	je	3f
+	movq	%rax, %cr0
+3:	popq	%rdi
+	popq	%rcx
+	popq	%rdx
+	popq	%rax
+	retq
+END(mds_handler_skl_avx)
+
+ENTRY(mds_handler_skl_avx512)
+	pushq	%rax
+	pushq	%rdx
+	pushq	%rcx
+	pushq	%rdi
+
+	movq	%cr0, %rax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movq	PCPU(MDS_BUF), %rdi
+	movq	PCPU(MDS_BUF64), %rdx
+	vmovdqa64	%zmm0, PCPU(MDS_TMP)
+	vpxor	%zmm0, %zmm0, %zmm0
+
+	lfence
+	vorpd	(%rdx), %zmm0, %zmm0
+	vorpd	(%rdx), %zmm0, %zmm0
+	xorl	%eax, %eax
+2:	clflushopt	5376(%rdi, %rax, 8)
+	addl	$8, %eax
+	cmpl	$8 * 12, %eax
+	jb	2b
+	sfence
+	movl	$6144, %ecx
+	xorl	%eax, %eax
+	rep; stosb
+	mfence
+
+	vmovdqa64	PCPU(MDS_TMP), %zmm0
+	testb	$CR0_TS, %al
+	je	3f
+	movq	%rax, %cr0
+3:	popq	%rdi
+	popq	%rcx
+	popq	%rdx
+	popq	%rax
+	retq
+END(mds_handler_skl_avx512)
+
+ENTRY(mds_handler_silvermont)
+	pushq	%rax
+	pushq	%rdx
+	pushq	%rcx
+
+	movq	%cr0, %rax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movq	PCPU(MDS_BUF), %rdx
+	movdqa	%xmm0, PCPU(MDS_TMP)
+	pxor	%xmm0, %xmm0
+
+	movl	$16, %ecx
+2:	movntdq	%xmm0, (%rdx)
+	addq	$16, %rdx
+	decl	%ecx
+	jnz	2b
+	mfence
+
+	movdqa	PCPU(MDS_TMP),%xmm0
+	testb	$CR0_TS, %al
+	je	3f
+	movq	%rax, %cr0
+3:	popq	%rcx
+	popq	%rdx
+	popq	%rax
+	retq
+END(mds_handler_silvermont)
Index: projects/runtime-coverage-v2/sys/amd64/amd64/trap.c
===================================================================
--- projects/runtime-coverage-v2/sys/amd64/amd64/trap.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/amd64/amd64/trap.c	(revision 347599)
@@ -1,1205 +1,1205 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (C) 1994, David Greenman
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the University of Utah, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * AMD64 Trap and System call handling
  */
 
 #include "opt_clock.h"
 #include "opt_compat.h"
 #include "opt_cpu.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_isa.h"
 #include "opt_kdb.h"
 #include "opt_stack.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/ptrace.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DEFINE( , , page_fault, all);
 PMC_SOFT_DEFINE( , , page_fault, read);
 PMC_SOFT_DEFINE( , , page_fault, write);
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <machine/stack.h>
 #include <machine/trap.h>
 #include <machine/tss.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 #endif
 
 extern inthand_t IDTVEC(bpt), IDTVEC(bpt_pti), IDTVEC(dbg),
     IDTVEC(fast_syscall), IDTVEC(fast_syscall_pti), IDTVEC(fast_syscall32),
     IDTVEC(int0x80_syscall_pti), IDTVEC(int0x80_syscall);
 
 void __noinline trap(struct trapframe *frame);
 void trap_check(struct trapframe *frame);
 void dblfault_handler(struct trapframe *frame);
 
 static int trap_pfault(struct trapframe *, int);
 static void trap_fatal(struct trapframe *, vm_offset_t);
 
 #define MAX_TRAP_MSG		32
 static char *trap_msg[] = {
 	"",					/*  0 unused */
 	"privileged instruction fault",		/*  1 T_PRIVINFLT */
 	"",					/*  2 unused */
 	"breakpoint instruction fault",		/*  3 T_BPTFLT */
 	"",					/*  4 unused */
 	"",					/*  5 unused */
 	"arithmetic trap",			/*  6 T_ARITHTRAP */
 	"",					/*  7 unused */
 	"",					/*  8 unused */
 	"general protection fault",		/*  9 T_PROTFLT */
 	"debug exception",			/* 10 T_TRCTRAP */
 	"",					/* 11 unused */
 	"page fault",				/* 12 T_PAGEFLT */
 	"",					/* 13 unused */
 	"alignment fault",			/* 14 T_ALIGNFLT */
 	"",					/* 15 unused */
 	"",					/* 16 unused */
 	"",					/* 17 unused */
 	"integer divide fault",			/* 18 T_DIVIDE */
 	"non-maskable interrupt trap",		/* 19 T_NMI */
 	"overflow trap",			/* 20 T_OFLOW */
 	"FPU bounds check fault",		/* 21 T_BOUND */
 	"FPU device not available",		/* 22 T_DNA */
 	"double fault",				/* 23 T_DOUBLEFLT */
 	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
 	"invalid TSS fault",			/* 25 T_TSSFLT */
 	"segment not present fault",		/* 26 T_SEGNPFLT */
 	"stack fault",				/* 27 T_STKFLT */
 	"machine check trap",			/* 28 T_MCHK */
 	"SIMD floating-point exception",	/* 29 T_XMMFLT */
 	"reserved (unknown) fault",		/* 30 T_RESERVED */
 	"",					/* 31 unused (reserved) */
 	"DTrace pid return trap",		/* 32 T_DTRACE_RET */
 };
 
 static int prot_fault_translation;
 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
     &prot_fault_translation, 0,
     "Select signal to deliver on protection fault");
 static int uprintf_signal;
 SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN,
     &uprintf_signal, 0,
     "Print debugging information on trap signal to ctty");
 
 /*
  * Control L1D flush on return from NMI.
  *
  * Tunable  can be set to the following values:
  * 0 - only enable flush on return from NMI if required by vmm.ko (default)
  * >1 - always flush on return from NMI.
  *
  * Post-boot, the sysctl indicates if flushing is currently enabled.
  */
 int nmi_flush_l1d_sw;
 SYSCTL_INT(_machdep, OID_AUTO, nmi_flush_l1d_sw, CTLFLAG_RWTUN,
     &nmi_flush_l1d_sw, 0,
     "Flush L1 Data Cache on NMI exit, software bhyve L1TF mitigation assist");
 
 /*
  * Exception, fault, and trap interface to the FreeBSD kernel.
  * This common code is called from assembly language IDT gate entry
  * routines that prepare a suitable stack frame, and restore this
  * frame after the exception has been processed.
  */
 
 void
 trap(struct trapframe *frame)
 {
 	ksiginfo_t ksi;
 	struct thread *td;
 	struct proc *p;
 	register_t addr, dr6;
 	int signo, ucode;
 	u_int type;
 
 	td = curthread;
 	p = td->td_proc;
 	signo = 0;
 	ucode = 0;
 	addr = 0;
 	dr6 = 0;
 
 	VM_CNT_INC(v_trap);
 	type = frame->tf_trapno;
 
 #ifdef SMP
 	/* Handler for NMI IPIs used for stopping CPUs. */
 	if (type == T_NMI && ipi_nmi_handler() == 0)
 		return;
 #endif
 
 #ifdef KDB
 	if (kdb_active) {
 		kdb_reenter();
 		return;
 	}
 #endif
 
 	if (type == T_RESERVED) {
 		trap_fatal(frame, 0);
 		return;
 	}
 
 	if (type == T_NMI) {
 #ifdef HWPMC_HOOKS
 		/*
 		 * CPU PMCs interrupt using an NMI.  If the PMC module is
 		 * active, pass the 'rip' value to the PMC module's interrupt
 		 * handler.  A non-zero return value from the handler means that
 		 * the NMI was consumed by it and we can return immediately.
 		 */
 		if (pmc_intr != NULL &&
 		    (*pmc_intr)(frame) != 0)
 			return;
 #endif
 
 #ifdef STACK
 		if (stack_nmi_handler(frame) != 0)
 			return;
 #endif
 	}
 
 	if ((frame->tf_rflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled
 		 * interrupts and then trapped.  Enabling interrupts
 		 * now is wrong, but it is better than running with
 		 * interrupts disabled until they are accidentally
 		 * enabled later.
 		 */
 		if (TRAPF_USERMODE(frame))
 			uprintf(
 			    "pid %ld (%s): trap %d with interrupts disabled\n",
 			    (long)curproc->p_pid, curthread->td_name, type);
 		else if (type != T_NMI && type != T_BPTFLT &&
 		    type != T_TRCTRAP) {
 			/*
 			 * XXX not quite right, since this may be for a
 			 * multiple fault in user mode.
 			 */
 			printf("kernel trap %d with interrupts disabled\n",
 			    type);
 
 			/*
 			 * We shouldn't enable interrupts while holding a
 			 * spin lock.
 			 */
 			if (td->td_md.md_spinlock_count == 0)
 				enable_intr();
 		}
 	}
 
 	if (TRAPF_USERMODE(frame)) {
 		/* user trap */
 
 		td->td_pticks = 0;
 		td->td_frame = frame;
 		addr = frame->tf_rip;
 		if (td->td_cowgen != p->p_cowgen)
 			thread_cow_update(td);
 
 		switch (type) {
 		case T_PRIVINFLT:	/* privileged instruction fault */
 			signo = SIGILL;
 			ucode = ILL_PRVOPC;
 			break;
 
 		case T_BPTFLT:		/* bpt instruction fault */
 			enable_intr();
 #ifdef KDTRACE_HOOKS
 			if (dtrace_pid_probe_ptr != NULL &&
 			    dtrace_pid_probe_ptr(frame) == 0)
 				return;
 #endif
 			signo = SIGTRAP;
 			ucode = TRAP_BRKPT;
 			break;
 
 		case T_TRCTRAP:		/* debug exception */
 			enable_intr();
 			signo = SIGTRAP;
 			ucode = TRAP_TRACE;
 			dr6 = rdr6();
 			if ((dr6 & DBREG_DR6_BS) != 0) {
 				PROC_LOCK(td->td_proc);
 				if ((td->td_dbgflags & TDB_STEP) != 0) {
 					td->td_frame->tf_rflags &= ~PSL_T;
 					td->td_dbgflags &= ~TDB_STEP;
 				}
 				PROC_UNLOCK(td->td_proc);
 			}
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 			ucode = fputrap_x87();
 			if (ucode == -1)
 				return;
 			signo = SIGFPE;
 			break;
 
 		case T_PROTFLT:		/* general protection fault */
 			signo = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 		case T_STKFLT:		/* stack fault */
 		case T_SEGNPFLT:	/* segment not present fault */
 			signo = SIGBUS;
 			ucode = BUS_ADRERR;
 			break;
 		case T_TSSFLT:		/* invalid TSS fault */
 			signo = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 		case T_ALIGNFLT:
 			signo = SIGBUS;
 			ucode = BUS_ADRALN;
 			break;
 		case T_DOUBLEFLT:	/* double fault */
 		default:
 			signo = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 
 		case T_PAGEFLT:		/* page fault */
 			/*
 			 * Emulator can take care about this trap?
 			 */
 			if (*p->p_sysent->sv_trap != NULL &&
 			    (*p->p_sysent->sv_trap)(td) == 0)
 				return;
 
 			addr = frame->tf_addr;
 			signo = trap_pfault(frame, TRUE);
 			if (signo == -1)
 				return;
 			if (signo == 0)
 				goto userret;
 			if (signo == SIGSEGV) {
 				ucode = SEGV_MAPERR;
 			} else if (prot_fault_translation == 0) {
 				/*
 				 * Autodetect.  This check also covers
 				 * the images without the ABI-tag ELF
 				 * note.
 				 */
 				if (SV_CURPROC_ABI() == SV_ABI_FREEBSD &&
 				    p->p_osrel >= P_OSREL_SIGSEGV) {
 					signo = SIGSEGV;
 					ucode = SEGV_ACCERR;
 				} else {
 					signo = SIGBUS;
 					ucode = T_PAGEFLT;
 				}
 			} else if (prot_fault_translation == 1) {
 				/*
 				 * Always compat mode.
 				 */
 				signo = SIGBUS;
 				ucode = T_PAGEFLT;
 			} else {
 				/*
 				 * Always SIGSEGV mode.
 				 */
 				signo = SIGSEGV;
 				ucode = SEGV_ACCERR;
 			}
 			break;
 
 		case T_DIVIDE:		/* integer divide fault */
 			ucode = FPE_INTDIV;
 			signo = SIGFPE;
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 			nmi_handle_intr(type, frame);
 			return;
 #endif
 
 		case T_OFLOW:		/* integer overflow fault */
 			ucode = FPE_INTOVF;
 			signo = SIGFPE;
 			break;
 
 		case T_BOUND:		/* bounds check fault */
 			ucode = FPE_FLTSUB;
 			signo = SIGFPE;
 			break;
 
 		case T_DNA:
 			/* transparent fault (due to context switch "late") */
 			KASSERT(PCB_USER_FPU(td->td_pcb),
 			    ("kernel FPU ctx has leaked"));
 			fpudna();
 			return;
 
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			ucode = ILL_COPROC;
 			signo = SIGILL;
 			break;
 
 		case T_XMMFLT:		/* SIMD floating-point exception */
 			ucode = fputrap_sse();
 			if (ucode == -1)
 				return;
 			signo = SIGFPE;
 			break;
 #ifdef KDTRACE_HOOKS
 		case T_DTRACE_RET:
 			enable_intr();
 			if (dtrace_return_probe_ptr != NULL)
 				dtrace_return_probe_ptr(frame);
 			return;
 #endif
 		}
 	} else {
 		/* kernel trap */
 
 		KASSERT(cold || td->td_ucred != NULL,
 		    ("kernel trap doesn't have ucred"));
 		switch (type) {
 		case T_PAGEFLT:			/* page fault */
 			(void) trap_pfault(frame, FALSE);
 			return;
 
 		case T_DNA:
 			if (PCB_USER_FPU(td->td_pcb))
 				panic("Unregistered use of FPU in kernel");
 			fpudna();
 			return;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 		case T_XMMFLT:		/* SIMD floating-point exception */
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			/*
 			 * For now, supporting kernel handler
 			 * registration for FPU traps is overkill.
 			 */
 			trap_fatal(frame, 0);
 			return;
 
 		case T_STKFLT:		/* stack fault */
 		case T_PROTFLT:		/* general protection fault */
 		case T_SEGNPFLT:	/* segment not present fault */
 			if (td->td_intr_nesting_level != 0)
 				break;
 
 			/*
 			 * Invalid segment selectors and out of bounds
 			 * %rip's and %rsp's can be set up in user mode.
 			 * This causes a fault in kernel mode when the
 			 * kernel tries to return to user mode.  We want
 			 * to get this fault so that we can fix the
 			 * problem here and not have to check all the
 			 * selectors and pointers when the user changes
 			 * them.
 			 *
 			 * In case of PTI, the IRETQ faulted while the
 			 * kernel used the pti stack, and exception
 			 * frame records %rsp value pointing to that
 			 * stack.  If we return normally to
 			 * doreti_iret_fault, the trapframe is
 			 * reconstructed on pti stack, and calltrap()
 			 * called on it as well.  Due to the very
 			 * limited pti stack size, kernel does not
 			 * survive for too long.  Switch to the normal
 			 * thread stack for the trap handling.
 			 *
 			 * Magic '5' is the number of qwords occupied by
 			 * the hardware trap frame.
 			 */
 			if (frame->tf_rip == (long)doreti_iret) {
 				frame->tf_rip = (long)doreti_iret_fault;
 				if ((PCPU_GET(curpmap)->pm_ucr3 !=
 				    PMAP_NO_CR3) &&
 				    (frame->tf_rsp == (uintptr_t)PCPU_GET(
 				    pti_rsp0) - 5 * sizeof(register_t))) {
 					frame->tf_rsp = PCPU_GET(rsp0) - 5 *
 					    sizeof(register_t);
 				}
 				return;
 			}
 			if (frame->tf_rip == (long)ld_ds) {
 				frame->tf_rip = (long)ds_load_fault;
 				return;
 			}
 			if (frame->tf_rip == (long)ld_es) {
 				frame->tf_rip = (long)es_load_fault;
 				return;
 			}
 			if (frame->tf_rip == (long)ld_fs) {
 				frame->tf_rip = (long)fs_load_fault;
 				return;
 			}
 			if (frame->tf_rip == (long)ld_gs) {
 				frame->tf_rip = (long)gs_load_fault;
 				return;
 			}
 			if (frame->tf_rip == (long)ld_gsbase) {
 				frame->tf_rip = (long)gsbase_load_fault;
 				return;
 			}
 			if (frame->tf_rip == (long)ld_fsbase) {
 				frame->tf_rip = (long)fsbase_load_fault;
 				return;
 			}
 			if (curpcb->pcb_onfault != NULL) {
 				frame->tf_rip = (long)curpcb->pcb_onfault;
 				return;
 			}
 			break;
 
 		case T_TSSFLT:
 			/*
 			 * PSL_NT can be set in user mode and isn't cleared
 			 * automatically when the kernel is entered.  This
 			 * causes a TSS fault when the kernel attempts to
 			 * `iret' because the TSS link is uninitialized.  We
 			 * want to get this fault so that we can fix the
 			 * problem here and not every time the kernel is
 			 * entered.
 			 */
 			if (frame->tf_rflags & PSL_NT) {
 				frame->tf_rflags &= ~PSL_NT;
 				return;
 			}
 			break;
 
 		case T_TRCTRAP:	 /* debug exception */
 			/* Clear any pending debug events. */
 			dr6 = rdr6();
 			load_dr6(0);
 
 			/*
 			 * Ignore debug register exceptions due to
 			 * accesses in the user's address space, which
 			 * can happen under several conditions such as
 			 * if a user sets a watchpoint on a buffer and
 			 * then passes that buffer to a system call.
 			 * We still want to get TRCTRAPS for addresses
 			 * in kernel space because that is useful when
 			 * debugging the kernel.
 			 */
 			if (user_dbreg_trap(dr6))
 				return;
 
 			/*
 			 * Malicious user code can configure a debug
 			 * register watchpoint to trap on data access
 			 * to the top of stack and then execute 'pop
 			 * %ss; int 3'.  Due to exception deferral for
 			 * 'pop %ss', the CPU will not interrupt 'int
 			 * 3' to raise the DB# exception for the debug
 			 * register but will postpone the DB# until
 			 * execution of the first instruction of the
 			 * BP# handler (in kernel mode).  Normally the
 			 * previous check would ignore DB# exceptions
 			 * for watchpoints on user addresses raised in
 			 * kernel mode.  However, some CPU errata
 			 * include cases where DB# exceptions do not
 			 * properly set bits in %dr6, e.g. Haswell
 			 * HSD23 and Skylake-X SKZ24.
 			 *
 			 * A deferred DB# can also be raised on the
 			 * first instructions of system call entry
 			 * points or single-step traps via similar use
 			 * of 'pop %ss' or 'mov xxx, %ss'.
 			 */
 			if (pti) {
 				if (frame->tf_rip ==
 				    (uintptr_t)IDTVEC(fast_syscall_pti) ||
 #ifdef COMPAT_FREEBSD32
 				    frame->tf_rip ==
 				    (uintptr_t)IDTVEC(int0x80_syscall_pti) ||
 #endif
 				    frame->tf_rip == (uintptr_t)IDTVEC(bpt_pti))
 					return;
 			} else {
 				if (frame->tf_rip ==
 				    (uintptr_t)IDTVEC(fast_syscall) ||
 #ifdef COMPAT_FREEBSD32
 				    frame->tf_rip ==
 				    (uintptr_t)IDTVEC(int0x80_syscall) ||
 #endif
 				    frame->tf_rip == (uintptr_t)IDTVEC(bpt))
 					return;
 			}
 			if (frame->tf_rip == (uintptr_t)IDTVEC(dbg) ||
 			    /* Needed for AMD. */
 			    frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32))
 				return;
 			/*
 			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
 			 */
 		case T_BPTFLT:
 			/*
 			 * If KDB is enabled, let it handle the debugger trap.
 			 * Otherwise, debugger traps "can't happen".
 			 */
 #ifdef KDB
 			if (kdb_trap(type, dr6, frame))
 				return;
 #endif
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 			nmi_handle_intr(type, frame);
 			return;
 #endif
 		}
 
 		trap_fatal(frame, 0);
 		return;
 	}
 
 	/* Translate fault for emulators (e.g. Linux) */
 	if (*p->p_sysent->sv_transtrap != NULL)
 		signo = (*p->p_sysent->sv_transtrap)(signo, type);
 
 	ksiginfo_init_trap(&ksi);
 	ksi.ksi_signo = signo;
 	ksi.ksi_code = ucode;
 	ksi.ksi_trapno = type;
 	ksi.ksi_addr = (void *)addr;
 	if (uprintf_signal) {
 		uprintf("pid %d comm %s: signal %d err %lx code %d type %d "
 		    "addr 0x%lx rsp 0x%lx rip 0x%lx "
 		    "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
 		    p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type,
 		    addr, frame->tf_rsp, frame->tf_rip,
 		    fubyte((void *)(frame->tf_rip + 0)),
 		    fubyte((void *)(frame->tf_rip + 1)),
 		    fubyte((void *)(frame->tf_rip + 2)),
 		    fubyte((void *)(frame->tf_rip + 3)),
 		    fubyte((void *)(frame->tf_rip + 4)),
 		    fubyte((void *)(frame->tf_rip + 5)),
 		    fubyte((void *)(frame->tf_rip + 6)),
 		    fubyte((void *)(frame->tf_rip + 7)));
 	}
 	KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled"));
 	trapsignal(td, &ksi);
 
 userret:
 	userret(td, frame);
 	KASSERT(PCB_USER_FPU(td->td_pcb),
 	    ("Return from trap with kernel FPU ctx leaked"));
 }
 
 /*
  * Ensure that we ignore any DTrace-induced faults. This function cannot
  * be instrumented, so it cannot generate such faults itself.
  */
 void
 trap_check(struct trapframe *frame)
 {
 
 #ifdef KDTRACE_HOOKS
 	if (dtrace_trap_func != NULL &&
 	    (*dtrace_trap_func)(frame, frame->tf_trapno) != 0)
 		return;
 #endif
 	trap(frame);
 }
 
 static bool
 trap_is_smap(struct trapframe *frame)
 {
 
 	/*
 	 * A page fault on a userspace address is classified as
 	 * SMAP-induced if:
 	 * - SMAP is supported;
 	 * - kernel mode accessed present data page;
 	 * - rflags.AC was cleared.
 	 * Kernel must never access user space with rflags.AC cleared
 	 * if SMAP is enabled.
 	 */
 	return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 &&
 	    (frame->tf_err & (PGEX_P | PGEX_U | PGEX_I | PGEX_RSV)) ==
 	    PGEX_P && (frame->tf_rflags & PSL_AC) == 0);
 }
 
 static bool
 trap_is_pti(struct trapframe *frame)
 {
 
 	return (PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3 &&
 	    pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W |
 	    PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) &&
 	    (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK) ==
 	    (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK));
 }
 
 static int
 trap_pfault(struct trapframe *frame, int usermode)
 {
 	struct thread *td;
 	struct proc *p;
 	vm_map_t map;
 	vm_offset_t va;
 	int rv;
 	vm_prot_t ftype;
 	vm_offset_t eva;
 
 	td = curthread;
 	p = td->td_proc;
 	eva = frame->tf_addr;
 
 	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
 		/*
 		 * Due to both processor errata and lazy TLB invalidation when
 		 * access restrictions are removed from virtual pages, memory
 		 * accesses that are allowed by the physical mapping layer may
 		 * nonetheless cause one spurious page fault per virtual page. 
 		 * When the thread is executing a "no faulting" section that
 		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
 		 * every page fault is treated as a spurious page fault,
 		 * unless it accesses the same virtual address as the most
 		 * recent page fault within the same "no faulting" section.
 		 */
 		if (td->td_md.md_spurflt_addr != eva ||
 		    (td->td_pflags & TDP_RESETSPUR) != 0) {
 			/*
 			 * Do nothing to the TLB.  A stale TLB entry is
 			 * flushed automatically by a page fault.
 			 */
 			td->td_md.md_spurflt_addr = eva;
 			td->td_pflags &= ~TDP_RESETSPUR;
 			return (0);
 		}
 	} else {
 		/*
 		 * If we get a page fault while in a critical section, then
 		 * it is most likely a fatal kernel page fault.  The kernel
 		 * is already going to panic trying to get a sleep lock to
 		 * do the VM lookup, so just consider it a fatal trap so the
 		 * kernel can print out a useful trap message and even get
 		 * to the debugger.
 		 *
 		 * If we get a page fault while holding a non-sleepable
 		 * lock, then it is most likely a fatal kernel page fault.
 		 * If WITNESS is enabled, then it's going to whine about
 		 * bogus LORs with various VM locks, so just skip to the
 		 * fatal trap handling directly.
 		 */
 		if (td->td_critnest != 0 ||
 		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
 		    "Kernel page fault") != 0) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
 	}
 	va = trunc_page(eva);
 	if (va >= VM_MIN_KERNEL_ADDRESS) {
 		/*
 		 * Don't allow user-mode faults in kernel address space.
 		 */
 		if (usermode)
 			return (SIGSEGV);
 
 		map = kernel_map;
 	} else {
 		map = &p->p_vmspace->vm_map;
 
 		/*
 		 * When accessing a usermode address, kernel must be
 		 * ready to accept the page fault, and provide a
 		 * handling routine.  Since accessing the address
 		 * without the handler is a bug, do not try to handle
 		 * it normally, and panic immediately.
 		 *
 		 * If SMAP is enabled, filter SMAP faults also,
 		 * because illegal access might occur to the mapped
 		 * user address, causing infinite loop.
 		 */
 		if (!usermode && (td->td_intr_nesting_level != 0 ||
 		    trap_is_smap(frame) || curpcb->pcb_onfault == NULL)) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
 	}
 
 	/*
 	 * If the trap was caused by errant bits in the PTE then panic.
 	 */
 	if (frame->tf_err & PGEX_RSV) {
 		trap_fatal(frame, eva);
 		return (-1);
 	}
 
 	/*
 	 * User-mode protection key violation (PKU).  May happen
 	 * either from usermode or from kernel if copyin accessed
 	 * key-protected mapping.
 	 */
 	if ((frame->tf_err & PGEX_PK) != 0) {
 		if (eva > VM_MAXUSER_ADDRESS) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
 		rv = KERN_PROTECTION_FAILURE;
 		goto after_vmfault;
 	}
 
 	/*
 	 * If nx protection of the usermode portion of kernel page
 	 * tables caused trap, panic.
 	 */
 	if (usermode && trap_is_pti(frame))
 		panic("PTI: pid %d comm %s tf_err %#lx", p->p_pid,
 		    p->p_comm, frame->tf_err);
 
 	/*
 	 * PGEX_I is defined only if the execute disable bit capability is
 	 * supported and enabled.
 	 */
 	if (frame->tf_err & PGEX_W)
 		ftype = VM_PROT_WRITE;
 	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
 		ftype = VM_PROT_EXECUTE;
 	else
 		ftype = VM_PROT_READ;
 
 	/* Fault in the page. */
 	rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 	if (rv == KERN_SUCCESS) {
 #ifdef HWPMC_HOOKS
 		if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
 			PMC_SOFT_CALL_TF( , , page_fault, all, frame);
 			if (ftype == VM_PROT_READ)
 				PMC_SOFT_CALL_TF( , , page_fault, read,
 				    frame);
 			else
 				PMC_SOFT_CALL_TF( , , page_fault, write,
 				    frame);
 		}
 #endif
 		return (0);
 	}
 after_vmfault:
 	if (!usermode) {
 		if (td->td_intr_nesting_level == 0 &&
 		    curpcb->pcb_onfault != NULL) {
 			frame->tf_rip = (long)curpcb->pcb_onfault;
 			return (0);
 		}
 		trap_fatal(frame, eva);
 		return (-1);
 	}
 	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 }
 
 static void
 trap_fatal(frame, eva)
 	struct trapframe *frame;
 	vm_offset_t eva;
 {
 	int code, ss;
 	u_int type;
 	struct soft_segment_descriptor softseg;
 	char *msg;
 #ifdef KDB
 	bool handled;
 #endif
 
 	code = frame->tf_err;
 	type = frame->tf_trapno;
 	sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)],
 	    &softseg);
 
 	if (type <= MAX_TRAP_MSG)
 		msg = trap_msg[type];
 	else
 		msg = "UNKNOWN";
 	printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
 	    TRAPF_USERMODE(frame) ? "user" : "kernel");
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	if (type == T_PAGEFLT) {
 		printf("fault virtual address	= 0x%lx\n", eva);
 		printf("fault code		= %s %s %s%s%s, %s\n",
 			code & PGEX_U ? "user" : "supervisor",
 			code & PGEX_W ? "write" : "read",
 			code & PGEX_I ? "instruction" : "data",
-			code & PGEX_PK ? " prot key" : " ",
-			code & PGEX_SGX ? " SGX" : " ",
+			code & PGEX_PK ? " prot key" : "",
+			code & PGEX_SGX ? " SGX" : "",
 			code & PGEX_RSV ? "reserved bits in PTE" :
 			code & PGEX_P ? "protection violation" : "page not present");
 	}
 	printf("instruction pointer	= 0x%lx:0x%lx\n",
 	       frame->tf_cs & 0xffff, frame->tf_rip);
 	ss = frame->tf_ss & 0xffff;
 	printf("stack pointer	        = 0x%x:0x%lx\n", ss, frame->tf_rsp);
 	printf("frame pointer	        = 0x%x:0x%lx\n", ss, frame->tf_rbp);
 	printf("code segment		= base 0x%lx, limit 0x%lx, type 0x%x\n",
 	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
 	printf("			= DPL %d, pres %d, long %d, def32 %d, gran %d\n",
 	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32,
 	       softseg.ssd_gran);
 	printf("processor eflags	= ");
 	if (frame->tf_rflags & PSL_T)
 		printf("trace trap, ");
 	if (frame->tf_rflags & PSL_I)
 		printf("interrupt enabled, ");
 	if (frame->tf_rflags & PSL_NT)
 		printf("nested task, ");
 	if (frame->tf_rflags & PSL_RF)
 		printf("resume, ");
 	printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
 	printf("current process		= %d (%s)\n",
 	    curproc->p_pid, curthread->td_name);
 
 #ifdef KDB
 	if (debugger_on_trap) {
 		kdb_why = KDB_WHY_TRAP;
 		handled = kdb_trap(type, 0, frame);
 		kdb_why = KDB_WHY_UNSET;
 		if (handled)
 			return;
 	}
 #endif
 	printf("trap number		= %d\n", type);
 	if (type <= MAX_TRAP_MSG)
 		panic("%s", trap_msg[type]);
 	else
 		panic("unknown/reserved trap");
 }
 
 /*
  * Double fault handler. Called when a fault occurs while writing
  * a frame for a trap/exception onto the stack. This usually occurs
  * when the stack overflows (such is the case with infinite recursion,
  * for example).
  */
 void
 dblfault_handler(struct trapframe *frame)
 {
 #ifdef KDTRACE_HOOKS
 	if (dtrace_doubletrap_func != NULL)
 		(*dtrace_doubletrap_func)();
 #endif
 	printf("\nFatal double fault\n"
 	    "rip %#lx rsp %#lx rbp %#lx\n"
 	    "rax %#lx rdx %#lx rbx %#lx\n"
 	    "rcx %#lx rsi %#lx rdi %#lx\n"
 	    "r8 %#lx r9 %#lx r10 %#lx\n"
 	    "r11 %#lx r12 %#lx r13 %#lx\n"
 	    "r14 %#lx r15 %#lx rflags %#lx\n"
 	    "cs %#lx ss %#lx ds %#hx es %#hx fs %#hx gs %#hx\n"
 	    "fsbase %#lx gsbase %#lx kgsbase %#lx\n",
 	    frame->tf_rip, frame->tf_rsp, frame->tf_rbp,
 	    frame->tf_rax, frame->tf_rdx, frame->tf_rbx,
 	    frame->tf_rcx, frame->tf_rdi, frame->tf_rsi,
 	    frame->tf_r8, frame->tf_r9, frame->tf_r10,
 	    frame->tf_r11, frame->tf_r12, frame->tf_r13,
 	    frame->tf_r14, frame->tf_r15, frame->tf_rflags,
 	    frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es,
 	    frame->tf_fs, frame->tf_gs,
 	    rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE));
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	panic("double fault");
 }
 
 static int __noinline
 cpu_fetch_syscall_args_fallback(struct thread *td, struct syscall_args *sa)
 {
 	struct proc *p;
 	struct trapframe *frame;
 	register_t *argp;
 	caddr_t params;
 	int reg, regcnt, error;
 
 	p = td->td_proc;
 	frame = td->td_frame;
 	reg = 0;
 	regcnt = NARGREGS;
 
 	sa->code = frame->tf_rax;
 
 	if (sa->code == SYS_syscall || sa->code == SYS___syscall) {
 		sa->code = frame->tf_rdi;
 		reg++;
 		regcnt--;
 	}
 
  	if (sa->code >= p->p_sysent->sv_size)
  		sa->callp = &p->p_sysent->sv_table[0];
   	else
  		sa->callp = &p->p_sysent->sv_table[sa->code];
 
 	sa->narg = sa->callp->sy_narg;
 	KASSERT(sa->narg <= nitems(sa->args), ("Too many syscall arguments!"));
 	argp = &frame->tf_rdi;
 	argp += reg;
 	memcpy(sa->args, argp, sizeof(sa->args[0]) * NARGREGS);
 	if (sa->narg > regcnt) {
 		params = (caddr_t)frame->tf_rsp + sizeof(register_t);
 		error = copyin(params, &sa->args[regcnt],
 	    	    (sa->narg - regcnt) * sizeof(sa->args[0]));
 		if (__predict_false(error != 0))
 			return (error);
 	}
 
 	td->td_retval[0] = 0;
 	td->td_retval[1] = frame->tf_rdx;
 
 	return (0);
 }
 
 int
 cpu_fetch_syscall_args(struct thread *td)
 {
 	struct proc *p;
 	struct trapframe *frame;
 	struct syscall_args *sa;
 
 	p = td->td_proc;
 	frame = td->td_frame;
 	sa = &td->td_sa;
 
 	sa->code = frame->tf_rax;
 
 	if (__predict_false(sa->code == SYS_syscall ||
 	    sa->code == SYS___syscall ||
 	    sa->code >= p->p_sysent->sv_size))
 		return (cpu_fetch_syscall_args_fallback(td, sa));
 
 	sa->callp = &p->p_sysent->sv_table[sa->code];
 	sa->narg = sa->callp->sy_narg;
 	KASSERT(sa->narg <= nitems(sa->args), ("Too many syscall arguments!"));
 
 	if (__predict_false(sa->narg > NARGREGS))
 		return (cpu_fetch_syscall_args_fallback(td, sa));
 
 	memcpy(sa->args, &frame->tf_rdi, sizeof(sa->args[0]) * NARGREGS);
 
 	td->td_retval[0] = 0;
 	td->td_retval[1] = frame->tf_rdx;
 
 	return (0);
 }
 
 #include "../../kern/subr_syscall.c"
 
 static void (*syscall_ret_l1d_flush)(void);
 int syscall_ret_l1d_flush_mode;
 
 static void
 flush_l1d_hw(void)
 {
 
 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
 }
 
 static void __inline
 amd64_syscall_ret_flush_l1d_inline(int error)
 {
 	void (*p)(void);
 
 	if (error != 0 && error != EEXIST && error != EAGAIN &&
 	    error != EXDEV && error != ENOENT && error != ENOTCONN &&
 	    error != EINPROGRESS) {
 		p = syscall_ret_l1d_flush;
 		if (p != NULL)
 			p();
 	}
 }
 
 void
 amd64_syscall_ret_flush_l1d(int error)
 {
 
 	amd64_syscall_ret_flush_l1d_inline(error);
 }
 
 void
 amd64_syscall_ret_flush_l1d_recalc(void)
 {
 	bool l1d_hw;
 
 	l1d_hw = (cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) != 0;
 again:
 	switch (syscall_ret_l1d_flush_mode) {
 	case 0:
 		syscall_ret_l1d_flush = NULL;
 		break;
 	case 1:
 		syscall_ret_l1d_flush = l1d_hw ? flush_l1d_hw :
 		    flush_l1d_sw_abi;
 		break;
 	case 2:
 		syscall_ret_l1d_flush = l1d_hw ? flush_l1d_hw : NULL;
 		break;
 	case 3:
 		syscall_ret_l1d_flush = flush_l1d_sw_abi;
 		break;
 	default:
 		syscall_ret_l1d_flush_mode = 1;
 		goto again;
 	}
 }
 
 static int
 machdep_syscall_ret_flush_l1d(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = syscall_ret_l1d_flush_mode;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	syscall_ret_l1d_flush_mode = val;
 	amd64_syscall_ret_flush_l1d_recalc();
 	return (0);
 }
 SYSCTL_PROC(_machdep, OID_AUTO, syscall_ret_flush_l1d, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     machdep_syscall_ret_flush_l1d, "I",
     "Flush L1D on syscall return with error (0 - off, 1 - on, "
     "2 - use hw only, 3 - use sw only");
 
 
 /*
  * System call handler for native binaries.  The trap frame is already
  * set up by the assembler trampoline and a pointer to it is saved in
  * td_frame.
  */
 void
 amd64_syscall(struct thread *td, int traced)
 {
 	int error;
 	ksiginfo_t ksi;
 
 #ifdef DIAGNOSTIC
 	if (!TRAPF_USERMODE(td->td_frame)) {
 		panic("syscall");
 		/* NOT REACHED */
 	}
 #endif
 	error = syscallenter(td);
 
 	/*
 	 * Traced syscall.
 	 */
 	if (__predict_false(traced)) {
 		td->td_frame->tf_rflags &= ~PSL_T;
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGTRAP;
 		ksi.ksi_code = TRAP_TRACE;
 		ksi.ksi_addr = (void *)td->td_frame->tf_rip;
 		trapsignal(td, &ksi);
 	}
 
 	KASSERT(PCB_USER_FPU(td->td_pcb),
 	    ("System call %s returning with kernel FPU ctx leaked",
 	     syscallname(td->td_proc, td->td_sa.code)));
 	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
 	    ("System call %s returning with mangled pcb_save",
 	     syscallname(td->td_proc, td->td_sa.code)));
 	KASSERT(td->td_md.md_invl_gen.gen == 0,
 	    ("System call %s returning with leaked invl_gen %lu",
 	    syscallname(td->td_proc, td->td_sa.code),
 	    td->td_md.md_invl_gen.gen));
 
 	syscallret(td, error);
 
 	/*
 	 * If the user-supplied value of %rip is not a canonical
 	 * address, then some CPUs will trigger a ring 0 #GP during
 	 * the sysret instruction.  However, the fault handler would
 	 * execute in ring 0 with the user's %gs and %rsp which would
 	 * not be safe.  Instead, use the full return path which
 	 * catches the problem safely.
 	 */
 	if (__predict_false(td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS))
 		set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 
 	amd64_syscall_ret_flush_l1d_inline(error);
 }
Index: projects/runtime-coverage-v2/sys/amd64/include/pcpu.h
===================================================================
--- projects/runtime-coverage-v2/sys/amd64/include/pcpu.h	(revision 347598)
+++ projects/runtime-coverage-v2/sys/amd64/include/pcpu.h	(revision 347599)
@@ -1,271 +1,275 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) Peter Wemm <peter@netplex.com.au>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _MACHINE_PCPU_H_
 #define	_MACHINE_PCPU_H_
 
 #ifndef _SYS_CDEFS_H_
 #error "sys/cdefs.h is a prerequisite for this file"
 #endif
 
 #define	PC_PTI_STACK_SZ	16
 
 struct monitorbuf {
 	int idle_state;		/* Used by cpu_idle_mwait. */
 	int stop_state;		/* Used by cpustop_handler. */
 	char padding[128 - (2 * sizeof(int))];
 };
 _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line");
 
 /*
  * The SMP parts are setup in pmap.c and locore.s for the BSP, and
  * mp_machdep.c sets up the data for the AP's to "see" when they awake.
  * The reason for doing it via a struct is so that an array of pointers
  * to each CPU's data can be set up for things like "check curproc on all
  * other processors"
  */
 #define	PCPU_MD_FIELDS							\
 	struct monitorbuf pc_monitorbuf __aligned(128);	/* cache line */\
 	struct	pcpu *pc_prvspace;	/* Self-reference */		\
 	struct	pmap *pc_curpmap;					\
 	struct	amd64tss *pc_tssp;	/* TSS segment active on CPU */	\
 	struct	amd64tss *pc_commontssp;/* Common TSS for the CPU */	\
 	uint64_t pc_kcr3;						\
 	uint64_t pc_ucr3;						\
 	uint64_t pc_saved_ucr3;						\
 	register_t pc_rsp0;						\
 	register_t pc_scratch_rsp;	/* User %rsp in syscall */	\
 	register_t pc_scratch_rax;					\
 	u_int	pc_apic_id;						\
 	u_int   pc_acpi_id;		/* ACPI CPU id */		\
 	/* Pointer to the CPU %fs descriptor */				\
 	struct user_segment_descriptor	*pc_fs32p;			\
 	/* Pointer to the CPU %gs descriptor */				\
 	struct user_segment_descriptor	*pc_gs32p;			\
 	/* Pointer to the CPU LDT descriptor */				\
 	struct system_segment_descriptor *pc_ldt;			\
 	/* Pointer to the CPU TSS descriptor */				\
 	struct system_segment_descriptor *pc_tss;			\
 	uint64_t	pc_pm_save_cnt;					\
 	u_int	pc_cmci_mask;		/* MCx banks for CMCI */	\
 	uint64_t pc_dbreg[16];		/* ddb debugging regs */	\
 	uint64_t pc_pti_stack[PC_PTI_STACK_SZ];				\
 	register_t pc_pti_rsp0;						\
 	int pc_dbreg_cmd;		/* ddb debugging reg cmd */	\
 	u_int	pc_vcpu_id;		/* Xen vCPU ID */		\
 	uint32_t pc_pcid_next;						\
 	uint32_t pc_pcid_gen;						\
 	uint32_t pc_smp_tlb_done;	/* TLB op acknowledgement */	\
 	uint32_t pc_ibpb_set;						\
+	void	*pc_mds_buf;						\
+	void	*pc_mds_buf64;						\
+	uint32_t pc_pad[2];						\
+	uint8_t	pc_mds_tmp[64];						\
 	u_int 	pc_ipi_bitmap;						\
-	char	__pad[3284]		/* pad to UMA_PCPU_ALLOC_SIZE */
+	char	__pad[3172]		/* pad to UMA_PCPU_ALLOC_SIZE */
 
 #define	PC_DBREG_CMD_NONE	0
 #define	PC_DBREG_CMD_LOAD	1
 
 #ifdef _KERNEL
 
 #define MONITOR_STOPSTATE_RUNNING	0
 #define MONITOR_STOPSTATE_STOPPED	1
 
 #if defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF)
 
 /*
  * Evaluates to the byte offset of the per-cpu variable name.
  */
 #define	__pcpu_offset(name)						\
 	__offsetof(struct pcpu, name)
 
 /*
  * Evaluates to the type of the per-cpu variable name.
  */
 #define	__pcpu_type(name)						\
 	__typeof(((struct pcpu *)0)->name)
 
 /*
  * Evaluates to the address of the per-cpu variable name.
  */
 #define	__PCPU_PTR(name) __extension__ ({				\
 	__pcpu_type(name) *__p;						\
 									\
 	__asm __volatile("movq %%gs:%1,%0; addq %2,%0"			\
 	    : "=r" (__p)						\
 	    : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))),	\
 	      "i" (__pcpu_offset(name)));				\
 									\
 	__p;								\
 })
 
 /*
  * Evaluates to the value of the per-cpu variable name.
  */
 #define	__PCPU_GET(name) __extension__ ({				\
 	__pcpu_type(name) __res;					\
 	struct __s {							\
 		u_char	__b[MIN(sizeof(__pcpu_type(name)), 8)];		\
 	} __s;								\
 									\
 	if (sizeof(__res) == 1 || sizeof(__res) == 2 ||			\
 	    sizeof(__res) == 4 || sizeof(__res) == 8) {			\
 		__asm __volatile("mov %%gs:%1,%0"			\
 		    : "=r" (__s)					\
 		    : "m" (*(struct __s *)(__pcpu_offset(name))));	\
 		*(struct __s *)(void *)&__res = __s;			\
 	} else {							\
 		__res = *__PCPU_PTR(name);				\
 	}								\
 	__res;								\
 })
 
 /*
  * Adds the value to the per-cpu counter name.  The implementation
  * must be atomic with respect to interrupts.
  */
 #define	__PCPU_ADD(name, val) do {					\
 	__pcpu_type(name) __val;					\
 	struct __s {							\
 		u_char	__b[MIN(sizeof(__pcpu_type(name)), 8)];		\
 	} __s;								\
 									\
 	__val = (val);							\
 	if (sizeof(__val) == 1 || sizeof(__val) == 2 ||			\
 	    sizeof(__val) == 4 || sizeof(__val) == 8) {			\
 		__s = *(struct __s *)(void *)&__val;			\
 		__asm __volatile("add %1,%%gs:%0"			\
 		    : "=m" (*(struct __s *)(__pcpu_offset(name)))	\
 		    : "r" (__s));					\
 	} else								\
 		*__PCPU_PTR(name) += __val;				\
 } while (0)
 
 /*
  * Increments the value of the per-cpu counter name.  The implementation
  * must be atomic with respect to interrupts.
  */
 #define	__PCPU_INC(name) do {						\
 	CTASSERT(sizeof(__pcpu_type(name)) == 1 ||			\
 	    sizeof(__pcpu_type(name)) == 2 ||				\
 	    sizeof(__pcpu_type(name)) == 4 ||				\
 	    sizeof(__pcpu_type(name)) == 8);				\
 	if (sizeof(__pcpu_type(name)) == 1) {				\
 		__asm __volatile("incb %%gs:%0"				\
 		    : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
 		    : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
 	} else if (sizeof(__pcpu_type(name)) == 2) {			\
 		__asm __volatile("incw %%gs:%0"				\
 		    : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
 		    : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
 	} else if (sizeof(__pcpu_type(name)) == 4) {			\
 		__asm __volatile("incl %%gs:%0"				\
 		    : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
 		    : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
 	} else if (sizeof(__pcpu_type(name)) == 8) {			\
 		__asm __volatile("incq %%gs:%0"				\
 		    : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
 		    : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
 	}								\
 } while (0)
 
 /*
  * Sets the value of the per-cpu variable name to value val.
  */
 #define	__PCPU_SET(name, val) {						\
 	__pcpu_type(name) __val;					\
 	struct __s {							\
 		u_char	__b[MIN(sizeof(__pcpu_type(name)), 8)];		\
 	} __s;								\
 									\
 	__val = (val);							\
 	if (sizeof(__val) == 1 || sizeof(__val) == 2 ||			\
 	    sizeof(__val) == 4 || sizeof(__val) == 8) {			\
 		__s = *(struct __s *)(void *)&__val;			\
 		__asm __volatile("mov %1,%%gs:%0"			\
 		    : "=m" (*(struct __s *)(__pcpu_offset(name)))	\
 		    : "r" (__s));					\
 	} else {							\
 		*__PCPU_PTR(name) = __val;				\
 	}								\
 }
 
 #define	get_pcpu() __extension__ ({					\
 	struct pcpu *__pc;						\
 									\
 	__asm __volatile("movq %%gs:%1,%0"				\
 	    : "=r" (__pc)						\
 	    : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))));	\
 	__pc;								\
 })
 
 #define	PCPU_GET(member)	__PCPU_GET(pc_ ## member)
 #define	PCPU_ADD(member, val)	__PCPU_ADD(pc_ ## member, val)
 #define	PCPU_INC(member)	__PCPU_INC(pc_ ## member)
 #define	PCPU_PTR(member)	__PCPU_PTR(pc_ ## member)
 #define	PCPU_SET(member, val)	__PCPU_SET(pc_ ## member, val)
 
 #define	OFFSETOF_CURTHREAD	0
 #ifdef __clang__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wnull-dereference"
 #endif
 static __inline __pure2 struct thread *
 __curthread(void)
 {
 	struct thread *td;
 
 	__asm("movq %%gs:%P1,%0" : "=r" (td) : "n" (OFFSETOF_CURTHREAD));
 	return (td);
 }
 #ifdef __clang__
 #pragma clang diagnostic pop
 #endif
 #define	curthread		(__curthread())
 
 #define	OFFSETOF_CURPCB		32
 static __inline __pure2 struct pcb *
 __curpcb(void)
 {
 	struct pcb *pcb;
 
 	__asm("movq %%gs:%P1,%0" : "=r" (pcb) : "n" (OFFSETOF_CURPCB));
 	return (pcb);
 }
 #define	curpcb		(__curpcb())
 
 #define	IS_BSP()	(PCPU_GET(cpuid) == 0)
 
 #else /* !__GNUCLIKE_ASM || !__GNUCLIKE___TYPEOF */
 
 #error "this file needs to be ported to your compiler"
 
 #endif /* __GNUCLIKE_ASM && __GNUCLIKE___TYPEOF */
 
 #endif /* _KERNEL */
 
 #endif /* !_MACHINE_PCPU_H_ */
Index: projects/runtime-coverage-v2/sys/amd64/linux/linux_ptrace.c
===================================================================
--- projects/runtime-coverage-v2/sys/amd64/linux/linux_ptrace.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/amd64/linux/linux_ptrace.c	(revision 347599)
@@ -1,541 +1,556 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2017 Edward Tomasz Napierala <trasz@FreeBSD.org>
  * All rights reserved.
  *
  * This software was developed by SRI International and the University of
  * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
  * ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/syscallsubr.h>
 
 #include <machine/pcb.h>
 #include <machine/reg.h>
 
 #include <amd64/linux/linux.h>
 #include <amd64/linux/linux_proto.h>
 #include <compat/linux/linux_signal.h>
 
 #define	LINUX_PTRACE_TRACEME		0
 #define	LINUX_PTRACE_PEEKTEXT		1
 #define	LINUX_PTRACE_PEEKDATA		2
 #define	LINUX_PTRACE_PEEKUSER		3
 #define	LINUX_PTRACE_POKETEXT		4
 #define	LINUX_PTRACE_POKEDATA		5
 #define	LINUX_PTRACE_POKEUSER		6
 #define	LINUX_PTRACE_CONT		7
 #define	LINUX_PTRACE_KILL		8
 #define	LINUX_PTRACE_SINGLESTEP		9
 #define	LINUX_PTRACE_GETREGS		12
 #define	LINUX_PTRACE_SETREGS		13
 #define	LINUX_PTRACE_GETFPREGS		14
 #define	LINUX_PTRACE_SETFPREGS		15
 #define	LINUX_PTRACE_ATTACH		16
 #define	LINUX_PTRACE_DETACH		17
 #define	LINUX_PTRACE_SYSCALL		24
 #define	LINUX_PTRACE_SETOPTIONS		0x4200
 #define	LINUX_PTRACE_GETREGSET		0x4204
 #define	LINUX_PTRACE_SEIZE		0x4206
 
 #define	LINUX_PTRACE_O_TRACESYSGOOD	1
 #define	LINUX_PTRACE_O_TRACEFORK	2
 #define	LINUX_PTRACE_O_TRACEVFORK	4
 #define	LINUX_PTRACE_O_TRACECLONE	8
 #define	LINUX_PTRACE_O_TRACEEXEC	16
 #define	LINUX_PTRACE_O_TRACEVFORKDONE	32
 #define	LINUX_PTRACE_O_TRACEEXIT	64
 #define	LINUX_PTRACE_O_TRACESECCOMP	128
 #define	LINUX_PTRACE_O_EXITKILL		1048576
 #define	LINUX_PTRACE_O_SUSPEND_SECCOMP	2097152
 
 #define	LINUX_NT_PRSTATUS		1
 
 #define	LINUX_PTRACE_O_MASK	(LINUX_PTRACE_O_TRACESYSGOOD |	\
     LINUX_PTRACE_O_TRACEFORK | LINUX_PTRACE_O_TRACEVFORK |	\
     LINUX_PTRACE_O_TRACECLONE | LINUX_PTRACE_O_TRACEEXEC |	\
     LINUX_PTRACE_O_TRACEVFORKDONE | LINUX_PTRACE_O_TRACEEXIT |	\
     LINUX_PTRACE_O_TRACESECCOMP | LINUX_PTRACE_O_EXITKILL |	\
     LINUX_PTRACE_O_SUSPEND_SECCOMP)
 
 static int
 map_signum(int lsig, int *bsigp)
 {
 	int bsig;
 
 	if (lsig == 0) {
 		*bsigp = 0;
 		return (0);
 	}
 
 	if (lsig < 0 || lsig > LINUX_SIGRTMAX)
 		return (EINVAL);
 
 	bsig = linux_to_bsd_signal(lsig);
 	if (bsig == SIGSTOP)
 		bsig = 0;
 
 	*bsigp = bsig;
 	return (0);
 }
 
 struct linux_pt_reg {
 	l_ulong	r15;
 	l_ulong	r14;
 	l_ulong	r13;
 	l_ulong	r12;
 	l_ulong	rbp;
 	l_ulong	rbx;
 	l_ulong	r11;
 	l_ulong	r10;
 	l_ulong	r9;
 	l_ulong	r8;
 	l_ulong	rax;
 	l_ulong	rcx;
 	l_ulong	rdx;
 	l_ulong	rsi;
 	l_ulong	rdi;
 	l_ulong	orig_rax;
 	l_ulong	rip;
 	l_ulong	cs;
 	l_ulong	eflags;
 	l_ulong	rsp;
 	l_ulong	ss;
 };
 
 struct linux_pt_regset {
 	l_ulong	r15;
 	l_ulong	r14;
 	l_ulong	r13;
 	l_ulong	r12;
 	l_ulong	rbp;
 	l_ulong	rbx;
 	l_ulong	r11;
 	l_ulong	r10;
 	l_ulong	r9;
 	l_ulong	r8;
 	l_ulong	rax;
 	l_ulong	rcx;
 	l_ulong	rdx;
 	l_ulong	rsi;
 	l_ulong	rdi;
 	l_ulong	orig_rax;
 	l_ulong	rip;
 	l_ulong	cs;
 	l_ulong	eflags;
 	l_ulong	rsp;
 	l_ulong	ss;
 	l_ulong fs_base;
 	l_ulong gs_base;
 	l_ulong ds;
 	l_ulong es;
 	l_ulong fs;
 	l_ulong gs;
 };
 
 /*
  * Translate amd64 ptrace registers between Linux and FreeBSD formats.
  * The translation is pretty straighforward, for all registers but
  * orig_rax on Linux side and r_trapno and r_err in FreeBSD.
  */
 static void
 map_regs_to_linux(struct reg *b_reg, struct linux_pt_reg *l_reg)
 {
 
 	l_reg->r15 = b_reg->r_r15;
 	l_reg->r14 = b_reg->r_r14;
 	l_reg->r13 = b_reg->r_r13;
 	l_reg->r12 = b_reg->r_r12;
 	l_reg->rbp = b_reg->r_rbp;
 	l_reg->rbx = b_reg->r_rbx;
 	l_reg->r11 = b_reg->r_r11;
 	l_reg->r10 = b_reg->r_r10;
 	l_reg->r9 = b_reg->r_r9;
 	l_reg->r8 = b_reg->r_r8;
 	l_reg->rax = b_reg->r_rax;
 	l_reg->rcx = b_reg->r_rcx;
 	l_reg->rdx = b_reg->r_rdx;
 	l_reg->rsi = b_reg->r_rsi;
 	l_reg->rdi = b_reg->r_rdi;
 	l_reg->orig_rax = b_reg->r_rax;
 	l_reg->rip = b_reg->r_rip;
 	l_reg->cs = b_reg->r_cs;
 	l_reg->eflags = b_reg->r_rflags;
 	l_reg->rsp = b_reg->r_rsp;
 	l_reg->ss = b_reg->r_ss;
 }
 
 static void
 map_regs_to_linux_regset(struct reg *b_reg, unsigned long fs_base,
     unsigned long gs_base, struct linux_pt_regset *l_regset)
 {
 
 	l_regset->r15 = b_reg->r_r15;
 	l_regset->r14 = b_reg->r_r14;
 	l_regset->r13 = b_reg->r_r13;
 	l_regset->r12 = b_reg->r_r12;
 	l_regset->rbp = b_reg->r_rbp;
 	l_regset->rbx = b_reg->r_rbx;
 	l_regset->r11 = b_reg->r_r11;
 	l_regset->r10 = b_reg->r_r10;
 	l_regset->r9 = b_reg->r_r9;
 	l_regset->r8 = b_reg->r_r8;
 	l_regset->rax = b_reg->r_rax;
 	l_regset->rcx = b_reg->r_rcx;
 	l_regset->rdx = b_reg->r_rdx;
 	l_regset->rsi = b_reg->r_rsi;
 	l_regset->rdi = b_reg->r_rdi;
 	l_regset->orig_rax = b_reg->r_rax;
 	l_regset->rip = b_reg->r_rip;
 	l_regset->cs = b_reg->r_cs;
 	l_regset->eflags = b_reg->r_rflags;
 	l_regset->rsp = b_reg->r_rsp;
 	l_regset->ss = b_reg->r_ss;
 	l_regset->fs_base = fs_base;
 	l_regset->gs_base = gs_base;
 	l_regset->ds = b_reg->r_ds;
 	l_regset->es = b_reg->r_es;
 	l_regset->fs = b_reg->r_fs;
 	l_regset->gs = b_reg->r_gs;
 }
 
 static void
 map_regs_from_linux(struct reg *b_reg, struct linux_pt_reg *l_reg)
 {
 	b_reg->r_r15 = l_reg->r15;
 	b_reg->r_r14 = l_reg->r14;
 	b_reg->r_r13 = l_reg->r13;
 	b_reg->r_r12 = l_reg->r12;
 	b_reg->r_r11 = l_reg->r11;
 	b_reg->r_r10 = l_reg->r10;
 	b_reg->r_r9 = l_reg->r9;
 	b_reg->r_r8 = l_reg->r8;
 	b_reg->r_rdi = l_reg->rdi;
 	b_reg->r_rsi = l_reg->rsi;
 	b_reg->r_rbp = l_reg->rbp;
 	b_reg->r_rbx = l_reg->rbx;
 	b_reg->r_rdx = l_reg->rdx;
 	b_reg->r_rcx = l_reg->rcx;
 	b_reg->r_rax = l_reg->rax;
 
 	/*
 	 * XXX: Are zeroes the right thing to put here?
 	 */
 	b_reg->r_trapno = 0;
 	b_reg->r_fs = 0;
 	b_reg->r_gs = 0;
 	b_reg->r_err = 0;
 	b_reg->r_es = 0;
 	b_reg->r_ds = 0;
 
 	b_reg->r_rip = l_reg->rip;
 	b_reg->r_cs = l_reg->cs;
 	b_reg->r_rflags = l_reg->eflags;
 	b_reg->r_rsp = l_reg->rsp;
 	b_reg->r_ss = l_reg->ss;
 }
 
 static int
 linux_ptrace_peek(struct thread *td, pid_t pid, void *addr, void *data)
 {
 	int error;
 
 	error = kern_ptrace(td, PT_READ_I, pid, addr, 0);
 	if (error == 0)
 		error = copyout(td->td_retval, data, sizeof(l_int));
 	td->td_retval[0] = error;
 
 	return (error);
 }
 
 static int
 linux_ptrace_setoptions(struct thread *td, pid_t pid, l_ulong data)
 {
 	int mask;
 
 	mask = 0;
 
 	if (data & ~LINUX_PTRACE_O_MASK) {
 		printf("%s: unknown ptrace option %lx set; "
 		    "returning EINVAL\n",
 		    __func__, data & ~LINUX_PTRACE_O_MASK);
 		return (EINVAL);
 	}
 
 	/*
 	 * PTRACE_O_EXITKILL is ignored, we do that by default.
 	 */
 
 	if (data & LINUX_PTRACE_O_TRACESYSGOOD) {
 		printf("%s: PTRACE_O_TRACESYSGOOD not implemented; "
 		    "returning EINVAL\n", __func__);
 		return (EINVAL);
 	}
 
 	if (data & LINUX_PTRACE_O_TRACEFORK)
 		mask |= PTRACE_FORK;
 
 	if (data & LINUX_PTRACE_O_TRACEVFORK)
 		mask |= PTRACE_VFORK;
 
 	if (data & LINUX_PTRACE_O_TRACECLONE)
 		mask |= PTRACE_VFORK;
 
 	if (data & LINUX_PTRACE_O_TRACEEXEC)
 		mask |= PTRACE_EXEC;
 
 	if (data & LINUX_PTRACE_O_TRACEVFORKDONE)
 		mask |= PTRACE_VFORK; /* XXX: Close enough? */
 
 	if (data & LINUX_PTRACE_O_TRACEEXIT) {
 		printf("%s: PTRACE_O_TRACEEXIT not implemented; "
 		    "returning EINVAL\n", __func__);
 		return (EINVAL);
 	}
 
 	return (kern_ptrace(td, PT_SET_EVENT_MASK, pid, &mask, sizeof(mask)));
 }
 
 static int
 linux_ptrace_getregs(struct thread *td, pid_t pid, void *data)
 {
 	struct ptrace_lwpinfo lwpinfo;
 	struct reg b_reg;
 	struct linux_pt_reg l_reg;
 	int error;
 
 	error = kern_ptrace(td, PT_GETREGS, pid, &b_reg, 0);
 	if (error != 0)
 		return (error);
 
 	map_regs_to_linux(&b_reg, &l_reg);
 
-	/*
-	 * The strace(1) utility depends on RAX being set to -ENOSYS
-	 * on syscall entry.
-	 */
 	error = kern_ptrace(td, PT_LWPINFO, pid, &lwpinfo, sizeof(lwpinfo));
 	if (error != 0) {
 		printf("%s: PT_LWPINFO failed with error %d\n", __func__, error);
 		return (error);
 	}
-	if (lwpinfo.pl_flags & PL_FLAG_SCE)
-		l_reg.rax = -38; // XXX: Don't hardcode?
+	if (lwpinfo.pl_flags & PL_FLAG_SCE) {
+		/*
+		 * The strace(1) utility depends on RAX being set to -ENOSYS
+		 * on syscall entry; otherwise it loops printing those:
+		 *
+		 * [ Process PID=928 runs in 64 bit mode. ]
+		 * [ Process PID=928 runs in x32 mode. ]
+		 */
+		l_reg.rax = -38; /* -ENOSYS */
 
+		/*
+		 * Undo the mangling done in exception.S:fast_syscall_common().
+		 */
+		l_reg.r10 = l_reg.rcx;
+	}
+
 	error = copyout(&l_reg, (void *)data, sizeof(l_reg));
 	return (error);
 }
 
 static int
 linux_ptrace_setregs(struct thread *td, pid_t pid, void *data)
 {
 	struct reg b_reg;
 	struct linux_pt_reg l_reg;
 	int error;
 
 	error = copyin(data, &l_reg, sizeof(l_reg));
 	if (error != 0)
 		return (error);
 	map_regs_from_linux(&b_reg, &l_reg);
 	error = kern_ptrace(td, PT_SETREGS, pid, &b_reg, 0);
 	return (error);
 }
 
 static int
 linux_ptrace_getregset_prstatus(struct thread *td, pid_t pid, l_ulong data)
 {
 	struct ptrace_lwpinfo lwpinfo;
 	struct reg b_reg;
 	struct linux_pt_regset l_regset;
 	struct iovec iov;
 	struct pcb *pcb;
 	unsigned long fsbase, gsbase;
 	size_t len;
 	int error;
 
 	error = copyin((const void *)data, &iov, sizeof(iov));
 	if (error != 0) {
 		printf("%s: copyin error %d\n", __func__, error);
 		return (error);
 	}
 
 	error = kern_ptrace(td, PT_GETREGS, pid, &b_reg, 0);
 	if (error != 0)
 		return (error);
 
 	pcb = td->td_pcb;
 	if (td == curthread)
 		update_pcb_bases(pcb);
 	fsbase = pcb->pcb_fsbase;
 	gsbase = pcb->pcb_gsbase;
 
 	map_regs_to_linux_regset(&b_reg, fsbase, gsbase, &l_regset);
 
-	/*
-	 * The strace(1) utility depends on RAX being set to -ENOSYS
-	 * on syscall entry; otherwise it loops printing those:
-	 *
-	 * [ Process PID=928 runs in 64 bit mode. ]
-	 * [ Process PID=928 runs in x32 mode. ]
-	 */
 	error = kern_ptrace(td, PT_LWPINFO, pid, &lwpinfo, sizeof(lwpinfo));
 	if (error != 0) {
 		printf("%s: PT_LWPINFO failed with error %d\n",
 		    __func__, error);
 		return (error);
 	}
-	if (lwpinfo.pl_flags & PL_FLAG_SCE)
-		l_regset.rax = -38; // XXX: Don't hardcode?
+	if (lwpinfo.pl_flags & PL_FLAG_SCE) {
+		/*
+		 * The strace(1) utility depends on RAX being set to -ENOSYS
+		 * on syscall entry; otherwise it loops printing those:
+		 *
+		 * [ Process PID=928 runs in 64 bit mode. ]
+		 * [ Process PID=928 runs in x32 mode. ]
+		 */
+		l_regset.rax = -38; /* -ENOSYS */
+
+		/*
+		 * Undo the mangling done in exception.S:fast_syscall_common().
+		 */
+		l_regset.r10 = l_regset.rcx;
+	}
 
 	len = MIN(iov.iov_len, sizeof(l_regset));
 	error = copyout(&l_regset, (void *)iov.iov_base, len);
 	if (error != 0) {
 		printf("%s: copyout error %d\n", __func__, error);
 		return (error);
 	}
 
 	iov.iov_len -= len;
 	error = copyout(&iov, (void *)data, sizeof(iov));
 	if (error != 0) {
 		printf("%s: iov copyout error %d\n", __func__, error);
 		return (error);
 	}
 
 	return (error);
 }
 
 static int
 linux_ptrace_getregset(struct thread *td, pid_t pid, l_ulong addr, l_ulong data)
 {
 
 	switch (addr) {
 	case LINUX_NT_PRSTATUS:
 		return (linux_ptrace_getregset_prstatus(td, pid, data));
 	default:
 		printf("%s: PTRACE_GETREGSET request %ld not implemented; "
 		    "returning EINVAL\n", __func__, addr);
 		return (EINVAL);
 	}
 }
 
 static int
 linux_ptrace_seize(struct thread *td, pid_t pid, l_ulong addr, l_ulong data)
 {
 
 	printf("%s: PTRACE_SEIZE not implemented; returning EINVAL\n", __func__);
 	return (EINVAL);
 }
 
 int
 linux_ptrace(struct thread *td, struct linux_ptrace_args *uap)
 {
 	void *addr;
 	pid_t pid;
 	int error, sig;
 
 	pid  = (pid_t)uap->pid;
 	addr = (void *)uap->addr;
 
 	switch (uap->req) {
 	case LINUX_PTRACE_TRACEME:
 		error = kern_ptrace(td, PT_TRACE_ME, 0, 0, 0);
 		break;
 	case LINUX_PTRACE_PEEKTEXT:
 	case LINUX_PTRACE_PEEKDATA:
 		error = linux_ptrace_peek(td, pid, addr, (void *)uap->data);
 		if (error != 0)
 			return (error);
 		/*
 		 * Linux expects this syscall to read 64 bits, not 32.
 		 */
 		error = linux_ptrace_peek(td, pid,
 		    (void *)(uap->addr + 4), (void *)(uap->data + 4));
 		break;
 	case LINUX_PTRACE_POKETEXT:
 		error = kern_ptrace(td, PT_WRITE_I, pid, addr, uap->data);
 		break;
 	case LINUX_PTRACE_POKEDATA:
 		error = kern_ptrace(td, PT_WRITE_D, pid, addr, uap->data);
 		break;
 	case LINUX_PTRACE_CONT:
 		error = map_signum(uap->data, &sig);
 		if (error != 0)
 			break;
 		error = kern_ptrace(td, PT_CONTINUE, pid, (void *)1, sig);
 		break;
 	case LINUX_PTRACE_KILL:
 		error = kern_ptrace(td, PT_KILL, pid, addr, uap->data);
 		break;
 	case LINUX_PTRACE_SINGLESTEP:
 		error = map_signum(uap->data, &sig);
 		if (error != 0)
 			break;
 		error = kern_ptrace(td, PT_STEP, pid, (void *)1, sig);
 		break;
 	case LINUX_PTRACE_GETREGS:
 		error = linux_ptrace_getregs(td, pid, (void *)uap->data);
 		break;
 	case LINUX_PTRACE_SETREGS:
 		error = linux_ptrace_setregs(td, pid, (void *)uap->data);
 		break;
 	case LINUX_PTRACE_ATTACH:
 		error = kern_ptrace(td, PT_ATTACH, pid, addr, uap->data);
 		break;
 	case LINUX_PTRACE_DETACH:
 		error = map_signum(uap->data, &sig);
 		if (error != 0)
 			break;
 		error = kern_ptrace(td, PT_DETACH, pid, (void *)1, sig);
 		break;
 	case LINUX_PTRACE_SYSCALL:
 		error = map_signum(uap->data, &sig);
 		if (error != 0)
 			break;
 		error = kern_ptrace(td, PT_SYSCALL, pid, (void *)1, sig);
 		break;
 	case LINUX_PTRACE_SETOPTIONS:
 		error = linux_ptrace_setoptions(td, pid, uap->data);
 		break;
 	case LINUX_PTRACE_GETREGSET:
 		error = linux_ptrace_getregset(td, pid, uap->addr, uap->data);
 		break;
 	case LINUX_PTRACE_SEIZE:
 		error = linux_ptrace_seize(td, pid, uap->addr, uap->data);
 		break;
 	default:
 		printf("%s: ptrace(%ld, ...) not implemented; returning EINVAL\n",
 		    __func__, uap->req);
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
Index: projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/timer.h
===================================================================
--- projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/timer.h	(revision 347598)
+++ projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/timer.h	(revision 347599)
@@ -1,94 +1,94 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef _LINUX_TIMER_H_
 #define	_LINUX_TIMER_H_
 
 #include <linux/types.h>
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/callout.h>
 
 struct timer_list {
 	struct callout callout;
 	union {
 		void (*function) (unsigned long);	/* < v4.15 */
 		void (*function_415) (struct timer_list *);
 	};
 	unsigned long data;
 	int expires;
 };
 
 extern unsigned long linux_timer_hz_mask;
 
 #define	TIMER_IRQSAFE	0x0001
 
 #define	from_timer(var, arg, field)					\
         container_of(arg, typeof(*(var)), field)
 
 #define	timer_setup(timer, func, flags) do {				\
 	CTASSERT(((flags) & ~TIMER_IRQSAFE) == 0);			\
 	(timer)->function_415 = (func);					\
 	(timer)->data = (unsigned long)(timer);				\
 	callout_init(&(timer)->callout, 1);				\
 } while (0)
 
 #define	setup_timer(timer, func, dat) do {				\
 	(timer)->function = (func);					\
 	(timer)->data = (dat);						\
 	callout_init(&(timer)->callout, 1);			\
 } while (0)
 
 #define	__setup_timer(timer, func, dat, flags) do {			\
 	CTASSERT(((flags) & ~TIMER_IRQSAFE) == 0);			\
 	setup_timer(timer, func, dat);					\
 } while (0)
 
 #define	init_timer(timer) do {						\
 	(timer)->function = NULL;					\
 	(timer)->data = 0;						\
 	callout_init(&(timer)->callout, 1);			\
 } while (0)
 
 extern void mod_timer(struct timer_list *, int);
 extern void add_timer(struct timer_list *);
 extern void add_timer_on(struct timer_list *, int cpu);
+extern int del_timer(struct timer_list *);
 
-#define	del_timer(timer)	(void)callout_stop(&(timer)->callout)
 #define	del_timer_sync(timer)	(void)callout_drain(&(timer)->callout)
 #define	timer_pending(timer)	callout_pending(&(timer)->callout)
 #define	round_jiffies(j)	\
 	((int)(((j) + linux_timer_hz_mask) & ~linux_timer_hz_mask))
 #define	round_jiffies_relative(j) round_jiffies(j)
 #define	round_jiffies_up(j)	round_jiffies(j)
 #define	round_jiffies_up_relative(j) round_jiffies_up(j)
 
 #endif					/* _LINUX_TIMER_H_ */
Index: projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/ww_mutex.h
===================================================================
--- projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/ww_mutex.h	(revision 347598)
+++ projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/ww_mutex.h	(revision 347599)
@@ -1,142 +1,143 @@
 /*-
  * Copyright (c) 2017 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef	_LINUX_WW_MUTEX_H_
 #define	_LINUX_WW_MUTEX_H_
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/condvar.h>
 #include <sys/kernel.h>
 
 #include <linux/mutex.h>
 
 struct ww_class {
 	const char *mutex_name;
 };
 
 struct ww_acquire_ctx {
 };
 
 struct ww_mutex {
 	struct mutex base;
 	struct cv condvar;
+	struct ww_acquire_ctx *ctx;
 };
 
 #define	DEFINE_WW_CLASS(name)					\
 	struct ww_class name = {				\
 		.mutex_name = mutex_name(#name "_mutex")	\
 	}
 
 #define	DEFINE_WW_MUTEX(name, ww_class)					\
 	struct ww_mutex name;						\
 	static void name##_init(void *arg)				\
 	{								\
 		ww_mutex_init(&name, &ww_class);			\
 	}								\
 	SYSINIT(name, SI_SUB_LOCK, SI_ORDER_SECOND, name##_init, NULL)
 
 #define	ww_mutex_is_locked(_m) \
 	sx_xlocked(&(_m)->base.sx)
 
 #define	ww_mutex_lock_slow(_m, _x) \
 	ww_mutex_lock(_m, _x)
 
 #define	ww_mutex_lock_slow_interruptible(_m, _x) \
 	ww_mutex_lock_interruptible(_m, _x)
 
 static inline int __must_check
 ww_mutex_trylock(struct ww_mutex *lock)
 {
 	return (mutex_trylock(&lock->base));
 }
 
 extern int linux_ww_mutex_lock_sub(struct ww_mutex *, int catch_signal);
 
 static inline int
 ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	if (MUTEX_SKIP())
 		return (0);
 	else if ((struct thread *)SX_OWNER(lock->base.sx.sx_lock) == curthread)
 		return (-EALREADY);
 	else
 		return (linux_ww_mutex_lock_sub(lock, 0));
 }
 
 static inline int
 ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	if (MUTEX_SKIP())
 		return (0);
 	else if ((struct thread *)SX_OWNER(lock->base.sx.sx_lock) == curthread)
 		return (-EALREADY);
 	else
 		return (linux_ww_mutex_lock_sub(lock, 1));
 }
 
 extern void linux_ww_mutex_unlock_sub(struct ww_mutex *);
 
 static inline void
 ww_mutex_unlock(struct ww_mutex *lock)
 {
 	if (MUTEX_SKIP())
 		return;
 	else
 		linux_ww_mutex_unlock_sub(lock);
 }
 
 static inline void
 ww_mutex_destroy(struct ww_mutex *lock)
 {
 	cv_destroy(&lock->condvar);
 	mutex_destroy(&lock->base);
 }
 
 static inline void
 ww_acquire_init(struct ww_acquire_ctx *ctx, struct ww_class *ww_class)
 {
 }
 
 static inline void
 ww_mutex_init(struct ww_mutex *lock, struct ww_class *ww_class)
 {
 	linux_mutex_init(&lock->base, ww_class->mutex_name, SX_NOWITNESS);
 	cv_init(&lock->condvar, "lkpi-ww");
 }
 
 static inline void
 ww_acquire_fini(struct ww_acquire_ctx *ctx)
 {
 }
 
 static inline void
 ww_acquire_done(struct ww_acquire_ctx *ctx)
 {
 }
 
 #endif					/* _LINUX_WW_MUTEX_H_ */
Index: projects/runtime-coverage-v2/sys/compat/linuxkpi/common/src/linux_compat.c
===================================================================
--- projects/runtime-coverage-v2/sys/compat/linuxkpi/common/src/linux_compat.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/compat/linuxkpi/common/src/linux_compat.c	(revision 347599)
@@ -1,2450 +1,2459 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013-2018 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_stack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/sglist.h>
 #include <sys/sleepqueue.h>
 #include <sys/refcount.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bus.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/rwlock.h>
 #include <sys/mman.h>
 #include <sys/stack.h>
 #include <sys/user.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
 #include <machine/stdarg.h>
 
 #if defined(__i386__) || defined(__amd64__)
 #include <machine/md_var.h>
 #endif
 
 #include <linux/kobject.h>
 #include <linux/device.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/cdev.h>
 #include <linux/file.h>
 #include <linux/sysfs.h>
 #include <linux/mm.h>
 #include <linux/io.h>
 #include <linux/vmalloc.h>
 #include <linux/netdevice.h>
 #include <linux/timer.h>
 #include <linux/interrupt.h>
 #include <linux/uaccess.h>
 #include <linux/list.h>
 #include <linux/kthread.h>
 #include <linux/kernel.h>
 #include <linux/compat.h>
 #include <linux/poll.h>
 #include <linux/smp.h>
 
 #if defined(__i386__) || defined(__amd64__)
 #include <asm/smp.h>
 #endif
 
 SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW, 0, "LinuxKPI parameters");
 
 int linuxkpi_debug;
 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN,
     &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable.");
 
 MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat");
 
 #include <linux/rbtree.h>
 /* Undo Linux compat changes. */
 #undef RB_ROOT
 #undef file
 #undef cdev
 #define	RB_ROOT(head)	(head)->rbh_root
 
 static void linux_cdev_deref(struct linux_cdev *ldev);
 static struct vm_area_struct *linux_cdev_handle_find(void *handle);
 
 struct kobject linux_class_root;
 struct device linux_root_device;
 struct class linux_class_misc;
 struct list_head pci_drivers;
 struct list_head pci_devices;
 spinlock_t pci_lock;
 
 unsigned long linux_timer_hz_mask;
 
 int
 panic_cmp(struct rb_node *one, struct rb_node *two)
 {
 	panic("no cmp");
 }
 
 RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);
 
 int
 kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args)
 {
 	va_list tmp_va;
 	int len;
 	char *old;
 	char *name;
 	char dummy;
 
 	old = kobj->name;
 
 	if (old && fmt == NULL)
 		return (0);
 
 	/* compute length of string */
 	va_copy(tmp_va, args);
 	len = vsnprintf(&dummy, 0, fmt, tmp_va);
 	va_end(tmp_va);
 
 	/* account for zero termination */
 	len++;
 
 	/* check for error */
 	if (len < 1)
 		return (-EINVAL);
 
 	/* allocate memory for string */
 	name = kzalloc(len, GFP_KERNEL);
 	if (name == NULL)
 		return (-ENOMEM);
 	vsnprintf(name, len, fmt, args);
 	kobj->name = name;
 
 	/* free old string */
 	kfree(old);
 
 	/* filter new string */
 	for (; *name != '\0'; name++)
 		if (*name == '/')
 			*name = '!';
 	return (0);
 }
 
 int
 kobject_set_name(struct kobject *kobj, const char *fmt, ...)
 {
 	va_list args;
 	int error;
 
 	va_start(args, fmt);
 	error = kobject_set_name_vargs(kobj, fmt, args);
 	va_end(args);
 
 	return (error);
 }
 
 static int
 kobject_add_complete(struct kobject *kobj, struct kobject *parent)
 {
 	const struct kobj_type *t;
 	int error;
 
 	kobj->parent = parent;
 	error = sysfs_create_dir(kobj);
 	if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) {
 		struct attribute **attr;
 		t = kobj->ktype;
 
 		for (attr = t->default_attrs; *attr != NULL; attr++) {
 			error = sysfs_create_file(kobj, *attr);
 			if (error)
 				break;
 		}
 		if (error)
 			sysfs_remove_dir(kobj);
 
 	}
 	return (error);
 }
 
 int
 kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...)
 {
 	va_list args;
 	int error;
 
 	va_start(args, fmt);
 	error = kobject_set_name_vargs(kobj, fmt, args);
 	va_end(args);
 	if (error)
 		return (error);
 
 	return kobject_add_complete(kobj, parent);
 }
 
 void
 linux_kobject_release(struct kref *kref)
 {
 	struct kobject *kobj;
 	char *name;
 
 	kobj = container_of(kref, struct kobject, kref);
 	sysfs_remove_dir(kobj);
 	name = kobj->name;
 	if (kobj->ktype && kobj->ktype->release)
 		kobj->ktype->release(kobj);
 	kfree(name);
 }
 
 static void
 linux_kobject_kfree(struct kobject *kobj)
 {
 	kfree(kobj);
 }
 
 static void
 linux_kobject_kfree_name(struct kobject *kobj)
 {
 	if (kobj) {
 		kfree(kobj->name);
 	}
 }
 
 const struct kobj_type linux_kfree_type = {
 	.release = linux_kobject_kfree
 };
 
 static void
 linux_device_release(struct device *dev)
 {
 	pr_debug("linux_device_release: %s\n", dev_name(dev));
 	kfree(dev);
 }
 
 static ssize_t
 linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
 	struct class_attribute *dattr;
 	ssize_t error;
 
 	dattr = container_of(attr, struct class_attribute, attr);
 	error = -EIO;
 	if (dattr->show)
 		error = dattr->show(container_of(kobj, struct class, kobj),
 		    dattr, buf);
 	return (error);
 }
 
 static ssize_t
 linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
     size_t count)
 {
 	struct class_attribute *dattr;
 	ssize_t error;
 
 	dattr = container_of(attr, struct class_attribute, attr);
 	error = -EIO;
 	if (dattr->store)
 		error = dattr->store(container_of(kobj, struct class, kobj),
 		    dattr, buf, count);
 	return (error);
 }
 
 static void
 linux_class_release(struct kobject *kobj)
 {
 	struct class *class;
 
 	class = container_of(kobj, struct class, kobj);
 	if (class->class_release)
 		class->class_release(class);
 }
 
 static const struct sysfs_ops linux_class_sysfs = {
 	.show  = linux_class_show,
 	.store = linux_class_store,
 };
 
 const struct kobj_type linux_class_ktype = {
 	.release = linux_class_release,
 	.sysfs_ops = &linux_class_sysfs
 };
 
 static void
 linux_dev_release(struct kobject *kobj)
 {
 	struct device *dev;
 
 	dev = container_of(kobj, struct device, kobj);
 	/* This is the precedence defined by linux. */
 	if (dev->release)
 		dev->release(dev);
 	else if (dev->class && dev->class->dev_release)
 		dev->class->dev_release(dev);
 }
 
 static ssize_t
 linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
 	struct device_attribute *dattr;
 	ssize_t error;
 
 	dattr = container_of(attr, struct device_attribute, attr);
 	error = -EIO;
 	if (dattr->show)
 		error = dattr->show(container_of(kobj, struct device, kobj),
 		    dattr, buf);
 	return (error);
 }
 
 static ssize_t
 linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
     size_t count)
 {
 	struct device_attribute *dattr;
 	ssize_t error;
 
 	dattr = container_of(attr, struct device_attribute, attr);
 	error = -EIO;
 	if (dattr->store)
 		error = dattr->store(container_of(kobj, struct device, kobj),
 		    dattr, buf, count);
 	return (error);
 }
 
 static const struct sysfs_ops linux_dev_sysfs = {
 	.show  = linux_dev_show,
 	.store = linux_dev_store,
 };
 
 const struct kobj_type linux_dev_ktype = {
 	.release = linux_dev_release,
 	.sysfs_ops = &linux_dev_sysfs
 };
 
 struct device *
 device_create(struct class *class, struct device *parent, dev_t devt,
     void *drvdata, const char *fmt, ...)
 {
 	struct device *dev;
 	va_list args;
 
 	dev = kzalloc(sizeof(*dev), M_WAITOK);
 	dev->parent = parent;
 	dev->class = class;
 	dev->devt = devt;
 	dev->driver_data = drvdata;
 	dev->release = linux_device_release;
 	va_start(args, fmt);
 	kobject_set_name_vargs(&dev->kobj, fmt, args);
 	va_end(args);
 	device_register(dev);
 
 	return (dev);
 }
 
 int
 kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype,
     struct kobject *parent, const char *fmt, ...)
 {
 	va_list args;
 	int error;
 
 	kobject_init(kobj, ktype);
 	kobj->ktype = ktype;
 	kobj->parent = parent;
 	kobj->name = NULL;
 
 	va_start(args, fmt);
 	error = kobject_set_name_vargs(kobj, fmt, args);
 	va_end(args);
 	if (error)
 		return (error);
 	return kobject_add_complete(kobj, parent);
 }
 
 static void
 linux_kq_lock(void *arg)
 {
 	spinlock_t *s = arg;
 
 	spin_lock(s);
 }
 static void
 linux_kq_unlock(void *arg)
 {
 	spinlock_t *s = arg;
 
 	spin_unlock(s);
 }
 
 static void
 linux_kq_lock_owned(void *arg)
 {
 #ifdef INVARIANTS
 	spinlock_t *s = arg;
 
 	mtx_assert(&s->m, MA_OWNED);
 #endif
 }
 
 static void
 linux_kq_lock_unowned(void *arg)
 {
 #ifdef INVARIANTS
 	spinlock_t *s = arg;
 
 	mtx_assert(&s->m, MA_NOTOWNED);
 #endif
 }
 
 static void
 linux_file_kqfilter_poll(struct linux_file *, int);
 
 struct linux_file *
 linux_file_alloc(void)
 {
 	struct linux_file *filp;
 
 	filp = kzalloc(sizeof(*filp), GFP_KERNEL);
 
 	/* set initial refcount */
 	filp->f_count = 1;
 
 	/* setup fields needed by kqueue support */
 	spin_lock_init(&filp->f_kqlock);
 	knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock,
 	    linux_kq_lock, linux_kq_unlock,
 	    linux_kq_lock_owned, linux_kq_lock_unowned);
 
 	return (filp);
 }
 
 void
 linux_file_free(struct linux_file *filp)
 {
 	if (filp->_file == NULL) {
 		if (filp->f_shmem != NULL)
 			vm_object_deallocate(filp->f_shmem);
 		kfree(filp);
 	} else {
 		/*
 		 * The close method of the character device or file
 		 * will free the linux_file structure:
 		 */
 		_fdrop(filp->_file, curthread);
 	}
 }
 
 static int
 linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,
     vm_page_t *mres)
 {
 	struct vm_area_struct *vmap;
 
 	vmap = linux_cdev_handle_find(vm_obj->handle);
 
 	MPASS(vmap != NULL);
 	MPASS(vmap->vm_private_data == vm_obj->handle);
 
 	if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) {
 		vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset;
 		vm_page_t page;
 
 		if (((*mres)->flags & PG_FICTITIOUS) != 0) {
 			/*
 			 * If the passed in result page is a fake
 			 * page, update it with the new physical
 			 * address.
 			 */
 			page = *mres;
 			vm_page_updatefake(page, paddr, vm_obj->memattr);
 		} else {
 			/*
 			 * Replace the passed in "mres" page with our
 			 * own fake page and free up the all of the
 			 * original pages.
 			 */
 			VM_OBJECT_WUNLOCK(vm_obj);
 			page = vm_page_getfake(paddr, vm_obj->memattr);
 			VM_OBJECT_WLOCK(vm_obj);
 
 			vm_page_replace_checked(page, vm_obj,
 			    (*mres)->pindex, *mres);
 
 			vm_page_lock(*mres);
 			vm_page_free(*mres);
 			vm_page_unlock(*mres);
 			*mres = page;
 		}
 		page->valid = VM_PAGE_BITS_ALL;
 		return (VM_PAGER_OK);
 	}
 	return (VM_PAGER_FAIL);
 }
 
 static int
 linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,
     vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
 {
 	struct vm_area_struct *vmap;
 	int err;
 
 	linux_set_current(curthread);
 
 	/* get VM area structure */
 	vmap = linux_cdev_handle_find(vm_obj->handle);
 	MPASS(vmap != NULL);
 	MPASS(vmap->vm_private_data == vm_obj->handle);
 
 	VM_OBJECT_WUNLOCK(vm_obj);
 
 	down_write(&vmap->vm_mm->mmap_sem);
 	if (unlikely(vmap->vm_ops == NULL)) {
 		err = VM_FAULT_SIGBUS;
 	} else {
 		struct vm_fault vmf;
 
 		/* fill out VM fault structure */
 		vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx);
 		vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
 		vmf.pgoff = 0;
 		vmf.page = NULL;
 		vmf.vma = vmap;
 
 		vmap->vm_pfn_count = 0;
 		vmap->vm_pfn_pcount = &vmap->vm_pfn_count;
 		vmap->vm_obj = vm_obj;
 
 		err = vmap->vm_ops->fault(vmap, &vmf);
 
 		while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) {
 			kern_yield(PRI_USER);
 			err = vmap->vm_ops->fault(vmap, &vmf);
 		}
 	}
 
 	/* translate return code */
 	switch (err) {
 	case VM_FAULT_OOM:
 		err = VM_PAGER_AGAIN;
 		break;
 	case VM_FAULT_SIGBUS:
 		err = VM_PAGER_BAD;
 		break;
 	case VM_FAULT_NOPAGE:
 		/*
 		 * By contract the fault handler will return having
 		 * busied all the pages itself. If pidx is already
 		 * found in the object, it will simply xbusy the first
 		 * page and return with vm_pfn_count set to 1.
 		 */
 		*first = vmap->vm_pfn_first;
 		*last = *first + vmap->vm_pfn_count - 1;
 		err = VM_PAGER_OK;
 		break;
 	default:
 		err = VM_PAGER_ERROR;
 		break;
 	}
 	up_write(&vmap->vm_mm->mmap_sem);
 	VM_OBJECT_WLOCK(vm_obj);
 	return (err);
 }
 
 static struct rwlock linux_vma_lock;
 static TAILQ_HEAD(, vm_area_struct) linux_vma_head =
     TAILQ_HEAD_INITIALIZER(linux_vma_head);
 
 static void
 linux_cdev_handle_free(struct vm_area_struct *vmap)
 {
 	/* Drop reference on vm_file */
 	if (vmap->vm_file != NULL)
 		fput(vmap->vm_file);
 
 	/* Drop reference on mm_struct */
 	mmput(vmap->vm_mm);
 
 	kfree(vmap);
 }
 
 static void
 linux_cdev_handle_remove(struct vm_area_struct *vmap)
 {
 	rw_wlock(&linux_vma_lock);
 	TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry);
 	rw_wunlock(&linux_vma_lock);
 }
 
 static struct vm_area_struct *
 linux_cdev_handle_find(void *handle)
 {
 	struct vm_area_struct *vmap;
 
 	rw_rlock(&linux_vma_lock);
 	TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) {
 		if (vmap->vm_private_data == handle)
 			break;
 	}
 	rw_runlock(&linux_vma_lock);
 	return (vmap);
 }
 
 static int
 linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
 		      vm_ooffset_t foff, struct ucred *cred, u_short *color)
 {
 
 	MPASS(linux_cdev_handle_find(handle) != NULL);
 	*color = 0;
 	return (0);
 }
 
 static void
 linux_cdev_pager_dtor(void *handle)
 {
 	const struct vm_operations_struct *vm_ops;
 	struct vm_area_struct *vmap;
 
 	vmap = linux_cdev_handle_find(handle);
 	MPASS(vmap != NULL);
 
 	/*
 	 * Remove handle before calling close operation to prevent
 	 * other threads from reusing the handle pointer.
 	 */
 	linux_cdev_handle_remove(vmap);
 
 	down_write(&vmap->vm_mm->mmap_sem);
 	vm_ops = vmap->vm_ops;
 	if (likely(vm_ops != NULL))
 		vm_ops->close(vmap);
 	up_write(&vmap->vm_mm->mmap_sem);
 
 	linux_cdev_handle_free(vmap);
 }
 
 static struct cdev_pager_ops linux_cdev_pager_ops[2] = {
   {
 	/* OBJT_MGTDEVICE */
 	.cdev_pg_populate	= linux_cdev_pager_populate,
 	.cdev_pg_ctor	= linux_cdev_pager_ctor,
 	.cdev_pg_dtor	= linux_cdev_pager_dtor
   },
   {
 	/* OBJT_DEVICE */
 	.cdev_pg_fault	= linux_cdev_pager_fault,
 	.cdev_pg_ctor	= linux_cdev_pager_ctor,
 	.cdev_pg_dtor	= linux_cdev_pager_dtor
   },
 };
 
 int
 zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
     unsigned long size)
 {
 	vm_object_t obj;
 	vm_page_t m;
 
 	obj = vma->vm_obj;
 	if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0)
 		return (-ENOTSUP);
 	VM_OBJECT_RLOCK(obj);
 	for (m = vm_page_find_least(obj, OFF_TO_IDX(address));
 	    m != NULL && m->pindex < OFF_TO_IDX(address + size);
 	    m = TAILQ_NEXT(m, listq))
 		pmap_remove_all(m);
 	VM_OBJECT_RUNLOCK(obj);
 	return (0);
 }
 
 static struct file_operations dummy_ldev_ops = {
 	/* XXXKIB */
 };
 
 static struct linux_cdev dummy_ldev = {
 	.ops = &dummy_ldev_ops,
 };
 
 #define	LDEV_SI_DTR	0x0001
 #define	LDEV_SI_REF	0x0002
 
 static void
 linux_get_fop(struct linux_file *filp, const struct file_operations **fop,
     struct linux_cdev **dev)
 {
 	struct linux_cdev *ldev;
 	u_int siref;
 
 	ldev = filp->f_cdev;
 	*fop = filp->f_op;
 	if (ldev != NULL) {
 		for (siref = ldev->siref;;) {
 			if ((siref & LDEV_SI_DTR) != 0) {
 				ldev = &dummy_ldev;
 				siref = ldev->siref;
 				*fop = ldev->ops;
 				MPASS((ldev->siref & LDEV_SI_DTR) == 0);
 			} else if (atomic_fcmpset_int(&ldev->siref, &siref,
 			    siref + LDEV_SI_REF)) {
 				break;
 			}
 		}
 	}
 	*dev = ldev;
 }
 
 static void
 linux_drop_fop(struct linux_cdev *ldev)
 {
 
 	if (ldev == NULL)
 		return;
 	MPASS((ldev->siref & ~LDEV_SI_DTR) != 0);
 	atomic_subtract_int(&ldev->siref, LDEV_SI_REF);
 }
 
 #define	OPW(fp,td,code) ({			\
 	struct file *__fpop;			\
 	__typeof(code) __retval;		\
 						\
 	__fpop = (td)->td_fpop;			\
 	(td)->td_fpop = (fp);			\
 	__retval = (code);			\
 	(td)->td_fpop = __fpop;			\
 	__retval;				\
 })
 
 static int
 linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td,
     struct file *file)
 {
 	struct linux_cdev *ldev;
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	int error;
 
 	ldev = dev->si_drv1;
 
 	filp = linux_file_alloc();
 	filp->f_dentry = &filp->f_dentry_store;
 	filp->f_op = ldev->ops;
 	filp->f_mode = file->f_flag;
 	filp->f_flags = file->f_flag;
 	filp->f_vnode = file->f_vnode;
 	filp->_file = file;
 	refcount_acquire(&ldev->refs);
 	filp->f_cdev = ldev;
 
 	linux_set_current(td);
 	linux_get_fop(filp, &fop, &ldev);
 
 	if (fop->open != NULL) {
 		error = -fop->open(file->f_vnode, filp);
 		if (error != 0) {
 			linux_drop_fop(ldev);
 			linux_cdev_deref(filp->f_cdev);
 			kfree(filp);
 			return (error);
 		}
 	}
 
 	/* hold on to the vnode - used for fstat() */
 	vhold(filp->f_vnode);
 
 	/* release the file from devfs */
 	finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
 	linux_drop_fop(ldev);
 	return (ENXIO);
 }
 
 #define	LINUX_IOCTL_MIN_PTR 0x10000UL
 #define	LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX)
 
 static inline int
 linux_remap_address(void **uaddr, size_t len)
 {
 	uintptr_t uaddr_val = (uintptr_t)(*uaddr);
 
 	if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR &&
 	    uaddr_val < LINUX_IOCTL_MAX_PTR)) {
 		struct task_struct *pts = current;
 		if (pts == NULL) {
 			*uaddr = NULL;
 			return (1);
 		}
 
 		/* compute data offset */
 		uaddr_val -= LINUX_IOCTL_MIN_PTR;
 
 		/* check that length is within bounds */
 		if ((len > IOCPARM_MAX) ||
 		    (uaddr_val + len) > pts->bsd_ioctl_len) {
 			*uaddr = NULL;
 			return (1);
 		}
 
 		/* re-add kernel buffer address */
 		uaddr_val += (uintptr_t)pts->bsd_ioctl_data;
 
 		/* update address location */
 		*uaddr = (void *)uaddr_val;
 		return (1);
 	}
 	return (0);
 }
 
 int
 linux_copyin(const void *uaddr, void *kaddr, size_t len)
 {
 	if (linux_remap_address(__DECONST(void **, &uaddr), len)) {
 		if (uaddr == NULL)
 			return (-EFAULT);
 		memcpy(kaddr, uaddr, len);
 		return (0);
 	}
 	return (-copyin(uaddr, kaddr, len));
 }
 
 int
 linux_copyout(const void *kaddr, void *uaddr, size_t len)
 {
 	if (linux_remap_address(&uaddr, len)) {
 		if (uaddr == NULL)
 			return (-EFAULT);
 		memcpy(uaddr, kaddr, len);
 		return (0);
 	}
 	return (-copyout(kaddr, uaddr, len));
 }
 
 size_t
 linux_clear_user(void *_uaddr, size_t _len)
 {
 	uint8_t *uaddr = _uaddr;
 	size_t len = _len;
 
 	/* make sure uaddr is aligned before going into the fast loop */
 	while (((uintptr_t)uaddr & 7) != 0 && len > 7) {
 		if (subyte(uaddr, 0))
 			return (_len);
 		uaddr++;
 		len--;
 	}
 
 	/* zero 8 bytes at a time */
 	while (len > 7) {
 #ifdef __LP64__
 		if (suword64(uaddr, 0))
 			return (_len);
 #else
 		if (suword32(uaddr, 0))
 			return (_len);
 		if (suword32(uaddr + 4, 0))
 			return (_len);
 #endif
 		uaddr += 8;
 		len -= 8;
 	}
 
 	/* zero fill end, if any */
 	while (len > 0) {
 		if (subyte(uaddr, 0))
 			return (_len);
 		uaddr++;
 		len--;
 	}
 	return (0);
 }
 
 int
 linux_access_ok(int rw, const void *uaddr, size_t len)
 {
 	uintptr_t saddr;
 	uintptr_t eaddr;
 
 	/* get start and end address */
 	saddr = (uintptr_t)uaddr;
 	eaddr = (uintptr_t)uaddr + len;
 
 	/* verify addresses are valid for userspace */
 	return ((saddr == eaddr) ||
 	    (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS));
 }
 
 /*
  * This function should return either EINTR or ERESTART depending on
  * the signal type sent to this thread:
  */
 static int
 linux_get_error(struct task_struct *task, int error)
 {
 	/* check for signal type interrupt code */
 	if (error == EINTR || error == ERESTARTSYS || error == ERESTART) {
 		error = -linux_schedule_get_interrupt_value(task);
 		if (error == 0)
 			error = EINTR;
 	}
 	return (error);
 }
 
 static int
 linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,
     const struct file_operations *fop, u_long cmd, caddr_t data,
     struct thread *td)
 {
 	struct task_struct *task = current;
 	unsigned size;
 	int error;
 
 	size = IOCPARM_LEN(cmd);
 	/* refer to logic in sys_ioctl() */
 	if (size > 0) {
 		/*
 		 * Setup hint for linux_copyin() and linux_copyout().
 		 *
 		 * Background: Linux code expects a user-space address
 		 * while FreeBSD supplies a kernel-space address.
 		 */
 		task->bsd_ioctl_data = data;
 		task->bsd_ioctl_len = size;
 		data = (void *)LINUX_IOCTL_MIN_PTR;
 	} else {
 		/* fetch user-space pointer */
 		data = *(void **)data;
 	}
 #if defined(__amd64__)
 	if (td->td_proc->p_elf_machine == EM_386) {
 		/* try the compat IOCTL handler first */
 		if (fop->compat_ioctl != NULL) {
 			error = -OPW(fp, td, fop->compat_ioctl(filp,
 			    cmd, (u_long)data));
 		} else {
 			error = ENOTTY;
 		}
 
 		/* fallback to the regular IOCTL handler, if any */
 		if (error == ENOTTY && fop->unlocked_ioctl != NULL) {
 			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
 			    cmd, (u_long)data));
 		}
 	} else
 #endif
 	{
 		if (fop->unlocked_ioctl != NULL) {
 			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
 			    cmd, (u_long)data));
 		} else {
 			error = ENOTTY;
 		}
 	}
 	if (size > 0) {
 		task->bsd_ioctl_data = NULL;
 		task->bsd_ioctl_len = 0;
 	}
 
 	if (error == EWOULDBLOCK) {
 		/* update kqfilter status, if any */
 		linux_file_kqfilter_poll(filp,
 		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
 	} else {
 		error = linux_get_error(task, error);
 	}
 	return (error);
 }
 
 #define	LINUX_POLL_TABLE_NORMAL ((poll_table *)1)
 
 /*
  * This function atomically updates the poll wakeup state and returns
  * the previous state at the time of update.
  */
 static uint8_t
 linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate)
 {
 	int c, old;
 
 	c = v->counter;
 
 	while ((old = atomic_cmpxchg(v, c, pstate[c])) != c)
 		c = old;
 
 	return (c);
 }
 
 
 static int
 linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key)
 {
 	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
 		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */
 		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
 		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY,
 		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */
 	};
 	struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq);
 
 	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
 	case LINUX_FWQ_STATE_QUEUED:
 		linux_poll_wakeup(filp);
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 void
 linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p)
 {
 	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
 		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY,
 		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
 		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */
 		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED,
 	};
 
 	/* check if we are called inside the select system call */
 	if (p == LINUX_POLL_TABLE_NORMAL)
 		selrecord(curthread, &filp->f_selinfo);
 
 	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
 	case LINUX_FWQ_STATE_INIT:
 		/* NOTE: file handles can only belong to one wait-queue */
 		filp->f_wait_queue.wqh = wqh;
 		filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback;
 		add_wait_queue(wqh, &filp->f_wait_queue.wq);
 		atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED);
 		break;
 	default:
 		break;
 	}
 }
 
 static void
 linux_poll_wait_dequeue(struct linux_file *filp)
 {
 	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
 		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT,	/* NOP */
 		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT,
 		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT,
 		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT,
 	};
 
 	seldrain(&filp->f_selinfo);
 
 	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
 	case LINUX_FWQ_STATE_NOT_READY:
 	case LINUX_FWQ_STATE_QUEUED:
 	case LINUX_FWQ_STATE_READY:
 		remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq);
 		break;
 	default:
 		break;
 	}
 }
 
 void
 linux_poll_wakeup(struct linux_file *filp)
 {
 	/* this function should be NULL-safe */
 	if (filp == NULL)
 		return;
 
 	selwakeup(&filp->f_selinfo);
 
 	spin_lock(&filp->f_kqlock);
 	filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ |
 	    LINUX_KQ_FLAG_NEED_WRITE;
 
 	/* make sure the "knote" gets woken up */
 	KNOTE_LOCKED(&filp->f_selinfo.si_note, 1);
 	spin_unlock(&filp->f_kqlock);
 }
 
 static void
 linux_file_kqfilter_detach(struct knote *kn)
 {
 	struct linux_file *filp = kn->kn_hook;
 
 	spin_lock(&filp->f_kqlock);
 	knlist_remove(&filp->f_selinfo.si_note, kn, 1);
 	spin_unlock(&filp->f_kqlock);
 }
 
 static int
 linux_file_kqfilter_read_event(struct knote *kn, long hint)
 {
 	struct linux_file *filp = kn->kn_hook;
 
 	mtx_assert(&filp->f_kqlock.m, MA_OWNED);
 
 	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0);
 }
 
 static int
 linux_file_kqfilter_write_event(struct knote *kn, long hint)
 {
 	struct linux_file *filp = kn->kn_hook;
 
 	mtx_assert(&filp->f_kqlock.m, MA_OWNED);
 
 	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0);
 }
 
 static struct filterops linux_dev_kqfiltops_read = {
 	.f_isfd = 1,
 	.f_detach = linux_file_kqfilter_detach,
 	.f_event = linux_file_kqfilter_read_event,
 };
 
 static struct filterops linux_dev_kqfiltops_write = {
 	.f_isfd = 1,
 	.f_detach = linux_file_kqfilter_detach,
 	.f_event = linux_file_kqfilter_write_event,
 };
 
 static void
 linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)
 {
 	struct thread *td;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	int temp;
 
 	if ((filp->f_kqflags & kqflags) == 0)
 		return;
 
 	td = curthread;
 
 	linux_get_fop(filp, &fop, &ldev);
 	/* get the latest polling state */
 	temp = OPW(filp->_file, td, fop->poll(filp, NULL));
 	linux_drop_fop(ldev);
 
 	spin_lock(&filp->f_kqlock);
 	/* clear kqflags */
 	filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
 	    LINUX_KQ_FLAG_NEED_WRITE);
 	/* update kqflags */
 	if ((temp & (POLLIN | POLLOUT)) != 0) {
 		if ((temp & POLLIN) != 0)
 			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
 		if ((temp & POLLOUT) != 0)
 			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;
 
 		/* make sure the "knote" gets woken up */
 		KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
 	}
 	spin_unlock(&filp->f_kqlock);
 }
 
 static int
 linux_file_kqfilter(struct file *file, struct knote *kn)
 {
 	struct linux_file *filp;
 	struct thread *td;
 	int error;
 
 	td = curthread;
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 	if (filp->f_op->poll == NULL)
 		return (EINVAL);
 
 	spin_lock(&filp->f_kqlock);
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ;
 		kn->kn_fop = &linux_dev_kqfiltops_read;
 		kn->kn_hook = filp;
 		knlist_add(&filp->f_selinfo.si_note, kn, 1);
 		error = 0;
 		break;
 	case EVFILT_WRITE:
 		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE;
 		kn->kn_fop = &linux_dev_kqfiltops_write;
 		kn->kn_hook = filp;
 		knlist_add(&filp->f_selinfo.si_note, kn, 1);
 		error = 0;
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	spin_unlock(&filp->f_kqlock);
 
 	if (error == 0) {
 		linux_set_current(td);
 
 		/* update kqfilter status, if any */
 		linux_file_kqfilter_poll(filp,
 		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
 	}
 	return (error);
 }
 
 static int
 linux_file_mmap_single(struct file *fp, const struct file_operations *fop,
     vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,
     int nprot, struct thread *td)
 {
 	struct task_struct *task;
 	struct vm_area_struct *vmap;
 	struct mm_struct *mm;
 	struct linux_file *filp;
 	vm_memattr_t attr;
 	int error;
 
 	filp = (struct linux_file *)fp->f_data;
 	filp->f_flags = fp->f_flag;
 
 	if (fop->mmap == NULL)
 		return (EOPNOTSUPP);
 
 	linux_set_current(td);
 
 	/*
 	 * The same VM object might be shared by multiple processes
 	 * and the mm_struct is usually freed when a process exits.
 	 *
 	 * The atomic reference below makes sure the mm_struct is
 	 * available as long as the vmap is in the linux_vma_head.
 	 */
 	task = current;
 	mm = task->mm;
 	if (atomic_inc_not_zero(&mm->mm_users) == 0)
 		return (EINVAL);
 
 	vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);
 	vmap->vm_start = 0;
 	vmap->vm_end = size;
 	vmap->vm_pgoff = *offset / PAGE_SIZE;
 	vmap->vm_pfn = 0;
 	vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);
 	vmap->vm_ops = NULL;
 	vmap->vm_file = get_file(filp);
 	vmap->vm_mm = mm;
 
 	if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
 		error = linux_get_error(task, EINTR);
 	} else {
 		error = -OPW(fp, td, fop->mmap(filp, vmap));
 		error = linux_get_error(task, error);
 		up_write(&vmap->vm_mm->mmap_sem);
 	}
 
 	if (error != 0) {
 		linux_cdev_handle_free(vmap);
 		return (error);
 	}
 
 	attr = pgprot2cachemode(vmap->vm_page_prot);
 
 	if (vmap->vm_ops != NULL) {
 		struct vm_area_struct *ptr;
 		void *vm_private_data;
 		bool vm_no_fault;
 
 		if (vmap->vm_ops->open == NULL ||
 		    vmap->vm_ops->close == NULL ||
 		    vmap->vm_private_data == NULL) {
 			/* free allocated VM area struct */
 			linux_cdev_handle_free(vmap);
 			return (EINVAL);
 		}
 
 		vm_private_data = vmap->vm_private_data;
 
 		rw_wlock(&linux_vma_lock);
 		TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
 			if (ptr->vm_private_data == vm_private_data)
 				break;
 		}
 		/* check if there is an existing VM area struct */
 		if (ptr != NULL) {
 			/* check if the VM area structure is invalid */
 			if (ptr->vm_ops == NULL ||
 			    ptr->vm_ops->open == NULL ||
 			    ptr->vm_ops->close == NULL) {
 				error = ESTALE;
 				vm_no_fault = 1;
 			} else {
 				error = EEXIST;
 				vm_no_fault = (ptr->vm_ops->fault == NULL);
 			}
 		} else {
 			/* insert VM area structure into list */
 			TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
 			error = 0;
 			vm_no_fault = (vmap->vm_ops->fault == NULL);
 		}
 		rw_wunlock(&linux_vma_lock);
 
 		if (error != 0) {
 			/* free allocated VM area struct */
 			linux_cdev_handle_free(vmap);
 			/* check for stale VM area struct */
 			if (error != EEXIST)
 				return (error);
 		}
 
 		/* check if there is no fault handler */
 		if (vm_no_fault) {
 			*object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE,
 			    &linux_cdev_pager_ops[1], size, nprot, *offset,
 			    td->td_ucred);
 		} else {
 			*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
 			    &linux_cdev_pager_ops[0], size, nprot, *offset,
 			    td->td_ucred);
 		}
 
 		/* check if allocating the VM object failed */
 		if (*object == NULL) {
 			if (error == 0) {
 				/* remove VM area struct from list */
 				linux_cdev_handle_remove(vmap);
 				/* free allocated VM area struct */
 				linux_cdev_handle_free(vmap);
 			}
 			return (EINVAL);
 		}
 	} else {
 		struct sglist *sg;
 
 		sg = sglist_alloc(1, M_WAITOK);
 		sglist_append_phys(sg,
 		    (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);
 
 		*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
 		    nprot, 0, td->td_ucred);
 
 		linux_cdev_handle_free(vmap);
 
 		if (*object == NULL) {
 			sglist_free(sg);
 			return (EINVAL);
 		}
 	}
 
 	if (attr != VM_MEMATTR_DEFAULT) {
 		VM_OBJECT_WLOCK(*object);
 		vm_object_set_memattr(*object, attr);
 		VM_OBJECT_WUNLOCK(*object);
 	}
 	*offset = 0;
 	return (0);
 }
 
 struct cdevsw linuxcdevsw = {
 	.d_version = D_VERSION,
 	.d_fdopen = linux_dev_fdopen,
 	.d_name = "lkpidev",
 };
 
 static int
 linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	ssize_t bytes;
 	int error;
 
 	error = 0;
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 	/* XXX no support for I/O vectors currently */
 	if (uio->uio_iovcnt != 1)
 		return (EOPNOTSUPP);
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 	linux_set_current(td);
 	linux_get_fop(filp, &fop, &ldev);
 	if (fop->read != NULL) {
 		bytes = OPW(file, td, fop->read(filp,
 		    uio->uio_iov->iov_base,
 		    uio->uio_iov->iov_len, &uio->uio_offset));
 		if (bytes >= 0) {
 			uio->uio_iov->iov_base =
 			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
 			uio->uio_iov->iov_len -= bytes;
 			uio->uio_resid -= bytes;
 		} else {
 			error = linux_get_error(current, -bytes);
 		}
 	} else
 		error = ENXIO;
 
 	/* update kqfilter status, if any */
 	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
 	linux_drop_fop(ldev);
 
 	return (error);
 }
 
 static int
 linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	ssize_t bytes;
 	int error;
 
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 	/* XXX no support for I/O vectors currently */
 	if (uio->uio_iovcnt != 1)
 		return (EOPNOTSUPP);
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 	linux_set_current(td);
 	linux_get_fop(filp, &fop, &ldev);
 	if (fop->write != NULL) {
 		bytes = OPW(file, td, fop->write(filp,
 		    uio->uio_iov->iov_base,
 		    uio->uio_iov->iov_len, &uio->uio_offset));
 		if (bytes >= 0) {
 			uio->uio_iov->iov_base =
 			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
 			uio->uio_iov->iov_len -= bytes;
 			uio->uio_resid -= bytes;
 			error = 0;
 		} else {
 			error = linux_get_error(current, -bytes);
 		}
 	} else
 		error = ENXIO;
 
 	/* update kqfilter status, if any */
 	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);
 
 	linux_drop_fop(ldev);
 
 	return (error);
 }
 
 static int
 linux_file_poll(struct file *file, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	int revents;
 
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 	linux_set_current(td);
 	linux_get_fop(filp, &fop, &ldev);
 	if (fop->poll != NULL) {
 		revents = OPW(file, td, fop->poll(filp,
 		    LINUX_POLL_TABLE_NORMAL)) & events;
 	} else {
 		revents = 0;
 	}
 	linux_drop_fop(ldev);
 	return (revents);
 }
 
 static int
 linux_file_close(struct file *file, struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	int error;
 
 	filp = (struct linux_file *)file->f_data;
 
 	KASSERT(file_count(filp) == 0,
 	    ("File refcount(%d) is not zero", file_count(filp)));
 
 	error = 0;
 	filp->f_flags = file->f_flag;
 	linux_set_current(td);
 	linux_poll_wait_dequeue(filp);
 	linux_get_fop(filp, &fop, &ldev);
 	if (fop->release != NULL)
 		error = -OPW(file, td, fop->release(filp->f_vnode, filp));
 	funsetown(&filp->f_sigio);
 	if (filp->f_vnode != NULL)
 		vdrop(filp->f_vnode);
 	linux_drop_fop(ldev);
 	if (filp->f_cdev != NULL)
 		linux_cdev_deref(filp->f_cdev);
 	kfree(filp);
 
 	return (error);
 }
 
 static int
 linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
     struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	int error;
 
 	error = 0;
 	filp = (struct linux_file *)fp->f_data;
 	filp->f_flags = fp->f_flag;
 	linux_get_fop(filp, &fop, &ldev);
 
 	linux_set_current(td);
 	switch (cmd) {
 	case FIONBIO:
 		break;
 	case FIOASYNC:
 		if (fop->fasync == NULL)
 			break;
 		error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC));
 		break;
 	case FIOSETOWN:
 		error = fsetown(*(int *)data, &filp->f_sigio);
 		if (error == 0) {
 			if (fop->fasync == NULL)
 				break;
 			error = -OPW(fp, td, fop->fasync(0, filp,
 			    fp->f_flag & FASYNC));
 		}
 		break;
 	case FIOGETOWN:
 		*(int *)data = fgetown(&filp->f_sigio);
 		break;
 	default:
 		error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td);
 		break;
 	}
 	linux_drop_fop(ldev);
 	return (error);
 }
 
 static int
 linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
     vm_prot_t *maxprotp, int *flagsp, struct file *fp,
     vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)
 {
 	/*
 	 * Character devices do not provide private mappings
 	 * of any kind:
 	 */
 	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
 	    (prot & VM_PROT_WRITE) != 0)
 		return (EACCES);
 	if ((*flagsp & (MAP_PRIVATE | MAP_COPY)) != 0)
 		return (EINVAL);
 
 	return (linux_file_mmap_single(fp, fop, foff, objsize, objp,
 	    (int)prot, td));
 }
 
 static int
 linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
     struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	struct mount *mp;
 	struct vnode *vp;
 	vm_object_t object;
 	vm_prot_t maxprot;
 	int error;
 
 	filp = (struct linux_file *)fp->f_data;
 
 	vp = filp->f_vnode;
 	if (vp == NULL)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Ensure that file and memory protections are
 	 * compatible.
 	 */
 	mp = vp->v_mount;
 	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
 		maxprot = VM_PROT_NONE;
 		if ((prot & VM_PROT_EXECUTE) != 0)
 			return (EACCES);
 	} else
 		maxprot = VM_PROT_EXECUTE;
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_READ;
 	else if ((prot & VM_PROT_READ) != 0)
 		return (EACCES);
 
 	/*
 	 * If we are sharing potential changes via MAP_SHARED and we
 	 * are trying to get write permission although we opened it
 	 * without asking for it, bail out.
 	 *
 	 * Note that most character devices always share mappings.
 	 *
 	 * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE
 	 * requests rather than doing it here.
 	 */
 	if ((flags & MAP_SHARED) != 0) {
 		if ((fp->f_flag & FWRITE) != 0)
 			maxprot |= VM_PROT_WRITE;
 		else if ((prot & VM_PROT_WRITE) != 0)
 			return (EACCES);
 	}
 	maxprot &= cap_maxprot;
 
 	linux_get_fop(filp, &fop, &ldev);
 	error = linux_file_mmap_sub(td, size, prot, &maxprot, &flags, fp,
 	    &foff, fop, &object);
 	if (error != 0)
 		goto out;
 
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 	    foff, FALSE, td);
 	if (error != 0)
 		vm_object_deallocate(object);
 out:
 	linux_drop_fop(ldev);
 	return (error);
 }
 
 static int
 linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct linux_file *filp;
 	struct vnode *vp;
 	int error;
 
 	filp = (struct linux_file *)fp->f_data;
 	if (filp->f_vnode == NULL)
 		return (EOPNOTSUPP);
 
 	vp = filp->f_vnode;
 
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
 	VOP_UNLOCK(vp, 0);
 
 	return (error);
 }
 
 static int
 linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,
     struct filedesc *fdp)
 {
 	struct linux_file *filp;
 	struct vnode *vp;
 	int error;
 
 	filp = fp->f_data;
 	vp = filp->f_vnode;
 	if (vp == NULL) {
 		error = 0;
 		kif->kf_type = KF_TYPE_DEV;
 	} else {
 		vref(vp);
 		FILEDESC_SUNLOCK(fdp);
 		error = vn_fill_kinfo_vnode(vp, kif);
 		vrele(vp);
 		kif->kf_type = KF_TYPE_VNODE;
 		FILEDESC_SLOCK(fdp);
 	}
 	return (error);
 }
 
 unsigned int
 linux_iminor(struct inode *inode)
 {
 	struct linux_cdev *ldev;
 
 	if (inode == NULL || inode->v_rdev == NULL ||
 	    inode->v_rdev->si_devsw != &linuxcdevsw)
 		return (-1U);
 	ldev = inode->v_rdev->si_drv1;
 	if (ldev == NULL)
 		return (-1U);
 
 	return (minor(ldev->dev));
 }
 
 struct fileops linuxfileops = {
 	.fo_read = linux_file_read,
 	.fo_write = linux_file_write,
 	.fo_truncate = invfo_truncate,
 	.fo_kqfilter = linux_file_kqfilter,
 	.fo_stat = linux_file_stat,
 	.fo_fill_kinfo = linux_file_fill_kinfo,
 	.fo_poll = linux_file_poll,
 	.fo_close = linux_file_close,
 	.fo_ioctl = linux_file_ioctl,
 	.fo_mmap = linux_file_mmap,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_flags = DFLAG_PASSABLE,
 };
 
 /*
  * Hash of vmmap addresses.  This is infrequently accessed and does not
  * need to be particularly large.  This is done because we must store the
  * caller's idea of the map size to properly unmap.
  */
 struct vmmap {
 	LIST_ENTRY(vmmap)	vm_next;
 	void 			*vm_addr;
 	unsigned long		vm_size;
 };
 
 struct vmmaphd {
 	struct vmmap *lh_first;
 };
 #define	VMMAP_HASH_SIZE	64
 #define	VMMAP_HASH_MASK	(VMMAP_HASH_SIZE - 1)
 #define	VM_HASH(addr)	((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK
 static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];
 static struct mtx vmmaplock;
 
 static void
 vmmap_add(void *addr, unsigned long size)
 {
 	struct vmmap *vmmap;
 
 	vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL);
 	mtx_lock(&vmmaplock);
 	vmmap->vm_size = size;
 	vmmap->vm_addr = addr;
 	LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next);
 	mtx_unlock(&vmmaplock);
 }
 
 static struct vmmap *
 vmmap_remove(void *addr)
 {
 	struct vmmap *vmmap;
 
 	mtx_lock(&vmmaplock);
 	LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)
 		if (vmmap->vm_addr == addr)
 			break;
 	if (vmmap)
 		LIST_REMOVE(vmmap, vm_next);
 	mtx_unlock(&vmmaplock);
 
 	return (vmmap);
 }
 
 #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__)
 void *
 _ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)
 {
 	void *addr;
 
 	addr = pmap_mapdev_attr(phys_addr, size, attr);
 	if (addr == NULL)
 		return (NULL);
 	vmmap_add(addr, size);
 
 	return (addr);
 }
 #endif
 
 void
 iounmap(void *addr)
 {
 	struct vmmap *vmmap;
 
 	vmmap = vmmap_remove(addr);
 	if (vmmap == NULL)
 		return;
 #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__)
 	pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size);
 #endif
 	kfree(vmmap);
 }
 
 
 void *
 vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)
 {
 	vm_offset_t off;
 	size_t size;
 
 	size = count * PAGE_SIZE;
 	off = kva_alloc(size);
 	if (off == 0)
 		return (NULL);
 	vmmap_add((void *)off, size);
 	pmap_qenter(off, pages, count);
 
 	return ((void *)off);
 }
 
 void
 vunmap(void *addr)
 {
 	struct vmmap *vmmap;
 
 	vmmap = vmmap_remove(addr);
 	if (vmmap == NULL)
 		return;
 	pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE);
 	kva_free((vm_offset_t)addr, vmmap->vm_size);
 	kfree(vmmap);
 }
 
 char *
 kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
 {
 	unsigned int len;
 	char *p;
 	va_list aq;
 
 	va_copy(aq, ap);
 	len = vsnprintf(NULL, 0, fmt, aq);
 	va_end(aq);
 
 	p = kmalloc(len + 1, gfp);
 	if (p != NULL)
 		vsnprintf(p, len + 1, fmt, ap);
 
 	return (p);
 }
 
 char *
 kasprintf(gfp_t gfp, const char *fmt, ...)
 {
 	va_list ap;
 	char *p;
 
 	va_start(ap, fmt);
 	p = kvasprintf(gfp, fmt, ap);
 	va_end(ap);
 
 	return (p);
 }
 
 static void
 linux_timer_callback_wrapper(void *context)
 {
 	struct timer_list *timer;
 
 	linux_set_current(curthread);
 
 	timer = context;
 	timer->function(timer->data);
 }
 
 void
 mod_timer(struct timer_list *timer, int expires)
 {
 
 	timer->expires = expires;
 	callout_reset(&timer->callout,
 	    linux_timer_jiffies_until(expires),
 	    &linux_timer_callback_wrapper, timer);
 }
 
 void
 add_timer(struct timer_list *timer)
 {
 
 	callout_reset(&timer->callout,
 	    linux_timer_jiffies_until(timer->expires),
 	    &linux_timer_callback_wrapper, timer);
 }
 
 void
 add_timer_on(struct timer_list *timer, int cpu)
 {
 
 	callout_reset_on(&timer->callout,
 	    linux_timer_jiffies_until(timer->expires),
 	    &linux_timer_callback_wrapper, timer, cpu);
 }
 
+int
+del_timer(struct timer_list *timer)
+{
+
+	if (callout_stop(&(timer)->callout) == -1)
+		return (0);
+	return (1);
+}
+
 static void
 linux_timer_init(void *arg)
 {
 
 	/*
 	 * Compute an internal HZ value which can divide 2**32 to
 	 * avoid timer rounding problems when the tick value wraps
 	 * around 2**32:
 	 */
 	linux_timer_hz_mask = 1;
 	while (linux_timer_hz_mask < (unsigned long)hz)
 		linux_timer_hz_mask *= 2;
 	linux_timer_hz_mask--;
 }
 SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);
 
 void
 linux_complete_common(struct completion *c, int all)
 {
 	int wakeup_swapper;
 
 	sleepq_lock(c);
 	if (all) {
 		c->done = UINT_MAX;
 		wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
 	} else {
 		if (c->done != UINT_MAX)
 			c->done++;
 		wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
 	}
 	sleepq_release(c);
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Indefinite wait for done != 0 with or without signals.
  */
 int
 linux_wait_for_common(struct completion *c, int flags)
 {
 	struct task_struct *task;
 	int error;
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 
 	task = current;
 
 	if (flags != 0)
 		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
 	else
 		flags = SLEEPQ_SLEEP;
 	error = 0;
 	for (;;) {
 		sleepq_lock(c);
 		if (c->done)
 			break;
 		sleepq_add(c, NULL, "completion", flags, 0);
 		if (flags & SLEEPQ_INTERRUPTIBLE) {
 			DROP_GIANT();
 			error = -sleepq_wait_sig(c, 0);
 			PICKUP_GIANT();
 			if (error != 0) {
 				linux_schedule_save_interrupt_value(task, error);
 				error = -ERESTARTSYS;
 				goto intr;
 			}
 		} else {
 			DROP_GIANT();
 			sleepq_wait(c, 0);
 			PICKUP_GIANT();
 		}
 	}
 	if (c->done != UINT_MAX)
 		c->done--;
 	sleepq_release(c);
 
 intr:
 	return (error);
 }
 
 /*
  * Time limited wait for done != 0 with or without signals.
  */
 int
 linux_wait_for_timeout_common(struct completion *c, int timeout, int flags)
 {
 	struct task_struct *task;
 	int end = jiffies + timeout;
 	int error;
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 
 	task = current;
 
 	if (flags != 0)
 		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
 	else
 		flags = SLEEPQ_SLEEP;
 
 	for (;;) {
 		sleepq_lock(c);
 		if (c->done)
 			break;
 		sleepq_add(c, NULL, "completion", flags, 0);
 		sleepq_set_timeout(c, linux_timer_jiffies_until(end));
 
 		DROP_GIANT();
 		if (flags & SLEEPQ_INTERRUPTIBLE)
 			error = -sleepq_timedwait_sig(c, 0);
 		else
 			error = -sleepq_timedwait(c, 0);
 		PICKUP_GIANT();
 
 		if (error != 0) {
 			/* check for timeout */
 			if (error == -EWOULDBLOCK) {
 				error = 0;	/* timeout */
 			} else {
 				/* signal happened */
 				linux_schedule_save_interrupt_value(task, error);
 				error = -ERESTARTSYS;
 			}
 			goto done;
 		}
 	}
 	if (c->done != UINT_MAX)
 		c->done--;
 	sleepq_release(c);
 
 	/* return how many jiffies are left */
 	error = linux_timer_jiffies_until(end);
 done:
 	return (error);
 }
 
 int
 linux_try_wait_for_completion(struct completion *c)
 {
 	int isdone;
 
 	sleepq_lock(c);
 	isdone = (c->done != 0);
 	if (c->done != 0 && c->done != UINT_MAX)
 		c->done--;
 	sleepq_release(c);
 	return (isdone);
 }
 
 int
 linux_completion_done(struct completion *c)
 {
 	int isdone;
 
 	sleepq_lock(c);
 	isdone = (c->done != 0);
 	sleepq_release(c);
 	return (isdone);
 }
 
 static void
 linux_cdev_deref(struct linux_cdev *ldev)
 {
 
 	if (refcount_release(&ldev->refs))
 		kfree(ldev);
 }
 
 static void
 linux_cdev_release(struct kobject *kobj)
 {
 	struct linux_cdev *cdev;
 	struct kobject *parent;
 
 	cdev = container_of(kobj, struct linux_cdev, kobj);
 	parent = kobj->parent;
 	linux_destroy_dev(cdev);
 	linux_cdev_deref(cdev);
 	kobject_put(parent);
 }
 
 static void
 linux_cdev_static_release(struct kobject *kobj)
 {
 	struct linux_cdev *cdev;
 	struct kobject *parent;
 
 	cdev = container_of(kobj, struct linux_cdev, kobj);
 	parent = kobj->parent;
 	linux_destroy_dev(cdev);
 	kobject_put(parent);
 }
 
 void
 linux_destroy_dev(struct linux_cdev *ldev)
 {
 
 	if (ldev->cdev == NULL)
 		return;
 
 	MPASS((ldev->siref & LDEV_SI_DTR) == 0);
 	atomic_set_int(&ldev->siref, LDEV_SI_DTR);
 	while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0)
 		pause("ldevdtr", hz / 4);
 
 	destroy_dev(ldev->cdev);
 	ldev->cdev = NULL;
 }
 
 const struct kobj_type linux_cdev_ktype = {
 	.release = linux_cdev_release,
 };
 
 const struct kobj_type linux_cdev_static_ktype = {
 	.release = linux_cdev_static_release,
 };
 
 static void
 linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
 {
 	struct notifier_block *nb;
 
 	nb = arg;
 	if (linkstate == LINK_STATE_UP)
 		nb->notifier_call(nb, NETDEV_UP, ifp);
 	else
 		nb->notifier_call(nb, NETDEV_DOWN, ifp);
 }
 
 static void
 linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb;
 
 	nb = arg;
 	nb->notifier_call(nb, NETDEV_REGISTER, ifp);
 }
 
 static void
 linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb;
 
 	nb = arg;
 	nb->notifier_call(nb, NETDEV_UNREGISTER, ifp);
 }
 
 static void
 linux_handle_iflladdr_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb;
 
 	nb = arg;
 	nb->notifier_call(nb, NETDEV_CHANGEADDR, ifp);
 }
 
 static void
 linux_handle_ifaddr_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb;
 
 	nb = arg;
 	nb->notifier_call(nb, NETDEV_CHANGEIFADDR, ifp);
 }
 
 int
 register_netdevice_notifier(struct notifier_block *nb)
 {
 
 	nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(
 	    ifnet_link_event, linux_handle_ifnet_link_event, nb, 0);
 	nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(
 	    ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0);
 	nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
 	    ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0);
 	nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER(
 	    iflladdr_event, linux_handle_iflladdr_event, nb, 0);
 
 	return (0);
 }
 
 int
 register_inetaddr_notifier(struct notifier_block *nb)
 {
 
 	nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER(
 	    ifaddr_event, linux_handle_ifaddr_event, nb, 0);
 	return (0);
 }
 
 int
 unregister_netdevice_notifier(struct notifier_block *nb)
 {
 
 	EVENTHANDLER_DEREGISTER(ifnet_link_event,
 	    nb->tags[NETDEV_UP]);
 	EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
 	    nb->tags[NETDEV_REGISTER]);
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 	    nb->tags[NETDEV_UNREGISTER]);
 	EVENTHANDLER_DEREGISTER(iflladdr_event,
 	    nb->tags[NETDEV_CHANGEADDR]);
 
 	return (0);
 }
 
 int
 unregister_inetaddr_notifier(struct notifier_block *nb)
 {
 
 	EVENTHANDLER_DEREGISTER(ifaddr_event,
 	    nb->tags[NETDEV_CHANGEIFADDR]);
 
 	return (0);
 }
 
 struct list_sort_thunk {
 	int (*cmp)(void *, struct list_head *, struct list_head *);
 	void *priv;
 };
 
 static inline int
 linux_le_cmp(void *priv, const void *d1, const void *d2)
 {
 	struct list_head *le1, *le2;
 	struct list_sort_thunk *thunk;
 
 	thunk = priv;
 	le1 = *(__DECONST(struct list_head **, d1));
 	le2 = *(__DECONST(struct list_head **, d2));
 	return ((thunk->cmp)(thunk->priv, le1, le2));
 }
 
 void
 list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv,
     struct list_head *a, struct list_head *b))
 {
 	struct list_sort_thunk thunk;
 	struct list_head **ar, *le;
 	size_t count, i;
 
 	count = 0;
 	list_for_each(le, head)
 		count++;
 	ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK);
 	i = 0;
 	list_for_each(le, head)
 		ar[i++] = le;
 	thunk.cmp = cmp;
 	thunk.priv = priv;
 	qsort_r(ar, count, sizeof(struct list_head *), &thunk, linux_le_cmp);
 	INIT_LIST_HEAD(head);
 	for (i = 0; i < count; i++)
 		list_add_tail(ar[i], head);
 	free(ar, M_KMALLOC);
 }
 
 void
 linux_irq_handler(void *ent)
 {
 	struct irq_ent *irqe;
 
 	linux_set_current(curthread);
 
 	irqe = ent;
 	irqe->handler(irqe->irq, irqe->arg);
 }
 
 #if defined(__i386__) || defined(__amd64__)
 int
 linux_wbinvd_on_all_cpus(void)
 {
 
 	pmap_invalidate_cache();
 	return (0);
 }
 #endif
 
 int
 linux_on_each_cpu(void callback(void *), void *data)
 {
 
 	smp_rendezvous(smp_no_rendezvous_barrier, callback,
 	    smp_no_rendezvous_barrier, data);
 	return (0);
 }
 
 int
 linux_in_atomic(void)
 {
 
 	return ((curthread->td_pflags & TDP_NOFAULTING) != 0);
 }
 
 struct linux_cdev *
 linux_find_cdev(const char *name, unsigned major, unsigned minor)
 {
 	dev_t dev = MKDEV(major, minor);
 	struct cdev *cdev;
 
 	dev_lock();
 	LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) {
 		struct linux_cdev *ldev = cdev->si_drv1;
 		if (ldev->dev == dev &&
 		    strcmp(kobject_name(&ldev->kobj), name) == 0) {
 			break;
 		}
 	}
 	dev_unlock();
 
 	return (cdev != NULL ? cdev->si_drv1 : NULL);
 }
 
 int
 __register_chrdev(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name,
     const struct file_operations *fops)
 {
 	struct linux_cdev *cdev;
 	int ret = 0;
 	int i;
 
 	for (i = baseminor; i < baseminor + count; i++) {
 		cdev = cdev_alloc();
 		cdev->ops = fops;
 		kobject_set_name(&cdev->kobj, name);
 
 		ret = cdev_add(cdev, makedev(major, i), 1);
 		if (ret != 0)
 			break;
 	}
 	return (ret);
 }
 
 int
 __register_chrdev_p(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name,
     const struct file_operations *fops, uid_t uid,
     gid_t gid, int mode)
 {
 	struct linux_cdev *cdev;
 	int ret = 0;
 	int i;
 
 	for (i = baseminor; i < baseminor + count; i++) {
 		cdev = cdev_alloc();
 		cdev->ops = fops;
 		kobject_set_name(&cdev->kobj, name);
 
 		ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode);
 		if (ret != 0)
 			break;
 	}
 	return (ret);
 }
 
 void
 __unregister_chrdev(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name)
 {
 	struct linux_cdev *cdevp;
 	int i;
 
 	for (i = baseminor; i < baseminor + count; i++) {
 		cdevp = linux_find_cdev(name, major, i);
 		if (cdevp != NULL)
 			cdev_del(cdevp);
 	}
 }
 
 void
 linux_dump_stack(void)
 {
 #ifdef STACK
 	struct stack st;
 
 	stack_zero(&st);
 	stack_save(&st);
 	stack_print(&st);
 #endif
 }
 
 #if defined(__i386__) || defined(__amd64__)
 bool linux_cpu_has_clflush;
 #endif
 
 static void
 linux_compat_init(void *arg)
 {
 	struct sysctl_oid *rootoid;
 	int i;
 
 #if defined(__i386__) || defined(__amd64__)
 	linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);
 #endif
 	rw_init(&linux_vma_lock, "lkpi-vma-lock");
 
 	rootoid = SYSCTL_ADD_ROOT_NODE(NULL,
 	    OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
 	kobject_init(&linux_class_root, &linux_class_ktype);
 	kobject_set_name(&linux_class_root, "class");
 	linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
 	    OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
 	kobject_init(&linux_root_device.kobj, &linux_dev_ktype);
 	kobject_set_name(&linux_root_device.kobj, "device");
 	linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL,
 	    SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD, NULL,
 	    "device");
 	linux_root_device.bsddev = root_bus;
 	linux_class_misc.name = "misc";
 	class_register(&linux_class_misc);
 	INIT_LIST_HEAD(&pci_drivers);
 	INIT_LIST_HEAD(&pci_devices);
 	spin_lock_init(&pci_lock);
 	mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);
 	for (i = 0; i < VMMAP_HASH_SIZE; i++)
 		LIST_INIT(&vmmaphead[i]);
 }
 SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);
 
 static void
 linux_compat_uninit(void *arg)
 {
 	linux_kobject_kfree_name(&linux_class_root);
 	linux_kobject_kfree_name(&linux_root_device.kobj);
 	linux_kobject_kfree_name(&linux_class_misc.kobj);
 
 	mtx_destroy(&vmmaplock);
 	spin_lock_destroy(&pci_lock);
 	rw_destroy(&linux_vma_lock);
 }
 SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);
 
 /*
  * NOTE: Linux frequently uses "unsigned long" for pointer to integer
  * conversion and vice versa, where in FreeBSD "uintptr_t" would be
  * used. Assert these types have the same size, else some parts of the
  * LinuxKPI may not work like expected:
  */
 CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t));
Index: projects/runtime-coverage-v2/sys/conf/kern.pre.mk
===================================================================
--- projects/runtime-coverage-v2/sys/conf/kern.pre.mk	(revision 347598)
+++ projects/runtime-coverage-v2/sys/conf/kern.pre.mk	(revision 347599)
@@ -1,320 +1,320 @@
 # $FreeBSD$
 
 # Part of a unified Makefile for building kernels.  This part contains all
 # of the definitions that need to be before %BEFORE_DEPEND.
 
 # Allow user to configure things that only effect src tree builds.
 # Note: This is duplicated from src.sys.mk to ensure that we include
 # /etc/src.conf when building the kernel. Kernels can be built without
 # the rest of /usr/src, but they still always process SRCCONF even though
 # the normal mechanisms to prevent that (compiling out of tree) won't
 # work. To ensure they do work, we have to duplicate thee few lines here.
 SRCCONF?=	/etc/src.conf
 .if (exists(${SRCCONF}) || ${SRCCONF} != "/etc/src.conf") && !target(_srcconf_included_)
 .include "${SRCCONF}"
 _srcconf_included_:
 .endif
 
 .include <bsd.own.mk>
 .include <bsd.compiler.mk>
 .include "kern.opts.mk"
 
 # The kernel build always occurs in the object directory which is .CURDIR.
 .if ${.MAKE.MODE:Unormal:Mmeta}
 .MAKE.MODE+=	curdirOk=yes
 .endif
 
 # The kernel build always expects .OBJDIR=.CURDIR.
 .OBJDIR: ${.CURDIR}
 
 .if defined(NO_OBJWALK) || ${MK_AUTO_OBJ} == "yes"
 NO_OBJWALK=		t
 NO_MODULES_OBJ=	t
 .endif
 .if !defined(NO_OBJWALK)
 _obj=		obj
 .endif
 
 # Can be overridden by makeoptions or /etc/make.conf
 KERNEL_KO?=	kernel
 KERNEL?=	kernel
 KODIR?=		/boot/${KERNEL}
 LDSCRIPT_NAME?=	ldscript.$M
 LDSCRIPT?=	$S/conf/${LDSCRIPT_NAME}
 
 M=		${MACHINE}
 
 AWK?=		awk
 CP?=		cp
 NM?=		nm
 OBJCOPY?=	objcopy
 SIZE?=		size
 
 .if defined(DEBUG)
 _MINUS_O=	-O
 CTFFLAGS+=	-g
 .else
 .if ${MACHINE_CPUARCH} == "powerpc"
 _MINUS_O=	-O	# gcc miscompiles some code at -O2
 .else
 _MINUS_O=	-O2
 .endif
 .endif
 .if ${MACHINE_CPUARCH} == "amd64"
 .if ${COMPILER_TYPE} == "clang"
 COPTFLAGS?=-O2 -pipe
 .else
 COPTFLAGS?=-O2 -frename-registers -pipe
 .endif
 .else
 COPTFLAGS?=${_MINUS_O} -pipe
 .endif
 .if !empty(COPTFLAGS:M-O[23s]) && empty(COPTFLAGS:M-fno-strict-aliasing)
 COPTFLAGS+= -fno-strict-aliasing
 .endif
 .if !defined(NO_CPU_COPTFLAGS)
 COPTFLAGS+= ${_CPUCFLAGS}
 .endif
 NOSTDINC= -nostdinc
 
 INCLUDES= ${NOSTDINC} ${INCLMAGIC} -I. -I$S -I$S/contrib/ck/include
 
 CFLAGS=	${COPTFLAGS} ${DEBUG}
 CFLAGS+= ${INCLUDES} -D_KERNEL -DHAVE_KERNEL_OPTION_HEADERS -include opt_global.h
 CFLAGS_PARAM_INLINE_UNIT_GROWTH?=100
 CFLAGS_PARAM_LARGE_FUNCTION_GROWTH?=1000
 .if ${MACHINE_CPUARCH} == "mips"
 CFLAGS_ARCH_PARAMS?=--param max-inline-insns-single=1000 -DMACHINE_ARCH='"${MACHINE_ARCH}"'
 .endif
 CFLAGS.gcc+= -fno-common -fms-extensions -finline-limit=${INLINE_LIMIT}
 CFLAGS.gcc+= --param inline-unit-growth=${CFLAGS_PARAM_INLINE_UNIT_GROWTH}
 CFLAGS.gcc+= --param large-function-growth=${CFLAGS_PARAM_LARGE_FUNCTION_GROWTH}
 CFLAGS.gcc+= -fms-extensions
 .if defined(CFLAGS_ARCH_PARAMS)
 CFLAGS.gcc+=${CFLAGS_ARCH_PARAMS}
 .endif
 WERROR?= -Werror
 
 # XXX LOCORE means "don't declare C stuff" not "for locore.s".
 ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${CFLAGS} ${ASM_CFLAGS.${.IMPSRC:T}} 
 
 .if defined(PROFLEVEL) && ${PROFLEVEL} >= 1
 CFLAGS+=	-DGPROF
 CFLAGS.gcc+=	-falign-functions=16
 .if ${PROFLEVEL} >= 2
 CFLAGS+=	-DGPROF4 -DGUPROF
 PROF=		-pg
 .if ${COMPILER_TYPE} == "gcc"
 PROF+=		-mprofiler-epilogue
 .endif
 .else
 PROF=		-pg
 .endif
 .endif
 DEFINED_PROF=	${PROF}
 
 KUBSAN_ENABLED!=	grep KUBSAN opt_global.h || true ; echo
 .if !empty(KUBSAN_ENABLED)
 SAN_CFLAGS+=	-fsanitize=undefined
 .endif
 
 COVERAGE_ENABLED!=	grep COVERAGE opt_global.h || true ; echo
 .if !empty(COVERAGE_ENABLED)
 .if ${COMPILER_TYPE} == "clang" || \
     (${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} >= 80100)
 SAN_CFLAGS+=	-fsanitize-coverage=trace-pc,trace-cmp
 .else
 SAN_CFLAGS+=	-fsanitize-coverage=trace-pc
 .endif
 .endif
 
 CFLAGS+=	${SAN_CFLAGS}
 
 GCOV_ENABLED!=	grep GCOV opt_global.h || true ; echo
 .if !empty(GCOV_ENABLED)
 .if ${COMPILER_TYPE} == "gcc"
 GCOV_CFLAGS+=	 -fprofile-arcs -ftest-coverage
 .endif
 .endif
 
 CFLAGS+=	${GCOV_CFLAGS}
 
 # Put configuration-specific C flags last (except for ${PROF}) so that they
 # can override the others.
 CFLAGS+=	${CONF_CFLAGS}
 
 .if defined(LINKER_FEATURES) && ${LINKER_FEATURES:Mbuild-id}
-LDFLAGS+=	-Wl,--build-id=sha1
+LDFLAGS+=	--build-id=sha1
 .endif
 
 .if (${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \
     ${MACHINE_CPUARCH} == "i386") && \
     defined(LINKER_FEATURES) && ${LINKER_FEATURES:Mifunc} == ""
 .error amd64/arm64/i386 kernel requires linker ifunc support
 .endif
 .if ${MACHINE_CPUARCH} == "amd64"
-LDFLAGS+=	-Wl,-z max-page-size=2097152
+LDFLAGS+=	-z max-page-size=2097152
 .if ${LINKER_TYPE} != "lld"
-LDFLAGS+=	-Wl,-z common-page-size=4096
+LDFLAGS+=	-z common-page-size=4096
 .else
-LDFLAGS+=	-Wl,-z -Wl,ifunc-noplt
+LDFLAGS+=	-z notext -z ifunc-noplt
 .endif
 .endif
 
 NORMAL_C= ${CC} -c ${CFLAGS} ${WERROR} ${PROF} ${.IMPSRC}
 NORMAL_S= ${CC:N${CCACHE_BIN}} -c ${ASM_CFLAGS} ${WERROR} ${.IMPSRC}
 PROFILE_C= ${CC} -c ${CFLAGS} ${WERROR} ${.IMPSRC}
 NORMAL_C_NOWERROR= ${CC} -c ${CFLAGS} ${PROF} ${.IMPSRC}
 
 NORMAL_M= ${AWK} -f $S/tools/makeobjops.awk ${.IMPSRC} -c ; \
 	  ${CC} -c ${CFLAGS} ${WERROR} ${PROF} ${.PREFIX}.c
 
 NORMAL_FW= uudecode -o ${.TARGET} ${.ALLSRC}
 NORMAL_FWO= ${LD} -b binary --no-warn-mismatch -d -warn-common -r \
 	-m ${LD_EMULATION} -o ${.TARGET} ${.ALLSRC:M*.fw}
 
 # for ZSTD in the kernel (include zstd/lib/freebsd before other CFLAGS)
 ZSTD_C= ${CC} -c -DZSTD_HEAPMODE=1 -I$S/contrib/zstd/lib/freebsd ${CFLAGS} -I$S/contrib/zstd/lib -I$S/contrib/zstd/lib/common ${WERROR} -Wno-inline -Wno-missing-prototypes ${PROF} -U__BMI__ ${.IMPSRC}
 
 # Common for dtrace / zfs
 CDDL_CFLAGS=	-DFREEBSD_NAMECACHE -nostdinc -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common -I$S -I$S/cddl/contrib/opensolaris/common ${CFLAGS} -Wno-unknown-pragmas -Wno-missing-prototypes -Wno-undef -Wno-strict-prototypes -Wno-cast-qual -Wno-parentheses -Wno-redundant-decls -Wno-missing-braces -Wno-uninitialized -Wno-unused -Wno-inline -Wno-switch -Wno-pointer-arith -Wno-unknown-pragmas
 CDDL_CFLAGS+=	-include $S/cddl/compat/opensolaris/sys/debug_compat.h
 CDDL_C=		${CC} -c ${CDDL_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC}
 
 # Special flags for managing the compat compiles for ZFS
 ZFS_CFLAGS=	-DBUILDING_ZFS -I$S/cddl/contrib/opensolaris/uts/common/fs/zfs
 ZFS_CFLAGS+=	-I$S/cddl/contrib/opensolaris/uts/common/fs/zfs/lua
 ZFS_CFLAGS+=	-I$S/cddl/contrib/opensolaris/uts/common/zmod
 ZFS_CFLAGS+=	-I$S/cddl/contrib/opensolaris/common/zfs
 ZFS_CFLAGS+=	${CDDL_CFLAGS}
 ZFS_ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${ZFS_CFLAGS}
 ZFS_C=		${CC} -c ${ZFS_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC}
 ZFS_S=		${CC} -c ${ZFS_ASM_CFLAGS} ${WERROR} ${.IMPSRC}
 
 # Special flags for managing the compat compiles for DTrace
 DTRACE_CFLAGS=	-DBUILDING_DTRACE ${CDDL_CFLAGS} -I$S/cddl/dev/dtrace -I$S/cddl/dev/dtrace/${MACHINE_CPUARCH}
 .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
 DTRACE_CFLAGS+=	-I$S/cddl/contrib/opensolaris/uts/intel -I$S/cddl/dev/dtrace/x86
 .endif
 DTRACE_CFLAGS+=	-I$S/cddl/contrib/opensolaris/common/util -I$S -DDIS_MEM -DSMP
 DTRACE_ASM_CFLAGS=	-x assembler-with-cpp -DLOCORE ${DTRACE_CFLAGS}
 DTRACE_C=	${CC} -c ${DTRACE_CFLAGS}	${WERROR} ${PROF} ${.IMPSRC}
 DTRACE_S=	${CC} -c ${DTRACE_ASM_CFLAGS}	${WERROR} ${.IMPSRC}
 
 # Special flags for managing the compat compiles for DTrace/FBT
 FBT_CFLAGS=	-DBUILDING_DTRACE -nostdinc -I$S/cddl/dev/fbt/${MACHINE_CPUARCH} -I$S/cddl/dev/fbt -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common -I$S ${CDDL_CFLAGS}
 .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
 FBT_CFLAGS+=	-I$S/cddl/dev/fbt/x86
 .endif
 FBT_C=		${CC} -c ${FBT_CFLAGS}		${WERROR} ${PROF} ${.IMPSRC}
 
 .if ${MK_CTF} != "no"
 NORMAL_CTFCONVERT=	${CTFCONVERT} ${CTFFLAGS} ${.TARGET}
 .elif ${MAKE_VERSION} >= 5201111300
 NORMAL_CTFCONVERT=
 .else
 NORMAL_CTFCONVERT=	@:
 .endif
 
 # Linux Kernel Programming Interface C-flags
 LINUXKPI_INCLUDES=	-I$S/compat/linuxkpi/common/include
 LINUXKPI_C=		${NORMAL_C} ${LINUXKPI_INCLUDES}
 
 # Infiniband C flags.  Correct include paths and omit errors that linux
 # does not honor.
 OFEDINCLUDES=	-I$S/ofed/include -I$S/ofed/include/uapi ${LINUXKPI_INCLUDES}
 OFEDNOERR=	-Wno-cast-qual -Wno-pointer-arith
 OFEDCFLAGS=	${CFLAGS:N-I*} -DCONFIG_INFINIBAND_USER_MEM \
 		${OFEDINCLUDES} ${CFLAGS:M-I*} ${OFEDNOERR}
 OFED_C_NOIMP=	${CC} -c -o ${.TARGET} ${OFEDCFLAGS} ${WERROR} ${PROF}
 OFED_C=		${OFED_C_NOIMP} ${.IMPSRC}
 
 # mlxfw C flags.
 MLXFW_C=	${OFED_C_NOIMP} \
 		-I${SRCTOP}/sys/contrib/xz-embedded/freebsd \
 		-I${SRCTOP}/sys/contrib/xz-embedded/linux/lib/xz \
 		${.IMPSRC}
 
 GEN_CFILES= $S/$M/$M/genassym.c ${MFILES:T:S/.m$/.c/}
 SYSTEM_CFILES= config.c env.c hints.c vnode_if.c
 SYSTEM_DEP= Makefile ${SYSTEM_OBJS}
 SYSTEM_OBJS= locore.o ${MDOBJS} ${OBJS}
 SYSTEM_OBJS+= ${SYSTEM_CFILES:.c=.o}
 SYSTEM_OBJS+= hack.pico
 
 MD_ROOT_SIZE_CONFIGURED!=	grep MD_ROOT_SIZE opt_md.h || true ; echo
 .if ${MFS_IMAGE:Uno} != "no"
 .if empty(MD_ROOT_SIZE_CONFIGURED)
 SYSTEM_OBJS+= embedfs_${MFS_IMAGE:T:R}.o
 .endif
 .endif
 SYSTEM_LD= @${LD} -m ${LD_EMULATION} -Bdynamic -T ${LDSCRIPT} ${_LDFLAGS} \
 	--no-warn-mismatch --warn-common --export-dynamic \
 	--dynamic-linker /red/herring \
 	-o ${.TARGET} -X ${SYSTEM_OBJS} vers.o
 SYSTEM_LD_TAIL= @${OBJCOPY} --strip-symbol gcc2_compiled. ${.TARGET} ; \
 	${SIZE} ${.TARGET} ; chmod 755 ${.TARGET}
 SYSTEM_DEP+= ${LDSCRIPT}
 
 # Calculate path for .m files early, if needed.
 .if !defined(NO_MODULES) && !defined(__MPATH) && !make(install) && \
     (empty(.MAKEFLAGS:M-V) || defined(NO_SKIP_MPATH))
 __MPATH!=find ${S:tA}/ -name \*_if.m
 .endif
 
 # MKMODULESENV is set here so that port makefiles can augment
 # them.
 
 MKMODULESENV+=	MAKEOBJDIRPREFIX=${.OBJDIR}/modules KMODDIR=${KODIR}
 MKMODULESENV+=	MACHINE_CPUARCH=${MACHINE_CPUARCH}
 MKMODULESENV+=	MACHINE=${MACHINE} MACHINE_ARCH=${MACHINE_ARCH}
 MKMODULESENV+=	MODULES_EXTRA="${MODULES_EXTRA}" WITHOUT_MODULES="${WITHOUT_MODULES}"
 MKMODULESENV+=	ARCH_FLAGS="${ARCH_FLAGS}"
 .if (${KERN_IDENT} == LINT)
 MKMODULESENV+=	ALL_MODULES=LINT
 .endif
 .if defined(MODULES_OVERRIDE)
 MKMODULESENV+=	MODULES_OVERRIDE="${MODULES_OVERRIDE}"
 .endif
 .if defined(DEBUG)
 MKMODULESENV+=	DEBUG_FLAGS="${DEBUG}"
 .endif
 .if !defined(NO_MODULES)
 MKMODULESENV+=	__MPATH="${__MPATH}"
 .endif
 
 # Architecture and output format arguments for objcopy to convert image to
 # object file
 
 .if ${MFS_IMAGE:Uno} != "no"
 .if empty(MD_ROOT_SIZE_CONFIGURED)
 .if !defined(EMBEDFS_FORMAT.${MACHINE_ARCH})
 EMBEDFS_FORMAT.${MACHINE_ARCH}!= awk -F'"' '/OUTPUT_FORMAT/ {print $$2}' ${LDSCRIPT}
 .if empty(EMBEDFS_FORMAT.${MACHINE_ARCH})
 .undef EMBEDFS_FORMAT.${MACHINE_ARCH}
 .endif
 .endif
 
 .if !defined(EMBEDFS_ARCH.${MACHINE_ARCH})
 EMBEDFS_ARCH.${MACHINE_ARCH}!= sed -n '/OUTPUT_ARCH/s/.*(\(.*\)).*/\1/p' ${LDSCRIPT}
 .if empty(EMBEDFS_ARCH.${MACHINE_ARCH})
 .undef EMBEDFS_ARCH.${MACHINE_ARCH}
 .endif
 .endif
 
 EMBEDFS_FORMAT.arm?=		elf32-littlearm
 EMBEDFS_FORMAT.armv6?=		elf32-littlearm
 EMBEDFS_FORMAT.armv7?=		elf32-littlearm
 EMBEDFS_FORMAT.aarch64?=	elf64-littleaarch64
 EMBEDFS_FORMAT.mips?=		elf32-tradbigmips
 EMBEDFS_FORMAT.mipsel?=		elf32-tradlittlemips
 EMBEDFS_FORMAT.mips64?=		elf64-tradbigmips
 EMBEDFS_FORMAT.mips64el?=	elf64-tradlittlemips
 EMBEDFS_FORMAT.riscv?=		elf64-littleriscv
 .endif
 .endif
 
 # Detect kernel config options that force stack frames to be turned on.
 DDB_ENABLED!=	grep DDB opt_ddb.h || true ; echo
 DTR_ENABLED!=	grep KDTRACE_FRAME opt_kdtrace.h || true ; echo
 HWPMC_ENABLED!=	grep HWPMC opt_hwpmc_hooks.h || true ; echo
Index: projects/runtime-coverage-v2/sys/conf/kmod.mk
===================================================================
--- projects/runtime-coverage-v2/sys/conf/kmod.mk	(revision 347598)
+++ projects/runtime-coverage-v2/sys/conf/kmod.mk	(revision 347599)
@@ -1,524 +1,524 @@
 #	From: @(#)bsd.prog.mk	5.26 (Berkeley) 6/25/91
 # $FreeBSD$
 #
 # The include file <bsd.kmod.mk> handles building and installing loadable
 # kernel modules.
 #
 #
 # +++ variables +++
 #
 # CLEANFILES	Additional files to remove for the clean and cleandir targets.
 #
 # EXPORT_SYMS	A list of symbols that should be exported from the module,
 #		or the name of a file containing a list of symbols, or YES
 #		to export all symbols.  If not defined, no symbols are
 #		exported.
 #
 # KMOD		The name of the kernel module to build.
 #
 # KMODDIR	Base path for kernel modules (see kld(4)). [/boot/kernel]
 #
 # KMODOWN	Module file owner. [${BINOWN}]
 #
 # KMODGRP	Module file group. [${BINGRP}]
 #
 # KMODMODE	Module file mode. [${BINMODE}]
 #
 # KMODLOAD	Command to load a kernel module [/sbin/kldload]
 #
 # KMODUNLOAD	Command to unload a kernel module [/sbin/kldunload]
 #
 # KMODISLOADED	Command to check whether a kernel module is
 #		loaded [/sbin/kldstat -q -n]
 #
 # PROG		The name of the kernel module to build.
 #		If not supplied, ${KMOD}.ko is used.
 #
 # SRCS		List of source files.
 #
 # FIRMWS	List of firmware images in format filename:shortname:version
 #
 # FIRMWARE_LICENSE
 #		Set to the name of the license the user has to agree on in
 #		order to use this firmware. See /usr/share/doc/legal
 #
 # DESTDIR	The tree where the module gets installed. [not set]
 #
 # KERNBUILDDIR
 #		Set to the location of the kernel build directory where
 #		the opt_*.h files, .o's and kernel winds up.
 #
 # +++ targets +++
 #
 # 	install:
 #               install the kernel module; if the Makefile
 #               does not itself define the target install, the targets
 #               beforeinstall and afterinstall may also be used to cause
 #               actions immediately before and after the install target
 #		is executed.
 #
 # 	load:
 #		Load a module.
 #
 # 	unload:
 #		Unload a module.
 #
 #	reload:
 #		Unload if loaded, then load.
 #
 
 AWK?=		awk
 KMODLOAD?=	/sbin/kldload
 KMODUNLOAD?=	/sbin/kldunload
 KMODISLOADED?=	/sbin/kldstat -q -n
 OBJCOPY?=	objcopy
 
 .include <bsd.init.mk>
 # Grab all the options for a kernel build. For backwards compat, we need to
 # do this after bsd.own.mk.
 .include "kern.opts.mk"
 .include <bsd.compiler.mk>
 .include "config.mk"
 
 # Search for kernel source tree in standard places.
 .if empty(KERNBUILDDIR)
 .if !defined(SYSDIR)
 .for _dir in ${SRCTOP:D${SRCTOP}/sys} \
     ${.CURDIR}/../.. ${.CURDIR}/../../.. /sys /usr/src/sys
 .if !defined(SYSDIR) && exists(${_dir}/kern/)
 SYSDIR=	${_dir:tA}
 .endif
 .endfor
 .endif
 .if !defined(SYSDIR) || !exists(${SYSDIR}/kern/)
 .error "can't find kernel source tree"
 .endif
 .endif
 
 .SUFFIXES: .out .o .c .cc .cxx .C .y .l .s .S .m
 
 # amd64 and mips use direct linking for kmod, all others use shared binaries
 .if ${MACHINE_CPUARCH} != amd64 && ${MACHINE_CPUARCH} != mips
 __KLD_SHARED=yes
 .else
 __KLD_SHARED=no
 .endif
 
 .if !empty(CFLAGS:M-O[23s]) && empty(CFLAGS:M-fno-strict-aliasing)
 CFLAGS+=	-fno-strict-aliasing
 .endif
 WERROR?=	-Werror
 CFLAGS+=	${WERROR}
 CFLAGS+=	-D_KERNEL
 CFLAGS+=	-DKLD_MODULE
 .if defined(MODULE_TIED)
 CFLAGS+=	-DKLD_TIED
 .endif
 
 # Don't use any standard or source-relative include directories.
 NOSTDINC=	-nostdinc
 CFLAGS:=	${CFLAGS:N-I*} ${NOSTDINC} ${INCLMAGIC} ${CFLAGS:M-I*}
 .if defined(KERNBUILDDIR)
 CFLAGS+=	-DHAVE_KERNEL_OPTION_HEADERS -include ${KERNBUILDDIR}/opt_global.h
 .endif
 
 # Add -I paths for system headers.  Individual module makefiles don't
 # need any -I paths for this.  Similar defaults for .PATH can't be
 # set because there are no standard paths for non-headers.
 CFLAGS+=	-I. -I${SYSDIR} -I${SYSDIR}/contrib/ck/include
 
 CFLAGS.gcc+=	-finline-limit=${INLINE_LIMIT}
 CFLAGS.gcc+=	-fms-extensions
 CFLAGS.gcc+= --param inline-unit-growth=100
 CFLAGS.gcc+= --param large-function-growth=1000
 
 # Disallow common variables, and if we end up with commons from
 # somewhere unexpected, allocate storage for them in the module itself.
 CFLAGS+=	-fno-common
 LDFLAGS+=	-d -warn-common
 
 .if defined(LINKER_FEATURES) && ${LINKER_FEATURES:Mbuild-id}
-LDFLAGS+=	-Wl,--build-id=sha1
+LDFLAGS+=	--build-id=sha1
 .endif
 
 CFLAGS+=	${DEBUG_FLAGS}
 .if ${MACHINE_CPUARCH} == amd64
 CFLAGS+=	-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer
 .endif
 
 .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "riscv"
 CFLAGS+=	-fPIC
 .endif
 
 # Temporary workaround for PR 196407, which contains the fascinating details.
 # Don't allow clang to use fpu instructions or registers in kernel modules.
 .if ${MACHINE_CPUARCH} == arm
 .if ${COMPILER_VERSION} < 30800
 CFLAGS.clang+=	-mllvm -arm-use-movt=0
 .else
 CFLAGS.clang+=	-mno-movt
 .endif
 CFLAGS.clang+=	-mfpu=none
 CFLAGS+=	-funwind-tables
 .endif
 
 .if ${MACHINE_CPUARCH} == powerpc
 CFLAGS+=	-mlongcall -fno-omit-frame-pointer
 .endif
 
 .if ${MACHINE_CPUARCH} == mips
 CFLAGS+=	-G0 -fno-pic -mno-abicalls -mlong-calls
 .endif
 
 .if defined(DEBUG) || defined(DEBUG_FLAGS)
 CTFFLAGS+=	-g
 .endif
 
 .if defined(FIRMWS)
 ${KMOD:S/$/.c/}: ${SYSDIR}/tools/fw_stub.awk
 	${AWK} -f ${SYSDIR}/tools/fw_stub.awk ${FIRMWS} -m${KMOD} -c${KMOD:S/$/.c/g} \
 	    ${FIRMWARE_LICENSE:C/.+/-l/}${FIRMWARE_LICENSE}
 
 SRCS+=	${KMOD:S/$/.c/}
 CLEANFILES+=	${KMOD:S/$/.c/}
 
 .for _firmw in ${FIRMWS}
 ${_firmw:C/\:.*$/.fwo/:T}:	${_firmw:C/\:.*$//}
 	@${ECHO} ${_firmw:C/\:.*$//} ${.ALLSRC:M*${_firmw:C/\:.*$//}}
 	@if [ -e ${_firmw:C/\:.*$//} ]; then			\
 		${LD} -b binary --no-warn-mismatch ${_LDFLAGS}	\
 		    -m ${LD_EMULATION} -r -d			\
 		    -o ${.TARGET} ${_firmw:C/\:.*$//};		\
 	else							\
 		ln -s ${.ALLSRC:M*${_firmw:C/\:.*$//}} ${_firmw:C/\:.*$//}; \
 		${LD} -b binary --no-warn-mismatch ${_LDFLAGS}	\
 		    -m ${LD_EMULATION} -r -d			\
 		    -o ${.TARGET} ${_firmw:C/\:.*$//};		\
 		rm ${_firmw:C/\:.*$//};				\
 	fi
 
 OBJS+=	${_firmw:C/\:.*$/.fwo/:T}
 .endfor
 .endif
 
 # Conditionally include SRCS based on kernel config options.
 .for _o in ${KERN_OPTS}
 SRCS+=${SRCS.${_o}}
 .endfor
 
 OBJS+=	${SRCS:N*.h:R:S/$/.o/g}
 
 .if !defined(PROG)
 PROG=	${KMOD}.ko
 .endif
 
 .if !defined(DEBUG_FLAGS)
 FULLPROG=	${PROG}
 .else
 FULLPROG=	${PROG}.full
 ${PROG}: ${FULLPROG} ${PROG}.debug
 	${OBJCOPY} --strip-debug --add-gnu-debuglink=${PROG}.debug \
 	    ${FULLPROG} ${.TARGET}
 ${PROG}.debug: ${FULLPROG}
 	${OBJCOPY} --only-keep-debug ${FULLPROG} ${.TARGET}
 .endif
 
 .if ${__KLD_SHARED} == yes
 ${FULLPROG}: ${KMOD}.kld
 	${LD} -m ${LD_EMULATION} -Bshareable -znotext ${_LDFLAGS} \
 	    -o ${.TARGET} ${KMOD}.kld
 .if !defined(DEBUG_FLAGS)
 	${OBJCOPY} --strip-debug ${.TARGET}
 .endif
 .endif
 
 EXPORT_SYMS?=	NO
 .if ${EXPORT_SYMS} != YES
 CLEANFILES+=	export_syms
 .endif
 
 .if ${__KLD_SHARED} == yes
 ${KMOD}.kld: ${OBJS}
 .else
 ${FULLPROG}: ${OBJS}
 .endif
 	${LD} -m ${LD_EMULATION} ${_LDFLAGS} -r -d -o ${.TARGET} ${OBJS}
 .if ${MK_CTF} != "no"
 	${CTFMERGE} ${CTFFLAGS} -o ${.TARGET} ${OBJS}
 .endif
 .if defined(EXPORT_SYMS)
 .if ${EXPORT_SYMS} != YES
 .if ${EXPORT_SYMS} == NO
 	:> export_syms
 .elif !exists(${.CURDIR}/${EXPORT_SYMS})
 	echo -n "${EXPORT_SYMS:@s@$s${.newline}@}" > export_syms
 .else
 	grep -v '^#' < ${EXPORT_SYMS} > export_syms
 .endif
 	${AWK} -f ${SYSDIR}/conf/kmod_syms.awk ${.TARGET} \
 	    export_syms | xargs -J% ${OBJCOPY} % ${.TARGET}
 .endif
 .endif # defined(EXPORT_SYMS)
 .if defined(PREFIX_SYMS)
 	${AWK} -v prefix=${PREFIX_SYMS} -f ${SYSDIR}/conf/kmod_syms_prefix.awk \
 	    ${.TARGET} /dev/null | xargs -J% ${OBJCOPY} % ${.TARGET}
 .endif
 .if !defined(DEBUG_FLAGS) && ${__KLD_SHARED} == no
 	${OBJCOPY} --strip-debug ${.TARGET}
 .endif
 
 .if ${COMPILER_TYPE} == "clang" || \
     (${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} >= 60000)
 _MAP_DEBUG_PREFIX= yes
 .endif
 
 _ILINKS=machine
 .if ${MACHINE} != ${MACHINE_CPUARCH} && ${MACHINE} != "arm64"
 _ILINKS+=${MACHINE_CPUARCH}
 .endif
 .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64"
 _ILINKS+=x86
 .endif
 CLEANFILES+=${_ILINKS}
 
 all: ${PROG}
 
 beforedepend: ${_ILINKS}
 beforebuild: ${_ILINKS}
 
 # Ensure that the links exist without depending on it when it exists which
 # causes all the modules to be rebuilt when the directory pointed to changes.
 # Ensure that debug info references the path in the source tree.
 .for _link in ${_ILINKS}
 .if !exists(${.OBJDIR}/${_link})
 OBJS_DEPEND_GUESS+=	${_link}
 .endif
 .if defined(_MAP_DEBUG_PREFIX)
 .if ${_link} == "machine"
 CFLAGS+= -fdebug-prefix-map=./machine=${SYSDIR}/${MACHINE}/include
 .else
 CFLAGS+= -fdebug-prefix-map=./${_link}=${SYSDIR}/${_link}/include
 .endif
 .endif
 .endfor
 
 .NOPATH: ${_ILINKS}
 
 ${_ILINKS}:
 	@case ${.TARGET} in \
 	machine) \
 		path=${SYSDIR}/${MACHINE}/include ;; \
 	*) \
 		path=${SYSDIR}/${.TARGET:T}/include ;; \
 	esac ; \
 	path=`(cd $$path && /bin/pwd)` ; \
 	${ECHO} ${.TARGET:T} "->" $$path ; \
 	ln -fns $$path ${.TARGET:T}
 
 CLEANFILES+= ${PROG} ${KMOD}.kld ${OBJS}
 
 .if defined(DEBUG_FLAGS)
 CLEANFILES+= ${FULLPROG} ${PROG}.debug
 .endif
 
 .if !target(install)
 
 _INSTALLFLAGS:=	${INSTALLFLAGS}
 .for ie in ${INSTALLFLAGS_EDIT}
 _INSTALLFLAGS:=	${_INSTALLFLAGS${ie}}
 .endfor
 
 .if !target(realinstall)
 KERN_DEBUGDIR?=	${DEBUGDIR}
 realinstall: _kmodinstall
 .ORDER: beforeinstall _kmodinstall
 _kmodinstall: .PHONY
 	${INSTALL} -T release -o ${KMODOWN} -g ${KMODGRP} -m ${KMODMODE} \
 	    ${_INSTALLFLAGS} ${PROG} ${DESTDIR}${KMODDIR}/
 .if defined(DEBUG_FLAGS) && !defined(INSTALL_NODEBUG) && ${MK_KERNEL_SYMBOLS} != "no"
 	${INSTALL} -T debug -o ${KMODOWN} -g ${KMODGRP} -m ${KMODMODE} \
 	    ${_INSTALLFLAGS} ${PROG}.debug ${DESTDIR}${KERN_DEBUGDIR}${KMODDIR}/
 .endif
 
 .include <bsd.links.mk>
 
 .if !defined(NO_XREF)
 afterinstall: _kldxref
 .ORDER: realinstall _kldxref
 .ORDER: _installlinks _kldxref
 _kldxref: .PHONY
 	@if type kldxref >/dev/null 2>&1; then \
 		${ECHO} kldxref ${DESTDIR}${KMODDIR}; \
 		kldxref ${DESTDIR}${KMODDIR}; \
 	fi
 .endif
 .endif # !target(realinstall)
 
 .endif # !target(install)
 
 .if !target(load)
 load: ${PROG} .PHONY
 	${KMODLOAD} -v ${.OBJDIR}/${PROG}
 .endif
 
 .if !target(unload)
 unload: .PHONY
 	if ${KMODISLOADED} ${PROG} ; then ${KMODUNLOAD} -v ${PROG} ; fi
 .endif
 
 .if !target(reload)
 reload: unload load .PHONY
 .endif
 
 .if defined(KERNBUILDDIR)
 .PATH: ${KERNBUILDDIR}
 CFLAGS+=	-I${KERNBUILDDIR}
 .for _src in ${SRCS:Mopt_*.h}
 CLEANFILES+=	${_src}
 .if !target(${_src})
 ${_src}:
 	ln -sf ${KERNBUILDDIR}/${_src} ${.TARGET}
 .endif
 .endfor
 .else
 .for _src in ${SRCS:Mopt_*.h}
 CLEANFILES+=	${_src}
 .if !target(${_src})
 ${_src}:
 	:> ${.TARGET}
 .endif
 .endfor
 .endif
 
 # Add the sanitizer C flags
 CFLAGS+=	${SAN_CFLAGS}
 
 # Add the gcov flags
 CFLAGS+=	${GCOV_CFLAGS}
 
 # Respect configuration-specific C flags.
 CFLAGS+=	${ARCH_FLAGS} ${CONF_CFLAGS}
 
 .if !empty(SRCS:Mvnode_if.c)
 CLEANFILES+=	vnode_if.c
 vnode_if.c: ${SYSDIR}/tools/vnode_if.awk ${SYSDIR}/kern/vnode_if.src
 	${AWK} -f ${SYSDIR}/tools/vnode_if.awk ${SYSDIR}/kern/vnode_if.src -c
 .endif
 
 .if !empty(SRCS:Mvnode_if.h)
 CLEANFILES+=	vnode_if.h vnode_if_newproto.h vnode_if_typedef.h
 vnode_if.h vnode_if_newproto.h vnode_if_typedef.h: ${SYSDIR}/tools/vnode_if.awk \
     ${SYSDIR}/kern/vnode_if.src
 vnode_if.h: vnode_if_newproto.h vnode_if_typedef.h
 	${AWK} -f ${SYSDIR}/tools/vnode_if.awk ${SYSDIR}/kern/vnode_if.src -h
 vnode_if_newproto.h:
 	${AWK} -f ${SYSDIR}/tools/vnode_if.awk ${SYSDIR}/kern/vnode_if.src -p
 vnode_if_typedef.h:
 	${AWK} -f ${SYSDIR}/tools/vnode_if.awk ${SYSDIR}/kern/vnode_if.src -q
 .endif
 
 # Build _if.[ch] from _if.m, and clean them when we're done.
 # __MPATH defined in config.mk
 _MFILES=${__MPATH:T:O}
 _MPATH=${__MPATH:H:O:u}
 .PATH.m: ${_MPATH}
 .for _i in ${SRCS:M*_if.[ch]}
 _MATCH=M${_i:R:S/$/.m/}
 _MATCHES=${_MFILES:${_MATCH}}
 .if !empty(_MATCHES)
 CLEANFILES+=	${_i}
 .endif
 .endfor # _i
 .m.c:	${SYSDIR}/tools/makeobjops.awk
 	${AWK} -f ${SYSDIR}/tools/makeobjops.awk ${.IMPSRC} -c
 
 .m.h:	${SYSDIR}/tools/makeobjops.awk
 	${AWK} -f ${SYSDIR}/tools/makeobjops.awk ${.IMPSRC} -h
 
 .for _i in mii pccard
 .if !empty(SRCS:M${_i}devs.h)
 CLEANFILES+=	${_i}devs.h
 ${_i}devs.h: ${SYSDIR}/tools/${_i}devs2h.awk ${SYSDIR}/dev/${_i}/${_i}devs
 	${AWK} -f ${SYSDIR}/tools/${_i}devs2h.awk ${SYSDIR}/dev/${_i}/${_i}devs
 .endif
 .endfor # _i
 
 .if !empty(SRCS:Mbhnd_nvram_map.h)
 CLEANFILES+=	bhnd_nvram_map.h
 bhnd_nvram_map.h: ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.awk \
     ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.sh \
     ${SYSDIR}/dev/bhnd/nvram/nvram_map
 bhnd_nvram_map.h:
 	sh ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.sh \
 	    ${SYSDIR}/dev/bhnd/nvram/nvram_map -h
 .endif
 
 .if !empty(SRCS:Mbhnd_nvram_map_data.h)
 CLEANFILES+=	bhnd_nvram_map_data.h
 bhnd_nvram_map_data.h: ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.awk \
     ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.sh \
     ${SYSDIR}/dev/bhnd/nvram/nvram_map
 bhnd_nvram_map_data.h:
 	sh ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.sh \
 	    ${SYSDIR}/dev/bhnd/nvram/nvram_map -d
 .endif
 
 .if !empty(SRCS:Musbdevs.h)
 CLEANFILES+=	usbdevs.h
 usbdevs.h: ${SYSDIR}/tools/usbdevs2h.awk ${SYSDIR}/dev/usb/usbdevs
 	${AWK} -f ${SYSDIR}/tools/usbdevs2h.awk ${SYSDIR}/dev/usb/usbdevs -h
 .endif
 
 .if !empty(SRCS:Musbdevs_data.h)
 CLEANFILES+=	usbdevs_data.h
 usbdevs_data.h: ${SYSDIR}/tools/usbdevs2h.awk ${SYSDIR}/dev/usb/usbdevs
 	${AWK} -f ${SYSDIR}/tools/usbdevs2h.awk ${SYSDIR}/dev/usb/usbdevs -d
 .endif
 
 .if !empty(SRCS:Macpi_quirks.h)
 CLEANFILES+=	acpi_quirks.h
 acpi_quirks.h: ${SYSDIR}/tools/acpi_quirks2h.awk ${SYSDIR}/dev/acpica/acpi_quirks
 	${AWK} -f ${SYSDIR}/tools/acpi_quirks2h.awk ${SYSDIR}/dev/acpica/acpi_quirks
 .endif
 
 .if !empty(SRCS:Massym.inc) || !empty(DPSRCS:Massym.inc)
 CLEANFILES+=	assym.inc
 DEPENDOBJS+=	genassym.o
 DPSRCS+=	offset.inc
 .endif
 .if defined(MODULE_TIED)
 DPSRCS+=	offset.inc
 .endif
 .if !empty(SRCS:Moffset.inc) || !empty(DPSRCS:Moffset.inc)
 CLEANFILES+=	offset.inc genoffset.o
 DEPENDOBJS+=	genoffset.o
 .endif
 assym.inc: genassym.o
 offset.inc: genoffset.o
 assym.inc: ${SYSDIR}/kern/genassym.sh
 	sh ${SYSDIR}/kern/genassym.sh genassym.o > ${.TARGET}
 genassym.o: ${SYSDIR}/${MACHINE}/${MACHINE}/genassym.c offset.inc
 genassym.o: ${SRCS:Mopt_*.h}
 	${CC} -c ${CFLAGS:N-flto:N-fno-common} \
 	    ${SYSDIR}/${MACHINE}/${MACHINE}/genassym.c
 offset.inc: ${SYSDIR}/kern/genoffset.sh genoffset.o
 	sh ${SYSDIR}/kern/genoffset.sh genoffset.o > ${.TARGET}
 genoffset.o: ${SYSDIR}/kern/genoffset.c
 genoffset.o: ${SRCS:Mopt_*.h}
 	${CC} -c ${CFLAGS:N-flto:N-fno-common} \
 	    ${SYSDIR}/kern/genoffset.c
 
 CLEANDEPENDFILES+=	${_ILINKS}
 # .depend needs include links so we remove them only together.
 cleanilinks:
 	rm -f ${_ILINKS}
 
 OBJS_DEPEND_GUESS+= ${SRCS:M*.h}
 .if defined(KERNBUILDDIR)
 OBJS_DEPEND_GUESS+= opt_global.h
 .endif
 
 .include <bsd.dep.mk>
 .include <bsd.clang-analyze.mk>
 .include <bsd.obj.mk>
 .include "kern.mk"
Index: projects/runtime-coverage-v2/sys/dev/cpuctl/cpuctl.c
===================================================================
--- projects/runtime-coverage-v2/sys/dev/cpuctl/cpuctl.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/dev/cpuctl/cpuctl.c	(revision 347599)
@@ -1,597 +1,598 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2006-2008 Stanislav Sedov <stas@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/ioccom.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sched.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/pcpu.h>
 #include <sys/smp.h>
 #include <sys/pmckern.h>
 #include <sys/cpuctl.h>
 
 #include <machine/cpufunc.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #include <x86/ucode.h>
 
 static d_open_t cpuctl_open;
 static d_ioctl_t cpuctl_ioctl;
 
 #define	CPUCTL_VERSION 1
 
 #ifdef CPUCTL_DEBUG
 # define	DPRINTF(format,...) printf(format, __VA_ARGS__);
 #else
 # define	DPRINTF(...)
 #endif
 
 #define	UCODE_SIZE_MAX	(4 * 1024 * 1024)
 
 static int cpuctl_do_msr(int cpu, cpuctl_msr_args_t *data, u_long cmd,
     struct thread *td);
 static int cpuctl_do_cpuid(int cpu, cpuctl_cpuid_args_t *data,
     struct thread *td);
 static int cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data,
     struct thread *td);
 static int cpuctl_do_eval_cpu_features(int cpu, struct thread *td);
 static int cpuctl_do_update(int cpu, cpuctl_update_args_t *data,
     struct thread *td);
 static int update_intel(int cpu, cpuctl_update_args_t *args,
     struct thread *td);
 static int update_amd(int cpu, cpuctl_update_args_t *args, struct thread *td);
 static int update_via(int cpu, cpuctl_update_args_t *args,
     struct thread *td);
 
 static struct cdev **cpuctl_devs;
 static MALLOC_DEFINE(M_CPUCTL, "cpuctl", "CPUCTL buffer");
 
 static struct cdevsw cpuctl_cdevsw = {
         .d_version =    D_VERSION,
         .d_open =       cpuctl_open,
         .d_ioctl =      cpuctl_ioctl,
         .d_name =       "cpuctl",
 };
 
 /*
  * This function checks if specified cpu enabled or not.
  */
 static int
 cpu_enabled(int cpu)
 {
 
 	return (pmc_cpu_is_disabled(cpu) == 0);
 }
 
 /*
  * Check if the current thread is bound to a specific cpu.
  */
 static int
 cpu_sched_is_bound(struct thread *td)
 {
 	int ret;
 
 	thread_lock(td);
 	ret = sched_is_bound(td);
 	thread_unlock(td);
 	return (ret);
 }
 
 /*
  * Switch to target cpu to run.
  */
 static void
 set_cpu(int cpu, struct thread *td)
 {
 
 	KASSERT(cpu >= 0 && cpu <= mp_maxid && cpu_enabled(cpu),
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 	thread_lock(td);
 	sched_bind(td, cpu);
 	thread_unlock(td);
 	KASSERT(td->td_oncpu == cpu,
 	    ("[cpuctl,%d]: cannot bind to target cpu %d on cpu %d", __LINE__,
 	    cpu, td->td_oncpu));
 }
 
 static void
 restore_cpu(int oldcpu, int is_bound, struct thread *td)
 {
 
 	KASSERT(oldcpu >= 0 && oldcpu <= mp_maxid && cpu_enabled(oldcpu),
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, oldcpu));
 	thread_lock(td);
 	if (is_bound == 0)
 		sched_unbind(td);
 	else
 		sched_bind(td, oldcpu);
 	thread_unlock(td);
 }
 
 int
 cpuctl_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
     int flags, struct thread *td)
 {
 	int cpu, ret;
 
 	cpu = dev2unit(dev);
 	if (cpu > mp_maxid || !cpu_enabled(cpu)) {
 		DPRINTF("[cpuctl,%d]: bad cpu number %d\n", __LINE__, cpu);
 		return (ENXIO);
 	}
 	/* Require write flag for "write" requests. */
 	if ((cmd == CPUCTL_MSRCBIT || cmd == CPUCTL_MSRSBIT ||
 	    cmd == CPUCTL_UPDATE || cmd == CPUCTL_WRMSR ||
 	    cmd == CPUCTL_EVAL_CPU_FEATURES) &&
 	    (flags & FWRITE) == 0)
 		return (EPERM);
 	switch (cmd) {
 	case CPUCTL_RDMSR:
 		ret = cpuctl_do_msr(cpu, (cpuctl_msr_args_t *)data, cmd, td);
 		break;
 	case CPUCTL_MSRSBIT:
 	case CPUCTL_MSRCBIT:
 	case CPUCTL_WRMSR:
 		ret = priv_check(td, PRIV_CPUCTL_WRMSR);
 		if (ret != 0)
 			goto fail;
 		ret = cpuctl_do_msr(cpu, (cpuctl_msr_args_t *)data, cmd, td);
 		break;
 	case CPUCTL_CPUID:
 		ret = cpuctl_do_cpuid(cpu, (cpuctl_cpuid_args_t *)data, td);
 		break;
 	case CPUCTL_UPDATE:
 		ret = priv_check(td, PRIV_CPUCTL_UPDATE);
 		if (ret != 0)
 			goto fail;
 		ret = cpuctl_do_update(cpu, (cpuctl_update_args_t *)data, td);
 		break;
 	case CPUCTL_CPUID_COUNT:
 		ret = cpuctl_do_cpuid_count(cpu,
 		    (cpuctl_cpuid_count_args_t *)data, td);
 		break;
 	case CPUCTL_EVAL_CPU_FEATURES:
 		ret = cpuctl_do_eval_cpu_features(cpu, td);
 		break;
 	default:
 		ret = EINVAL;
 		break;
 	}
 fail:
 	return (ret);
 }
 
 /*
  * Actually perform cpuid operation.
  */
 static int
 cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data,
     struct thread *td)
 {
 	int is_bound = 0;
 	int oldcpu;
 
 	KASSERT(cpu >= 0 && cpu <= mp_maxid,
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 
 	/* Explicitly clear cpuid data to avoid returning stale info. */
 	bzero(data->data, sizeof(data->data));
 	DPRINTF("[cpuctl,%d]: retrieving cpuid lev %#0x type %#0x for %d cpu\n",
 	    __LINE__, data->level, data->level_type, cpu);
 #ifdef __i386__
 	if (cpu_id == 0)
 		return (ENODEV);
 #endif
 	oldcpu = td->td_oncpu;
 	is_bound = cpu_sched_is_bound(td);
 	set_cpu(cpu, td);
 	cpuid_count(data->level, data->level_type, data->data);
 	restore_cpu(oldcpu, is_bound, td);
 	return (0);
 }
 
 static int
 cpuctl_do_cpuid(int cpu, cpuctl_cpuid_args_t *data, struct thread *td)
 {
 	cpuctl_cpuid_count_args_t cdata;
 	int error;
 
 	cdata.level = data->level;
 	/* Override the level type. */
 	cdata.level_type = 0;
 	error = cpuctl_do_cpuid_count(cpu, &cdata, td);
 	bcopy(cdata.data, data->data, sizeof(data->data)); /* Ignore error */
 	return (error);
 }
 
 /*
  * Actually perform MSR operations.
  */
 static int
 cpuctl_do_msr(int cpu, cpuctl_msr_args_t *data, u_long cmd, struct thread *td)
 {
 	uint64_t reg;
 	int is_bound = 0;
 	int oldcpu;
 	int ret;
 
 	KASSERT(cpu >= 0 && cpu <= mp_maxid,
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 
 	/*
 	 * Explicitly clear cpuid data to avoid returning stale
 	 * info
 	 */
 	DPRINTF("[cpuctl,%d]: operating on MSR %#0x for %d cpu\n", __LINE__,
 	    data->msr, cpu);
 #ifdef __i386__
 	if ((cpu_feature & CPUID_MSR) == 0)
 		return (ENODEV);
 #endif
 	oldcpu = td->td_oncpu;
 	is_bound = cpu_sched_is_bound(td);
 	set_cpu(cpu, td);
 	if (cmd == CPUCTL_RDMSR) {
 		data->data = 0;
 		ret = rdmsr_safe(data->msr, &data->data);
 	} else if (cmd == CPUCTL_WRMSR) {
 		ret = wrmsr_safe(data->msr, data->data);
 	} else if (cmd == CPUCTL_MSRSBIT) {
 		critical_enter();
 		ret = rdmsr_safe(data->msr, &reg);
 		if (ret == 0)
 			ret = wrmsr_safe(data->msr, reg | data->data);
 		critical_exit();
 	} else if (cmd == CPUCTL_MSRCBIT) {
 		critical_enter();
 		ret = rdmsr_safe(data->msr, &reg);
 		if (ret == 0)
 			ret = wrmsr_safe(data->msr, reg & ~data->data);
 		critical_exit();
 	} else
 		panic("[cpuctl,%d]: unknown operation requested: %lu",
 		    __LINE__, cmd);
 	restore_cpu(oldcpu, is_bound, td);
 	return (ret);
 }
 
 /*
  * Actually perform microcode update.
  */
 static int
 cpuctl_do_update(int cpu, cpuctl_update_args_t *data, struct thread *td)
 {
 	cpuctl_cpuid_args_t args = {
 		.level = 0,
 	};
 	char vendor[13];
 	int ret;
 
 	KASSERT(cpu >= 0 && cpu <= mp_maxid,
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 	DPRINTF("[cpuctl,%d]: XXX %d", __LINE__, cpu);
 
 	ret = cpuctl_do_cpuid(cpu, &args, td);
 	if (ret != 0)
 		return (ret);
 	((uint32_t *)vendor)[0] = args.data[1];
 	((uint32_t *)vendor)[1] = args.data[3];
 	((uint32_t *)vendor)[2] = args.data[2];
 	vendor[12] = '\0';
 	if (strncmp(vendor, INTEL_VENDOR_ID, sizeof(INTEL_VENDOR_ID)) == 0)
 		ret = update_intel(cpu, data, td);
 	else if(strncmp(vendor, AMD_VENDOR_ID, sizeof(AMD_VENDOR_ID)) == 0)
 		ret = update_amd(cpu, data, td);
 	else if(strncmp(vendor, CENTAUR_VENDOR_ID, sizeof(CENTAUR_VENDOR_ID))
 	    == 0)
 		ret = update_via(cpu, data, td);
 	else
 		ret = ENXIO;
 	return (ret);
 }
 
 struct ucode_update_data {
 	void *ptr;
 	int cpu;
 	int ret;
 };
 
 static void
 ucode_intel_load_rv(void *arg)
 {
 	struct ucode_update_data *d;
 
 	d = arg;
 	if (PCPU_GET(cpuid) == d->cpu)
 		d->ret = ucode_intel_load(d->ptr, true, NULL, NULL);
 }
 
 static int
 update_intel(int cpu, cpuctl_update_args_t *args, struct thread *td)
 {
 	struct ucode_update_data d;
 	void *ptr;
 	int is_bound, oldcpu, ret;
 
 	if (args->size == 0 || args->data == NULL) {
 		DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__);
 		return (EINVAL);
 	}
 	if (args->size > UCODE_SIZE_MAX) {
 		DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__);
 		return (EINVAL);
 	}
 
 	/*
 	 * 16 byte alignment required.  Rely on the fact that
 	 * malloc(9) always returns the pointer aligned at least on
 	 * the size of the allocation.
 	 */
 	ptr = malloc(args->size + 16, M_CPUCTL, M_WAITOK);
 	if (copyin(args->data, ptr, args->size) != 0) {
 		DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed",
 		    __LINE__, args->data, ptr, args->size);
 		ret = EFAULT;
 		goto out;
 	}
 	oldcpu = td->td_oncpu;
 	is_bound = cpu_sched_is_bound(td);
 	set_cpu(cpu, td);
 	d.ptr = ptr;
 	d.cpu = cpu;
 	smp_rendezvous(NULL, ucode_intel_load_rv, NULL, &d);
 	restore_cpu(oldcpu, is_bound, td);
 	ret = d.ret;
 
 	/*
 	 * Replace any existing update.  This ensures that the new update
 	 * will be reloaded automatically during ACPI resume.
 	 */
 	if (ret == 0)
 		ptr = ucode_update(ptr);
 
 out:
 	free(ptr, M_CPUCTL);
 	return (ret);
 }
 
 /*
  * NB: MSR 0xc0010020, MSR_K8_UCODE_UPDATE, is not documented by AMD.
  * Coreboot, illumos and Linux source code was used to understand
  * its workings.
  */
 static void
 amd_ucode_wrmsr(void *ucode_ptr)
 {
 	uint32_t tmp[4];
 
 	wrmsr_safe(MSR_K8_UCODE_UPDATE, (uintptr_t)ucode_ptr);
 	do_cpuid(0, tmp);
 }
 
 static int
 update_amd(int cpu, cpuctl_update_args_t *args, struct thread *td)
 {
 	void *ptr;
 	int ret;
 
 	if (args->size == 0 || args->data == NULL) {
 		DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__);
 		return (EINVAL);
 	}
 	if (args->size > UCODE_SIZE_MAX) {
 		DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__);
 		return (EINVAL);
 	}
 
 	/*
 	 * 16 byte alignment required.  Rely on the fact that
 	 * malloc(9) always returns the pointer aligned at least on
 	 * the size of the allocation.
 	 */
 	ptr = malloc(args->size + 16, M_CPUCTL, M_ZERO | M_WAITOK);
 	if (copyin(args->data, ptr, args->size) != 0) {
 		DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed",
 		    __LINE__, args->data, ptr, args->size);
 		ret = EFAULT;
 		goto fail;
 	}
 	smp_rendezvous(NULL, amd_ucode_wrmsr, NULL, ptr);
 	ret = 0;
 fail:
 	free(ptr, M_CPUCTL);
 	return (ret);
 }
 
 static int
 update_via(int cpu, cpuctl_update_args_t *args, struct thread *td)
 {
 	void *ptr;
 	uint64_t rev0, rev1, res;
 	uint32_t tmp[4];
 	int is_bound;
 	int oldcpu;
 	int ret;
 
 	if (args->size == 0 || args->data == NULL) {
 		DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__);
 		return (EINVAL);
 	}
 	if (args->size > UCODE_SIZE_MAX) {
 		DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__);
 		return (EINVAL);
 	}
 
 	/*
 	 * 4 byte alignment required.
 	 */
 	ptr = malloc(args->size, M_CPUCTL, M_WAITOK);
 	if (copyin(args->data, ptr, args->size) != 0) {
 		DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed",
 		    __LINE__, args->data, ptr, args->size);
 		ret = EFAULT;
 		goto fail;
 	}
 	oldcpu = td->td_oncpu;
 	is_bound = cpu_sched_is_bound(td);
 	set_cpu(cpu, td);
 	critical_enter();
 	rdmsr_safe(MSR_BIOS_SIGN, &rev0); /* Get current microcode revision. */
 
 	/*
 	 * Perform update.
 	 */
 	wrmsr_safe(MSR_BIOS_UPDT_TRIG, (uintptr_t)(ptr));
 	do_cpuid(1, tmp);
 
 	/*
 	 * Result are in low byte of MSR FCR5:
 	 * 0x00: No update has been attempted since RESET.
 	 * 0x01: The last attempted update was successful.
 	 * 0x02: The last attempted update was unsuccessful due to a bad
 	 *       environment. No update was loaded and any preexisting
 	 *       patches are still active.
 	 * 0x03: The last attempted update was not applicable to this processor.
 	 *       No update was loaded and any preexisting patches are still
 	 *       active.
 	 * 0x04: The last attempted update was not successful due to an invalid
 	 *       update data block. No update was loaded and any preexisting
 	 *       patches are still active
 	 */
 	rdmsr_safe(0x1205, &res);
 	res &= 0xff;
 	critical_exit();
 	rdmsr_safe(MSR_BIOS_SIGN, &rev1); /* Get new microcode revision. */
 	restore_cpu(oldcpu, is_bound, td);
 
 	DPRINTF("[cpu,%d]: rev0=%x rev1=%x res=%x\n", __LINE__,
 	    (unsigned)(rev0 >> 32), (unsigned)(rev1 >> 32), (unsigned)res);
 
 	if (res != 0x01)
 		ret = EINVAL;
 	else
 		ret = 0;
 fail:
 	free(ptr, M_CPUCTL);
 	return (ret);
 }
 
 static int
 cpuctl_do_eval_cpu_features(int cpu, struct thread *td)
 {
 	int is_bound = 0;
 	int oldcpu;
 
 	KASSERT(cpu >= 0 && cpu <= mp_maxid,
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 
 #ifdef __i386__
 	if (cpu_id == 0)
 		return (ENODEV);
 #endif
 	oldcpu = td->td_oncpu;
 	is_bound = cpu_sched_is_bound(td);
 	set_cpu(cpu, td);
 	identify_cpu1();
 	identify_cpu2();
 	hw_ibrs_recalculate();
 	restore_cpu(oldcpu, is_bound, td);
 	hw_ssb_recalculate(true);
 #ifdef __amd64__
 	amd64_syscall_ret_flush_l1d_recalc();
 #endif
+	hw_mds_recalculate();
 	printcpuinfo();
 	return (0);
 }
 
 
 int
 cpuctl_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
 {
 	int ret = 0;
 	int cpu;
 
 	cpu = dev2unit(dev);
 	if (cpu > mp_maxid || !cpu_enabled(cpu)) {
 		DPRINTF("[cpuctl,%d]: incorrect cpu number %d\n", __LINE__,
 		    cpu);
 		return (ENXIO);
 	}
 	if (flags & FWRITE)
 		ret = securelevel_gt(td->td_ucred, 0);
 	return (ret);
 }
 
 static int
 cpuctl_modevent(module_t mod __unused, int type, void *data __unused)
 {
 	int cpu;
 
 	switch(type) {
 	case MOD_LOAD:
 		if (bootverbose)
 			printf("cpuctl: access to MSR registers/cpuid info.\n");
 		cpuctl_devs = malloc(sizeof(*cpuctl_devs) * (mp_maxid + 1), M_CPUCTL,
 		    M_WAITOK | M_ZERO);
 		CPU_FOREACH(cpu)
 			if (cpu_enabled(cpu))
 				cpuctl_devs[cpu] = make_dev(&cpuctl_cdevsw, cpu,
 				    UID_ROOT, GID_KMEM, 0640, "cpuctl%d", cpu);
 		break;
 	case MOD_UNLOAD:
 		CPU_FOREACH(cpu) {
 			if (cpuctl_devs[cpu] != NULL)
 				destroy_dev(cpuctl_devs[cpu]);
 		}
 		free(cpuctl_devs, M_CPUCTL);
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		return (EOPNOTSUPP);
         }
 	return (0);
 }
 
 DEV_MODULE(cpuctl, cpuctl_modevent, NULL);
 MODULE_VERSION(cpuctl, CPUCTL_VERSION);
Index: projects/runtime-coverage-v2/sys/i386/i386/exception.s
===================================================================
--- projects/runtime-coverage-v2/sys/i386/i386/exception.s	(revision 347598)
+++ projects/runtime-coverage-v2/sys/i386/i386/exception.s	(revision 347599)
@@ -1,659 +1,661 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * Copyright (c) 2007, 2018 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by A. Joseph Koshy under
  * sponsorship from the FreeBSD Foundation and Google, Inc.
  * Portions of this software were developed by Konstantin Belousov
  * <kib@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_apic.h"
 #include "opt_atpic.h"
 #include "opt_hwpmc_hooks.h"
 
 #include "assym.inc"
 
 #include <machine/psl.h>
 #include <machine/asmacros.h>
 #include <machine/trap.h>
 
 #ifdef KDTRACE_HOOKS
 	.bss
 	.globl	dtrace_invop_jump_addr
 	.align	4
 	.type	dtrace_invop_jump_addr, @object
 	.size	dtrace_invop_jump_addr, 4
 dtrace_invop_jump_addr:
 	.zero	4
 	.globl	dtrace_invop_calltrap_addr
 	.align	4
 	.type	dtrace_invop_calltrap_addr, @object
 	.size	dtrace_invop_calltrap_addr, 4
 dtrace_invop_calltrap_addr:
 	.zero	8
 #endif
 	.text
 ENTRY(start_exceptions)
 	.globl	tramp_idleptd
 tramp_idleptd:	.long	0
 
 /*****************************************************************************/
 /* Trap handling                                                             */
 /*****************************************************************************/
 /*
  * Trap and fault vector routines.
  *
  * All traps are 'interrupt gates', SDT_SYS386IGT.  Interrupts are disabled
  * by hardware to not allow interrupts until code switched to the kernel
  * address space and the kernel thread stack.
  *
  * The cpu will push a certain amount of state onto the kernel stack for
  * the current process.  The amount of state depends on the type of trap
  * and whether the trap crossed rings or not.  See i386/include/frame.h.
  * At the very least the current EFLAGS (status register, which includes
  * the interrupt disable state prior to the trap), the code segment register,
  * and the return instruction pointer are pushed by the cpu.  The cpu
  * will also push an 'error' code for certain traps.  We push a dummy
  * error code for those traps where the cpu doesn't in order to maintain
  * a consistent frame.  We also push a contrived 'trap number'.
  *
  * The cpu does not push the general registers, we must do that, and we
  * must restore them prior to calling 'iret'.  The cpu adjusts the %cs and
  * %ss segment registers, but does not mess with %ds, %es, or %fs.  Thus we
  * must load them with appropriate values for supervisor mode operation.
  *
  * This code is not executed at the linked address, it is copied to the
  * trampoline area.  As the consequence, all code there and in included files
  * must be PIC.
  */
 
 MCOUNT_LABEL(user)
 MCOUNT_LABEL(btrap)
 
 #define	TRAP(a)		pushl $(a) ; jmp alltraps
 
 IDTVEC(div)
 	pushl $0; TRAP(T_DIVIDE)
 IDTVEC(bpt)
 	pushl $0; TRAP(T_BPTFLT)
 IDTVEC(dtrace_ret)
 	pushl $0; TRAP(T_DTRACE_RET)
 IDTVEC(ofl)
 	pushl $0; TRAP(T_OFLOW)
 IDTVEC(bnd)
 	pushl $0; TRAP(T_BOUND)
 #ifndef KDTRACE_HOOKS
 IDTVEC(ill)
 	pushl $0; TRAP(T_PRIVINFLT)
 #endif
 IDTVEC(dna)
 	pushl $0; TRAP(T_DNA)
 IDTVEC(fpusegm)
 	pushl $0; TRAP(T_FPOPFLT)
 IDTVEC(tss)
 	TRAP(T_TSSFLT)
 IDTVEC(missing)
 	pushl	$T_SEGNPFLT
 	jmp	irettraps
 IDTVEC(stk)
 	pushl	$T_STKFLT
 	jmp	irettraps
 IDTVEC(prot)
 	pushl	$T_PROTFLT
 	jmp	irettraps
 IDTVEC(page)
 	testl	$PSL_VM, TF_EFLAGS-TF_ERR(%esp)
 	jnz	1f
 	testb	$SEL_RPL_MASK, TF_CS-TF_ERR(%esp)
 	jnz	1f
 	cmpl	$PMAP_TRM_MIN_ADDRESS, TF_EIP-TF_ERR(%esp)
 	jb	1f
 	movl	%ebx, %cr3
 	movl	%edx, TF_EIP-TF_ERR(%esp)
 	addl	$4, %esp
 	iret
 1:	pushl	$T_PAGEFLT
 	jmp	alltraps
 IDTVEC(rsvd_pti)
 IDTVEC(rsvd)
 	pushl $0; TRAP(T_RESERVED)
 IDTVEC(fpu)
 	pushl $0; TRAP(T_ARITHTRAP)
 IDTVEC(align)
 	TRAP(T_ALIGNFLT)
 IDTVEC(xmm)
 	pushl $0; TRAP(T_XMMFLT)
 
 	/*
 	 * All traps except ones for syscalls or invalid segment,
 	 * jump to alltraps.  If
 	 * interrupts were enabled when the trap occurred, then interrupts
 	 * are enabled now if the trap was through a trap gate, else
 	 * disabled if the trap was through an interrupt gate.  Note that
 	 * int0x80_syscall is a trap gate.   Interrupt gates are used by
 	 * page faults, non-maskable interrupts, debug and breakpoint
 	 * exceptions.
 	 */
 	SUPERALIGN_TEXT
 	.globl	alltraps
 	.type	alltraps,@function
 alltraps:
 	PUSH_FRAME2
 alltraps_with_regs_pushed:
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	FAKE_MCOUNT(TF_EIP(%esp))
 calltrap:
 	pushl	%esp
 	movl	$trap,%eax
 	call	*%eax
 	add	$4, %esp
 
 	/*
 	 * Return via doreti to handle ASTs.
 	 */
 	MEXITCOUNT
 	jmp	doreti
 
 	.globl	irettraps
 	.type	irettraps,@function
 irettraps:
 	testl	$PSL_VM, TF_EFLAGS-TF_TRAPNO(%esp)
 	jnz	alltraps
 	testb	$SEL_RPL_MASK, TF_CS-TF_TRAPNO(%esp)
 	jnz	alltraps
 
 	/*
 	 * Kernel mode.
 	 * The special case there is the kernel mode with user %cr3 and
 	 * trampoline stack. We need to copy both current frame and the
 	 * hardware portion of the frame we tried to return to, to the
 	 * normal stack.  This logic must follow the stack unwind order
 	 * in doreti.
 	 */
 	PUSH_FRAME2
 	SET_KERNEL_SREGS
 	cld
 	call	1f
 1:	popl	%ebx
 	leal	(doreti_iret - 1b)(%ebx), %edx
 	cmpl	%edx, TF_EIP(%esp)
 	jne	2f
 	movl	$(2 * TF_SZ - TF_EIP), %ecx
 	jmp	6f
 2:	leal	(doreti_popl_ds - 1b)(%ebx), %edx
 	cmpl	%edx, TF_EIP(%esp)
 	jne	3f
 	movl	$(2 * TF_SZ - TF_DS), %ecx
 	jmp	6f
 3:	leal	(doreti_popl_es - 1b)(%ebx), %edx
 	cmpl	%edx, TF_EIP(%esp)
 	jne	4f
 	movl	$(2 * TF_SZ - TF_ES), %ecx
 	jmp	6f
 4:	leal	(doreti_popl_fs - 1b)(%ebx), %edx
 	cmpl	%edx, TF_EIP(%esp)
 	jne	5f
 	movl	$(2 * TF_SZ - TF_FS), %ecx
 	jmp	6f
 	/* kernel mode, normal */
 5:	FAKE_MCOUNT(TF_EIP(%esp))
 	jmp	calltrap
 6:	cmpl	$PMAP_TRM_MIN_ADDRESS, %esp	/* trampoline stack ? */
 	jb	5b	/* if not, no need to change stacks */
 	movl	(tramp_idleptd - 1b)(%ebx), %eax
 	movl	%eax, %cr3
 	movl	PCPU(KESP0), %edx
 	subl	%ecx, %edx
 	movl	%edx, %edi
 	movl	%esp, %esi
 	rep; movsb
 	movl	%edx, %esp
 	FAKE_MCOUNT(TF_EIP(%esp))
 	jmp	calltrap
 
 /*
  * Privileged instruction fault.
  */
 #ifdef KDTRACE_HOOKS
 	SUPERALIGN_TEXT
 IDTVEC(ill)
 	/*
 	 * Check if this is a user fault.  If so, just handle it as a normal
 	 * trap.
 	 */
 	testl	$PSL_VM, 8(%esp)	/* and vm86 mode. */
 	jnz	norm_ill
 	cmpl	$GSEL_KPL, 4(%esp)	/* Check the code segment */
 	jne	norm_ill
 
 	/*
 	 * Check if a DTrace hook is registered.  The trampoline cannot
 	 * be instrumented.
 	 */
 	cmpl	$0, dtrace_invop_jump_addr
 	je	norm_ill
 
 	/*
 	 * This is a kernel instruction fault that might have been caused
 	 * by a DTrace provider.
 	 */
 	pushal
 	cld
 
 	/*
 	 * Set our jump address for the jump back in the event that
 	 * the exception wasn't caused by DTrace at all.
 	 */
 	movl	$norm_ill, dtrace_invop_calltrap_addr
 
 	/* Jump to the code hooked in by DTrace. */
 	jmpl	*dtrace_invop_jump_addr
 
 	/*
 	 * Process the instruction fault in the normal way.
 	 */
 norm_ill:
 	pushl	$0
 	pushl	$T_PRIVINFLT
 	jmp	alltraps
 #endif
 
 /*
  * See comment in the handler for the kernel case T_TRCTRAP in trap.c.
  * The exception handler must be ready to execute with wrong %cr3.
  * We save original %cr3 in frame->tf_err, similarly to NMI and MCE
  * handlers.
  */
 IDTVEC(dbg)
 	pushl	$0
 	pushl	$T_TRCTRAP
 	PUSH_FRAME2
 	SET_KERNEL_SREGS
 	cld
 	movl	%cr3, %eax
 	movl	%eax, TF_ERR(%esp)
 	call	1f
 1:	popl	%eax
 	movl	(tramp_idleptd - 1b)(%eax), %eax
 	movl	%eax, %cr3
 	FAKE_MCOUNT(TF_EIP(%esp))
 	testl	$PSL_VM, TF_EFLAGS(%esp)
 	jnz	dbg_user
 	testb	$SEL_RPL_MASK,TF_CS(%esp)
 	jz	calltrap
 dbg_user:
 	NMOVE_STACKS
 	movl	$handle_ibrs_entry,%eax
 	call	*%eax
 	pushl	%esp
 	movl	$trap,%eax
 	call	*%eax
 	add	$4, %esp
 	movl	$T_RESERVED, TF_TRAPNO(%esp)
 	MEXITCOUNT
 	jmp	doreti
 
 IDTVEC(mchk)
 	pushl	$0
 	pushl	$T_MCHK
 	jmp	nmi_mchk_common
 
 IDTVEC(nmi)
 	pushl	$0
 	pushl	$T_NMI
 nmi_mchk_common:
 	PUSH_FRAME2
 	SET_KERNEL_SREGS
 	cld
 	/*
 	 * Save %cr3 into tf_err.  There is no good place to put it.
 	 * Always reload %cr3, since we might have interrupted the
 	 * kernel entry or exit.
 	 * Do not switch to the thread kernel stack, otherwise we might
 	 * obliterate the previous context partially copied from the
 	 * trampoline stack.
 	 * Do not re-enable IBRS, there is no good place to store
 	 * previous state if we come from the kernel.
 	 */
 	movl	%cr3, %eax
 	movl	%eax, TF_ERR(%esp)
 	call	1f
 1:	popl	%eax
 	movl	(tramp_idleptd - 1b)(%eax), %eax
 	movl	%eax, %cr3
 	FAKE_MCOUNT(TF_EIP(%esp))
 	jmp	calltrap
 
 /*
  * Trap gate entry for syscalls (int 0x80).
  * This is used by FreeBSD ELF executables, "new" a.out executables, and all
  * Linux executables.
  *
  * Even though the name says 'int0x80', this is actually a trap gate, not an
  * interrupt gate.  Thus interrupts are enabled on entry just as they are for
  * a normal syscall.
  */
 	SUPERALIGN_TEXT
 IDTVEC(int0x80_syscall)
 	pushl	$2			/* sizeof "int 0x80" */
 	pushl	$0			/* tf_trapno */
 	PUSH_FRAME2
 	SET_KERNEL_SREGS
 	cld
 	MOVE_STACKS
 	movl	$handle_ibrs_entry,%eax
 	call	*%eax
 	sti
 	FAKE_MCOUNT(TF_EIP(%esp))
 	pushl	%esp
 	movl	$syscall, %eax
 	call	*%eax
 	add	$4, %esp
 	MEXITCOUNT
 	jmp	doreti
 
 ENTRY(fork_trampoline)
 	pushl	%esp			/* trapframe pointer */
 	pushl	%ebx			/* arg1 */
 	pushl	%esi			/* function */
 	movl	$fork_exit, %eax
 	call	*%eax
 	addl	$12,%esp
 	/* cut from syscall */
 
 	/*
 	 * Return via doreti to handle ASTs.
 	 */
 	MEXITCOUNT
 	jmp	doreti
 
 
 /*
  * To efficiently implement classification of trap and interrupt handlers
  * for profiling, there must be only trap handlers between the labels btrap
  * and bintr, and only interrupt handlers between the labels bintr and
  * eintr.  This is implemented (partly) by including files that contain
  * some of the handlers.  Before including the files, set up a normal asm
  * environment so that the included files doen't need to know that they are
  * included.
  */
 
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 MCOUNT_LABEL(bintr)
 
 #ifdef DEV_ATPIC
 #include <i386/i386/atpic_vector.s>
 #endif
 
 #if defined(DEV_APIC) && defined(DEV_ATPIC)
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 #endif
 
 #ifdef DEV_APIC
 #include <i386/i386/apic_vector.s>
 #endif
 
 	.data
 	.p2align 4
 	.text
 	SUPERALIGN_TEXT
 #include <i386/i386/vm86bios.s>
 
 	.text
 MCOUNT_LABEL(eintr)
 
 #include <i386/i386/copyout_fast.s>
 
 /*
  * void doreti(struct trapframe)
  *
  * Handle return from interrupts, traps and syscalls.
  */
 	.text
 	SUPERALIGN_TEXT
 	.type	doreti,@function
 	.globl	doreti
 doreti:
 	FAKE_MCOUNT($bintr)		/* init "from" bintr -> doreti */
 doreti_next:
 	/*
 	 * Check if ASTs can be handled now.  ASTs cannot be safely
 	 * processed when returning from an NMI.
 	 */
 	cmpb	$T_NMI,TF_TRAPNO(%esp)
 #ifdef HWPMC_HOOKS
 	je	doreti_nmi
 #else
 	je	doreti_exit
 #endif
 	/*
 	 * PSL_VM must be checked first since segment registers only
 	 * have an RPL in non-VM86 mode.
 	 * ASTs can not be handled now if we are in a vm86 call.
 	 */
 	testl	$PSL_VM,TF_EFLAGS(%esp)
 	jz	doreti_notvm86
 	movl	PCPU(CURPCB),%ecx
 	testl	$PCB_VM86CALL,PCB_FLAGS(%ecx)
 	jz	doreti_ast
 	jmp	doreti_popl_fs
 
 doreti_notvm86:
 	testb	$SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */
 	jz	doreti_exit		/* can't handle ASTs now if not */
 
 doreti_ast:
 	/*
 	 * Check for ASTs atomically with returning.  Disabling CPU
 	 * interrupts provides sufficient locking even in the SMP case,
 	 * since we will be informed of any new ASTs by an IPI.
 	 */
 	cli
 	movl	PCPU(CURTHREAD),%eax
 	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax)
 	je	doreti_exit
 	sti
 	pushl	%esp			/* pass a pointer to the trapframe */
 	movl	$ast, %eax
 	call	*%eax
 	add	$4,%esp
 	jmp	doreti_ast
 
 	/*
 	 * doreti_exit:	pop registers, iret.
 	 *
 	 *	The segment register pop is a special case, since it may
 	 *	fault if (for example) a sigreturn specifies bad segment
 	 *	registers.  The fault is handled in trap.c.
 	 */
 doreti_exit:
 	MEXITCOUNT
 
 	cmpl	$T_NMI, TF_TRAPNO(%esp)
 	je	doreti_iret_nmi
 	cmpl	$T_MCHK, TF_TRAPNO(%esp)
 	je	doreti_iret_nmi
 	cmpl	$T_TRCTRAP, TF_TRAPNO(%esp)
 	je	doreti_iret_nmi
 	movl	$TF_SZ, %ecx
 	testl	$PSL_VM,TF_EFLAGS(%esp)
 	jz	1f			/* PCB_VM86CALL is not set */
 	addl	$VM86_STACK_SPACE, %ecx
 	jmp	2f
 1:	testl	$SEL_RPL_MASK, TF_CS(%esp)
 	jz	doreti_popl_fs
 2:	movl	$handle_ibrs_exit,%eax
 	pushl	%ecx			/* preserve enough call-used regs */
 	call	*%eax
+	movl	mds_handler,%eax
+	call	*%eax
 	popl	%ecx
 	movl	%esp, %esi
 	movl	PCPU(TRAMPSTK), %edx
 	subl	%ecx, %edx
 	movl	%edx, %edi
 	rep; movsb
 	movl	%edx, %esp
 	movl	PCPU(CURPCB),%eax
 	movl	PCB_CR3(%eax), %eax
 	movl	%eax, %cr3
 
 	.globl	doreti_popl_fs
 doreti_popl_fs:
 	popl	%fs
 	.globl	doreti_popl_es
 doreti_popl_es:
 	popl	%es
 	.globl	doreti_popl_ds
 doreti_popl_ds:
 	popl	%ds
 	popal
 	addl	$8,%esp
 	.globl	doreti_iret
 doreti_iret:
 	iret
 
 doreti_iret_nmi:
 	movl	TF_ERR(%esp), %eax
 	movl	%eax, %cr3
 	jmp	doreti_popl_fs
 
 	/*
 	 * doreti_iret_fault and friends.  Alternative return code for
 	 * the case where we get a fault in the doreti_exit code
 	 * above.  trap() (i386/i386/trap.c) catches this specific
 	 * case, and continues in the corresponding place in the code
 	 * below.
 	 *
 	 * If the fault occured during return to usermode, we recreate
 	 * the trap frame and call trap() to send a signal.  Otherwise
 	 * the kernel was tricked into fault by attempt to restore invalid
 	 * usermode segment selectors on return from nested fault or
 	 * interrupt, where interrupted kernel entry code not yet loaded
 	 * kernel selectors.  In the latter case, emulate iret and zero
 	 * the invalid selector.
 	 */
 	ALIGN_TEXT
 	.globl	doreti_iret_fault
 doreti_iret_fault:
 	pushl	$0	/* tf_err */
 	pushl	$0	/* tf_trapno XXXKIB: provide more useful value ? */
 	pushal
 	pushl	$0
 	movw	%ds,(%esp)
 	.globl	doreti_popl_ds_fault
 doreti_popl_ds_fault:
 	testb	$SEL_RPL_MASK,TF_CS-TF_DS(%esp)
 	jz	doreti_popl_ds_kfault
 	pushl	$0
 	movw	%es,(%esp)
 	.globl	doreti_popl_es_fault
 doreti_popl_es_fault:
 	testb	$SEL_RPL_MASK,TF_CS-TF_ES(%esp)
 	jz	doreti_popl_es_kfault
 	pushl	$0
 	movw	%fs,(%esp)
 	.globl	doreti_popl_fs_fault
 doreti_popl_fs_fault:
 	testb	$SEL_RPL_MASK,TF_CS-TF_FS(%esp)
 	jz	doreti_popl_fs_kfault
 	movl	$0,TF_ERR(%esp)	/* XXX should be the error code */
 	movl	$T_PROTFLT,TF_TRAPNO(%esp)
 	SET_KERNEL_SREGS
 	jmp	calltrap
 
 doreti_popl_ds_kfault:
 	movl	$0,(%esp)
 	jmp	doreti_popl_ds
 doreti_popl_es_kfault:
 	movl	$0,(%esp)
 	jmp	doreti_popl_es
 doreti_popl_fs_kfault:
 	movl	$0,(%esp)
 	jmp	doreti_popl_fs
 
 #ifdef HWPMC_HOOKS
 doreti_nmi:
 	/*
 	 * Since we are returning from an NMI, check if the current trap
 	 * was from user mode and if so whether the current thread
 	 * needs a user call chain capture.
 	 */
 	testl	$PSL_VM, TF_EFLAGS(%esp)
 	jnz	doreti_exit
 	testb	$SEL_RPL_MASK,TF_CS(%esp)
 	jz	doreti_exit
 	movl	PCPU(CURTHREAD),%eax	/* curthread present? */
 	orl	%eax,%eax
 	jz	doreti_exit
 	testl	$TDP_CALLCHAIN,TD_PFLAGS(%eax) /* flagged for capture? */
 	jz	doreti_exit
 	/*
 	 * Switch to thread stack.  Reset tf_trapno to not indicate NMI,
 	 * to cause normal userspace exit.
 	 */
 	movl	$T_RESERVED, TF_TRAPNO(%esp)
 	NMOVE_STACKS
 	/*
 	 * Take the processor out of NMI mode by executing a fake "iret".
 	 */
 	pushfl
 	pushl	%cs
 	call	1f
 1:	popl	%eax
 	leal	(outofnmi-1b)(%eax),%eax
 	pushl	%eax
 	iret
 outofnmi:
 	/*
 	 * Call the callchain capture hook after turning interrupts back on.
 	 */
 	movl	pmc_hook,%ecx
 	orl	%ecx,%ecx
 	jz	doreti_exit
 	pushl	%esp			/* frame pointer */
 	pushl	$PMC_FN_USER_CALLCHAIN	/* command */
 	movl	PCPU(CURTHREAD),%eax
 	pushl	%eax			/* curthread */
 	sti
 	call	*%ecx
 	addl	$12,%esp
 	jmp	doreti_ast
 #endif
 
 ENTRY(end_exceptions)
Index: projects/runtime-coverage-v2/sys/i386/i386/genassym.c
===================================================================
--- projects/runtime-coverage-v2/sys/i386/i386/genassym.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/i386/i386/genassym.c	(revision 347599)
@@ -1,234 +1,237 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)genassym.c	5.11 (Berkeley) 5/10/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_apic.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_kstack_pages.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/assym.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/socket.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/ucontext.h>
 #include <machine/bootinfo.h>
 #include <machine/tss.h>
 #include <sys/vmmeter.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <sys/proc.h>
 #include <net/if.h>
 #include <netinet/in.h>
 #include <nfs/nfsproto.h>
 #include <nfsclient/nfs.h>
 #include <nfs/nfsdiskless.h>
 #ifdef DEV_APIC
 #include <x86/apicreg.h>
 #endif
 #include <machine/cpu.h>
 #include <machine/pcb_ext.h>
 #include <machine/pcb.h>
 #include <machine/sigframe.h>
 #include <machine/vm86.h>
 #include <machine/proc.h>
 
 ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
 ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
 ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
 
 ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
 ASSYM(TD_LOCK, offsetof(struct thread, td_lock));
 ASSYM(TD_PCB, offsetof(struct thread, td_pcb));
 ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags));
 ASSYM(TD_PROC, offsetof(struct thread, td_proc));
 ASSYM(TD_MD, offsetof(struct thread, td_md));
 
 ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN);
 
 ASSYM(P_MD, offsetof(struct proc, p_md));
 ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt));
 
 ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
 ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
 
 ASSYM(TD0_KSTACK_PAGES, TD0_KSTACK_PAGES);
 ASSYM(PAGE_SIZE, PAGE_SIZE);
 ASSYM(PAGE_SHIFT, PAGE_SHIFT);
 ASSYM(PAGE_MASK, PAGE_MASK);
 ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0));
 ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2));
 ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3));
 ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4));
 ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi));
 ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi));
 ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp));
 ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp));
 ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx));
 ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip));
 ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0));
 
 ASSYM(PCB_DS, offsetof(struct pcb, pcb_ds));
 ASSYM(PCB_ES, offsetof(struct pcb, pcb_es));
 ASSYM(PCB_FS, offsetof(struct pcb, pcb_fs));
 ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs));
 ASSYM(PCB_SS, offsetof(struct pcb, pcb_ss));
 ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0));
 ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1));
 ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2));
 ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3));
 ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6));
 ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7));
 ASSYM(PCB_DBREGS, PCB_DBREGS);
 ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext));
 
 ASSYM(PCB_EXT_TSS, offsetof(struct pcb_ext, ext_tss));
 
 ASSYM(PCB_FSD, offsetof(struct pcb, pcb_fsd));
 ASSYM(PCB_GSD, offsetof(struct pcb, pcb_gsd));
 ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86));
 ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
 ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
 ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
 
 ASSYM(PCB_SIZE, sizeof(struct pcb));
 ASSYM(PCB_VM86CALL, PCB_VM86CALL);
 
 ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt));
 ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt));
 ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt));
 ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr));
 
 ASSYM(TF_FS, offsetof(struct trapframe, tf_fs));
 ASSYM(TF_ES, offsetof(struct trapframe, tf_es));
 ASSYM(TF_DS, offsetof(struct trapframe, tf_ds));
 ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno));
 ASSYM(TF_ERR, offsetof(struct trapframe, tf_err));
 ASSYM(TF_EIP, offsetof(struct trapframe, tf_eip));
 ASSYM(TF_CS, offsetof(struct trapframe, tf_cs));
 ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags));
 ASSYM(TF_SZ, sizeof(struct trapframe));
 
 ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler));
 #ifdef COMPAT_43
 ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc));
 #endif
 ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc));
 #ifdef COMPAT_FREEBSD4
 ASSYM(SIGF_UC4, offsetof(struct sigframe4, sf_uc));
 #endif
 #ifdef COMPAT_43
 ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps));
 ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs));
 ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs));
 ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno));
 #endif
 #ifdef COMPAT_FREEBSD4
 ASSYM(UC4_EFLAGS, offsetof(struct ucontext4, uc_mcontext.mc_eflags));
 ASSYM(UC4_GS, offsetof(struct ucontext4, uc_mcontext.mc_gs));
 #endif
 ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags));
 ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs));
 ASSYM(ENOENT, ENOENT);
 ASSYM(EFAULT, EFAULT);
 ASSYM(ENAMETOOLONG, ENAMETOOLONG);
 ASSYM(MAXCOMLEN, MAXCOMLEN);
 ASSYM(MAXPATHLEN, MAXPATHLEN);
 ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo));
 ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version));
 ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname));
 ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless));
 ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon));
 ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless));
 ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size));
 ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab));
 ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab));
 ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend));
 ASSYM(PC_SIZEOF, sizeof(struct pcpu));
 ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace));
 ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
 ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread));
 ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread));
 ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb));
 ASSYM(PC_COMMON_TSSP, offsetof(struct pcpu, pc_common_tssp));
 ASSYM(PC_COMMON_TSSD, offsetof(struct pcpu, pc_common_tssd));
 ASSYM(PC_TSS_GDT, offsetof(struct pcpu, pc_tss_gdt));
 ASSYM(PC_FSGS_GDT, offsetof(struct pcpu, pc_fsgs_gdt));
 ASSYM(PC_CURRENTLDT, offsetof(struct pcpu, pc_currentldt));
 ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
 ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
 ASSYM(PC_PRIVATE_TSS, offsetof(struct pcpu, pc_private_tss));
 ASSYM(PC_KESP0, offsetof(struct pcpu, pc_kesp0));
 ASSYM(PC_TRAMPSTK, offsetof(struct pcpu, pc_trampstk));
 ASSYM(PC_COPYOUT_BUF, offsetof(struct pcpu, pc_copyout_buf));
 ASSYM(PC_IBPB_SET, offsetof(struct pcpu, pc_ibpb_set));
+ASSYM(PC_MDS_TMP, offsetof(struct pcpu, pc_mds_tmp));
+ASSYM(PC_MDS_BUF, offsetof(struct pcpu, pc_mds_buf));
+ASSYM(PC_MDS_BUF64, offsetof(struct pcpu, pc_mds_buf64));
 ASSYM(PMAP_TRM_MIN_ADDRESS, PMAP_TRM_MIN_ADDRESS);
 ASSYM(KERNLOAD, KERNLOAD);
 ASSYM(KERNBASE, KERNBASE);
 
 #ifdef DEV_APIC
 ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL);
 ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL);
 #endif
 
 ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL));
 ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL));
 ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL));
 
 ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL));
 ASSYM(GPROC0_SEL, GPROC0_SEL);
 ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame));
 ASSYM(VM86_STACK_SPACE, VM86_STACK_SPACE);
 
 ASSYM(TRAMP_COPYOUT_SZ, TRAMP_COPYOUT_SZ);
 
 #ifdef	HWPMC_HOOKS
 ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN);
 #endif
Index: projects/runtime-coverage-v2/sys/i386/i386/initcpu.c
===================================================================
--- projects/runtime-coverage-v2/sys/i386/i386/initcpu.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/i386/i386/initcpu.c	(revision 347599)
@@ -1,983 +1,984 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) KATO Takenori, 1997, 1998.
  * 
  * All rights reserved.  Unpublished rights reserved under the copyright
  * laws of Japan.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_cpu.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #ifdef I486_CPU
 static void init_5x86(void);
 static void init_bluelightning(void);
 static void init_486dlc(void);
 static void init_cy486dx(void);
 #ifdef CPU_I486_ON_386
 static void init_i486_on_386(void);
 #endif
 static void init_6x86(void);
 #endif /* I486_CPU */
 
 #if defined(I586_CPU) && defined(CPU_WT_ALLOC)
 static void	enable_K5_wt_alloc(void);
 static void	enable_K6_wt_alloc(void);
 static void	enable_K6_2_wt_alloc(void);
 #endif
 
 #ifdef I686_CPU
 static void	init_6x86MX(void);
 static void	init_ppro(void);
 static void	init_mendocino(void);
 #endif
 
 static int	hw_instruction_sse;
 SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD,
     &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU");
 /*
  * -1: automatic (default)
  *  0: keep enable CLFLUSH
  *  1: force disable CLFLUSH
  */
 static int	hw_clflush_disable = -1;
 
 u_int	cyrix_did;		/* Device ID of Cyrix CPU */
 
 #ifdef I486_CPU
 /*
  * IBM Blue Lightning
  */
 static void
 init_bluelightning(void)
 {
 	register_t saveintr;
 
 	saveintr = intr_disable();
 
 	load_cr0(rcr0() | CR0_CD | CR0_NW);
 	invd();
 
 #ifdef CPU_BLUELIGHTNING_FPU_OP_CACHE
 	wrmsr(0x1000, 0x9c92LL);	/* FP operand can be cacheable on Cyrix FPU */
 #else
 	wrmsr(0x1000, 0x1c92LL);	/* Intel FPU */
 #endif
 	/* Enables 13MB and 0-640KB cache. */
 	wrmsr(0x1001, (0xd0LL << 32) | 0x3ff);
 #ifdef CPU_BLUELIGHTNING_3X
 	wrmsr(0x1002, 0x04000000LL);	/* Enables triple-clock mode. */
 #else
 	wrmsr(0x1002, 0x03000000LL);	/* Enables double-clock mode. */
 #endif
 
 	/* Enable caching in CR0. */
 	load_cr0(rcr0() & ~(CR0_CD | CR0_NW));	/* CD = 0 and NW = 0 */
 	invd();
 	intr_restore(saveintr);
 }
 
 /*
  * Cyrix 486SLC/DLC/SR/DR series
  */
 static void
 init_486dlc(void)
 {
 	register_t saveintr;
 	u_char	ccr0;
 
 	saveintr = intr_disable();
 	invd();
 
 	ccr0 = read_cyrix_reg(CCR0);
 #ifndef CYRIX_CACHE_WORKS
 	ccr0 |= CCR0_NC1 | CCR0_BARB;
 	write_cyrix_reg(CCR0, ccr0);
 	invd();
 #else
 	ccr0 &= ~CCR0_NC0;
 #ifndef CYRIX_CACHE_REALLY_WORKS
 	ccr0 |= CCR0_NC1 | CCR0_BARB;
 #else
 	ccr0 |= CCR0_NC1;
 #endif
 #ifdef CPU_DIRECT_MAPPED_CACHE
 	ccr0 |= CCR0_CO;			/* Direct mapped mode. */
 #endif
 	write_cyrix_reg(CCR0, ccr0);
 
 	/* Clear non-cacheable region. */
 	write_cyrix_reg(NCR1+2, NCR_SIZE_0K);
 	write_cyrix_reg(NCR2+2, NCR_SIZE_0K);
 	write_cyrix_reg(NCR3+2, NCR_SIZE_0K);
 	write_cyrix_reg(NCR4+2, NCR_SIZE_0K);
 
 	write_cyrix_reg(0, 0);	/* dummy write */
 
 	/* Enable caching in CR0. */
 	load_cr0(rcr0() & ~(CR0_CD | CR0_NW));	/* CD = 0 and NW = 0 */
 	invd();
 #endif /* !CYRIX_CACHE_WORKS */
 	intr_restore(saveintr);
 }
 
 
 /*
  * Cyrix 486S/DX series
  */
 static void
 init_cy486dx(void)
 {
 	register_t saveintr;
 	u_char	ccr2;
 
 	saveintr = intr_disable();
 	invd();
 
 	ccr2 = read_cyrix_reg(CCR2);
 #ifdef CPU_SUSP_HLT
 	ccr2 |= CCR2_SUSP_HLT;
 #endif
 
 	write_cyrix_reg(CCR2, ccr2);
 	intr_restore(saveintr);
 }
 
 
 /*
  * Cyrix 5x86
  */
 static void
 init_5x86(void)
 {
 	register_t saveintr;
 	u_char	ccr2, ccr3, ccr4, pcr0;
 
 	saveintr = intr_disable();
 
 	load_cr0(rcr0() | CR0_CD | CR0_NW);
 	wbinvd();
 
 	(void)read_cyrix_reg(CCR3);		/* dummy */
 
 	/* Initialize CCR2. */
 	ccr2 = read_cyrix_reg(CCR2);
 	ccr2 |= CCR2_WB;
 #ifdef CPU_SUSP_HLT
 	ccr2 |= CCR2_SUSP_HLT;
 #else
 	ccr2 &= ~CCR2_SUSP_HLT;
 #endif
 	ccr2 |= CCR2_WT1;
 	write_cyrix_reg(CCR2, ccr2);
 
 	/* Initialize CCR4. */
 	ccr3 = read_cyrix_reg(CCR3);
 	write_cyrix_reg(CCR3, CCR3_MAPEN0);
 
 	ccr4 = read_cyrix_reg(CCR4);
 	ccr4 |= CCR4_DTE;
 	ccr4 |= CCR4_MEM;
 #ifdef CPU_FASTER_5X86_FPU
 	ccr4 |= CCR4_FASTFPE;
 #else
 	ccr4 &= ~CCR4_FASTFPE;
 #endif
 	ccr4 &= ~CCR4_IOMASK;
 	/********************************************************************
 	 * WARNING: The "BIOS Writers Guide" mentions that I/O recovery time
 	 * should be 0 for errata fix.
 	 ********************************************************************/
 #ifdef CPU_IORT
 	ccr4 |= CPU_IORT & CCR4_IOMASK;
 #endif
 	write_cyrix_reg(CCR4, ccr4);
 
 	/* Initialize PCR0. */
 	/****************************************************************
 	 * WARNING: RSTK_EN and LOOP_EN could make your system unstable.
 	 * BTB_EN might make your system unstable.
 	 ****************************************************************/
 	pcr0 = read_cyrix_reg(PCR0);
 #ifdef CPU_RSTK_EN
 	pcr0 |= PCR0_RSTK;
 #else
 	pcr0 &= ~PCR0_RSTK;
 #endif
 #ifdef CPU_BTB_EN
 	pcr0 |= PCR0_BTB;
 #else
 	pcr0 &= ~PCR0_BTB;
 #endif
 #ifdef CPU_LOOP_EN
 	pcr0 |= PCR0_LOOP;
 #else
 	pcr0 &= ~PCR0_LOOP;
 #endif
 
 	/****************************************************************
 	 * WARNING: if you use a memory mapped I/O device, don't use
 	 * DISABLE_5X86_LSSER option, which may reorder memory mapped
 	 * I/O access.
 	 * IF YOUR MOTHERBOARD HAS PCI BUS, DON'T DISABLE LSSER.
 	 ****************************************************************/
 #ifdef CPU_DISABLE_5X86_LSSER
 	pcr0 &= ~PCR0_LSSER;
 #else
 	pcr0 |= PCR0_LSSER;
 #endif
 	write_cyrix_reg(PCR0, pcr0);
 
 	/* Restore CCR3. */
 	write_cyrix_reg(CCR3, ccr3);
 
 	(void)read_cyrix_reg(0x80);		/* dummy */
 
 	/* Unlock NW bit in CR0. */
 	write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW);
 	load_cr0((rcr0() & ~CR0_CD) | CR0_NW);	/* CD = 0, NW = 1 */
 	/* Lock NW bit in CR0. */
 	write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW);
 
 	intr_restore(saveintr);
 }
 
 #ifdef CPU_I486_ON_386
 /*
  * There are i486 based upgrade products for i386 machines.
  * In this case, BIOS doesn't enable CPU cache.
  */
 static void
 init_i486_on_386(void)
 {
 	register_t saveintr;
 
 	saveintr = intr_disable();
 
 	load_cr0(rcr0() & ~(CR0_CD | CR0_NW));	/* CD = 0, NW = 0 */
 
 	intr_restore(saveintr);
 }
 #endif
 
 /*
  * Cyrix 6x86
  *
  * XXX - What should I do here?  Please let me know.
  */
 static void
 init_6x86(void)
 {
 	register_t saveintr;
 	u_char	ccr3, ccr4;
 
 	saveintr = intr_disable();
 
 	load_cr0(rcr0() | CR0_CD | CR0_NW);
 	wbinvd();
 
 	/* Initialize CCR0. */
 	write_cyrix_reg(CCR0, read_cyrix_reg(CCR0) | CCR0_NC1);
 
 	/* Initialize CCR1. */
 #ifdef CPU_CYRIX_NO_LOCK
 	write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) | CCR1_NO_LOCK);
 #else
 	write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) & ~CCR1_NO_LOCK);
 #endif
 
 	/* Initialize CCR2. */
 #ifdef CPU_SUSP_HLT
 	write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_SUSP_HLT);
 #else
 	write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_SUSP_HLT);
 #endif
 
 	ccr3 = read_cyrix_reg(CCR3);
 	write_cyrix_reg(CCR3, CCR3_MAPEN0);
 
 	/* Initialize CCR4. */
 	ccr4 = read_cyrix_reg(CCR4);
 	ccr4 |= CCR4_DTE;
 	ccr4 &= ~CCR4_IOMASK;
 #ifdef CPU_IORT
 	write_cyrix_reg(CCR4, ccr4 | (CPU_IORT & CCR4_IOMASK));
 #else
 	write_cyrix_reg(CCR4, ccr4 | 7);
 #endif
 
 	/* Initialize CCR5. */
 #ifdef CPU_WT_ALLOC
 	write_cyrix_reg(CCR5, read_cyrix_reg(CCR5) | CCR5_WT_ALLOC);
 #endif
 
 	/* Restore CCR3. */
 	write_cyrix_reg(CCR3, ccr3);
 
 	/* Unlock NW bit in CR0. */
 	write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW);
 
 	/*
 	 * Earlier revision of the 6x86 CPU could crash the system if
 	 * L1 cache is in write-back mode.
 	 */
 	if ((cyrix_did & 0xff00) > 0x1600)
 		load_cr0(rcr0() & ~(CR0_CD | CR0_NW));	/* CD = 0 and NW = 0 */
 	else {
 		/* Revision 2.6 and lower. */
 #ifdef CYRIX_CACHE_REALLY_WORKS
 		load_cr0(rcr0() & ~(CR0_CD | CR0_NW));	/* CD = 0 and NW = 0 */
 #else
 		load_cr0((rcr0() & ~CR0_CD) | CR0_NW);	/* CD = 0 and NW = 1 */
 #endif
 	}
 
 	/* Lock NW bit in CR0. */
 	write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW);
 
 	intr_restore(saveintr);
 }
 #endif /* I486_CPU */
 
 #ifdef I586_CPU
 /*
  * Rise mP6
  */
 static void
 init_rise(void)
 {
 
 	/*
 	 * The CMPXCHG8B instruction is always available but hidden.
 	 */
 	cpu_feature |= CPUID_CX8;
 }
 
 /*
  * IDT WinChip C6/2/2A/2B/3
  *
  * http://www.centtech.com/winchip_bios_writers_guide_v4_0.pdf
  */
 static void
 init_winchip(void)
 {
 	u_int regs[4];
 	uint64_t fcr;
 
 	fcr = rdmsr(0x0107);
 
 	/*
 	 * Set ECX8, DSMC, DTLOCK/EDCTLB, EMMX, and ERETSTK and clear DPDC.
 	 */
 	fcr |= (1 << 1) | (1 << 7) | (1 << 8) | (1 << 9) | (1 << 16);
 	fcr &= ~(1ULL << 11);
 
 	/*
 	 * Additionally, set EBRPRED, E2MMX and EAMD3D for WinChip 2 and 3.
 	 */
 	if (CPUID_TO_MODEL(cpu_id) >= 8)
 		fcr |= (1 << 12) | (1 << 19) | (1 << 20);
 
 	wrmsr(0x0107, fcr);
 	do_cpuid(1, regs);
 	cpu_feature = regs[3];
 }
 #endif
 
 #ifdef I686_CPU
 /*
  * Cyrix 6x86MX (code-named M2)
  *
  * XXX - What should I do here?  Please let me know.
  */
 static void
 init_6x86MX(void)
 {
 	register_t saveintr;
 	u_char	ccr3, ccr4;
 
 	saveintr = intr_disable();
 
 	load_cr0(rcr0() | CR0_CD | CR0_NW);
 	wbinvd();
 
 	/* Initialize CCR0. */
 	write_cyrix_reg(CCR0, read_cyrix_reg(CCR0) | CCR0_NC1);
 
 	/* Initialize CCR1. */
 #ifdef CPU_CYRIX_NO_LOCK
 	write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) | CCR1_NO_LOCK);
 #else
 	write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) & ~CCR1_NO_LOCK);
 #endif
 
 	/* Initialize CCR2. */
 #ifdef CPU_SUSP_HLT
 	write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_SUSP_HLT);
 #else
 	write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_SUSP_HLT);
 #endif
 
 	ccr3 = read_cyrix_reg(CCR3);
 	write_cyrix_reg(CCR3, CCR3_MAPEN0);
 
 	/* Initialize CCR4. */
 	ccr4 = read_cyrix_reg(CCR4);
 	ccr4 &= ~CCR4_IOMASK;
 #ifdef CPU_IORT
 	write_cyrix_reg(CCR4, ccr4 | (CPU_IORT & CCR4_IOMASK));
 #else
 	write_cyrix_reg(CCR4, ccr4 | 7);
 #endif
 
 	/* Initialize CCR5. */
 #ifdef CPU_WT_ALLOC
 	write_cyrix_reg(CCR5, read_cyrix_reg(CCR5) | CCR5_WT_ALLOC);
 #endif
 
 	/* Restore CCR3. */
 	write_cyrix_reg(CCR3, ccr3);
 
 	/* Unlock NW bit in CR0. */
 	write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW);
 
 	load_cr0(rcr0() & ~(CR0_CD | CR0_NW));	/* CD = 0 and NW = 0 */
 
 	/* Lock NW bit in CR0. */
 	write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW);
 
 	intr_restore(saveintr);
 }
 
 static int ppro_apic_used = -1;
 
 static void
 init_ppro(void)
 {
 	u_int64_t	apicbase;
 
 	/*
 	 * Local APIC should be disabled if it is not going to be used.
 	 */
 	if (ppro_apic_used != 1) {
 		apicbase = rdmsr(MSR_APICBASE);
 		apicbase &= ~APICBASE_ENABLED;
 		wrmsr(MSR_APICBASE, apicbase);
 		ppro_apic_used = 0;
 	}
 }
 
 /*
  * If the local APIC is going to be used after being disabled above,
  * re-enable it and don't disable it in the future.
  */
 void
 ppro_reenable_apic(void)
 {
 	u_int64_t	apicbase;
 
 	if (ppro_apic_used == 0) {
 		apicbase = rdmsr(MSR_APICBASE);
 		apicbase |= APICBASE_ENABLED;
 		wrmsr(MSR_APICBASE, apicbase);
 		ppro_apic_used = 1;
 	}
 }
 
 /*
  * Initialize BBL_CR_CTL3 (Control register 3: used to configure the
  * L2 cache).
  */
 static void
 init_mendocino(void)
 {
 #ifdef CPU_PPRO2CELERON
 	register_t	saveintr;
 	u_int64_t	bbl_cr_ctl3;
 
 	saveintr = intr_disable();
 
 	load_cr0(rcr0() | CR0_CD | CR0_NW);
 	wbinvd();
 
 	bbl_cr_ctl3 = rdmsr(MSR_BBL_CR_CTL3);
 
 	/* If the L2 cache is configured, do nothing. */
 	if (!(bbl_cr_ctl3 & 1)) {
 		bbl_cr_ctl3 = 0x134052bLL;
 
 		/* Set L2 Cache Latency (Default: 5). */
 #ifdef	CPU_CELERON_L2_LATENCY
 #if CPU_L2_LATENCY > 15
 #error invalid CPU_L2_LATENCY.
 #endif
 		bbl_cr_ctl3 |= CPU_L2_LATENCY << 1;
 #else
 		bbl_cr_ctl3 |= 5 << 1;
 #endif
 		wrmsr(MSR_BBL_CR_CTL3, bbl_cr_ctl3);
 	}
 
 	load_cr0(rcr0() & ~(CR0_CD | CR0_NW));
 	intr_restore(saveintr);
 #endif /* CPU_PPRO2CELERON */
 }
 
 /*
  * Initialize special VIA features
  */
 static void
 init_via(void)
 {
 	u_int regs[4], val;
 	uint64_t fcr;
 
 	/*
 	 * Explicitly enable CX8 and PGE on C3.
 	 *
 	 * http://www.via.com.tw/download/mainboards/6/13/VIA_C3_EBGA%20datasheet110.pdf
 	 */
 	if (CPUID_TO_MODEL(cpu_id) <= 9)
 		fcr = (1 << 1) | (1 << 7);
 	else
 		fcr = 0;
 
 	/*
 	 * Check extended CPUID for PadLock features.
 	 *
 	 * http://www.via.com.tw/en/downloads/whitepapers/initiatives/padlock/programming_guide.pdf
 	 */
 	do_cpuid(0xc0000000, regs);
 	if (regs[0] >= 0xc0000001) {
 		do_cpuid(0xc0000001, regs);
 		val = regs[3];
 	} else
 		val = 0;
 
 	/* Enable RNG if present. */
 	if ((val & VIA_CPUID_HAS_RNG) != 0) {
 		via_feature_rng = VIA_HAS_RNG;
 		wrmsr(0x110B, rdmsr(0x110B) | VIA_CPUID_DO_RNG);
 	}
 
 	/* Enable PadLock if present. */
 	if ((val & VIA_CPUID_HAS_ACE) != 0)
 		via_feature_xcrypt |= VIA_HAS_AES;
 	if ((val & VIA_CPUID_HAS_ACE2) != 0)
 		via_feature_xcrypt |= VIA_HAS_AESCTR;
 	if ((val & VIA_CPUID_HAS_PHE) != 0)
 		via_feature_xcrypt |= VIA_HAS_SHA;
 	if ((val & VIA_CPUID_HAS_PMM) != 0)
 		via_feature_xcrypt |= VIA_HAS_MM;
 	if (via_feature_xcrypt != 0)
 		fcr |= 1 << 28;
 
 	wrmsr(0x1107, rdmsr(0x1107) | fcr);
 }
 
 #endif /* I686_CPU */
 
 #if defined(I586_CPU) || defined(I686_CPU)
 static void
 init_transmeta(void)
 {
 	u_int regs[0];
 
 	/* Expose all hidden features. */
 	wrmsr(0x80860004, rdmsr(0x80860004) | ~0UL);
 	do_cpuid(1, regs);
 	cpu_feature = regs[3];
 }
 #endif
 
 extern int elf32_nxstack;
 
 void
 initializecpu(void)
 {
 	uint64_t msr;
 
 	switch (cpu) {
 #ifdef I486_CPU
 	case CPU_BLUE:
 		init_bluelightning();
 		break;
 	case CPU_486DLC:
 		init_486dlc();
 		break;
 	case CPU_CY486DX:
 		init_cy486dx();
 		break;
 	case CPU_M1SC:
 		init_5x86();
 		break;
 #ifdef CPU_I486_ON_386
 	case CPU_486:
 		init_i486_on_386();
 		break;
 #endif
 	case CPU_M1:
 		init_6x86();
 		break;
 #endif /* I486_CPU */
 #ifdef I586_CPU
 	case CPU_586:
 		switch (cpu_vendor_id) {
 		case CPU_VENDOR_AMD:
 #ifdef CPU_WT_ALLOC
 			if (((cpu_id & 0x0f0) > 0) &&
 			    ((cpu_id & 0x0f0) < 0x60) &&
 			    ((cpu_id & 0x00f) > 3))
 				enable_K5_wt_alloc();
 			else if (((cpu_id & 0x0f0) > 0x80) ||
 			    (((cpu_id & 0x0f0) == 0x80) &&
 				(cpu_id & 0x00f) > 0x07))
 				enable_K6_2_wt_alloc();
 			else if ((cpu_id & 0x0f0) > 0x50)
 				enable_K6_wt_alloc();
 #endif
 			if ((cpu_id & 0xf0) == 0xa0)
 				/*
 				 * Make sure the TSC runs through
 				 * suspension, otherwise we can't use
 				 * it as timecounter
 				 */
 				wrmsr(0x1900, rdmsr(0x1900) | 0x20ULL);
 			break;
 		case CPU_VENDOR_CENTAUR:
 			init_winchip();
 			break;
 		case CPU_VENDOR_TRANSMETA:
 			init_transmeta();
 			break;
 		case CPU_VENDOR_RISE:
 			init_rise();
 			break;
 		}
 		break;
 #endif
 #ifdef I686_CPU
 	case CPU_M2:
 		init_6x86MX();
 		break;
 	case CPU_686:
 		switch (cpu_vendor_id) {
 		case CPU_VENDOR_INTEL:
 			switch (cpu_id & 0xff0) {
 			case 0x610:
 				init_ppro();
 				break;
 			case 0x660:
 				init_mendocino();
 				break;
 			}
 			break;
 #ifdef CPU_ATHLON_SSE_HACK
 		case CPU_VENDOR_AMD:
 			/*
 			 * Sometimes the BIOS doesn't enable SSE instructions.
 			 * According to AMD document 20734, the mobile
 			 * Duron, the (mobile) Athlon 4 and the Athlon MP
 			 * support SSE. These correspond to cpu_id 0x66X
 			 * or 0x67X.
 			 */
 			if ((cpu_feature & CPUID_XMM) == 0 &&
 			    ((cpu_id & ~0xf) == 0x660 ||
 			     (cpu_id & ~0xf) == 0x670 ||
 			     (cpu_id & ~0xf) == 0x680)) {
 				u_int regs[4];
 				wrmsr(MSR_HWCR, rdmsr(MSR_HWCR) & ~0x08000);
 				do_cpuid(1, regs);
 				cpu_feature = regs[3];
 			}
 			break;
 #endif
 		case CPU_VENDOR_CENTAUR:
 			init_via();
 			break;
 		case CPU_VENDOR_TRANSMETA:
 			init_transmeta();
 			break;
 		}
 		break;
 #endif
 	default:
 		break;
 	}
 	if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) {
 		load_cr4(rcr4() | CR4_FXSR | CR4_XMM);
 		cpu_fxsr = hw_instruction_sse = 1;
 	}
 	if (elf32_nxstack) {
 		msr = rdmsr(MSR_EFER) | EFER_NXE;
 		wrmsr(MSR_EFER, msr);
 	}
+	hw_mds_recalculate();
 	if ((amd_feature & AMDID_RDTSCP) != 0 ||
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0)
 		wrmsr(MSR_TSC_AUX, PCPU_GET(cpuid));
 }
 
 void
 initializecpucache(void)
 {
 
 	/*
 	 * CPUID with %eax = 1, %ebx returns
 	 * Bits 15-8: CLFLUSH line size
 	 * 	(Value * 8 = cache line size in bytes)
 	 */
 	if ((cpu_feature & CPUID_CLFSH) != 0)
 		cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8;
 	/*
 	 * XXXKIB: (temporary) hack to work around traps generated
 	 * when CLFLUSHing APIC register window under virtualization
 	 * environments.  These environments tend to disable the
 	 * CPUID_SS feature even though the native CPU supports it.
 	 */
 	TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable);
 	if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1) {
 		cpu_feature &= ~CPUID_CLFSH;
 		cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT;
 	}
 	/*
 	 * The kernel's use of CLFLUSH{,OPT} can be disabled manually
 	 * by setting the hw.clflush_disable tunable.
 	 */
 	if (hw_clflush_disable == 1) {
 		cpu_feature &= ~CPUID_CLFSH;
 		cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT;
 	}
 }
 
 #if defined(I586_CPU) && defined(CPU_WT_ALLOC)
 /*
  * Enable write allocate feature of AMD processors.
  * Following two functions require the Maxmem variable being set.
  */
 static void
 enable_K5_wt_alloc(void)
 {
 	u_int64_t	msr;
 	register_t	saveintr;
 
 	/*
 	 * Write allocate is supported only on models 1, 2, and 3, with
 	 * a stepping of 4 or greater.
 	 */
 	if (((cpu_id & 0xf0) > 0) && ((cpu_id & 0x0f) > 3)) {
 		saveintr = intr_disable();
 		msr = rdmsr(0x83);		/* HWCR */
 		wrmsr(0x83, msr & !(0x10));
 
 		/*
 		 * We have to tell the chip where the top of memory is,
 		 * since video cards could have frame bufferes there,
 		 * memory-mapped I/O could be there, etc.
 		 */
 		if(Maxmem > 0)
 		  msr = Maxmem / 16;
 		else
 		  msr = 0;
 		msr |= AMD_WT_ALLOC_TME | AMD_WT_ALLOC_FRE;
 
 		/*
 		 * There is no way to know wheter 15-16M hole exists or not. 
 		 * Therefore, we disable write allocate for this range.
 		 */
 		wrmsr(0x86, 0x0ff00f0);
 		msr |= AMD_WT_ALLOC_PRE;
 		wrmsr(0x85, msr);
 
 		msr=rdmsr(0x83);
 		wrmsr(0x83, msr|0x10); /* enable write allocate */
 		intr_restore(saveintr);
 	}
 }
 
 static void
 enable_K6_wt_alloc(void)
 {
 	quad_t	size;
 	u_int64_t	whcr;
 	register_t	saveintr;
 
 	saveintr = intr_disable();
 	wbinvd();
 
 #ifdef CPU_DISABLE_CACHE
 	/*
 	 * Certain K6-2 box becomes unstable when write allocation is
 	 * enabled.
 	 */
 	/*
 	 * The AMD-K6 processer provides the 64-bit Test Register 12(TR12),
 	 * but only the Cache Inhibit(CI) (bit 3 of TR12) is suppported.
 	 * All other bits in TR12 have no effect on the processer's operation.
 	 * The I/O Trap Restart function (bit 9 of TR12) is always enabled
 	 * on the AMD-K6.
 	 */
 	wrmsr(0x0000000e, (u_int64_t)0x0008);
 #endif
 	/* Don't assume that memory size is aligned with 4M. */
 	if (Maxmem > 0)
 	  size = ((Maxmem >> 8) + 3) >> 2;
 	else
 	  size = 0;
 
 	/* Limit is 508M bytes. */
 	if (size > 0x7f)
 		size = 0x7f;
 	whcr = (rdmsr(0xc0000082) & ~(0x7fLL << 1)) | (size << 1);
 
 #if defined(NO_MEMORY_HOLE)
 	if (whcr & (0x7fLL << 1))
 		whcr |=  0x0001LL;
 #else
 	/*
 	 * There is no way to know wheter 15-16M hole exists or not. 
 	 * Therefore, we disable write allocate for this range.
 	 */
 	whcr &= ~0x0001LL;
 #endif
 	wrmsr(0x0c0000082, whcr);
 
 	intr_restore(saveintr);
 }
 
 static void
 enable_K6_2_wt_alloc(void)
 {
 	quad_t	size;
 	u_int64_t	whcr;
 	register_t	saveintr;
 
 	saveintr = intr_disable();
 	wbinvd();
 
 #ifdef CPU_DISABLE_CACHE
 	/*
 	 * Certain K6-2 box becomes unstable when write allocation is
 	 * enabled.
 	 */
 	/*
 	 * The AMD-K6 processer provides the 64-bit Test Register 12(TR12),
 	 * but only the Cache Inhibit(CI) (bit 3 of TR12) is suppported.
 	 * All other bits in TR12 have no effect on the processer's operation.
 	 * The I/O Trap Restart function (bit 9 of TR12) is always enabled
 	 * on the AMD-K6.
 	 */
 	wrmsr(0x0000000e, (u_int64_t)0x0008);
 #endif
 	/* Don't assume that memory size is aligned with 4M. */
 	if (Maxmem > 0)
 	  size = ((Maxmem >> 8) + 3) >> 2;
 	else
 	  size = 0;
 
 	/* Limit is 4092M bytes. */
 	if (size > 0x3fff)
 		size = 0x3ff;
 	whcr = (rdmsr(0xc0000082) & ~(0x3ffLL << 22)) | (size << 22);
 
 #if defined(NO_MEMORY_HOLE)
 	if (whcr & (0x3ffLL << 22))
 		whcr |=  1LL << 16;
 #else
 	/*
 	 * There is no way to know wheter 15-16M hole exists or not. 
 	 * Therefore, we disable write allocate for this range.
 	 */
 	whcr &= ~(1LL << 16);
 #endif
 	wrmsr(0x0c0000082, whcr);
 
 	intr_restore(saveintr);
 }
 #endif /* I585_CPU && CPU_WT_ALLOC */
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 
 DB_SHOW_COMMAND(cyrixreg, cyrixreg)
 {
 	register_t saveintr;
 	u_int	cr0;
 	u_char	ccr1, ccr2, ccr3;
 	u_char	ccr0 = 0, ccr4 = 0, ccr5 = 0, pcr0 = 0;
 
 	cr0 = rcr0();
 	if (cpu_vendor_id == CPU_VENDOR_CYRIX) {
 		saveintr = intr_disable();
 
 
 		if ((cpu != CPU_M1SC) && (cpu != CPU_CY486DX)) {
 			ccr0 = read_cyrix_reg(CCR0);
 		}
 		ccr1 = read_cyrix_reg(CCR1);
 		ccr2 = read_cyrix_reg(CCR2);
 		ccr3 = read_cyrix_reg(CCR3);
 		if ((cpu == CPU_M1SC) || (cpu == CPU_M1) || (cpu == CPU_M2)) {
 			write_cyrix_reg(CCR3, CCR3_MAPEN0);
 			ccr4 = read_cyrix_reg(CCR4);
 			if ((cpu == CPU_M1) || (cpu == CPU_M2))
 				ccr5 = read_cyrix_reg(CCR5);
 			else
 				pcr0 = read_cyrix_reg(PCR0);
 			write_cyrix_reg(CCR3, ccr3);		/* Restore CCR3. */
 		}
 		intr_restore(saveintr);
 
 		if ((cpu != CPU_M1SC) && (cpu != CPU_CY486DX))
 			printf("CCR0=%x, ", (u_int)ccr0);
 
 		printf("CCR1=%x, CCR2=%x, CCR3=%x",
 			(u_int)ccr1, (u_int)ccr2, (u_int)ccr3);
 		if ((cpu == CPU_M1SC) || (cpu == CPU_M1) || (cpu == CPU_M2)) {
 			printf(", CCR4=%x, ", (u_int)ccr4);
 			if (cpu == CPU_M1SC)
 				printf("PCR0=%x\n", pcr0);
 			else
 				printf("CCR5=%x\n", ccr5);
 		}
 	}
 	printf("CR0=%x\n", cr0);
 }
 #endif /* DDB */
Index: projects/runtime-coverage-v2/sys/i386/i386/support.s
===================================================================
--- projects/runtime-coverage-v2/sys/i386/i386/support.s	(revision 347598)
+++ projects/runtime-coverage-v2/sys/i386/i386/support.s	(revision 347599)
@@ -1,474 +1,658 @@
 /*-
  * Copyright (c) 1993 The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <machine/asmacros.h>
 #include <machine/cputypes.h>
 #include <machine/pmap.h>
 #include <machine/specialreg.h>
 
 #include "assym.inc"
 
 #define IDXSHIFT	10
 
 	.text
 
 /*
  * bcopy family
  * void bzero(void *buf, u_int len)
  */
 ENTRY(bzero)
 	pushl	%edi
 	movl	8(%esp),%edi
 	movl	12(%esp),%ecx
 	xorl	%eax,%eax
 	shrl	$2,%ecx
 	rep
 	stosl
 	movl	12(%esp),%ecx
 	andl	$3,%ecx
 	rep
 	stosb
 	popl	%edi
 	ret
 END(bzero)
 
 ENTRY(sse2_pagezero)
 	pushl	%ebx
 	movl	8(%esp),%ecx
 	movl	%ecx,%eax
 	addl	$4096,%eax
 	xor	%ebx,%ebx
 	jmp	1f
 	/*
 	 * The loop takes 14 bytes.  Ensure that it doesn't cross a 16-byte
 	 * cache line.
 	 */
 	.p2align 4,0x90
 1:
 	movnti	%ebx,(%ecx)
 	movnti	%ebx,4(%ecx)
 	addl	$8,%ecx
 	cmpl	%ecx,%eax
 	jne	1b
 	sfence
 	popl	%ebx
 	ret
 END(sse2_pagezero)
 
 ENTRY(i686_pagezero)
 	pushl	%edi
 	pushl	%ebx
 
 	movl	12(%esp),%edi
 	movl	$1024,%ecx
 
 	ALIGN_TEXT
 1:
 	xorl	%eax,%eax
 	repe
 	scasl
 	jnz	2f
 
 	popl	%ebx
 	popl	%edi
 	ret
 
 	ALIGN_TEXT
 
 2:
 	incl	%ecx
 	subl	$4,%edi
 
 	movl	%ecx,%edx
 	cmpl	$16,%ecx
 
 	jge	3f
 
 	movl	%edi,%ebx
 	andl	$0x3f,%ebx
 	shrl	%ebx
 	shrl	%ebx
 	movl	$16,%ecx
 	subl	%ebx,%ecx
 
 3:
 	subl	%ecx,%edx
 	rep
 	stosl
 
 	movl	%edx,%ecx
 	testl	%edx,%edx
 	jnz	1b
 
 	popl	%ebx
 	popl	%edi
 	ret
 END(i686_pagezero)
 
 /* fillw(pat, base, cnt) */
 ENTRY(fillw)
 	pushl	%edi
 	movl	8(%esp),%eax
 	movl	12(%esp),%edi
 	movl	16(%esp),%ecx
 	rep
 	stosw
 	popl	%edi
 	ret
 END(fillw)
 
 /*
  * memmove(dst, src, cnt) (return dst)
  * bcopy(src, dst, cnt)
  *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
  */
 ENTRY(bcopy)
 	movl	4(%esp),%eax
 	movl	8(%esp),%edx
 	movl	%eax,8(%esp)
 	movl	%edx,4(%esp)
 	MEXITCOUNT
 	jmp	memmove
 END(bcopy)
 
 ENTRY(memmove)
 	pushl	%ebp
 	movl	%esp,%ebp
 	pushl	%esi
 	pushl	%edi
 	movl	8(%ebp),%edi
 	movl	12(%ebp),%esi
 1:
 	movl	16(%ebp),%ecx
 
 	movl	%edi,%eax
 	subl	%esi,%eax
 	cmpl	%ecx,%eax			/* overlapping && src < dst? */
 	jb	1f
 
 	shrl	$2,%ecx				/* copy by 32-bit words */
 	rep
 	movsl
 	movl	16(%ebp),%ecx
 	andl	$3,%ecx				/* any bytes left? */
 	rep
 	movsb
 	popl	%edi
 	popl	%esi
 	movl	8(%ebp),%eax			/* return dst for memmove */
 	popl	%ebp
 	ret
 
 	ALIGN_TEXT
 1:
 	addl	%ecx,%edi			/* copy backwards */
 	addl	%ecx,%esi
 	decl	%edi
 	decl	%esi
 	andl	$3,%ecx				/* any fractional bytes? */
 	std
 	rep
 	movsb
 	movl	16(%ebp),%ecx			/* copy remainder by 32-bit words */
 	shrl	$2,%ecx
 	subl	$3,%esi
 	subl	$3,%edi
 	rep
 	movsl
 	popl	%edi
 	popl	%esi
 	cld
 	movl	8(%ebp),%eax			/* return dst for memmove */
 	popl	%ebp
 	ret
 END(memmove)
 
 /*
  * Note: memcpy does not support overlapping copies
  */
 ENTRY(memcpy)
 	pushl	%edi
 	pushl	%esi
 	movl	12(%esp),%edi
 	movl	16(%esp),%esi
 	movl	20(%esp),%ecx
 	movl	%edi,%eax
 	shrl	$2,%ecx				/* copy by 32-bit words */
 	rep
 	movsl
 	movl	20(%esp),%ecx
 	andl	$3,%ecx				/* any bytes left? */
 	rep
 	movsb
 	popl	%esi
 	popl	%edi
 	ret
 END(memcpy)
 
 /*
  * copystr(from, to, maxlen, int *lencopied) - MP SAFE
  */
 ENTRY(copystr)
 	pushl	%esi
 	pushl	%edi
 
 	movl	12(%esp),%esi			/* %esi = from */
 	movl	16(%esp),%edi			/* %edi = to */
 	movl	20(%esp),%edx			/* %edx = maxlen */
 	incl	%edx
 1:
 	decl	%edx
 	jz	4f
 	lodsb
 	stosb
 	orb	%al,%al
 	jnz	1b
 
 	/* Success -- 0 byte reached */
 	decl	%edx
 	xorl	%eax,%eax
 	jmp	6f
 4:
 	/* edx is zero -- return ENAMETOOLONG */
 	movl	$ENAMETOOLONG,%eax
 
 6:
 	/* set *lencopied and return %eax */
 	movl	20(%esp),%ecx
 	subl	%edx,%ecx
 	movl	24(%esp),%edx
 	testl	%edx,%edx
 	jz	7f
 	movl	%ecx,(%edx)
 7:
 	popl	%edi
 	popl	%esi
 	ret
 END(copystr)
 
 ENTRY(bcmp)
 	pushl	%edi
 	pushl	%esi
 	movl	12(%esp),%edi
 	movl	16(%esp),%esi
 	movl	20(%esp),%edx
 
 	movl	%edx,%ecx
 	shrl	$2,%ecx
 	repe
 	cmpsl
 	jne	1f
 
 	movl	%edx,%ecx
 	andl	$3,%ecx
 	repe
 	cmpsb
 1:
 	setne	%al
 	movsbl	%al,%eax
 	popl	%esi
 	popl	%edi
 	ret
 END(bcmp)
 
 /*
  * Handling of special 386 registers and descriptor tables etc
  */
 /* void lgdt(struct region_descriptor *rdp); */
 ENTRY(lgdt)
 	/* reload the descriptor table */
 	movl	4(%esp),%eax
 	lgdt	(%eax)
 
 	/* flush the prefetch q */
 	jmp	1f
 	nop
 1:
 	/* reload "stale" selectors */
 	movl	$KDSEL,%eax
 	movl	%eax,%ds
 	movl	%eax,%es
 	movl	%eax,%gs
 	movl	%eax,%ss
 	movl	$KPSEL,%eax
 	movl	%eax,%fs
 
 	/* reload code selector by turning return into intersegmental return */
 	movl	(%esp),%eax
 	pushl	%eax
 	movl	$KCSEL,4(%esp)
 	MEXITCOUNT
 	lret
 END(lgdt)
 
 /* ssdtosd(*ssdp,*sdp) */
 ENTRY(ssdtosd)
 	pushl	%ebx
 	movl	8(%esp),%ecx
 	movl	8(%ecx),%ebx
 	shll	$16,%ebx
 	movl	(%ecx),%edx
 	roll	$16,%edx
 	movb	%dh,%bl
 	movb	%dl,%bh
 	rorl	$8,%ebx
 	movl	4(%ecx),%eax
 	movw	%ax,%dx
 	andl	$0xf0000,%eax
 	orl	%eax,%ebx
 	movl	12(%esp),%ecx
 	movl	%edx,(%ecx)
 	movl	%ebx,4(%ecx)
 	popl	%ebx
 	ret
 END(ssdtosd)
 
 /* void reset_dbregs() */
 ENTRY(reset_dbregs)
 	movl	$0,%eax
 	movl	%eax,%dr7	/* disable all breakpoints first */
 	movl	%eax,%dr0
 	movl	%eax,%dr1
 	movl	%eax,%dr2
 	movl	%eax,%dr3
 	movl	%eax,%dr6
 	ret
 END(reset_dbregs)
 
 /*****************************************************************************/
 /* setjump, longjump                                                         */
 /*****************************************************************************/
 
 ENTRY(setjmp)
 	movl	4(%esp),%eax
 	movl	%ebx,(%eax)			/* save ebx */
 	movl	%esp,4(%eax)			/* save esp */
 	movl	%ebp,8(%eax)			/* save ebp */
 	movl	%esi,12(%eax)			/* save esi */
 	movl	%edi,16(%eax)			/* save edi */
 	movl	(%esp),%edx			/* get rta */
 	movl	%edx,20(%eax)			/* save eip */
 	xorl	%eax,%eax			/* return(0); */
 	ret
 END(setjmp)
 
 ENTRY(longjmp)
 	movl	4(%esp),%eax
 	movl	(%eax),%ebx			/* restore ebx */
 	movl	4(%eax),%esp			/* restore esp */
 	movl	8(%eax),%ebp			/* restore ebp */
 	movl	12(%eax),%esi			/* restore esi */
 	movl	16(%eax),%edi			/* restore edi */
 	movl	20(%eax),%edx			/* get rta */
 	movl	%edx,(%esp)			/* put in return frame */
 	xorl	%eax,%eax			/* return(1); */
 	incl	%eax
 	ret
 END(longjmp)
 
 /*
  * Support for reading MSRs in the safe manner.  (Instead of panic on #gp,
  * return an error.)
  */
 ENTRY(rdmsr_safe)
 /* int rdmsr_safe(u_int msr, uint64_t *data) */
 	movl	PCPU(CURPCB),%ecx
 	movl	$msr_onfault,PCB_ONFAULT(%ecx)
 
 	movl	4(%esp),%ecx
 	rdmsr
 	movl	8(%esp),%ecx
 	movl	%eax,(%ecx)
 	movl	%edx,4(%ecx)
 	xorl	%eax,%eax
 
 	movl	PCPU(CURPCB),%ecx
 	movl	%eax,PCB_ONFAULT(%ecx)
 
 	ret
 
 /*
  * Support for writing MSRs in the safe manner.  (Instead of panic on #gp,
  * return an error.)
  */
 ENTRY(wrmsr_safe)
 /* int wrmsr_safe(u_int msr, uint64_t data) */
 	movl	PCPU(CURPCB),%ecx
 	movl	$msr_onfault,PCB_ONFAULT(%ecx)
 
 	movl	4(%esp),%ecx
 	movl	8(%esp),%eax
 	movl	12(%esp),%edx
 	wrmsr
 	xorl	%eax,%eax
 
 	movl	PCPU(CURPCB),%ecx
 	movl	%eax,PCB_ONFAULT(%ecx)
 
 	ret
 
 /*
  * MSR operations fault handler
  */
 	ALIGN_TEXT
 msr_onfault:
 	movl	PCPU(CURPCB),%ecx
 	movl	$0,PCB_ONFAULT(%ecx)
 	movl	$EFAULT,%eax
 	ret
 
 ENTRY(handle_ibrs_entry)
 	cmpb	$0,hw_ibrs_active
 	je	1f
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	orl	$(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
 	orl	$(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32,%edx
 	wrmsr
 	movb	$1,PCPU(IBPB_SET)
 	/*
 	 * i386 does not implement SMEP, but the 4/4 split makes this not
 	 * that important.
 	 */
 1:	ret
 END(handle_ibrs_entry)
 
 ENTRY(handle_ibrs_exit)
 	cmpb	$0,PCPU(IBPB_SET)
 	je	1f
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	andl	$~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
 	andl	$~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx
 	wrmsr
 	movb	$0,PCPU(IBPB_SET)
 1:	ret
 END(handle_ibrs_exit)
+
+ENTRY(mds_handler_void)
+	ret
+END(mds_handler_void)
+
+ENTRY(mds_handler_verw)
+	subl	$4, %esp
+	movw	%ds, (%esp)
+	verw	(%esp)
+	addl	$4, %esp
+	ret
+END(mds_handler_verw)
+
+ENTRY(mds_handler_ivb)
+	movl	%cr0, %eax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movl	PCPU(MDS_BUF), %edx
+	movdqa	%xmm0, PCPU(MDS_TMP)
+	pxor	%xmm0, %xmm0
+
+	lfence
+	orpd	(%edx), %xmm0
+	orpd	(%edx), %xmm0
+	mfence
+	movl	$40, %ecx
+	addl	$16, %edx
+2:	movntdq	%xmm0, (%edx)
+	addl	$16, %edx
+	decl	%ecx
+	jnz	2b
+	mfence
+
+	movdqa	PCPU(MDS_TMP),%xmm0
+	testb	$CR0_TS, %al
+	je	3f
+	movl	%eax, %cr0
+3:	ret
+END(mds_handler_ivb)
+
+ENTRY(mds_handler_bdw)
+	movl	%cr0, %eax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movl	PCPU(MDS_BUF), %ebx
+	movdqa	%xmm0, PCPU(MDS_TMP)
+	pxor	%xmm0, %xmm0
+
+	movl	%ebx, %edi
+	movl	%ebx, %esi
+	movl	$40, %ecx
+2:	movntdq	%xmm0, (%ebx)
+	addl	$16, %ebx
+	decl	%ecx
+	jnz	2b
+	mfence
+	movl	$1536, %ecx
+	rep; movsb
+	lfence
+
+	movdqa	PCPU(MDS_TMP),%xmm0
+	testb	$CR0_TS, %al
+	je	3f
+	movl	%eax, %cr0
+3:	ret
+END(mds_handler_bdw)
+
+ENTRY(mds_handler_skl_sse)
+	movl	%cr0, %eax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movl	PCPU(MDS_BUF), %edi
+	movl	PCPU(MDS_BUF64), %edx
+	movdqa	%xmm0, PCPU(MDS_TMP)
+	pxor	%xmm0, %xmm0
+
+	lfence
+	orpd	(%edx), %xmm0
+	orpd	(%edx), %xmm0
+	xorl	%eax, %eax
+2:	clflushopt	5376(%edi, %eax, 8)
+	addl	$8, %eax
+	cmpl	$8 * 12, %eax
+	jb	2b
+	sfence
+	movl	$6144, %ecx
+	xorl	%eax, %eax
+	rep; stosb
+	mfence
+
+	movdqa	PCPU(MDS_TMP), %xmm0
+	testb	$CR0_TS, %al
+	je	3f
+	movl	%eax, %cr0
+3:	ret
+END(mds_handler_skl_sse)
+
+ENTRY(mds_handler_skl_avx)
+	movl	%cr0, %eax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movl	PCPU(MDS_BUF), %edi
+	movl	PCPU(MDS_BUF64), %edx
+	vmovdqa	%ymm0, PCPU(MDS_TMP)
+	vpxor	%ymm0, %ymm0, %ymm0
+
+	lfence
+	vorpd	(%edx), %ymm0, %ymm0
+	vorpd	(%edx), %ymm0, %ymm0
+	xorl	%eax, %eax
+2:	clflushopt	5376(%edi, %eax, 8)
+	addl	$8, %eax
+	cmpl	$8 * 12, %eax
+	jb	2b
+	sfence
+	movl	$6144, %ecx
+	xorl	%eax, %eax
+	rep; stosb
+	mfence
+
+	vmovdqa	PCPU(MDS_TMP), %ymm0
+	testb	$CR0_TS, %al
+	je	3f
+	movl	%eax, %cr0
+3:	ret
+END(mds_handler_skl_avx)
+
+ENTRY(mds_handler_skl_avx512)
+	movl	%cr0, %eax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movl	PCPU(MDS_BUF), %edi
+	movl	PCPU(MDS_BUF64), %edx
+	vmovdqa64	%zmm0, PCPU(MDS_TMP)
+	vpxor	%zmm0, %zmm0, %zmm0
+
+	lfence
+	vorpd	(%edx), %zmm0, %zmm0
+	vorpd	(%edx), %zmm0, %zmm0
+	xorl	%eax, %eax
+2:	clflushopt	5376(%edi, %eax, 8)
+	addl	$8, %eax
+	cmpl	$8 * 12, %eax
+	jb	2b
+	sfence
+	movl	$6144, %ecx
+	xorl	%eax, %eax
+	rep; stosb
+	mfence
+
+	vmovdqa64	PCPU(MDS_TMP), %zmm0
+	testb	$CR0_TS, %al
+	je	3f
+	movl	%eax, %cr0
+3:	ret
+END(mds_handler_skl_avx512)
+
+ENTRY(mds_handler_silvermont)
+	movl	%cr0, %eax
+	testb	$CR0_TS, %al
+	je	1f
+	clts
+1:	movl	PCPU(MDS_BUF), %edx
+	movdqa	%xmm0, PCPU(MDS_TMP)
+	pxor	%xmm0, %xmm0
+
+	movl	$16, %ecx
+2:	movntdq	%xmm0, (%edx)
+	addl	$16, %edx
+	decl	%ecx
+	jnz	2b
+	mfence
+
+	movdqa	PCPU(MDS_TMP),%xmm0
+	testb	$CR0_TS, %al
+	je	3f
+	movl	%eax, %cr0
+3:	ret
+END(mds_handler_silvermont)
Index: projects/runtime-coverage-v2/sys/i386/include/pcpu.h
===================================================================
--- projects/runtime-coverage-v2/sys/i386/include/pcpu.h	(revision 347598)
+++ projects/runtime-coverage-v2/sys/i386/include/pcpu.h	(revision 347599)
@@ -1,267 +1,271 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _MACHINE_PCPU_H_
 #define	_MACHINE_PCPU_H_
 
 #ifndef _SYS_CDEFS_H_
 #error "sys/cdefs.h is a prerequisite for this file"
 #endif
 
 #include <machine/segments.h>
 #include <machine/tss.h>
 
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
 struct monitorbuf {
 	int idle_state;		/* Used by cpu_idle_mwait. */
 	int stop_state;		/* Used by cpustop_handler. */
 	char padding[128 - (2 * sizeof(int))];
 };
 _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line");
 
 /*
  * The SMP parts are setup in pmap.c and machdep.c for the BSP, and
  * pmap.c and mp_machdep.c sets up the data for the AP's to "see" when
  * they awake.  The reason for doing it via a struct is so that an
  * array of pointers to each CPU's data can be set up for things like
  * "check curproc on all other processors"
  */
 
 #define	PCPU_MD_FIELDS							\
 	struct monitorbuf pc_monitorbuf __aligned(128);	/* cache line */\
 	struct	pcpu *pc_prvspace;	/* Self-reference */		\
 	struct	pmap *pc_curpmap;					\
 	struct	segment_descriptor pc_common_tssd;			\
 	struct	segment_descriptor *pc_tss_gdt;				\
 	struct	segment_descriptor *pc_fsgs_gdt;			\
 	struct	i386tss *pc_common_tssp;				\
 	u_int	pc_kesp0;						\
 	u_int	pc_trampstk;						\
 	int	pc_currentldt;						\
 	u_int   pc_acpi_id;		/* ACPI CPU id */		\
 	u_int	pc_apic_id;						\
 	int	pc_private_tss;		/* Flag indicating private tss*/\
 	u_int	pc_cmci_mask;		/* MCx banks for CMCI */	\
 	u_int	pc_vcpu_id;		/* Xen vCPU ID */		\
 	struct	mtx pc_cmap_lock;					\
 	void	*pc_cmap_pte1;						\
 	void	*pc_cmap_pte2;						\
 	caddr_t	pc_cmap_addr1;						\
 	caddr_t	pc_cmap_addr2;						\
 	vm_offset_t pc_qmap_addr;	/* KVA for temporary mappings */\
 	vm_offset_t pc_copyout_maddr;					\
 	vm_offset_t pc_copyout_saddr;					\
 	struct	mtx pc_copyout_mlock;					\
 	struct	sx pc_copyout_slock;					\
 	char	*pc_copyout_buf;					\
 	vm_offset_t pc_pmap_eh_va;					\
-	caddr_t pc_pmap_eh_ptep;						\
+	caddr_t pc_pmap_eh_ptep;					\
 	uint32_t pc_smp_tlb_done;	/* TLB op acknowledgement */	\
 	uint32_t pc_ibpb_set;						\
+	void	*pc_mds_buf;						\
+	void	*pc_mds_buf64;						\
+	uint32_t pc_pad[4];						\
+	uint8_t	pc_mds_tmp[64];						\
 	u_int	pc_ipi_bitmap;						\
-	char	__pad[3606]
+	char	__pad[3518]
 
 #ifdef _KERNEL
 
 #define MONITOR_STOPSTATE_RUNNING	0
 #define MONITOR_STOPSTATE_STOPPED	1
 
 #if defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF)
 
 /*
  * Evaluates to the byte offset of the per-cpu variable name.
  */
 #define	__pcpu_offset(name)						\
 	__offsetof(struct pcpu, name)
 
 /*
  * Evaluates to the type of the per-cpu variable name.
  */
 #define	__pcpu_type(name)						\
 	__typeof(((struct pcpu *)0)->name)
 
 /*
  * Evaluates to the address of the per-cpu variable name.
  */
 #define	__PCPU_PTR(name) __extension__ ({				\
 	__pcpu_type(name) *__p;						\
 									\
 	__asm __volatile("movl %%fs:%1,%0; addl %2,%0"			\
 	    : "=r" (__p)						\
 	    : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))),	\
 	      "i" (__pcpu_offset(name)));				\
 									\
 	__p;								\
 })
 
 /*
  * Evaluates to the value of the per-cpu variable name.
  */
 #define	__PCPU_GET(name) __extension__ ({				\
 	__pcpu_type(name) __res;					\
 	struct __s {							\
 		u_char	__b[MIN(sizeof(__res), 4)];			\
 	} __s;								\
 									\
 	if (sizeof(__res) == 1 || sizeof(__res) == 2 ||			\
 	    sizeof(__res) == 4) {					\
 		__asm __volatile("mov %%fs:%1,%0"			\
 		    : "=r" (__s)					\
 		    : "m" (*(struct __s *)(__pcpu_offset(name))));	\
 		*(struct __s *)(void *)&__res = __s;			\
 	} else {							\
 		__res = *__PCPU_PTR(name);				\
 	}								\
 	__res;								\
 })
 
 /*
  * Adds a value of the per-cpu counter name.  The implementation
  * must be atomic with respect to interrupts.
  */
 #define	__PCPU_ADD(name, val) do {					\
 	__pcpu_type(name) __val;					\
 	struct __s {							\
 		u_char	__b[MIN(sizeof(__val), 4)];			\
 	} __s;								\
 									\
 	__val = (val);							\
 	if (sizeof(__val) == 1 || sizeof(__val) == 2 ||			\
 	    sizeof(__val) == 4) {					\
 		__s = *(struct __s *)(void *)&__val;			\
 		__asm __volatile("add %1,%%fs:%0"			\
 		    : "=m" (*(struct __s *)(__pcpu_offset(name)))	\
 		    : "r" (__s));					\
 	} else								\
 		*__PCPU_PTR(name) += __val;				\
 } while (0)
 
 /*
  * Increments the value of the per-cpu counter name.  The implementation
  * must be atomic with respect to interrupts.
  */
 #define	__PCPU_INC(name) do {						\
 	CTASSERT(sizeof(__pcpu_type(name)) == 1 ||			\
 	    sizeof(__pcpu_type(name)) == 2 ||				\
 	    sizeof(__pcpu_type(name)) == 4);				\
 	if (sizeof(__pcpu_type(name)) == 1) {				\
 		__asm __volatile("incb %%fs:%0"				\
 		    : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
 		    : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
 	} else if (sizeof(__pcpu_type(name)) == 2) {			\
 		__asm __volatile("incw %%fs:%0"				\
 		    : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
 		    : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
 	} else if (sizeof(__pcpu_type(name)) == 4) {			\
 		__asm __volatile("incl %%fs:%0"				\
 		    : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
 		    : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
 	}								\
 } while (0)
 
 /*
  * Sets the value of the per-cpu variable name to value val.
  */
 #define	__PCPU_SET(name, val) do {					\
 	__pcpu_type(name) __val;					\
 	struct __s {							\
 		u_char	__b[MIN(sizeof(__val), 4)];			\
 	} __s;								\
 									\
 	__val = (val);							\
 	if (sizeof(__val) == 1 || sizeof(__val) == 2 ||			\
 	    sizeof(__val) == 4) {					\
 		__s = *(struct __s *)(void *)&__val;			\
 		__asm __volatile("mov %1,%%fs:%0"			\
 		    : "=m" (*(struct __s *)(__pcpu_offset(name)))	\
 		    : "r" (__s));					\
 	} else {							\
 		*__PCPU_PTR(name) = __val;				\
 	}								\
 } while (0)
 
 #define	get_pcpu() __extension__ ({					\
 	struct pcpu *__pc;						\
 									\
 	__asm __volatile("movl %%fs:%1,%0"				\
 	    : "=r" (__pc)						\
 	    : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))));	\
 	__pc;								\
 })
 
 #define	PCPU_GET(member)	__PCPU_GET(pc_ ## member)
 #define	PCPU_ADD(member, val)	__PCPU_ADD(pc_ ## member, val)
 #define	PCPU_INC(member)	__PCPU_INC(pc_ ## member)
 #define	PCPU_PTR(member)	__PCPU_PTR(pc_ ## member)
 #define	PCPU_SET(member, val)	__PCPU_SET(pc_ ## member, val)
 
 #define	OFFSETOF_CURTHREAD	0
 #ifdef __clang__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wnull-dereference"
 #endif
 static __inline __pure2 struct thread *
 __curthread(void)
 {
 	struct thread *td;
 
 	__asm("movl %%fs:%1,%0" : "=r" (td)
 	    : "m" (*(char *)OFFSETOF_CURTHREAD));
 	return (td);
 }
 #ifdef __clang__
 #pragma clang diagnostic pop
 #endif
 #define	curthread		(__curthread())
 
 #define	OFFSETOF_CURPCB		16
 static __inline __pure2 struct pcb *
 __curpcb(void)
 {
 	struct pcb *pcb;
 
 	__asm("movl %%fs:%1,%0" : "=r" (pcb) : "m" (*(char *)OFFSETOF_CURPCB));
 	return (pcb);
 }
 #define	curpcb		(__curpcb())
 
 #define	IS_BSP()	(PCPU_GET(cpuid) == 0)
 
 #else /* defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) */
 
 #error "this file needs to be ported to your compiler"
 
 #endif /* __GNUCLIKE_ASM etc. */
 
 #endif /* _KERNEL */
 
 #endif /* !_MACHINE_PCPU_H_ */
Index: projects/runtime-coverage-v2/sys/kern/subr_witness.c
===================================================================
--- projects/runtime-coverage-v2/sys/kern/subr_witness.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/kern/subr_witness.c	(revision 347599)
@@ -1,3084 +1,3083 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2008 Isilon Systems, Inc.
  * Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
  * Copyright (c) 1998 Berkeley Software Design, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  *    promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
  *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
  */
 
 /*
  * Implementation of the `witness' lock verifier.  Originally implemented for
  * mutexes in BSD/OS.  Extended to handle generic lock objects and lock
  * classes in FreeBSD.
  */
 
 /*
  *	Main Entry: witness
  *	Pronunciation: 'wit-n&s
  *	Function: noun
  *	Etymology: Middle English witnesse, from Old English witnes knowledge,
  *	    testimony, witness, from 2wit
  *	Date: before 12th century
  *	1 : attestation of a fact or event : TESTIMONY
  *	2 : one that gives evidence; specifically : one who testifies in
  *	    a cause or before a judicial tribunal
  *	3 : one asked to be present at a transaction so as to be able to
  *	    testify to its having taken place
  *	4 : one who has personal knowledge of something
  *	5 a : something serving as evidence or proof : SIGN
  *	  b : public affirmation by word or example of usually
  *	      religious faith or conviction <the heroic witness to divine
  *	      life -- Pilot>
  *	6 capitalized : a member of the Jehovah's Witnesses 
  */
 
 /*
  * Special rules concerning Giant and lock orders:
  *
  * 1) Giant must be acquired before any other mutexes.  Stated another way,
  *    no other mutex may be held when Giant is acquired.
  *
  * 2) Giant must be released when blocking on a sleepable lock.
  *
  * This rule is less obvious, but is a result of Giant providing the same
  * semantics as spl().  Basically, when a thread sleeps, it must release
  * Giant.  When a thread blocks on a sleepable lock, it sleeps.  Hence rule
  * 2).
  *
  * 3) Giant may be acquired before or after sleepable locks.
  *
  * This rule is also not quite as obvious.  Giant may be acquired after
  * a sleepable lock because it is a non-sleepable lock and non-sleepable
  * locks may always be acquired while holding a sleepable lock.  The second
  * case, Giant before a sleepable lock, follows from rule 2) above.  Suppose
  * you have two threads T1 and T2 and a sleepable lock X.  Suppose that T1
  * acquires X and blocks on Giant.  Then suppose that T2 acquires Giant and
  * blocks on X.  When T2 blocks on X, T2 will release Giant allowing T1 to
  * execute.  Thus, acquiring Giant both before and after a sleepable lock
  * will not result in a lock order reversal.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_stack.h"
 #include "opt_witness.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/stack.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <machine/stdarg.h>
 
 #if !defined(DDB) && !defined(STACK)
 #error "DDB or STACK options are required for WITNESS"
 #endif
 
 /* Note that these traces do not work with KTR_ALQ. */
 #if 0
 #define	KTR_WITNESS	KTR_SUBSYS
 #else
 #define	KTR_WITNESS	0
 #endif
 
 #define	LI_RECURSEMASK	0x0000ffff	/* Recursion depth of lock instance. */
 #define	LI_EXCLUSIVE	0x00010000	/* Exclusive lock instance. */
 #define	LI_NORELEASE	0x00020000	/* Lock not allowed to be released. */
 
 /* Define this to check for blessed mutexes */
 #undef BLESSING
 
 #ifndef WITNESS_COUNT
 #define	WITNESS_COUNT 		1536
 #endif
 #define	WITNESS_HASH_SIZE	251	/* Prime, gives load factor < 2 */
 #define	WITNESS_PENDLIST	(512 + (MAXCPU * 4))
 
 /* Allocate 256 KB of stack data space */
 #define	WITNESS_LO_DATA_COUNT	2048
 
 /* Prime, gives load factor of ~2 at full load */
 #define	WITNESS_LO_HASH_SIZE	1021
 
 /*
  * XXX: This is somewhat bogus, as we assume here that at most 2048 threads
  * will hold LOCK_NCHILDREN locks.  We handle failure ok, and we should
  * probably be safe for the most part, but it's still a SWAG.
  */
 #define	LOCK_NCHILDREN	5
 #define	LOCK_CHILDCOUNT	2048
 
 #define	MAX_W_NAME	64
 
 #define	FULLGRAPH_SBUF_SIZE	512
 
 /*
  * These flags go in the witness relationship matrix and describe the
  * relationship between any two struct witness objects.
  */
 #define	WITNESS_UNRELATED        0x00    /* No lock order relation. */
 #define	WITNESS_PARENT           0x01    /* Parent, aka direct ancestor. */
 #define	WITNESS_ANCESTOR         0x02    /* Direct or indirect ancestor. */
 #define	WITNESS_CHILD            0x04    /* Child, aka direct descendant. */
 #define	WITNESS_DESCENDANT       0x08    /* Direct or indirect descendant. */
 #define	WITNESS_ANCESTOR_MASK    (WITNESS_PARENT | WITNESS_ANCESTOR)
 #define	WITNESS_DESCENDANT_MASK  (WITNESS_CHILD | WITNESS_DESCENDANT)
 #define	WITNESS_RELATED_MASK						\
 	(WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
 #define	WITNESS_REVERSAL         0x10    /* A lock order reversal has been
 					  * observed. */
 #define	WITNESS_RESERVED1        0x20    /* Unused flag, reserved. */
 #define	WITNESS_RESERVED2        0x40    /* Unused flag, reserved. */
 #define	WITNESS_LOCK_ORDER_KNOWN 0x80    /* This lock order is known. */
 
 /* Descendant to ancestor flags */
 #define	WITNESS_DTOA(x)	(((x) & WITNESS_RELATED_MASK) >> 2)
 
 /* Ancestor to descendant flags */
 #define	WITNESS_ATOD(x)	(((x) & WITNESS_RELATED_MASK) << 2)
 
 #define	WITNESS_INDEX_ASSERT(i)						\
 	MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count)
 
 static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness");
 
 /*
  * Lock instances.  A lock instance is the data associated with a lock while
  * it is held by witness.  For example, a lock instance will hold the
  * recursion count of a lock.  Lock instances are held in lists.  Spin locks
  * are held in a per-cpu list while sleep locks are held in per-thread list.
  */
 struct lock_instance {
 	struct lock_object	*li_lock;
 	const char		*li_file;
 	int			li_line;
 	u_int			li_flags;
 };
 
 /*
  * A simple list type used to build the list of locks held by a thread
  * or CPU.  We can't simply embed the list in struct lock_object since a
  * lock may be held by more than one thread if it is a shared lock.  Locks
  * are added to the head of the list, so we fill up each list entry from
  * "the back" logically.  To ease some of the arithmetic, we actually fill
  * in each list entry the normal way (children[0] then children[1], etc.) but
  * when we traverse the list we read children[count-1] as the first entry
  * down to children[0] as the final entry.
  */
 struct lock_list_entry {
 	struct lock_list_entry	*ll_next;
 	struct lock_instance	ll_children[LOCK_NCHILDREN];
 	u_int			ll_count;
 };
 
 /*
  * The main witness structure. One of these per named lock type in the system
  * (for example, "vnode interlock").
  */
 struct witness {
 	char  			w_name[MAX_W_NAME];
 	uint32_t 		w_index;  /* Index in the relationship matrix */
 	struct lock_class	*w_class;
 	STAILQ_ENTRY(witness) 	w_list;		/* List of all witnesses. */
 	STAILQ_ENTRY(witness) 	w_typelist;	/* Witnesses of a type. */
 	struct witness		*w_hash_next; /* Linked list in hash buckets. */
 	const char		*w_file; /* File where last acquired */
 	uint32_t 		w_line; /* Line where last acquired */
 	uint32_t 		w_refcount;
 	uint16_t 		w_num_ancestors; /* direct/indirect
 						  * ancestor count */
 	uint16_t 		w_num_descendants; /* direct/indirect
 						    * descendant count */
 	int16_t 		w_ddb_level;
 	unsigned		w_displayed:1;
 	unsigned		w_reversed:1;
 };
 
 STAILQ_HEAD(witness_list, witness);
 
 /*
  * The witness hash table. Keys are witness names (const char *), elements are
  * witness objects (struct witness *).
  */
 struct witness_hash {
 	struct witness	*wh_array[WITNESS_HASH_SIZE];
 	uint32_t	wh_size;
 	uint32_t	wh_count;
 };
 
 /*
  * Key type for the lock order data hash table.
  */
 struct witness_lock_order_key {
 	uint16_t	from;
 	uint16_t	to;
 };
 
 struct witness_lock_order_data {
 	struct stack			wlod_stack;
 	struct witness_lock_order_key	wlod_key;
 	struct witness_lock_order_data	*wlod_next;
 };
 
 /*
  * The witness lock order data hash table. Keys are witness index tuples
  * (struct witness_lock_order_key), elements are lock order data objects
  * (struct witness_lock_order_data). 
  */
 struct witness_lock_order_hash {
 	struct witness_lock_order_data	*wloh_array[WITNESS_LO_HASH_SIZE];
 	u_int	wloh_size;
 	u_int	wloh_count;
 };
 
 #ifdef BLESSING
 struct witness_blessed {
 	const char	*b_lock1;
 	const char	*b_lock2;
 };
 #endif
 
 struct witness_pendhelp {
 	const char		*wh_type;
 	struct lock_object	*wh_lock;
 };
 
 struct witness_order_list_entry {
 	const char		*w_name;
 	struct lock_class	*w_class;
 };
 
 /*
  * Returns 0 if one of the locks is a spin lock and the other is not.
  * Returns 1 otherwise.
  */
 static __inline int
 witness_lock_type_equal(struct witness *w1, struct witness *w2)
 {
 
 	return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
 		(w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
 }
 
 static __inline int
 witness_lock_order_key_equal(const struct witness_lock_order_key *a,
     const struct witness_lock_order_key *b)
 {
 
 	return (a->from == b->from && a->to == b->to);
 }
 
 static int	_isitmyx(struct witness *w1, struct witness *w2, int rmask,
 		    const char *fname);
 static void	adopt(struct witness *parent, struct witness *child);
 #ifdef BLESSING
 static int	blessed(struct witness *, struct witness *);
 #endif
 static void	depart(struct witness *w);
 static struct witness	*enroll(const char *description,
 			    struct lock_class *lock_class);
 static struct lock_instance	*find_instance(struct lock_list_entry *list,
 				    const struct lock_object *lock);
 static int	isitmychild(struct witness *parent, struct witness *child);
 static int	isitmydescendant(struct witness *parent, struct witness *child);
 static void	itismychild(struct witness *parent, struct witness *child);
 static int	sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS);
 static void	witness_add_fullgraph(struct sbuf *sb, struct witness *parent);
 #ifdef DDB
 static void	witness_ddb_compute_levels(void);
 static void	witness_ddb_display(int(*)(const char *fmt, ...));
 static void	witness_ddb_display_descendants(int(*)(const char *fmt, ...),
 		    struct witness *, int indent);
 static void	witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
 		    struct witness_list *list);
 static void	witness_ddb_level_descendants(struct witness *parent, int l);
 static void	witness_ddb_list(struct thread *td);
 #endif
 static void	witness_debugger(int cond, const char *msg);
 static void	witness_free(struct witness *m);
 static struct witness	*witness_get(void);
 static uint32_t	witness_hash_djb2(const uint8_t *key, uint32_t size);
 static struct witness	*witness_hash_get(const char *key);
 static void	witness_hash_put(struct witness *w);
 static void	witness_init_hash_tables(void);
 static void	witness_increment_graph_generation(void);
 static void	witness_lock_list_free(struct lock_list_entry *lle);
 static struct lock_list_entry	*witness_lock_list_get(void);
 static int	witness_lock_order_add(struct witness *parent,
 		    struct witness *child);
 static int	witness_lock_order_check(struct witness *parent,
 		    struct witness *child);
 static struct witness_lock_order_data	*witness_lock_order_get(
 					    struct witness *parent,
 					    struct witness *child);
 static void	witness_list_lock(struct lock_instance *instance,
 		    int (*prnt)(const char *fmt, ...));
 static int	witness_output(const char *fmt, ...) __printflike(1, 2);
 static int	witness_voutput(const char *fmt, va_list ap) __printflike(1, 0);
 static void	witness_setflag(struct lock_object *lock, int flag, int set);
 
 static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL,
     "Witness Locking");
 
 /*
  * If set to 0, lock order checking is disabled.  If set to -1,
  * witness is completely disabled.  Otherwise witness performs full
  * lock order checking for all locks.  At runtime, lock order checking
  * may be toggled.  However, witness cannot be reenabled once it is
  * completely disabled.
  */
 static int witness_watch = 1;
 SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0,
     sysctl_debug_witness_watch, "I", "witness is watching lock operations");
 
 #ifdef KDB
 /*
  * When KDB is enabled and witness_kdb is 1, it will cause the system
  * to drop into kdebug() when:
  *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 #ifdef WITNESS_KDB
 int	witness_kdb = 1;
 #else
 int	witness_kdb = 0;
 #endif
 SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, "");
 #endif /* KDB */
 
 #if defined(DDB) || defined(KDB)
 /*
  * When DDB or KDB is enabled and witness_trace is 1, it will cause the system
  * to print a stack trace:
  *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 int	witness_trace = 1;
 SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, "");
 #endif /* DDB || KDB */
 
 #ifdef WITNESS_SKIPSPIN
 int	witness_skipspin = 1;
 #else
 int	witness_skipspin = 0;
 #endif
 SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, "");
 
 int badstack_sbuf_size;
 
 int witness_count = WITNESS_COUNT;
 SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, 
     &witness_count, 0, "");
 
 /*
  * Output channel for witness messages.  By default we print to the console.
  */
 enum witness_channel {
 	WITNESS_CONSOLE,
 	WITNESS_LOG,
 	WITNESS_NONE,
 };
 
 static enum witness_channel witness_channel = WITNESS_CONSOLE;
 SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING |
     CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A",
     "Output channel for warnings");
 
 /*
  * Call this to print out the relations between locks.
  */
 SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs");
 
 /*
  * Call this to print out the witness faulty stacks.
  */
 SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks");
 
 static struct mtx w_mtx;
 
 /* w_list */
 static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free);
 static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all);
 
 /* w_typelist */
 static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin);
 static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep);
 
 /* lock list */
 static struct lock_list_entry *w_lock_list_free = NULL;
 static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
 static u_int pending_cnt;
 
 static int w_free_cnt, w_spin_cnt, w_sleep_cnt;
 SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, "");
 SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, "");
 SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0,
     "");
 
 static struct witness *w_data;
 static uint8_t **w_rmatrix;
 static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
 static struct witness_hash w_hash;	/* The witness hash table. */
 
 /* The lock order data hash */
 static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
 static struct witness_lock_order_data *w_lofree = NULL;
 static struct witness_lock_order_hash w_lohash;
 static int w_max_used_index = 0;
 static unsigned int w_generation = 0;
 static const char w_notrunning[] = "Witness not running\n";
 static const char w_stillcold[] = "Witness is still cold\n";
 #ifdef __i386__
 static const char w_notallowed[] = "The sysctl is disabled on the arch\n";
 #endif
 
 static struct witness_order_list_entry order_lists[] = {
 	/*
 	 * sx locks
 	 */
 	{ "proctree", &lock_class_sx },
 	{ "allproc", &lock_class_sx },
 	{ "allprison", &lock_class_sx },
 	{ NULL, NULL },
 	/*
 	 * Various mutexes
 	 */
 	{ "Giant", &lock_class_mtx_sleep },
 	{ "pipe mutex", &lock_class_mtx_sleep },
 	{ "sigio lock", &lock_class_mtx_sleep },
 	{ "process group", &lock_class_mtx_sleep },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-sleep", &lock_class_mtx_sleep },
 #endif
 	{ "process lock", &lock_class_mtx_sleep },
 	{ "session", &lock_class_mtx_sleep },
 	{ "uidinfo hash", &lock_class_rw },
 	{ "time lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * umtx
 	 */
 	{ "umtx lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * Sockets
 	 */
 	{ "accept", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ "so_rcv", &lock_class_mtx_sleep },
 	{ "sellck", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * Routing
 	 */
 	{ "so_rcv", &lock_class_mtx_sleep },
 	{ "radix node head", &lock_class_rm },
 	{ "rtentry", &lock_class_mtx_sleep },
 	{ "ifaddr", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * IPv4 multicast:
 	 * protocol locks before interface locks, after UDP locks.
 	 */
 	{ "in_multi_sx", &lock_class_sx },
 	{ "udpinp", &lock_class_rw },
 	{ "in_multi_list_mtx", &lock_class_mtx_sleep },
 	{ "igmp_mtx", &lock_class_mtx_sleep },
 	{ "ifnet_rw", &lock_class_rw },
 	{ "if_addr_lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * IPv6 multicast:
 	 * protocol locks before interface locks, after UDP locks.
 	 */
 	{ "in6_multi_sx", &lock_class_sx },
 	{ "udpinp", &lock_class_rw },
 	{ "in6_multi_list_mtx", &lock_class_mtx_sleep },
 	{ "mld_mtx", &lock_class_mtx_sleep },
 	{ "ifnet_rw", &lock_class_rw },
 	{ "if_addr_lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * UNIX Domain Sockets
 	 */
 	{ "unp_link_rwlock", &lock_class_rw },
 	{ "unp_list_lock", &lock_class_mtx_sleep },
 	{ "unp", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * UDP/IP
 	 */
 	{ "udp", &lock_class_mtx_sleep },
 	{ "udpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * TCP/IP
 	 */
 	{ "tcp", &lock_class_mtx_sleep },
 	{ "tcpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * BPF
 	 */
 	{ "bpf global lock", &lock_class_sx },
-	{ "bpf interface lock", &lock_class_rw },
 	{ "bpf cdev lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * NFS server
 	 */
 	{ "nfsd_mtx", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 
 	/*
 	 * IEEE 802.11
 	 */
 	{ "802.11 com lock", &lock_class_mtx_sleep},
 	{ NULL, NULL },
 	/*
 	 * Network drivers
 	 */
 	{ "network driver", &lock_class_mtx_sleep},
 	{ NULL, NULL },
 
 	/*
 	 * Netgraph
 	 */
 	{ "ng_node", &lock_class_mtx_sleep },
 	{ "ng_worklist", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * CDEV
 	 */
 	{ "vm map (system)", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ "cdev", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * VM
 	 */
 	{ "vm map (user)", &lock_class_sx },
 	{ "vm object", &lock_class_rw },
 	{ "vm page", &lock_class_mtx_sleep },
 	{ "pmap pv global", &lock_class_rw },
 	{ "pmap", &lock_class_mtx_sleep },
 	{ "pmap pv list", &lock_class_rw },
 	{ "vm page free queue", &lock_class_mtx_sleep },
 	{ "vm pagequeue", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * kqueue/VFS interaction
 	 */
 	{ "kqueue", &lock_class_mtx_sleep },
 	{ "struct mount mtx", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * VFS namecache
 	 */
 	{ "ncvn", &lock_class_mtx_sleep },
 	{ "ncbuc", &lock_class_rw },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ "ncneg", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * ZFS locking
 	 */
 	{ "dn->dn_mtx", &lock_class_sx },
 	{ "dr->dt.di.dr_mtx", &lock_class_sx },
 	{ "db->db_mtx", &lock_class_sx },
 	{ NULL, NULL },
 	/*
 	 * TCP log locks
 	 */
 	{ "TCP ID tree", &lock_class_rw },
 	{ "tcp log id bucket", &lock_class_mtx_sleep },
 	{ "tcpinp", &lock_class_rw },
 	{ "TCP log expireq", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * spin locks
 	 */
 #ifdef SMP
 	{ "ap boot", &lock_class_mtx_spin },
 #endif
 	{ "rm.mutex_mtx", &lock_class_mtx_spin },
 	{ "sio", &lock_class_mtx_spin },
 #ifdef __i386__
 	{ "cy", &lock_class_mtx_spin },
 #endif
 #ifdef __sparc64__
 	{ "pcib_mtx", &lock_class_mtx_spin },
 	{ "rtc_mtx", &lock_class_mtx_spin },
 #endif
 	{ "scc_hwmtx", &lock_class_mtx_spin },
 	{ "uart_hwmtx", &lock_class_mtx_spin },
 	{ "fast_taskqueue", &lock_class_mtx_spin },
 	{ "intr table", &lock_class_mtx_spin },
 	{ "process slock", &lock_class_mtx_spin },
 	{ "syscons video lock", &lock_class_mtx_spin },
 	{ "sleepq chain", &lock_class_mtx_spin },
 	{ "rm_spinlock", &lock_class_mtx_spin },
 	{ "turnstile chain", &lock_class_mtx_spin },
 	{ "turnstile lock", &lock_class_mtx_spin },
 	{ "sched lock", &lock_class_mtx_spin },
 	{ "td_contested", &lock_class_mtx_spin },
 	{ "callout", &lock_class_mtx_spin },
 	{ "entropy harvest mutex", &lock_class_mtx_spin },
 #ifdef SMP
 	{ "smp rendezvous", &lock_class_mtx_spin },
 #endif
 #ifdef __powerpc__
 	{ "tlb0", &lock_class_mtx_spin },
 #endif
 	{ NULL, NULL },
 	{ "sched lock", &lock_class_mtx_spin },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-per-proc", &lock_class_mtx_spin },
 #endif
 	{ NULL, NULL },
 	/*
 	 * leaf locks
 	 */
 	{ "intrcnt", &lock_class_mtx_spin },
 	{ "icu", &lock_class_mtx_spin },
 #if defined(SMP) && defined(__sparc64__)
 	{ "ipi", &lock_class_mtx_spin },
 #endif
 #ifdef __i386__
 	{ "allpmaps", &lock_class_mtx_spin },
 	{ "descriptor tables", &lock_class_mtx_spin },
 #endif
 	{ "clk", &lock_class_mtx_spin },
 	{ "cpuset", &lock_class_mtx_spin },
 	{ "mprof lock", &lock_class_mtx_spin },
 	{ "zombie lock", &lock_class_mtx_spin },
 	{ "ALD Queue", &lock_class_mtx_spin },
 #if defined(__i386__) || defined(__amd64__)
 	{ "pcicfg", &lock_class_mtx_spin },
 	{ "NDIS thread lock", &lock_class_mtx_spin },
 #endif
 	{ "tw_osl_io_lock", &lock_class_mtx_spin },
 	{ "tw_osl_q_lock", &lock_class_mtx_spin },
 	{ "tw_cl_io_lock", &lock_class_mtx_spin },
 	{ "tw_cl_intr_lock", &lock_class_mtx_spin },
 	{ "tw_cl_gen_lock", &lock_class_mtx_spin },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-leaf", &lock_class_mtx_spin },
 #endif
 	{ "blocked lock", &lock_class_mtx_spin },
 	{ NULL, NULL },
 	{ NULL, NULL }
 };
 
 #ifdef BLESSING
 /*
  * Pairs of locks which have been blessed
  * Don't complain about order problems with blessed locks
  */
 static struct witness_blessed blessed_list[] = {
 };
 #endif
 
 /*
  * This global is set to 0 once it becomes safe to use the witness code.
  */
 static int witness_cold = 1;
 
 /*
  * This global is set to 1 once the static lock orders have been enrolled
  * so that a warning can be issued for any spin locks enrolled later.
  */
 static int witness_spin_warn = 0;
 
 /* Trim useless garbage from filenames. */
 static const char *
 fixup_filename(const char *file)
 {
 
 	if (file == NULL)
 		return (NULL);
 	while (strncmp(file, "../", 3) == 0)
 		file += 3;
 	return (file);
 }
 
 /*
  * Calculate the size of early witness structures.
  */
 int
 witness_startup_count(void)
 {
 	int sz;
 
 	sz = sizeof(struct witness) * witness_count;
 	sz += sizeof(*w_rmatrix) * (witness_count + 1);
 	sz += sizeof(*w_rmatrix[0]) * (witness_count + 1) *
 	    (witness_count + 1);
 
 	return (sz);
 }
 
 /*
  * The WITNESS-enabled diagnostic code.  Note that the witness code does
  * assume that the early boot is single-threaded at least until after this
  * routine is completed.
  */
 void
 witness_startup(void *mem)
 {
 	struct lock_object *lock;
 	struct witness_order_list_entry *order;
 	struct witness *w, *w1;
 	uintptr_t p;
 	int i;
 
 	p = (uintptr_t)mem;
 	w_data = (void *)p;
 	p += sizeof(struct witness) * witness_count;
 
 	w_rmatrix = (void *)p;
 	p += sizeof(*w_rmatrix) * (witness_count + 1);
 
 	for (i = 0; i < witness_count + 1; i++) {
 		w_rmatrix[i] = (void *)p;
 		p += sizeof(*w_rmatrix[i]) * (witness_count + 1);
 	}
 	badstack_sbuf_size = witness_count * 256;
 
 	/*
 	 * We have to release Giant before initializing its witness
 	 * structure so that WITNESS doesn't get confused.
 	 */
 	mtx_unlock(&Giant);
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
 	mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
 	    MTX_NOWITNESS | MTX_NOPROFILE);
 	for (i = witness_count - 1; i >= 0; i--) {
 		w = &w_data[i];
 		memset(w, 0, sizeof(*w));
 		w_data[i].w_index = i;	/* Witness index never changes. */
 		witness_free(w);
 	}
 	KASSERT(STAILQ_FIRST(&w_free)->w_index == 0,
 	    ("%s: Invalid list of free witness objects", __func__));
 
 	/* Witness with index 0 is not used to aid in debugging. */
 	STAILQ_REMOVE_HEAD(&w_free, w_list);
 	w_free_cnt--;
 
 	for (i = 0; i < witness_count; i++) {
 		memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * 
 		    (witness_count + 1));
 	}
 
 	for (i = 0; i < LOCK_CHILDCOUNT; i++)
 		witness_lock_list_free(&w_locklistdata[i]);
 	witness_init_hash_tables();
 
 	/* First add in all the specified order lists. */
 	for (order = order_lists; order->w_name != NULL; order++) {
 		w = enroll(order->w_name, order->w_class);
 		if (w == NULL)
 			continue;
 		w->w_file = "order list";
 		for (order++; order->w_name != NULL; order++) {
 			w1 = enroll(order->w_name, order->w_class);
 			if (w1 == NULL)
 				continue;
 			w1->w_file = "order list";
 			itismychild(w, w1);
 			w = w1;
 		}
 	}
 	witness_spin_warn = 1;
 
 	/* Iterate through all locks and add them to witness. */
 	for (i = 0; pending_locks[i].wh_lock != NULL; i++) {
 		lock = pending_locks[i].wh_lock;
 		KASSERT(lock->lo_flags & LO_WITNESS,
 		    ("%s: lock %s is on pending list but not LO_WITNESS",
 		    __func__, lock->lo_name));
 		lock->lo_witness = enroll(pending_locks[i].wh_type,
 		    LOCK_CLASS(lock));
 	}
 
 	/* Mark the witness code as being ready for use. */
 	witness_cold = 0;
 
 	mtx_lock(&Giant);
 }
 
 void
 witness_init(struct lock_object *lock, const char *type)
 {
 	struct lock_class *class;
 
 	/* Various sanity checks. */
 	class = LOCK_CLASS(lock);
 	if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
 	    (class->lc_flags & LC_RECURSABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be recursable",
 		    __func__, class->lc_name, lock->lo_name);
 	if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 	    (class->lc_flags & LC_SLEEPABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be sleepable",
 		    __func__, class->lc_name, lock->lo_name);
 	if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
 	    (class->lc_flags & LC_UPGRADABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be upgradable",
 		    __func__, class->lc_name, lock->lo_name);
 
 	/*
 	 * If we shouldn't watch this lock, then just clear lo_witness.
 	 * Otherwise, if witness_cold is set, then it is too early to
 	 * enroll this lock, so defer it to witness_initialize() by adding
 	 * it to the pending_locks list.  If it is not too early, then enroll
 	 * the lock now.
 	 */
 	if (witness_watch < 1 || panicstr != NULL ||
 	    (lock->lo_flags & LO_WITNESS) == 0)
 		lock->lo_witness = NULL;
 	else if (witness_cold) {
 		pending_locks[pending_cnt].wh_lock = lock;
 		pending_locks[pending_cnt++].wh_type = type;
 		if (pending_cnt > WITNESS_PENDLIST)
 			panic("%s: pending locks list is too small, "
 			    "increase WITNESS_PENDLIST\n",
 			    __func__);
 	} else
 		lock->lo_witness = enroll(type, class);
 }
 
 void
 witness_destroy(struct lock_object *lock)
 {
 	struct lock_class *class;
 	struct witness *w;
 
 	class = LOCK_CLASS(lock);
 
 	if (witness_cold)
 		panic("lock (%s) %s destroyed while witness_cold",
 		    class->lc_name, lock->lo_name);
 
 	/* XXX: need to verify that no one holds the lock */
 	if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL)
 		return;
 	w = lock->lo_witness;
 
 	mtx_lock_spin(&w_mtx);
 	MPASS(w->w_refcount > 0);
 	w->w_refcount--;
 
 	if (w->w_refcount == 0)
 		depart(w);
 	mtx_unlock_spin(&w_mtx);
 }
 
 #ifdef DDB
 static void
 witness_ddb_compute_levels(void)
 {
 	struct witness *w;
 
 	/*
 	 * First clear all levels.
 	 */
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_ddb_level = -1;
 
 	/*
 	 * Look for locks with no parents and level all their descendants.
 	 */
 	STAILQ_FOREACH(w, &w_all, w_list) {
 
 		/* If the witness has ancestors (is not a root), skip it. */
 		if (w->w_num_ancestors > 0)
 			continue;
 		witness_ddb_level_descendants(w, 0);
 	}
 }
 
 static void
 witness_ddb_level_descendants(struct witness *w, int l)
 {
 	int i;
 
 	if (w->w_ddb_level >= l)
 		return;
 
 	w->w_ddb_level = l;
 	l++;
 
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
 			witness_ddb_level_descendants(&w_data[i], l);
 	}
 }
 
 static void
 witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
     struct witness *w, int indent)
 {
 	int i;
 
  	for (i = 0; i < indent; i++)
  		prnt(" ");
 	prnt("%s (type: %s, depth: %d, active refs: %d)",
 	     w->w_name, w->w_class->lc_name,
 	     w->w_ddb_level, w->w_refcount);
  	if (w->w_displayed) {
  		prnt(" -- (already displayed)\n");
  		return;
  	}
  	w->w_displayed = 1;
 	if (w->w_file != NULL && w->w_line != 0)
 		prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file),
 		    w->w_line);
 	else
 		prnt(" -- never acquired\n");
 	indent++;
 	WITNESS_INDEX_ASSERT(w->w_index);
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (db_pager_quit)
 			return;
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
 			witness_ddb_display_descendants(prnt, &w_data[i],
 			    indent);
 	}
 }
 
 static void
 witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
     struct witness_list *list)
 {
 	struct witness *w;
 
 	STAILQ_FOREACH(w, list, w_typelist) {
 		if (w->w_file == NULL || w->w_ddb_level > 0)
 			continue;
 
 		/* This lock has no anscestors - display its descendants. */
 		witness_ddb_display_descendants(prnt, w, 0);
 		if (db_pager_quit)
 			return;
 	}
 }
 	
 static void
 witness_ddb_display(int(*prnt)(const char *fmt, ...))
 {
 	struct witness *w;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	witness_ddb_compute_levels();
 
 	/* Clear all the displayed flags. */
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_displayed = 0;
 
 	/*
 	 * First, handle sleep locks which have been acquired at least
 	 * once.
 	 */
 	prnt("Sleep locks:\n");
 	witness_ddb_display_list(prnt, &w_sleep);
 	if (db_pager_quit)
 		return;
 	
 	/*
 	 * Now do spin locks which have been acquired at least once.
 	 */
 	prnt("\nSpin locks:\n");
 	witness_ddb_display_list(prnt, &w_spin);
 	if (db_pager_quit)
 		return;
 	
 	/*
 	 * Finally, any locks which have not been acquired yet.
 	 */
 	prnt("\nLocks which were never acquired:\n");
 	STAILQ_FOREACH(w, &w_all, w_list) {
 		if (w->w_file != NULL || w->w_refcount == 0)
 			continue;
 		prnt("%s (type: %s, depth: %d)\n", w->w_name,
 		    w->w_class->lc_name, w->w_ddb_level);
 		if (db_pager_quit)
 			return;
 	}
 }
 #endif /* DDB */
 
 int
 witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
 {
 
 	if (witness_watch == -1 || panicstr != NULL)
 		return (0);
 
 	/* Require locks that witness knows about. */
 	if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
 	    lock2->lo_witness == NULL)
 		return (EINVAL);
 
 	mtx_assert(&w_mtx, MA_NOTOWNED);
 	mtx_lock_spin(&w_mtx);
 
 	/*
 	 * If we already have either an explicit or implied lock order that
 	 * is the other way around, then return an error.
 	 */
 	if (witness_watch &&
 	    isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
 		mtx_unlock_spin(&w_mtx);
 		return (EDOOFUS);
 	}
 	
 	/* Try to add the new order. */
 	CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
 	    lock2->lo_witness->w_name, lock1->lo_witness->w_name);
 	itismychild(lock1->lo_witness, lock2->lo_witness);
 	mtx_unlock_spin(&w_mtx);
 	return (0);
 }
 
 void
 witness_checkorder(struct lock_object *lock, int flags, const char *file,
     int line, struct lock_object *interlock)
 {
 	struct lock_list_entry *lock_list, *lle;
 	struct lock_instance *lock1, *lock2, *plock;
 	struct lock_class *class, *iclass;
 	struct witness *w, *w1;
 	struct thread *td;
 	int i, j;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL ||
 	    panicstr != NULL)
 		return;
 
 	w = lock->lo_witness;
 	class = LOCK_CLASS(lock);
 	td = curthread;
 
 	if (class->lc_flags & LC_SLEEPLOCK) {
 
 		/*
 		 * Since spin locks include a critical section, this check
 		 * implicitly enforces a lock order of all sleep locks before
 		 * all spin locks.
 		 */
 		if (td->td_critnest != 0 && !kdb_active)
 			kassert_panic("acquiring blockable sleep lock with "
 			    "spinlock or critical section held (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 
 		/*
 		 * If this is the first lock acquired then just return as
 		 * no order checking is needed.
 		 */
 		lock_list = td->td_sleeplocks;
 		if (lock_list == NULL || lock_list->ll_count == 0)
 			return;
 	} else {
 
 		/*
 		 * If this is the first lock, just return as no order
 		 * checking is needed.  Avoid problems with thread
 		 * migration pinning the thread while checking if
 		 * spinlocks are held.  If at least one spinlock is held
 		 * the thread is in a safe path and it is allowed to
 		 * unpin it.
 		 */
 		sched_pin();
 		lock_list = PCPU_GET(spinlocks);
 		if (lock_list == NULL || lock_list->ll_count == 0) {
 			sched_unpin();
 			return;
 		}
 		sched_unpin();
 	}
 
 	/*
 	 * Check to see if we are recursing on a lock we already own.  If
 	 * so, make sure that we don't mismatch exclusive and shared lock
 	 * acquires.
 	 */
 	lock1 = find_instance(lock_list, lock);
 	if (lock1 != NULL) {
 		if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
 		    (flags & LOP_EXCLUSIVE) == 0) {
 			witness_output("shared lock of (%s) %s @ %s:%d\n",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			witness_output("while exclusively locked from %s:%d\n",
 			    fixup_filename(lock1->li_file), lock1->li_line);
 			kassert_panic("excl->share");
 		}
 		if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
 		    (flags & LOP_EXCLUSIVE) != 0) {
 			witness_output("exclusive lock of (%s) %s @ %s:%d\n",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			witness_output("while share locked from %s:%d\n",
 			    fixup_filename(lock1->li_file), lock1->li_line);
 			kassert_panic("share->excl");
 		}
 		return;
 	}
 
 	/* Warn if the interlock is not locked exactly once. */
 	if (interlock != NULL) {
 		iclass = LOCK_CLASS(interlock);
 		lock1 = find_instance(lock_list, interlock);
 		if (lock1 == NULL)
 			kassert_panic("interlock (%s) %s not locked @ %s:%d",
 			    iclass->lc_name, interlock->lo_name,
 			    fixup_filename(file), line);
 		else if ((lock1->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic("interlock (%s) %s recursed @ %s:%d",
 			    iclass->lc_name, interlock->lo_name,
 			    fixup_filename(file), line);
 	}
 
 	/*
 	 * Find the previously acquired lock, but ignore interlocks.
 	 */
 	plock = &lock_list->ll_children[lock_list->ll_count - 1];
 	if (interlock != NULL && plock->li_lock == interlock) {
 		if (lock_list->ll_count > 1)
 			plock =
 			    &lock_list->ll_children[lock_list->ll_count - 2];
 		else {
 			lle = lock_list->ll_next;
 
 			/*
 			 * The interlock is the only lock we hold, so
 			 * simply return.
 			 */
 			if (lle == NULL)
 				return;
 			plock = &lle->ll_children[lle->ll_count - 1];
 		}
 	}
 	
 	/*
 	 * Try to perform most checks without a lock.  If this succeeds we
 	 * can skip acquiring the lock and return success.  Otherwise we redo
 	 * the check with the lock held to handle races with concurrent updates.
 	 */
 	w1 = plock->li_lock->lo_witness;
 	if (witness_lock_order_check(w1, w))
 		return;
 
 	mtx_lock_spin(&w_mtx);
 	if (witness_lock_order_check(w1, w)) {
 		mtx_unlock_spin(&w_mtx);
 		return;
 	}
 	witness_lock_order_add(w1, w);
 
 	/*
 	 * Check for duplicate locks of the same type.  Note that we only
 	 * have to check for this on the last lock we just acquired.  Any
 	 * other cases will be caught as lock order violations.
 	 */
 	if (w1 == w) {
 		i = w->w_index;
 		if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
 		    !(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
 		    w_rmatrix[i][i] |= WITNESS_REVERSAL;
 			w->w_reversed = 1;
 			mtx_unlock_spin(&w_mtx);
 			witness_output(
 			    "acquiring duplicate lock of same type: \"%s\"\n", 
 			    w->w_name);
 			witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name,
 			    fixup_filename(plock->li_file), plock->li_line);
 			witness_output(" 2nd %s @ %s:%d\n", lock->lo_name,
 			    fixup_filename(file), line);
 			witness_debugger(1, __func__);
 		} else
 			mtx_unlock_spin(&w_mtx);
 		return;
 	}
 	mtx_assert(&w_mtx, MA_OWNED);
 
 	/*
 	 * If we know that the lock we are acquiring comes after
 	 * the lock we most recently acquired in the lock order tree,
 	 * then there is no need for any further checks.
 	 */
 	if (isitmychild(w1, w))
 		goto out;
 
 	for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
 		for (i = lle->ll_count - 1; i >= 0; i--, j++) {
 
 			MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN);
 			lock1 = &lle->ll_children[i];
 
 			/*
 			 * Ignore the interlock.
 			 */
 			if (interlock == lock1->li_lock)
 				continue;
 
 			/*
 			 * If this lock doesn't undergo witness checking,
 			 * then skip it.
 			 */
 			w1 = lock1->li_lock->lo_witness;
 			if (w1 == NULL) {
 				KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0,
 				    ("lock missing witness structure"));
 				continue;
 			}
 
 			/*
 			 * If we are locking Giant and this is a sleepable
 			 * lock, then skip it.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    lock == &Giant.lock_object)
 				continue;
 
 			/*
 			 * If we are locking a sleepable lock and this lock
 			 * is Giant, then skip it.
 			 */
 			if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    lock1->li_lock == &Giant.lock_object)
 				continue;
 
 			/*
 			 * If we are locking a sleepable lock and this lock
 			 * isn't sleepable, we want to treat it as a lock
 			 * order violation to enfore a general lock order of
 			 * sleepable locks before non-sleepable locks.
 			 */
 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
 				goto reversal;
 
 			/*
 			 * If we are locking Giant and this is a non-sleepable
 			 * lock, then treat it as a reversal.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
 			    lock == &Giant.lock_object)
 				goto reversal;
 
 			/*
 			 * Check the lock order hierarchy for a reveresal.
 			 */
 			if (!isitmydescendant(w, w1))
 				continue;
 		reversal:
 
 			/*
 			 * We have a lock order violation, check to see if it
 			 * is allowed or has already been yelled about.
 			 */
 #ifdef BLESSING
 
 			/*
 			 * If the lock order is blessed, just bail.  We don't
 			 * look for other lock order violations though, which
 			 * may be a bug.
 			 */
 			if (blessed(w, w1))
 				goto out;
 #endif
 
 			/* Bail if this violation is known */
 			if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
 				goto out;
 
 			/* Record this as a violation */
 			w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
 			w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
 			w->w_reversed = w1->w_reversed = 1;
 			witness_increment_graph_generation();
 			mtx_unlock_spin(&w_mtx);
 
 #ifdef WITNESS_NO_VNODE
 			/*
 			 * There are known LORs between VNODE locks. They are
 			 * not an indication of a bug. VNODE locks are flagged
 			 * as such (LO_IS_VNODE) and we don't yell if the LOR
 			 * is between 2 VNODE locks.
 			 */
 			if ((lock->lo_flags & LO_IS_VNODE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0)
 				return;
 #endif
 
 			/*
 			 * Ok, yell about it.
 			 */
 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
 				witness_output(
 		"lock order reversal: (sleepable after non-sleepable)\n");
 			else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
 			    && lock == &Giant.lock_object)
 				witness_output(
 		"lock order reversal: (Giant after non-sleepable)\n");
 			else
 				witness_output("lock order reversal:\n");
 
 			/*
 			 * Try to locate an earlier lock with
 			 * witness w in our list.
 			 */
 			do {
 				lock2 = &lle->ll_children[i];
 				MPASS(lock2->li_lock != NULL);
 				if (lock2->li_lock->lo_witness == w)
 					break;
 				if (i == 0 && lle->ll_next != NULL) {
 					lle = lle->ll_next;
 					i = lle->ll_count - 1;
 					MPASS(i >= 0 && i < LOCK_NCHILDREN);
 				} else
 					i--;
 			} while (i >= 0);
 			if (i < 0) {
 				witness_output(" 1st %p %s (%s) @ %s:%d\n",
 				    lock1->li_lock, lock1->li_lock->lo_name,
 				    w1->w_name, fixup_filename(lock1->li_file),
 				    lock1->li_line);
 				witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock,
 				    lock->lo_name, w->w_name,
 				    fixup_filename(file), line);
 			} else {
 				witness_output(" 1st %p %s (%s) @ %s:%d\n",
 				    lock2->li_lock, lock2->li_lock->lo_name,
 				    lock2->li_lock->lo_witness->w_name,
 				    fixup_filename(lock2->li_file),
 				    lock2->li_line);
 				witness_output(" 2nd %p %s (%s) @ %s:%d\n",
 				    lock1->li_lock, lock1->li_lock->lo_name,
 				    w1->w_name, fixup_filename(lock1->li_file),
 				    lock1->li_line);
 				witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock,
 				    lock->lo_name, w->w_name,
 				    fixup_filename(file), line);
 			}
 			witness_debugger(1, __func__);
 			return;
 		}
 	}
 
 	/*
 	 * If requested, build a new lock order.  However, don't build a new
 	 * relationship between a sleepable lock and Giant if it is in the
 	 * wrong direction.  The correct lock order is that sleepable locks
 	 * always come before Giant.
 	 */
 	if (flags & LOP_NEWORDER &&
 	    !(plock->li_lock == &Giant.lock_object &&
 	    (lock->lo_flags & LO_SLEEPABLE) != 0)) {
 		CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
 		    w->w_name, plock->li_lock->lo_witness->w_name);
 		itismychild(plock->li_lock->lo_witness, w);
 	}
 out:
 	mtx_unlock_spin(&w_mtx);
 }
 
 void
 witness_lock(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_list_entry **lock_list, *lle;
 	struct lock_instance *instance;
 	struct witness *w;
 	struct thread *td;
 
 	if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL ||
 	    panicstr != NULL)
 		return;
 	w = lock->lo_witness;
 	td = curthread;
 
 	/* Determine lock list for this lock. */
 	if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
 		lock_list = &td->td_sleeplocks;
 	else
 		lock_list = PCPU_PTR(spinlocks);
 
 	/* Check to see if we are recursing on a lock we already own. */
 	instance = find_instance(*lock_list, lock);
 	if (instance != NULL) {
 		instance->li_flags++;
 		CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__,
 		    td->td_proc->p_pid, lock->lo_name,
 		    instance->li_flags & LI_RECURSEMASK);
 		instance->li_file = file;
 		instance->li_line = line;
 		return;
 	}
 
 	/* Update per-witness last file and line acquire. */
 	w->w_file = file;
 	w->w_line = line;
 
 	/* Find the next open lock instance in the list and fill it. */
 	lle = *lock_list;
 	if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
 		lle = witness_lock_list_get();
 		if (lle == NULL)
 			return;
 		lle->ll_next = *lock_list;
 		CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__,
 		    td->td_proc->p_pid, lle);
 		*lock_list = lle;
 	}
 	instance = &lle->ll_children[lle->ll_count++];
 	instance->li_lock = lock;
 	instance->li_line = line;
 	instance->li_file = file;
 	if ((flags & LOP_EXCLUSIVE) != 0)
 		instance->li_flags = LI_EXCLUSIVE;
 	else
 		instance->li_flags = 0;
 	CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__,
 	    td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1);
 }
 
 void
 witness_upgrade(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (witness_watch) {
 		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
 			kassert_panic(
 			    "upgrade of non-upgradable lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
 			kassert_panic(
 			    "upgrade of non-sleep lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 	}
 	instance = find_instance(curthread->td_sleeplocks, lock);
 	if (instance == NULL) {
 		kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d",
 		    class->lc_name, lock->lo_name,
 		    fixup_filename(file), line);
 		return;
 	}
 	if (witness_watch) {
 		if ((instance->li_flags & LI_EXCLUSIVE) != 0)
 			kassert_panic(
 			    "upgrade of exclusive lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic(
 			    "upgrade of recursed lock (%s) %s r=%d @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    instance->li_flags & LI_RECURSEMASK,
 			    fixup_filename(file), line);
 	}
 	instance->li_flags |= LI_EXCLUSIVE;
 }
 
 void
 witness_downgrade(struct lock_object *lock, int flags, const char *file,
     int line)
 {
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (witness_watch) {
 		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
 			kassert_panic(
 			    "downgrade of non-upgradable lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
 			kassert_panic(
 			    "downgrade of non-sleep lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 	}
 	instance = find_instance(curthread->td_sleeplocks, lock);
 	if (instance == NULL) {
 		kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d",
 		    class->lc_name, lock->lo_name,
 		    fixup_filename(file), line);
 		return;
 	}
 	if (witness_watch) {
 		if ((instance->li_flags & LI_EXCLUSIVE) == 0)
 			kassert_panic(
 			    "downgrade of shared lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic(
 			    "downgrade of recursed lock (%s) %s r=%d @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    instance->li_flags & LI_RECURSEMASK,
 			    fixup_filename(file), line);
 	}
 	instance->li_flags &= ~LI_EXCLUSIVE;
 }
 
 void
 witness_unlock(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_list_entry **lock_list, *lle;
 	struct lock_instance *instance;
 	struct lock_class *class;
 	struct thread *td;
 	register_t s;
 	int i, j;
 
 	if (witness_cold || lock->lo_witness == NULL || panicstr != NULL)
 		return;
 	td = curthread;
 	class = LOCK_CLASS(lock);
 
 	/* Find lock instance associated with this lock. */
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = &td->td_sleeplocks;
 	else
 		lock_list = PCPU_PTR(spinlocks);
 	lle = *lock_list;
 	for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
 		for (i = 0; i < (*lock_list)->ll_count; i++) {
 			instance = &(*lock_list)->ll_children[i];
 			if (instance->li_lock == lock)
 				goto found;
 		}
 
 	/*
 	 * When disabling WITNESS through witness_watch we could end up in
 	 * having registered locks in the td_sleeplocks queue.
 	 * We have to make sure we flush these queues, so just search for
 	 * eventual register locks and remove them.
 	 */
 	if (witness_watch > 0) {
 		kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name,
 		    lock->lo_name, fixup_filename(file), line);
 		return;
 	} else {
 		return;
 	}
 found:
 
 	/* First, check for shared/exclusive mismatches. */
 	if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
 	    (flags & LOP_EXCLUSIVE) == 0) {
 		witness_output("shared unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		witness_output("while exclusively locked from %s:%d\n",
 		    fixup_filename(instance->li_file), instance->li_line);
 		kassert_panic("excl->ushare");
 	}
 	if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
 	    (flags & LOP_EXCLUSIVE) != 0) {
 		witness_output("exclusive unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		witness_output("while share locked from %s:%d\n",
 		    fixup_filename(instance->li_file),
 		    instance->li_line);
 		kassert_panic("share->uexcl");
 	}
 	/* If we are recursed, unrecurse. */
 	if ((instance->li_flags & LI_RECURSEMASK) > 0) {
 		CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__,
 		    td->td_proc->p_pid, instance->li_lock->lo_name,
 		    instance->li_flags);
 		instance->li_flags--;
 		return;
 	}
 	/* The lock is now being dropped, check for NORELEASE flag */
 	if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
 		witness_output("forbidden unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		kassert_panic("lock marked norelease");
 	}
 
 	/* Otherwise, remove this item from the list. */
 	s = intr_disable();
 	CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__,
 	    td->td_proc->p_pid, instance->li_lock->lo_name,
 	    (*lock_list)->ll_count - 1);
 	for (j = i; j < (*lock_list)->ll_count - 1; j++)
 		(*lock_list)->ll_children[j] =
 		    (*lock_list)->ll_children[j + 1];
 	(*lock_list)->ll_count--;
 	intr_restore(s);
 
 	/*
 	 * In order to reduce contention on w_mtx, we want to keep always an
 	 * head object into lists so that frequent allocation from the 
 	 * free witness pool (and subsequent locking) is avoided.
 	 * In order to maintain the current code simple, when the head
 	 * object is totally unloaded it means also that we do not have
 	 * further objects in the list, so the list ownership needs to be
 	 * hand over to another object if the current head needs to be freed.
 	 */
 	if ((*lock_list)->ll_count == 0) {
 		if (*lock_list == lle) {
 			if (lle->ll_next == NULL)
 				return;
 		} else
 			lle = *lock_list;
 		*lock_list = lle->ll_next;
 		CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__,
 		    td->td_proc->p_pid, lle);
 		witness_lock_list_free(lle);
 	}
 }
 
 void
 witness_thread_exit(struct thread *td)
 {
 	struct lock_list_entry *lle;
 	int i, n;
 
 	lle = td->td_sleeplocks;
 	if (lle == NULL || panicstr != NULL)
 		return;
 	if (lle->ll_count != 0) {
 		for (n = 0; lle != NULL; lle = lle->ll_next)
 			for (i = lle->ll_count - 1; i >= 0; i--) {
 				if (n == 0)
 					witness_output(
 		    "Thread %p exiting with the following locks held:\n", td);
 				n++;
 				witness_list_lock(&lle->ll_children[i],
 				    witness_output);
 				
 			}
 		kassert_panic(
 		    "Thread %p cannot exit while holding sleeplocks\n", td);
 	}
 	witness_lock_list_free(lle);
 }
 
 /*
  * Warn if any locks other than 'lock' are held.  Flags can be passed in to
  * exempt Giant and sleepable locks from the checks as well.  If any
  * non-exempt locks are held, then a supplied message is printed to the
  * output channel along with a list of the offending locks.  If indicated in the
  * flags then a failure results in a panic as well.
  */
 int
 witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
 {
 	struct lock_list_entry *lock_list, *lle;
 	struct lock_instance *lock1;
 	struct thread *td;
 	va_list ap;
 	int i, n;
 
 	if (witness_cold || witness_watch < 1 || panicstr != NULL)
 		return (0);
 	n = 0;
 	td = curthread;
 	for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			lock1 = &lle->ll_children[i];
 			if (lock1->li_lock == lock)
 				continue;
 			if (flags & WARN_GIANTOK &&
 			    lock1->li_lock == &Giant.lock_object)
 				continue;
 			if (flags & WARN_SLEEPOK &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
 				continue;
 			if (n == 0) {
 				va_start(ap, fmt);
 				vprintf(fmt, ap);
 				va_end(ap);
 				printf(" with the following %slocks held:\n",
 				    (flags & WARN_SLEEPOK) != 0 ?
 				    "non-sleepable " : "");
 			}
 			n++;
 			witness_list_lock(lock1, printf);
 		}
 
 	/*
 	 * Pin the thread in order to avoid problems with thread migration.
 	 * Once that all verifies are passed about spinlocks ownership,
 	 * the thread is in a safe path and it can be unpinned.
 	 */
 	sched_pin();
 	lock_list = PCPU_GET(spinlocks);
 	if (lock_list != NULL && lock_list->ll_count != 0) {
 		sched_unpin();
 
 		/*
 		 * We should only have one spinlock and as long as
 		 * the flags cannot match for this locks class,
 		 * check if the first spinlock is the one curthread
 		 * should hold.
 		 */
 		lock1 = &lock_list->ll_children[lock_list->ll_count - 1];
 		if (lock_list->ll_count == 1 && lock_list->ll_next == NULL &&
 		    lock1->li_lock == lock && n == 0)
 			return (0);
 
 		va_start(ap, fmt);
 		vprintf(fmt, ap);
 		va_end(ap);
 		printf(" with the following %slocks held:\n",
 		    (flags & WARN_SLEEPOK) != 0 ?  "non-sleepable " : "");
 		n += witness_list_locks(&lock_list, printf);
 	} else
 		sched_unpin();
 	if (flags & WARN_PANIC && n)
 		kassert_panic("%s", __func__);
 	else
 		witness_debugger(n, __func__);
 	return (n);
 }
 
 const char *
 witness_file(struct lock_object *lock)
 {
 	struct witness *w;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
 		return ("?");
 	w = lock->lo_witness;
 	return (w->w_file);
 }
 
 int
 witness_line(struct lock_object *lock)
 {
 	struct witness *w;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
 		return (0);
 	w = lock->lo_witness;
 	return (w->w_line);
 }
 
 static struct witness *
 enroll(const char *description, struct lock_class *lock_class)
 {
 	struct witness *w;
 
 	MPASS(description != NULL);
 
 	if (witness_watch == -1 || panicstr != NULL)
 		return (NULL);
 	if ((lock_class->lc_flags & LC_SPINLOCK)) {
 		if (witness_skipspin)
 			return (NULL);
 	} else if ((lock_class->lc_flags & LC_SLEEPLOCK) == 0) {
 		kassert_panic("lock class %s is not sleep or spin",
 		    lock_class->lc_name);
 		return (NULL);
 	}
 
 	mtx_lock_spin(&w_mtx);
 	w = witness_hash_get(description);
 	if (w)
 		goto found;
 	if ((w = witness_get()) == NULL)
 		return (NULL);
 	MPASS(strlen(description) < MAX_W_NAME);
 	strcpy(w->w_name, description);
 	w->w_class = lock_class;
 	w->w_refcount = 1;
 	STAILQ_INSERT_HEAD(&w_all, w, w_list);
 	if (lock_class->lc_flags & LC_SPINLOCK) {
 		STAILQ_INSERT_HEAD(&w_spin, w, w_typelist);
 		w_spin_cnt++;
 	} else if (lock_class->lc_flags & LC_SLEEPLOCK) {
 		STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist);
 		w_sleep_cnt++;
 	}
 
 	/* Insert new witness into the hash */
 	witness_hash_put(w);
 	witness_increment_graph_generation();
 	mtx_unlock_spin(&w_mtx);
 	return (w);
 found:
 	w->w_refcount++;
 	if (w->w_refcount == 1)
 		w->w_class = lock_class;
 	mtx_unlock_spin(&w_mtx);
 	if (lock_class != w->w_class)
 		kassert_panic(
 		    "lock (%s) %s does not match earlier (%s) lock",
 		    description, lock_class->lc_name,
 		    w->w_class->lc_name);
 	return (w);
 }
 
 static void
 depart(struct witness *w)
 {
 
 	MPASS(w->w_refcount == 0);
 	if (w->w_class->lc_flags & LC_SLEEPLOCK) {
 		w_sleep_cnt--;
 	} else {
 		w_spin_cnt--;
 	}
 	/*
 	 * Set file to NULL as it may point into a loadable module.
 	 */
 	w->w_file = NULL;
 	w->w_line = 0;
 	witness_increment_graph_generation();
 }
 
 
 static void
 adopt(struct witness *parent, struct witness *child)
 {
 	int pi, ci, i, j;
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	/* If the relationship is already known, there's no work to be done. */
 	if (isitmychild(parent, child))
 		return;
 
 	/* When the structure of the graph changes, bump up the generation. */
 	witness_increment_graph_generation();
 
 	/*
 	 * The hard part ... create the direct relationship, then propagate all
 	 * indirect relationships.
 	 */
 	pi = parent->w_index;
 	ci = child->w_index;
 	WITNESS_INDEX_ASSERT(pi);
 	WITNESS_INDEX_ASSERT(ci);
 	MPASS(pi != ci);
 	w_rmatrix[pi][ci] |= WITNESS_PARENT;
 	w_rmatrix[ci][pi] |= WITNESS_CHILD;
 
 	/*
 	 * If parent was not already an ancestor of child,
 	 * then we increment the descendant and ancestor counters.
 	 */
 	if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
 		parent->w_num_descendants++;
 		child->w_num_ancestors++;
 	}
 
 	/* 
 	 * Find each ancestor of 'pi'. Note that 'pi' itself is counted as 
 	 * an ancestor of 'pi' during this loop.
 	 */
 	for (i = 1; i <= w_max_used_index; i++) {
 		if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && 
 		    (i != pi))
 			continue;
 
 		/* Find each descendant of 'i' and mark it as a descendant. */
 		for (j = 1; j <= w_max_used_index; j++) {
 
 			/* 
 			 * Skip children that are already marked as
 			 * descendants of 'i'.
 			 */
 			if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
 				continue;
 
 			/*
 			 * We are only interested in descendants of 'ci'. Note
 			 * that 'ci' itself is counted as a descendant of 'ci'.
 			 */
 			if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && 
 			    (j != ci))
 				continue;
 			w_rmatrix[i][j] |= WITNESS_ANCESTOR;
 			w_rmatrix[j][i] |= WITNESS_DESCENDANT;
 			w_data[i].w_num_descendants++;
 			w_data[j].w_num_ancestors++;
 
 			/* 
 			 * Make sure we aren't marking a node as both an
 			 * ancestor and descendant. We should have caught 
 			 * this as a lock order reversal earlier.
 			 */
 			if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
 			    (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
 				printf("witness rmatrix paradox! [%d][%d]=%d "
 				    "both ancestor and descendant\n",
 				    i, j, w_rmatrix[i][j]); 
 				kdb_backtrace();
 				printf("Witness disabled.\n");
 				witness_watch = -1;
 			}
 			if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
 			    (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
 				printf("witness rmatrix paradox! [%d][%d]=%d "
 				    "both ancestor and descendant\n",
 				    j, i, w_rmatrix[j][i]); 
 				kdb_backtrace();
 				printf("Witness disabled.\n");
 				witness_watch = -1;
 			}
 		}
 	}
 }
 
 static void
 itismychild(struct witness *parent, struct witness *child)
 {
 	int unlocked;
 
 	MPASS(child != NULL && parent != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	if (!witness_lock_type_equal(parent, child)) {
 		if (witness_cold == 0) {
 			unlocked = 1;
 			mtx_unlock_spin(&w_mtx);
 		} else {
 			unlocked = 0;
 		}
 		kassert_panic(
 		    "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
 		    "the same lock type", __func__, parent->w_name,
 		    parent->w_class->lc_name, child->w_name,
 		    child->w_class->lc_name);
 		if (unlocked)
 			mtx_lock_spin(&w_mtx);
 	}
 	adopt(parent, child);
 }
 
 /*
  * Generic code for the isitmy*() functions. The rmask parameter is the
  * expected relationship of w1 to w2.
  */
 static int
 _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
 {
 	unsigned char r1, r2;
 	int i1, i2;
 
 	i1 = w1->w_index;
 	i2 = w2->w_index;
 	WITNESS_INDEX_ASSERT(i1);
 	WITNESS_INDEX_ASSERT(i2);
 	r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK;
 	r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK;
 
 	/* The flags on one better be the inverse of the flags on the other */
 	if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
 	    (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
 		/* Don't squawk if we're potentially racing with an update. */
 		if (!mtx_owned(&w_mtx))
 			return (0);
 		printf("%s: rmatrix mismatch between %s (index %d) and %s "
 		    "(index %d): w_rmatrix[%d][%d] == %hhx but "
 		    "w_rmatrix[%d][%d] == %hhx\n",
 		    fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1,
 		    i2, i1, r2);
 		kdb_backtrace();
 		printf("Witness disabled.\n");
 		witness_watch = -1;
 	}
 	return (r1 & rmask);
 }
 
 /*
  * Checks if @child is a direct child of @parent.
  */
 static int
 isitmychild(struct witness *parent, struct witness *child)
 {
 
 	return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
 }
 
 /*
  * Checks if @descendant is a direct or inderect descendant of @ancestor.
  */
 static int
 isitmydescendant(struct witness *ancestor, struct witness *descendant)
 {
 
 	return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
 	    __func__));
 }
 
 #ifdef BLESSING
 static int
 blessed(struct witness *w1, struct witness *w2)
 {
 	int i;
 	struct witness_blessed *b;
 
 	for (i = 0; i < nitems(blessed_list); i++) {
 		b = &blessed_list[i];
 		if (strcmp(w1->w_name, b->b_lock1) == 0) {
 			if (strcmp(w2->w_name, b->b_lock2) == 0)
 				return (1);
 			continue;
 		}
 		if (strcmp(w1->w_name, b->b_lock2) == 0)
 			if (strcmp(w2->w_name, b->b_lock1) == 0)
 				return (1);
 	}
 	return (0);
 }
 #endif
 
 static struct witness *
 witness_get(void)
 {
 	struct witness *w;
 	int index;
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	if (witness_watch == -1) {
 		mtx_unlock_spin(&w_mtx);
 		return (NULL);
 	}
 	if (STAILQ_EMPTY(&w_free)) {
 		witness_watch = -1;
 		mtx_unlock_spin(&w_mtx);
 		printf("WITNESS: unable to allocate a new witness object\n");
 		return (NULL);
 	}
 	w = STAILQ_FIRST(&w_free);
 	STAILQ_REMOVE_HEAD(&w_free, w_list);
 	w_free_cnt--;
 	index = w->w_index;
 	MPASS(index > 0 && index == w_max_used_index+1 &&
 	    index < witness_count);
 	bzero(w, sizeof(*w));
 	w->w_index = index;
 	if (index > w_max_used_index)
 		w_max_used_index = index;
 	return (w);
 }
 
 static void
 witness_free(struct witness *w)
 {
 
 	STAILQ_INSERT_HEAD(&w_free, w, w_list);
 	w_free_cnt++;
 }
 
 static struct lock_list_entry *
 witness_lock_list_get(void)
 {
 	struct lock_list_entry *lle;
 
 	if (witness_watch == -1)
 		return (NULL);
 	mtx_lock_spin(&w_mtx);
 	lle = w_lock_list_free;
 	if (lle == NULL) {
 		witness_watch = -1;
 		mtx_unlock_spin(&w_mtx);
 		printf("%s: witness exhausted\n", __func__);
 		return (NULL);
 	}
 	w_lock_list_free = lle->ll_next;
 	mtx_unlock_spin(&w_mtx);
 	bzero(lle, sizeof(*lle));
 	return (lle);
 }
 		
 static void
 witness_lock_list_free(struct lock_list_entry *lle)
 {
 
 	mtx_lock_spin(&w_mtx);
 	lle->ll_next = w_lock_list_free;
 	w_lock_list_free = lle;
 	mtx_unlock_spin(&w_mtx);
 }
 
 static struct lock_instance *
 find_instance(struct lock_list_entry *list, const struct lock_object *lock)
 {
 	struct lock_list_entry *lle;
 	struct lock_instance *instance;
 	int i;
 
 	for (lle = list; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			instance = &lle->ll_children[i];
 			if (instance->li_lock == lock)
 				return (instance);
 		}
 	return (NULL);
 }
 
 static void
 witness_list_lock(struct lock_instance *instance,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_object *lock;
 
 	lock = instance->li_lock;
 	prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
 	    "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
 	if (lock->lo_witness->w_name != lock->lo_name)
 		prnt(" (%s)", lock->lo_witness->w_name);
 	prnt(" r = %d (%p) locked @ %s:%d\n",
 	    instance->li_flags & LI_RECURSEMASK, lock,
 	    fixup_filename(instance->li_file), instance->li_line);
 }
 
 static int
 witness_output(const char *fmt, ...)
 {
 	va_list ap;
 	int ret;
 
 	va_start(ap, fmt);
 	ret = witness_voutput(fmt, ap);
 	va_end(ap);
 	return (ret);
 }
 
 static int
 witness_voutput(const char *fmt, va_list ap)
 {
 	int ret;
 
 	ret = 0;
 	switch (witness_channel) {
 	case WITNESS_CONSOLE:
 		ret = vprintf(fmt, ap);
 		break;
 	case WITNESS_LOG:
 		vlog(LOG_NOTICE, fmt, ap);
 		break;
 	case WITNESS_NONE:
 		break;
 	}
 	return (ret);
 }
 
 #ifdef DDB
 static int
 witness_thread_has_locks(struct thread *td)
 {
 
 	if (td->td_sleeplocks == NULL)
 		return (0);
 	return (td->td_sleeplocks->ll_count != 0);
 }
 
 static int
 witness_proc_has_locks(struct proc *p)
 {
 	struct thread *td;
 
 	FOREACH_THREAD_IN_PROC(p, td) {
 		if (witness_thread_has_locks(td))
 			return (1);
 	}
 	return (0);
 }
 #endif
 
 int
 witness_list_locks(struct lock_list_entry **lock_list,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_list_entry *lle;
 	int i, nheld;
 
 	nheld = 0;
 	for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			witness_list_lock(&lle->ll_children[i], prnt);
 			nheld++;
 		}
 	return (nheld);
 }
 
 /*
  * This is a bit risky at best.  We call this function when we have timed
  * out acquiring a spin lock, and we assume that the other CPU is stuck
  * with this lock held.  So, we go groveling around in the other CPU's
  * per-cpu data to try to find the lock instance for this spin lock to
  * see when it was last acquired.
  */
 void
 witness_display_spinlock(struct lock_object *lock, struct thread *owner,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_instance *instance;
 	struct pcpu *pc;
 
 	if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU)
 		return;
 	pc = pcpu_find(owner->td_oncpu);
 	instance = find_instance(pc->pc_spinlocks, lock);
 	if (instance != NULL)
 		witness_list_lock(instance, prnt);
 }
 
 void
 witness_save(struct lock_object *lock, const char **filep, int *linep)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	/*
 	 * This function is used independently in locking code to deal with
 	 * Giant, SCHEDULER_STOPPED() check can be removed here after Giant
 	 * is gone.
 	 */
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL) {
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 	*filep = instance->li_file;
 	*linep = instance->li_line;
 }
 
 void
 witness_restore(struct lock_object *lock, const char *file, int line)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	/*
 	 * This function is used independently in locking code to deal with
 	 * Giant, SCHEDULER_STOPPED() check can be removed here after Giant
 	 * is gone.
 	 */
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL)
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 	lock->lo_witness->w_file = file;
 	lock->lo_witness->w_line = line;
 	if (instance == NULL)
 		return;
 	instance->li_file = file;
 	instance->li_line = line;
 }
 
 void
 witness_assert(const struct lock_object *lock, int flags, const char *file,
     int line)
 {
 #ifdef INVARIANT_SUPPORT
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if ((class->lc_flags & LC_SLEEPLOCK) != 0)
 		instance = find_instance(curthread->td_sleeplocks, lock);
 	else if ((class->lc_flags & LC_SPINLOCK) != 0)
 		instance = find_instance(PCPU_GET(spinlocks), lock);
 	else {
 		kassert_panic("Lock (%s) %s is not sleep or spin!",
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 	switch (flags) {
 	case LA_UNLOCKED:
 		if (instance != NULL)
 			kassert_panic("Lock (%s) %s locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		break;
 	case LA_LOCKED:
 	case LA_LOCKED | LA_RECURSED:
 	case LA_LOCKED | LA_NOTRECURSED:
 	case LA_SLOCKED:
 	case LA_SLOCKED | LA_RECURSED:
 	case LA_SLOCKED | LA_NOTRECURSED:
 	case LA_XLOCKED:
 	case LA_XLOCKED | LA_RECURSED:
 	case LA_XLOCKED | LA_NOTRECURSED:
 		if (instance == NULL) {
 			kassert_panic("Lock (%s) %s not locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			break;
 		}
 		if ((flags & LA_XLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) == 0)
 			kassert_panic(
 			    "Lock (%s) %s not exclusively locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_SLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) != 0)
 			kassert_panic(
 			    "Lock (%s) %s exclusively locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_RECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) == 0)
 			kassert_panic("Lock (%s) %s not recursed @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_NOTRECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic("Lock (%s) %s recursed @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		break;
 	default:
 		kassert_panic("Invalid lock assertion at %s:%d.",
 		    fixup_filename(file), line);
 
 	}
 #endif	/* INVARIANT_SUPPORT */
 }
 
 static void
 witness_setflag(struct lock_object *lock, int flag, int set)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL) {
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 
 	if (set)
 		instance->li_flags |= flag;
 	else
 		instance->li_flags &= ~flag;
 }
 
 void
 witness_norelease(struct lock_object *lock)
 {
 
 	witness_setflag(lock, LI_NORELEASE, 1);
 }
 
 void
 witness_releaseok(struct lock_object *lock)
 {
 
 	witness_setflag(lock, LI_NORELEASE, 0);
 }
 
 #ifdef DDB
 static void
 witness_ddb_list(struct thread *td)
 {
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	KASSERT(kdb_active, ("%s: not in the debugger", __func__));
 
 	if (witness_watch < 1)
 		return;
 
 	witness_list_locks(&td->td_sleeplocks, db_printf);
 
 	/*
 	 * We only handle spinlocks if td == curthread.  This is somewhat broken
 	 * if td is currently executing on some other CPU and holds spin locks
 	 * as we won't display those locks.  If we had a MI way of getting
 	 * the per-cpu data for a given cpu then we could use
 	 * td->td_oncpu to get the list of spinlocks for this thread
 	 * and "fix" this.
 	 *
 	 * That still wouldn't really fix this unless we locked the scheduler
 	 * lock or stopped the other CPU to make sure it wasn't changing the
 	 * list out from under us.  It is probably best to just not try to
 	 * handle threads on other CPU's for now.
 	 */
 	if (td == curthread && PCPU_GET(spinlocks) != NULL)
 		witness_list_locks(PCPU_PTR(spinlocks), db_printf);
 }
 
 DB_SHOW_COMMAND(locks, db_witness_list)
 {
 	struct thread *td;
 
 	if (have_addr)
 		td = db_lookup_thread(addr, true);
 	else
 		td = kdb_thread;
 	witness_ddb_list(td);
 }
 
 DB_SHOW_ALL_COMMAND(locks, db_witness_list_all)
 {
 	struct thread *td;
 	struct proc *p;
 
 	/*
 	 * It would be nice to list only threads and processes that actually
 	 * held sleep locks, but that information is currently not exported
 	 * by WITNESS.
 	 */
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (!witness_proc_has_locks(p))
 			continue;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			if (!witness_thread_has_locks(td))
 				continue;
 			db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid,
 			    p->p_comm, td, td->td_tid);
 			witness_ddb_list(td);
 			if (db_pager_quit)
 				return;
 		}
 	}
 }
 DB_SHOW_ALIAS(alllocks, db_witness_list_all)
 
 DB_SHOW_COMMAND(witness, db_witness_display)
 {
 
 	witness_ddb_display(db_printf);
 }
 #endif
 
 static void
 sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx)
 {
 	struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2;
 	struct witness *tmp_w1, *tmp_w2, *w1, *w2;
 	int generation, i, j;
 
 	tmp_data1 = NULL;
 	tmp_data2 = NULL;
 	tmp_w1 = NULL;
 	tmp_w2 = NULL;
 
 	/* Allocate and init temporary storage space. */
 	tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
 	tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
 	tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, 
 	    M_WAITOK | M_ZERO);
 	tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, 
 	    M_WAITOK | M_ZERO);
 	stack_zero(&tmp_data1->wlod_stack);
 	stack_zero(&tmp_data2->wlod_stack);
 
 restart:
 	mtx_lock_spin(&w_mtx);
 	generation = w_generation;
 	mtx_unlock_spin(&w_mtx);
 	sbuf_printf(sb, "Number of known direct relationships is %d\n",
 	    w_lohash.wloh_count);
 	for (i = 1; i < w_max_used_index; i++) {
 		mtx_lock_spin(&w_mtx);
 		if (generation != w_generation) {
 			mtx_unlock_spin(&w_mtx);
 
 			/* The graph has changed, try again. */
 			*oldidx = 0;
 			sbuf_clear(sb);
 			goto restart;
 		}
 
 		w1 = &w_data[i];
 		if (w1->w_reversed == 0) {
 			mtx_unlock_spin(&w_mtx);
 			continue;
 		}
 
 		/* Copy w1 locally so we can release the spin lock. */
 		*tmp_w1 = *w1;
 		mtx_unlock_spin(&w_mtx);
 
 		if (tmp_w1->w_reversed == 0)
 			continue;
 		for (j = 1; j < w_max_used_index; j++) {
 			if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
 				continue;
 
 			mtx_lock_spin(&w_mtx);
 			if (generation != w_generation) {
 				mtx_unlock_spin(&w_mtx);
 
 				/* The graph has changed, try again. */
 				*oldidx = 0;
 				sbuf_clear(sb);
 				goto restart;
 			}
 
 			w2 = &w_data[j];
 			data1 = witness_lock_order_get(w1, w2);
 			data2 = witness_lock_order_get(w2, w1);
 
 			/*
 			 * Copy information locally so we can release the
 			 * spin lock.
 			 */
 			*tmp_w2 = *w2;
 
 			if (data1) {
 				stack_zero(&tmp_data1->wlod_stack);
 				stack_copy(&data1->wlod_stack,
 				    &tmp_data1->wlod_stack);
 			}
 			if (data2 && data2 != data1) {
 				stack_zero(&tmp_data2->wlod_stack);
 				stack_copy(&data2->wlod_stack,
 				    &tmp_data2->wlod_stack);
 			}
 			mtx_unlock_spin(&w_mtx);
 
 			sbuf_printf(sb,
 	    "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n",
 			    tmp_w1->w_name, tmp_w1->w_class->lc_name, 
 			    tmp_w2->w_name, tmp_w2->w_class->lc_name);
 			if (data1) {
 				sbuf_printf(sb,
 			"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
 				    tmp_w1->w_name, tmp_w1->w_class->lc_name, 
 				    tmp_w2->w_name, tmp_w2->w_class->lc_name);
 				stack_sbuf_print(sb, &tmp_data1->wlod_stack);
 				sbuf_printf(sb, "\n");
 			}
 			if (data2 && data2 != data1) {
 				sbuf_printf(sb,
 			"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
 				    tmp_w2->w_name, tmp_w2->w_class->lc_name, 
 				    tmp_w1->w_name, tmp_w1->w_class->lc_name);
 				stack_sbuf_print(sb, &tmp_data2->wlod_stack);
 				sbuf_printf(sb, "\n");
 			}
 		}
 	}
 	mtx_lock_spin(&w_mtx);
 	if (generation != w_generation) {
 		mtx_unlock_spin(&w_mtx);
 
 		/*
 		 * The graph changed while we were printing stack data,
 		 * try again.
 		 */
 		*oldidx = 0;
 		sbuf_clear(sb);
 		goto restart;
 	}
 	mtx_unlock_spin(&w_mtx);
 
 	/* Free temporary storage space. */
 	free(tmp_data1, M_TEMP);
 	free(tmp_data2, M_TEMP);
 	free(tmp_w1, M_TEMP);
 	free(tmp_w2, M_TEMP);
 }
 
 static int
 sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf *sb;
 	int error;
 
 	if (witness_watch < 1) {
 		error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
 		return (error);
 	}
 	if (witness_cold) {
 		error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
 		return (error);
 	}
 	error = 0;
 	sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_print_witness_badstacks(sb, &req->oldidx);
 
 	sbuf_finish(sb);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 
 	return (error);
 }
 
 #ifdef DDB
 static int
 sbuf_db_printf_drain(void *arg __unused, const char *data, int len)
 {
 
 	return (db_printf("%.*s", len, data));
 }
 
 DB_SHOW_COMMAND(badstacks, db_witness_badstacks)
 {
 	struct sbuf sb;
 	char buffer[128];
 	size_t dummy;
 
 	sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN);
 	sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL);
 	sbuf_print_witness_badstacks(&sb, &dummy);
 	sbuf_finish(&sb);
 }
 #endif
 
 static int
 sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS)
 {
 	static const struct {
 		enum witness_channel channel;
 		const char *name;
 	} channels[] = {
 		{ WITNESS_CONSOLE, "console" },
 		{ WITNESS_LOG, "log" },
 		{ WITNESS_NONE, "none" },
 	};
 	char buf[16];
 	u_int i;
 	int error;
 
 	buf[0] = '\0';
 	for (i = 0; i < nitems(channels); i++)
 		if (witness_channel == channels[i].channel) {
 			snprintf(buf, sizeof(buf), "%s", channels[i].name);
 			break;
 		}
 
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	error = EINVAL;
 	for (i = 0; i < nitems(channels); i++)
 		if (strcmp(channels[i].name, buf) == 0) {
 			witness_channel = channels[i].channel;
 			error = 0;
 			break;
 		}
 	return (error);
 }
 
 static int
 sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS)
 {
 	struct witness *w;
 	struct sbuf *sb;
 	int error;
 
 #ifdef __i386__
 	error = SYSCTL_OUT(req, w_notallowed, sizeof(w_notallowed));
 	return (error);
 #endif
 
 	if (witness_watch < 1) {
 		error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
 		return (error);
 	}
 	if (witness_cold) {
 		error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
 		return (error);
 	}
 	error = 0;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req);
 	if (sb == NULL)
 		return (ENOMEM);
 	sbuf_printf(sb, "\n");
 
 	mtx_lock_spin(&w_mtx);
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_displayed = 0;
 	STAILQ_FOREACH(w, &w_all, w_list)
 		witness_add_fullgraph(sb, w);
 	mtx_unlock_spin(&w_mtx);
 
 	/*
 	 * Close the sbuf and return to userland.
 	 */
 	error = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (error);
 }
 
 static int
 sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS)
 {
 	int error, value;
 
 	value = witness_watch;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (value > 1 || value < -1 ||
 	    (witness_watch == -1 && value != witness_watch))
 		return (EINVAL);
 	witness_watch = value;
 	return (0);
 }
 
 static void
 witness_add_fullgraph(struct sbuf *sb, struct witness *w)
 {
 	int i;
 
 	if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0))
 		return;
 	w->w_displayed = 1;
 
 	WITNESS_INDEX_ASSERT(w->w_index);
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
 			sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name,
 			    w_data[i].w_name);
 			witness_add_fullgraph(sb, &w_data[i]);
 		}
 	}
 }
 
 /*
  * A simple hash function. Takes a key pointer and a key size. If size == 0,
  * interprets the key as a string and reads until the null
  * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit
  * hash value computed from the key.
  */
 static uint32_t
 witness_hash_djb2(const uint8_t *key, uint32_t size)
 {
 	unsigned int hash = 5381;
 	int i;
 
 	/* hash = hash * 33 + key[i] */
 	if (size)
 		for (i = 0; i < size; i++)
 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
 	else
 		for (i = 0; key[i] != 0; i++)
 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
 
 	return (hash);
 }
 
 
 /*
  * Initializes the two witness hash tables. Called exactly once from
  * witness_initialize().
  */
 static void
 witness_init_hash_tables(void)
 {
 	int i;
 
 	MPASS(witness_cold);
 
 	/* Initialize the hash tables. */
 	for (i = 0; i < WITNESS_HASH_SIZE; i++)
 		w_hash.wh_array[i] = NULL;
 
 	w_hash.wh_size = WITNESS_HASH_SIZE;
 	w_hash.wh_count = 0;
 
 	/* Initialize the lock order data hash. */
 	w_lofree = NULL;
 	for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
 		memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
 		w_lodata[i].wlod_next = w_lofree;
 		w_lofree = &w_lodata[i];
 	}
 	w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
 	w_lohash.wloh_count = 0;
 	for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
 		w_lohash.wloh_array[i] = NULL;
 }
 
 static struct witness *
 witness_hash_get(const char *key)
 {
 	struct witness *w;
 	uint32_t hash;
 	
 	MPASS(key != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	hash = witness_hash_djb2(key, 0) % w_hash.wh_size;
 	w = w_hash.wh_array[hash];
 	while (w != NULL) {
 		if (strcmp(w->w_name, key) == 0)
 			goto out;
 		w = w->w_hash_next;
 	}
 
 out:
 	return (w);
 }
 
 static void
 witness_hash_put(struct witness *w)
 {
 	uint32_t hash;
 
 	MPASS(w != NULL);
 	MPASS(w->w_name != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	KASSERT(witness_hash_get(w->w_name) == NULL,
 	    ("%s: trying to add a hash entry that already exists!", __func__));
 	KASSERT(w->w_hash_next == NULL,
 	    ("%s: w->w_hash_next != NULL", __func__));
 
 	hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size;
 	w->w_hash_next = w_hash.wh_array[hash];
 	w_hash.wh_array[hash] = w;
 	w_hash.wh_count++;
 }
 
 
 static struct witness_lock_order_data *
 witness_lock_order_get(struct witness *parent, struct witness *child)
 {
 	struct witness_lock_order_data *data = NULL;
 	struct witness_lock_order_key key;
 	unsigned int hash;
 
 	MPASS(parent != NULL && child != NULL);
 	key.from = parent->w_index;
 	key.to = child->w_index;
 	WITNESS_INDEX_ASSERT(key.from);
 	WITNESS_INDEX_ASSERT(key.to);
 	if ((w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN) == 0)
 		goto out;
 
 	hash = witness_hash_djb2((const char*)&key,
 	    sizeof(key)) % w_lohash.wloh_size;
 	data = w_lohash.wloh_array[hash];
 	while (data != NULL) {
 		if (witness_lock_order_key_equal(&data->wlod_key, &key))
 			break;
 		data = data->wlod_next;
 	}
 
 out:
 	return (data);
 }
 
 /*
  * Verify that parent and child have a known relationship, are not the same,
  * and child is actually a child of parent.  This is done without w_mtx
  * to avoid contention in the common case.
  */
 static int
 witness_lock_order_check(struct witness *parent, struct witness *child)
 {
 
 	if (parent != child &&
 	    w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN &&
 	    isitmychild(parent, child))
 		return (1);
 
 	return (0);
 }
 
 static int
 witness_lock_order_add(struct witness *parent, struct witness *child)
 {
 	struct witness_lock_order_data *data = NULL;
 	struct witness_lock_order_key key;
 	unsigned int hash;
 	
 	MPASS(parent != NULL && child != NULL);
 	key.from = parent->w_index;
 	key.to = child->w_index;
 	WITNESS_INDEX_ASSERT(key.from);
 	WITNESS_INDEX_ASSERT(key.to);
 	if (w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN)
 		return (1);
 
 	hash = witness_hash_djb2((const char*)&key,
 	    sizeof(key)) % w_lohash.wloh_size;
 	w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
 	data = w_lofree;
 	if (data == NULL)
 		return (0);
 	w_lofree = data->wlod_next;
 	data->wlod_next = w_lohash.wloh_array[hash];
 	data->wlod_key = key;
 	w_lohash.wloh_array[hash] = data;
 	w_lohash.wloh_count++;
 	stack_zero(&data->wlod_stack);
 	stack_save(&data->wlod_stack);
 	return (1);
 }
 
 /* Call this whenever the structure of the witness graph changes. */
 static void
 witness_increment_graph_generation(void)
 {
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	w_generation++;
 }
 
 static int
 witness_output_drain(void *arg __unused, const char *data, int len)
 {
 
 	witness_output("%.*s", len, data);
 	return (len);
 }
 
 static void
 witness_debugger(int cond, const char *msg)
 {
 	char buf[32];
 	struct sbuf sb;
 	struct stack st;
 
 	if (!cond)
 		return;
 
 	if (witness_trace) {
 		sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
 		sbuf_set_drain(&sb, witness_output_drain, NULL);
 
 		stack_zero(&st);
 		stack_save(&st);
 		witness_output("stack backtrace:\n");
 		stack_sbuf_print_ddb(&sb, &st);
 
 		sbuf_finish(&sb);
 	}
 
 #ifdef KDB
 	if (witness_kdb)
 		kdb_enter(KDB_WHY_WITNESS, msg);
 #endif
 }
Index: projects/runtime-coverage-v2/sys/net/if_tuntap.c
===================================================================
--- projects/runtime-coverage-v2/sys/net/if_tuntap.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/net/if_tuntap.c	(revision 347599)
@@ -1,1718 +1,1718 @@
 /*	$NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $	*/
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com>
  * All rights reserved.
  * Copyright (c) 2019 Kyle Evans <kevans@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * BASED ON:
  * -------------------------------------------------------------------------
  *
  * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk>
  * Nottingham University 1987.
  *
  * This source may be freely distributed, however I would be interested
  * in any changes that are made.
  *
  * This driver takes packets off the IP i/f and hands them up to a
  * user process to have its wicked way with. This driver has it's
  * roots in a similar driver written by Phil Cockcroft (formerly) at
  * UCL. This driver is based much more on read/write/poll mode of
  * operation though.
  *
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/jail.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/fcntl.h>
 #include <sys/filio.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/ttycom.h>
 #include <sys/poll.h>
 #include <sys/selinfo.h>
 #include <sys/signalvar.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/random.h>
 #include <sys/ctype.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #ifdef INET
 #include <netinet/in.h>
 #endif
 #include <net/bpf.h>
 #include <net/if_tap.h>
 #include <net/if_tun.h>
 
 #include <sys/queue.h>
 #include <sys/condvar.h>
 #include <security/mac/mac_framework.h>
 
 struct tuntap_driver;
 
 /*
  * tun_list is protected by global tunmtx.  Other mutable fields are
  * protected by tun->tun_mtx, or by their owning subsystem.  tun_dev is
  * static for the duration of a tunnel interface.
  */
 struct tuntap_softc {
 	TAILQ_ENTRY(tuntap_softc)	 tun_list;
 	struct cdev			*tun_dev;
 	u_short				 tun_flags;	/* misc flags */
 #define	TUN_OPEN	0x0001
 #define	TUN_INITED	0x0002
 #define	TUN_RCOLL	0x0004
 #define	TUN_IASET	0x0008
 #define	TUN_DSTADDR	0x0010
 #define	TUN_LMODE	0x0020
 #define	TUN_RWAIT	0x0040
 #define	TUN_ASYNC	0x0080
 #define	TUN_IFHEAD	0x0100
 #define	TUN_DYING	0x0200
 #define	TUN_L2		0x0400
 #define	TUN_VMNET	0x0800
 
 #define	TUN_DRIVER_IDENT_MASK	(TUN_L2 | TUN_VMNET)
 #define	TUN_READY		(TUN_OPEN | TUN_INITED)
 
 	pid_t			 tun_pid;	/* owning pid */
 	struct ifnet		*tun_ifp;	/* the interface */
 	struct sigio		*tun_sigio;	/* async I/O info */
 	struct tuntap_driver	*tun_drv;	/* appropriate driver */
 	struct selinfo		 tun_rsel;	/* read select */
 	struct mtx		 tun_mtx;	/* softc field mutex */
 	struct cv		 tun_cv;	/* for ref'd dev destroy */
 	struct ether_addr	 tun_ether;	/* remote address */
 };
 #define	TUN2IFP(sc)	((sc)->tun_ifp)
 
 #define	TUNDEBUG	if (tundebug) if_printf
 
 #define	TUN_LOCK(tp)	mtx_lock(&(tp)->tun_mtx)
 #define	TUN_UNLOCK(tp)	mtx_unlock(&(tp)->tun_mtx)
 
 #define	TUN_VMIO_FLAG_MASK	0x0fff
 
 /*
  * All mutable global variables in if_tun are locked using tunmtx, with
  * the exception of tundebug, which is used unlocked, and the drivers' *clones,
  * which are static after setup.
  */
 static struct mtx tunmtx;
 static eventhandler_tag tag;
 static const char tunname[] = "tun";
 static const char tapname[] = "tap";
 static const char vmnetname[] = "vmnet";
 static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface");
 static int tundebug = 0;
 static int tundclone = 1;
 static int tap_allow_uopen = 0;	/* allow user open() */
 static int tapuponopen = 0;	/* IFF_UP on open() */
 static int tapdclone = 1;	/* enable devfs cloning */
 
 static TAILQ_HEAD(,tuntap_softc)	tunhead = TAILQ_HEAD_INITIALIZER(tunhead);
 SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, "");
 
 static struct sx tun_ioctl_sx;
 SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl");
 
 SYSCTL_DECL(_net_link);
 /* tun */
 static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0,
     "IP tunnel software network interface.");
 SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0,
     "Enable legacy devfs interface creation.");
 
 /* tap */
 static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0,
     "Ethernet tunnel software network interface");
 SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0,
     "Allow user to open /dev/tap (based on node permissions)");
 SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0,
     "Bring interface up when /dev/tap is opened");
 SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0,
     "Enable legacy devfs interface creation");
 SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, "");
 
 static int	tuntap_name2info(const char *name, int *unit, int *flags);
 static void	tunclone(void *arg, struct ucred *cred, char *name,
 		    int namelen, struct cdev **dev);
 static void	tuncreate(struct cdev *dev, struct tuntap_driver *);
 static int	tunifioctl(struct ifnet *, u_long, caddr_t);
 static void	tuninit(struct ifnet *);
 static void	tunifinit(void *xtp);
 static int	tuntapmodevent(module_t, int, void *);
 static int	tunoutput(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *ro);
 static void	tunstart(struct ifnet *);
 static void	tunstart_l2(struct ifnet *);
 
 static int	tun_clone_match(struct if_clone *ifc, const char *name);
 static int	tap_clone_match(struct if_clone *ifc, const char *name);
 static int	vmnet_clone_match(struct if_clone *ifc, const char *name);
 static int	tun_clone_create(struct if_clone *, char *, size_t, caddr_t);
 static int	tun_clone_destroy(struct if_clone *, struct ifnet *);
 
 static d_open_t		tunopen;
 static d_close_t	tunclose;
 static d_read_t		tunread;
 static d_write_t	tunwrite;
 static d_ioctl_t	tunioctl;
 static d_poll_t		tunpoll;
 static d_kqfilter_t	tunkqfilter;
 
 static int		tunkqread(struct knote *, long);
 static int		tunkqwrite(struct knote *, long);
 static void		tunkqdetach(struct knote *);
 
 static struct filterops tun_read_filterops = {
 	.f_isfd =	1,
 	.f_attach =	NULL,
 	.f_detach =	tunkqdetach,
 	.f_event =	tunkqread,
 };
 
 static struct filterops tun_write_filterops = {
 	.f_isfd =	1,
 	.f_attach =	NULL,
 	.f_detach =	tunkqdetach,
 	.f_event =	tunkqwrite,
 };
 
 static struct tuntap_driver {
 	struct cdevsw		 cdevsw;
 	int			 ident_flags;
 	struct unrhdr		*unrhdr;
 	struct clonedevs	*clones;
 	ifc_match_t		*clone_match_fn;
 	ifc_create_t		*clone_create_fn;
 	ifc_destroy_t		*clone_destroy_fn;
 } tuntap_drivers[] = {
 	{
 		.ident_flags =	0,
 		.cdevsw =	{
 		    .d_version =	D_VERSION,
 		    .d_flags =		D_NEEDMINOR,
 		    .d_open =		tunopen,
 		    .d_close =		tunclose,
 		    .d_read =		tunread,
 		    .d_write =		tunwrite,
 		    .d_ioctl =		tunioctl,
 		    .d_poll =		tunpoll,
 		    .d_kqfilter =	tunkqfilter,
 		    .d_name =		tunname,
 		},
 		.clone_match_fn =	tun_clone_match,
 		.clone_create_fn =	tun_clone_create,
 		.clone_destroy_fn =	tun_clone_destroy,
 	},
 	{
 		.ident_flags =	TUN_L2,
 		.cdevsw =	{
 		    .d_version =	D_VERSION,
 		    .d_flags =		D_NEEDMINOR,
 		    .d_open =		tunopen,
 		    .d_close =		tunclose,
 		    .d_read =		tunread,
 		    .d_write =		tunwrite,
 		    .d_ioctl =		tunioctl,
 		    .d_poll =		tunpoll,
 		    .d_kqfilter =	tunkqfilter,
 		    .d_name =		tapname,
 		},
 		.clone_match_fn =	tap_clone_match,
 		.clone_create_fn =	tun_clone_create,
 		.clone_destroy_fn =	tun_clone_destroy,
 	},
 	{
 		.ident_flags =	TUN_L2 | TUN_VMNET,
 		.cdevsw =	{
 		    .d_version =	D_VERSION,
 		    .d_flags =		D_NEEDMINOR,
 		    .d_open =		tunopen,
 		    .d_close =		tunclose,
 		    .d_read =		tunread,
 		    .d_write =		tunwrite,
 		    .d_ioctl =		tunioctl,
 		    .d_poll =		tunpoll,
 		    .d_kqfilter =	tunkqfilter,
 		    .d_name =		vmnetname,
 		},
 		.clone_match_fn =	vmnet_clone_match,
 		.clone_create_fn =	tun_clone_create,
 		.clone_destroy_fn =	tun_clone_destroy,
 	},
 };
 
 struct tuntap_driver_cloner {
 	SLIST_ENTRY(tuntap_driver_cloner)	 link;
 	struct tuntap_driver			*drv;
 	struct if_clone				*cloner;
 };
 
 VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) =
     SLIST_HEAD_INITIALIZER(tuntap_driver_cloners);
 
 #define	V_tuntap_driver_cloners	VNET(tuntap_driver_cloners)
 
 /*
  * Sets unit and/or flags given the device name.  Must be called with correct
  * vnet context.
  */
 static int
 tuntap_name2info(const char *name, int *outunit, int *outflags)
 {
 	struct tuntap_driver *drv;
 	struct tuntap_driver_cloner *drvc;
 	char *dname;
 	int flags, unit;
 	bool found;
 
 	if (name == NULL)
 		return (EINVAL);
 
 	/*
 	 * Needed for dev_stdclone, but dev_stdclone will not modify, it just
 	 * wants to be able to pass back a char * through the second param. We
 	 * will always set that as NULL here, so we'll fake it.
 	 */
 	dname = __DECONST(char *, name);
 	found = false;
 
 	KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners),
 	    ("tuntap_driver_cloners failed to initialize"));
 	SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) {
 		KASSERT(drvc->drv != NULL,
 		    ("tuntap_driver_cloners entry not properly initialized"));
 		drv = drvc->drv;
 
 		if (strcmp(name, drv->cdevsw.d_name) == 0) {
 			found = true;
 			unit = -1;
 			flags = drv->ident_flags;
 			break;
 		}
 
 		if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) {
 			found = true;
 			flags = drv->ident_flags;
 			break;
 		}
 	}
 
 	if (!found)
 		return (ENXIO);
 
 	if (outunit != NULL)
 		*outunit = unit;
 	if (outflags != NULL)
 		*outflags = flags;
 	return (0);
 }
 
 /*
  * Get driver information from a set of flags specified.  Masks the identifying
  * part of the flags and compares it against all of the available
  * tuntap_drivers. Must be called with correct vnet context.
  */
 static struct tuntap_driver *
 tuntap_driver_from_flags(int tun_flags)
 {
 	struct tuntap_driver *drv;
 	struct tuntap_driver_cloner *drvc;
 
 	KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners),
 	    ("tuntap_driver_cloners failed to initialize"));
 	SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) {
 		KASSERT(drvc->drv != NULL,
 		    ("tuntap_driver_cloners entry not properly initialized"));
 		drv = drvc->drv;
 		if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags)
 			return (drv);
 	}
 
 	return (NULL);
 }
 
 
 
 static int
 tun_clone_match(struct if_clone *ifc, const char *name)
 {
 	int tunflags;
 
 	if (tuntap_name2info(name, NULL, &tunflags) == 0) {
 		if ((tunflags & TUN_L2) == 0)
 			return (1);
 	}
 
 	return (0);
 }
 
 static int
 tap_clone_match(struct if_clone *ifc, const char *name)
 {
 	int tunflags;
 
 	if (tuntap_name2info(name, NULL, &tunflags) == 0) {
 		if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2)
 			return (1);
 	}
 
 	return (0);
 }
 
 static int
 vmnet_clone_match(struct if_clone *ifc, const char *name)
 {
 	int tunflags;
 
 	if (tuntap_name2info(name, NULL, &tunflags) == 0) {
 		if ((tunflags & TUN_VMNET) != 0)
 			return (1);
 	}
 
 	return (0);
 }
 
 static int
 tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 {
 	struct tuntap_driver *drv;
 	struct cdev *dev;
 	int err, i, tunflags, unit;
 
 	tunflags = 0;
 	/* The name here tells us exactly what we're creating */
 	err = tuntap_name2info(name, &unit, &tunflags);
 	if (err != 0)
 		return (err);
 
 	drv = tuntap_driver_from_flags(tunflags);
 	if (drv == NULL)
 		return (ENXIO);
 
 	if (unit != -1) {
 		/* If this unit number is still available that/s okay. */
 		if (alloc_unr_specific(drv->unrhdr, unit) == -1)
 			return (EEXIST);
 	} else {
 		unit = alloc_unr(drv->unrhdr);
 	}
 
 	snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit);
 
 	/* find any existing device, or allocate new unit number */
 	i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0);
 	if (i) {
 		/* No preexisting struct cdev *, create one */
 		dev = make_dev(&drv->cdevsw, unit, UID_UUCP, GID_DIALER, 0600,
 		    "%s%d", drv->cdevsw.d_name, unit);
 	}
 
 	tuncreate(dev, drv);
 
 	return (0);
 }
 
 static void
 tunclone(void *arg, struct ucred *cred, char *name, int namelen,
     struct cdev **dev)
 {
 	char devname[SPECNAMELEN + 1];
 	struct tuntap_driver *drv;
 	int append_unit, i, u, tunflags;
 	bool mayclone;
 
 	if (*dev != NULL)
 		return;
 
 	tunflags = 0;
 	CURVNET_SET(CRED_TO_VNET(cred));
 	if (tuntap_name2info(name, &u, &tunflags) != 0)
 		goto out;	/* Not recognized */
 
 	if (u != -1 && u > IF_MAXUNIT)
 		goto out;	/* Unit number too high */
 
 	mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0;
 	if ((tunflags & TUN_L2) != 0) {
 		/* tap/vmnet allow user open with a sysctl */
 		mayclone = (mayclone || tap_allow_uopen) && tapdclone;
 	} else {
 		mayclone = mayclone && tundclone;
 	}
 
 	/*
 	 * If tun cloning is enabled, only the superuser can create an
 	 * interface.
 	 */
 	if (!mayclone)
 		goto out;
 
 	if (u == -1)
 		append_unit = 1;
 	else
 		append_unit = 0;
 
 	drv = tuntap_driver_from_flags(tunflags);
 	if (drv == NULL)
 		goto out;
 
 	/* find any existing device, or allocate new unit number */
 	i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0);
 	if (i) {
 		if (append_unit) {
 			namelen = snprintf(devname, sizeof(devname), "%s%d",
 			    name, u);
 			name = devname;
 		}
 		/* No preexisting struct cdev *, create one */
 		*dev = make_dev_credf(MAKEDEV_REF, &drv->cdevsw, u, cred,
 		    UID_UUCP, GID_DIALER, 0600, "%s", name);
 	}
 
 	if_clone_create(name, namelen, NULL);
 out:
 	CURVNET_RESTORE();
 }
 
 static void
 tun_destroy(struct tuntap_softc *tp)
 {
 
 	TUN_LOCK(tp);
 	tp->tun_flags |= TUN_DYING;
 	if ((tp->tun_flags & TUN_OPEN) != 0)
 		cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx);
 	else
 		TUN_UNLOCK(tp);
 
 	CURVNET_SET(TUN2IFP(tp)->if_vnet);
-	sx_xlock(&tun_ioctl_sx);
-	TUN2IFP(tp)->if_softc = NULL;
-	sx_xunlock(&tun_ioctl_sx);
 
 	destroy_dev(tp->tun_dev);
 	seldrain(&tp->tun_rsel);
 	knlist_clear(&tp->tun_rsel.si_note, 0);
 	knlist_destroy(&tp->tun_rsel.si_note);
 	if ((tp->tun_flags & TUN_L2) != 0) {
 		ether_ifdetach(TUN2IFP(tp));
 	} else {
 		bpfdetach(TUN2IFP(tp));
 		if_detach(TUN2IFP(tp));
 	}
+	sx_xlock(&tun_ioctl_sx);
+	TUN2IFP(tp)->if_softc = NULL;
+	sx_xunlock(&tun_ioctl_sx);
 	free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit);
 	if_free(TUN2IFP(tp));
 	mtx_destroy(&tp->tun_mtx);
 	cv_destroy(&tp->tun_cv);
 	free(tp, M_TUN);
 	CURVNET_RESTORE();
 }
 
 static int
 tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp)
 {
 	struct tuntap_softc *tp = ifp->if_softc;
 
 	mtx_lock(&tunmtx);
 	TAILQ_REMOVE(&tunhead, tp, tun_list);
 	mtx_unlock(&tunmtx);
 	tun_destroy(tp);
 
 	return (0);
 }
 
 static void
 vnet_tun_init(const void *unused __unused)
 {
 	struct tuntap_driver *drv;
 	struct tuntap_driver_cloner *drvc;
 	int i;
 
 	for (i = 0; i < nitems(tuntap_drivers); ++i) {
 		drv = &tuntap_drivers[i];
 		drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO);
 
 		drvc->drv = drv;
 		drvc->cloner = if_clone_advanced(drv->cdevsw.d_name, 0,
 		    drv->clone_match_fn, drv->clone_create_fn,
 		    drv->clone_destroy_fn);
 		SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link);
 	};
 }
 VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
 		vnet_tun_init, NULL);
 
 static void
 vnet_tun_uninit(const void *unused __unused)
 {
 	struct tuntap_driver_cloner *drvc;
 
 	while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) {
 		drvc = SLIST_FIRST(&V_tuntap_driver_cloners);
 		SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link);
 
 		if_clone_detach(drvc->cloner);
 		free(drvc, M_TUN);
 	}
 }
 VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_tun_uninit, NULL);
 
 static void
 tun_uninit(const void *unused __unused)
 {
 	struct tuntap_driver *drv;
 	struct tuntap_softc *tp;
 	int i;
 
 	EVENTHANDLER_DEREGISTER(dev_clone, tag);
 	drain_dev_clone_events();
 
 	mtx_lock(&tunmtx);
 	while ((tp = TAILQ_FIRST(&tunhead)) != NULL) {
 		TAILQ_REMOVE(&tunhead, tp, tun_list);
 		mtx_unlock(&tunmtx);
 		tun_destroy(tp);
 		mtx_lock(&tunmtx);
 	}
 	mtx_unlock(&tunmtx);
 	for (i = 0; i < nitems(tuntap_drivers); ++i) {
 		drv = &tuntap_drivers[i];
 		delete_unrhdr(drv->unrhdr);
 		clone_cleanup(&drv->clones);
 	}
 	mtx_destroy(&tunmtx);
 }
 SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL);
 
 static int
 tuntapmodevent(module_t mod, int type, void *data)
 {
 	struct tuntap_driver *drv;
 	int i;
 
 	switch (type) {
 	case MOD_LOAD:
 		mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF);
 		for (i = 0; i < nitems(tuntap_drivers); ++i) {
 			drv = &tuntap_drivers[i];
 			clone_setup(&drv->clones);
 			drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx);
 		}
 		tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000);
 		if (tag == NULL)
 			return (ENOMEM);
 		break;
 	case MOD_UNLOAD:
 		/* See tun_uninit, so it's done after the vnet_sysuninit() */
 		break;
 	default:
 		return EOPNOTSUPP;
 	}
 	return 0;
 }
 
 static moduledata_t tuntap_mod = {
 	"if_tuntap",
 	tuntapmodevent,
 	0
 };
 
 DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_tuntap, 1);
 
 static void
 tunstart(struct ifnet *ifp)
 {
 	struct tuntap_softc *tp = ifp->if_softc;
 	struct mbuf *m;
 
 	TUNDEBUG(ifp, "starting\n");
 	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		IFQ_LOCK(&ifp->if_snd);
 		IFQ_POLL_NOLOCK(&ifp->if_snd, m);
 		if (m == NULL) {
 			IFQ_UNLOCK(&ifp->if_snd);
 			return;
 		}
 		IFQ_UNLOCK(&ifp->if_snd);
 	}
 
 	TUN_LOCK(tp);
 	if (tp->tun_flags & TUN_RWAIT) {
 		tp->tun_flags &= ~TUN_RWAIT;
 		wakeup(tp);
 	}
 	selwakeuppri(&tp->tun_rsel, PZERO + 1);
 	KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 	if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) {
 		TUN_UNLOCK(tp);
 		pgsigio(&tp->tun_sigio, SIGIO, 0);
 	} else
 		TUN_UNLOCK(tp);
 }
 
 /*
  * tunstart_l2
  *
  * queue packets from higher level ready to put out
  */
 static void
 tunstart_l2(struct ifnet *ifp)
 {
 	struct tuntap_softc	*tp = ifp->if_softc;
 
 	TUNDEBUG(ifp, "starting\n");
 
 	/*
 	 * do not junk pending output if we are in VMnet mode.
 	 * XXX: can this do any harm because of queue overflow?
 	 */
 
 	TUN_LOCK(tp);
 	if (((tp->tun_flags & TUN_VMNET) == 0) &&
 	    ((tp->tun_flags & TUN_READY) != TUN_READY)) {
 		struct mbuf *m;
 
 		/* Unlocked read. */
 		TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags);
 
 		for (;;) {
 			IF_DEQUEUE(&ifp->if_snd, m);
 			if (m != NULL) {
 				m_freem(m);
 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			} else
 				break;
 		}
 		TUN_UNLOCK(tp);
 
 		return;
 	}
 
 	ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 
 	if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
 		if (tp->tun_flags & TUN_RWAIT) {
 			tp->tun_flags &= ~TUN_RWAIT;
 			wakeup(tp);
 		}
 
 		if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) {
 			TUN_UNLOCK(tp);
 			pgsigio(&tp->tun_sigio, SIGIO, 0);
 			TUN_LOCK(tp);
 		}
 
 		selwakeuppri(&tp->tun_rsel, PZERO+1);
 		KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */
 	}
 
 	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 	TUN_UNLOCK(tp);
 } /* tunstart_l2 */
 
 
 /* XXX: should return an error code so it can fail. */
 static void
 tuncreate(struct cdev *dev, struct tuntap_driver *drv)
 {
 	struct tuntap_softc *sc;
 	struct ifnet *ifp;
 	struct ether_addr eaddr;
 	int iflags;
 	u_char type;
 
 	sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO);
 	mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF);
 	cv_init(&sc->tun_cv, "tun_condvar");
 	sc->tun_flags = drv->ident_flags;
 	sc->tun_dev = dev;
 	sc->tun_drv = drv;
 	mtx_lock(&tunmtx);
 	TAILQ_INSERT_TAIL(&tunhead, sc, tun_list);
 	mtx_unlock(&tunmtx);
 
 	iflags = IFF_MULTICAST;
 	if ((sc->tun_flags & TUN_L2) != 0) {
 		type = IFT_ETHER;
 		iflags |= IFF_BROADCAST | IFF_SIMPLEX;
 	} else {
 		type = IFT_PPP;
 		iflags |= IFF_POINTOPOINT;
 	}
 	ifp = sc->tun_ifp = if_alloc(type);
 	if (ifp == NULL)
 		panic("%s%d: failed to if_alloc() interface.\n",
 		    drv->cdevsw.d_name, dev2unit(dev));
 	ifp->if_softc = sc;
 	if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev));
 	ifp->if_ioctl = tunifioctl;
 	ifp->if_flags = iflags;
 	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
 	knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx);
 	ifp->if_capabilities |= IFCAP_LINKSTATE;
 	ifp->if_capenable |= IFCAP_LINKSTATE;
 
 	if ((sc->tun_flags & TUN_L2) != 0) {
 		ifp->if_mtu = ETHERMTU;
 		ifp->if_init = tunifinit;
 		ifp->if_start = tunstart_l2;
 
 		ether_gen_addr(ifp, &eaddr);
 		ether_ifattach(ifp, eaddr.octet);
 	} else {
 		ifp->if_mtu = TUNMTU;
 		ifp->if_start = tunstart;
 		ifp->if_output = tunoutput;
 
 		ifp->if_snd.ifq_drv_maxlen = 0;
 		IFQ_SET_READY(&ifp->if_snd);
 
 		if_attach(ifp);
 		bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
 	}
 	dev->si_drv1 = sc;
 
 	TUN_LOCK(sc);
 	sc->tun_flags |= TUN_INITED;
 	TUN_UNLOCK(sc);
 
 	TUNDEBUG(ifp, "interface %s is created, minor = %#x\n",
 	    ifp->if_xname, dev2unit(dev));
 }
 
 static int
 tunopen(struct cdev *dev, int flag, int mode, struct thread *td)
 {
 	struct ifnet	*ifp;
 	struct tuntap_driver *drv;
 	struct tuntap_softc *tp;
 	int error, tunflags;
 
 	tunflags = 0;
 	CURVNET_SET(TD_TO_VNET(td));
 	error = tuntap_name2info(dev->si_name, NULL, &tunflags);
 	if (error != 0) {
 		CURVNET_RESTORE();
 		return (error);	/* Shouldn't happen */
 	}
 
 	if ((tunflags & TUN_L2) != 0) {
 		/* Restrict? */
 		if (tap_allow_uopen == 0) {
 			error = priv_check(td, PRIV_NET_TAP);
 			if (error != 0) {
 				CURVNET_RESTORE();
 				return (error);
 			}
 		}
 	}
 
 	/*
 	 * XXXRW: Non-atomic test and set of dev->si_drv1 requires
 	 * synchronization.
 	 */
 	tp = dev->si_drv1;
 	if (!tp) {
 		drv = tuntap_driver_from_flags(tunflags);
 		if (drv == NULL) {
 			CURVNET_RESTORE();
 			return (ENXIO);
 		}
 		tuncreate(dev, drv);
 		tp = dev->si_drv1;
 	}
 
 	TUN_LOCK(tp);
 	if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) {
 		TUN_UNLOCK(tp);
 		CURVNET_RESTORE();
 		return (EBUSY);
 	}
 
 	ifp = TUN2IFP(tp);
 
 	if ((tp->tun_flags & TUN_L2) != 0) {
 		bcopy(IF_LLADDR(ifp), tp->tun_ether.octet,
 		    sizeof(tp->tun_ether.octet));
 
 		ifp->if_drv_flags |= IFF_DRV_RUNNING;
 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 
 		if (tapuponopen)
 			ifp->if_flags |= IFF_UP;
 	}
 
 	tp->tun_pid = td->td_proc->p_pid;
 	tp->tun_flags |= TUN_OPEN;
 
 	if_link_state_change(ifp, LINK_STATE_UP);
 	TUNDEBUG(ifp, "open\n");
 	TUN_UNLOCK(tp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * tunclose - close the device - mark i/f down & delete
  * routing info
  */
 static	int
 tunclose(struct cdev *dev, int foo, int bar, struct thread *td)
 {
 	struct tuntap_softc *tp;
 	struct ifnet *ifp;
 	bool l2tun;
 
 	tp = dev->si_drv1;
 	ifp = TUN2IFP(tp);
 
 	TUN_LOCK(tp);
 	/*
 	 * Simply close the device if this isn't the controlling process.  This
 	 * may happen if, for instance, the tunnel has been handed off to
 	 * another process.  The original controller should be able to close it
 	 * without putting us into an inconsistent state.
 	 */
 	if (td->td_proc->p_pid != tp->tun_pid) {
 		TUN_UNLOCK(tp);
 		return (0);
 	}
 
 	/*
 	 * junk all pending output
 	 */
 	CURVNET_SET(ifp->if_vnet);
 
 	l2tun = false;
 	if ((tp->tun_flags & TUN_L2) != 0) {
 		l2tun = true;
 		IF_DRAIN(&ifp->if_snd);
 	} else {
 		IFQ_PURGE(&ifp->if_snd);
 	}
 
 	/* For vmnet, we won't do most of the address/route bits */
 	if ((tp->tun_flags & TUN_VMNET) != 0 ||
 	    (l2tun && (ifp->if_flags & IFF_LINK0) != 0))
 		goto out;
 
 	if (ifp->if_flags & IFF_UP) {
 		TUN_UNLOCK(tp);
 		if_down(ifp);
 		TUN_LOCK(tp);
 	}
 
 	/* Delete all addresses and routes which reference this interface. */
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		struct ifaddr *ifa;
 
 		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 		TUN_UNLOCK(tp);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			/* deal w/IPv4 PtP destination; unlocked read */
 			if (!l2tun && ifa->ifa_addr->sa_family == AF_INET) {
 				rtinit(ifa, (int)RTM_DELETE,
 				    tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0);
 			} else {
 				rtinit(ifa, (int)RTM_DELETE, 0);
 			}
 		}
 		if_purgeaddrs(ifp);
 		TUN_LOCK(tp);
 	}
 
 out:
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 	CURVNET_RESTORE();
 
 	funsetown(&tp->tun_sigio);
 	selwakeuppri(&tp->tun_rsel, PZERO + 1);
 	KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 	TUNDEBUG (ifp, "closed\n");
 	tp->tun_flags &= ~TUN_OPEN;
 	tp->tun_pid = 0;
 
 	cv_broadcast(&tp->tun_cv);
 	TUN_UNLOCK(tp);
 	return (0);
 }
 
 static void
 tuninit(struct ifnet *ifp)
 {
 	struct tuntap_softc *tp = ifp->if_softc;
 #ifdef INET
 	struct ifaddr *ifa;
 #endif
 
 	TUNDEBUG(ifp, "tuninit\n");
 
 	TUN_LOCK(tp);
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	if ((tp->tun_flags & TUN_L2) == 0) {
 		ifp->if_flags |= IFF_UP;
 		getmicrotime(&ifp->if_lastchange);
 #ifdef INET
 		if_addr_rlock(ifp);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family == AF_INET) {
 				struct sockaddr_in *si;
 
 				si = (struct sockaddr_in *)ifa->ifa_addr;
 				if (si->sin_addr.s_addr)
 					tp->tun_flags |= TUN_IASET;
 
 				si = (struct sockaddr_in *)ifa->ifa_dstaddr;
 				if (si && si->sin_addr.s_addr)
 					tp->tun_flags |= TUN_DSTADDR;
 			}
 		}
 		if_addr_runlock(ifp);
 #endif
 		TUN_UNLOCK(tp);
 	} else {
 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 		TUN_UNLOCK(tp);
 		/* attempt to start output */
 		tunstart_l2(ifp);
 	}
 
 }
 
 /*
  * Used only for l2 tunnel.
  */
 static void
 tunifinit(void *xtp)
 {
 	struct tuntap_softc *tp;
 
 	tp = (struct tuntap_softc *)xtp;
 	tuninit(tp->tun_ifp);
 }
 
 /*
  * Process an ioctl request.
  */
 static int
 tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct tuntap_softc *tp;
 	struct ifstat *ifs;
 	struct ifmediareq	*ifmr;
 	int		dummy, error = 0;
 	bool		l2tun;
 
 	ifmr = NULL;
 	sx_xlock(&tun_ioctl_sx);
 	tp = ifp->if_softc;
 	if (tp == NULL) {
 		error = ENXIO;
 		goto bad;
 	}
 	l2tun = (tp->tun_flags & TUN_L2) != 0;
 	switch(cmd) {
 	case SIOCGIFSTATUS:
 		ifs = (struct ifstat *)data;
 		TUN_LOCK(tp);
 		if (tp->tun_pid)
 			snprintf(ifs->ascii, sizeof(ifs->ascii),
 			    "\tOpened by PID %d\n", tp->tun_pid);
 		else
 			ifs->ascii[0] = '\0';
 		TUN_UNLOCK(tp);
 		break;
 	case SIOCSIFADDR:
 		if (l2tun)
 			error = ether_ioctl(ifp, cmd, data);
 		else
 			tuninit(ifp);
 		if (error == 0)
 		    TUNDEBUG(ifp, "address set\n");
 		break;
 	case SIOCSIFMTU:
 		ifp->if_mtu = ifr->ifr_mtu;
 		TUNDEBUG(ifp, "mtu set\n");
 		break;
 	case SIOCSIFFLAGS:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		break;
 	case SIOCGIFMEDIA:
 		if (!l2tun) {
 			error = EINVAL;
 			break;
 		}
 
 		ifmr = (struct ifmediareq *)data;
 		dummy = ifmr->ifm_count;
 		ifmr->ifm_count = 1;
 		ifmr->ifm_status = IFM_AVALID;
 		ifmr->ifm_active = IFM_ETHER;
 		if (tp->tun_flags & TUN_OPEN)
 			ifmr->ifm_status |= IFM_ACTIVE;
 		ifmr->ifm_current = ifmr->ifm_active;
 		if (dummy >= 1) {
 			int media = IFM_ETHER;
 			error = copyout(&media, ifmr->ifm_ulist, sizeof(int));
 		}
 		break;
 	default:
 		if (l2tun) {
 			error = ether_ioctl(ifp, cmd, data);
 		} else {
 			error = EINVAL;
 		}
 	}
 bad:
 	sx_xunlock(&tun_ioctl_sx);
 	return (error);
 }
 
 /*
  * tunoutput - queue packets from higher level ready to put out.
  */
 static int
 tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst,
     struct route *ro)
 {
 	struct tuntap_softc *tp = ifp->if_softc;
 	u_short cached_tun_flags;
 	int error;
 	u_int32_t af;
 
 	TUNDEBUG (ifp, "tunoutput\n");
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m0);
 	if (error) {
 		m_freem(m0);
 		return (error);
 	}
 #endif
 
 	/* Could be unlocked read? */
 	TUN_LOCK(tp);
 	cached_tun_flags = tp->tun_flags;
 	TUN_UNLOCK(tp);
 	if ((cached_tun_flags & TUN_READY) != TUN_READY) {
 		TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags);
 		m_freem (m0);
 		return (EHOSTDOWN);
 	}
 
 	if ((ifp->if_flags & IFF_UP) != IFF_UP) {
 		m_freem (m0);
 		return (EHOSTDOWN);
 	}
 
 	/* BPF writes need to be handled specially. */
 	if (dst->sa_family == AF_UNSPEC)
 		bcopy(dst->sa_data, &af, sizeof(af));
 	else
 		af = dst->sa_family;
 
 	if (bpf_peers_present(ifp->if_bpf))
 		bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0);
 
 	/* prepend sockaddr? this may abort if the mbuf allocation fails */
 	if (cached_tun_flags & TUN_LMODE) {
 		/* allocate space for sockaddr */
 		M_PREPEND(m0, dst->sa_len, M_NOWAIT);
 
 		/* if allocation failed drop packet */
 		if (m0 == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENOBUFS);
 		} else {
 			bcopy(dst, m0->m_data, dst->sa_len);
 		}
 	}
 
 	if (cached_tun_flags & TUN_IFHEAD) {
 		/* Prepend the address family */
 		M_PREPEND(m0, 4, M_NOWAIT);
 
 		/* if allocation failed drop packet */
 		if (m0 == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENOBUFS);
 		} else
 			*(u_int32_t *)m0->m_data = htonl(af);
 	} else {
 #ifdef INET
 		if (af != AF_INET)
 #endif
 		{
 			m_freem(m0);
 			return (EAFNOSUPPORT);
 		}
 	}
 
 	error = (ifp->if_transmit)(ifp, m0);
 	if (error)
 		return (ENOBUFS);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 	return (0);
 }
 
 /*
  * the cdevsw interface is now pretty minimal.
  */
 static	int
 tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
     struct thread *td)
 {
 	struct ifreq ifr, *ifrp;
 	struct tuntap_softc *tp = dev->si_drv1;
 	struct tuninfo *tunp;
 	int error, iflags;
 #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD4)
 	int	ival;
 #endif
 	bool	l2tun;
 
 	l2tun = (tp->tun_flags & TUN_L2) != 0;
 	if (l2tun) {
 		/* tap specific ioctls */
 		switch(cmd) {
 		case TAPGIFNAME:
 			ifrp = (struct ifreq *)data;
 			strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname,
 			    IFNAMSIZ);
 
 			return (0);
 		/* VMware/VMnet port ioctl's */
 #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD4)
 		case _IO('V', 0):
 			ival = IOCPARM_IVAL(data);
 			data = (caddr_t)&ival;
 			/* FALLTHROUGH */
 #endif
 		case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */
 			iflags = *(int *)data;
 			iflags &= TUN_VMIO_FLAG_MASK;
 			iflags &= ~IFF_CANTCHANGE;
 			iflags |= IFF_UP;
 
 			TUN_LOCK(tp);
 			TUN2IFP(tp)->if_flags = iflags |
 			    (TUN2IFP(tp)->if_flags & IFF_CANTCHANGE);
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case SIOCGIFADDR:	/* get MAC address of the remote side */
 			TUN_LOCK(tp);
 			bcopy(&tp->tun_ether.octet, data,
 			    sizeof(tp->tun_ether.octet));
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case SIOCSIFADDR:	/* set MAC address of the remote side */
 			TUN_LOCK(tp);
 			bcopy(data, &tp->tun_ether.octet,
 			    sizeof(tp->tun_ether.octet));
 			TUN_UNLOCK(tp);
 
 			return (0);
 		}
 
 		/* Fall through to the common ioctls if unhandled */
 	} else {
 		switch (cmd) {
 		case TUNSLMODE:
 			TUN_LOCK(tp);
 			if (*(int *)data) {
 				tp->tun_flags |= TUN_LMODE;
 				tp->tun_flags &= ~TUN_IFHEAD;
 			} else
 				tp->tun_flags &= ~TUN_LMODE;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case TUNSIFHEAD:
 			TUN_LOCK(tp);
 			if (*(int *)data) {
 				tp->tun_flags |= TUN_IFHEAD;
 				tp->tun_flags &= ~TUN_LMODE;
 			} else
 				tp->tun_flags &= ~TUN_IFHEAD;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case TUNGIFHEAD:
 			TUN_LOCK(tp);
 			*(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case TUNSIFMODE:
 			/* deny this if UP */
 			if (TUN2IFP(tp)->if_flags & IFF_UP)
 				return (EBUSY);
 
 			switch (*(int *)data & ~IFF_MULTICAST) {
 			case IFF_POINTOPOINT:
 			case IFF_BROADCAST:
 				TUN_LOCK(tp);
 				TUN2IFP(tp)->if_flags &=
 				    ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST);
 				TUN2IFP(tp)->if_flags |= *(int *)data;
 				TUN_UNLOCK(tp);
 
 				break;
 			default:
 				return (EINVAL);
 			}
 
 			return (0);
 		case TUNSIFPID:
 			TUN_LOCK(tp);
 			tp->tun_pid = curthread->td_proc->p_pid;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		}
 		/* Fall through to the common ioctls if unhandled */
 	}
 
 	switch (cmd) {
 	case TUNSIFINFO:
 		tunp = (struct tuninfo *)data;
 		if (TUN2IFP(tp)->if_type != tunp->type)
 			return (EPROTOTYPE);
 		TUN_LOCK(tp);
 		if (TUN2IFP(tp)->if_mtu != tunp->mtu) {
 			strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ);
 			ifr.ifr_mtu = tunp->mtu;
 			CURVNET_SET(TUN2IFP(tp)->if_vnet);
 			error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp),
 			    (caddr_t)&ifr, td);
 			CURVNET_RESTORE();
 			if (error) {
 				TUN_UNLOCK(tp);
 				return (error);
 			}
 		}
 		TUN2IFP(tp)->if_baudrate = tunp->baudrate;
 		TUN_UNLOCK(tp);
 		break;
 	case TUNGIFINFO:
 		tunp = (struct tuninfo *)data;
 		TUN_LOCK(tp);
 		tunp->mtu = TUN2IFP(tp)->if_mtu;
 		tunp->type = TUN2IFP(tp)->if_type;
 		tunp->baudrate = TUN2IFP(tp)->if_baudrate;
 		TUN_UNLOCK(tp);
 		break;
 	case TUNSDEBUG:
 		tundebug = *(int *)data;
 		break;
 	case TUNGDEBUG:
 		*(int *)data = tundebug;
 		break;
 	case FIONBIO:
 		break;
 	case FIOASYNC:
 		TUN_LOCK(tp);
 		if (*(int *)data)
 			tp->tun_flags |= TUN_ASYNC;
 		else
 			tp->tun_flags &= ~TUN_ASYNC;
 		TUN_UNLOCK(tp);
 		break;
 	case FIONREAD:
 		if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) {
 			struct mbuf *mb;
 			IFQ_LOCK(&TUN2IFP(tp)->if_snd);
 			IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb);
 			for (*(int *)data = 0; mb != NULL; mb = mb->m_next)
 				*(int *)data += mb->m_len;
 			IFQ_UNLOCK(&TUN2IFP(tp)->if_snd);
 		} else
 			*(int *)data = 0;
 		break;
 	case FIOSETOWN:
 		return (fsetown(*(int *)data, &tp->tun_sigio));
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(&tp->tun_sigio);
 		return (0);
 
 	/* This is deprecated, FIOSETOWN should be used instead. */
 	case TIOCSPGRP:
 		return (fsetown(-(*(int *)data), &tp->tun_sigio));
 
 	/* This is deprecated, FIOGETOWN should be used instead. */
 	case TIOCGPGRP:
 		*(int *)data = -fgetown(&tp->tun_sigio);
 		return (0);
 
 	default:
 		return (ENOTTY);
 	}
 	return (0);
 }
 
 /*
  * The cdevsw read interface - reads a packet at a time, or at
  * least as much of a packet as can be read.
  */
 static	int
 tunread(struct cdev *dev, struct uio *uio, int flag)
 {
 	struct tuntap_softc *tp = dev->si_drv1;
 	struct ifnet	*ifp = TUN2IFP(tp);
 	struct mbuf	*m;
 	int		error=0, len;
 
 	TUNDEBUG (ifp, "read\n");
 	TUN_LOCK(tp);
 	if ((tp->tun_flags & TUN_READY) != TUN_READY) {
 		TUN_UNLOCK(tp);
 		TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags);
 		return (EHOSTDOWN);
 	}
 
 	tp->tun_flags &= ~TUN_RWAIT;
 
 	do {
 		IFQ_DEQUEUE(&ifp->if_snd, m);
 		if (m == NULL) {
 			if (flag & O_NONBLOCK) {
 				TUN_UNLOCK(tp);
 				return (EWOULDBLOCK);
 			}
 			tp->tun_flags |= TUN_RWAIT;
 			error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1),
 			    "tunread", 0);
 			if (error != 0) {
 				TUN_UNLOCK(tp);
 				return (error);
 			}
 		}
 	} while (m == NULL);
 	TUN_UNLOCK(tp);
 
 	if ((tp->tun_flags & TUN_L2) != 0)
 		BPF_MTAP(ifp, m);
 
 	while (m && uio->uio_resid > 0 && error == 0) {
 		len = min(uio->uio_resid, m->m_len);
 		if (len != 0)
 			error = uiomove(mtod(m, void *), len, uio);
 		m = m_free(m);
 	}
 
 	if (m) {
 		TUNDEBUG(ifp, "Dropping mbuf\n");
 		m_freem(m);
 	}
 	return (error);
 }
 
 static int
 tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	struct ifnet *ifp;
 
 	ifp = TUN2IFP(tp);
 
 	/*
 	 * Only pass a unicast frame to ether_input(), if it would
 	 * actually have been received by non-virtual hardware.
 	 */
 	if (m->m_len < sizeof(struct ether_header)) {
 		m_freem(m);
 		return (0);
 	}
 
 	eh = mtod(m, struct ether_header *);
 
 	if (eh && (ifp->if_flags & IFF_PROMISC) == 0 &&
 	    !ETHER_IS_MULTICAST(eh->ether_dhost) &&
 	    bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) {
 		m_freem(m);
 		return (0);
 	}
 
 	/* Pass packet up to parent. */
 	CURVNET_SET(ifp->if_vnet);
 	(*ifp->if_input)(ifp, m);
 	CURVNET_RESTORE();
 	/* ibytes are counted in parent */
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	return (0);
 }
 
 static int
 tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m)
 {
 	struct ifnet *ifp;
 	int family, isr;
 
 	ifp = TUN2IFP(tp);
 	/* Could be unlocked read? */
 	TUN_LOCK(tp);
 	if (tp->tun_flags & TUN_IFHEAD) {
 		TUN_UNLOCK(tp);
 		if (m->m_len < sizeof(family) &&
 		(m = m_pullup(m, sizeof(family))) == NULL)
 			return (ENOBUFS);
 		family = ntohl(*mtod(m, u_int32_t *));
 		m_adj(m, sizeof(family));
 	} else {
 		TUN_UNLOCK(tp);
 		family = AF_INET;
 	}
 
 	BPF_MTAP2(ifp, &family, sizeof(family), m);
 
 	switch (family) {
 #ifdef INET
 	case AF_INET:
 		isr = NETISR_IP;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 	random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	CURVNET_SET(ifp->if_vnet);
 	M_SETFIB(m, ifp->if_fib);
 	netisr_dispatch(isr, m);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * the cdevsw write interface - an atomic write is a packet - or else!
  */
 static	int
 tunwrite(struct cdev *dev, struct uio *uio, int flag)
 {
 	struct tuntap_softc *tp;
 	struct ifnet	*ifp;
 	struct mbuf	*m;
 	uint32_t	mru;
 	int		align;
 	bool		l2tun;
 
 	tp = dev->si_drv1;
 	ifp = TUN2IFP(tp);
 	TUNDEBUG(ifp, "tunwrite\n");
 	if ((ifp->if_flags & IFF_UP) != IFF_UP)
 		/* ignore silently */
 		return (0);
 
 	if (uio->uio_resid == 0)
 		return (0);
 
 	l2tun = (tp->tun_flags & TUN_L2) != 0;
 	align = 0;
 	mru = l2tun ? TAPMRU : TUNMRU;
 	if (l2tun)
 		align = ETHER_ALIGN;
 	else if ((tp->tun_flags & TUN_IFHEAD) != 0)
 		mru += sizeof(uint32_t);	/* family */
 	if (uio->uio_resid < 0 || uio->uio_resid > mru) {
 		TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid);
 		return (EIO);
 	}
 
 	if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		return (ENOBUFS);
 	}
 
 	m->m_pkthdr.rcvif = ifp;
 #ifdef MAC
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	if (l2tun)
 		return (tunwrite_l2(tp, m));
 
 	return (tunwrite_l3(tp, m));
 }
 
 /*
  * tunpoll - the poll interface, this is only useful on reads
  * really. The write detect always returns true, write never blocks
  * anyway, it either accepts the packet or drops it.
  */
 static	int
 tunpoll(struct cdev *dev, int events, struct thread *td)
 {
 	struct tuntap_softc *tp = dev->si_drv1;
 	struct ifnet	*ifp = TUN2IFP(tp);
 	int		revents = 0;
 
 	TUNDEBUG(ifp, "tunpoll\n");
 
 	if (events & (POLLIN | POLLRDNORM)) {
 		IFQ_LOCK(&ifp->if_snd);
 		if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
 			TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len);
 			revents |= events & (POLLIN | POLLRDNORM);
 		} else {
 			TUNDEBUG(ifp, "tunpoll waiting\n");
 			selrecord(td, &tp->tun_rsel);
 		}
 		IFQ_UNLOCK(&ifp->if_snd);
 	}
 	if (events & (POLLOUT | POLLWRNORM))
 		revents |= events & (POLLOUT | POLLWRNORM);
 
 	return (revents);
 }
 
 /*
  * tunkqfilter - support for the kevent() system call.
  */
 static int
 tunkqfilter(struct cdev *dev, struct knote *kn)
 {
 	struct tuntap_softc	*tp = dev->si_drv1;
 	struct ifnet	*ifp = TUN2IFP(tp);
 
 	switch(kn->kn_filter) {
 	case EVFILT_READ:
 		TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n",
 		    ifp->if_xname, dev2unit(dev));
 		kn->kn_fop = &tun_read_filterops;
 		break;
 
 	case EVFILT_WRITE:
 		TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n",
 		    ifp->if_xname, dev2unit(dev));
 		kn->kn_fop = &tun_write_filterops;
 		break;
 
 	default:
 		TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n",
 		    ifp->if_xname, dev2unit(dev));
 		return(EINVAL);
 	}
 
 	kn->kn_hook = tp;
 	knlist_add(&tp->tun_rsel.si_note, kn, 0);
 
 	return (0);
 }
 
 /*
  * Return true of there is data in the interface queue.
  */
 static int
 tunkqread(struct knote *kn, long hint)
 {
 	int			ret;
 	struct tuntap_softc	*tp = kn->kn_hook;
 	struct cdev		*dev = tp->tun_dev;
 	struct ifnet	*ifp = TUN2IFP(tp);
 
 	if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) {
 		TUNDEBUG(ifp,
 		    "%s have data in the queue.  Len = %d, minor = %#x\n",
 		    ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev));
 		ret = 1;
 	} else {
 		TUNDEBUG(ifp,
 		    "%s waiting for data, minor = %#x\n", ifp->if_xname,
 		    dev2unit(dev));
 		ret = 0;
 	}
 
 	return (ret);
 }
 
 /*
  * Always can write, always return MTU in kn->data.
  */
 static int
 tunkqwrite(struct knote *kn, long hint)
 {
 	struct tuntap_softc	*tp = kn->kn_hook;
 	struct ifnet	*ifp = TUN2IFP(tp);
 
 	kn->kn_data = ifp->if_mtu;
 
 	return (1);
 }
 
 static void
 tunkqdetach(struct knote *kn)
 {
 	struct tuntap_softc	*tp = kn->kn_hook;
 
 	knlist_remove(&tp->tun_rsel.si_note, kn, 0);
 }
Index: projects/runtime-coverage-v2/sys/netinet/in_mcast.c
===================================================================
--- projects/runtime-coverage-v2/sys/netinet/in_mcast.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/netinet/in_mcast.c	(revision 347599)
@@ -1,3162 +1,3142 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Bruce Simpson.
  * Copyright (c) 2005 Robert N. M. Watson.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * IPv4 multicast socket, group, and socket option processing module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/sysctl.h>
 #include <sys/ktr.h>
 #include <sys/taskqueue.h>
 #include <sys/gtaskqueue.h>
 #include <sys/tree.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <net/ethernet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/igmp_var.h>
 
 #ifndef KTR_IGMPV3
 #define KTR_IGMPV3 KTR_INET
 #endif
 
 #ifndef __SOCKUNION_DECLARED
 union sockunion {
 	struct sockaddr_storage	ss;
 	struct sockaddr		sa;
 	struct sockaddr_dl	sdl;
 	struct sockaddr_in	sin;
 };
 typedef union sockunion sockunion_t;
 #define __SOCKUNION_DECLARED
 #endif /* __SOCKUNION_DECLARED */
 
 static MALLOC_DEFINE(M_INMFILTER, "in_mfilter",
     "IPv4 multicast PCB-layer source filter");
 static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group");
 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options");
 static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource",
     "IPv4 multicast IGMP-layer source filter");
 
 /*
  * Locking:
  * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
  * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however
  *   it can be taken by code in net/if.c also.
  * - ip_moptions and in_mfilter are covered by the INP_WLOCK.
  *
  * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly
  * any need for in_multi itself to be virtualized -- it is bound to an ifp
  * anyway no matter what happens.
  */
 struct mtx in_multi_list_mtx;
 MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF);
 
 struct mtx in_multi_free_mtx;
 MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF);
 
 struct sx in_multi_sx;
 SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx");
 
 int ifma_restart;
 
 /*
  * Functions with non-static linkage defined in this file should be
  * declared in in_var.h:
  *  imo_multi_filter()
  *  in_addmulti()
  *  in_delmulti()
  *  in_joingroup()
  *  in_joingroup_locked()
  *  in_leavegroup()
  *  in_leavegroup_locked()
  * and ip_var.h:
  *  inp_freemoptions()
  *  inp_getmoptions()
  *  inp_setmoptions()
  *
  * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti()
  * and in_delmulti().
  */
 static void	imf_commit(struct in_mfilter *);
 static int	imf_get_source(struct in_mfilter *imf,
 		    const struct sockaddr_in *psin,
 		    struct in_msource **);
 static struct in_msource *
 		imf_graft(struct in_mfilter *, const uint8_t,
 		    const struct sockaddr_in *);
 static void	imf_leave(struct in_mfilter *);
 static int	imf_prune(struct in_mfilter *, const struct sockaddr_in *);
 static void	imf_purge(struct in_mfilter *);
 static void	imf_rollback(struct in_mfilter *);
 static void	imf_reap(struct in_mfilter *);
 static int	imo_grow(struct ip_moptions *);
 static size_t	imo_match_group(const struct ip_moptions *,
 		    const struct ifnet *, const struct sockaddr *);
 static struct in_msource *
 		imo_match_source(const struct ip_moptions *, const size_t,
 		    const struct sockaddr *);
 static void	ims_merge(struct ip_msource *ims,
 		    const struct in_msource *lims, const int rollback);
 static int	in_getmulti(struct ifnet *, const struct in_addr *,
 		    struct in_multi **);
 static int	inm_get_source(struct in_multi *inm, const in_addr_t haddr,
 		    const int noalloc, struct ip_msource **pims);
 #ifdef KTR
 static int	inm_is_ifp_detached(const struct in_multi *);
 #endif
 static int	inm_merge(struct in_multi *, /*const*/ struct in_mfilter *);
 static void	inm_purge(struct in_multi *);
 static void	inm_reap(struct in_multi *);
 static void inm_release(struct in_multi *);
 static struct ip_moptions *
 		inp_findmoptions(struct inpcb *);
 static int	inp_get_source_filters(struct inpcb *, struct sockopt *);
 static int	inp_join_group(struct inpcb *, struct sockopt *);
 static int	inp_leave_group(struct inpcb *, struct sockopt *);
 static struct ifnet *
 		inp_lookup_mcast_ifp(const struct inpcb *,
 		    const struct sockaddr_in *, const struct in_addr);
 static int	inp_block_unblock_source(struct inpcb *, struct sockopt *);
 static int	inp_set_multicast_if(struct inpcb *, struct sockopt *);
 static int	inp_set_source_filters(struct inpcb *, struct sockopt *);
 static int	sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS);
 
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0,
     "IPv4 multicast");
 
 static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER;
 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc,
     CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0,
     "Max source filters per group");
 
 static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER;
 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc,
     CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0,
     "Max source filters per socket");
 
 int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP;
 SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
     &in_mcast_loop, 0, "Loopback multicast datagrams by default");
 
 static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters,
     "Per-interface stack-wide source filters");
 
 #ifdef KTR
 /*
  * Inline function which wraps assertions for a valid ifp.
  * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
  * is detached.
  */
 static int __inline
 inm_is_ifp_detached(const struct in_multi *inm)
 {
 	struct ifnet *ifp;
 
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->inm_ifma->ifma_ifp;
 	if (ifp != NULL) {
 		/*
 		 * Sanity check that netinet's notion of ifp is the
 		 * same as net's.
 		 */
 		KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
 	}
 
 	return (ifp == NULL);
 }
 #endif
 
 static struct grouptask free_gtask;
 static struct in_multi_head inm_free_list;
 static void inm_release_task(void *arg __unused);
 static void inm_init(void)
 {
 	SLIST_INIT(&inm_free_list);
 	taskqgroup_config_gtask_init(NULL, &free_gtask, inm_release_task, "inm release task");
 }
 
 #ifdef EARLY_AP_STARTUP
 SYSINIT(inm_init, SI_SUB_SMP + 1, SI_ORDER_FIRST,
 	inm_init, NULL);
 #else
 SYSINIT(inm_init, SI_SUB_ROOT_CONF - 1, SI_ORDER_FIRST,
 	inm_init, NULL);
 #endif
 
 
 void
 inm_release_list_deferred(struct in_multi_head *inmh)
 {
 
 	if (SLIST_EMPTY(inmh))
 		return;
 	mtx_lock(&in_multi_free_mtx);
 	SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele);
 	mtx_unlock(&in_multi_free_mtx);
 	GROUPTASK_ENQUEUE(&free_gtask);
 }
 
 void
 inm_disconnect(struct in_multi *inm)
 {
 	struct ifnet *ifp;
 	struct ifmultiaddr *ifma, *ll_ifma;
 
 	ifp = inm->inm_ifp;
 	IF_ADDR_WLOCK_ASSERT(ifp);
 	ifma = inm->inm_ifma;
 
 	if_ref(ifp);
 	if (ifma->ifma_flags & IFMA_F_ENQUEUED) {
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 	}
 	MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname);
 	if ((ll_ifma = ifma->ifma_llifma) != NULL) {
 		MPASS(ifma != ll_ifma);
 		ifma->ifma_llifma = NULL;
 		MPASS(ll_ifma->ifma_llifma == NULL);
 		MPASS(ll_ifma->ifma_ifp == ifp);
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
 				CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link);
 				ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 			}
 			MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname);
 			if_freemulti(ll_ifma);
 			ifma_restart = true;
 		}
 	}
 }
 
 void
 inm_release_deferred(struct in_multi *inm)
 {
 	struct in_multi_head tmp;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	MPASS(inm->inm_refcount > 0);
 	if (--inm->inm_refcount == 0) {
 		SLIST_INIT(&tmp);
 		inm_disconnect(inm);
 		inm->inm_ifma->ifma_protospec = NULL;
 		SLIST_INSERT_HEAD(&tmp, inm, inm_nrele);
 		inm_release_list_deferred(&tmp);
 	}
 }
 
 static void
 inm_release_task(void *arg __unused)
 {
 	struct in_multi_head inm_free_tmp;
 	struct in_multi *inm, *tinm;
 
 	SLIST_INIT(&inm_free_tmp);
 	mtx_lock(&in_multi_free_mtx);
 	SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele);
 	mtx_unlock(&in_multi_free_mtx);
 	IN_MULTI_LOCK();
 	SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele);
 		MPASS(inm);
 		inm_release(inm);
 	}
 	IN_MULTI_UNLOCK();
 }
 
 /*
  * Initialize an in_mfilter structure to a known state at t0, t1
  * with an empty source filter list.
  */
 static __inline void
 imf_init(struct in_mfilter *imf, const int st0, const int st1)
 {
 	memset(imf, 0, sizeof(struct in_mfilter));
 	RB_INIT(&imf->imf_sources);
 	imf->imf_st[0] = st0;
 	imf->imf_st[1] = st1;
 }
 
 /*
  * Function for looking up an in_multi record for an IPv4 multicast address
  * on a given interface. ifp must be valid. If no record found, return NULL.
  * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held.
  */
 struct in_multi *
 inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina)
 {
 	struct ifmultiaddr *ifma;
 	struct in_multi *inm;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IF_ADDR_LOCK_ASSERT(ifp);
 
 	inm = NULL;
 	CK_STAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 			ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		if (inm->inm_addr.s_addr == ina.s_addr)
 			break;
 		inm = NULL;
 	}
 	return (inm);
 }
 
 /*
  * Wrapper for inm_lookup_locked().
  * The IF_ADDR_LOCK will be taken on ifp and released on return.
  */
 struct in_multi *
 inm_lookup(struct ifnet *ifp, const struct in_addr ina)
 {
 	struct epoch_tracker et;
 	struct in_multi *inm;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	NET_EPOCH_ENTER(et);
 	inm = inm_lookup_locked(ifp, ina);
 	NET_EPOCH_EXIT(et);
 
 	return (inm);
 }
 
 /*
  * Resize the ip_moptions vector to the next power-of-two minus 1.
  * May be called with locks held; do not sleep.
  */
 static int
 imo_grow(struct ip_moptions *imo)
 {
 	struct in_multi		**nmships;
 	struct in_multi		**omships;
 	struct in_mfilter	 *nmfilters;
 	struct in_mfilter	 *omfilters;
 	size_t			  idx;
 	size_t			  newmax;
 	size_t			  oldmax;
 
 	nmships = NULL;
 	nmfilters = NULL;
 	omships = imo->imo_membership;
 	omfilters = imo->imo_mfilters;
 	oldmax = imo->imo_max_memberships;
 	newmax = ((oldmax + 1) * 2) - 1;
 
 	if (newmax <= IP_MAX_MEMBERSHIPS) {
 		nmships = (struct in_multi **)realloc(omships,
 		    sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT);
 		nmfilters = (struct in_mfilter *)realloc(omfilters,
 		    sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT);
 		if (nmships != NULL && nmfilters != NULL) {
 			/* Initialize newly allocated source filter heads. */
 			for (idx = oldmax; idx < newmax; idx++) {
 				imf_init(&nmfilters[idx], MCAST_UNDEFINED,
 				    MCAST_EXCLUDE);
 			}
 			imo->imo_max_memberships = newmax;
 			imo->imo_membership = nmships;
 			imo->imo_mfilters = nmfilters;
 		}
 	}
 
 	if (nmships == NULL || nmfilters == NULL) {
 		if (nmships != NULL)
 			free(nmships, M_IPMOPTS);
 		if (nmfilters != NULL)
 			free(nmfilters, M_INMFILTER);
 		return (ETOOMANYREFS);
 	}
 
 	return (0);
 }
 
 /*
  * Find an IPv4 multicast group entry for this ip_moptions instance
  * which matches the specified group, and optionally an interface.
  * Return its index into the array, or -1 if not found.
  */
 static size_t
 imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group)
 {
 	const struct sockaddr_in *gsin;
 	struct in_multi	**pinm;
 	int		  idx;
 	int		  nmships;
 
 	gsin = (const struct sockaddr_in *)group;
 
 	/* The imo_membership array may be lazy allocated. */
 	if (imo->imo_membership == NULL || imo->imo_num_memberships == 0)
 		return (-1);
 
 	nmships = imo->imo_num_memberships;
 	pinm = &imo->imo_membership[0];
 	for (idx = 0; idx < nmships; idx++, pinm++) {
 		if (*pinm == NULL)
 			continue;
 		if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) &&
 		    in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) {
 			break;
 		}
 	}
 	if (idx >= nmships)
 		idx = -1;
 
 	return (idx);
 }
 
 /*
  * Find an IPv4 multicast source entry for this imo which matches
  * the given group index for this socket, and source address.
  *
  * NOTE: This does not check if the entry is in-mode, merely if
  * it exists, which may not be the desired behaviour.
  */
 static struct in_msource *
 imo_match_source(const struct ip_moptions *imo, const size_t gidx,
     const struct sockaddr *src)
 {
 	struct ip_msource	 find;
 	struct in_mfilter	*imf;
 	struct ip_msource	*ims;
 	const sockunion_t	*psa;
 
 	KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__));
 	KASSERT(gidx != -1 && gidx < imo->imo_num_memberships,
 	    ("%s: invalid index %d\n", __func__, (int)gidx));
 
 	/* The imo_mfilters array may be lazy allocated. */
 	if (imo->imo_mfilters == NULL)
 		return (NULL);
 	imf = &imo->imo_mfilters[gidx];
 
 	/* Source trees are keyed in host byte order. */
 	psa = (const sockunion_t *)src;
 	find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 
 	return ((struct in_msource *)ims);
 }
 
 /*
  * Perform filtering for multicast datagrams on a socket by group and source.
  *
  * Returns 0 if a datagram should be allowed through, or various error codes
  * if the socket was not a member of the group, or the source was muted, etc.
  */
 int
 imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group, const struct sockaddr *src)
 {
 	size_t gidx;
 	struct in_msource *ims;
 	int mode;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 
 	gidx = imo_match_group(imo, ifp, group);
 	if (gidx == -1)
 		return (MCAST_NOTGMEMBER);
 
 	/*
 	 * Check if the source was included in an (S,G) join.
 	 * Allow reception on exclusive memberships by default,
 	 * reject reception on inclusive memberships by default.
 	 * Exclude source only if an in-mode exclude filter exists.
 	 * Include source only if an in-mode include filter exists.
 	 * NOTE: We are comparing group state here at IGMP t1 (now)
 	 * with socket-layer t0 (since last downcall).
 	 */
 	mode = imo->imo_mfilters[gidx].imf_st[1];
 	ims = imo_match_source(imo, gidx, src);
 
 	if ((ims == NULL && mode == MCAST_INCLUDE) ||
 	    (ims != NULL && ims->imsl_st[0] != mode))
 		return (MCAST_NOTSMEMBER);
 
 	return (MCAST_PASS);
 }
 
 /*
  * Find and return a reference to an in_multi record for (ifp, group),
  * and bump its reference count.
  * If one does not exist, try to allocate it, and update link-layer multicast
  * filters on ifp to listen for group.
  * Assumes the IN_MULTI lock is held across the call.
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 in_getmulti(struct ifnet *ifp, const struct in_addr *group,
     struct in_multi **pinm)
 {
 	struct sockaddr_in	 gsin;
 	struct ifmultiaddr	*ifma;
 	struct in_ifinfo	*ii;
 	struct in_multi		*inm;
 	int error;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET];
 	IN_MULTI_LIST_LOCK();
 	inm = inm_lookup(ifp, *group);
 	if (inm != NULL) {
 		/*
 		 * If we already joined this group, just bump the
 		 * refcount and return it.
 		 */
 		KASSERT(inm->inm_refcount >= 1,
 		    ("%s: bad refcount %d", __func__, inm->inm_refcount));
 		inm_acquire_locked(inm);
 		*pinm = inm;
 	}
 	IN_MULTI_LIST_UNLOCK();
 	if (inm != NULL)
 		return (0);
 	
 	memset(&gsin, 0, sizeof(gsin));
 	gsin.sin_family = AF_INET;
 	gsin.sin_len = sizeof(struct sockaddr_in);
 	gsin.sin_addr = *group;
 
 	/*
 	 * Check if a link-layer group is already associated
 	 * with this network-layer group on the given ifnet.
 	 */
 	error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma);
 	if (error != 0)
 		return (error);
 
 	/* XXX ifma_protospec must be covered by IF_ADDR_LOCK */
 	IN_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 
 	/*
 	 * If something other than netinet is occupying the link-layer
 	 * group, print a meaningful error message and back out of
 	 * the allocation.
 	 * Otherwise, bump the refcount on the existing network-layer
 	 * group association and return it.
 	 */
 	if (ifma->ifma_protospec != NULL) {
 		inm = (struct in_multi *)ifma->ifma_protospec;
 #ifdef INVARIANTS
 		KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
 		    __func__));
 		KASSERT(ifma->ifma_addr->sa_family == AF_INET,
 		    ("%s: ifma not AF_INET", __func__));
 		KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
 		if (inm->inm_ifma != ifma || inm->inm_ifp != ifp ||
 		    !in_hosteq(inm->inm_addr, *group)) {
 			char addrbuf[INET_ADDRSTRLEN];
 
 			panic("%s: ifma %p is inconsistent with %p (%s)",
 			    __func__, ifma, inm, inet_ntoa_r(*group, addrbuf));
 		}
 #endif
 		inm_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	IF_ADDR_WLOCK_ASSERT(ifp);
 
 	/*
 	 * A new in_multi record is needed; allocate and initialize it.
 	 * We DO NOT perform an IGMP join as the in_ layer may need to
 	 * push an initial source list down to IGMP to support SSM.
 	 *
 	 * The initial source filter state is INCLUDE, {} as per the RFC.
 	 */
 	inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO);
 	if (inm == NULL) {
 		IF_ADDR_WUNLOCK(ifp);
 		IN_MULTI_LIST_UNLOCK();
 		if_delmulti_ifma(ifma);
 		return (ENOMEM);
 	}
 	inm->inm_addr = *group;
 	inm->inm_ifp = ifp;
 	inm->inm_igi = ii->ii_igmp;
 	inm->inm_ifma = ifma;
 	inm->inm_refcount = 1;
 	inm->inm_state = IGMP_NOT_MEMBER;
 	mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES);
 	inm->inm_st[0].iss_fmode = MCAST_UNDEFINED;
 	inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 	RB_INIT(&inm->inm_srcs);
 
 	ifma->ifma_protospec = inm;
 
 	*pinm = inm;
  out_locked:
 	IF_ADDR_WUNLOCK(ifp);
 	IN_MULTI_LIST_UNLOCK();
 	return (0);
 }
 
 /*
  * Drop a reference to an in_multi record.
  *
  * If the refcount drops to 0, free the in_multi record and
  * delete the underlying link-layer membership.
  */
 static void
 inm_release(struct in_multi *inm)
 {
 	struct ifmultiaddr *ifma;
 	struct ifnet *ifp;
 
 	CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount);
 	MPASS(inm->inm_refcount == 0);
 	CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm);
 
 	ifma = inm->inm_ifma;
 	ifp = inm->inm_ifp;
 
 	/* XXX this access is not covered by IF_ADDR_LOCK */
 	CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma);
 	if (ifp != NULL) {
 		CURVNET_SET(ifp->if_vnet);
 		inm_purge(inm);
 		free(inm, M_IPMADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 		CURVNET_RESTORE();
 		if_rele(ifp);
 	} else {
 		inm_purge(inm);
 		free(inm, M_IPMADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 	}
 }
 
 /*
  * Clear recorded source entries for a group.
  * Used by the IGMP code. Caller must hold the IN_MULTI lock.
  * FIXME: Should reap.
  */
 void
 inm_clear_recorded(struct in_multi *inm)
 {
 	struct ip_msource	*ims;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 		if (ims->ims_stp) {
 			ims->ims_stp = 0;
 			--inm->inm_st[1].iss_rec;
 		}
 	}
 	KASSERT(inm->inm_st[1].iss_rec == 0,
 	    ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec));
 }
 
 /*
  * Record a source as pending for a Source-Group IGMPv3 query.
  * This lives here as it modifies the shared tree.
  *
  * inm is the group descriptor.
  * naddr is the address of the source to record in network-byte order.
  *
  * If the net.inet.igmp.sgalloc sysctl is non-zero, we will
  * lazy-allocate a source node in response to an SG query.
  * Otherwise, no allocation is performed. This saves some memory
  * with the trade-off that the source will not be reported to the
  * router if joined in the window between the query response and
  * the group actually being joined on the local host.
  *
  * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed.
  * This turns off the allocation of a recorded source entry if
  * the group has not been joined.
  *
  * Return 0 if the source didn't exist or was already marked as recorded.
  * Return 1 if the source was marked as recorded by this function.
  * Return <0 if any error occurred (negated errno code).
  */
 int
 inm_record_source(struct in_multi *inm, const in_addr_t naddr)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	find.ims_haddr = ntohl(naddr);
 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
 	if (ims && ims->ims_stp)
 		return (0);
 	if (ims == NULL) {
 		if (inm->inm_nsrc == in_mcast_maxgrpsrc)
 			return (-ENOSPC);
 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (-ENOMEM);
 		nims->ims_haddr = find.ims_haddr;
 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
 		++inm->inm_nsrc;
 		ims = nims;
 	}
 
 	/*
 	 * Mark the source as recorded and update the recorded
 	 * source count.
 	 */
 	++ims->ims_stp;
 	++inm->inm_st[1].iss_rec;
 
 	return (1);
 }
 
 /*
  * Return a pointer to an in_msource owned by an in_mfilter,
  * given its source address.
  * Lazy-allocate if needed. If this is a new entry its filter state is
  * undefined at t0.
  *
  * imf is the filter set being modified.
  * haddr is the source address in *host* byte-order.
  *
  * SMPng: May be called with locks held; malloc must not block.
  */
 static int
 imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin,
     struct in_msource **plims)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 	struct in_msource	*lims;
 	int			 error;
 
 	error = 0;
 	ims = NULL;
 	lims = NULL;
 
 	/* key is host byte order */
 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 	lims = (struct in_msource *)ims;
 	if (lims == NULL) {
 		if (imf->imf_nsrc == in_mcast_maxsocksrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct in_msource), M_INMFILTER,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		lims = (struct in_msource *)nims;
 		lims->ims_haddr = find.ims_haddr;
 		lims->imsl_st[0] = MCAST_UNDEFINED;
 		RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
 		++imf->imf_nsrc;
 	}
 
 	*plims = lims;
 
 	return (error);
 }
 
 /*
  * Graft a source entry into an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being in the new filter mode at t1.
  *
  * Return the pointer to the new node, otherwise return NULL.
  */
 static struct in_msource *
 imf_graft(struct in_mfilter *imf, const uint8_t st1,
     const struct sockaddr_in *psin)
 {
 	struct ip_msource	*nims;
 	struct in_msource	*lims;
 
 	nims = malloc(sizeof(struct in_msource), M_INMFILTER,
 	    M_NOWAIT | M_ZERO);
 	if (nims == NULL)
 		return (NULL);
 	lims = (struct in_msource *)nims;
 	lims->ims_haddr = ntohl(psin->sin_addr.s_addr);
 	lims->imsl_st[0] = MCAST_UNDEFINED;
 	lims->imsl_st[1] = st1;
 	RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
 	++imf->imf_nsrc;
 
 	return (lims);
 }
 
 /*
  * Prune a source entry from an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being left at t1, it is not freed.
  *
  * Return 0 if no error occurred, otherwise return an errno value.
  */
 static int
 imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	/* key is host byte order */
 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 	if (ims == NULL)
 		return (ENOENT);
 	lims = (struct in_msource *)ims;
 	lims->imsl_st[1] = MCAST_UNDEFINED;
 	return (0);
 }
 
 /*
  * Revert socket-layer filter set deltas at t1 to t0 state.
  */
 static void
 imf_rollback(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *tims;
 	struct in_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == lims->imsl_st[1]) {
 			/* no change at t1 */
 			continue;
 		} else if (lims->imsl_st[0] != MCAST_UNDEFINED) {
 			/* revert change to existing source at t1 */
 			lims->imsl_st[1] = lims->imsl_st[0];
 		} else {
 			/* revert source added t1 */
 			CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 			RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 			free(ims, M_INMFILTER);
 			imf->imf_nsrc--;
 		}
 	}
 	imf->imf_st[1] = imf->imf_st[0];
 }
 
 /*
  * Mark socket-layer filter set as INCLUDE {} at t1.
  */
 static void
 imf_leave(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		lims->imsl_st[1] = MCAST_UNDEFINED;
 	}
 	imf->imf_st[1] = MCAST_INCLUDE;
 }
 
 /*
  * Mark socket-layer filter set deltas as committed.
  */
 static void
 imf_commit(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		lims->imsl_st[0] = lims->imsl_st[1];
 	}
 	imf->imf_st[0] = imf->imf_st[1];
 }
 
 /*
  * Reap unreferenced sources from socket-layer filter set.
  */
 static void
 imf_reap(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *tims;
 	struct in_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
 		lims = (struct in_msource *)ims;
 		if ((lims->imsl_st[0] == MCAST_UNDEFINED) &&
 		    (lims->imsl_st[1] == MCAST_UNDEFINED)) {
 			CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims);
 			RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 			free(ims, M_INMFILTER);
 			imf->imf_nsrc--;
 		}
 	}
 }
 
 /*
  * Purge socket-layer filter set.
  */
 static void
 imf_purge(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 		free(ims, M_INMFILTER);
 		imf->imf_nsrc--;
 	}
 	imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED;
 	KASSERT(RB_EMPTY(&imf->imf_sources),
 	    ("%s: imf_sources not empty", __func__));
 }
 
 /*
  * Look up a source filter entry for a multicast group.
  *
  * inm is the group descriptor to work with.
  * haddr is the host-byte-order IPv4 address to look up.
  * noalloc may be non-zero to suppress allocation of sources.
  * *pims will be set to the address of the retrieved or allocated source.
  *
  * SMPng: NOTE: may be called with locks held.
  * Return 0 if successful, otherwise return a non-zero error code.
  */
 static int
 inm_get_source(struct in_multi *inm, const in_addr_t haddr,
     const int noalloc, struct ip_msource **pims)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 
 	find.ims_haddr = haddr;
 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
 	if (ims == NULL && !noalloc) {
 		if (inm->inm_nsrc == in_mcast_maxgrpsrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		nims->ims_haddr = haddr;
 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
 		++inm->inm_nsrc;
 		ims = nims;
 #ifdef KTR
 		CTR3(KTR_IGMPV3, "%s: allocated 0x%08x as %p", __func__,
 		    haddr, ims);
 #endif
 	}
 
 	*pims = ims;
 	return (0);
 }
 
 /*
  * Merge socket-layer source into IGMP-layer source.
  * If rollback is non-zero, perform the inverse of the merge.
  */
 static void
 ims_merge(struct ip_msource *ims, const struct in_msource *lims,
     const int rollback)
 {
 	int n = rollback ? -1 : 1;
 
 	if (lims->imsl_st[0] == MCAST_EXCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].ex -= n;
 	} else if (lims->imsl_st[0] == MCAST_INCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 in -= %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].in -= n;
 	}
 
 	if (lims->imsl_st[1] == MCAST_EXCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 ex += %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].ex += n;
 	} else if (lims->imsl_st[1] == MCAST_INCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 in += %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].in += n;
 	}
 }
 
 /*
  * Atomically update the global in_multi state, when a membership's
  * filter list is being updated in any way.
  *
  * imf is the per-inpcb-membership group filter pointer.
  * A fake imf may be passed for in-kernel consumers.
  *
  * XXX This is a candidate for a set-symmetric-difference style loop
  * which would eliminate the repeated lookup from root of ims nodes,
  * as they share the same key space.
  *
  * If any error occurred this function will back out of refcounts
  * and return a non-zero value.
  */
 static int
 inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *nims;
 	struct in_msource	*lims;
 	int			 schanged, error;
 	int			 nsrc0, nsrc1;
 
 	schanged = 0;
 	error = 0;
 	nsrc1 = nsrc0 = 0;
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	/*
 	 * Update the source filters first, as this may fail.
 	 * Maintain count of in-mode filters at t0, t1. These are
 	 * used to work out if we transition into ASM mode or not.
 	 * Maintain a count of source filters whose state was
 	 * actually modified by this operation.
 	 */
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++;
 		if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++;
 		if (lims->imsl_st[0] == lims->imsl_st[1]) continue;
 		error = inm_get_source(inm, lims->ims_haddr, 0, &nims);
 		++schanged;
 		if (error)
 			break;
 		ims_merge(nims, lims, 0);
 	}
 	if (error) {
 		struct ip_msource *bims;
 
 		RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) {
 			lims = (struct in_msource *)ims;
 			if (lims->imsl_st[0] == lims->imsl_st[1])
 				continue;
 			(void)inm_get_source(inm, lims->ims_haddr, 1, &bims);
 			if (bims == NULL)
 				continue;
 			ims_merge(bims, lims, 1);
 		}
 		goto out_reap;
 	}
 
 	CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1",
 	    __func__, nsrc0, nsrc1);
 
 	/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
 	if (imf->imf_st[0] == imf->imf_st[1] &&
 	    imf->imf_st[1] == MCAST_INCLUDE) {
 		if (nsrc1 == 0) {
 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
 			--inm->inm_st[1].iss_in;
 		}
 	}
 
 	/* Handle filter mode transition on socket. */
 	if (imf->imf_st[0] != imf->imf_st[1]) {
 		CTR3(KTR_IGMPV3, "%s: imf transition %d to %d",
 		    __func__, imf->imf_st[0], imf->imf_st[1]);
 
 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__);
 			--inm->inm_st[1].iss_ex;
 		} else if (imf->imf_st[0] == MCAST_INCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
 			--inm->inm_st[1].iss_in;
 		}
 
 		if (imf->imf_st[1] == MCAST_EXCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__);
 			inm->inm_st[1].iss_ex++;
 		} else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
 			CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__);
 			inm->inm_st[1].iss_in++;
 		}
 	}
 
 	/*
 	 * Track inm filter state in terms of listener counts.
 	 * If there are any exclusive listeners, stack-wide
 	 * membership is exclusive.
 	 * Otherwise, if only inclusive listeners, stack-wide is inclusive.
 	 * If no listeners remain, state is undefined at t1,
 	 * and the IGMP lifecycle for this group should finish.
 	 */
 	if (inm->inm_st[1].iss_ex > 0) {
 		CTR1(KTR_IGMPV3, "%s: transition to EX", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_EXCLUDE;
 	} else if (inm->inm_st[1].iss_in > 0) {
 		CTR1(KTR_IGMPV3, "%s: transition to IN", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_INCLUDE;
 	} else {
 		CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 	}
 
 	/* Decrement ASM listener count on transition out of ASM mode. */
 	if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
 		if ((imf->imf_st[1] != MCAST_EXCLUDE) ||
 		    (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
 			CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__);
 			--inm->inm_st[1].iss_asm;
 		}
 	}
 
 	/* Increment ASM listener count on transition to ASM mode. */
 	if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
 		CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__);
 		inm->inm_st[1].iss_asm++;
 	}
 
 	CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm);
 	inm_print(inm);
 
 out_reap:
 	if (schanged > 0) {
 		CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__);
 		inm_reap(inm);
 	}
 	return (error);
 }
 
 /*
  * Mark an in_multi's filter set deltas as committed.
  * Called by IGMP after a state change has been enqueued.
  */
 void
 inm_commit(struct in_multi *inm)
 {
 	struct ip_msource	*ims;
 
 	CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm);
 	CTR1(KTR_IGMPV3, "%s: pre commit:", __func__);
 	inm_print(inm);
 
 	RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 		ims->ims_st[0] = ims->ims_st[1];
 	}
 	inm->inm_st[0] = inm->inm_st[1];
 }
 
 /*
  * Reap unreferenced nodes from an in_multi's filter set.
  */
 static void
 inm_reap(struct in_multi *inm)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
 		if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 ||
 		    ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 ||
 		    ims->ims_stp != 0)
 			continue;
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
 		free(ims, M_IPMSOURCE);
 		inm->inm_nsrc--;
 	}
 }
 
 /*
  * Purge all source nodes from an in_multi's filter set.
  */
 static void
 inm_purge(struct in_multi *inm)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
 		free(ims, M_IPMSOURCE);
 		inm->inm_nsrc--;
 	}
 }
 
 /*
  * Join a multicast group; unlocked entry point.
  *
  * SMPng: XXX: in_joingroup() is called from in_control() when Giant
  * is not held. Fortunately, ifp is unlikely to have been detached
  * at this point, so we assume it's OK to recurse.
  */
 int
 in_joingroup(struct ifnet *ifp, const struct in_addr *gina,
     /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
 {
 	int error;
 
 	IN_MULTI_LOCK();
 	error = in_joingroup_locked(ifp, gina, imf, pinm);
 	IN_MULTI_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Join a multicast group; real entry point.
  *
  * Only preserves atomicity at inm level.
  * NOTE: imf argument cannot be const due to sys/tree.h limitations.
  *
  * If the IGMP downcall fails, the group is not joined, and an error
  * code is returned.
  */
 int
 in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina,
     /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
 {
 	struct in_mfilter	 timf;
 	struct in_multi		*inm;
 	int			 error;
 
 	IN_MULTI_LOCK_ASSERT();
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 
 	CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__,
 	    ntohl(gina->s_addr), ifp, ifp->if_xname);
 
 	error = 0;
 	inm = NULL;
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		imf = &timf;
 	}
 
 	error = in_getmulti(ifp, gina, &inm);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__);
 		return (error);
 	}
 	IN_MULTI_LIST_LOCK();
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		goto out_inm_release;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to update source", __func__);
 		goto out_inm_release;
 	}
 
  out_inm_release:
 	if (error) {
 
 		CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
 		inm_release_deferred(inm);
 	} else {
 		*pinm = inm;
 	}
 	IN_MULTI_LIST_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Leave a multicast group; unlocked entry point.
  */
 int
 in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	int error;
 
 	IN_MULTI_LOCK();
 	error = in_leavegroup_locked(inm, imf);
 	IN_MULTI_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Leave a multicast group; real entry point.
  * All source filters will be expunged.
  *
  * Only preserves atomicity at inm level.
  *
  * Holding the write lock for the INP which contains imf
  * is highly advisable. We can't assert for it as imf does not
  * contain a back-pointer to the owning inp.
  *
  * Note: This is not the same as inm_release(*) as this function also
  * makes a state change downcall into IGMP.
  */
 int
 in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	struct in_mfilter	 timf;
 	int			 error;
 
 	error = 0;
 
 	IN_MULTI_LOCK_ASSERT();
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 
 	CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__,
 	    inm, ntohl(inm->inm_addr.s_addr),
 	    (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname),
 	    imf);
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
 		imf = &timf;
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 *
 	 * As this particular invocation should not cause any memory
 	 * to be allocated, and there is no opportunity to roll back
 	 * the transaction, it MUST NOT fail.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	CURVNET_SET(inm->inm_ifp->if_vnet);
 	error = igmp_change_state(inm);
 	IF_ADDR_WLOCK(inm->inm_ifp);
 	inm_release_deferred(inm);
 	IF_ADDR_WUNLOCK(inm->inm_ifp);
 	IN_MULTI_LIST_UNLOCK();
 	CURVNET_RESTORE();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
 	CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
 
 	return (error);
 }
 
 /*#ifndef BURN_BRIDGES*/
 /*
  * Join an IPv4 multicast group in (*,G) exclusive mode.
  * The group must be a 224.0.0.0/24 link-scope group.
  * This KPI is for legacy kernel consumers only.
  */
 struct in_multi *
 in_addmulti(struct in_addr *ap, struct ifnet *ifp)
 {
 	struct in_multi *pinm;
 	int error;
 #ifdef INVARIANTS
 	char addrbuf[INET_ADDRSTRLEN];
 #endif
 
 	KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)),
 	    ("%s: %s not in 224.0.0.0/24", __func__,
 	    inet_ntoa_r(*ap, addrbuf)));
 
 	error = in_joingroup(ifp, ap, NULL, &pinm);
 	if (error != 0)
 		pinm = NULL;
 
 	return (pinm);
 }
 
 /*
  * Block or unblock an ASM multicast source on an inpcb.
  * This implements the delta-based API described in RFC 3678.
  *
  * The delta-based API applies only to exclusive-mode memberships.
  * An IGMP downcall will be performed.
  *
  * SMPng: NOTE: Must take Giant as a join may create a new ifma.
  *
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	struct rm_priotracker		 in_ifa_tracker;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_msource		*ims;
 	struct in_multi			*inm;
 	size_t				 idx;
 	uint16_t			 fmode;
 	int				 error, doblock;
 
 	ifp = NULL;
 	error = 0;
 	doblock = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 
 	switch (sopt->sopt_name) {
 	case IP_BLOCK_SOURCE:
 	case IP_UNBLOCK_SOURCE: {
 		struct ip_mreq_source	 mreqs;
 
 		error = sooptcopyin(sopt, &mreqs,
 		    sizeof(struct ip_mreq_source),
 		    sizeof(struct ip_mreq_source));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 
 		ssa->sin.sin_family = AF_INET;
 		ssa->sin.sin_len = sizeof(struct sockaddr_in);
 		ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 
 		if (!in_nullhost(mreqs.imr_interface)) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		}
 		if (sopt->sopt_name == IP_BLOCK_SOURCE)
 			doblock = 1;
 
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 		break;
 	    }
 
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = sooptcopyin(sopt, &gsr,
 		    sizeof(struct group_source_req),
 		    sizeof(struct group_source_req));
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (ssa->sin.sin_family != AF_INET ||
 		    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 
 		ifp = ifnet_byindex(gsr.gsr_interface);
 
 		if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
 			doblock = 1;
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	/*
 	 * Check if we are actually a member of this group.
 	 */
+	IN_MULTI_LOCK();
 	imo = inp_findmoptions(inp);
 	idx = imo_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->imo_mfilters == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 
 	KASSERT(imo->imo_mfilters != NULL,
 	    ("%s: imo_mfilters not allocated", __func__));
 	imf = &imo->imo_mfilters[idx];
 	inm = imo->imo_membership[idx];
 
 	/*
 	 * Attempting to use the delta-based API on an
 	 * non exclusive-mode membership is an error.
 	 */
 	fmode = imf->imf_st[0];
 	if (fmode != MCAST_EXCLUDE) {
 		error = EINVAL;
 		goto out_inp_locked;
 	}
 
 	/*
 	 * Deal with error cases up-front:
 	 *  Asked to block, but already blocked; or
 	 *  Asked to unblock, but nothing to unblock.
 	 * If adding a new block entry, allocate it.
 	 */
 	ims = imo_match_source(imo, idx, &ssa->sa);
 	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
 		CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__,
 		    ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not ");
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	if (doblock) {
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
 		ims = imf_graft(imf, fmode, &ssa->sin);
 		if (ims == NULL)
 			error = ENOMEM;
 	} else {
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
 		error = imf_prune(imf, &ssa->sin);
 	}
 
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__);
 		goto out_imf_rollback;
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
-	IN_MULTI_LOCK();
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		IN_MULTI_LIST_UNLOCK();
-		goto out_in_multi_locked;
+		goto out_imf_rollback;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	IN_MULTI_LIST_UNLOCK();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
-out_in_multi_locked:
-
-	IN_MULTI_UNLOCK();
 out_imf_rollback:
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
+	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Given an inpcb, return its multicast options structure pointer.  Accepts
  * an unlocked inpcb pointer, but will return it locked.  May sleep.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  * SMPng: NOTE: Returns with the INP write lock held.
  */
 static struct ip_moptions *
 inp_findmoptions(struct inpcb *inp)
 {
 	struct ip_moptions	 *imo;
 	struct in_multi		**immp;
 	struct in_mfilter	 *imfp;
 	size_t			  idx;
 
 	INP_WLOCK(inp);
 	if (inp->inp_moptions != NULL)
 		return (inp->inp_moptions);
 
 	INP_WUNLOCK(inp);
 
 	imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
 	immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS,
 	    M_WAITOK | M_ZERO);
 	imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS,
 	    M_INMFILTER, M_WAITOK);
 
 	imo->imo_multicast_ifp = NULL;
 	imo->imo_multicast_addr.s_addr = INADDR_ANY;
 	imo->imo_multicast_vif = -1;
 	imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
 	imo->imo_multicast_loop = in_mcast_loop;
 	imo->imo_num_memberships = 0;
 	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
 	imo->imo_membership = immp;
 
 	/* Initialize per-group source filters. */
 	for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++)
 		imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE);
 	imo->imo_mfilters = imfp;
 
 	INP_WLOCK(inp);
 	if (inp->inp_moptions != NULL) {
 		free(imfp, M_INMFILTER);
 		free(immp, M_IPMOPTS);
 		free(imo, M_IPMOPTS);
 		return (inp->inp_moptions);
 	}
 	inp->inp_moptions = imo;
 	return (imo);
 }
 
 static void
 inp_gcmoptions(struct ip_moptions *imo)
 {
-	struct in_mfilter	*imf;
+	struct in_mfilter *imf;
 	struct in_multi *inm;
 	struct ifnet *ifp;
-	size_t			 idx, nmships;
+	size_t idx, nmships;
 
 	nmships = imo->imo_num_memberships;
 	for (idx = 0; idx < nmships; ++idx) {
 		imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL;
 		if (imf)
 			imf_leave(imf);
 		inm = imo->imo_membership[idx];
 		ifp = inm->inm_ifp;
 		if (ifp != NULL) {
 			CURVNET_SET(ifp->if_vnet);
 			(void)in_leavegroup(inm, imf);
 			CURVNET_RESTORE();
 		} else {
 			(void)in_leavegroup(inm, imf);
 		}
 		if (imf)
 			imf_purge(imf);
 	}
 
 	if (imo->imo_mfilters)
 		free(imo->imo_mfilters, M_INMFILTER);
 	free(imo->imo_membership, M_IPMOPTS);
 	free(imo, M_IPMOPTS);
 }
 
 /*
  * Discard the IP multicast options (and source filters).  To minimize
  * the amount of work done while holding locks such as the INP's
  * pcbinfo lock (which is used in the receive path), the free
  * operation is deferred to the epoch callback task.
  */
 void
 inp_freemoptions(struct ip_moptions *imo)
 {
 	if (imo == NULL)
 		return;
 	inp_gcmoptions(imo);
 }
 
 /*
  * Atomically get source filters on a socket for an IPv4 multicast group.
  * Called with INP lock held; returns with lock released.
  */
 static int
 inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct ip_moptions	*imo;
 	struct in_mfilter	*imf;
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 	struct sockaddr_in	*psin;
 	struct sockaddr_storage	*ptss;
 	struct sockaddr_storage	*tss;
 	int			 error;
 	size_t			 idx, nsrcs, ncsrcs;
 
 	INP_WLOCK_ASSERT(inp);
 
 	imo = inp->inp_moptions;
 	KASSERT(imo != NULL, ("%s: null ip_moptions", __func__));
 
 	INP_WUNLOCK(inp);
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EINVAL);
 
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EINVAL);
 
 	INP_WLOCK(inp);
 
 	/*
 	 * Lookup group on the socket.
 	 */
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	idx = imo_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->imo_mfilters == NULL) {
 		INP_WUNLOCK(inp);
 		return (EADDRNOTAVAIL);
 	}
 	imf = &imo->imo_mfilters[idx];
 
 	/*
 	 * Ignore memberships which are in limbo.
 	 */
 	if (imf->imf_st[1] == MCAST_UNDEFINED) {
 		INP_WUNLOCK(inp);
 		return (EAGAIN);
 	}
 	msfr.msfr_fmode = imf->imf_st[1];
 
 	/*
 	 * If the user specified a buffer, copy out the source filter
 	 * entries to userland gracefully.
 	 * We only copy out the number of entries which userland
 	 * has asked for, but we always tell userland how big the
 	 * buffer really needs to be.
 	 */
 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
 		msfr.msfr_nsrcs = in_mcast_maxsocksrc;
 	tss = NULL;
 	if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
 		tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_NOWAIT | M_ZERO);
 		if (tss == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Count number of sources in-mode at t0.
 	 * If buffer space exists and remains, copy out source entries.
 	 */
 	nsrcs = msfr.msfr_nsrcs;
 	ncsrcs = 0;
 	ptss = tss;
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == MCAST_UNDEFINED ||
 		    lims->imsl_st[0] != imf->imf_st[0])
 			continue;
 		++ncsrcs;
 		if (tss != NULL && nsrcs > 0) {
 			psin = (struct sockaddr_in *)ptss;
 			psin->sin_family = AF_INET;
 			psin->sin_len = sizeof(struct sockaddr_in);
 			psin->sin_addr.s_addr = htonl(lims->ims_haddr);
 			psin->sin_port = 0;
 			++ptss;
 			--nsrcs;
 		}
 	}
 
 	INP_WUNLOCK(inp);
 
 	if (tss != NULL) {
 		error = copyout(tss, msfr.msfr_srcs,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		free(tss, M_TEMP);
 		if (error)
 			return (error);
 	}
 
 	msfr.msfr_nsrcs = ncsrcs;
 	error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
 
 	return (error);
 }
 
 /*
  * Return the IP multicast options in response to user getsockopt().
  */
 int
 inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct rm_priotracker	 in_ifa_tracker;
 	struct ip_mreqn		 mreqn;
 	struct ip_moptions	*imo;
 	struct ifnet		*ifp;
 	struct in_ifaddr	*ia;
 	int			 error, optval;
 	u_char			 coptval;
 
 	INP_WLOCK(inp);
 	imo = inp->inp_moptions;
 	/*
 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
 	 * or is a divert socket, reject it.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = 0;
 	switch (sopt->sopt_name) {
 	case IP_MULTICAST_VIF:
 		if (imo != NULL)
 			optval = imo->imo_multicast_vif;
 		else
 			optval = -1;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MULTICAST_IF:
 		memset(&mreqn, 0, sizeof(struct ip_mreqn));
 		if (imo != NULL) {
 			ifp = imo->imo_multicast_ifp;
 			if (!in_nullhost(imo->imo_multicast_addr)) {
 				mreqn.imr_address = imo->imo_multicast_addr;
 			} else if (ifp != NULL) {
 				struct epoch_tracker et;
 
 				mreqn.imr_ifindex = ifp->if_index;
 				NET_EPOCH_ENTER(et);
 				IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 				if (ia != NULL)
 					mreqn.imr_address =
 					    IA_SIN(ia)->sin_addr;
 				NET_EPOCH_EXIT(et);
 			}
 		}
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
 			error = sooptcopyout(sopt, &mreqn,
 			    sizeof(struct ip_mreqn));
 		} else {
 			error = sooptcopyout(sopt, &mreqn.imr_address,
 			    sizeof(struct in_addr));
 		}
 		break;
 
 	case IP_MULTICAST_TTL:
 		if (imo == NULL)
 			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
 		else
 			optval = coptval = imo->imo_multicast_ttl;
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(u_char))
 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
 		else
 			error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MULTICAST_LOOP:
 		if (imo == NULL)
 			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
 		else
 			optval = coptval = imo->imo_multicast_loop;
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(u_char))
 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
 		else
 			error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MSFILTER:
 		if (imo == NULL) {
 			error = EADDRNOTAVAIL;
 			INP_WUNLOCK(inp);
 		} else {
 			error = inp_get_source_filters(inp, sopt);
 		}
 		break;
 
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Look up the ifnet to use for a multicast group membership,
  * given the IPv4 address of an interface, and the IPv4 group address.
  *
  * This routine exists to support legacy multicast applications
  * which do not understand that multicast memberships are scoped to
  * specific physical links in the networking stack, or which need
  * to join link-scope groups before IPv4 addresses are configured.
  *
  * If inp is non-NULL, use this socket's current FIB number for any
  * required FIB lookup.
  * If ina is INADDR_ANY, look up the group address in the unicast FIB,
  * and use its ifp; usually, this points to the default next-hop.
  *
  * If the FIB lookup fails, attempt to use the first non-loopback
  * interface with multicast capability in the system as a
  * last resort. The legacy IPv4 ASM API requires that we do
  * this in order to allow groups to be joined when the routing
  * table has not yet been populated during boot.
  *
  * Returns NULL if no ifp could be found.
  *
  * FUTURE: Implement IPv4 source-address selection.
  */
 static struct ifnet *
 inp_lookup_mcast_ifp(const struct inpcb *inp,
     const struct sockaddr_in *gsin, const struct in_addr ina)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct ifnet *ifp;
 	struct nhop4_basic nh4;
 	uint32_t fibnum;
 
 	KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__));
 	KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)),
 	    ("%s: not multicast", __func__));
 
 	ifp = NULL;
 	if (!in_nullhost(ina)) {
 		IN_IFADDR_RLOCK(&in_ifa_tracker);
 		INADDR_TO_IFP(ina, ifp);
 		IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 	} else {
 		fibnum = inp ? inp->inp_inc.inc_fibnum : 0;
 		if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0)
 			ifp = nh4.nh_ifp;
 		else {
 			struct in_ifaddr *ia;
 			struct ifnet *mifp;
 
 			mifp = NULL;
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 				mifp = ia->ia_ifp;
 				if (!(mifp->if_flags & IFF_LOOPBACK) &&
 				     (mifp->if_flags & IFF_MULTICAST)) {
 					ifp = mifp;
 					break;
 				}
 			}
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		}
 	}
 
 	return (ifp);
 }
 
 /*
  * Join an IPv4 multicast group, possibly with a source.
  */
 static int
 inp_join_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_multi			*inm;
 	struct in_msource		*lims;
 	size_t				 idx;
 	int				 error, is_new;
 
 	ifp = NULL;
 	imf = NULL;
 	lims = NULL;
 	error = 0;
 	is_new = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	switch (sopt->sopt_name) {
 	case IP_ADD_MEMBERSHIP: {
 		struct ip_mreqn mreqn;
 
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn))
 			error = sooptcopyin(sopt, &mreqn,
 			    sizeof(struct ip_mreqn), sizeof(struct ip_mreqn));
 		else
 			error = sooptcopyin(sopt, &mreqn,
 			    sizeof(struct ip_mreq), sizeof(struct ip_mreq));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqn.imr_multiaddr;
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn) &&
 		    mreqn.imr_ifindex != 0)
 			ifp = ifnet_byindex(mreqn.imr_ifindex);
 		else
 			ifp = inp_lookup_mcast_ifp(inp, &gsa->sin,
 			    mreqn.imr_address);
 		break;
 	}
 	case IP_ADD_SOURCE_MEMBERSHIP: {
 		struct ip_mreq_source	 mreqs;
 
 		error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source),
 			    sizeof(struct ip_mreq_source));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = ssa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = ssa->sin.sin_len =
 		    sizeof(struct sockaddr_in);
 
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 
 		ifp = inp_lookup_mcast_ifp(inp, &gsa->sin,
 		    mreqs.imr_interface);
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 		break;
 	}
 
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		/*
 		 * Overwrite the port field if present, as the sockaddr
 		 * being copied in may be matched with a binary comparison.
 		 */
 		gsa->sin.sin_port = 0;
 		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			if (ssa->sin.sin_family != AF_INET ||
 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 				return (EINVAL);
 			ssa->sin.sin_port = 0;
 		}
 
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
-		break;
 	}
 
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EADDRNOTAVAIL);
 
+	IN_MULTI_LOCK();
 	imo = inp_findmoptions(inp);
 	idx = imo_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1) {
 		is_new = 1;
 	} else {
 		inm = imo->imo_membership[idx];
 		imf = &imo->imo_mfilters[idx];
 		if (ssa->ss.ss_family != AF_UNSPEC) {
 			/*
 			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
 			 * is an error. On an existing inclusive membership,
 			 * it just adds the source to the filter list.
 			 */
 			if (imf->imf_st[1] != MCAST_INCLUDE) {
 				error = EINVAL;
 				goto out_inp_locked;
 			}
 			/*
 			 * Throw out duplicates.
 			 *
 			 * XXX FIXME: This makes a naive assumption that
 			 * even if entries exist for *ssa in this imf,
 			 * they will be rejected as dupes, even if they
 			 * are not valid in the current mode (in-mode).
 			 *
 			 * in_msource is transactioned just as for anything
 			 * else in SSM -- but note naive use of inm_graft()
 			 * below for allocating new filter entries.
 			 *
 			 * This is only an issue if someone mixes the
 			 * full-state SSM API with the delta-based API,
 			 * which is discouraged in the relevant RFCs.
 			 */
 			lims = imo_match_source(imo, idx, &ssa->sa);
 			if (lims != NULL /*&&
 			    lims->imsl_st[1] == MCAST_INCLUDE*/) {
 				error = EADDRNOTAVAIL;
 				goto out_inp_locked;
 			}
 		} else {
 			/*
 			 * MCAST_JOIN_GROUP on an existing exclusive
 			 * membership is an error; return EADDRINUSE
 			 * to preserve 4.4BSD API idempotence, and
 			 * avoid tedious detour to code below.
 			 * NOTE: This is bending RFC 3678 a bit.
 			 *
 			 * On an existing inclusive membership, this is also
 			 * an error; if you want to change filter mode,
 			 * you must use the userland API setsourcefilter().
 			 * XXX We don't reject this for imf in UNDEFINED
 			 * state at t1, because allocation of a filter
 			 * is atomic with allocation of a membership.
 			 */
 			error = EINVAL;
 			if (imf->imf_st[1] == MCAST_EXCLUDE)
 				error = EADDRINUSE;
 			goto out_inp_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	if (is_new) {
 		if (imo->imo_num_memberships == imo->imo_max_memberships) {
 			error = imo_grow(imo);
 			if (error)
 				goto out_inp_locked;
 		}
 		/*
 		 * Allocate the new slot upfront so we can deal with
 		 * grafting the new source filter in same code path
 		 * as for join-source on existing membership.
 		 */
 		idx = imo->imo_num_memberships;
 		imo->imo_membership[idx] = NULL;
 		imo->imo_num_memberships++;
 		KASSERT(imo->imo_mfilters != NULL,
 		    ("%s: imf_mfilters vector was not allocated", __func__));
 		imf = &imo->imo_mfilters[idx];
 		KASSERT(RB_EMPTY(&imf->imf_sources),
 		    ("%s: imf_sources not empty", __func__));
 	}
 
 	/*
 	 * Graft new source into filter list for this inpcb's
 	 * membership of the group. The in_multi may not have
 	 * been allocated yet if this is a new membership, however,
 	 * the in_mfilter slot will be allocated and must be initialized.
 	 *
 	 * Note: Grafting of exclusive mode filters doesn't happen
 	 * in this path.
 	 * XXX: Should check for non-NULL lims (node exists but may
 	 * not be in-mode) for interop with full-state API.
 	 */
 	if (ssa->ss.ss_family != AF_UNSPEC) {
 		/* Membership starts in IN mode */
 		if (is_new) {
 			CTR1(KTR_IGMPV3, "%s: new join w/source", __func__);
 			imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
 		} else {
 			CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
 		}
 		lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin);
 		if (lims == NULL) {
 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
 			    __func__);
 			error = ENOMEM;
 			goto out_imo_free;
 		}
 	} else {
 		/* No address specified; Membership starts in EX mode */
 		if (is_new) {
 			CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__);
 			imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
-	in_pcbref(inp);
-	INP_WUNLOCK(inp);
-	IN_MULTI_LOCK();
-
 	if (is_new) {
 		error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf,
 		    &inm);
 		if (error) {
                         CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", 
                             __func__);
                         IN_MULTI_LIST_UNLOCK();
 			goto out_imo_free;
 		}
 		inm_acquire(inm);
+		KASSERT(imo->imo_membership[idx] == NULL,
+		    ("%s: imo_membership already allocated", __func__));
 		imo->imo_membership[idx] = inm;
 	} else {
 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 		IN_MULTI_LIST_LOCK();
 		error = inm_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
 				 __func__);
 			IN_MULTI_LIST_UNLOCK();
-			goto out_in_multi_locked;
+			goto out_imf_rollback;
 		}
 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 		error = igmp_change_state(inm);
 		IN_MULTI_LIST_UNLOCK();
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
 			    __func__);
-			goto out_in_multi_locked;
+			goto out_imf_rollback;
 		}
 	}
 
-out_in_multi_locked:
-
-	IN_MULTI_UNLOCK();
-	INP_WLOCK(inp);
-	if (in_pcbrele_wlocked(inp))
-		return (ENXIO);
+out_imf_rollback:
 	if (error) {
 		imf_rollback(imf);
 		if (is_new)
 			imf_purge(imf);
 		else
 			imf_reap(imf);
 	} else {
 		imf_commit(imf);
 	}
 
 out_imo_free:
 	if (error && is_new) {
 		inm = imo->imo_membership[idx];
 		if (inm != NULL) {
 			IN_MULTI_LIST_LOCK();
 			inm_release_deferred(inm);
 			IN_MULTI_LIST_UNLOCK();
 		}
 		imo->imo_membership[idx] = NULL;
 		--imo->imo_num_memberships;
 	}
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
+	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Leave an IPv4 multicast group on an inpcb, possibly with a source.
  */
 static int
 inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	struct ip_mreq_source		 mreqs;
 	struct rm_priotracker		 in_ifa_tracker;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_msource		*ims;
 	struct in_multi			*inm;
 	size_t				 idx;
 	int				 error, is_final;
 
 	ifp = NULL;
 	error = 0;
 	is_final = 1;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	switch (sopt->sopt_name) {
 	case IP_DROP_MEMBERSHIP:
 	case IP_DROP_SOURCE_MEMBERSHIP:
 		if (sopt->sopt_name == IP_DROP_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq),
 			    sizeof(struct ip_mreq));
 			/*
 			 * Swap interface and sourceaddr arguments,
 			 * as ip_mreq and ip_mreq_source are laid
 			 * out differently.
 			 */
 			mreqs.imr_interface = mreqs.imr_sourceaddr;
 			mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
 		} else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq_source),
 			    sizeof(struct ip_mreq_source));
 		}
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 
 		if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
 			ssa->sin.sin_family = AF_INET;
 			ssa->sin.sin_len = sizeof(struct sockaddr_in);
 			ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 		}
 
 		/*
 		 * Attempt to look up hinted ifp from interface address.
 		 * Fallthrough with null ifp iff lookup fails, to
 		 * preserve 4.4BSD mcast API idempotence.
 		 * XXX NOTE WELL: The RFC 3678 API is preferred because
 		 * using an IPv4 address as a key is racy.
 		 */
 		if (!in_nullhost(mreqs.imr_interface)) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		}
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 
 		break;
 
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			if (ssa->sin.sin_family != AF_INET ||
 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 				return (EINVAL);
 		}
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 
 		ifp = ifnet_byindex(gsr.gsr_interface);
 
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	/*
 	 * Find the membership in the membership array.
 	 */
+	IN_MULTI_LOCK();
 	imo = inp_findmoptions(inp);
 	idx = imo_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 	inm = imo->imo_membership[idx];
 	imf = &imo->imo_mfilters[idx];
 
 	if (ssa->ss.ss_family != AF_UNSPEC)
 		is_final = 0;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If we were instructed only to leave a given source, do so.
 	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
 	 */
 	if (is_final) {
 		imf_leave(imf);
 	} else {
 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
 			error = EADDRNOTAVAIL;
 			goto out_inp_locked;
 		}
 		ims = imo_match_source(imo, idx, &ssa->sa);
 		if (ims == NULL) {
 			CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent",
 			    __func__, ntohl(ssa->sin.sin_addr.s_addr), "not ");
 			error = EADDRNOTAVAIL;
 			goto out_inp_locked;
 		}
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
 		error = imf_prune(imf, &ssa->sin);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
 			    __func__);
 			goto out_inp_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
-	in_pcbref(inp);
-	INP_WUNLOCK(inp);
-	IN_MULTI_LOCK();
 
 	if (is_final) {
 		/*
 		 * Give up the multicast address record to which
 		 * the membership points.
 		 */
 		(void)in_leavegroup_locked(inm, imf);
 	} else {
 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 		IN_MULTI_LIST_LOCK();
 		error = inm_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
 			    __func__);
 			IN_MULTI_LIST_UNLOCK();
-			goto out_in_multi_locked;
+			goto out_imf_rollback;
 		}
 
 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 		error = igmp_change_state(inm);
 		IN_MULTI_LIST_UNLOCK();
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
 			    __func__);
 		}
 	}
 
-out_in_multi_locked:
-
-	IN_MULTI_UNLOCK();
-	INP_WLOCK(inp);
-	if (in_pcbrele_wlocked(inp))
-		return (ENXIO);
-
+out_imf_rollback:
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 	if (is_final) {
 		/* Remove the gap in the membership and filter array. */
 		KASSERT(RB_EMPTY(&imf->imf_sources),
-		    ("%s: imf_sources not empty", __func__));
+		    ("%s: imf_sources (%p %p %zu) not empty", __func__, imf, imo, idx));
 		for (++idx; idx < imo->imo_num_memberships; ++idx) {
 			imo->imo_membership[idx - 1] = imo->imo_membership[idx];
 			imo->imo_mfilters[idx - 1] = imo->imo_mfilters[idx];
 		}
 		imf_init(&imo->imo_mfilters[idx - 1], MCAST_UNDEFINED,
 		    MCAST_EXCLUDE);
 		imo->imo_num_memberships--;
 	}
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
+	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Select the interface for transmitting IPv4 multicast datagrams.
  *
  * Either an instance of struct in_addr or an instance of struct ip_mreqn
  * may be passed to this socket option. An address of INADDR_ANY or an
  * interface index of 0 is used to remove a previous selection.
  * When no interface is selected, one is chosen for every send.
  */
 static int
 inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct rm_priotracker	 in_ifa_tracker;
 	struct in_addr		 addr;
 	struct ip_mreqn		 mreqn;
 	struct ifnet		*ifp;
 	struct ip_moptions	*imo;
 	int			 error;
 
 	if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
 		/*
 		 * An interface index was specified using the
 		 * Linux-derived ip_mreqn structure.
 		 */
 		error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn),
 		    sizeof(struct ip_mreqn));
 		if (error)
 			return (error);
 
 		if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex)
 			return (EINVAL);
 
 		if (mreqn.imr_ifindex == 0) {
 			ifp = NULL;
 		} else {
 			ifp = ifnet_byindex(mreqn.imr_ifindex);
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 	} else {
 		/*
 		 * An interface was specified by IPv4 address.
 		 * This is the traditional BSD usage.
 		 */
 		error = sooptcopyin(sopt, &addr, sizeof(struct in_addr),
 		    sizeof(struct in_addr));
 		if (error)
 			return (error);
 		if (in_nullhost(addr)) {
 			ifp = NULL;
 		} else {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			INADDR_TO_IFP(addr, ifp);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 		CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = 0x%08x", __func__, ifp,
 		    ntohl(addr.s_addr));
 	}
 
 	/* Reject interfaces which do not support multicast. */
 	if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EOPNOTSUPP);
 
 	imo = inp_findmoptions(inp);
 	imo->imo_multicast_ifp = ifp;
 	imo->imo_multicast_addr.s_addr = INADDR_ANY;
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Atomically set source filters on a socket for an IPv4 multicast group.
- *
- * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  */
 static int
 inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct in_mfilter	*imf;
 	struct ip_moptions	*imo;
 	struct in_multi		*inm;
 	size_t			 idx;
 	int			 error;
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
 		return (ENOBUFS);
 
 	if ((msfr.msfr_fmode != MCAST_EXCLUDE &&
 	     msfr.msfr_fmode != MCAST_INCLUDE))
 		return (EINVAL);
 
 	if (msfr.msfr_group.ss_family != AF_INET ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	gsa->sin.sin_port = 0;	/* ignore port */
 
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EADDRNOTAVAIL);
 
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 
 	/*
 	 * Take the INP write lock.
 	 * Check if this socket is a member of this group.
 	 */
+	IN_MULTI_LOCK();
 	imo = inp_findmoptions(inp);
 	idx = imo_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->imo_mfilters == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 	inm = imo->imo_membership[idx];
 	imf = &imo->imo_mfilters[idx];
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	imf->imf_st[1] = msfr.msfr_fmode;
 
 	/*
 	 * Apply any new source filters, if present.
 	 * Make a copy of the user-space source vector so
 	 * that we may copy them with a single copyin. This
 	 * allows us to deal with page faults up-front.
 	 */
 	if (msfr.msfr_nsrcs > 0) {
 		struct in_msource	*lims;
 		struct sockaddr_in	*psin;
 		struct sockaddr_storage	*kss, *pkss;
 		int			 i;
 
 		INP_WUNLOCK(inp);
  
 		CTR2(KTR_IGMPV3, "%s: loading %lu source list entries",
 		    __func__, (unsigned long)msfr.msfr_nsrcs);
 		kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_WAITOK);
 		error = copyin(msfr.msfr_srcs, kss,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		if (error) {
 			free(kss, M_TEMP);
 			return (error);
 		}
 
 		INP_WLOCK(inp);
 
 		/*
 		 * Mark all source filters as UNDEFINED at t1.
 		 * Restore new group filter mode, as imf_leave()
 		 * will set it to INCLUDE.
 		 */
 		imf_leave(imf);
 		imf->imf_st[1] = msfr.msfr_fmode;
 
 		/*
 		 * Update socket layer filters at t1, lazy-allocating
 		 * new entries. This saves a bunch of memory at the
 		 * cost of one RB_FIND() per source entry; duplicate
 		 * entries in the msfr_nsrcs vector are ignored.
 		 * If we encounter an error, rollback transaction.
 		 *
 		 * XXX This too could be replaced with a set-symmetric
 		 * difference like loop to avoid walking from root
 		 * every time, as the key space is common.
 		 */
 		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
 			psin = (struct sockaddr_in *)pkss;
 			if (psin->sin_family != AF_INET) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 			if (psin->sin_len != sizeof(struct sockaddr_in)) {
 				error = EINVAL;
 				break;
 			}
 			error = imf_get_source(imf, psin, &lims);
 			if (error)
 				break;
 			lims->imsl_st[1] = imf->imf_st[1];
 		}
 		free(kss, M_TEMP);
 	}
 
 	if (error)
 		goto out_imf_rollback;
 
 	INP_WLOCK_ASSERT(inp);
-	IN_MULTI_LOCK();
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		IN_MULTI_LIST_UNLOCK();
-		goto out_in_multi_locked;
+		goto out_imf_rollback;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	IN_MULTI_LIST_UNLOCK();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
-out_in_multi_locked:
-
-	IN_MULTI_UNLOCK();
-
 out_imf_rollback:
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
+	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Set the IP multicast options in response to user setsockopt().
  *
  * Many of the socket options handled in this function duplicate the
  * functionality of socket options in the regular unicast API. However,
  * it is not possible to merge the duplicate code, because the idempotence
  * of the IPv4 multicast part of the BSD Sockets API must be preserved;
  * the effects of these options must be treated as separate and distinct.
  *
  * SMPng: XXX: Unlocked read of inp_socket believed OK.
  * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING
  * is refactored to no longer use vifs.
  */
 int
 inp_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip_moptions	*imo;
 	int			 error;
 
 	error = 0;
 
 	/*
 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
 	 * or is a divert socket, reject it.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
 		return (EOPNOTSUPP);
 
 	switch (sopt->sopt_name) {
 	case IP_MULTICAST_VIF: {
 		int vifi;
 		/*
 		 * Select a multicast VIF for transmission.
 		 * Only useful if multicast forwarding is active.
 		 */
 		if (legal_vif_num == NULL) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int));
 		if (error)
 			break;
 		if (!legal_vif_num(vifi) && (vifi != -1)) {
 			error = EINVAL;
 			break;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_vif = vifi;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_MULTICAST_IF:
 		error = inp_set_multicast_if(inp, sopt);
 		break;
 
 	case IP_MULTICAST_TTL: {
 		u_char ttl;
 
 		/*
 		 * Set the IP time-to-live for outgoing multicast packets.
 		 * The original multicast API required a char argument,
 		 * which is inconsistent with the rest of the socket API.
 		 * We allow either a char or an int.
 		 */
 		if (sopt->sopt_valsize == sizeof(u_char)) {
 			error = sooptcopyin(sopt, &ttl, sizeof(u_char),
 			    sizeof(u_char));
 			if (error)
 				break;
 		} else {
 			u_int ittl;
 
 			error = sooptcopyin(sopt, &ittl, sizeof(u_int),
 			    sizeof(u_int));
 			if (error)
 				break;
 			if (ittl > 255) {
 				error = EINVAL;
 				break;
 			}
 			ttl = (u_char)ittl;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_ttl = ttl;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_MULTICAST_LOOP: {
 		u_char loop;
 
 		/*
 		 * Set the loopback flag for outgoing multicast packets.
 		 * Must be zero or one.  The original multicast API required a
 		 * char argument, which is inconsistent with the rest
 		 * of the socket API.  We allow either a char or an int.
 		 */
 		if (sopt->sopt_valsize == sizeof(u_char)) {
 			error = sooptcopyin(sopt, &loop, sizeof(u_char),
 			    sizeof(u_char));
 			if (error)
 				break;
 		} else {
 			u_int iloop;
 
 			error = sooptcopyin(sopt, &iloop, sizeof(u_int),
 					    sizeof(u_int));
 			if (error)
 				break;
 			loop = (u_char)iloop;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_loop = !!loop;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_ADD_MEMBERSHIP:
 	case IP_ADD_SOURCE_MEMBERSHIP:
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		error = inp_join_group(inp, sopt);
 		break;
 
 	case IP_DROP_MEMBERSHIP:
 	case IP_DROP_SOURCE_MEMBERSHIP:
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		error = inp_leave_group(inp, sopt);
 		break;
 
 	case IP_BLOCK_SOURCE:
 	case IP_UNBLOCK_SOURCE:
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = inp_block_unblock_source(inp, sopt);
 		break;
 
 	case IP_MSFILTER:
 		error = inp_set_source_filters(inp, sopt);
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Expose IGMP's multicast filter mode and source list(s) to userland,
  * keyed by (ifindex, group).
  * The filter mode is written out as a uint32_t, followed by
  * 0..n of struct in_addr.
  * For use by ifmcstat(8).
  * SMPng: NOTE: unlocked read of ifindex space.
  */
 static int
 sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
 {
 	struct in_addr			 src, group;
 	struct epoch_tracker		 et;
 	struct ifnet			*ifp;
 	struct ifmultiaddr		*ifma;
 	struct in_multi			*inm;
 	struct ip_msource		*ims;
 	int				*name;
 	int				 retval;
 	u_int				 namelen;
 	uint32_t			 fmode, ifindex;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 2)
 		return (EINVAL);
 
 	ifindex = name[0];
 	if (ifindex <= 0 || ifindex > V_if_index) {
 		CTR2(KTR_IGMPV3, "%s: ifindex %u out of range",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 
 	group.s_addr = name[1];
 	if (!IN_MULTICAST(ntohl(group.s_addr))) {
 		CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast",
 		    __func__, ntohl(group.s_addr));
 		return (EINVAL);
 	}
 
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 
 	retval = sysctl_wire_old_buffer(req,
 	    sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr)));
 	if (retval)
 		return (retval);
 
 	IN_MULTI_LIST_LOCK();
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		if (!in_hosteq(inm->inm_addr, group))
 			continue;
 		fmode = inm->inm_st[1].iss_fmode;
 		retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
 		if (retval != 0)
 			break;
 		RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 			CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__,
 			    ims->ims_haddr);
 			/*
 			 * Only copy-out sources which are in-mode.
 			 */
 			if (fmode != ims_get_mode(inm, ims, 1)) {
 				CTR1(KTR_IGMPV3, "%s: skip non-in-mode",
 				    __func__);
 				continue;
 			}
 			src.s_addr = htonl(ims->ims_haddr);
 			retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr));
 			if (retval != 0)
 				break;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	IN_MULTI_LIST_UNLOCK();
 
 	return (retval);
 }
 
 #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3)
 
 static const char *inm_modestrs[] = {
 	[MCAST_UNDEFINED] = "un",
 	[MCAST_INCLUDE] = "in",
 	[MCAST_EXCLUDE] = "ex",
 };
 _Static_assert(MCAST_UNDEFINED == 0 &&
 	       MCAST_EXCLUDE + 1 == nitems(inm_modestrs),
 	       "inm_modestrs: no longer matches #defines");
 
 static const char *
 inm_mode_str(const int mode)
 {
 
 	if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
 		return (inm_modestrs[mode]);
 	return ("??");
 }
 
 static const char *inm_statestrs[] = {
 	[IGMP_NOT_MEMBER] = "not-member",
 	[IGMP_SILENT_MEMBER] = "silent",
 	[IGMP_REPORTING_MEMBER] = "reporting",
 	[IGMP_IDLE_MEMBER] = "idle",
 	[IGMP_LAZY_MEMBER] = "lazy",
 	[IGMP_SLEEPING_MEMBER] = "sleeping",
 	[IGMP_AWAKENING_MEMBER] = "awakening",
 	[IGMP_G_QUERY_PENDING_MEMBER] = "query-pending",
 	[IGMP_SG_QUERY_PENDING_MEMBER] = "sg-query-pending",
 	[IGMP_LEAVING_MEMBER] = "leaving",
 };
 _Static_assert(IGMP_NOT_MEMBER == 0 &&
 	       IGMP_LEAVING_MEMBER + 1 == nitems(inm_statestrs),
 	       "inm_statetrs: no longer matches #defines");
 
 static const char *
 inm_state_str(const int state)
 {
 
 	if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER)
 		return (inm_statestrs[state]);
 	return ("??");
 }
 
 /*
  * Dump an in_multi structure to the console.
  */
 void
 inm_print(const struct in_multi *inm)
 {
 	int t;
 	char addrbuf[INET_ADDRSTRLEN];
 
 	if ((ktr_mask & KTR_IGMPV3) == 0)
 		return;
 
 	printf("%s: --- begin inm %p ---\n", __func__, inm);
 	printf("addr %s ifp %p(%s) ifma %p\n",
 	    inet_ntoa_r(inm->inm_addr, addrbuf),
 	    inm->inm_ifp,
 	    inm->inm_ifp->if_xname,
 	    inm->inm_ifma);
 	printf("timer %u state %s refcount %u scq.len %u\n",
 	    inm->inm_timer,
 	    inm_state_str(inm->inm_state),
 	    inm->inm_refcount,
 	    inm->inm_scq.mq_len);
 	printf("igi %p nsrc %lu sctimer %u scrv %u\n",
 	    inm->inm_igi,
 	    inm->inm_nsrc,
 	    inm->inm_sctimer,
 	    inm->inm_scrv);
 	for (t = 0; t < 2; t++) {
 		printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
 		    inm_mode_str(inm->inm_st[t].iss_fmode),
 		    inm->inm_st[t].iss_asm,
 		    inm->inm_st[t].iss_ex,
 		    inm->inm_st[t].iss_in,
 		    inm->inm_st[t].iss_rec);
 	}
 	printf("%s: --- end inm %p ---\n", __func__, inm);
 }
 
 #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */
 
 void
 inm_print(const struct in_multi *inm)
 {
 
 }
 
 #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */
 
 RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp);
Index: projects/runtime-coverage-v2/sys/netinet6/in6_mcast.c
===================================================================
--- projects/runtime-coverage-v2/sys/netinet6/in6_mcast.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/netinet6/in6_mcast.c	(revision 347599)
@@ -1,2999 +1,2984 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2009 Bruce Simpson.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * IPv6 multicast socket, group, and socket option processing module.
  * Normative references: RFC 2292, RFC 3492, RFC 3542, RFC 3678, RFC 3810.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/gtaskqueue.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/priv.h>
 #include <sys/ktr.h>
 #include <sys/tree.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 
 #include <netinet/in.h>
 #include <netinet/udp.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/scope6_var.h>
 
 #ifndef KTR_MLD
 #define KTR_MLD KTR_INET6
 #endif
 
 #ifndef __SOCKUNION_DECLARED
 union sockunion {
 	struct sockaddr_storage	ss;
 	struct sockaddr		sa;
 	struct sockaddr_dl	sdl;
 	struct sockaddr_in6	sin6;
 };
 typedef union sockunion sockunion_t;
 #define __SOCKUNION_DECLARED
 #endif /* __SOCKUNION_DECLARED */
 
 static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter",
     "IPv6 multicast PCB-layer source filter");
 MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group");
 static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options");
 static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource",
     "IPv6 multicast MLD-layer source filter");
 
 RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp);
 
 /*
  * Locking:
  * - Lock order is: Giant, INP_WLOCK, IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK.
  * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however
  *   it can be taken by code in net/if.c also.
  * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK.
  *
  * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly
  * any need for in6_multi itself to be virtualized -- it is bound to an ifp
  * anyway no matter what happens.
  */
 struct mtx in6_multi_list_mtx;
 MTX_SYSINIT(in6_multi_mtx, &in6_multi_list_mtx, "in6_multi_list_mtx", MTX_DEF);
 
 struct mtx in6_multi_free_mtx;
 MTX_SYSINIT(in6_multi_free_mtx, &in6_multi_free_mtx, "in6_multi_free_mtx", MTX_DEF);
 
 struct sx in6_multi_sx;
 SX_SYSINIT(in6_multi_sx, &in6_multi_sx, "in6_multi_sx");
 
 
 
 static void	im6f_commit(struct in6_mfilter *);
 static int	im6f_get_source(struct in6_mfilter *imf,
 		    const struct sockaddr_in6 *psin,
 		    struct in6_msource **);
 static struct in6_msource *
 		im6f_graft(struct in6_mfilter *, const uint8_t,
 		    const struct sockaddr_in6 *);
 static void	im6f_leave(struct in6_mfilter *);
 static int	im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *);
 static void	im6f_purge(struct in6_mfilter *);
 static void	im6f_rollback(struct in6_mfilter *);
 static void	im6f_reap(struct in6_mfilter *);
 static int	im6o_grow(struct ip6_moptions *);
 static size_t	im6o_match_group(const struct ip6_moptions *,
 		    const struct ifnet *, const struct sockaddr *);
 static struct in6_msource *
 		im6o_match_source(const struct ip6_moptions *, const size_t,
 		    const struct sockaddr *);
 static void	im6s_merge(struct ip6_msource *ims,
 		    const struct in6_msource *lims, const int rollback);
 static int	in6_getmulti(struct ifnet *, const struct in6_addr *,
 		    struct in6_multi **);
 static int	in6m_get_source(struct in6_multi *inm,
 		    const struct in6_addr *addr, const int noalloc,
 		    struct ip6_msource **pims);
 #ifdef KTR
 static int	in6m_is_ifp_detached(const struct in6_multi *);
 #endif
 static int	in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *);
 static void	in6m_purge(struct in6_multi *);
 static void	in6m_reap(struct in6_multi *);
 static struct ip6_moptions *
 		in6p_findmoptions(struct inpcb *);
 static int	in6p_get_source_filters(struct inpcb *, struct sockopt *);
 static int	in6p_join_group(struct inpcb *, struct sockopt *);
 static int	in6p_leave_group(struct inpcb *, struct sockopt *);
 static struct ifnet *
 		in6p_lookup_mcast_ifp(const struct inpcb *,
 		    const struct sockaddr_in6 *);
 static int	in6p_block_unblock_source(struct inpcb *, struct sockopt *);
 static int	in6p_set_multicast_if(struct inpcb *, struct sockopt *);
 static int	in6p_set_source_filters(struct inpcb *, struct sockopt *);
 static int	sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_DECL(_net_inet6_ip6);	/* XXX Not in any common header. */
 
 static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW, 0,
     "IPv6 multicast");
 
 static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER;
 SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc,
     CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0,
     "Max source filters per group");
 
 static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER;
 SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc,
     CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0,
     "Max source filters per socket");
 
 /* TODO Virtualize this switch. */
 int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
 SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
     &in6_mcast_loop, 0, "Loopback multicast datagrams by default");
 
 static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters,
     "Per-interface stack-wide source filters");
 
 #ifdef KTR
 /*
  * Inline function which wraps assertions for a valid ifp.
  * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
  * is detached.
  */
 static int __inline
 in6m_is_ifp_detached(const struct in6_multi *inm)
 {
 	struct ifnet *ifp;
 
 	KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->in6m_ifma->ifma_ifp;
 	if (ifp != NULL) {
 		/*
 		 * Sanity check that network-layer notion of ifp is the
 		 * same as that of link-layer.
 		 */
 		KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__));
 	}
 
 	return (ifp == NULL);
 }
 #endif
 
 /*
  * Initialize an in6_mfilter structure to a known state at t0, t1
  * with an empty source filter list.
  */
 static __inline void
 im6f_init(struct in6_mfilter *imf, const int st0, const int st1)
 {
 	memset(imf, 0, sizeof(struct in6_mfilter));
 	RB_INIT(&imf->im6f_sources);
 	imf->im6f_st[0] = st0;
 	imf->im6f_st[1] = st1;
 }
 
 /*
  * Resize the ip6_moptions vector to the next power-of-two minus 1.
  * May be called with locks held; do not sleep.
  */
 static int
 im6o_grow(struct ip6_moptions *imo)
 {
 	struct in6_multi	**nmships;
 	struct in6_multi	**omships;
 	struct in6_mfilter	 *nmfilters;
 	struct in6_mfilter	 *omfilters;
 	size_t			  idx;
 	size_t			  newmax;
 	size_t			  oldmax;
 
 	nmships = NULL;
 	nmfilters = NULL;
 	omships = imo->im6o_membership;
 	omfilters = imo->im6o_mfilters;
 	oldmax = imo->im6o_max_memberships;
 	newmax = ((oldmax + 1) * 2) - 1;
 
 	if (newmax <= IPV6_MAX_MEMBERSHIPS) {
 		nmships = (struct in6_multi **)realloc(omships,
 		    sizeof(struct in6_multi *) * newmax, M_IP6MOPTS, M_NOWAIT);
 		nmfilters = (struct in6_mfilter *)realloc(omfilters,
 		    sizeof(struct in6_mfilter) * newmax, M_IN6MFILTER,
 		    M_NOWAIT);
 		if (nmships != NULL && nmfilters != NULL) {
 			/* Initialize newly allocated source filter heads. */
 			for (idx = oldmax; idx < newmax; idx++) {
 				im6f_init(&nmfilters[idx], MCAST_UNDEFINED,
 				    MCAST_EXCLUDE);
 			}
 			imo->im6o_max_memberships = newmax;
 			imo->im6o_membership = nmships;
 			imo->im6o_mfilters = nmfilters;
 		}
 	}
 
 	if (nmships == NULL || nmfilters == NULL) {
 		if (nmships != NULL)
 			free(nmships, M_IP6MOPTS);
 		if (nmfilters != NULL)
 			free(nmfilters, M_IN6MFILTER);
 		return (ETOOMANYREFS);
 	}
 
 	return (0);
 }
 
 /*
  * Find an IPv6 multicast group entry for this ip6_moptions instance
  * which matches the specified group, and optionally an interface.
  * Return its index into the array, or -1 if not found.
  */
 static size_t
 im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group)
 {
 	const struct sockaddr_in6 *gsin6;
 	struct in6_multi	**pinm;
 	int		  idx;
 	int		  nmships;
 
 	gsin6 = (const struct sockaddr_in6 *)group;
 
 	/* The im6o_membership array may be lazy allocated. */
 	if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0)
 		return (-1);
 
 	nmships = imo->im6o_num_memberships;
 	pinm = &imo->im6o_membership[0];
 	for (idx = 0; idx < nmships; idx++, pinm++) {
 		if (*pinm == NULL)
 			continue;
 		if ((ifp == NULL || ((*pinm)->in6m_ifp == ifp)) &&
 		    IN6_ARE_ADDR_EQUAL(&(*pinm)->in6m_addr,
 		    &gsin6->sin6_addr)) {
 			break;
 		}
 	}
 	if (idx >= nmships)
 		idx = -1;
 
 	return (idx);
 }
 
 /*
  * Find an IPv6 multicast source entry for this imo which matches
  * the given group index for this socket, and source address.
  *
  * XXX TODO: The scope ID, if present in src, is stripped before
  * any comparison. We SHOULD enforce scope/zone checks where the source
  * filter entry has a link scope.
  *
  * NOTE: This does not check if the entry is in-mode, merely if
  * it exists, which may not be the desired behaviour.
  */
 static struct in6_msource *
 im6o_match_source(const struct ip6_moptions *imo, const size_t gidx,
     const struct sockaddr *src)
 {
 	struct ip6_msource	 find;
 	struct in6_mfilter	*imf;
 	struct ip6_msource	*ims;
 	const sockunion_t	*psa;
 
 	KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__));
 	KASSERT(gidx != -1 && gidx < imo->im6o_num_memberships,
 	    ("%s: invalid index %d\n", __func__, (int)gidx));
 
 	/* The im6o_mfilters array may be lazy allocated. */
 	if (imo->im6o_mfilters == NULL)
 		return (NULL);
 	imf = &imo->im6o_mfilters[gidx];
 
 	psa = (const sockunion_t *)src;
 	find.im6s_addr = psa->sin6.sin6_addr;
 	in6_clearscope(&find.im6s_addr);		/* XXX */
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 
 	return ((struct in6_msource *)ims);
 }
 
 /*
  * Perform filtering for multicast datagrams on a socket by group and source.
  *
  * Returns 0 if a datagram should be allowed through, or various error codes
  * if the socket was not a member of the group, or the source was muted, etc.
  */
 int
 im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group, const struct sockaddr *src)
 {
 	size_t gidx;
 	struct in6_msource *ims;
 	int mode;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 
 	gidx = im6o_match_group(imo, ifp, group);
 	if (gidx == -1)
 		return (MCAST_NOTGMEMBER);
 
 	/*
 	 * Check if the source was included in an (S,G) join.
 	 * Allow reception on exclusive memberships by default,
 	 * reject reception on inclusive memberships by default.
 	 * Exclude source only if an in-mode exclude filter exists.
 	 * Include source only if an in-mode include filter exists.
 	 * NOTE: We are comparing group state here at MLD t1 (now)
 	 * with socket-layer t0 (since last downcall).
 	 */
 	mode = imo->im6o_mfilters[gidx].im6f_st[1];
 	ims = im6o_match_source(imo, gidx, src);
 
 	if ((ims == NULL && mode == MCAST_INCLUDE) ||
 	    (ims != NULL && ims->im6sl_st[0] != mode))
 		return (MCAST_NOTSMEMBER);
 
 	return (MCAST_PASS);
 }
 
 /*
  * Find and return a reference to an in6_multi record for (ifp, group),
  * and bump its reference count.
  * If one does not exist, try to allocate it, and update link-layer multicast
  * filters on ifp to listen for group.
  * Assumes the IN6_MULTI lock is held across the call.
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 in6_getmulti(struct ifnet *ifp, const struct in6_addr *group,
     struct in6_multi **pinm)
 {
 	struct epoch_tracker	 et;
 	struct sockaddr_in6	 gsin6;
 	struct ifmultiaddr	*ifma;
 	struct in6_multi	*inm;
 	int			 error;
 
 	error = 0;
 
 	/*
 	 * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK;
 	 * if_addmulti() takes this mutex itself, so we must drop and
 	 * re-acquire around the call.
 	 */
 	IN6_MULTI_LOCK_ASSERT();
 	IN6_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 	NET_EPOCH_ENTER(et);
 	inm = in6m_lookup_locked(ifp, group);
 	NET_EPOCH_EXIT(et);
 
 	if (inm != NULL) {
 		/*
 		 * If we already joined this group, just bump the
 		 * refcount and return it.
 		 */
 		KASSERT(inm->in6m_refcount >= 1,
 		    ("%s: bad refcount %d", __func__, inm->in6m_refcount));
 		in6m_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	memset(&gsin6, 0, sizeof(gsin6));
 	gsin6.sin6_family = AF_INET6;
 	gsin6.sin6_len = sizeof(struct sockaddr_in6);
 	gsin6.sin6_addr = *group;
 
 	/*
 	 * Check if a link-layer group is already associated
 	 * with this network-layer group on the given ifnet.
 	 */
 	IN6_MULTI_LIST_UNLOCK();
 	IF_ADDR_WUNLOCK(ifp);
 	error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma);
 	if (error != 0)
 		return (error);
 	IN6_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 
 	/*
 	 * If something other than netinet6 is occupying the link-layer
 	 * group, print a meaningful error message and back out of
 	 * the allocation.
 	 * Otherwise, bump the refcount on the existing network-layer
 	 * group association and return it.
 	 */
 	if (ifma->ifma_protospec != NULL) {
 		inm = (struct in6_multi *)ifma->ifma_protospec;
 #ifdef INVARIANTS
 		KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
 		    __func__));
 		KASSERT(ifma->ifma_addr->sa_family == AF_INET6,
 		    ("%s: ifma not AF_INET6", __func__));
 		KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
 		if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp ||
 		    !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group))
 			panic("%s: ifma %p is inconsistent with %p (%p)",
 			    __func__, ifma, inm, group);
 #endif
 		in6m_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	IF_ADDR_WLOCK_ASSERT(ifp);
 
 	/*
 	 * A new in6_multi record is needed; allocate and initialize it.
 	 * We DO NOT perform an MLD join as the in6_ layer may need to
 	 * push an initial source list down to MLD to support SSM.
 	 *
 	 * The initial source filter state is INCLUDE, {} as per the RFC.
 	 * Pending state-changes per group are subject to a bounds check.
 	 */
 	inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO);
 	if (inm == NULL) {
 		IN6_MULTI_LIST_UNLOCK();
 		IF_ADDR_WUNLOCK(ifp);
 		if_delmulti_ifma(ifma);
 		return (ENOMEM);
 	}
 	inm->in6m_addr = *group;
 	inm->in6m_ifp = ifp;
 	inm->in6m_mli = MLD_IFINFO(ifp);
 	inm->in6m_ifma = ifma;
 	inm->in6m_refcount = 1;
 	inm->in6m_state = MLD_NOT_MEMBER;
 	mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES);
 
 	inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED;
 	inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 	RB_INIT(&inm->in6m_srcs);
 
 	ifma->ifma_protospec = inm;
 	*pinm = inm;
 
  out_locked:
 	IN6_MULTI_LIST_UNLOCK();
 	IF_ADDR_WUNLOCK(ifp);
 	return (error);
 }
 
 /*
  * Drop a reference to an in6_multi record.
  *
  * If the refcount drops to 0, free the in6_multi record and
  * delete the underlying link-layer membership.
  */
 static void
 in6m_release(struct in6_multi *inm)
 {
 	struct ifmultiaddr *ifma;
 	struct ifnet *ifp;
 
 	CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount);
 
 	MPASS(inm->in6m_refcount == 0);
 	CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm);
 
 	ifma = inm->in6m_ifma;
 	ifp = inm->in6m_ifp;
 	MPASS(ifma->ifma_llifma == NULL);
 
 	/* XXX this access is not covered by IF_ADDR_LOCK */
 	CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma);
 	KASSERT(ifma->ifma_protospec == NULL,
 	    ("%s: ifma_protospec != NULL", __func__));
 	if (ifp == NULL)
 		ifp = ifma->ifma_ifp;
 
 	if (ifp != NULL) {
 		CURVNET_SET(ifp->if_vnet);
 		in6m_purge(inm);
 		free(inm, M_IP6MADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 		CURVNET_RESTORE();
 		if_rele(ifp);
 	} else {
 		in6m_purge(inm);
 		free(inm, M_IP6MADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 	}
 }
 
 static struct grouptask free_gtask;
 static struct in6_multi_head in6m_free_list;
 static void in6m_release_task(void *arg __unused);
 static void in6m_init(void)
 {
 	SLIST_INIT(&in6m_free_list);
 	taskqgroup_config_gtask_init(NULL, &free_gtask, in6m_release_task, "in6m release task");
 }
 
 #ifdef EARLY_AP_STARTUP
 SYSINIT(in6m_init, SI_SUB_SMP + 1, SI_ORDER_FIRST,
 	in6m_init, NULL);
 #else
 SYSINIT(in6m_init, SI_SUB_ROOT_CONF - 1, SI_ORDER_SECOND,
 	in6m_init, NULL);
 #endif
 
 
 void
 in6m_release_list_deferred(struct in6_multi_head *inmh)
 {
 	if (SLIST_EMPTY(inmh))
 		return;
 	mtx_lock(&in6_multi_free_mtx);
 	SLIST_CONCAT(&in6m_free_list, inmh, in6_multi, in6m_nrele);
 	mtx_unlock(&in6_multi_free_mtx);
 	GROUPTASK_ENQUEUE(&free_gtask);
 }
 
 void
 in6m_release_wait(void)
 {
 
 	/* Wait for all jobs to complete. */
 	gtaskqueue_drain_all(free_gtask.gt_taskqueue);
 }
 
 void
 in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ifa6;
 	struct in6_multi_mship *imm, *imm_tmp;
 	struct ifmultiaddr *ifma, *ll_ifma;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	ifp = inm->in6m_ifp;
 	if (ifp == NULL)
 		return;		/* already called */
 
 	inm->in6m_ifp = NULL;
 	IF_ADDR_WLOCK_ASSERT(ifp);
 	ifma = inm->in6m_ifma;
 	if (ifma == NULL)
 		return;
 
 	if_ref(ifp);
 	if (ifma->ifma_flags & IFMA_F_ENQUEUED) {
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 	}
 	MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname);
 	if ((ll_ifma = ifma->ifma_llifma) != NULL) {
 		MPASS(ifma != ll_ifma);
 		ifma->ifma_llifma = NULL;
 		MPASS(ll_ifma->ifma_llifma == NULL);
 		MPASS(ll_ifma->ifma_ifp == ifp);
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
 				CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link);
 				ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 			}
 			MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname);
 			if_freemulti(ll_ifma);
 		}
 	}
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		ifa6 = (void *)ifa;
 		LIST_FOREACH_SAFE(imm, &ifa6->ia6_memberships,
 		    i6mm_chain, imm_tmp) {
 			if (inm == imm->i6mm_maddr) {
 				LIST_REMOVE(imm, i6mm_chain);
 				free(imm, M_IP6MADDR);
 				in6m_rele_locked(inmh, inm);
 			}
 		}
 	}
 }
 
 static void
 in6m_release_task(void *arg __unused)
 {
 	struct in6_multi_head in6m_free_tmp;
 	struct in6_multi *inm, *tinm;
 
 	SLIST_INIT(&in6m_free_tmp);
 	mtx_lock(&in6_multi_free_mtx);
 	SLIST_CONCAT(&in6m_free_tmp, &in6m_free_list, in6_multi, in6m_nrele);
 	mtx_unlock(&in6_multi_free_mtx);
 	IN6_MULTI_LOCK();
 	SLIST_FOREACH_SAFE(inm, &in6m_free_tmp, in6m_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&in6m_free_tmp, in6m_nrele);
 		in6m_release(inm);
 	}
 	IN6_MULTI_UNLOCK();
 }
 
 /*
  * Clear recorded source entries for a group.
  * Used by the MLD code. Caller must hold the IN6_MULTI lock.
  * FIXME: Should reap.
  */
 void
 in6m_clear_recorded(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 		if (ims->im6s_stp) {
 			ims->im6s_stp = 0;
 			--inm->in6m_st[1].iss_rec;
 		}
 	}
 	KASSERT(inm->in6m_st[1].iss_rec == 0,
 	    ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec));
 }
 
 /*
  * Record a source as pending for a Source-Group MLDv2 query.
  * This lives here as it modifies the shared tree.
  *
  * inm is the group descriptor.
  * naddr is the address of the source to record in network-byte order.
  *
  * If the net.inet6.mld.sgalloc sysctl is non-zero, we will
  * lazy-allocate a source node in response to an SG query.
  * Otherwise, no allocation is performed. This saves some memory
  * with the trade-off that the source will not be reported to the
  * router if joined in the window between the query response and
  * the group actually being joined on the local host.
  *
  * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed.
  * This turns off the allocation of a recorded source entry if
  * the group has not been joined.
  *
  * Return 0 if the source didn't exist or was already marked as recorded.
  * Return 1 if the source was marked as recorded by this function.
  * Return <0 if any error occurred (negated errno code).
  */
 int
 in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	find.im6s_addr = *addr;
 	ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
 	if (ims && ims->im6s_stp)
 		return (0);
 	if (ims == NULL) {
 		if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
 			return (-ENOSPC);
 		nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (-ENOMEM);
 		nims->im6s_addr = find.im6s_addr;
 		RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
 		++inm->in6m_nsrc;
 		ims = nims;
 	}
 
 	/*
 	 * Mark the source as recorded and update the recorded
 	 * source count.
 	 */
 	++ims->im6s_stp;
 	++inm->in6m_st[1].iss_rec;
 
 	return (1);
 }
 
 /*
  * Return a pointer to an in6_msource owned by an in6_mfilter,
  * given its source address.
  * Lazy-allocate if needed. If this is a new entry its filter state is
  * undefined at t0.
  *
  * imf is the filter set being modified.
  * addr is the source address.
  *
  * SMPng: May be called with locks held; malloc must not block.
  */
 static int
 im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin,
     struct in6_msource **plims)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 	struct in6_msource	*lims;
 	int			 error;
 
 	error = 0;
 	ims = NULL;
 	lims = NULL;
 
 	find.im6s_addr = psin->sin6_addr;
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 	lims = (struct in6_msource *)ims;
 	if (lims == NULL) {
 		if (imf->im6f_nsrc == in6_mcast_maxsocksrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		lims = (struct in6_msource *)nims;
 		lims->im6s_addr = find.im6s_addr;
 		lims->im6sl_st[0] = MCAST_UNDEFINED;
 		RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
 		++imf->im6f_nsrc;
 	}
 
 	*plims = lims;
 
 	return (error);
 }
 
 /*
  * Graft a source entry into an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being in the new filter mode at t1.
  *
  * Return the pointer to the new node, otherwise return NULL.
  */
 static struct in6_msource *
 im6f_graft(struct in6_mfilter *imf, const uint8_t st1,
     const struct sockaddr_in6 *psin)
 {
 	struct ip6_msource	*nims;
 	struct in6_msource	*lims;
 
 	nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
 	    M_NOWAIT | M_ZERO);
 	if (nims == NULL)
 		return (NULL);
 	lims = (struct in6_msource *)nims;
 	lims->im6s_addr = psin->sin6_addr;
 	lims->im6sl_st[0] = MCAST_UNDEFINED;
 	lims->im6sl_st[1] = st1;
 	RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
 	++imf->im6f_nsrc;
 
 	return (lims);
 }
 
 /*
  * Prune a source entry from an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being left at t1, it is not freed.
  *
  * Return 0 if no error occurred, otherwise return an errno value.
  */
 static int
 im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	find.im6s_addr = psin->sin6_addr;
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 	if (ims == NULL)
 		return (ENOENT);
 	lims = (struct in6_msource *)ims;
 	lims->im6sl_st[1] = MCAST_UNDEFINED;
 	return (0);
 }
 
 /*
  * Revert socket-layer filter set deltas at t1 to t0 state.
  */
 static void
 im6f_rollback(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *tims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == lims->im6sl_st[1]) {
 			/* no change at t1 */
 			continue;
 		} else if (lims->im6sl_st[0] != MCAST_UNDEFINED) {
 			/* revert change to existing source at t1 */
 			lims->im6sl_st[1] = lims->im6sl_st[0];
 		} else {
 			/* revert source added t1 */
 			CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 			RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 			free(ims, M_IN6MFILTER);
 			imf->im6f_nsrc--;
 		}
 	}
 	imf->im6f_st[1] = imf->im6f_st[0];
 }
 
 /*
  * Mark socket-layer filter set as INCLUDE {} at t1.
  */
 static void
 im6f_leave(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		lims->im6sl_st[1] = MCAST_UNDEFINED;
 	}
 	imf->im6f_st[1] = MCAST_INCLUDE;
 }
 
 /*
  * Mark socket-layer filter set deltas as committed.
  */
 static void
 im6f_commit(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		lims->im6sl_st[0] = lims->im6sl_st[1];
 	}
 	imf->im6f_st[0] = imf->im6f_st[1];
 }
 
 /*
  * Reap unreferenced sources from socket-layer filter set.
  */
 static void
 im6f_reap(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *tims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
 		lims = (struct in6_msource *)ims;
 		if ((lims->im6sl_st[0] == MCAST_UNDEFINED) &&
 		    (lims->im6sl_st[1] == MCAST_UNDEFINED)) {
 			CTR2(KTR_MLD, "%s: free lims %p", __func__, ims);
 			RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 			free(ims, M_IN6MFILTER);
 			imf->im6f_nsrc--;
 		}
 	}
 }
 
 /*
  * Purge socket-layer filter set.
  */
 static void
 im6f_purge(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 		free(ims, M_IN6MFILTER);
 		imf->im6f_nsrc--;
 	}
 	imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED;
 	KASSERT(RB_EMPTY(&imf->im6f_sources),
 	    ("%s: im6f_sources not empty", __func__));
 }
 
 /*
  * Look up a source filter entry for a multicast group.
  *
  * inm is the group descriptor to work with.
  * addr is the IPv6 address to look up.
  * noalloc may be non-zero to suppress allocation of sources.
  * *pims will be set to the address of the retrieved or allocated source.
  *
  * SMPng: NOTE: may be called with locks held.
  * Return 0 if successful, otherwise return a non-zero error code.
  */
 static int
 in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr,
     const int noalloc, struct ip6_msource **pims)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	find.im6s_addr = *addr;
 	ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
 	if (ims == NULL && !noalloc) {
 		if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		nims->im6s_addr = *addr;
 		RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
 		++inm->in6m_nsrc;
 		ims = nims;
 		CTR3(KTR_MLD, "%s: allocated %s as %p", __func__,
 		    ip6_sprintf(ip6tbuf, addr), ims);
 	}
 
 	*pims = ims;
 	return (0);
 }
 
 /*
  * Merge socket-layer source into MLD-layer source.
  * If rollback is non-zero, perform the inverse of the merge.
  */
 static void
 im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims,
     const int rollback)
 {
 	int n = rollback ? -1 : 1;
 #ifdef KTR
 	char ip6tbuf[INET6_ADDRSTRLEN];
 
 	ip6_sprintf(ip6tbuf, &lims->im6s_addr);
 #endif
 
 	if (lims->im6sl_st[0] == MCAST_EXCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].ex -= n;
 	} else if (lims->im6sl_st[0] == MCAST_INCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].in -= n;
 	}
 
 	if (lims->im6sl_st[1] == MCAST_EXCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].ex += n;
 	} else if (lims->im6sl_st[1] == MCAST_INCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].in += n;
 	}
 }
 
 /*
  * Atomically update the global in6_multi state, when a membership's
  * filter list is being updated in any way.
  *
  * imf is the per-inpcb-membership group filter pointer.
  * A fake imf may be passed for in-kernel consumers.
  *
  * XXX This is a candidate for a set-symmetric-difference style loop
  * which would eliminate the repeated lookup from root of ims nodes,
  * as they share the same key space.
  *
  * If any error occurred this function will back out of refcounts
  * and return a non-zero value.
  */
 static int
 in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *nims;
 	struct in6_msource	*lims;
 	int			 schanged, error;
 	int			 nsrc0, nsrc1;
 
 	schanged = 0;
 	error = 0;
 	nsrc1 = nsrc0 = 0;
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	/*
 	 * Update the source filters first, as this may fail.
 	 * Maintain count of in-mode filters at t0, t1. These are
 	 * used to work out if we transition into ASM mode or not.
 	 * Maintain a count of source filters whose state was
 	 * actually modified by this operation.
 	 */
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++;
 		if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++;
 		if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue;
 		error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims);
 		++schanged;
 		if (error)
 			break;
 		im6s_merge(nims, lims, 0);
 	}
 	if (error) {
 		struct ip6_msource *bims;
 
 		RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) {
 			lims = (struct in6_msource *)ims;
 			if (lims->im6sl_st[0] == lims->im6sl_st[1])
 				continue;
 			(void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims);
 			if (bims == NULL)
 				continue;
 			im6s_merge(bims, lims, 1);
 		}
 		goto out_reap;
 	}
 
 	CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1",
 	    __func__, nsrc0, nsrc1);
 
 	/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
 	if (imf->im6f_st[0] == imf->im6f_st[1] &&
 	    imf->im6f_st[1] == MCAST_INCLUDE) {
 		if (nsrc1 == 0) {
 			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
 			--inm->in6m_st[1].iss_in;
 		}
 	}
 
 	/* Handle filter mode transition on socket. */
 	if (imf->im6f_st[0] != imf->im6f_st[1]) {
 		CTR3(KTR_MLD, "%s: imf transition %d to %d",
 		    __func__, imf->im6f_st[0], imf->im6f_st[1]);
 
 		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
 			CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__);
 			--inm->in6m_st[1].iss_ex;
 		} else if (imf->im6f_st[0] == MCAST_INCLUDE) {
 			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
 			--inm->in6m_st[1].iss_in;
 		}
 
 		if (imf->im6f_st[1] == MCAST_EXCLUDE) {
 			CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__);
 			inm->in6m_st[1].iss_ex++;
 		} else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
 			CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__);
 			inm->in6m_st[1].iss_in++;
 		}
 	}
 
 	/*
 	 * Track inm filter state in terms of listener counts.
 	 * If there are any exclusive listeners, stack-wide
 	 * membership is exclusive.
 	 * Otherwise, if only inclusive listeners, stack-wide is inclusive.
 	 * If no listeners remain, state is undefined at t1,
 	 * and the MLD lifecycle for this group should finish.
 	 */
 	if (inm->in6m_st[1].iss_ex > 0) {
 		CTR1(KTR_MLD, "%s: transition to EX", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE;
 	} else if (inm->in6m_st[1].iss_in > 0) {
 		CTR1(KTR_MLD, "%s: transition to IN", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_INCLUDE;
 	} else {
 		CTR1(KTR_MLD, "%s: transition to UNDEF", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 	}
 
 	/* Decrement ASM listener count on transition out of ASM mode. */
 	if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
 		if ((imf->im6f_st[1] != MCAST_EXCLUDE) ||
 		    (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
 			CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__);
 			--inm->in6m_st[1].iss_asm;
 		}
 	}
 
 	/* Increment ASM listener count on transition to ASM mode. */
 	if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
 		CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__);
 		inm->in6m_st[1].iss_asm++;
 	}
 
 	CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm);
 	in6m_print(inm);
 
 out_reap:
 	if (schanged > 0) {
 		CTR1(KTR_MLD, "%s: sources changed; reaping", __func__);
 		in6m_reap(inm);
 	}
 	return (error);
 }
 
 /*
  * Mark an in6_multi's filter set deltas as committed.
  * Called by MLD after a state change has been enqueued.
  */
 void
 in6m_commit(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims;
 
 	CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm);
 	CTR1(KTR_MLD, "%s: pre commit:", __func__);
 	in6m_print(inm);
 
 	RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 		ims->im6s_st[0] = ims->im6s_st[1];
 	}
 	inm->in6m_st[0] = inm->in6m_st[1];
 }
 
 /*
  * Reap unreferenced nodes from an in6_multi's filter set.
  */
 static void
 in6m_reap(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
 		if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 ||
 		    ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 ||
 		    ims->im6s_stp != 0)
 			continue;
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
 		free(ims, M_IP6MSOURCE);
 		inm->in6m_nsrc--;
 	}
 }
 
 /*
  * Purge all source nodes from an in6_multi's filter set.
  */
 static void
 in6m_purge(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
 		free(ims, M_IP6MSOURCE);
 		inm->in6m_nsrc--;
 	}
 	/* Free state-change requests that might be queued. */
 	mbufq_drain(&inm->in6m_scq);
 }
 
 /*
  * Join a multicast address w/o sources.
  * KAME compatibility entry point.
  *
  * SMPng: Assume no mc locks held by caller.
  */
 int
 in6_joingroup(struct ifnet *ifp, const struct in6_addr *mcaddr,
     /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
     const int delay)
 {
 	int error;
 
 	IN6_MULTI_LOCK();
 	error = in6_joingroup_locked(ifp, mcaddr, NULL, pinm, delay);
 	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Join a multicast group; real entry point.
  *
  * Only preserves atomicity at inm level.
  * NOTE: imf argument cannot be const due to sys/tree.h limitations.
  *
  * If the MLD downcall fails, the group is not joined, and an error
  * code is returned.
  */
 int
 in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr,
     /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
     const int delay)
 {
 	struct in6_multi_head    inmh;
 	struct in6_mfilter	 timf;
 	struct in6_multi	*inm;
 	struct ifmultiaddr *ifma;
 	int			 error;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	/*
 	 * Sanity: Check scope zone ID was set for ifp, if and
 	 * only if group is scoped to an interface.
 	 */
 	KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr),
 	    ("%s: not a multicast address", __func__));
 	if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) ||
 	    IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) {
 		KASSERT(mcaddr->s6_addr16[1] != 0,
 		    ("%s: scope zone ID not set", __func__));
 	}
 
 	IN6_MULTI_LOCK_ASSERT();
 	IN6_MULTI_LIST_UNLOCK_ASSERT();
 
 	CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__,
 	    ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp));
 
 	error = 0;
 	inm = NULL;
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		imf = &timf;
 	}
 	error = in6_getmulti(ifp, mcaddr, &inm);
 	if (error) {
 		CTR1(KTR_MLD, "%s: in6_getmulti() failure", __func__);
 		return (error);
 	}
 
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 		goto out_in6m_release;
 	}
 
 	CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 	error = mld_change_state(inm, delay);
 	if (error) {
 		CTR1(KTR_MLD, "%s: failed to update source", __func__);
 		goto out_in6m_release;
 	}
 
 out_in6m_release:
 	SLIST_INIT(&inmh);
 	if (error) {
 		struct epoch_tracker et;
 
 		CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_protospec == inm) {
 				ifma->ifma_protospec = NULL;
 				break;
 			}
 		}
 		in6m_disconnect_locked(&inmh, inm);
 		in6m_rele_locked(&inmh, inm);
 		NET_EPOCH_EXIT(et);
 	} else {
 		*pinm = inm;
 	}
 	IN6_MULTI_LIST_UNLOCK();
 	in6m_release_list_deferred(&inmh);
 	return (error);
 }
 
 /*
  * Leave a multicast group; unlocked entry point.
  */
 int
 in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	int error;
 
 	IN6_MULTI_LOCK();
 	error = in6_leavegroup_locked(inm, imf);
 	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Leave a multicast group; real entry point.
  * All source filters will be expunged.
  *
  * Only preserves atomicity at inm level.
  *
  * Holding the write lock for the INP which contains imf
  * is highly advisable. We can't assert for it as imf does not
  * contain a back-pointer to the owning inp.
  *
  * Note: This is not the same as in6m_release(*) as this function also
  * makes a state change downcall into MLD.
  */
 int
 in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	struct in6_multi_head	 inmh;
 	struct in6_mfilter	 timf;
 	struct ifnet *ifp;
 	int			 error;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	error = 0;
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__,
 	    inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)),
 	    imf);
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
 		imf = &timf;
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 *
 	 * As this particular invocation should not cause any memory
 	 * to be allocated, and there is no opportunity to roll back
 	 * the transaction, it MUST NOT fail.
 	 */
 
 	ifp = inm->in6m_ifp;
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
 
 	CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 	error = 0;
 	if (ifp)
 		error = mld_change_state(inm, 0);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 
 	CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
 	if (ifp)
 		IF_ADDR_WLOCK(ifp);
 
 	SLIST_INIT(&inmh);
 	if (inm->in6m_refcount == 1)
 		in6m_disconnect_locked(&inmh, inm);
 	in6m_rele_locked(&inmh, inm);
 	if (ifp)
 		IF_ADDR_WUNLOCK(ifp);
 	IN6_MULTI_LIST_UNLOCK();
 	in6m_release_list_deferred(&inmh);
 	return (error);
 }
 
 
 /*
  * Block or unblock an ASM multicast source on an inpcb.
  * This implements the delta-based API described in RFC 3678.
  *
  * The delta-based API applies only to exclusive-mode memberships.
  * An MLD downcall will be performed.
  *
  * SMPng: NOTE: Must take Giant as a join may create a new ifma.
  *
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_msource		*ims;
 	struct in6_multi			*inm;
 	size_t				 idx;
 	uint16_t			 fmode;
 	int				 error, doblock;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	ifp = NULL;
 	error = 0;
 	doblock = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 
 	switch (sopt->sopt_name) {
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = sooptcopyin(sopt, &gsr,
 		    sizeof(struct group_source_req),
 		    sizeof(struct group_source_req));
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (ssa->sin6.sin6_family != AF_INET6 ||
 		    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 
 		ifp = ifnet_byindex(gsr.gsr_interface);
 
 		if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
 			doblock = 1;
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	/*
 	 * Check if we are actually a member of this group.
 	 */
 	imo = in6p_findmoptions(inp);
 	idx = im6o_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->im6o_mfilters == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 
 	KASSERT(imo->im6o_mfilters != NULL,
 	    ("%s: im6o_mfilters not allocated", __func__));
 	imf = &imo->im6o_mfilters[idx];
 	inm = imo->im6o_membership[idx];
 
 	/*
 	 * Attempting to use the delta-based API on an
 	 * non exclusive-mode membership is an error.
 	 */
 	fmode = imf->im6f_st[0];
 	if (fmode != MCAST_EXCLUDE) {
 		error = EINVAL;
 		goto out_in6p_locked;
 	}
 
 	/*
 	 * Deal with error cases up-front:
 	 *  Asked to block, but already blocked; or
 	 *  Asked to unblock, but nothing to unblock.
 	 * If adding a new block entry, allocate it.
 	 */
 	ims = im6o_match_source(imo, idx, &ssa->sa);
 	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
 		CTR3(KTR_MLD, "%s: source %s %spresent", __func__,
 		    ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
 		    doblock ? "" : "not ");
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	if (doblock) {
 		CTR2(KTR_MLD, "%s: %s source", __func__, "block");
 		ims = im6f_graft(imf, fmode, &ssa->sin6);
 		if (ims == NULL)
 			error = ENOMEM;
 	} else {
 		CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
 		error = im6f_prune(imf, &ssa->sin6);
 	}
 
 	if (error) {
 		CTR1(KTR_MLD, "%s: merge imf state failed", __func__);
 		goto out_im6f_rollback;
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 	else {
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 	}
 
 	IN6_MULTI_LIST_UNLOCK();
 
 out_im6f_rollback:
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Given an inpcb, return its multicast options structure pointer.  Accepts
  * an unlocked inpcb pointer, but will return it locked.  May sleep.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  * SMPng: NOTE: Returns with the INP write lock held.
  */
 static struct ip6_moptions *
 in6p_findmoptions(struct inpcb *inp)
 {
 	struct ip6_moptions	 *imo;
 	struct in6_multi		**immp;
 	struct in6_mfilter	 *imfp;
 	size_t			  idx;
 
 	INP_WLOCK(inp);
 	if (inp->in6p_moptions != NULL)
 		return (inp->in6p_moptions);
 
 	INP_WUNLOCK(inp);
 
 	imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK);
 	immp = malloc(sizeof(*immp) * IPV6_MIN_MEMBERSHIPS, M_IP6MOPTS,
 	    M_WAITOK | M_ZERO);
 	imfp = malloc(sizeof(struct in6_mfilter) * IPV6_MIN_MEMBERSHIPS,
 	    M_IN6MFILTER, M_WAITOK);
 
 	imo->im6o_multicast_ifp = NULL;
 	imo->im6o_multicast_hlim = V_ip6_defmcasthlim;
 	imo->im6o_multicast_loop = in6_mcast_loop;
 	imo->im6o_num_memberships = 0;
 	imo->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
 	imo->im6o_membership = immp;
 
 	/* Initialize per-group source filters. */
 	for (idx = 0; idx < IPV6_MIN_MEMBERSHIPS; idx++)
 		im6f_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE);
 	imo->im6o_mfilters = imfp;
 
 	INP_WLOCK(inp);
 	if (inp->in6p_moptions != NULL) {
 		free(imfp, M_IN6MFILTER);
 		free(immp, M_IP6MOPTS);
 		free(imo, M_IP6MOPTS);
 		return (inp->in6p_moptions);
 	}
 	inp->in6p_moptions = imo;
 	return (imo);
 }
 
 /*
  * Discard the IPv6 multicast options (and source filters).
  *
  * SMPng: NOTE: assumes INP write lock is held.
  *
  * XXX can all be safely deferred to epoch_call
  *
  */
 
 static void
 inp_gcmoptions(struct ip6_moptions *imo)
 {
 	struct in6_mfilter	*imf;
 	struct in6_multi *inm;
 	struct ifnet *ifp;
 	size_t			 idx, nmships;
 
 	nmships = imo->im6o_num_memberships;
 	for (idx = 0; idx < nmships; ++idx) {
 		imf = imo->im6o_mfilters ? &imo->im6o_mfilters[idx] : NULL;
 		if (imf)
 			im6f_leave(imf);
 		inm = imo->im6o_membership[idx];
 		ifp = inm->in6m_ifp;
 		if (ifp != NULL) {
 			CURVNET_SET(ifp->if_vnet);
 			(void)in6_leavegroup(inm, imf);
 			CURVNET_RESTORE();
 		} else {
 			(void)in6_leavegroup(inm, imf);
 		}
 		if (imf)
 			im6f_purge(imf);
 	}
 
 	if (imo->im6o_mfilters)
 		free(imo->im6o_mfilters, M_IN6MFILTER);
 	free(imo->im6o_membership, M_IP6MOPTS);
 	free(imo, M_IP6MOPTS);
 }
 
 void
 ip6_freemoptions(struct ip6_moptions *imo)
 {
 	if (imo == NULL)
 		return;
 	inp_gcmoptions(imo);
 }
 
 /*
  * Atomically get source filters on a socket for an IPv6 multicast group.
  * Called with INP lock held; returns with lock released.
  */
 static int
 in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct ip6_moptions	*imo;
 	struct in6_mfilter	*imf;
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 	struct sockaddr_in6	*psin;
 	struct sockaddr_storage	*ptss;
 	struct sockaddr_storage	*tss;
 	int			 error;
 	size_t			 idx, nsrcs, ncsrcs;
 
 	INP_WLOCK_ASSERT(inp);
 
 	imo = inp->in6p_moptions;
 	KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__));
 
 	INP_WUNLOCK(inp);
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_group.ss_family != AF_INET6 ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EADDRNOTAVAIL);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	INP_WLOCK(inp);
 
 	/*
 	 * Lookup group on the socket.
 	 */
 	idx = im6o_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->im6o_mfilters == NULL) {
 		INP_WUNLOCK(inp);
 		return (EADDRNOTAVAIL);
 	}
 	imf = &imo->im6o_mfilters[idx];
 
 	/*
 	 * Ignore memberships which are in limbo.
 	 */
 	if (imf->im6f_st[1] == MCAST_UNDEFINED) {
 		INP_WUNLOCK(inp);
 		return (EAGAIN);
 	}
 	msfr.msfr_fmode = imf->im6f_st[1];
 
 	/*
 	 * If the user specified a buffer, copy out the source filter
 	 * entries to userland gracefully.
 	 * We only copy out the number of entries which userland
 	 * has asked for, but we always tell userland how big the
 	 * buffer really needs to be.
 	 */
 	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
 		msfr.msfr_nsrcs = in6_mcast_maxsocksrc;
 	tss = NULL;
 	if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
 		tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_NOWAIT | M_ZERO);
 		if (tss == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Count number of sources in-mode at t0.
 	 * If buffer space exists and remains, copy out source entries.
 	 */
 	nsrcs = msfr.msfr_nsrcs;
 	ncsrcs = 0;
 	ptss = tss;
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == MCAST_UNDEFINED ||
 		    lims->im6sl_st[0] != imf->im6f_st[0])
 			continue;
 		++ncsrcs;
 		if (tss != NULL && nsrcs > 0) {
 			psin = (struct sockaddr_in6 *)ptss;
 			psin->sin6_family = AF_INET6;
 			psin->sin6_len = sizeof(struct sockaddr_in6);
 			psin->sin6_addr = lims->im6s_addr;
 			psin->sin6_port = 0;
 			--nsrcs;
 			++ptss;
 		}
 	}
 
 	INP_WUNLOCK(inp);
 
 	if (tss != NULL) {
 		error = copyout(tss, msfr.msfr_srcs,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		free(tss, M_TEMP);
 		if (error)
 			return (error);
 	}
 
 	msfr.msfr_nsrcs = ncsrcs;
 	error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
 
 	return (error);
 }
 
 /*
  * Return the IP multicast options in response to user getsockopt().
  */
 int
 ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip6_moptions	*im6o;
 	int			 error;
 	u_int			 optval;
 
 	INP_WLOCK(inp);
 	im6o = inp->in6p_moptions;
 	/*
 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
 	 * or is a divert socket, reject it.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = 0;
 	switch (sopt->sopt_name) {
 	case IPV6_MULTICAST_IF:
 		if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) {
 			optval = 0;
 		} else {
 			optval = im6o->im6o_multicast_ifp->if_index;
 		}
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MULTICAST_HOPS:
 		if (im6o == NULL)
 			optval = V_ip6_defmcasthlim;
 		else
 			optval = im6o->im6o_multicast_hlim;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MULTICAST_LOOP:
 		if (im6o == NULL)
 			optval = in6_mcast_loop; /* XXX VIMAGE */
 		else
 			optval = im6o->im6o_multicast_loop;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MSFILTER:
 		if (im6o == NULL) {
 			error = EADDRNOTAVAIL;
 			INP_WUNLOCK(inp);
 		} else {
 			error = in6p_get_source_filters(inp, sopt);
 		}
 		break;
 
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Look up the ifnet to use for a multicast group membership,
  * given the address of an IPv6 group.
  *
  * This routine exists to support legacy IPv6 multicast applications.
  *
  * If inp is non-NULL, use this socket's current FIB number for any
  * required FIB lookup. Look up the group address in the unicast FIB,
  * and use its ifp; usually, this points to the default next-hop.
  * If the FIB lookup fails, return NULL.
  *
  * FUTURE: Support multiple forwarding tables for IPv6.
  *
  * Returns NULL if no ifp could be found.
  */
 static struct ifnet *
 in6p_lookup_mcast_ifp(const struct inpcb *in6p,
     const struct sockaddr_in6 *gsin6)
 {
 	struct nhop6_basic	nh6;
 	struct in6_addr		dst;
 	uint32_t		scopeid;
 	uint32_t		fibnum;
 
 	KASSERT(in6p->inp_vflag & INP_IPV6,
 	    ("%s: not INP_IPV6 inpcb", __func__));
 	KASSERT(gsin6->sin6_family == AF_INET6,
 	    ("%s: not AF_INET6 group", __func__));
 
 	in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid);
 	fibnum = in6p ? in6p->inp_inc.inc_fibnum : RT_DEFAULT_FIB;
 	if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0)
 		return (NULL);
 
 	return (nh6.nh_ifp);
 }
 
 /*
  * Join an IPv6 multicast group, possibly with a source.
  *
  * FIXME: The KAME use of the unspecified address (::)
  * to join *all* multicast groups is currently unsupported.
  */
 static int
 in6p_join_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct in6_multi_head		 inmh;
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_multi		*inm;
 	struct in6_msource		*lims;
 	size_t				 idx;
 	int				 error, is_new;
 
 	SLIST_INIT(&inmh);
 	ifp = NULL;
 	imf = NULL;
 	lims = NULL;
 	error = 0;
 	is_new = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	/*
 	 * Chew everything into struct group_source_req.
 	 * Overwrite the port field if present, as the sockaddr
 	 * being copied in may be matched with a binary comparison.
 	 * Ignore passed-in scope ID.
 	 */
 	switch (sopt->sopt_name) {
 	case IPV6_JOIN_GROUP: {
 		struct ipv6_mreq mreq;
 
 		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
 		    sizeof(struct ipv6_mreq));
 		if (error)
 			return (error);
 
 		gsa->sin6.sin6_family = AF_INET6;
 		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
 
 		if (mreq.ipv6mr_interface == 0) {
 			ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
 		} else {
 			if (V_if_index < mreq.ipv6mr_interface)
 				return (EADDRNOTAVAIL);
 			ifp = ifnet_byindex(mreq.ipv6mr_interface);
 		}
 		CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p",
 		    __func__, mreq.ipv6mr_interface, ifp);
 	} break;
 
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			if (ssa->sin6.sin6_family != AF_INET6 ||
 			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
 				return (EINVAL);
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&ssa->sin6.sin6_addr);
 			ssa->sin6.sin6_port = 0;
 			ssa->sin6.sin6_scope_id = 0;
 		}
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EADDRNOTAVAIL);
 
 	gsa->sin6.sin6_port = 0;
 	gsa->sin6.sin6_scope_id = 0;
 
 	/*
 	 * Always set the scope zone ID on memberships created from userland.
 	 * Use the passed-in ifp to do this.
 	 * XXX The in6_setscope() return value is meaningless.
 	 * XXX SCOPE6_LOCK() is taken by in6_setscope().
 	 */
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
+	IN6_MULTI_LOCK();
 	imo = in6p_findmoptions(inp);
 	idx = im6o_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1) {
 		is_new = 1;
 	} else {
 		inm = imo->im6o_membership[idx];
 		imf = &imo->im6o_mfilters[idx];
 		if (ssa->ss.ss_family != AF_UNSPEC) {
 			/*
 			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
 			 * is an error. On an existing inclusive membership,
 			 * it just adds the source to the filter list.
 			 */
 			if (imf->im6f_st[1] != MCAST_INCLUDE) {
 				error = EINVAL;
 				goto out_in6p_locked;
 			}
 			/*
 			 * Throw out duplicates.
 			 *
 			 * XXX FIXME: This makes a naive assumption that
 			 * even if entries exist for *ssa in this imf,
 			 * they will be rejected as dupes, even if they
 			 * are not valid in the current mode (in-mode).
 			 *
 			 * in6_msource is transactioned just as for anything
 			 * else in SSM -- but note naive use of in6m_graft()
 			 * below for allocating new filter entries.
 			 *
 			 * This is only an issue if someone mixes the
 			 * full-state SSM API with the delta-based API,
 			 * which is discouraged in the relevant RFCs.
 			 */
 			lims = im6o_match_source(imo, idx, &ssa->sa);
 			if (lims != NULL /*&&
 			    lims->im6sl_st[1] == MCAST_INCLUDE*/) {
 				error = EADDRNOTAVAIL;
 				goto out_in6p_locked;
 			}
 		} else {
 			/*
 			 * MCAST_JOIN_GROUP alone, on any existing membership,
 			 * is rejected, to stop the same inpcb tying up
 			 * multiple refs to the in_multi.
 			 * On an existing inclusive membership, this is also
 			 * an error; if you want to change filter mode,
 			 * you must use the userland API setsourcefilter().
 			 * XXX We don't reject this for imf in UNDEFINED
 			 * state at t1, because allocation of a filter
 			 * is atomic with allocation of a membership.
 			 */
 			error = EINVAL;
 			goto out_in6p_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	if (is_new) {
 		if (imo->im6o_num_memberships == imo->im6o_max_memberships) {
 			error = im6o_grow(imo);
 			if (error)
 				goto out_in6p_locked;
 		}
 		/*
 		 * Allocate the new slot upfront so we can deal with
 		 * grafting the new source filter in same code path
 		 * as for join-source on existing membership.
 		 */
 		idx = imo->im6o_num_memberships;
 		imo->im6o_membership[idx] = NULL;
 		imo->im6o_num_memberships++;
 		KASSERT(imo->im6o_mfilters != NULL,
 		    ("%s: im6f_mfilters vector was not allocated", __func__));
 		imf = &imo->im6o_mfilters[idx];
 		KASSERT(RB_EMPTY(&imf->im6f_sources),
 		    ("%s: im6f_sources not empty", __func__));
 	}
 
 	/*
 	 * Graft new source into filter list for this inpcb's
 	 * membership of the group. The in6_multi may not have
 	 * been allocated yet if this is a new membership, however,
 	 * the in_mfilter slot will be allocated and must be initialized.
 	 *
 	 * Note: Grafting of exclusive mode filters doesn't happen
 	 * in this path.
 	 * XXX: Should check for non-NULL lims (node exists but may
 	 * not be in-mode) for interop with full-state API.
 	 */
 	if (ssa->ss.ss_family != AF_UNSPEC) {
 		/* Membership starts in IN mode */
 		if (is_new) {
 			CTR1(KTR_MLD, "%s: new join w/source", __func__);
 			im6f_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
 		} else {
 			CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
 		}
 		lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6);
 		if (lims == NULL) {
 			CTR1(KTR_MLD, "%s: merge imf state failed",
 			    __func__);
 			error = ENOMEM;
 			goto out_im6o_free;
 		}
 	} else {
 		/* No address specified; Membership starts in EX mode */
 		if (is_new) {
 			CTR1(KTR_MLD, "%s: new join w/o source", __func__);
 			im6f_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
-	in_pcbref(inp);
-	INP_WUNLOCK(inp);
-	IN6_MULTI_LOCK();
-
 	if (is_new) {
 		error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf,
 		    &inm, 0);
 		if (error) {
 			IN6_MULTI_UNLOCK();
 			goto out_im6o_free;
 		}
 		/*
 		 * NOTE: Refcount from in6_joingroup_locked()
 		 * is protecting membership.
 		 */
 		imo->im6o_membership[idx] = inm;
 	} else {
 		CTR1(KTR_MLD, "%s: merge inm state", __func__);
 		IN6_MULTI_LIST_LOCK();
 		error = in6m_merge(inm, imf);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed to merge inm state",
 			    __func__);
 		else {
 			CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 			error = mld_change_state(inm, 0);
 			if (error)
 				CTR1(KTR_MLD, "%s: failed mld downcall",
 				    __func__);
 		}
 		IN6_MULTI_LIST_UNLOCK();
 	}
 
-	IN6_MULTI_UNLOCK();
-	INP_WLOCK(inp);
-	if (in_pcbrele_wlocked(inp))
-		return (ENXIO);
 	if (error) {
 		im6f_rollback(imf);
 		if (is_new)
 			im6f_purge(imf);
 		else
 			im6f_reap(imf);
 	} else {
 		im6f_commit(imf);
 	}
 
 out_im6o_free:
 	if (error && is_new) {
 		inm = imo->im6o_membership[idx];
 		if (inm != NULL) {
 			IN6_MULTI_LIST_LOCK();
 			in6m_rele_locked(&inmh, inm);
 			IN6_MULTI_LIST_UNLOCK();
 		}
 		imo->im6o_membership[idx] = NULL;
 		--imo->im6o_num_memberships;
 	}
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
+	IN6_MULTI_UNLOCK();
 	in6m_release_list_deferred(&inmh);
 	return (error);
 }
 
 /*
  * Leave an IPv6 multicast group on an inpcb, possibly with a source.
  */
 static int
 in6p_leave_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ipv6_mreq		 mreq;
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_msource		*ims;
 	struct in6_multi		*inm;
 	uint32_t			 ifindex;
 	size_t				 idx;
 	int				 error, is_final;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	ifp = NULL;
 	ifindex = 0;
 	error = 0;
 	is_final = 1;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	/*
 	 * Chew everything passed in up into a struct group_source_req
 	 * as that is easier to process.
 	 * Note: Any embedded scope ID in the multicast group passed
 	 * in by userland is ignored, the interface index is the recommended
 	 * mechanism to specify an interface; see below.
 	 */
 	switch (sopt->sopt_name) {
 	case IPV6_LEAVE_GROUP:
 		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
 		    sizeof(struct ipv6_mreq));
 		if (error)
 			return (error);
 		gsa->sin6.sin6_family = AF_INET6;
 		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
 		gsa->sin6.sin6_port = 0;
 		gsa->sin6.sin6_scope_id = 0;
 		ifindex = mreq.ipv6mr_interface;
 		break;
 
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			if (ssa->sin6.sin6_family != AF_INET6 ||
 			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
 				return (EINVAL);
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&ssa->sin6.sin6_addr);
 		}
 		gsa->sin6.sin6_port = 0;
 		gsa->sin6.sin6_scope_id = 0;
 		ifindex = gsr.gsr_interface;
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	/*
 	 * Validate interface index if provided. If no interface index
 	 * was provided separately, attempt to look the membership up
 	 * from the default scope as a last resort to disambiguate
 	 * the membership we are being asked to leave.
 	 * XXX SCOPE6 lock potentially taken here.
 	 */
 	if (ifindex != 0) {
 		if (V_if_index < ifindex)
 			return (EADDRNOTAVAIL);
 		ifp = ifnet_byindex(ifindex);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 	} else {
 		error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone);
 		if (error)
 			return (EADDRNOTAVAIL);
 		/*
 		 * Some badly behaved applications don't pass an ifindex
 		 * or a scope ID, which is an API violation. In this case,
 		 * perform a lookup as per a v6 join.
 		 *
 		 * XXX For now, stomp on zone ID for the corner case.
 		 * This is not the 'KAME way', but we need to see the ifp
 		 * directly until such time as this implementation is
 		 * refactored, assuming the scope IDs are the way to go.
 		 */
 		ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]);
 		if (ifindex == 0) {
 			CTR2(KTR_MLD, "%s: warning: no ifindex, looking up "
 			    "ifp for group %s.", __func__,
 			    ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr));
 			ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
 		} else {
 			ifp = ifnet_byindex(ifindex);
 		}
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 	}
 
 	CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp);
 	KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__));
 
 	/*
 	 * Find the membership in the membership array.
 	 */
+	IN6_MULTI_LOCK();
 	imo = in6p_findmoptions(inp);
 	idx = im6o_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 	inm = imo->im6o_membership[idx];
 	imf = &imo->im6o_mfilters[idx];
 
 	if (ssa->ss.ss_family != AF_UNSPEC)
 		is_final = 0;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If we were instructed only to leave a given source, do so.
 	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
 	 */
 	if (is_final) {
 		im6f_leave(imf);
 	} else {
 		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
 			error = EADDRNOTAVAIL;
 			goto out_in6p_locked;
 		}
 		ims = im6o_match_source(imo, idx, &ssa->sa);
 		if (ims == NULL) {
 			CTR3(KTR_MLD, "%s: source %p %spresent", __func__,
 			    ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
 			    "not ");
 			error = EADDRNOTAVAIL;
 			goto out_in6p_locked;
 		}
 		CTR2(KTR_MLD, "%s: %s source", __func__, "block");
 		error = im6f_prune(imf, &ssa->sin6);
 		if (error) {
 			CTR1(KTR_MLD, "%s: merge imf state failed",
 			    __func__);
 			goto out_in6p_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
-	in_pcbref(inp);
-	INP_WUNLOCK(inp);
-	IN6_MULTI_LOCK();
-
 	if (is_final) {
 		/*
 		 * Give up the multicast address record to which
 		 * the membership points.
 		 */
 		(void)in6_leavegroup_locked(inm, imf);
 	} else {
 		CTR1(KTR_MLD, "%s: merge inm state", __func__);
 		IN6_MULTI_LIST_LOCK();
 		error = in6m_merge(inm, imf);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed to merge inm state",
 			    __func__);
 		else {
 			CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 			error = mld_change_state(inm, 0);
 			if (error)
 				CTR1(KTR_MLD, "%s: failed mld downcall",
 				    __func__);
 		}
 		IN6_MULTI_LIST_UNLOCK();
 	}
 
-	IN6_MULTI_UNLOCK();
-	INP_WLOCK(inp);
-	if (in_pcbrele_wlocked(inp))
-		return (ENXIO);
-
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 	if (is_final) {
 		/* Remove the gap in the membership array. */
 		KASSERT(RB_EMPTY(&imf->im6f_sources),
 		    ("%s: im6f_sources not empty", __func__));
 		for (++idx; idx < imo->im6o_num_memberships; ++idx) {
 			imo->im6o_membership[idx - 1] = imo->im6o_membership[idx];
 			imo->im6o_mfilters[idx - 1] = imo->im6o_mfilters[idx];
 		}
 		im6f_init(&imo->im6o_mfilters[idx - 1], MCAST_UNDEFINED,
 		    MCAST_EXCLUDE);
 		imo->im6o_num_memberships--;
 	}
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
+	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Select the interface for transmitting IPv6 multicast datagrams.
  *
  * Either an instance of struct in6_addr or an instance of struct ipv6_mreqn
  * may be passed to this socket option. An address of in6addr_any or an
  * interface index of 0 is used to remove a previous selection.
  * When no interface is selected, one is chosen for every send.
  */
 static int
 in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ifnet		*ifp;
 	struct ip6_moptions	*imo;
 	u_int			 ifindex;
 	int			 error;
 
 	if (sopt->sopt_valsize != sizeof(u_int))
 		return (EINVAL);
 
 	error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int));
 	if (error)
 		return (error);
 	if (V_if_index < ifindex)
 		return (EINVAL);
 	if (ifindex == 0)
 		ifp = NULL;
 	else {
 		ifp = ifnet_byindex(ifindex);
 		if (ifp == NULL)
 			return (EINVAL);
 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
 			return (EADDRNOTAVAIL);
 	}
 	imo = in6p_findmoptions(inp);
 	imo->im6o_multicast_ifp = ifp;
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Atomically set source filters on a socket for an IPv6 multicast group.
- *
- * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  */
 static int
 in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct in6_mfilter	*imf;
 	struct ip6_moptions	*imo;
 	struct in6_multi		*inm;
 	size_t			 idx;
 	int			 error;
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
 		return (ENOBUFS);
 
 	if (msfr.msfr_fmode != MCAST_EXCLUDE &&
 	    msfr.msfr_fmode != MCAST_INCLUDE)
 		return (EINVAL);
 
 	if (msfr.msfr_group.ss_family != AF_INET6 ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	gsa->sin6.sin6_port = 0;	/* ignore port */
 
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EADDRNOTAVAIL);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	/*
 	 * Take the INP write lock.
 	 * Check if this socket is a member of this group.
 	 */
 	imo = in6p_findmoptions(inp);
 	idx = im6o_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->im6o_mfilters == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 	inm = imo->im6o_membership[idx];
 	imf = &imo->im6o_mfilters[idx];
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	imf->im6f_st[1] = msfr.msfr_fmode;
 
 	/*
 	 * Apply any new source filters, if present.
 	 * Make a copy of the user-space source vector so
 	 * that we may copy them with a single copyin. This
 	 * allows us to deal with page faults up-front.
 	 */
 	if (msfr.msfr_nsrcs > 0) {
 		struct in6_msource	*lims;
 		struct sockaddr_in6	*psin;
 		struct sockaddr_storage	*kss, *pkss;
 		int			 i;
 
 		INP_WUNLOCK(inp);
  
 		CTR2(KTR_MLD, "%s: loading %lu source list entries",
 		    __func__, (unsigned long)msfr.msfr_nsrcs);
 		kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_WAITOK);
 		error = copyin(msfr.msfr_srcs, kss,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		if (error) {
 			free(kss, M_TEMP);
 			return (error);
 		}
 
 		INP_WLOCK(inp);
 
 		/*
 		 * Mark all source filters as UNDEFINED at t1.
 		 * Restore new group filter mode, as im6f_leave()
 		 * will set it to INCLUDE.
 		 */
 		im6f_leave(imf);
 		imf->im6f_st[1] = msfr.msfr_fmode;
 
 		/*
 		 * Update socket layer filters at t1, lazy-allocating
 		 * new entries. This saves a bunch of memory at the
 		 * cost of one RB_FIND() per source entry; duplicate
 		 * entries in the msfr_nsrcs vector are ignored.
 		 * If we encounter an error, rollback transaction.
 		 *
 		 * XXX This too could be replaced with a set-symmetric
 		 * difference like loop to avoid walking from root
 		 * every time, as the key space is common.
 		 */
 		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
 			psin = (struct sockaddr_in6 *)pkss;
 			if (psin->sin6_family != AF_INET6) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 			if (psin->sin6_len != sizeof(struct sockaddr_in6)) {
 				error = EINVAL;
 				break;
 			}
 			if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) {
 				error = EINVAL;
 				break;
 			}
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&psin->sin6_addr);
 			error = im6f_get_source(imf, psin, &lims);
 			if (error)
 				break;
 			lims->im6sl_st[1] = imf->im6f_st[1];
 		}
 		free(kss, M_TEMP);
 	}
 
 	if (error)
 		goto out_im6f_rollback;
 
 	INP_WLOCK_ASSERT(inp);
 	IN6_MULTI_LIST_LOCK();
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 	else {
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 	}
 
 	IN6_MULTI_LIST_UNLOCK();
 
 out_im6f_rollback:
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Set the IP multicast options in response to user setsockopt().
  *
  * Many of the socket options handled in this function duplicate the
  * functionality of socket options in the regular unicast API. However,
  * it is not possible to merge the duplicate code, because the idempotence
  * of the IPv6 multicast part of the BSD Sockets API must be preserved;
  * the effects of these options must be treated as separate and distinct.
  *
  * SMPng: XXX: Unlocked read of inp_socket believed OK.
  */
 int
 ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip6_moptions	*im6o;
 	int			 error;
 
 	error = 0;
 
 	/*
 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
 	 * or is a divert socket, reject it.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
 		return (EOPNOTSUPP);
 
 	switch (sopt->sopt_name) {
 	case IPV6_MULTICAST_IF:
 		error = in6p_set_multicast_if(inp, sopt);
 		break;
 
 	case IPV6_MULTICAST_HOPS: {
 		int hlim;
 
 		if (sopt->sopt_valsize != sizeof(int)) {
 			error = EINVAL;
 			break;
 		}
 		error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int));
 		if (error)
 			break;
 		if (hlim < -1 || hlim > 255) {
 			error = EINVAL;
 			break;
 		} else if (hlim == -1) {
 			hlim = V_ip6_defmcasthlim;
 		}
 		im6o = in6p_findmoptions(inp);
 		im6o->im6o_multicast_hlim = hlim;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IPV6_MULTICAST_LOOP: {
 		u_int loop;
 
 		/*
 		 * Set the loopback flag for outgoing multicast packets.
 		 * Must be zero or one.
 		 */
 		if (sopt->sopt_valsize != sizeof(u_int)) {
 			error = EINVAL;
 			break;
 		}
 		error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int));
 		if (error)
 			break;
 		if (loop > 1) {
 			error = EINVAL;
 			break;
 		}
 		im6o = in6p_findmoptions(inp);
 		im6o->im6o_multicast_loop = loop;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IPV6_JOIN_GROUP:
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		error = in6p_join_group(inp, sopt);
 		break;
 
 	case IPV6_LEAVE_GROUP:
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		error = in6p_leave_group(inp, sopt);
 		break;
 
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = in6p_block_unblock_source(inp, sopt);
 		break;
 
 	case IPV6_MSFILTER:
 		error = in6p_set_source_filters(inp, sopt);
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Expose MLD's multicast filter mode and source list(s) to userland,
  * keyed by (ifindex, group).
  * The filter mode is written out as a uint32_t, followed by
  * 0..n of struct in6_addr.
  * For use by ifmcstat(8).
  * SMPng: NOTE: unlocked read of ifindex space.
  */
 static int
 sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_addr			 mcaddr;
 	struct in6_addr			 src;
 	struct epoch_tracker		 et;
 	struct ifnet			*ifp;
 	struct ifmultiaddr		*ifma;
 	struct in6_multi		*inm;
 	struct ip6_msource		*ims;
 	int				*name;
 	int				 retval;
 	u_int				 namelen;
 	uint32_t			 fmode, ifindex;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/* int: ifindex + 4 * 32 bits of IPv6 address */
 	if (namelen != 5)
 		return (EINVAL);
 
 	ifindex = name[0];
 	if (ifindex <= 0 || ifindex > V_if_index) {
 		CTR2(KTR_MLD, "%s: ifindex %u out of range",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 
 	memcpy(&mcaddr, &name[1], sizeof(struct in6_addr));
 	if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) {
 		CTR2(KTR_MLD, "%s: group %s is not multicast",
 		    __func__, ip6_sprintf(ip6tbuf, &mcaddr));
 		return (EINVAL);
 	}
 
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR2(KTR_MLD, "%s: no ifp for ifindex %u",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 	/*
 	 * Internal MLD lookups require that scope/zone ID is set.
 	 */
 	(void)in6_setscope(&mcaddr, ifp, NULL);
 
 	retval = sysctl_wire_old_buffer(req,
 	    sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr)));
 	if (retval)
 		return (retval);
 
 	IN6_MULTI_LOCK();
 	IN6_MULTI_LIST_LOCK();
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		inm = in6m_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr))
 			continue;
 		fmode = inm->in6m_st[1].iss_fmode;
 		retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
 		if (retval != 0)
 			break;
 		RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 			CTR2(KTR_MLD, "%s: visit node %p", __func__, ims);
 			/*
 			 * Only copy-out sources which are in-mode.
 			 */
 			if (fmode != im6s_get_mode(inm, ims, 1)) {
 				CTR1(KTR_MLD, "%s: skip non-in-mode",
 				    __func__);
 				continue;
 			}
 			src = ims->im6s_addr;
 			retval = SYSCTL_OUT(req, &src,
 			    sizeof(struct in6_addr));
 			if (retval != 0)
 				break;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	IN6_MULTI_LIST_UNLOCK();
 	IN6_MULTI_UNLOCK();
 
 	return (retval);
 }
 
 #ifdef KTR
 
 static const char *in6m_modestrs[] = { "un", "in", "ex" };
 
 static const char *
 in6m_mode_str(const int mode)
 {
 
 	if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
 		return (in6m_modestrs[mode]);
 	return ("??");
 }
 
 static const char *in6m_statestrs[] = {
 	"not-member",
 	"silent",
 	"idle",
 	"lazy",
 	"sleeping",
 	"awakening",
 	"query-pending",
 	"sg-query-pending",
 	"leaving"
 };
 
 static const char *
 in6m_state_str(const int state)
 {
 
 	if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER)
 		return (in6m_statestrs[state]);
 	return ("??");
 }
 
 /*
  * Dump an in6_multi structure to the console.
  */
 void
 in6m_print(const struct in6_multi *inm)
 {
 	int t;
 	char ip6tbuf[INET6_ADDRSTRLEN];
 
 	if ((ktr_mask & KTR_MLD) == 0)
 		return;
 
 	printf("%s: --- begin in6m %p ---\n", __func__, inm);
 	printf("addr %s ifp %p(%s) ifma %p\n",
 	    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp,
 	    if_name(inm->in6m_ifp),
 	    inm->in6m_ifma);
 	printf("timer %u state %s refcount %u scq.len %u\n",
 	    inm->in6m_timer,
 	    in6m_state_str(inm->in6m_state),
 	    inm->in6m_refcount,
 	    mbufq_len(&inm->in6m_scq));
 	printf("mli %p nsrc %lu sctimer %u scrv %u\n",
 	    inm->in6m_mli,
 	    inm->in6m_nsrc,
 	    inm->in6m_sctimer,
 	    inm->in6m_scrv);
 	for (t = 0; t < 2; t++) {
 		printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
 		    in6m_mode_str(inm->in6m_st[t].iss_fmode),
 		    inm->in6m_st[t].iss_asm,
 		    inm->in6m_st[t].iss_ex,
 		    inm->in6m_st[t].iss_in,
 		    inm->in6m_st[t].iss_rec);
 	}
 	printf("%s: --- end in6m %p ---\n", __func__, inm);
 }
 
 #else /* !KTR */
 
 void
 in6m_print(const struct in6_multi *inm)
 {
 
 }
 
 #endif /* KTR */
Index: projects/runtime-coverage-v2/sys/sys/param.h
===================================================================
--- projects/runtime-coverage-v2/sys/sys/param.h	(revision 347598)
+++ projects/runtime-coverage-v2/sys/sys/param.h	(revision 347599)
@@ -1,367 +1,367 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)param.h	8.3 (Berkeley) 4/4/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PARAM_H_
 #define _SYS_PARAM_H_
 
 #include <sys/_null.h>
 
 #define	BSD	199506		/* System version (year & month). */
 #define BSD4_3	1
 #define BSD4_4	1
 
 /*
  * __FreeBSD_version numbers are documented in the Porter's Handbook.
  * If you bump the version for any reason, you should update the documentation
  * there.
  * Currently this lives here in the doc/ repository:
  *
  *	head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml
  *
  * scheme is:  <major><two digit minor>Rxx
  *		'R' is in the range 0 to 4 if this is a release branch or
  *		X.0-CURRENT before releng/X.0 is created, otherwise 'R' is
  *		in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1300025	/* Master, propagated to newvers */
+#define __FreeBSD_version 1300026	/* Master, propagated to newvers */
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
  * which by definition is always true on FreeBSD. This macro is also defined
  * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD.
  *
  * It is tempting to use this macro in userland code when we want to enable
  * kernel-specific routines, and in fact it's fine to do this in code that
  * is part of FreeBSD itself.  However, be aware that as presence of this
  * macro is still not widespread (e.g. older FreeBSD versions, 3rd party
  * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in
  * external applications without also checking for __FreeBSD__ as an
  * alternative.
  */
 #undef __FreeBSD_kernel__
 #define __FreeBSD_kernel__
 
 #if defined(_KERNEL) || defined(IN_RTLD)
 #define	P_OSREL_SIGWAIT			700000
 #define	P_OSREL_SIGSEGV			700004
 #define	P_OSREL_MAP_ANON		800104
 #define	P_OSREL_MAP_FSTRICT		1100036
 #define	P_OSREL_SHUTDOWN_ENOTCONN	1100077
 #define	P_OSREL_MAP_GUARD		1200035
 #define	P_OSREL_WRFSBASE		1200041
 #define	P_OSREL_CK_CYLGRP		1200046
 #define	P_OSREL_VMTOTAL64		1200054
 #define	P_OSREL_CK_SUPERBLOCK		1300000
 #define	P_OSREL_CK_INODE		1300005
 
 #define	P_OSREL_MAJOR(x)		((x) / 100000)
 #endif
 
 #ifndef LOCORE
 #include <sys/types.h>
 #endif
 
 /*
  * Machine-independent constants (some used in following include files).
  * Redefined constants are from POSIX 1003.1 limits file.
  *
  * MAXCOMLEN should be >= sizeof(ac_comm) (see <acct.h>)
  */
 #include <sys/syslimits.h>
 
 #define	MAXCOMLEN	19		/* max command name remembered */
 #define	MAXINTERP	PATH_MAX	/* max interpreter file name length */
 #define	MAXLOGNAME	33		/* max login name length (incl. NUL) */
 #define	MAXUPRC		CHILD_MAX	/* max simultaneous processes */
 #define	NCARGS		ARG_MAX		/* max bytes for an exec function */
 #define	NGROUPS		(NGROUPS_MAX+1)	/* max number groups */
 #define	NOFILE		OPEN_MAX	/* max open files per process */
 #define	NOGROUP		65535		/* marker for empty group set member */
 #define MAXHOSTNAMELEN	256		/* max hostname size */
 #define SPECNAMELEN	255		/* max length of devicename */
 
 /* More types and definitions used throughout the kernel. */
 #ifdef _KERNEL
 #include <sys/cdefs.h>
 #include <sys/errno.h>
 #ifndef LOCORE
 #include <sys/time.h>
 #include <sys/priority.h>
 #endif
 
 #ifndef FALSE
 #define	FALSE	0
 #endif
 #ifndef TRUE
 #define	TRUE	1
 #endif
 #endif
 
 #ifndef _KERNEL
 /* Signals. */
 #include <sys/signal.h>
 #endif
 
 /* Machine type dependent parameters. */
 #include <machine/param.h>
 #ifndef _KERNEL
 #include <sys/limits.h>
 #endif
 
 #ifndef DEV_BSHIFT
 #define	DEV_BSHIFT	9		/* log2(DEV_BSIZE) */
 #endif
 #define	DEV_BSIZE	(1<<DEV_BSHIFT)
 
 #ifndef BLKDEV_IOSIZE
 #define BLKDEV_IOSIZE  PAGE_SIZE	/* default block device I/O size */
 #endif
 #ifndef DFLTPHYS
 #define DFLTPHYS	(64 * 1024)	/* default max raw I/O transfer size */
 #endif
 #ifndef MAXPHYS
 #define MAXPHYS		(128 * 1024)	/* max raw I/O transfer size */
 #endif
 #ifndef MAXDUMPPGS
 #define MAXDUMPPGS	(DFLTPHYS/PAGE_SIZE)
 #endif
 
 /*
  * Constants related to network buffer management.
  * MCLBYTES must be no larger than PAGE_SIZE.
  */
 #ifndef	MSIZE
 #define	MSIZE		256		/* size of an mbuf */
 #endif
 
 #ifndef	MCLSHIFT
 #define MCLSHIFT	11		/* convert bytes to mbuf clusters */
 #endif	/* MCLSHIFT */
 
 #define MCLBYTES	(1 << MCLSHIFT)	/* size of an mbuf cluster */
 
 #if PAGE_SIZE < 2048
 #define	MJUMPAGESIZE	MCLBYTES
 #elif PAGE_SIZE <= 8192
 #define	MJUMPAGESIZE	PAGE_SIZE
 #else
 #define	MJUMPAGESIZE	(8 * 1024)
 #endif
 
 #define	MJUM9BYTES	(9 * 1024)	/* jumbo cluster 9k */
 #define	MJUM16BYTES	(16 * 1024)	/* jumbo cluster 16k */
 
 /*
  * Some macros for units conversion
  */
 
 /* clicks to bytes */
 #ifndef ctob
 #define ctob(x)	((x)<<PAGE_SHIFT)
 #endif
 
 /* bytes to clicks */
 #ifndef btoc
 #define btoc(x)	(((vm_offset_t)(x)+PAGE_MASK)>>PAGE_SHIFT)
 #endif
 
 /*
  * btodb() is messy and perhaps slow because `bytes' may be an off_t.  We
  * want to shift an unsigned type to avoid sign extension and we don't
  * want to widen `bytes' unnecessarily.  Assume that the result fits in
  * a daddr_t.
  */
 #ifndef btodb
 #define btodb(bytes)	 		/* calculates (bytes / DEV_BSIZE) */ \
 	(sizeof (bytes) > sizeof(long) \
 	 ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \
 	 : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT))
 #endif
 
 #ifndef dbtob
 #define dbtob(db)			/* calculates (db * DEV_BSIZE) */ \
 	((off_t)(db) << DEV_BSHIFT)
 #endif
 
 #define	PRIMASK	0x0ff
 #define	PCATCH	0x100		/* OR'd with pri for tsleep to check signals */
 #define	PDROP	0x200	/* OR'd with pri to stop re-entry of interlock mutex */
 
 #define	NZERO	0		/* default "nice" */
 
 #define	NBBY	8		/* number of bits in a byte */
 #define	NBPW	sizeof(int)	/* number of bytes per word (integer) */
 
 #define	CMASK	022		/* default file mask: S_IWGRP|S_IWOTH */
 
 #define	NODEV	(dev_t)(-1)	/* non-existent device */
 
 /*
  * File system parameters and macros.
  *
  * MAXBSIZE -	Filesystems are made out of blocks of at most MAXBSIZE bytes
  *		per block.  MAXBSIZE may be made larger without effecting
  *		any existing filesystems as long as it does not exceed MAXPHYS,
  *		and may be made smaller at the risk of not being able to use
  *		filesystems which require a block size exceeding MAXBSIZE.
  *
  * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache.  This must
  *		be >= MAXBSIZE and can be set differently for different
  *		architectures by defining it in <machine/param.h>.
  *		Making this larger allows NFS to do larger reads/writes.
  *
  * BKVASIZE -	Nominal buffer space per buffer, in bytes.  BKVASIZE is the
  *		minimum KVM memory reservation the kernel is willing to make.
  *		Filesystems can of course request smaller chunks.  Actual
  *		backing memory uses a chunk size of a page (PAGE_SIZE).
  *		The default value here can be overridden on a per-architecture
  *		basis by defining it in <machine/param.h>.
  *
  *		If you make BKVASIZE too small you risk seriously fragmenting
  *		the buffer KVM map which may slow things down a bit.  If you
  *		make it too big the kernel will not be able to optimally use
  *		the KVM memory reserved for the buffer cache and will wind
  *		up with too-few buffers.
  *
  *		The default is 16384, roughly 2x the block size used by a
  *		normal UFS filesystem.
  */
 #define MAXBSIZE	65536	/* must be power of 2 */
 #ifndef	MAXBCACHEBUF
 #define	MAXBCACHEBUF	MAXBSIZE /* must be a power of 2 >= MAXBSIZE */
 #endif
 #ifndef	BKVASIZE
 #define BKVASIZE	16384	/* must be power of 2 */
 #endif
 #define BKVAMASK	(BKVASIZE-1)
 
 /*
  * MAXPATHLEN defines the longest permissible path length after expanding
  * symbolic links. It is used to allocate a temporary buffer from the buffer
  * pool in which to do the name expansion, hence should be a power of two,
  * and must be less than or equal to MAXBSIZE.  MAXSYMLINKS defines the
  * maximum number of symbolic links that may be expanded in a path name.
  * It should be set high enough to allow all legitimate uses, but halt
  * infinite loops reasonably quickly.
  */
 #define	MAXPATHLEN	PATH_MAX
 #define MAXSYMLINKS	32
 
 /* Bit map related macros. */
 #define	setbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY))
 #define	clrbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY)))
 #define	isset(a,i)							\
 	(((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY)))
 #define	isclr(a,i)							\
 	((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0)
 
 /* Macros for counting and rounding. */
 #ifndef howmany
 #define	howmany(x, y)	(((x)+((y)-1))/(y))
 #endif
 #define	nitems(x)	(sizeof((x)) / sizeof((x)[0]))
 #define	rounddown(x, y)	(((x)/(y))*(y))
 #define	rounddown2(x, y) ((x)&(~((y)-1)))          /* if y is power of two */
 #define	roundup(x, y)	((((x)+((y)-1))/(y))*(y))  /* to any y */
 #define	roundup2(x, y)	(((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */
 #define powerof2(x)	((((x)-1)&(x))==0)
 
 /* Macros for min/max. */
 #define	MIN(a,b) (((a)<(b))?(a):(b))
 #define	MAX(a,b) (((a)>(b))?(a):(b))
 
 #ifdef _KERNEL
 /*
  * Basic byte order function prototypes for non-inline functions.
  */
 #ifndef LOCORE
 #ifndef _BYTEORDER_PROTOTYPED
 #define	_BYTEORDER_PROTOTYPED
 __BEGIN_DECLS
 __uint32_t	 htonl(__uint32_t);
 __uint16_t	 htons(__uint16_t);
 __uint32_t	 ntohl(__uint32_t);
 __uint16_t	 ntohs(__uint16_t);
 __END_DECLS
 #endif
 #endif
 
 #ifndef _BYTEORDER_FUNC_DEFINED
 #define	_BYTEORDER_FUNC_DEFINED
 #define	htonl(x)	__htonl(x)
 #define	htons(x)	__htons(x)
 #define	ntohl(x)	__ntohl(x)
 #define	ntohs(x)	__ntohs(x)
 #endif /* !_BYTEORDER_FUNC_DEFINED */
 #endif /* _KERNEL */
 
 /*
  * Scale factor for scaled integers used to count %cpu time and load avgs.
  *
  * The number of CPU `tick's that map to a unique `%age' can be expressed
  * by the formula (1 / (2 ^ (FSHIFT - 11))).  The maximum load average that
  * can be calculated (assuming 32 bits) can be closely approximated using
  * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15).
  *
  * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age',
  * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024.
  */
 #define	FSHIFT	11		/* bits to right of fixed binary point */
 #define FSCALE	(1<<FSHIFT)
 
 #define dbtoc(db)			/* calculates devblks to pages */ \
 	((db + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT))
 
 #define ctodb(db)			/* calculates pages to devblks */ \
 	((db) << (PAGE_SHIFT - DEV_BSHIFT))
 
 /*
  * Old spelling of __containerof().
  */
 #define	member2struct(s, m, x)						\
 	((struct s *)(void *)((char *)(x) - offsetof(struct s, m)))
 
 /*
  * Access a variable length array that has been declared as a fixed
  * length array.
  */
 #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset])
 
 #endif	/* _SYS_PARAM_H_ */
Index: projects/runtime-coverage-v2/sys/x86/include/specialreg.h
===================================================================
--- projects/runtime-coverage-v2/sys/x86/include/specialreg.h	(revision 347598)
+++ projects/runtime-coverage-v2/sys/x86/include/specialreg.h	(revision 347599)
@@ -1,1118 +1,1120 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1991 The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)specialreg.h	7.1 (Berkeley) 5/9/91
  * $FreeBSD$
  */
 
 #ifndef _MACHINE_SPECIALREG_H_
 #define	_MACHINE_SPECIALREG_H_
 
 /*
  * Bits in 386 special registers:
  */
 #define	CR0_PE	0x00000001	/* Protected mode Enable */
 #define	CR0_MP	0x00000002	/* "Math" (fpu) Present */
 #define	CR0_EM	0x00000004	/* EMulate FPU instructions. (trap ESC only) */
 #define	CR0_TS	0x00000008	/* Task Switched (if MP, trap ESC and WAIT) */
 #define	CR0_PG	0x80000000	/* PaGing enable */
 
 /*
  * Bits in 486 special registers:
  */
 #define	CR0_NE	0x00000020	/* Numeric Error enable (EX16 vs IRQ13) */
 #define	CR0_WP	0x00010000	/* Write Protect (honor page protect in
 							   all modes) */
 #define	CR0_AM	0x00040000	/* Alignment Mask (set to enable AC flag) */
 #define	CR0_NW  0x20000000	/* Not Write-through */
 #define	CR0_CD  0x40000000	/* Cache Disable */
 
 #define	CR3_PCID_SAVE 0x8000000000000000
 #define	CR3_PCID_MASK 0xfff
 
 /*
  * Bits in PPro special registers
  */
 #define	CR4_VME	0x00000001	/* Virtual 8086 mode extensions */
 #define	CR4_PVI	0x00000002	/* Protected-mode virtual interrupts */
 #define	CR4_TSD	0x00000004	/* Time stamp disable */
 #define	CR4_DE	0x00000008	/* Debugging extensions */
 #define	CR4_PSE	0x00000010	/* Page size extensions */
 #define	CR4_PAE	0x00000020	/* Physical address extension */
 #define	CR4_MCE	0x00000040	/* Machine check enable */
 #define	CR4_PGE	0x00000080	/* Page global enable */
 #define	CR4_PCE	0x00000100	/* Performance monitoring counter enable */
 #define	CR4_FXSR 0x00000200	/* Fast FPU save/restore used by OS */
 #define	CR4_XMM	0x00000400	/* enable SIMD/MMX2 to use except 16 */
 #define	CR4_VMXE 0x00002000	/* enable VMX operation (Intel-specific) */
 #define	CR4_FSGSBASE 0x00010000	/* Enable FS/GS BASE accessing instructions */
 #define	CR4_PCIDE 0x00020000	/* Enable Context ID */
 #define	CR4_XSAVE 0x00040000	/* XSETBV/XGETBV */
 #define	CR4_SMEP 0x00100000	/* Supervisor-Mode Execution Prevention */
 #define	CR4_SMAP 0x00200000	/* Supervisor-Mode Access Prevention */
 #define	CR4_PKE	0x00400000	/* Protection Keys Enable */
 
 /*
  * Bits in AMD64 special registers.  EFER is 64 bits wide.
  */
 #define	EFER_SCE 0x000000001	/* System Call Extensions (R/W) */
 #define	EFER_LME 0x000000100	/* Long mode enable (R/W) */
 #define	EFER_LMA 0x000000400	/* Long mode active (R) */
 #define	EFER_NXE 0x000000800	/* PTE No-Execute bit enable (R/W) */
 #define	EFER_SVM 0x000001000	/* SVM enable bit for AMD, reserved for Intel */
 #define	EFER_LMSLE 0x000002000	/* Long Mode Segment Limit Enable */
 #define	EFER_FFXSR 0x000004000	/* Fast FXSAVE/FSRSTOR */
 #define	EFER_TCE   0x000008000	/* Translation Cache Extension */
 
 /*
  * Intel Extended Features registers
  */
 #define	XCR0	0		/* XFEATURE_ENABLED_MASK register */
 
 #define	XFEATURE_ENABLED_X87		0x00000001
 #define	XFEATURE_ENABLED_SSE		0x00000002
 #define	XFEATURE_ENABLED_YMM_HI128	0x00000004
 #define	XFEATURE_ENABLED_AVX		XFEATURE_ENABLED_YMM_HI128
 #define	XFEATURE_ENABLED_BNDREGS	0x00000008
 #define	XFEATURE_ENABLED_BNDCSR		0x00000010
 #define	XFEATURE_ENABLED_OPMASK		0x00000020
 #define	XFEATURE_ENABLED_ZMM_HI256	0x00000040
 #define	XFEATURE_ENABLED_HI16_ZMM	0x00000080
 
 #define	XFEATURE_AVX					\
     (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX)
 #define	XFEATURE_AVX512						\
     (XFEATURE_ENABLED_OPMASK | XFEATURE_ENABLED_ZMM_HI256 |	\
     XFEATURE_ENABLED_HI16_ZMM)
 #define	XFEATURE_MPX					\
     (XFEATURE_ENABLED_BNDREGS | XFEATURE_ENABLED_BNDCSR)
 
 /*
  * CPUID instruction features register
  */
 #define	CPUID_FPU	0x00000001
 #define	CPUID_VME	0x00000002
 #define	CPUID_DE	0x00000004
 #define	CPUID_PSE	0x00000008
 #define	CPUID_TSC	0x00000010
 #define	CPUID_MSR	0x00000020
 #define	CPUID_PAE	0x00000040
 #define	CPUID_MCE	0x00000080
 #define	CPUID_CX8	0x00000100
 #define	CPUID_APIC	0x00000200
 #define	CPUID_B10	0x00000400
 #define	CPUID_SEP	0x00000800
 #define	CPUID_MTRR	0x00001000
 #define	CPUID_PGE	0x00002000
 #define	CPUID_MCA	0x00004000
 #define	CPUID_CMOV	0x00008000
 #define	CPUID_PAT	0x00010000
 #define	CPUID_PSE36	0x00020000
 #define	CPUID_PSN	0x00040000
 #define	CPUID_CLFSH	0x00080000
 #define	CPUID_B20	0x00100000
 #define	CPUID_DS	0x00200000
 #define	CPUID_ACPI	0x00400000
 #define	CPUID_MMX	0x00800000
 #define	CPUID_FXSR	0x01000000
 #define	CPUID_SSE	0x02000000
 #define	CPUID_XMM	0x02000000
 #define	CPUID_SSE2	0x04000000
 #define	CPUID_SS	0x08000000
 #define	CPUID_HTT	0x10000000
 #define	CPUID_TM	0x20000000
 #define	CPUID_IA64	0x40000000
 #define	CPUID_PBE	0x80000000
 
 #define	CPUID2_SSE3	0x00000001
 #define	CPUID2_PCLMULQDQ 0x00000002
 #define	CPUID2_DTES64	0x00000004
 #define	CPUID2_MON	0x00000008
 #define	CPUID2_DS_CPL	0x00000010
 #define	CPUID2_VMX	0x00000020
 #define	CPUID2_SMX	0x00000040
 #define	CPUID2_EST	0x00000080
 #define	CPUID2_TM2	0x00000100
 #define	CPUID2_SSSE3	0x00000200
 #define	CPUID2_CNXTID	0x00000400
 #define	CPUID2_SDBG	0x00000800
 #define	CPUID2_FMA	0x00001000
 #define	CPUID2_CX16	0x00002000
 #define	CPUID2_XTPR	0x00004000
 #define	CPUID2_PDCM	0x00008000
 #define	CPUID2_PCID	0x00020000
 #define	CPUID2_DCA	0x00040000
 #define	CPUID2_SSE41	0x00080000
 #define	CPUID2_SSE42	0x00100000
 #define	CPUID2_X2APIC	0x00200000
 #define	CPUID2_MOVBE	0x00400000
 #define	CPUID2_POPCNT	0x00800000
 #define	CPUID2_TSCDLT	0x01000000
 #define	CPUID2_AESNI	0x02000000
 #define	CPUID2_XSAVE	0x04000000
 #define	CPUID2_OSXSAVE	0x08000000
 #define	CPUID2_AVX	0x10000000
 #define	CPUID2_F16C	0x20000000
 #define	CPUID2_RDRAND	0x40000000
 #define	CPUID2_HV	0x80000000
 
 /*
  * Important bits in the Thermal and Power Management flags
  * CPUID.6 EAX and ECX.
  */
 #define	CPUTPM1_SENSOR	0x00000001
 #define	CPUTPM1_TURBO	0x00000002
 #define	CPUTPM1_ARAT	0x00000004
 #define	CPUTPM1_HWP	0x00000080
 #define	CPUTPM1_HWP_NOTIFICATION	0x00000100
 #define	CPUTPM1_HWP_ACTIVITY_WINDOW	0x00000200
 #define	CPUTPM1_HWP_PERF_PREF	0x00000400
 #define	CPUTPM1_HWP_PKG	0x00000800
 #define	CPUTPM1_HWP_FLEXIBLE	0x00020000
 #define	CPUTPM2_EFFREQ	0x00000001
 
 /* Intel Processor Trace CPUID. */
 
 /* Leaf 0 ebx. */
 #define	CPUPT_CR3		(1 << 0)	/* CR3 Filtering Support */
 #define	CPUPT_PSB		(1 << 1)	/* Configurable PSB and Cycle-Accurate Mode Supported */
 #define	CPUPT_IPF		(1 << 2)	/* IP Filtering and TraceStop supported */
 #define	CPUPT_MTC		(1 << 3)	/* MTC Supported */
 #define	CPUPT_PRW		(1 << 4)	/* PTWRITE Supported */
 #define	CPUPT_PWR		(1 << 5)	/* Power Event Trace Supported */
 
 /* Leaf 0 ecx. */
 #define	CPUPT_TOPA		(1 << 0)	/* ToPA Output Supported */
 #define	CPUPT_TOPA_MULTI	(1 << 1)	/* ToPA Tables Allow Multiple Output Entries */
 #define	CPUPT_SINGLE		(1 << 2)	/* Single-Range Output Supported */
 #define	CPUPT_TT_OUT		(1 << 3)	/* Output to Trace Transport Subsystem Supported */
 #define	CPUPT_LINEAR_IP		(1 << 31)	/* IP Payloads are Linear IP, otherwise IP is effective */
 
 /* Leaf 1 eax. */
 #define	CPUPT_NADDR_S		0	/* Number of Address Ranges */
 #define	CPUPT_NADDR_M		(0x7 << CPUPT_NADDR_S)
 #define	CPUPT_MTC_BITMAP_S	16	/* Bitmap of supported MTC Period Encodings */
 #define	CPUPT_MTC_BITMAP_M	(0xffff << CPUPT_MTC_BITMAP_S)
 
 /* Leaf 1 ebx. */
 #define	CPUPT_CT_BITMAP_S	0	/* Bitmap of supported Cycle Threshold values */
 #define	CPUPT_CT_BITMAP_M	(0xffff << CPUPT_CT_BITMAP_S)
 #define	CPUPT_PFE_BITMAP_S	16	/* Bitmap of supported Configurable PSB Frequency encoding */
 #define	CPUPT_PFE_BITMAP_M	(0xffff << CPUPT_PFE_BITMAP_S)
 
 /*
  * Important bits in the AMD extended cpuid flags
  */
 #define	AMDID_SYSCALL	0x00000800
 #define	AMDID_MP	0x00080000
 #define	AMDID_NX	0x00100000
 #define	AMDID_EXT_MMX	0x00400000
 #define	AMDID_FFXSR	0x02000000
 #define	AMDID_PAGE1GB	0x04000000
 #define	AMDID_RDTSCP	0x08000000
 #define	AMDID_LM	0x20000000
 #define	AMDID_EXT_3DNOW	0x40000000
 #define	AMDID_3DNOW	0x80000000
 
 #define	AMDID2_LAHF	0x00000001
 #define	AMDID2_CMP	0x00000002
 #define	AMDID2_SVM	0x00000004
 #define	AMDID2_EXT_APIC	0x00000008
 #define	AMDID2_CR8	0x00000010
 #define	AMDID2_ABM	0x00000020
 #define	AMDID2_SSE4A	0x00000040
 #define	AMDID2_MAS	0x00000080
 #define	AMDID2_PREFETCH	0x00000100
 #define	AMDID2_OSVW	0x00000200
 #define	AMDID2_IBS	0x00000400
 #define	AMDID2_XOP	0x00000800
 #define	AMDID2_SKINIT	0x00001000
 #define	AMDID2_WDT	0x00002000
 #define	AMDID2_LWP	0x00008000
 #define	AMDID2_FMA4	0x00010000
 #define	AMDID2_TCE	0x00020000
 #define	AMDID2_NODE_ID	0x00080000
 #define	AMDID2_TBM	0x00200000
 #define	AMDID2_TOPOLOGY	0x00400000
 #define	AMDID2_PCXC	0x00800000
 #define	AMDID2_PNXC	0x01000000
 #define	AMDID2_DBE	0x04000000
 #define	AMDID2_PTSC	0x08000000
 #define	AMDID2_PTSCEL2I	0x10000000
 #define	AMDID2_MWAITX	0x20000000
 
 /*
  * CPUID instruction 1 eax info
  */
 #define	CPUID_STEPPING		0x0000000f
 #define	CPUID_MODEL		0x000000f0
 #define	CPUID_FAMILY		0x00000f00
 #define	CPUID_EXT_MODEL		0x000f0000
 #define	CPUID_EXT_FAMILY	0x0ff00000
 #ifdef __i386__
 #define	CPUID_TO_MODEL(id) \
     ((((id) & CPUID_MODEL) >> 4) | \
     ((((id) & CPUID_FAMILY) >= 0x600) ? \
     (((id) & CPUID_EXT_MODEL) >> 12) : 0))
 #define	CPUID_TO_FAMILY(id) \
     ((((id) & CPUID_FAMILY) >> 8) + \
     ((((id) & CPUID_FAMILY) == 0xf00) ? \
     (((id) & CPUID_EXT_FAMILY) >> 20) : 0))
 #else
 #define	CPUID_TO_MODEL(id) \
     ((((id) & CPUID_MODEL) >> 4) | \
     (((id) & CPUID_EXT_MODEL) >> 12))
 #define	CPUID_TO_FAMILY(id) \
     ((((id) & CPUID_FAMILY) >> 8) + \
     (((id) & CPUID_EXT_FAMILY) >> 20))
 #endif
 
 /*
  * CPUID instruction 1 ebx info
  */
 #define	CPUID_BRAND_INDEX	0x000000ff
 #define	CPUID_CLFUSH_SIZE	0x0000ff00
 #define	CPUID_HTT_CORES		0x00ff0000
 #define	CPUID_LOCAL_APIC_ID	0xff000000
 
 /*
  * CPUID instruction 5 info
  */
 #define	CPUID5_MON_MIN_SIZE	0x0000ffff	/* eax */
 #define	CPUID5_MON_MAX_SIZE	0x0000ffff	/* ebx */
 #define	CPUID5_MON_MWAIT_EXT	0x00000001	/* ecx */
 #define	CPUID5_MWAIT_INTRBREAK	0x00000002	/* ecx */
 
 /*
  * MWAIT cpu power states.  Lower 4 bits are sub-states.
  */
 #define	MWAIT_C0	0xf0
 #define	MWAIT_C1	0x00
 #define	MWAIT_C2	0x10
 #define	MWAIT_C3	0x20
 #define	MWAIT_C4	0x30
 
 /*
  * MWAIT extensions.
  */
 /* Interrupt breaks MWAIT even when masked. */
 #define	MWAIT_INTRBREAK		0x00000001
 
 /*
  * CPUID instruction 6 ecx info
  */
 #define	CPUID_PERF_STAT		0x00000001
 #define	CPUID_PERF_BIAS		0x00000008
 
 /* 
  * CPUID instruction 0xb ebx info.
  */
 #define	CPUID_TYPE_INVAL	0
 #define	CPUID_TYPE_SMT		1
 #define	CPUID_TYPE_CORE		2
 
 /*
  * CPUID instruction 0xd Processor Extended State Enumeration Sub-leaf 1
  */
 #define	CPUID_EXTSTATE_XSAVEOPT	0x00000001
 #define	CPUID_EXTSTATE_XSAVEC	0x00000002
 #define	CPUID_EXTSTATE_XINUSE	0x00000004
 #define	CPUID_EXTSTATE_XSAVES	0x00000008
 
 /*
  * AMD extended function 8000_0007h ebx info
  */
 #define	AMDRAS_MCA_OF_RECOV	0x00000001
 #define	AMDRAS_SUCCOR		0x00000002
 #define	AMDRAS_HW_ASSERT	0x00000004
 #define	AMDRAS_SCALABLE_MCA	0x00000008
 #define	AMDRAS_PFEH_SUPPORT	0x00000010
 
 /*
  * AMD extended function 8000_0007h edx info
  */
 #define	AMDPM_TS		0x00000001
 #define	AMDPM_FID		0x00000002
 #define	AMDPM_VID		0x00000004
 #define	AMDPM_TTP		0x00000008
 #define	AMDPM_TM		0x00000010
 #define	AMDPM_STC		0x00000020
 #define	AMDPM_100MHZ_STEPS	0x00000040
 #define	AMDPM_HW_PSTATE		0x00000080
 #define	AMDPM_TSC_INVARIANT	0x00000100
 #define	AMDPM_CPB		0x00000200
 
 /*
  * AMD extended function 8000_0008h ebx info (amd_extended_feature_extensions)
  */
 #define	AMDFEID_CLZERO		0x00000001
 #define	AMDFEID_IRPERF		0x00000002
 #define	AMDFEID_XSAVEERPTR	0x00000004
 #define	AMDFEID_IBPB		0x00001000
 #define	AMDFEID_IBRS		0x00004000
 #define	AMDFEID_STIBP		0x00008000
 /* The below are only defined if the corresponding base feature above exists. */
 #define	AMDFEID_IBRS_ALWAYSON	0x00010000
 #define	AMDFEID_STIBP_ALWAYSON	0x00020000
 #define	AMDFEID_PREFER_IBRS	0x00040000
 #define	AMDFEID_SSBD		0x01000000
 /* SSBD via MSRC001_011F instead of MSR 0x48: */
 #define	AMDFEID_VIRT_SSBD	0x02000000
 #define	AMDFEID_SSB_NO		0x04000000
 
 /*
  * AMD extended function 8000_0008h ecx info
  */
 #define	AMDID_CMP_CORES		0x000000ff
 #define	AMDID_COREID_SIZE	0x0000f000
 #define	AMDID_COREID_SIZE_SHIFT	12
 
 /*
  * CPUID instruction 7 Structured Extended Features, leaf 0 ebx info
  */
 #define	CPUID_STDEXT_FSGSBASE	0x00000001
 #define	CPUID_STDEXT_TSC_ADJUST	0x00000002
 #define	CPUID_STDEXT_SGX	0x00000004
 #define	CPUID_STDEXT_BMI1	0x00000008
 #define	CPUID_STDEXT_HLE	0x00000010
 #define	CPUID_STDEXT_AVX2	0x00000020
 #define	CPUID_STDEXT_FDP_EXC	0x00000040
 #define	CPUID_STDEXT_SMEP	0x00000080
 #define	CPUID_STDEXT_BMI2	0x00000100
 #define	CPUID_STDEXT_ERMS	0x00000200
 #define	CPUID_STDEXT_INVPCID	0x00000400
 #define	CPUID_STDEXT_RTM	0x00000800
 #define	CPUID_STDEXT_PQM	0x00001000
 #define	CPUID_STDEXT_NFPUSG	0x00002000
 #define	CPUID_STDEXT_MPX	0x00004000
 #define	CPUID_STDEXT_PQE	0x00008000
 #define	CPUID_STDEXT_AVX512F	0x00010000
 #define	CPUID_STDEXT_AVX512DQ	0x00020000
 #define	CPUID_STDEXT_RDSEED	0x00040000
 #define	CPUID_STDEXT_ADX	0x00080000
 #define	CPUID_STDEXT_SMAP	0x00100000
 #define	CPUID_STDEXT_AVX512IFMA	0x00200000
 #define	CPUID_STDEXT_PCOMMIT	0x00400000
 #define	CPUID_STDEXT_CLFLUSHOPT	0x00800000
 #define	CPUID_STDEXT_CLWB	0x01000000
 #define	CPUID_STDEXT_PROCTRACE	0x02000000
 #define	CPUID_STDEXT_AVX512PF	0x04000000
 #define	CPUID_STDEXT_AVX512ER	0x08000000
 #define	CPUID_STDEXT_AVX512CD	0x10000000
 #define	CPUID_STDEXT_SHA	0x20000000
 #define	CPUID_STDEXT_AVX512BW	0x40000000
 #define	CPUID_STDEXT_AVX512VL	0x80000000
 
 /*
  * CPUID instruction 7 Structured Extended Features, leaf 0 ecx info
  */
 #define	CPUID_STDEXT2_PREFETCHWT1 0x00000001
 #define	CPUID_STDEXT2_UMIP	0x00000004
 #define	CPUID_STDEXT2_PKU	0x00000008
 #define	CPUID_STDEXT2_OSPKE	0x00000010
 #define	CPUID_STDEXT2_WAITPKG	0x00000020
 #define	CPUID_STDEXT2_GFNI	0x00000100
 #define	CPUID_STDEXT2_RDPID	0x00400000
 #define	CPUID_STDEXT2_CLDEMOTE	0x02000000
 #define	CPUID_STDEXT2_MOVDIRI	0x08000000
 #define	CPUID_STDEXT2_MOVDIRI64B	0x10000000
 #define	CPUID_STDEXT2_SGXLC	0x40000000
 
 /*
  * CPUID instruction 7 Structured Extended Features, leaf 0 edx info
  */
+#define	CPUID_STDEXT3_MD_CLEAR	0x00000400
 #define	CPUID_STDEXT3_TSXFA	0x00002000
 #define	CPUID_STDEXT3_IBPB	0x04000000
 #define	CPUID_STDEXT3_STIBP	0x08000000
 #define	CPUID_STDEXT3_L1D_FLUSH	0x10000000
 #define	CPUID_STDEXT3_ARCH_CAP	0x20000000
 #define	CPUID_STDEXT3_CORE_CAP	0x40000000
 #define	CPUID_STDEXT3_SSBD	0x80000000
 
 /* MSR IA32_ARCH_CAP(ABILITIES) bits */
 #define	IA32_ARCH_CAP_RDCL_NO	0x00000001
 #define	IA32_ARCH_CAP_IBRS_ALL	0x00000002
 #define	IA32_ARCH_CAP_RSBA	0x00000004
 #define	IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY	0x00000008
 #define	IA32_ARCH_CAP_SSB_NO	0x00000010
+#define	IA32_ARCH_CAP_MDS_NO	0x00000020
 
 /*
  * CPUID manufacturers identifiers
  */
 #define	AMD_VENDOR_ID		"AuthenticAMD"
 #define	CENTAUR_VENDOR_ID	"CentaurHauls"
 #define	CYRIX_VENDOR_ID		"CyrixInstead"
 #define	INTEL_VENDOR_ID		"GenuineIntel"
 #define	NEXGEN_VENDOR_ID	"NexGenDriven"
 #define	NSC_VENDOR_ID		"Geode by NSC"
 #define	RISE_VENDOR_ID		"RiseRiseRise"
 #define	SIS_VENDOR_ID		"SiS SiS SiS "
 #define	TRANSMETA_VENDOR_ID	"GenuineTMx86"
 #define	UMC_VENDOR_ID		"UMC UMC UMC "
 
 /*
  * Model-specific registers for the i386 family
  */
 #define	MSR_P5_MC_ADDR		0x000
 #define	MSR_P5_MC_TYPE		0x001
 #define	MSR_TSC			0x010
 #define	MSR_P5_CESR		0x011
 #define	MSR_P5_CTR0		0x012
 #define	MSR_P5_CTR1		0x013
 #define	MSR_IA32_PLATFORM_ID	0x017
 #define	MSR_APICBASE		0x01b
 #define	MSR_EBL_CR_POWERON	0x02a
 #define	MSR_TEST_CTL		0x033
 #define	MSR_IA32_FEATURE_CONTROL 0x03a
 #define	MSR_IA32_SPEC_CTRL	0x048
 #define	MSR_IA32_PRED_CMD	0x049
 #define	MSR_BIOS_UPDT_TRIG	0x079
 #define	MSR_BBL_CR_D0		0x088
 #define	MSR_BBL_CR_D1		0x089
 #define	MSR_BBL_CR_D2		0x08a
 #define	MSR_BIOS_SIGN		0x08b
 #define	MSR_PERFCTR0		0x0c1
 #define	MSR_PERFCTR1		0x0c2
 #define	MSR_PLATFORM_INFO	0x0ce
 #define	MSR_MPERF		0x0e7
 #define	MSR_APERF		0x0e8
 #define	MSR_IA32_EXT_CONFIG	0x0ee	/* Undocumented. Core Solo/Duo only */
 #define	MSR_MTRRcap		0x0fe
 #define	MSR_IA32_ARCH_CAP	0x10a
 #define	MSR_IA32_FLUSH_CMD	0x10b
 #define	MSR_TSX_FORCE_ABORT	0x10f
 #define	MSR_BBL_CR_ADDR		0x116
 #define	MSR_BBL_CR_DECC		0x118
 #define	MSR_BBL_CR_CTL		0x119
 #define	MSR_BBL_CR_TRIG		0x11a
 #define	MSR_BBL_CR_BUSY		0x11b
 #define	MSR_BBL_CR_CTL3		0x11e
 #define	MSR_SYSENTER_CS_MSR	0x174
 #define	MSR_SYSENTER_ESP_MSR	0x175
 #define	MSR_SYSENTER_EIP_MSR	0x176
 #define	MSR_MCG_CAP		0x179
 #define	MSR_MCG_STATUS		0x17a
 #define	MSR_MCG_CTL		0x17b
 #define	MSR_EVNTSEL0		0x186
 #define	MSR_EVNTSEL1		0x187
 #define	MSR_THERM_CONTROL	0x19a
 #define	MSR_THERM_INTERRUPT	0x19b
 #define	MSR_THERM_STATUS	0x19c
 #define	MSR_IA32_MISC_ENABLE	0x1a0
 #define	MSR_IA32_TEMPERATURE_TARGET	0x1a2
 #define	MSR_TURBO_RATIO_LIMIT	0x1ad
 #define	MSR_TURBO_RATIO_LIMIT1	0x1ae
 #define	MSR_DEBUGCTLMSR		0x1d9
 #define	MSR_LASTBRANCHFROMIP	0x1db
 #define	MSR_LASTBRANCHTOIP	0x1dc
 #define	MSR_LASTINTFROMIP	0x1dd
 #define	MSR_LASTINTTOIP		0x1de
 #define	MSR_ROB_CR_BKUPTMPDR6	0x1e0
 #define	MSR_MTRRVarBase		0x200
 #define	MSR_MTRR64kBase		0x250
 #define	MSR_MTRR16kBase		0x258
 #define	MSR_MTRR4kBase		0x268
 #define	MSR_PAT			0x277
 #define	MSR_MC0_CTL2		0x280
 #define	MSR_MTRRdefType		0x2ff
 #define	MSR_MC0_CTL		0x400
 #define	MSR_MC0_STATUS		0x401
 #define	MSR_MC0_ADDR		0x402
 #define	MSR_MC0_MISC		0x403
 #define	MSR_MC1_CTL		0x404
 #define	MSR_MC1_STATUS		0x405
 #define	MSR_MC1_ADDR		0x406
 #define	MSR_MC1_MISC		0x407
 #define	MSR_MC2_CTL		0x408
 #define	MSR_MC2_STATUS		0x409
 #define	MSR_MC2_ADDR		0x40a
 #define	MSR_MC2_MISC		0x40b
 #define	MSR_MC3_CTL		0x40c
 #define	MSR_MC3_STATUS		0x40d
 #define	MSR_MC3_ADDR		0x40e
 #define	MSR_MC3_MISC		0x40f
 #define	MSR_MC4_CTL		0x410
 #define	MSR_MC4_STATUS		0x411
 #define	MSR_MC4_ADDR		0x412
 #define	MSR_MC4_MISC		0x413
 #define	MSR_RAPL_POWER_UNIT	0x606
 #define	MSR_PKG_ENERGY_STATUS	0x611
 #define	MSR_DRAM_ENERGY_STATUS	0x619
 #define	MSR_PP0_ENERGY_STATUS	0x639
 #define	MSR_PP1_ENERGY_STATUS	0x641
 #define	MSR_PPERF		0x64e
 #define	MSR_TSC_DEADLINE	0x6e0	/* Writes are not serializing */
 #define	MSR_IA32_PM_ENABLE	0x770
 #define	MSR_IA32_HWP_CAPABILITIES	0x771
 #define	MSR_IA32_HWP_REQUEST_PKG	0x772
 #define	MSR_IA32_HWP_INTERRUPT		0x773
 #define	MSR_IA32_HWP_REQUEST	0x774
 #define	MSR_IA32_HWP_STATUS	0x777
 
 /*
  * VMX MSRs
  */
 #define	MSR_VMX_BASIC		0x480
 #define	MSR_VMX_PINBASED_CTLS	0x481
 #define	MSR_VMX_PROCBASED_CTLS	0x482
 #define	MSR_VMX_EXIT_CTLS	0x483
 #define	MSR_VMX_ENTRY_CTLS	0x484
 #define	MSR_VMX_CR0_FIXED0	0x486
 #define	MSR_VMX_CR0_FIXED1	0x487
 #define	MSR_VMX_CR4_FIXED0	0x488
 #define	MSR_VMX_CR4_FIXED1	0x489
 #define	MSR_VMX_PROCBASED_CTLS2	0x48b
 #define	MSR_VMX_EPT_VPID_CAP	0x48c
 #define	MSR_VMX_TRUE_PINBASED_CTLS	0x48d
 #define	MSR_VMX_TRUE_PROCBASED_CTLS	0x48e
 #define	MSR_VMX_TRUE_EXIT_CTLS	0x48f
 #define	MSR_VMX_TRUE_ENTRY_CTLS	0x490
 
 /*
  * X2APIC MSRs.
  * Writes are not serializing.
  */
 #define	MSR_APIC_000		0x800
 #define	MSR_APIC_ID		0x802
 #define	MSR_APIC_VERSION	0x803
 #define	MSR_APIC_TPR		0x808
 #define	MSR_APIC_EOI		0x80b
 #define	MSR_APIC_LDR		0x80d
 #define	MSR_APIC_SVR		0x80f
 #define	MSR_APIC_ISR0		0x810
 #define	MSR_APIC_ISR1		0x811
 #define	MSR_APIC_ISR2		0x812
 #define	MSR_APIC_ISR3		0x813
 #define	MSR_APIC_ISR4		0x814
 #define	MSR_APIC_ISR5		0x815
 #define	MSR_APIC_ISR6		0x816
 #define	MSR_APIC_ISR7		0x817
 #define	MSR_APIC_TMR0		0x818
 #define	MSR_APIC_IRR0		0x820
 #define	MSR_APIC_ESR		0x828
 #define	MSR_APIC_LVT_CMCI	0x82F
 #define	MSR_APIC_ICR		0x830
 #define	MSR_APIC_LVT_TIMER	0x832
 #define	MSR_APIC_LVT_THERMAL	0x833
 #define	MSR_APIC_LVT_PCINT	0x834
 #define	MSR_APIC_LVT_LINT0	0x835
 #define	MSR_APIC_LVT_LINT1	0x836
 #define	MSR_APIC_LVT_ERROR	0x837
 #define	MSR_APIC_ICR_TIMER	0x838
 #define	MSR_APIC_CCR_TIMER	0x839
 #define	MSR_APIC_DCR_TIMER	0x83e
 #define	MSR_APIC_SELF_IPI	0x83f
 
 #define	MSR_IA32_XSS		0xda0
 
 /*
  * Intel Processor Trace (PT) MSRs.
  */
 #define	MSR_IA32_RTIT_OUTPUT_BASE	0x560	/* Trace Output Base Register (R/W) */
 #define	MSR_IA32_RTIT_OUTPUT_MASK_PTRS	0x561	/* Trace Output Mask Pointers Register (R/W) */
 #define	MSR_IA32_RTIT_CTL		0x570	/* Trace Control Register (R/W) */
 #define	 RTIT_CTL_TRACEEN	(1 << 0)
 #define	 RTIT_CTL_CYCEN		(1 << 1)
 #define	 RTIT_CTL_OS		(1 << 2)
 #define	 RTIT_CTL_USER		(1 << 3)
 #define	 RTIT_CTL_PWREVTEN	(1 << 4)
 #define	 RTIT_CTL_FUPONPTW	(1 << 5)
 #define	 RTIT_CTL_FABRICEN	(1 << 6)
 #define	 RTIT_CTL_CR3FILTER	(1 << 7)
 #define	 RTIT_CTL_TOPA		(1 << 8)
 #define	 RTIT_CTL_MTCEN		(1 << 9)
 #define	 RTIT_CTL_TSCEN		(1 << 10)
 #define	 RTIT_CTL_DISRETC	(1 << 11)
 #define	 RTIT_CTL_PTWEN		(1 << 12)
 #define	 RTIT_CTL_BRANCHEN	(1 << 13)
 #define	 RTIT_CTL_MTC_FREQ_S	14
 #define	 RTIT_CTL_MTC_FREQ(n)	((n) << RTIT_CTL_MTC_FREQ_S)
 #define	 RTIT_CTL_MTC_FREQ_M	(0xf << RTIT_CTL_MTC_FREQ_S)
 #define	 RTIT_CTL_CYC_THRESH_S	19
 #define	 RTIT_CTL_CYC_THRESH_M	(0xf << RTIT_CTL_CYC_THRESH_S)
 #define	 RTIT_CTL_PSB_FREQ_S	24
 #define	 RTIT_CTL_PSB_FREQ_M	(0xf << RTIT_CTL_PSB_FREQ_S)
 #define	 RTIT_CTL_ADDR_CFG_S(n) (32 + (n) * 4)
 #define	 RTIT_CTL_ADDR0_CFG_S	32
 #define	 RTIT_CTL_ADDR0_CFG_M	(0xfULL << RTIT_CTL_ADDR0_CFG_S)
 #define	 RTIT_CTL_ADDR1_CFG_S	36
 #define	 RTIT_CTL_ADDR1_CFG_M	(0xfULL << RTIT_CTL_ADDR1_CFG_S)
 #define	 RTIT_CTL_ADDR2_CFG_S	40
 #define	 RTIT_CTL_ADDR2_CFG_M	(0xfULL << RTIT_CTL_ADDR2_CFG_S)
 #define	 RTIT_CTL_ADDR3_CFG_S	44
 #define	 RTIT_CTL_ADDR3_CFG_M	(0xfULL << RTIT_CTL_ADDR3_CFG_S)
 #define	MSR_IA32_RTIT_STATUS		0x571	/* Tracing Status Register (R/W) */
 #define	 RTIT_STATUS_FILTEREN	(1 << 0)
 #define	 RTIT_STATUS_CONTEXTEN	(1 << 1)
 #define	 RTIT_STATUS_TRIGGEREN	(1 << 2)
 #define	 RTIT_STATUS_ERROR	(1 << 4)
 #define	 RTIT_STATUS_STOPPED	(1 << 5)
 #define	 RTIT_STATUS_PACKETBYTECNT_S	32
 #define	 RTIT_STATUS_PACKETBYTECNT_M	(0x1ffffULL << RTIT_STATUS_PACKETBYTECNT_S)
 #define	MSR_IA32_RTIT_CR3_MATCH		0x572	/* Trace Filter CR3 Match Register (R/W) */
 #define	MSR_IA32_RTIT_ADDR_A(n)		(0x580 + (n) * 2)
 #define	MSR_IA32_RTIT_ADDR_B(n)		(0x581 + (n) * 2)
 #define	MSR_IA32_RTIT_ADDR0_A		0x580	/* Region 0 Start Address (R/W) */
 #define	MSR_IA32_RTIT_ADDR0_B		0x581	/* Region 0 End Address (R/W) */
 #define	MSR_IA32_RTIT_ADDR1_A		0x582	/* Region 1 Start Address (R/W) */
 #define	MSR_IA32_RTIT_ADDR1_B		0x583	/* Region 1 End Address (R/W) */
 #define	MSR_IA32_RTIT_ADDR2_A		0x584	/* Region 2 Start Address (R/W) */
 #define	MSR_IA32_RTIT_ADDR2_B		0x585	/* Region 2 End Address (R/W) */
 #define	MSR_IA32_RTIT_ADDR3_A		0x586	/* Region 3 Start Address (R/W) */
 #define	MSR_IA32_RTIT_ADDR3_B		0x587	/* Region 3 End Address (R/W) */
 
 /* Intel Processor Trace Table of Physical Addresses (ToPA). */
 #define	TOPA_SIZE_S	6
 #define	TOPA_SIZE_M	(0xf << TOPA_SIZE_S)
 #define	TOPA_SIZE_4K	(0 << TOPA_SIZE_S)
 #define	TOPA_SIZE_8K	(1 << TOPA_SIZE_S)
 #define	TOPA_SIZE_16K	(2 << TOPA_SIZE_S)
 #define	TOPA_SIZE_32K	(3 << TOPA_SIZE_S)
 #define	TOPA_SIZE_64K	(4 << TOPA_SIZE_S)
 #define	TOPA_SIZE_128K	(5 << TOPA_SIZE_S)
 #define	TOPA_SIZE_256K	(6 << TOPA_SIZE_S)
 #define	TOPA_SIZE_512K	(7 << TOPA_SIZE_S)
 #define	TOPA_SIZE_1M	(8 << TOPA_SIZE_S)
 #define	TOPA_SIZE_2M	(9 << TOPA_SIZE_S)
 #define	TOPA_SIZE_4M	(10 << TOPA_SIZE_S)
 #define	TOPA_SIZE_8M	(11 << TOPA_SIZE_S)
 #define	TOPA_SIZE_16M	(12 << TOPA_SIZE_S)
 #define	TOPA_SIZE_32M	(13 << TOPA_SIZE_S)
 #define	TOPA_SIZE_64M	(14 << TOPA_SIZE_S)
 #define	TOPA_SIZE_128M	(15 << TOPA_SIZE_S)
 #define	TOPA_STOP	(1 << 4)
 #define	TOPA_INT	(1 << 2)
 #define	TOPA_END	(1 << 0)
 
 /*
  * Constants related to MSR's.
  */
 #define	APICBASE_RESERVED	0x000002ff
 #define	APICBASE_BSP		0x00000100
 #define	APICBASE_X2APIC		0x00000400
 #define	APICBASE_ENABLED	0x00000800
 #define	APICBASE_ADDRESS	0xfffff000
 
 /* MSR_IA32_FEATURE_CONTROL related */
 #define	IA32_FEATURE_CONTROL_LOCK	0x01	/* lock bit */
 #define	IA32_FEATURE_CONTROL_SMX_EN	0x02	/* enable VMX inside SMX */
 #define	IA32_FEATURE_CONTROL_VMX_EN	0x04	/* enable VMX outside SMX */
 
 /* MSR IA32_MISC_ENABLE */
 #define	IA32_MISC_EN_FASTSTR	0x0000000000000001ULL
 #define	IA32_MISC_EN_ATCCE	0x0000000000000008ULL
 #define	IA32_MISC_EN_PERFMON	0x0000000000000080ULL
 #define	IA32_MISC_EN_PEBSU	0x0000000000001000ULL
 #define	IA32_MISC_EN_ESSTE	0x0000000000010000ULL
 #define	IA32_MISC_EN_MONE	0x0000000000040000ULL
 #define	IA32_MISC_EN_LIMCPUID	0x0000000000400000ULL
 #define	IA32_MISC_EN_xTPRD	0x0000000000800000ULL
 #define	IA32_MISC_EN_XDD	0x0000000400000000ULL
 
 /*
  * IA32_SPEC_CTRL and IA32_PRED_CMD MSRs are described in the Intel'
  * document 336996-001 Speculative Execution Side Channel Mitigations.
  *
  * AMD uses the same MSRs and bit definitions, as described in 111006-B
  * "Indirect Branch Control Extension" and 124441 "Speculative Store Bypass
  * Disable."
  */
 /* MSR IA32_SPEC_CTRL */
 #define	IA32_SPEC_CTRL_IBRS	0x00000001
 #define	IA32_SPEC_CTRL_STIBP	0x00000002
 #define	IA32_SPEC_CTRL_SSBD	0x00000004
 
 /* MSR IA32_PRED_CMD */
 #define	IA32_PRED_CMD_IBPB_BARRIER	0x0000000000000001ULL
 
 /* MSR IA32_FLUSH_CMD */
 #define	IA32_FLUSH_CMD_L1D	0x00000001
 
 /* MSR IA32_HWP_CAPABILITIES */
 #define	IA32_HWP_CAPABILITIES_HIGHEST_PERFORMANCE(x)	(((x) >> 0) & 0xff)
 #define	IA32_HWP_CAPABILITIES_GUARANTEED_PERFORMANCE(x)	(((x) >> 8) & 0xff)
 #define	IA32_HWP_CAPABILITIES_EFFICIENT_PERFORMANCE(x)	(((x) >> 16) & 0xff)
 #define	IA32_HWP_CAPABILITIES_LOWEST_PERFORMANCE(x)	(((x) >> 24) & 0xff)
 
 /* MSR IA32_HWP_REQUEST */
 #define	IA32_HWP_REQUEST_MINIMUM_VALID			(1ULL << 63)
 #define	IA32_HWP_REQUEST_MAXIMUM_VALID			(1ULL << 62)
 #define	IA32_HWP_REQUEST_DESIRED_VALID			(1ULL << 61)
 #define	IA32_HWP_REQUEST_EPP_VALID 			(1ULL << 60)
 #define	IA32_HWP_REQUEST_ACTIVITY_WINDOW_VALID		(1ULL << 59)
 #define	IA32_HWP_REQUEST_PACKAGE_CONTROL		(1ULL << 42)
 #define	IA32_HWP_ACTIVITY_WINDOW			(0x3ffULL << 32)
 #define	IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE	(0xffULL << 24)
 #define	IA32_HWP_DESIRED_PERFORMANCE			(0xffULL << 16)
 #define	IA32_HWP_REQUEST_MAXIMUM_PERFORMANCE		(0xffULL << 8)
 #define	IA32_HWP_MINIMUM_PERFORMANCE			(0xffULL << 0)
 
 /*
  * PAT modes.
  */
 #define	PAT_UNCACHEABLE		0x00
 #define	PAT_WRITE_COMBINING	0x01
 #define	PAT_WRITE_THROUGH	0x04
 #define	PAT_WRITE_PROTECTED	0x05
 #define	PAT_WRITE_BACK		0x06
 #define	PAT_UNCACHED		0x07
 #define	PAT_VALUE(i, m)		((long long)(m) << (8 * (i)))
 #define	PAT_MASK(i)		PAT_VALUE(i, 0xff)
 
 /*
  * Constants related to MTRRs
  */
 #define	MTRR_UNCACHEABLE	0x00
 #define	MTRR_WRITE_COMBINING	0x01
 #define	MTRR_WRITE_THROUGH	0x04
 #define	MTRR_WRITE_PROTECTED	0x05
 #define	MTRR_WRITE_BACK		0x06
 #define	MTRR_N64K		8	/* numbers of fixed-size entries */
 #define	MTRR_N16K		16
 #define	MTRR_N4K		64
 #define	MTRR_CAP_WC		0x0000000000000400
 #define	MTRR_CAP_FIXED		0x0000000000000100
 #define	MTRR_CAP_VCNT		0x00000000000000ff
 #define	MTRR_DEF_ENABLE		0x0000000000000800
 #define	MTRR_DEF_FIXED_ENABLE	0x0000000000000400
 #define	MTRR_DEF_TYPE		0x00000000000000ff
 #define	MTRR_PHYSBASE_PHYSBASE	0x000ffffffffff000
 #define	MTRR_PHYSBASE_TYPE	0x00000000000000ff
 #define	MTRR_PHYSMASK_PHYSMASK	0x000ffffffffff000
 #define	MTRR_PHYSMASK_VALID	0x0000000000000800
 
 /*
  * Cyrix configuration registers, accessible as IO ports.
  */
 #define	CCR0			0xc0	/* Configuration control register 0 */
 #define	CCR0_NC0		0x01	/* First 64K of each 1M memory region is
 								   non-cacheable */
 #define	CCR0_NC1		0x02	/* 640K-1M region is non-cacheable */
 #define	CCR0_A20M		0x04	/* Enables A20M# input pin */
 #define	CCR0_KEN		0x08	/* Enables KEN# input pin */
 #define	CCR0_FLUSH		0x10	/* Enables FLUSH# input pin */
 #define	CCR0_BARB		0x20	/* Flushes internal cache when entering hold
 								   state */
 #define	CCR0_CO			0x40	/* Cache org: 1=direct mapped, 0=2x set
 								   assoc */
 #define	CCR0_SUSPEND	0x80	/* Enables SUSP# and SUSPA# pins */
 
 #define	CCR1			0xc1	/* Configuration control register 1 */
 #define	CCR1_RPL		0x01	/* Enables RPLSET and RPLVAL# pins */
 #define	CCR1_SMI		0x02	/* Enables SMM pins */
 #define	CCR1_SMAC		0x04	/* System management memory access */
 #define	CCR1_MMAC		0x08	/* Main memory access */
 #define	CCR1_NO_LOCK	0x10	/* Negate LOCK# */
 #define	CCR1_SM3		0x80	/* SMM address space address region 3 */
 
 #define	CCR2			0xc2
 #define	CCR2_WB			0x02	/* Enables WB cache interface pins */
 #define	CCR2_SADS		0x02	/* Slow ADS */
 #define	CCR2_LOCK_NW	0x04	/* LOCK NW Bit */
 #define	CCR2_SUSP_HLT	0x08	/* Suspend on HALT */
 #define	CCR2_WT1		0x10	/* WT region 1 */
 #define	CCR2_WPR1		0x10	/* Write-protect region 1 */
 #define	CCR2_BARB		0x20	/* Flushes write-back cache when entering
 								   hold state. */
 #define	CCR2_BWRT		0x40	/* Enables burst write cycles */
 #define	CCR2_USE_SUSP	0x80	/* Enables suspend pins */
 
 #define	CCR3			0xc3
 #define	CCR3_SMILOCK	0x01	/* SMM register lock */
 #define	CCR3_NMI		0x02	/* Enables NMI during SMM */
 #define	CCR3_LINBRST	0x04	/* Linear address burst cycles */
 #define	CCR3_SMMMODE	0x08	/* SMM Mode */
 #define	CCR3_MAPEN0		0x10	/* Enables Map0 */
 #define	CCR3_MAPEN1		0x20	/* Enables Map1 */
 #define	CCR3_MAPEN2		0x40	/* Enables Map2 */
 #define	CCR3_MAPEN3		0x80	/* Enables Map3 */
 
 #define	CCR4			0xe8
 #define	CCR4_IOMASK		0x07
 #define	CCR4_MEM		0x08	/* Enables momory bypassing */
 #define	CCR4_DTE		0x10	/* Enables directory table entry cache */
 #define	CCR4_FASTFPE	0x20	/* Fast FPU exception */
 #define	CCR4_CPUID		0x80	/* Enables CPUID instruction */
 
 #define	CCR5			0xe9
 #define	CCR5_WT_ALLOC	0x01	/* Write-through allocate */
 #define	CCR5_SLOP		0x02	/* LOOP instruction slowed down */
 #define	CCR5_LBR1		0x10	/* Local bus region 1 */
 #define	CCR5_ARREN		0x20	/* Enables ARR region */
 
 #define	CCR6			0xea
 
 #define	CCR7			0xeb
 
 /* Performance Control Register (5x86 only). */
 #define	PCR0			0x20
 #define	PCR0_RSTK		0x01	/* Enables return stack */
 #define	PCR0_BTB		0x02	/* Enables branch target buffer */
 #define	PCR0_LOOP		0x04	/* Enables loop */
 #define	PCR0_AIS		0x08	/* Enables all instrcutions stalled to
 								   serialize pipe. */
 #define	PCR0_MLR		0x10	/* Enables reordering of misaligned loads */
 #define	PCR0_BTBRT		0x40	/* Enables BTB test register. */
 #define	PCR0_LSSER		0x80	/* Disable reorder */
 
 /* Device Identification Registers */
 #define	DIR0			0xfe
 #define	DIR1			0xff
 
 /*
  * Machine Check register constants.
  */
 #define	MCG_CAP_COUNT		0x000000ff
 #define	MCG_CAP_CTL_P		0x00000100
 #define	MCG_CAP_EXT_P		0x00000200
 #define	MCG_CAP_CMCI_P		0x00000400
 #define	MCG_CAP_TES_P		0x00000800
 #define	MCG_CAP_EXT_CNT		0x00ff0000
 #define	MCG_CAP_SER_P		0x01000000
 #define	MCG_STATUS_RIPV		0x00000001
 #define	MCG_STATUS_EIPV		0x00000002
 #define	MCG_STATUS_MCIP		0x00000004
 #define	MCG_CTL_ENABLE		0xffffffffffffffff
 #define	MCG_CTL_DISABLE		0x0000000000000000
 #define	MSR_MC_CTL(x)		(MSR_MC0_CTL + (x) * 4)
 #define	MSR_MC_STATUS(x)	(MSR_MC0_STATUS + (x) * 4)
 #define	MSR_MC_ADDR(x)		(MSR_MC0_ADDR + (x) * 4)
 #define	MSR_MC_MISC(x)		(MSR_MC0_MISC + (x) * 4)
 #define	MSR_MC_CTL2(x)		(MSR_MC0_CTL2 + (x))	/* If MCG_CAP_CMCI_P */
 #define	MC_STATUS_MCA_ERROR	0x000000000000ffff
 #define	MC_STATUS_MODEL_ERROR	0x00000000ffff0000
 #define	MC_STATUS_OTHER_INFO	0x01ffffff00000000
 #define	MC_STATUS_COR_COUNT	0x001fffc000000000	/* If MCG_CAP_CMCI_P */
 #define	MC_STATUS_TES_STATUS	0x0060000000000000	/* If MCG_CAP_TES_P */
 #define	MC_STATUS_AR		0x0080000000000000	/* If MCG_CAP_TES_P */
 #define	MC_STATUS_S		0x0100000000000000	/* If MCG_CAP_TES_P */
 #define	MC_STATUS_PCC		0x0200000000000000
 #define	MC_STATUS_ADDRV		0x0400000000000000
 #define	MC_STATUS_MISCV		0x0800000000000000
 #define	MC_STATUS_EN		0x1000000000000000
 #define	MC_STATUS_UC		0x2000000000000000
 #define	MC_STATUS_OVER		0x4000000000000000
 #define	MC_STATUS_VAL		0x8000000000000000
 #define	MC_MISC_RA_LSB		0x000000000000003f	/* If MCG_CAP_SER_P */
 #define	MC_MISC_ADDRESS_MODE	0x00000000000001c0	/* If MCG_CAP_SER_P */
 #define	MC_CTL2_THRESHOLD	0x0000000000007fff
 #define	MC_CTL2_CMCI_EN		0x0000000040000000
 #define	MC_AMDNB_BANK		4
 #define	MC_MISC_AMD_VAL		0x8000000000000000	/* Counter presence valid */
 #define	MC_MISC_AMD_CNTP	0x4000000000000000	/* Counter present */
 #define	MC_MISC_AMD_LOCK	0x2000000000000000	/* Register locked */
 #define	MC_MISC_AMD_INTP	0x1000000000000000	/* Int. type can generate interrupts */
 #define	MC_MISC_AMD_LVT_MASK	0x00f0000000000000	/* Extended LVT offset */
 #define	MC_MISC_AMD_LVT_SHIFT	52
 #define	MC_MISC_AMD_CNTEN	0x0008000000000000	/* Counter enabled */
 #define	MC_MISC_AMD_INT_MASK	0x0006000000000000	/* Interrupt type */
 #define	MC_MISC_AMD_INT_LVT	0x0002000000000000	/* Interrupt via Extended LVT */
 #define	MC_MISC_AMD_INT_SMI	0x0004000000000000	/* SMI */
 #define	MC_MISC_AMD_OVERFLOW	0x0001000000000000	/* Counter overflow */
 #define	MC_MISC_AMD_CNT_MASK	0x00000fff00000000	/* Counter value */
 #define	MC_MISC_AMD_CNT_SHIFT	32
 #define	MC_MISC_AMD_CNT_MAX	0xfff
 #define	MC_MISC_AMD_PTR_MASK	0x00000000ff000000	/* Pointer to additional registers */
 #define	MC_MISC_AMD_PTR_SHIFT	24
 
 /*
  * The following four 3-byte registers control the non-cacheable regions.
  * These registers must be written as three separate bytes.
  *
  * NCRx+0: A31-A24 of starting address
  * NCRx+1: A23-A16 of starting address
  * NCRx+2: A15-A12 of starting address | NCR_SIZE_xx.
  *
  * The non-cacheable region's starting address must be aligned to the
  * size indicated by the NCR_SIZE_xx field.
  */
 #define	NCR1	0xc4
 #define	NCR2	0xc7
 #define	NCR3	0xca
 #define	NCR4	0xcd
 
 #define	NCR_SIZE_0K	0
 #define	NCR_SIZE_4K	1
 #define	NCR_SIZE_8K	2
 #define	NCR_SIZE_16K	3
 #define	NCR_SIZE_32K	4
 #define	NCR_SIZE_64K	5
 #define	NCR_SIZE_128K	6
 #define	NCR_SIZE_256K	7
 #define	NCR_SIZE_512K	8
 #define	NCR_SIZE_1M	9
 #define	NCR_SIZE_2M	10
 #define	NCR_SIZE_4M	11
 #define	NCR_SIZE_8M	12
 #define	NCR_SIZE_16M	13
 #define	NCR_SIZE_32M	14
 #define	NCR_SIZE_4G	15
 
 /*
  * The address region registers are used to specify the location and
  * size for the eight address regions.
  *
  * ARRx + 0: A31-A24 of start address
  * ARRx + 1: A23-A16 of start address
  * ARRx + 2: A15-A12 of start address | ARR_SIZE_xx
  */
 #define	ARR0	0xc4
 #define	ARR1	0xc7
 #define	ARR2	0xca
 #define	ARR3	0xcd
 #define	ARR4	0xd0
 #define	ARR5	0xd3
 #define	ARR6	0xd6
 #define	ARR7	0xd9
 
 #define	ARR_SIZE_0K		0
 #define	ARR_SIZE_4K		1
 #define	ARR_SIZE_8K		2
 #define	ARR_SIZE_16K	3
 #define	ARR_SIZE_32K	4
 #define	ARR_SIZE_64K	5
 #define	ARR_SIZE_128K	6
 #define	ARR_SIZE_256K	7
 #define	ARR_SIZE_512K	8
 #define	ARR_SIZE_1M		9
 #define	ARR_SIZE_2M		10
 #define	ARR_SIZE_4M		11
 #define	ARR_SIZE_8M		12
 #define	ARR_SIZE_16M	13
 #define	ARR_SIZE_32M	14
 #define	ARR_SIZE_4G		15
 
 /*
  * The region control registers specify the attributes associated with
  * the ARRx addres regions.
  */
 #define	RCR0	0xdc
 #define	RCR1	0xdd
 #define	RCR2	0xde
 #define	RCR3	0xdf
 #define	RCR4	0xe0
 #define	RCR5	0xe1
 #define	RCR6	0xe2
 #define	RCR7	0xe3
 
 #define	RCR_RCD	0x01	/* Disables caching for ARRx (x = 0-6). */
 #define	RCR_RCE	0x01	/* Enables caching for ARR7. */
 #define	RCR_WWO	0x02	/* Weak write ordering. */
 #define	RCR_WL	0x04	/* Weak locking. */
 #define	RCR_WG	0x08	/* Write gathering. */
 #define	RCR_WT	0x10	/* Write-through. */
 #define	RCR_NLB	0x20	/* LBA# pin is not asserted. */
 
 /* AMD Write Allocate Top-Of-Memory and Control Register */
 #define	AMD_WT_ALLOC_TME	0x40000	/* top-of-memory enable */
 #define	AMD_WT_ALLOC_PRE	0x20000	/* programmable range enable */
 #define	AMD_WT_ALLOC_FRE	0x10000	/* fixed (A0000-FFFFF) range enable */
 
 /* AMD64 MSR's */
 #define	MSR_EFER	0xc0000080	/* extended features */
 #define	MSR_STAR	0xc0000081	/* legacy mode SYSCALL target/cs/ss */
 #define	MSR_LSTAR	0xc0000082	/* long mode SYSCALL target rip */
 #define	MSR_CSTAR	0xc0000083	/* compat mode SYSCALL target rip */
 #define	MSR_SF_MASK	0xc0000084	/* syscall flags mask */
 #define	MSR_FSBASE	0xc0000100	/* base address of the %fs "segment" */
 #define	MSR_GSBASE	0xc0000101	/* base address of the %gs "segment" */
 #define	MSR_KGSBASE	0xc0000102	/* base address of the kernel %gs */
 #define	MSR_TSC_AUX	0xc0000103
 #define	MSR_PERFEVSEL0	0xc0010000
 #define	MSR_PERFEVSEL1	0xc0010001
 #define	MSR_PERFEVSEL2	0xc0010002
 #define	MSR_PERFEVSEL3	0xc0010003
 #define	MSR_K7_PERFCTR0	0xc0010004
 #define	MSR_K7_PERFCTR1	0xc0010005
 #define	MSR_K7_PERFCTR2	0xc0010006
 #define	MSR_K7_PERFCTR3	0xc0010007
 #define	MSR_SYSCFG	0xc0010010
 #define	MSR_HWCR	0xc0010015
 #define	MSR_IORRBASE0	0xc0010016
 #define	MSR_IORRMASK0	0xc0010017
 #define	MSR_IORRBASE1	0xc0010018
 #define	MSR_IORRMASK1	0xc0010019
 #define	MSR_TOP_MEM	0xc001001a	/* boundary for ram below 4G */
 #define	MSR_TOP_MEM2	0xc001001d	/* boundary for ram above 4G */
 #define	MSR_NB_CFG1	0xc001001f	/* NB configuration 1 */
 #define	MSR_K8_UCODE_UPDATE 0xc0010020	/* update microcode */
 #define	MSR_MC0_CTL_MASK 0xc0010044
 #define	MSR_P_STATE_LIMIT 0xc0010061	/* P-state Current Limit Register */
 #define	MSR_P_STATE_CONTROL 0xc0010062	/* P-state Control Register */
 #define	MSR_P_STATE_STATUS 0xc0010063	/* P-state Status Register */
 #define	MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */
 #define	MSR_SMM_ADDR	0xc0010112	/* SMM TSEG base address */
 #define	MSR_SMM_MASK	0xc0010113	/* SMM TSEG address mask */
 #define	MSR_VM_CR	0xc0010114	/* SVM: feature control */
 #define	MSR_VM_HSAVE_PA 0xc0010117	/* SVM: host save area address */
 #define	MSR_AMD_CPUID07	0xc0011002	/* CPUID 07 %ebx override */
 #define	MSR_EXTFEATURES	0xc0011005	/* Extended CPUID Features override */
 #define	MSR_IC_CFG	0xc0011021	/* Instruction Cache Configuration */
 
 /* MSR_VM_CR related */
 #define	VM_CR_SVMDIS		0x10	/* SVM: disabled by BIOS */
 
 /* VIA ACE crypto featureset: for via_feature_rng */
 #define	VIA_HAS_RNG		1	/* cpu has RNG */
 
 /* VIA ACE crypto featureset: for via_feature_xcrypt */
 #define	VIA_HAS_AES		1	/* cpu has AES */
 #define	VIA_HAS_SHA		2	/* cpu has SHA1 & SHA256 */
 #define	VIA_HAS_MM		4	/* cpu has RSA instructions */
 #define	VIA_HAS_AESCTR		8	/* cpu has AES-CTR instructions */
 
 /* Centaur Extended Feature flags */
 #define	VIA_CPUID_HAS_RNG	0x000004
 #define	VIA_CPUID_DO_RNG	0x000008
 #define	VIA_CPUID_HAS_ACE	0x000040
 #define	VIA_CPUID_DO_ACE	0x000080
 #define	VIA_CPUID_HAS_ACE2	0x000100
 #define	VIA_CPUID_DO_ACE2	0x000200
 #define	VIA_CPUID_HAS_PHE	0x000400
 #define	VIA_CPUID_DO_PHE	0x000800
 #define	VIA_CPUID_HAS_PMM	0x001000
 #define	VIA_CPUID_DO_PMM	0x002000
 
 /* VIA ACE xcrypt-* instruction context control options */
 #define	VIA_CRYPT_CWLO_ROUND_M		0x0000000f
 #define	VIA_CRYPT_CWLO_ALG_M		0x00000070
 #define	VIA_CRYPT_CWLO_ALG_AES		0x00000000
 #define	VIA_CRYPT_CWLO_KEYGEN_M		0x00000080
 #define	VIA_CRYPT_CWLO_KEYGEN_HW	0x00000000
 #define	VIA_CRYPT_CWLO_KEYGEN_SW	0x00000080
 #define	VIA_CRYPT_CWLO_NORMAL		0x00000000
 #define	VIA_CRYPT_CWLO_INTERMEDIATE	0x00000100
 #define	VIA_CRYPT_CWLO_ENCRYPT		0x00000000
 #define	VIA_CRYPT_CWLO_DECRYPT		0x00000200
 #define	VIA_CRYPT_CWLO_KEY128		0x0000000a	/* 128bit, 10 rds */
 #define	VIA_CRYPT_CWLO_KEY192		0x0000040c	/* 192bit, 12 rds */
 #define	VIA_CRYPT_CWLO_KEY256		0x0000080e	/* 256bit, 15 rds */
 
 #endif /* !_MACHINE_SPECIALREG_H_ */
Index: projects/runtime-coverage-v2/sys/x86/include/x86_var.h
===================================================================
--- projects/runtime-coverage-v2/sys/x86/include/x86_var.h	(revision 347598)
+++ projects/runtime-coverage-v2/sys/x86/include/x86_var.h	(revision 347599)
@@ -1,142 +1,144 @@
 /*-
  * Copyright (c) 1995 Bruce D. Evans.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _X86_X86_VAR_H_
 #define	_X86_X86_VAR_H_
 
 /*
  * Miscellaneous machine-dependent declarations.
  */
 
 extern	long	Maxmem;
 extern	u_int	basemem;
 extern	int	busdma_swi_pending;
 extern	u_int	cpu_exthigh;
 extern	u_int	cpu_feature;
 extern	u_int	cpu_feature2;
 extern	u_int	amd_feature;
 extern	u_int	amd_feature2;
 extern	u_int	amd_rascap;
 extern	u_int	amd_pminfo;
 extern	u_int	amd_extended_feature_extensions;
 extern	u_int	via_feature_rng;
 extern	u_int	via_feature_xcrypt;
 extern	u_int	cpu_clflush_line_size;
 extern	u_int	cpu_stdext_feature;
 extern	u_int	cpu_stdext_feature2;
 extern	u_int	cpu_stdext_feature3;
 extern	uint64_t cpu_ia32_arch_caps;
 extern	u_int	cpu_fxsr;
 extern	u_int	cpu_high;
 extern	u_int	cpu_id;
 extern	u_int	cpu_max_ext_state_size;
 extern	u_int	cpu_mxcsr_mask;
 extern	u_int	cpu_procinfo;
 extern	u_int	cpu_procinfo2;
 extern	char	cpu_vendor[];
 extern	u_int	cpu_vendor_id;
 extern	u_int	cpu_mon_mwait_flags;
 extern	u_int	cpu_mon_min_size;
 extern	u_int	cpu_mon_max_size;
 extern	u_int	cpu_maxphyaddr;
 extern	char	ctx_switch_xsave[];
 extern	u_int	hv_high;
 extern	char	hv_vendor[];
 extern	char	kstack[];
 extern	char	sigcode[];
 extern	int	szsigcode;
 extern	int	vm_page_dump_size;
 extern	int	workaround_erratum383;
 extern	int	_udatasel;
 extern	int	_ucodesel;
 extern	int	_ucode32sel;
 extern	int	_ufssel;
 extern	int	_ugssel;
 extern	int	use_xsave;
 extern	uint64_t xsave_mask;
 extern	u_int	max_apic_id;
 extern	int	i386_read_exec;
 extern	int	pti;
 extern	int	hw_ibrs_active;
+extern	int	hw_mds_disable;
 extern	int	hw_ssb_active;
 
 struct	pcb;
 struct	thread;
 struct	reg;
 struct	fpreg;
 struct  dbreg;
 struct	dumperinfo;
 struct	trapframe;
 
 /*
  * The interface type of the interrupt handler entry point cannot be
  * expressed in C.  Use simplest non-variadic function type as an
  * approximation.
  */
 typedef void alias_for_inthand_t(void);
 
 bool	acpi_get_fadt_bootflags(uint16_t *flagsp);
 void	*alloc_fpusave(int flags);
 void	busdma_swi(void);
 vm_paddr_t cpu_getmaxphyaddr(void);
 bool	cpu_mwait_usable(void);
 void	cpu_probe_amdc1e(void);
 void	cpu_setregs(void);
 bool	disable_wp(void);
 void	restore_wp(bool old_wp);
 void	dump_add_page(vm_paddr_t);
 void	dump_drop_page(vm_paddr_t);
 void	finishidentcpu(void);
 void	identify_cpu1(void);
 void	identify_cpu2(void);
 void	identify_cpu_fixup_bsp(void);
 void	identify_hypervisor(void);
 void	initializecpu(void);
 void	initializecpucache(void);
 bool	fix_cpuid(void);
 void	fillw(int /*u_short*/ pat, void *base, size_t cnt);
 int	is_physical_memory(vm_paddr_t addr);
 int	isa_nmi(int cd);
 void	handle_ibrs_entry(void);
 void	handle_ibrs_exit(void);
 void	hw_ibrs_recalculate(void);
+void	hw_mds_recalculate(void);
 void	hw_ssb_recalculate(bool all_cpus);
 void	nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame);
 void	nmi_call_kdb_smp(u_int type, struct trapframe *frame);
 void	nmi_handle_intr(u_int type, struct trapframe *frame);
 void	pagecopy(void *from, void *to);
 void	printcpuinfo(void);
 int	pti_get_default(void);
 int	user_dbreg_trap(register_t dr6);
 int	minidumpsys(struct dumperinfo *);
 struct pcb *get_pcb_td(struct thread *td);
 
 #endif
Index: projects/runtime-coverage-v2/sys/x86/x86/cpu_machdep.c
===================================================================
--- projects/runtime-coverage-v2/sys/x86/x86/cpu_machdep.c	(revision 347598)
+++ projects/runtime-coverage-v2/sys/x86/x86/cpu_machdep.c	(revision 347599)
@@ -1,977 +1,1175 @@
 /*-
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include "opt_atpic.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_isa.h"
 #include "opt_kdb.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_platform.h"
 #ifdef __i386__
 #include "opt_apic.h"
 #endif
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
+#include <sys/domainset.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/specialreg.h>
 #include <machine/md_var.h>
 #include <machine/mp_watchdog.h>
 #include <machine/tss.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #ifdef CPU_ELAN
 #include <machine/elan_mmcr.h>
 #endif
 #include <x86/acpica_machdep.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #include <isa/isareg.h>
 
 #include <contrib/dev/acpica/include/acpi.h>
 
 #define	STATE_RUNNING	0x0
 #define	STATE_MWAIT	0x1
 #define	STATE_SLEEPING	0x2
 
 #ifdef SMP
 static u_int	cpu_reset_proxyid;
 static volatile u_int	cpu_reset_proxy_active;
 #endif
 
 /*
  * Automatically initialized per CPU errata in cpu_idle_tun below.
  */
 bool mwait_cpustop_broken = false;
 SYSCTL_BOOL(_machdep, OID_AUTO, mwait_cpustop_broken, CTLFLAG_RDTUN,
     &mwait_cpustop_broken, 0,
     "Can not reliably wake MONITOR/MWAIT cpus without interrupts");
 
 /*
  * Machine dependent boot() routine
  *
  * I haven't seen anything to put here yet
  * Possibly some stuff might be grafted back here from boot()
  */
 void
 cpu_boot(int howto)
 {
 }
 
 /*
  * Flush the D-cache for non-DMA I/O so that the I-cache can
  * be made coherent later.
  */
 void
 cpu_flush_dcache(void *ptr, size_t len)
 {
 	/* Not applicable */
 }
 
 void
 acpi_cpu_c1(void)
 {
 
 	__asm __volatile("sti; hlt");
 }
 
 /*
  * Use mwait to pause execution while waiting for an interrupt or
  * another thread to signal that there is more work.
  *
  * NOTE: Interrupts will cause a wakeup; however, this function does
  * not enable interrupt handling. The caller is responsible to enable
  * interrupts.
  */
 void
 acpi_cpu_idle_mwait(uint32_t mwait_hint)
 {
 	int *state;
 	uint64_t v;
 
 	/*
 	 * A comment in Linux patch claims that 'CPUs run faster with
 	 * speculation protection disabled. All CPU threads in a core
 	 * must disable speculation protection for it to be
 	 * disabled. Disable it while we are idle so the other
 	 * hyperthread can run fast.'
 	 *
 	 * XXXKIB.  Software coordination mode should be supported,
 	 * but all Intel CPUs provide hardware coordination.
 	 */
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	KASSERT(atomic_load_int(state) == STATE_SLEEPING,
 	    ("cpu_mwait_cx: wrong monitorbuf state"));
 	atomic_store_int(state, STATE_MWAIT);
 	if (PCPU_GET(ibpb_set) || hw_ssb_active) {
 		v = rdmsr(MSR_IA32_SPEC_CTRL);
 		wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
 		    IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
 	} else {
 		v = 0;
 	}
 	cpu_monitor(state, 0, 0);
 	if (atomic_load_int(state) == STATE_MWAIT)
 		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
 
 	/*
 	 * SSB cannot be disabled while we sleep, or rather, if it was
 	 * disabled, the sysctl thread will bind to our cpu to tweak
 	 * MSR.
 	 */
 	if (v != 0)
 		wrmsr(MSR_IA32_SPEC_CTRL, v);
 
 	/*
 	 * We should exit on any event that interrupts mwait, because
 	 * that event might be a wanted interrupt.
 	 */
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 /* Get current clock frequency for the given cpu id. */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	uint64_t tsc1, tsc2;
 	uint64_t acnt, mcnt, perf;
 	register_t reg;
 
 	if (pcpu_find(cpu_id) == NULL || rate == NULL)
 		return (EINVAL);
 #ifdef __i386__
 	if ((cpu_feature & CPUID_TSC) == 0)
 		return (EOPNOTSUPP);
 #endif
 
 	/*
 	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
 	 * DELAY(9) based logic fails.
 	 */
 	if (tsc_is_invariant && !tsc_perf_stat)
 		return (EOPNOTSUPP);
 
 #ifdef SMP
 	if (smp_cpus > 1) {
 		/* Schedule ourselves on the indicated cpu. */
 		thread_lock(curthread);
 		sched_bind(curthread, cpu_id);
 		thread_unlock(curthread);
 	}
 #endif
 
 	/* Calibrate by measuring a short delay. */
 	reg = intr_disable();
 	if (tsc_is_invariant) {
 		wrmsr(MSR_MPERF, 0);
 		wrmsr(MSR_APERF, 0);
 		tsc1 = rdtsc();
 		DELAY(1000);
 		mcnt = rdmsr(MSR_MPERF);
 		acnt = rdmsr(MSR_APERF);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		perf = 1000 * acnt / mcnt;
 		*rate = (tsc2 - tsc1) * perf;
 	} else {
 		tsc1 = rdtsc();
 		DELAY(1000);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		*rate = (tsc2 - tsc1) * 1000;
 	}
 
 #ifdef SMP
 	if (smp_cpus > 1) {
 		thread_lock(curthread);
 		sched_unbind(curthread);
 		thread_unlock(curthread);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Shutdown the CPU as much as possible
  */
 void
 cpu_halt(void)
 {
 	for (;;)
 		halt();
 }
 
 static void
 cpu_reset_real(void)
 {
 	struct region_descriptor null_idt;
 	int b;
 
 	disable_intr();
 #ifdef CPU_ELAN
 	if (elan_mmcr != NULL)
 		elan_mmcr->RESCFG = 1;
 #endif
 #ifdef __i386__
 	if (cpu == CPU_GEODE1100) {
 		/* Attempt Geode's own reset */
 		outl(0xcf8, 0x80009044ul);
 		outl(0xcfc, 0xf);
 	}
 #endif
 #if !defined(BROKEN_KEYBOARD_RESET)
 	/*
 	 * Attempt to do a CPU reset via the keyboard controller,
 	 * do not turn off GateA20, as any machine that fails
 	 * to do the reset here would then end up in no man's land.
 	 */
 	outb(IO_KBD + 4, 0xFE);
 	DELAY(500000);	/* wait 0.5 sec to see if that did it */
 #endif
 
 	/*
 	 * Attempt to force a reset via the Reset Control register at
 	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
 	 * transitions from 0 to 1.  Bit 1 selects the type of reset
 	 * to attempt: 0 selects a "soft" reset, and 1 selects a
 	 * "hard" reset.  We try a "hard" reset.  The first write sets
 	 * bit 1 to select a "hard" reset and clears bit 2.  The
 	 * second write forces a 0 -> 1 transition in bit 2 to trigger
 	 * a reset.
 	 */
 	outb(0xcf9, 0x2);
 	outb(0xcf9, 0x6);
 	DELAY(500000);  /* wait 0.5 sec to see if that did it */
 
 	/*
 	 * Attempt to force a reset via the Fast A20 and Init register
 	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
 	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
 	 * preserve bit 1 while setting bit 0.  We also must clear bit
 	 * 0 before setting it if it isn't already clear.
 	 */
 	b = inb(0x92);
 	if (b != 0xff) {
 		if ((b & 0x1) != 0)
 			outb(0x92, b & 0xfe);
 		outb(0x92, b | 0x1);
 		DELAY(500000);  /* wait 0.5 sec to see if that did it */
 	}
 
 	printf("No known reset method worked, attempting CPU shutdown\n");
 	DELAY(1000000); /* wait 1 sec for printf to complete */
 
 	/* Wipe the IDT. */
 	null_idt.rd_limit = 0;
 	null_idt.rd_base = 0;
 	lidt(&null_idt);
 
 	/* "good night, sweet prince .... <THUNK!>" */
 	breakpoint();
 
 	/* NOTREACHED */
 	while(1);
 }
 
 #ifdef SMP
 static void
 cpu_reset_proxy(void)
 {
 
 	cpu_reset_proxy_active = 1;
 	while (cpu_reset_proxy_active == 1)
 		ia32_pause(); /* Wait for other cpu to see that we've started */
 
 	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
 	DELAY(1000000);
 	cpu_reset_real();
 }
 #endif
 
 void
 cpu_reset(void)
 {
 #ifdef SMP
 	struct monitorbuf *mb;
 	cpuset_t map;
 	u_int cnt;
 
 	if (smp_started) {
 		map = all_cpus;
 		CPU_CLR(PCPU_GET(cpuid), &map);
 		CPU_NAND(&map, &stopped_cpus);
 		if (!CPU_EMPTY(&map)) {
 			printf("cpu_reset: Stopping other CPUs\n");
 			stop_cpus(map);
 		}
 
 		if (PCPU_GET(cpuid) != 0) {
 			cpu_reset_proxyid = PCPU_GET(cpuid);
 			cpustop_restartfunc = cpu_reset_proxy;
 			cpu_reset_proxy_active = 0;
 			printf("cpu_reset: Restarting BSP\n");
 
 			/* Restart CPU #0. */
 			CPU_SETOF(0, &started_cpus);
 			mb = &pcpu_find(0)->pc_monitorbuf;
 			atomic_store_int(&mb->stop_state,
 			    MONITOR_STOPSTATE_RUNNING);
 			wmb();
 
 			cnt = 0;
 			while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
 				ia32_pause();
 				cnt++;	/* Wait for BSP to announce restart */
 			}
 			if (cpu_reset_proxy_active == 0) {
 				printf("cpu_reset: Failed to restart BSP\n");
 			} else {
 				cpu_reset_proxy_active = 2;
 				while (1)
 					ia32_pause();
 				/* NOTREACHED */
 			}
 		}
 
 		DELAY(1000000);
 	}
 #endif
 	cpu_reset_real();
 	/* NOTREACHED */
 }
 
 bool
 cpu_mwait_usable(void)
 {
 
 	return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
 	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
 	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
 }
 
 void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
 static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
 static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
     0, "Use MONITOR/MWAIT for short idle");
 
 static void
 cpu_idle_acpi(sbintime_t sbt)
 {
 	int *state;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_SLEEPING);
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable())
 		enable_intr();
 	else if (cpu_idle_hook)
 		cpu_idle_hook(sbt);
 	else
 		acpi_cpu_c1();
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 static void
 cpu_idle_hlt(sbintime_t sbt)
 {
 	int *state;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_SLEEPING);
 
 	/*
 	 * Since we may be in a critical section from cpu_idle(), if
 	 * an interrupt fires during that critical section we may have
 	 * a pending preemption.  If the CPU halts, then that thread
 	 * may not execute until a later interrupt awakens the CPU.
 	 * To handle this race, check for a runnable thread after
 	 * disabling interrupts and immediately return if one is
 	 * found.  Also, we must absolutely guarentee that hlt is
 	 * the next instruction after sti.  This ensures that any
 	 * interrupt that fires after the call to disable_intr() will
 	 * immediately awaken the CPU from hlt.  Finally, please note
 	 * that on x86 this works fine because of interrupts enabled only
 	 * after the instruction following sti takes place, while IF is set
 	 * to 1 immediately, allowing hlt instruction to acknowledge the
 	 * interrupt.
 	 */
 	disable_intr();
 	if (sched_runnable())
 		enable_intr();
 	else
 		acpi_cpu_c1();
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 static void
 cpu_idle_mwait(sbintime_t sbt)
 {
 	int *state;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_MWAIT);
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable()) {
 		atomic_store_int(state, STATE_RUNNING);
 		enable_intr();
 		return;
 	}
 
 	cpu_monitor(state, 0, 0);
 	if (atomic_load_int(state) == STATE_MWAIT)
 		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
 	else
 		enable_intr();
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 static void
 cpu_idle_spin(sbintime_t sbt)
 {
 	int *state;
 	int i;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_RUNNING);
 
 	/*
 	 * The sched_runnable() call is racy but as long as there is
 	 * a loop missing it one time will have just a little impact if any 
 	 * (and it is much better than missing the check at all).
 	 */
 	for (i = 0; i < 1000; i++) {
 		if (sched_runnable())
 			return;
 		cpu_spinwait();
 	}
 }
 
 /*
  * C1E renders the local APIC timer dead, so we disable it by
  * reading the Interrupt Pending Message register and clearing
  * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
  * 
  * Reference:
  *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
  *   #32559 revision 3.00+
  */
 #define	MSR_AMDK8_IPM		0xc0010055
 #define	AMDK8_SMIONCMPHALT	(1ULL << 27)
 #define	AMDK8_C1EONCMPHALT	(1ULL << 28)
 #define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
 
 void
 cpu_probe_amdc1e(void)
 {
 
 	/*
 	 * Detect the presence of C1E capability mostly on latest
 	 * dual-cores (or future) k8 family.
 	 */
 	if (cpu_vendor_id == CPU_VENDOR_AMD &&
 	    (cpu_id & 0x00000f00) == 0x00000f00 &&
 	    (cpu_id & 0x0fff0000) >=  0x00040000) {
 		cpu_ident_amdc1e = 1;
 	}
 }
 
 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
 
 void
 cpu_idle(int busy)
 {
 	uint64_t msr;
 	sbintime_t sbt = -1;
 
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
 	    busy, curcpu);
 #ifdef MP_WATCHDOG
 	ap_watchdog(PCPU_GET(cpuid));
 #endif
 
 	/* If we are busy - try to use fast methods. */
 	if (busy) {
 		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
 			cpu_idle_mwait(busy);
 			goto out;
 		}
 	}
 
 	/* If we have time - switch timers into idle mode. */
 	if (!busy) {
 		critical_enter();
 		sbt = cpu_idleclock();
 	}
 
 	/* Apply AMD APIC timer C1E workaround. */
 	if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
 		msr = rdmsr(MSR_AMDK8_IPM);
 		if (msr & AMDK8_CMPHALT)
 			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
 	}
 
 	/* Call main idle method. */
 	cpu_idle_fn(sbt);
 
 	/* Switch timers back into active mode. */
 	if (!busy) {
 		cpu_activeclock();
 		critical_exit();
 	}
 out:
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
 	    busy, curcpu);
 }
 
 static int cpu_idle_apl31_workaround;
 SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW,
     &cpu_idle_apl31_workaround, 0,
     "Apollo Lake APL31 MWAIT bug workaround");
 
 int
 cpu_idle_wakeup(int cpu)
 {
 	struct monitorbuf *mb;
 	int *state;
 
 	mb = &pcpu_find(cpu)->pc_monitorbuf;
 	state = &mb->idle_state;
 	switch (atomic_load_int(state)) {
 	case STATE_SLEEPING:
 		return (0);
 	case STATE_MWAIT:
 		atomic_store_int(state, STATE_RUNNING);
 		return (cpu_idle_apl31_workaround ? 0 : 1);
 	case STATE_RUNNING:
 		return (1);
 	default:
 		panic("bad monitor state");
 		return (1);
 	}
 }
 
 /*
  * Ordered by speed/power consumption.
  */
 static struct {
 	void	*id_fn;
 	char	*id_name;
 	int	id_cpuid2_flag;
 } idle_tbl[] = {
 	{ .id_fn = cpu_idle_spin, .id_name = "spin" },
 	{ .id_fn = cpu_idle_mwait, .id_name = "mwait",
 	    .id_cpuid2_flag = CPUID2_MON },
 	{ .id_fn = cpu_idle_hlt, .id_name = "hlt" },
 	{ .id_fn = cpu_idle_acpi, .id_name = "acpi" },
 };
 
 static int
 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
 {
 	char *avail, *p;
 	int error;
 	int i;
 
 	avail = malloc(256, M_TEMP, M_WAITOK);
 	p = avail;
 	for (i = 0; i < nitems(idle_tbl); i++) {
 		if (idle_tbl[i].id_cpuid2_flag != 0 &&
 		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
 			continue;
 		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
 		    cpu_idle_hook == NULL)
 			continue;
 		p += sprintf(p, "%s%s", p != avail ? ", " : "",
 		    idle_tbl[i].id_name);
 	}
 	error = sysctl_handle_string(oidp, avail, 0, req);
 	free(avail, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
     0, 0, idle_sysctl_available, "A", "list of available idle functions");
 
 static bool
 cpu_idle_selector(const char *new_idle_name)
 {
 	int i;
 
 	for (i = 0; i < nitems(idle_tbl); i++) {
 		if (idle_tbl[i].id_cpuid2_flag != 0 &&
 		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
 			continue;
 		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
 		    cpu_idle_hook == NULL)
 			continue;
 		if (strcmp(idle_tbl[i].id_name, new_idle_name))
 			continue;
 		cpu_idle_fn = idle_tbl[i].id_fn;
 		if (bootverbose)
 			printf("CPU idle set to %s\n", idle_tbl[i].id_name);
 		return (true);
 	}
 	return (false);
 }
 
 static int
 cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	char buf[16], *p;
 	int error, i;
 
 	p = "unknown";
 	for (i = 0; i < nitems(idle_tbl); i++) {
 		if (idle_tbl[i].id_fn == cpu_idle_fn) {
 			p = idle_tbl[i].id_name;
 			break;
 		}
 	}
 	strncpy(buf, p, sizeof(buf));
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	return (cpu_idle_selector(buf) ? 0 : EINVAL);
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
     cpu_idle_sysctl, "A", "currently selected idle function");
 
 static void
 cpu_idle_tun(void *unused __unused)
 {
 	char tunvar[16];
 
 	if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
 		cpu_idle_selector(tunvar);
 	else if (cpu_vendor_id == CPU_VENDOR_AMD &&
 	    CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
 		/* Ryzen erratas 1057, 1109. */
 		cpu_idle_selector("hlt");
 		idle_mwait = 0;
 		mwait_cpustop_broken = true;
 	}
 
 	if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) {
 		/*
 		 * Apollo Lake errata APL31 (public errata APL30).
 		 * Stores to the armed address range may not trigger
 		 * MWAIT to resume execution.  OS needs to use
 		 * interrupts to wake processors from MWAIT-induced
 		 * sleep states.
 		 */
 		cpu_idle_apl31_workaround = 1;
 		mwait_cpustop_broken = true;
 	}
 	TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
 }
 SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);
 
 static int panic_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
     &panic_on_nmi, 0,
     "Panic on NMI raised by hardware failure");
 int nmi_is_broadcast = 1;
 SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
     &nmi_is_broadcast, 0,
     "Chipset NMI is broadcast");
 #ifdef KDB
 int kdb_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN,
     &kdb_on_nmi, 0,
     "Go to KDB on NMI with unknown source");
 #endif
 
 void
 nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
 {
 	bool claimed = false;
 
 #ifdef DEV_ISA
 	/* machine/parity/power fail/"kitchen sink" faults */
 	if (isa_nmi(frame->tf_err)) {
 		claimed = true;
 		if (panic_on_nmi)
 			panic("NMI indicates hardware failure");
 	}
 #endif /* DEV_ISA */
 #ifdef KDB
 	if (!claimed && kdb_on_nmi) {
 		/*
 		 * NMI can be hooked up to a pushbutton for debugging.
 		 */
 		printf("NMI/cpu%d ... going to debugger\n", cpu);
 		kdb_trap(type, 0, frame);
 	}
 #endif /* KDB */
 }
 
 void
 nmi_handle_intr(u_int type, struct trapframe *frame)
 {
 
 #ifdef SMP
 	if (nmi_is_broadcast) {
 		nmi_call_kdb_smp(type, frame);
 		return;
 	}
 #endif
 	nmi_call_kdb(PCPU_GET(cpuid), type, frame);
 }
 
 int hw_ibrs_active;
 int hw_ibrs_disable = 1;
 
 SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
     "Indirect Branch Restricted Speculation active");
 
 void
 hw_ibrs_recalculate(void)
 {
 	uint64_t v;
 
 	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
 		if (hw_ibrs_disable) {
 			v = rdmsr(MSR_IA32_SPEC_CTRL);
 			v &= ~(uint64_t)IA32_SPEC_CTRL_IBRS;
 			wrmsr(MSR_IA32_SPEC_CTRL, v);
 		} else {
 			v = rdmsr(MSR_IA32_SPEC_CTRL);
 			v |= IA32_SPEC_CTRL_IBRS;
 			wrmsr(MSR_IA32_SPEC_CTRL, v);
 		}
 		return;
 	}
 	hw_ibrs_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 &&
 	    !hw_ibrs_disable;
 }
 
 static int
 hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = hw_ibrs_disable;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	hw_ibrs_disable = val != 0;
 	hw_ibrs_recalculate();
 	return (0);
 }
 SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
     CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
     "Disable Indirect Branch Restricted Speculation");
 
 int hw_ssb_active;
 int hw_ssb_disable;
 
 SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
     &hw_ssb_active, 0,
     "Speculative Store Bypass Disable active");
 
 static void
 hw_ssb_set_one(bool enable)
 {
 	uint64_t v;
 
 	v = rdmsr(MSR_IA32_SPEC_CTRL);
 	if (enable)
 		v |= (uint64_t)IA32_SPEC_CTRL_SSBD;
 	else
 		v &= ~(uint64_t)IA32_SPEC_CTRL_SSBD;
 	wrmsr(MSR_IA32_SPEC_CTRL, v);
 }
 
 static void
 hw_ssb_set(bool enable, bool for_all_cpus)
 {
 	struct thread *td;
 	int bound_cpu, i, is_bound;
 
 	if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
 		hw_ssb_active = 0;
 		return;
 	}
 	hw_ssb_active = enable;
 	if (for_all_cpus) {
 		td = curthread;
 		thread_lock(td);
 		is_bound = sched_is_bound(td);
 		bound_cpu = td->td_oncpu;
 		CPU_FOREACH(i) {
 			sched_bind(td, i);
 			hw_ssb_set_one(enable);
 		}
 		if (is_bound)
 			sched_bind(td, bound_cpu);
 		else
 			sched_unbind(td);
 		thread_unlock(td);
 	} else {
 		hw_ssb_set_one(enable);
 	}
 }
 
 void
 hw_ssb_recalculate(bool all_cpus)
 {
 
 	switch (hw_ssb_disable) {
 	default:
 		hw_ssb_disable = 0;
 		/* FALLTHROUGH */
 	case 0: /* off */
 		hw_ssb_set(false, all_cpus);
 		break;
 	case 1: /* on */
 		hw_ssb_set(true, all_cpus);
 		break;
 	case 2: /* auto */
 		hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ?
 		    false : true, all_cpus);
 		break;
 	}
 }
 
 static int
 hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = hw_ssb_disable;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	hw_ssb_disable = val;
 	hw_ssb_recalculate(true);
 	return (0);
 }
 SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     hw_ssb_disable_handler, "I",
     "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto");
+
+int hw_mds_disable;
+
+/*
+ * Handler for Microarchitectural Data Sampling issues.  Really not a
+ * pointer to C function: on amd64 the code must not change any CPU
+ * architectural state except possibly %rflags. Also, it is always
+ * called with interrupts disabled.
+ */
+void (*mds_handler)(void);
+void mds_handler_void(void);
+void mds_handler_verw(void);
+void mds_handler_ivb(void);
+void mds_handler_bdw(void);
+void mds_handler_skl_sse(void);
+void mds_handler_skl_avx(void);
+void mds_handler_skl_avx512(void);
+void mds_handler_silvermont(void);
+
+static int
+sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
+{
+	const char *state;
+
+	if (mds_handler == mds_handler_void)
+		state = "inactive";
+	else if (mds_handler == mds_handler_verw)
+		state = "VERW";
+	else if (mds_handler == mds_handler_ivb)
+		state = "software IvyBridge";
+	else if (mds_handler == mds_handler_bdw)
+		state = "software Broadwell";
+	else if (mds_handler == mds_handler_skl_sse)
+		state = "software Skylake SSE";
+	else if (mds_handler == mds_handler_skl_avx)
+		state = "software Skylake AVX";
+	else if (mds_handler == mds_handler_skl_avx512)
+		state = "software Skylake AVX512";
+	else if (mds_handler == mds_handler_silvermont)
+		state = "software Silvermont";
+	else
+		state = "unknown";
+	return (SYSCTL_OUT(req, state, strlen(state)));
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_hw_mds_disable_state_handler, "A",
+    "Microarchitectural Data Sampling Mitigation state");
+
+_Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");
+
+void
+hw_mds_recalculate(void)
+{
+	struct pcpu *pc;
+	vm_offset_t b64;
+	u_long xcr0;
+	int i;
+
+	/*
+	 * Allow user to force VERW variant even if MD_CLEAR is not
+	 * reported.  For instance, hypervisor might unknowingly
+	 * filter the cap out.
+	 * For the similar reasons, and for testing, allow to enable
+	 * mitigation even for RDCL_NO or MDS_NO caps.
+	 */
+	if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
+	    ((cpu_ia32_arch_caps & (IA32_ARCH_CAP_RDCL_NO |
+	    IA32_ARCH_CAP_MDS_NO)) != 0 && hw_mds_disable == 3)) {
+		mds_handler = mds_handler_void;
+	} else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
+	    hw_mds_disable == 3) || hw_mds_disable == 1) {
+		mds_handler = mds_handler_verw;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
+	    CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
+	    CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
+	    CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
+	    CPUID_TO_MODEL(cpu_id) == 0x3a) &&
+	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+		/*
+		 * Nehalem, SandyBridge, IvyBridge
+		 */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL) {
+				pc->pc_mds_buf = malloc_domainset(672, M_TEMP,
+				    DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
+				bzero(pc->pc_mds_buf, 16);
+			}
+		}
+		mds_handler = mds_handler_ivb;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
+	    CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
+	    CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
+	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+		/*
+		 * Haswell, Broadwell
+		 */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL) {
+				pc->pc_mds_buf = malloc_domainset(1536, M_TEMP,
+				    DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
+				bzero(pc->pc_mds_buf, 16);
+			}
+		}
+		mds_handler = mds_handler_bdw;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
+	    CPUID_STEPPING) <= 5) ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
+	    (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
+	    CPUID_STEPPING) <= 0xb) ||
+	    (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
+	    CPUID_STEPPING) <= 0xc)) &&
+	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+		/*
+		 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
+		 * CascadeLake
+		 */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL) {
+				pc->pc_mds_buf = malloc_domainset(6 * 1024,
+				    M_TEMP, DOMAINSET_PREF(pc->pc_domain),
+				    M_WAITOK);
+				b64 = (vm_offset_t)malloc_domainset(64 + 63,
+				    M_TEMP, DOMAINSET_PREF(pc->pc_domain),
+				    M_WAITOK);
+				pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
+				bzero(pc->pc_mds_buf64, 64);
+			}
+		}
+		xcr0 = rxcr(0);
+		if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
+		    (cpu_stdext_feature2 & CPUID_STDEXT_AVX512DQ) != 0)
+			mds_handler = mds_handler_skl_avx512;
+		else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
+		    (cpu_feature2 & CPUID2_AVX) != 0)
+			mds_handler = mds_handler_skl_avx;
+		else
+			mds_handler = mds_handler_skl_sse;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
+	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
+	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
+	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
+	    CPUID_TO_MODEL(cpu_id) == 0x65 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x75 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
+	    CPUID_TO_MODEL(cpu_id) == 0x26 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x7a))) {
+		/* Silvermont, Airmont */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL)
+				pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
+		}
+		mds_handler = mds_handler_silvermont;
+	} else {
+		hw_mds_disable = 0;
+		mds_handler = mds_handler_void;
+	}
+}
+
+static int
+sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = hw_mds_disable;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (val < 0 || val > 3)
+		return (EINVAL);
+	hw_mds_disable = val;
+	hw_mds_recalculate();
+	return (0);
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
+    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_mds_disable_handler, "I",
+    "Microarchitectural Data Sampling Mitigation "
+    "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO");
 
 /*
  * Enable and restore kernel text write permissions.
  * Callers must ensure that disable_wp()/restore_wp() are executed
  * without rescheduling on the same core.
  */
 bool
 disable_wp(void)
 {
 	u_int cr0;
 
 	cr0 = rcr0();
 	if ((cr0 & CR0_WP) == 0)
 		return (false);
 	load_cr0(cr0 & ~CR0_WP);
 	return (true);
 }
 
 void
 restore_wp(bool old_wp)
 {
 
 	if (old_wp)
 		load_cr0(rcr0() | CR0_WP);
 }
 
 bool
 acpi_get_fadt_bootflags(uint16_t *flagsp)
 {
 #ifdef DEV_ACPI
 	ACPI_TABLE_FADT *fadt;
 	vm_paddr_t physaddr;
 
 	physaddr = acpi_find_table(ACPI_SIG_FADT);
 	if (physaddr == 0)
 		return (false);
 	fadt = acpi_map_table(physaddr, ACPI_SIG_FADT);
 	if (fadt == NULL)
 		return (false);
 	*flagsp = fadt->BootFlags;
 	acpi_unmap_table(fadt);
 	return (true);
 #else
 	return (false);
 #endif
 }
Index: projects/runtime-coverage-v2/usr.sbin/mountd/mountd.c
===================================================================
--- projects/runtime-coverage-v2/usr.sbin/mountd/mountd.c	(revision 347598)
+++ projects/runtime-coverage-v2/usr.sbin/mountd/mountd.c	(revision 347599)
@@ -1,3340 +1,3337 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Herb Hasler and Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef lint
 static const char copyright[] =
 "@(#) Copyright (c) 1989, 1993\n\
 	The Regents of the University of California.  All rights reserved.\n";
 #endif /*not lint*/
 
 #if 0
 #ifndef lint
 static char sccsid[] = "@(#)mountd.c	8.15 (Berkeley) 5/1/95";
 #endif /*not lint*/
 #endif
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/fcntl.h>
 #include <sys/linker.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/queue.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <rpc/rpc.h>
 #include <rpc/rpc_com.h>
 #include <rpc/pmap_clnt.h>
 #include <rpc/pmap_prot.h>
 #include <rpcsvc/mount.h>
 #include <nfs/nfsproto.h>
 #include <nfs/nfssvc.h>
 #include <nfsserver/nfs.h>
 
 #include <fs/nfs/nfsport.h>
 
 #include <arpa/inet.h>
 
 #include <ctype.h>
 #include <err.h>
 #include <errno.h>
 #include <grp.h>
 #include <libutil.h>
 #include <limits.h>
 #include <netdb.h>
 #include <pwd.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include "pathnames.h"
 #include "mntopts.h"
 
 #ifdef DEBUG
 #include <stdarg.h>
 #endif
 
 /*
  * Structures for keeping the mount list and export list
  */
 struct mountlist {
 	char	ml_host[MNTNAMLEN+1];
 	char	ml_dirp[MNTPATHLEN+1];
 
 	SLIST_ENTRY(mountlist)	next;
 };
 
 struct dirlist {
 	struct dirlist	*dp_left;
 	struct dirlist	*dp_right;
 	int		dp_flag;
 	struct hostlist	*dp_hosts;	/* List of hosts this dir exported to */
 	char		*dp_dirp;
 };
 /* dp_flag bits */
 #define	DP_DEFSET	0x1
 #define DP_HOSTSET	0x2
 
 struct exportlist {
 	struct dirlist	*ex_dirl;
 	struct dirlist	*ex_defdir;
+	struct grouplist *ex_grphead;
 	int		ex_flag;
 	fsid_t		ex_fs;
 	char		*ex_fsdir;
 	char		*ex_indexfile;
 	int		ex_numsecflavors;
 	int		ex_secflavors[MAXSECFLAVORS];
 	int		ex_defnumsecflavors;
 	int		ex_defsecflavors[MAXSECFLAVORS];
 
 	SLIST_ENTRY(exportlist) entries;
 };
 /* ex_flag bits */
 #define	EX_LINKED	0x1
 
 SLIST_HEAD(exportlisthead, exportlist);
 
 struct netmsk {
 	struct sockaddr_storage nt_net;
 	struct sockaddr_storage nt_mask;
 	char		*nt_name;
 };
 
 union grouptypes {
 	struct addrinfo *gt_addrinfo;
 	struct netmsk	gt_net;
 };
 
 struct grouplist {
 	int gr_type;
 	union grouptypes gr_ptr;
 	struct grouplist *gr_next;
 	int gr_numsecflavors;
 	int gr_secflavors[MAXSECFLAVORS];
 };
 /* Group types */
 #define	GT_NULL		0x0
 #define	GT_HOST		0x1
 #define	GT_NET		0x2
 #define	GT_DEFAULT	0x3
 #define GT_IGNORE	0x5
 
 struct hostlist {
 	int		 ht_flag;	/* Uses DP_xx bits */
 	struct grouplist *ht_grp;
 	struct hostlist	 *ht_next;
 };
 
 struct fhreturn {
 	int	fhr_flag;
 	int	fhr_vers;
 	nfsfh_t	fhr_fh;
 	int	fhr_numsecflavors;
 	int	*fhr_secflavors;
 };
 
 #define	GETPORT_MAXTRY	20	/* Max tries to get a port # */
 
 /* Global defs */
 static char	*add_expdir(struct dirlist **, char *, int);
 static void	add_dlist(struct dirlist **, struct dirlist *,
 		    struct grouplist *, int, struct exportlist *);
 static void	add_mlist(char *, char *);
 static int	check_dirpath(char *);
 static int	check_options(struct dirlist *);
 static int	checkmask(struct sockaddr *sa);
 static int	chk_host(struct dirlist *, struct sockaddr *, int *, int *,
 		    int *, int **);
 static char	*strsep_quote(char **stringp, const char *delim);
 static int	create_service(struct netconfig *nconf);
 static void	complete_service(struct netconfig *nconf, char *port_str);
 static void	clearout_service(void);
 static void	del_mlist(char *hostp, char *dirp);
 static struct dirlist	*dirp_search(struct dirlist *, char *);
 static int	do_mount(struct exportlist *, struct grouplist *, int,
 		    struct xucred *, char *, int, struct statfs *);
 static int	do_opt(char **, char **, struct exportlist *,
 		    struct grouplist *, int *, int *, struct xucred *);
 static struct exportlist	*ex_search(fsid_t *, struct exportlisthead *);
 static struct exportlist	*get_exp(void);
 static void	free_dir(struct dirlist *);
 static void	free_exp(struct exportlist *);
 static void	free_grp(struct grouplist *);
 static void	free_host(struct hostlist *);
 static void	get_exportlist(void);
 static void	insert_exports(struct exportlist *, struct exportlisthead *);
 static void	free_exports(struct exportlisthead *);
 static void	read_exportfile(void);
 static void	delete_export(struct iovec *, int, struct statfs *, char *);
 static int	get_host(char *, struct grouplist *, struct grouplist *);
 static struct hostlist *get_ht(void);
 static int	get_line(void);
 static void	get_mountlist(void);
 static int	get_net(char *, struct netmsk *, int);
 static void	getexp_err(struct exportlist *, struct grouplist *, const char *);
 static struct grouplist	*get_grp(void);
 static void	hang_dirp(struct dirlist *, struct grouplist *,
 				struct exportlist *, int);
 static void	huphandler(int sig);
 static int	makemask(struct sockaddr_storage *ssp, int bitlen);
 static void	mntsrv(struct svc_req *, SVCXPRT *);
 static void	nextfield(char **, char **);
 static void	out_of_mem(void);
 static void	parsecred(char *, struct xucred *);
 static int	parsesec(char *, struct exportlist *);
 static int	put_exlist(struct dirlist *, XDR *, struct dirlist *,
 		    int *, int);
 static void	*sa_rawaddr(struct sockaddr *sa, int *nbytes);
 static int	sacmp(struct sockaddr *sa1, struct sockaddr *sa2,
 		    struct sockaddr *samask);
 static int	scan_tree(struct dirlist *, struct sockaddr *);
 static void	usage(void);
 static int	xdr_dir(XDR *, char *);
 static int	xdr_explist(XDR *, caddr_t);
 static int	xdr_explist_brief(XDR *, caddr_t);
 static int	xdr_explist_common(XDR *, caddr_t, int);
 static int	xdr_fhs(XDR *, caddr_t);
 static int	xdr_mlist(XDR *, caddr_t);
 static void	terminate(int);
 
 static struct exportlisthead exphead = SLIST_HEAD_INITIALIZER(&exphead);
 static SLIST_HEAD(, mountlist) mlhead = SLIST_HEAD_INITIALIZER(&mlhead);
-static struct grouplist *grphead;
 static char *exnames_default[2] = { _PATH_EXPORTS, NULL };
 static char **exnames;
 static char **hosts = NULL;
 static struct xucred def_anon = {
 	XUCRED_VERSION,
 	(uid_t)65534,
 	1,
 	{ (gid_t)65533 },
 	NULL
 };
 static int force_v2 = 0;
 static int resvport_only = 1;
 static int nhosts = 0;
 static int dir_only = 1;
 static int dolog = 0;
 static int got_sighup = 0;
 static int xcreated = 0;
 
 static char *svcport_str = NULL;
 static int mallocd_svcport = 0;
 static int *sock_fd;
 static int sock_fdcnt;
 static int sock_fdpos;
 static int suspend_nfsd = 0;
 
 static int opt_flags;
 static int have_v6 = 1;
 
 static int v4root_phase = 0;
 static char v4root_dirpath[PATH_MAX + 1];
 static int has_publicfh = 0;
 
 static struct pidfh *pfh = NULL;
 /* Bits for opt_flags above */
 #define	OP_MAPROOT	0x01
 #define	OP_MAPALL	0x02
 /* 0x4 free */
 #define	OP_MASK		0x08
 #define	OP_NET		0x10
 #define	OP_ALLDIRS	0x40
 #define	OP_HAVEMASK	0x80	/* A mask was specified or inferred. */
 #define	OP_QUIET	0x100
 #define OP_MASKLEN	0x200
 #define OP_SEC		0x400
 
 #ifdef DEBUG
 static int debug = 1;
 static void	SYSLOG(int, const char *, ...) __printflike(2, 3);
 #define syslog SYSLOG
 #else
 static int debug = 0;
 #endif
 
 /*
  * Similar to strsep(), but it allows for quoted strings
  * and escaped characters.
  *
  * It returns the string (or NULL, if *stringp is NULL),
  * which is a de-quoted version of the string if necessary.
  *
  * It modifies *stringp in place.
  */
 static char *
 strsep_quote(char **stringp, const char *delim)
 {
 	char *srcptr, *dstptr, *retval;
 	char quot = 0;
 	
 	if (stringp == NULL || *stringp == NULL)
 		return (NULL);
 
 	srcptr = dstptr = retval = *stringp;
 
 	while (*srcptr) {
 		/*
 		 * We're looking for several edge cases here.
 		 * First:  if we're in quote state (quot != 0),
 		 * then we ignore the delim characters, but otherwise
 		 * process as normal, unless it is the quote character.
 		 * Second:  if the current character is a backslash,
 		 * we take the next character as-is, without checking
 		 * for delim, quote, or backslash.  Exception:  if the
 		 * next character is a NUL, that's the end of the string.
 		 * Third:  if the character is a quote character, we toggle
 		 * quote state.
 		 * Otherwise:  check the current character for NUL, or
 		 * being in delim, and end the string if either is true.
 		 */
 		if (*srcptr == '\\') {
 			srcptr++;
 			/*
 			 * The edge case here is if the next character
 			 * is NUL, we want to stop processing.  But if
 			 * it's not NUL, then we simply want to copy it.
 			 */
 			if (*srcptr) {
 				*dstptr++ = *srcptr++;
 			}
 			continue;
 		}
 		if (quot == 0 && (*srcptr == '\'' || *srcptr == '"')) {
 			quot = *srcptr++;
 			continue;
 		}
 		if (quot && *srcptr == quot) {
 			/* End of the quoted part */
 			quot = 0;
 			srcptr++;
 			continue;
 		}
 		if (!quot && strchr(delim, *srcptr))
 			break;
 		*dstptr++ = *srcptr++;
 	}
 
 	*dstptr = 0; /* Terminate the string */
 	*stringp = (*srcptr == '\0') ? NULL : srcptr + 1;
 	return (retval);
 }
 
 /*
  * Mountd server for NFS mount protocol as described in:
  * NFS: Network File System Protocol Specification, RFC1094, Appendix A
  * The optional arguments are the exports file name
  * default: _PATH_EXPORTS
  * and "-n" to allow nonroot mount.
  */
 int
 main(int argc, char **argv)
 {
 	fd_set readfds;
 	struct netconfig *nconf;
 	char *endptr, **hosts_bak;
 	void *nc_handle;
 	pid_t otherpid;
 	in_port_t svcport;
 	int c, k, s;
 	int maxrec = RPC_MAXDATASIZE;
 	int attempt_cnt, port_len, port_pos, ret;
 	char **port_list;
 
 	/* Check that another mountd isn't already running. */
 	pfh = pidfile_open(_PATH_MOUNTDPID, 0600, &otherpid);
 	if (pfh == NULL) {
 		if (errno == EEXIST)
 			errx(1, "mountd already running, pid: %d.", otherpid);
 		warn("cannot open or create pidfile");
 	}
 
 	s = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
 	if (s < 0)
 		have_v6 = 0;
 	else
 		close(s);
 
 	while ((c = getopt(argc, argv, "2deh:lnp:rS")) != -1)
 		switch (c) {
 		case '2':
 			force_v2 = 1;
 			break;
 		case 'e':
 			/* now a no-op, since this is the default */
 			break;
 		case 'n':
 			resvport_only = 0;
 			break;
 		case 'r':
 			dir_only = 0;
 			break;
 		case 'd':
 			debug = debug ? 0 : 1;
 			break;
 		case 'l':
 			dolog = 1;
 			break;
 		case 'p':
 			endptr = NULL;
 			svcport = (in_port_t)strtoul(optarg, &endptr, 10);
 			if (endptr == NULL || *endptr != '\0' ||
 			    svcport == 0 || svcport >= IPPORT_MAX)
 				usage();
 			svcport_str = strdup(optarg);
 			break;
 		case 'h':
 			++nhosts;
 			hosts_bak = hosts;
 			hosts_bak = realloc(hosts, nhosts * sizeof(char *));
 			if (hosts_bak == NULL) {
 				if (hosts != NULL) {
 					for (k = 0; k < nhosts; k++) 
 						free(hosts[k]);
 					free(hosts);
 					out_of_mem();
 				}
 			}
 			hosts = hosts_bak;
 			hosts[nhosts - 1] = strdup(optarg);
 			if (hosts[nhosts - 1] == NULL) {
 				for (k = 0; k < (nhosts - 1); k++) 
 					free(hosts[k]);
 				free(hosts);
 				out_of_mem();
 			}
 			break;
 		case 'S':
 			suspend_nfsd = 1;
 			break;
 		default:
 			usage();
 		}
 
 	if (modfind("nfsd") < 0) {
 		/* Not present in kernel, try loading it */
 		if (kldload("nfsd") < 0 || modfind("nfsd") < 0)
 			errx(1, "NFS server is not available");
 	}
 
 	argc -= optind;
 	argv += optind;
-	grphead = (struct grouplist *)NULL;
 	if (argc > 0)
 		exnames = argv;
 	else
 		exnames = exnames_default;
 	openlog("mountd", LOG_PID, LOG_DAEMON);
 	if (debug)
 		warnx("getting export list");
 	get_exportlist();
 	if (debug)
 		warnx("getting mount list");
 	get_mountlist();
 	if (debug)
 		warnx("here we go");
 	if (debug == 0) {
 		daemon(0, 0);
 		signal(SIGINT, SIG_IGN);
 		signal(SIGQUIT, SIG_IGN);
 	}
 	signal(SIGHUP, huphandler);
 	signal(SIGTERM, terminate);
 	signal(SIGPIPE, SIG_IGN);
 
 	pidfile_write(pfh);
 
 	rpcb_unset(MOUNTPROG, MOUNTVERS, NULL);
 	rpcb_unset(MOUNTPROG, MOUNTVERS3, NULL);
 	rpc_control(RPC_SVC_CONNMAXREC_SET, &maxrec);
 
 	if (!resvport_only) {
 		if (sysctlbyname("vfs.nfsd.nfs_privport", NULL, NULL,
 		    &resvport_only, sizeof(resvport_only)) != 0 &&
 		    errno != ENOENT) {
 			syslog(LOG_ERR, "sysctl: %m");
 			exit(1);
 		}
 	}
 
 	/*
 	 * If no hosts were specified, add a wildcard entry to bind to
 	 * INADDR_ANY. Otherwise make sure 127.0.0.1 and ::1 are added to the
 	 * list.
 	 */
 	if (nhosts == 0) {
 		hosts = malloc(sizeof(char *));
 		if (hosts == NULL)
 			out_of_mem();
 		hosts[0] = "*";
 		nhosts = 1;
 	} else {
 		hosts_bak = hosts;
 		if (have_v6) {
 			hosts_bak = realloc(hosts, (nhosts + 2) *
 			    sizeof(char *));
 			if (hosts_bak == NULL) {
 				for (k = 0; k < nhosts; k++)
 					free(hosts[k]);
 		    		free(hosts);
 		    		out_of_mem();
 			} else
 				hosts = hosts_bak;
 			nhosts += 2;
 			hosts[nhosts - 2] = "::1";
 		} else {
 			hosts_bak = realloc(hosts, (nhosts + 1) * sizeof(char *));
 			if (hosts_bak == NULL) {
 				for (k = 0; k < nhosts; k++)
 					free(hosts[k]);
 				free(hosts);
 				out_of_mem();
 			} else {
 				nhosts += 1;
 				hosts = hosts_bak;
 			}
 		}
 
 		hosts[nhosts - 1] = "127.0.0.1";
 	}
 
 	attempt_cnt = 1;
 	sock_fdcnt = 0;
 	sock_fd = NULL;
 	port_list = NULL;
 	port_len = 0;
 	nc_handle = setnetconfig();
 	while ((nconf = getnetconfig(nc_handle))) {
 		if (nconf->nc_flag & NC_VISIBLE) {
 			if (have_v6 == 0 && strcmp(nconf->nc_protofmly,
 			    "inet6") == 0) {
 				/* DO NOTHING */
 			} else {
 				ret = create_service(nconf);
 				if (ret == 1)
 					/* Ignore this call */
 					continue;
 				if (ret < 0) {
 					/*
 					 * Failed to bind port, so close off
 					 * all sockets created and try again
 					 * if the port# was dynamically
 					 * assigned via bind(2).
 					 */
 					clearout_service();
 					if (mallocd_svcport != 0 &&
 					    attempt_cnt < GETPORT_MAXTRY) {
 						free(svcport_str);
 						svcport_str = NULL;
 						mallocd_svcport = 0;
 					} else {
 						errno = EADDRINUSE;
 						syslog(LOG_ERR,
 						    "bindresvport_sa: %m");
 						exit(1);
 					}
 
 					/* Start over at the first service. */
 					free(sock_fd);
 					sock_fdcnt = 0;
 					sock_fd = NULL;
 					nc_handle = setnetconfig();
 					attempt_cnt++;
 				} else if (mallocd_svcport != 0 &&
 				    attempt_cnt == GETPORT_MAXTRY) {
 					/*
 					 * For the last attempt, allow
 					 * different port #s for each nconf
 					 * by saving the svcport_str and
 					 * setting it back to NULL.
 					 */
 					port_list = realloc(port_list,
 					    (port_len + 1) * sizeof(char *));
 					if (port_list == NULL)
 						out_of_mem();
 					port_list[port_len++] = svcport_str;
 					svcport_str = NULL;
 					mallocd_svcport = 0;
 				}
 			}
 		}
 	}
 
 	/*
 	 * Successfully bound the ports, so call complete_service() to
 	 * do the rest of the setup on the service(s).
 	 */
 	sock_fdpos = 0;
 	port_pos = 0;
 	nc_handle = setnetconfig();
 	while ((nconf = getnetconfig(nc_handle))) {
 		if (nconf->nc_flag & NC_VISIBLE) {
 			if (have_v6 == 0 && strcmp(nconf->nc_protofmly,
 			    "inet6") == 0) {
 				/* DO NOTHING */
 			} else if (port_list != NULL) {
 				if (port_pos >= port_len) {
 					syslog(LOG_ERR, "too many port#s");
 					exit(1);
 				}
 				complete_service(nconf, port_list[port_pos++]);
 			} else
 				complete_service(nconf, svcport_str);
 		}
 	}
 	endnetconfig(nc_handle);
 	free(sock_fd);
 	if (port_list != NULL) {
 		for (port_pos = 0; port_pos < port_len; port_pos++)
 			free(port_list[port_pos]);
 		free(port_list);
 	}
 
 	if (xcreated == 0) {
 		syslog(LOG_ERR, "could not create any services");
 		exit(1);
 	}
 
 	/* Expand svc_run() here so that we can call get_exportlist(). */
 	for (;;) {
 		if (got_sighup) {
 			get_exportlist();
 			got_sighup = 0;
 		}
 		readfds = svc_fdset;
 		switch (select(svc_maxfd + 1, &readfds, NULL, NULL, NULL)) {
 		case -1:
 			if (errno == EINTR)
                                 continue;
 			syslog(LOG_ERR, "mountd died: select: %m");
 			exit(1);
 		case 0:
 			continue;
 		default:
 			svc_getreqset(&readfds);
 		}
 	}
 } 
 
 /*
  * This routine creates and binds sockets on the appropriate
  * addresses. It gets called one time for each transport.
  * It returns 0 upon success, 1 for ingore the call and -1 to indicate
  * bind failed with EADDRINUSE.
  * Any file descriptors that have been created are stored in sock_fd and
  * the total count of them is maintained in sock_fdcnt.
  */
 static int
 create_service(struct netconfig *nconf)
 {
 	struct addrinfo hints, *res = NULL;
 	struct sockaddr_in *sin;
 	struct sockaddr_in6 *sin6;
 	struct __rpc_sockinfo si;
 	int aicode;
 	int fd;
 	int nhostsbak;
 	int one = 1;
 	int r;
 	u_int32_t host_addr[4];  /* IPv4 or IPv6 */
 	int mallocd_res;
 
 	if ((nconf->nc_semantics != NC_TPI_CLTS) &&
 	    (nconf->nc_semantics != NC_TPI_COTS) &&
 	    (nconf->nc_semantics != NC_TPI_COTS_ORD))
 		return (1);	/* not my type */
 
 	/*
 	 * XXX - using RPC library internal functions.
 	 */
 	if (!__rpc_nconf2sockinfo(nconf, &si)) {
 		syslog(LOG_ERR, "cannot get information for %s",
 		    nconf->nc_netid);
 		return (1);
 	}
 
 	/* Get mountd's address on this transport */
 	memset(&hints, 0, sizeof hints);
 	hints.ai_family = si.si_af;
 	hints.ai_socktype = si.si_socktype;
 	hints.ai_protocol = si.si_proto;
 
 	/*
 	 * Bind to specific IPs if asked to
 	 */
 	nhostsbak = nhosts;
 	while (nhostsbak > 0) {
 		--nhostsbak;
 		sock_fd = realloc(sock_fd, (sock_fdcnt + 1) * sizeof(int));
 		if (sock_fd == NULL)
 			out_of_mem();
 		sock_fd[sock_fdcnt++] = -1;	/* Set invalid for now. */
 		mallocd_res = 0;
 
 		hints.ai_flags = AI_PASSIVE;
 
 		/*	
 		 * XXX - using RPC library internal functions.
 		 */
 		if ((fd = __rpc_nconf2fd(nconf)) < 0) {
 			int non_fatal = 0;
 	    		if (errno == EAFNOSUPPORT &&
 			    nconf->nc_semantics != NC_TPI_CLTS) 
 				non_fatal = 1;
 				
 			syslog(non_fatal ? LOG_DEBUG : LOG_ERR, 
 			    "cannot create socket for %s", nconf->nc_netid);
 			if (non_fatal != 0)
 				continue;
 			exit(1);
 		}
 
 		switch (hints.ai_family) {
 		case AF_INET:
 			if (inet_pton(AF_INET, hosts[nhostsbak],
 			    host_addr) == 1) {
 				hints.ai_flags |= AI_NUMERICHOST;
 			} else {
 				/*
 				 * Skip if we have an AF_INET6 address.
 				 */
 				if (inet_pton(AF_INET6, hosts[nhostsbak],
 				    host_addr) == 1) {
 					close(fd);
 					continue;
 				}
 			}
 			break;
 		case AF_INET6:
 			if (inet_pton(AF_INET6, hosts[nhostsbak],
 			    host_addr) == 1) {
 				hints.ai_flags |= AI_NUMERICHOST;
 			} else {
 				/*
 				 * Skip if we have an AF_INET address.
 				 */
 				if (inet_pton(AF_INET, hosts[nhostsbak],
 				    host_addr) == 1) {
 					close(fd);
 					continue;
 				}
 			}
 
 			/*
 			 * We're doing host-based access checks here, so don't
 			 * allow v4-in-v6 to confuse things. The kernel will
 			 * disable it by default on NFS sockets too.
 			 */
 			if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one,
 			    sizeof one) < 0) {
 				syslog(LOG_ERR,
 				    "can't disable v4-in-v6 on IPv6 socket");
 				exit(1);
 			}
 			break;
 		default:
 			break;
 		}
 
 		/*
 		 * If no hosts were specified, just bind to INADDR_ANY
 		 */
 		if (strcmp("*", hosts[nhostsbak]) == 0) {
 			if (svcport_str == NULL) {
 				res = malloc(sizeof(struct addrinfo));
 				if (res == NULL) 
 					out_of_mem();
 				mallocd_res = 1;
 				res->ai_flags = hints.ai_flags;
 				res->ai_family = hints.ai_family;
 				res->ai_protocol = hints.ai_protocol;
 				switch (res->ai_family) {
 				case AF_INET:
 					sin = malloc(sizeof(struct sockaddr_in));
 					if (sin == NULL) 
 						out_of_mem();
 					sin->sin_family = AF_INET;
 					sin->sin_port = htons(0);
 					sin->sin_addr.s_addr = htonl(INADDR_ANY);
 					res->ai_addr = (struct sockaddr*) sin;
 					res->ai_addrlen = (socklen_t)
 					    sizeof(struct sockaddr_in);
 					break;
 				case AF_INET6:
 					sin6 = malloc(sizeof(struct sockaddr_in6));
 					if (sin6 == NULL)
 						out_of_mem();
 					sin6->sin6_family = AF_INET6;
 					sin6->sin6_port = htons(0);
 					sin6->sin6_addr = in6addr_any;
 					res->ai_addr = (struct sockaddr*) sin6;
 					res->ai_addrlen = (socklen_t)
 					    sizeof(struct sockaddr_in6);
 					break;
 				default:
 					syslog(LOG_ERR, "bad addr fam %d",
 					    res->ai_family);
 					exit(1);
 				}
 			} else { 
 				if ((aicode = getaddrinfo(NULL, svcport_str,
 				    &hints, &res)) != 0) {
 					syslog(LOG_ERR,
 					    "cannot get local address for %s: %s",
 					    nconf->nc_netid,
 					    gai_strerror(aicode));
 					close(fd);
 					continue;
 				}
 			}
 		} else {
 			if ((aicode = getaddrinfo(hosts[nhostsbak], svcport_str,
 			    &hints, &res)) != 0) {
 				syslog(LOG_ERR,
 				    "cannot get local address for %s: %s",
 				    nconf->nc_netid, gai_strerror(aicode));
 				close(fd);
 				continue;
 			}
 		}
 
 		/* Store the fd. */
 		sock_fd[sock_fdcnt - 1] = fd;
 
 		/* Now, attempt the bind. */
 		r = bindresvport_sa(fd, res->ai_addr);
 		if (r != 0) {
 			if (errno == EADDRINUSE && mallocd_svcport != 0) {
 				if (mallocd_res != 0) {
 					free(res->ai_addr);
 					free(res);
 				} else
 					freeaddrinfo(res);
 				return (-1);
 			}
 			syslog(LOG_ERR, "bindresvport_sa: %m");
 			exit(1);
 		}
 
 		if (svcport_str == NULL) {
 			svcport_str = malloc(NI_MAXSERV * sizeof(char));
 			if (svcport_str == NULL)
 				out_of_mem();
 			mallocd_svcport = 1;
 
 			if (getnameinfo(res->ai_addr,
 			    res->ai_addr->sa_len, NULL, NI_MAXHOST,
 			    svcport_str, NI_MAXSERV * sizeof(char),
 			    NI_NUMERICHOST | NI_NUMERICSERV))
 				errx(1, "Cannot get port number");
 		}
 		if (mallocd_res != 0) {
 			free(res->ai_addr);
 			free(res);
 		} else
 			freeaddrinfo(res);
 		res = NULL;
 	}
 	return (0);
 }
 
 /*
  * Called after all the create_service() calls have succeeded, to complete
  * the setup and registration.
  */
 static void
 complete_service(struct netconfig *nconf, char *port_str)
 {
 	struct addrinfo hints, *res = NULL;
 	struct __rpc_sockinfo si;
 	struct netbuf servaddr;
 	SVCXPRT	*transp = NULL;
 	int aicode, fd, nhostsbak;
 	int registered = 0;
 
 	if ((nconf->nc_semantics != NC_TPI_CLTS) &&
 	    (nconf->nc_semantics != NC_TPI_COTS) &&
 	    (nconf->nc_semantics != NC_TPI_COTS_ORD))
 		return;	/* not my type */
 
 	/*
 	 * XXX - using RPC library internal functions.
 	 */
 	if (!__rpc_nconf2sockinfo(nconf, &si)) {
 		syslog(LOG_ERR, "cannot get information for %s",
 		    nconf->nc_netid);
 		return;
 	}
 
 	nhostsbak = nhosts;
 	while (nhostsbak > 0) {
 		--nhostsbak;
 		if (sock_fdpos >= sock_fdcnt) {
 			/* Should never happen. */
 			syslog(LOG_ERR, "Ran out of socket fd's");
 			return;
 		}
 		fd = sock_fd[sock_fdpos++];
 		if (fd < 0)
 			continue;
 
 		/*
 		 * Using -1 tells listen(2) to use
 		 * kern.ipc.soacceptqueue for the backlog.
 		 */
 		if (nconf->nc_semantics != NC_TPI_CLTS)
 			listen(fd, -1);
 
 		if (nconf->nc_semantics == NC_TPI_CLTS )
 			transp = svc_dg_create(fd, 0, 0);
 		else 
 			transp = svc_vc_create(fd, RPC_MAXDATASIZE,
 			    RPC_MAXDATASIZE);
 
 		if (transp != (SVCXPRT *) NULL) {
 			if (!svc_reg(transp, MOUNTPROG, MOUNTVERS, mntsrv,
 			    NULL)) 
 				syslog(LOG_ERR,
 				    "can't register %s MOUNTVERS service",
 				    nconf->nc_netid);
 			if (!force_v2) {
 				if (!svc_reg(transp, MOUNTPROG, MOUNTVERS3,
 				    mntsrv, NULL)) 
 					syslog(LOG_ERR,
 					    "can't register %s MOUNTVERS3 service",
 					    nconf->nc_netid);
 			}
 		} else 
 			syslog(LOG_WARNING, "can't create %s services",
 			    nconf->nc_netid);
 
 		if (registered == 0) {
 			registered = 1;
 			memset(&hints, 0, sizeof hints);
 			hints.ai_flags = AI_PASSIVE;
 			hints.ai_family = si.si_af;
 			hints.ai_socktype = si.si_socktype;
 			hints.ai_protocol = si.si_proto;
 
 			if ((aicode = getaddrinfo(NULL, port_str, &hints,
 			    &res)) != 0) {
 				syslog(LOG_ERR, "cannot get local address: %s",
 				    gai_strerror(aicode));
 				exit(1);
 			}
 
 			servaddr.buf = malloc(res->ai_addrlen);
 			memcpy(servaddr.buf, res->ai_addr, res->ai_addrlen);
 			servaddr.len = res->ai_addrlen;
 
 			rpcb_set(MOUNTPROG, MOUNTVERS, nconf, &servaddr);
 			rpcb_set(MOUNTPROG, MOUNTVERS3, nconf, &servaddr);
 
 			xcreated++;
 			freeaddrinfo(res);
 		}
 	} /* end while */
 }
 
 /*
  * Clear out sockets after a failure to bind one of them, so that the
  * cycle of socket creation/binding can start anew.
  */
 static void
 clearout_service(void)
 {
 	int i;
 
 	for (i = 0; i < sock_fdcnt; i++) {
 		if (sock_fd[i] >= 0) {
 			shutdown(sock_fd[i], SHUT_RDWR);
 			close(sock_fd[i]);
 		}
 	}
 }
 
 static void
 usage(void)
 {
 	fprintf(stderr,
 		"usage: mountd [-2] [-d] [-e] [-l] [-n] [-p <port>] [-r] "
 		"[-S] [-h <bindip>] [export_file ...]\n");
 	exit(1);
 }
 
 /*
  * The mount rpc service
  */
 void
 mntsrv(struct svc_req *rqstp, SVCXPRT *transp)
 {
 	struct exportlist *ep;
 	struct dirlist *dp;
 	struct fhreturn fhr;
 	struct stat stb;
 	struct statfs fsb;
 	char host[NI_MAXHOST], numerichost[NI_MAXHOST];
 	int lookup_failed = 1;
 	struct sockaddr *saddr;
 	u_short sport;
 	char rpcpath[MNTPATHLEN + 1], dirpath[MAXPATHLEN];
 	int bad = 0, defset, hostset;
 	sigset_t sighup_mask;
 	int numsecflavors, *secflavorsp;
 
 	sigemptyset(&sighup_mask);
 	sigaddset(&sighup_mask, SIGHUP);
 	saddr = svc_getrpccaller(transp)->buf;
 	switch (saddr->sa_family) {
 	case AF_INET6:
 		sport = ntohs(((struct sockaddr_in6 *)saddr)->sin6_port);
 		break;
 	case AF_INET:
 		sport = ntohs(((struct sockaddr_in *)saddr)->sin_port);
 		break;
 	default:
 		syslog(LOG_ERR, "request from unknown address family");
 		return;
 	}
 	switch (rqstp->rq_proc) {
 	case MOUNTPROC_MNT:
 	case MOUNTPROC_UMNT:
 	case MOUNTPROC_UMNTALL:
 		lookup_failed = getnameinfo(saddr, saddr->sa_len, host,
 		    sizeof host, NULL, 0, 0);
 	}
 	getnameinfo(saddr, saddr->sa_len, numerichost,
 	    sizeof numerichost, NULL, 0, NI_NUMERICHOST);
 	switch (rqstp->rq_proc) {
 	case NULLPROC:
 		if (!svc_sendreply(transp, (xdrproc_t)xdr_void, NULL))
 			syslog(LOG_ERR, "can't send reply");
 		return;
 	case MOUNTPROC_MNT:
 		if (sport >= IPPORT_RESERVED && resvport_only) {
 			syslog(LOG_NOTICE,
 			    "mount request from %s from unprivileged port",
 			    numerichost);
 			svcerr_weakauth(transp);
 			return;
 		}
 		if (!svc_getargs(transp, (xdrproc_t)xdr_dir, rpcpath)) {
 			syslog(LOG_NOTICE, "undecodable mount request from %s",
 			    numerichost);
 			svcerr_decode(transp);
 			return;
 		}
 
 		/*
 		 * Get the real pathname and make sure it is a directory
 		 * or a regular file if the -r option was specified
 		 * and it exists.
 		 */
 		if (realpath(rpcpath, dirpath) == NULL ||
 		    stat(dirpath, &stb) < 0 ||
 		    statfs(dirpath, &fsb) < 0) {
 			chdir("/");	/* Just in case realpath doesn't */
 			syslog(LOG_NOTICE,
 			    "mount request from %s for non existent path %s",
 			    numerichost, dirpath);
 			if (debug)
 				warnx("stat failed on %s", dirpath);
 			bad = ENOENT;	/* We will send error reply later */
 		}
 		if (!bad &&
 		    !S_ISDIR(stb.st_mode) &&
 		    (dir_only || !S_ISREG(stb.st_mode))) {
 			syslog(LOG_NOTICE,
 			    "mount request from %s for non-directory path %s",
 			    numerichost, dirpath);
 			if (debug)
 				warnx("mounting non-directory %s", dirpath);
 			bad = ENOTDIR;	/* We will send error reply later */
 		}
 
 		/* Check in the exports list */
 		sigprocmask(SIG_BLOCK, &sighup_mask, NULL);
 		if (bad)
 			ep = NULL;
 		else
 			ep = ex_search(&fsb.f_fsid, &exphead);
 		hostset = defset = 0;
 		if (ep && (chk_host(ep->ex_defdir, saddr, &defset, &hostset,
 		    &numsecflavors, &secflavorsp) ||
 		    ((dp = dirp_search(ep->ex_dirl, dirpath)) &&
 		      chk_host(dp, saddr, &defset, &hostset, &numsecflavors,
 		       &secflavorsp)) ||
 		    (defset && scan_tree(ep->ex_defdir, saddr) == 0 &&
 		     scan_tree(ep->ex_dirl, saddr) == 0))) {
 			if (bad) {
 				if (!svc_sendreply(transp, (xdrproc_t)xdr_long,
 				    (caddr_t)&bad))
 					syslog(LOG_ERR, "can't send reply");
 				sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL);
 				return;
 			}
 			if (hostset & DP_HOSTSET) {
 				fhr.fhr_flag = hostset;
 				fhr.fhr_numsecflavors = numsecflavors;
 				fhr.fhr_secflavors = secflavorsp;
 			} else {
 				fhr.fhr_flag = defset;
 				fhr.fhr_numsecflavors = ep->ex_defnumsecflavors;
 				fhr.fhr_secflavors = ep->ex_defsecflavors;
 			}
 			fhr.fhr_vers = rqstp->rq_vers;
 			/* Get the file handle */
 			memset(&fhr.fhr_fh, 0, sizeof(nfsfh_t));
 			if (getfh(dirpath, (fhandle_t *)&fhr.fhr_fh) < 0) {
 				bad = errno;
 				syslog(LOG_ERR, "can't get fh for %s", dirpath);
 				if (!svc_sendreply(transp, (xdrproc_t)xdr_long,
 				    (caddr_t)&bad))
 					syslog(LOG_ERR, "can't send reply");
 				sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL);
 				return;
 			}
 			if (!svc_sendreply(transp, (xdrproc_t)xdr_fhs,
 			    (caddr_t)&fhr))
 				syslog(LOG_ERR, "can't send reply");
 			if (!lookup_failed)
 				add_mlist(host, dirpath);
 			else
 				add_mlist(numerichost, dirpath);
 			if (debug)
 				warnx("mount successful");
 			if (dolog)
 				syslog(LOG_NOTICE,
 				    "mount request succeeded from %s for %s",
 				    numerichost, dirpath);
 		} else {
 			if (!bad)
 				bad = EACCES;
 			syslog(LOG_NOTICE,
 			    "mount request denied from %s for %s",
 			    numerichost, dirpath);
 		}
 
 		if (bad && !svc_sendreply(transp, (xdrproc_t)xdr_long,
 		    (caddr_t)&bad))
 			syslog(LOG_ERR, "can't send reply");
 		sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL);
 		return;
 	case MOUNTPROC_DUMP:
 		if (!svc_sendreply(transp, (xdrproc_t)xdr_mlist, (caddr_t)NULL))
 			syslog(LOG_ERR, "can't send reply");
 		else if (dolog)
 			syslog(LOG_NOTICE,
 			    "dump request succeeded from %s",
 			    numerichost);
 		return;
 	case MOUNTPROC_UMNT:
 		if (sport >= IPPORT_RESERVED && resvport_only) {
 			syslog(LOG_NOTICE,
 			    "umount request from %s from unprivileged port",
 			    numerichost);
 			svcerr_weakauth(transp);
 			return;
 		}
 		if (!svc_getargs(transp, (xdrproc_t)xdr_dir, rpcpath)) {
 			syslog(LOG_NOTICE, "undecodable umount request from %s",
 			    numerichost);
 			svcerr_decode(transp);
 			return;
 		}
 		if (realpath(rpcpath, dirpath) == NULL) {
 			syslog(LOG_NOTICE, "umount request from %s "
 			    "for non existent path %s",
 			    numerichost, dirpath);
 		}
 		if (!svc_sendreply(transp, (xdrproc_t)xdr_void, (caddr_t)NULL))
 			syslog(LOG_ERR, "can't send reply");
 		if (!lookup_failed)
 			del_mlist(host, dirpath);
 		del_mlist(numerichost, dirpath);
 		if (dolog)
 			syslog(LOG_NOTICE,
 			    "umount request succeeded from %s for %s",
 			    numerichost, dirpath);
 		return;
 	case MOUNTPROC_UMNTALL:
 		if (sport >= IPPORT_RESERVED && resvport_only) {
 			syslog(LOG_NOTICE,
 			    "umountall request from %s from unprivileged port",
 			    numerichost);
 			svcerr_weakauth(transp);
 			return;
 		}
 		if (!svc_sendreply(transp, (xdrproc_t)xdr_void, (caddr_t)NULL))
 			syslog(LOG_ERR, "can't send reply");
 		if (!lookup_failed)
 			del_mlist(host, NULL);
 		del_mlist(numerichost, NULL);
 		if (dolog)
 			syslog(LOG_NOTICE,
 			    "umountall request succeeded from %s",
 			    numerichost);
 		return;
 	case MOUNTPROC_EXPORT:
 		if (!svc_sendreply(transp, (xdrproc_t)xdr_explist, (caddr_t)NULL))
 			if (!svc_sendreply(transp, (xdrproc_t)xdr_explist_brief,
 			    (caddr_t)NULL))
 				syslog(LOG_ERR, "can't send reply");
 		if (dolog)
 			syslog(LOG_NOTICE,
 			    "export request succeeded from %s",
 			    numerichost);
 		return;
 	default:
 		svcerr_noproc(transp);
 		return;
 	}
 }
 
 /*
  * Xdr conversion for a dirpath string
  */
 static int
 xdr_dir(XDR *xdrsp, char *dirp)
 {
 	return (xdr_string(xdrsp, &dirp, MNTPATHLEN));
 }
 
 /*
  * Xdr routine to generate file handle reply
  */
 static int
 xdr_fhs(XDR *xdrsp, caddr_t cp)
 {
 	struct fhreturn *fhrp = (struct fhreturn *)cp;
 	u_long ok = 0, len, auth;
 	int i;
 
 	if (!xdr_long(xdrsp, &ok))
 		return (0);
 	switch (fhrp->fhr_vers) {
 	case 1:
 		return (xdr_opaque(xdrsp, (caddr_t)&fhrp->fhr_fh, NFSX_V2FH));
 	case 3:
 		len = NFSX_V3FH;
 		if (!xdr_long(xdrsp, &len))
 			return (0);
 		if (!xdr_opaque(xdrsp, (caddr_t)&fhrp->fhr_fh, len))
 			return (0);
 		if (fhrp->fhr_numsecflavors) {
 			if (!xdr_int(xdrsp, &fhrp->fhr_numsecflavors))
 				return (0);
 			for (i = 0; i < fhrp->fhr_numsecflavors; i++)
 				if (!xdr_int(xdrsp, &fhrp->fhr_secflavors[i]))
 					return (0);
 			return (1);
 		} else {
 			auth = AUTH_SYS;
 			len = 1;
 			if (!xdr_long(xdrsp, &len))
 				return (0);
 			return (xdr_long(xdrsp, &auth));
 		}
 	}
 	return (0);
 }
 
 static int
 xdr_mlist(XDR *xdrsp, caddr_t cp __unused)
 {
 	struct mountlist *mlp;
 	int true = 1;
 	int false = 0;
 	char *strp;
 
 	SLIST_FOREACH(mlp, &mlhead, next) {
 		if (!xdr_bool(xdrsp, &true))
 			return (0);
 		strp = &mlp->ml_host[0];
 		if (!xdr_string(xdrsp, &strp, MNTNAMLEN))
 			return (0);
 		strp = &mlp->ml_dirp[0];
 		if (!xdr_string(xdrsp, &strp, MNTPATHLEN))
 			return (0);
 	}
 	if (!xdr_bool(xdrsp, &false))
 		return (0);
 	return (1);
 }
 
 /*
  * Xdr conversion for export list
  */
 static int
 xdr_explist_common(XDR *xdrsp, caddr_t cp __unused, int brief)
 {
 	struct exportlist *ep;
 	int false = 0;
 	int putdef;
 	sigset_t sighup_mask;
 
 	sigemptyset(&sighup_mask);
 	sigaddset(&sighup_mask, SIGHUP);
 	sigprocmask(SIG_BLOCK, &sighup_mask, NULL);
 
 	SLIST_FOREACH(ep, &exphead, entries) {
 		putdef = 0;
 		if (put_exlist(ep->ex_dirl, xdrsp, ep->ex_defdir,
 			       &putdef, brief))
 			goto errout;
 		if (ep->ex_defdir && putdef == 0 &&
 			put_exlist(ep->ex_defdir, xdrsp, (struct dirlist *)NULL,
 			&putdef, brief))
 			goto errout;
 	}
 	sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL);
 	if (!xdr_bool(xdrsp, &false))
 		return (0);
 	return (1);
 errout:
 	sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL);
 	return (0);
 }
 
 /*
  * Called from xdr_explist() to traverse the tree and export the
  * directory paths.
  */
 static int
 put_exlist(struct dirlist *dp, XDR *xdrsp, struct dirlist *adp, int *putdefp,
 	int brief)
 {
 	struct grouplist *grp;
 	struct hostlist *hp;
 	int true = 1;
 	int false = 0;
 	int gotalldir = 0;
 	char *strp;
 
 	if (dp) {
 		if (put_exlist(dp->dp_left, xdrsp, adp, putdefp, brief))
 			return (1);
 		if (!xdr_bool(xdrsp, &true))
 			return (1);
 		strp = dp->dp_dirp;
 		if (!xdr_string(xdrsp, &strp, MNTPATHLEN))
 			return (1);
 		if (adp && !strcmp(dp->dp_dirp, adp->dp_dirp)) {
 			gotalldir = 1;
 			*putdefp = 1;
 		}
 		if (brief) {
 			if (!xdr_bool(xdrsp, &true))
 				return (1);
 			strp = "(...)";
 			if (!xdr_string(xdrsp, &strp, MNTPATHLEN))
 				return (1);
 		} else if ((dp->dp_flag & DP_DEFSET) == 0 &&
 		    (gotalldir == 0 || (adp->dp_flag & DP_DEFSET) == 0)) {
 			hp = dp->dp_hosts;
 			while (hp) {
 				grp = hp->ht_grp;
 				if (grp->gr_type == GT_HOST) {
 					if (!xdr_bool(xdrsp, &true))
 						return (1);
 					strp = grp->gr_ptr.gt_addrinfo->ai_canonname;
 					if (!xdr_string(xdrsp, &strp,
 					    MNTNAMLEN))
 						return (1);
 				} else if (grp->gr_type == GT_NET) {
 					if (!xdr_bool(xdrsp, &true))
 						return (1);
 					strp = grp->gr_ptr.gt_net.nt_name;
 					if (!xdr_string(xdrsp, &strp,
 					    MNTNAMLEN))
 						return (1);
 				}
 				hp = hp->ht_next;
 				if (gotalldir && hp == (struct hostlist *)NULL) {
 					hp = adp->dp_hosts;
 					gotalldir = 0;
 				}
 			}
 		}
 		if (!xdr_bool(xdrsp, &false))
 			return (1);
 		if (put_exlist(dp->dp_right, xdrsp, adp, putdefp, brief))
 			return (1);
 	}
 	return (0);
 }
 
 static int
 xdr_explist(XDR *xdrsp, caddr_t cp)
 {
 
 	return xdr_explist_common(xdrsp, cp, 0);
 }
 
 static int
 xdr_explist_brief(XDR *xdrsp, caddr_t cp)
 {
 
 	return xdr_explist_common(xdrsp, cp, 1);
 }
 
 static char *line;
 static size_t linesize;
 static FILE *exp_file;
 
 /*
  * Get the export list from one, currently open file
  */
 static void
 get_exportlist_one(void)
 {
 	struct exportlist *ep;
 	struct grouplist *grp, *tgrp;
 	struct dirlist *dirhead;
 	struct statfs fsb;
 	struct xucred anon;
 	char *cp, *endcp, *dirp, *hst, *usr, *dom, savedc;
 	int len, has_host, exflags, got_nondir, dirplen, netgrp;
 
 	v4root_phase = 0;
 	dirhead = (struct dirlist *)NULL;
 	while (get_line()) {
 		if (debug)
 			warnx("got line %s", line);
 		cp = line;
 		nextfield(&cp, &endcp);
 		if (*cp == '#')
 			goto nextline;
 
 		/*
 		 * Set defaults.
 		 */
 		has_host = FALSE;
 		anon = def_anon;
 		exflags = MNT_EXPORTED;
 		got_nondir = 0;
 		opt_flags = 0;
 		ep = (struct exportlist *)NULL;
 		dirp = NULL;
 
 		/*
 		 * Handle the V4 root dir.
 		 */
 		if (*cp == 'V' && *(cp + 1) == '4' && *(cp + 2) == ':') {
 			/*
 			 * V4: just indicates that it is the v4 root point,
 			 * so skip over that and set v4root_phase.
 			 */
 			if (v4root_phase > 0) {
 				syslog(LOG_ERR, "V4:duplicate line, ignored");
 				goto nextline;
 			}
 			v4root_phase = 1;
 			cp += 3;
 			nextfield(&cp, &endcp);
 		}
 
 		/*
 		 * Create new exports list entry
 		 */
 		len = endcp-cp;
 		tgrp = grp = get_grp();
 		while (len > 0) {
 			if (len > MNTNAMLEN) {
 			    getexp_err(ep, tgrp, "mountpoint too long");
 			    goto nextline;
 			}
 			if (*cp == '-') {
 			    if (ep == (struct exportlist *)NULL) {
 				getexp_err(ep, tgrp,
 				    "flag before export path definition");
 				goto nextline;
 			    }
 			    if (debug)
 				warnx("doing opt %s", cp);
 			    got_nondir = 1;
 			    if (do_opt(&cp, &endcp, ep, grp, &has_host,
 				&exflags, &anon)) {
 				getexp_err(ep, tgrp, NULL);
 				goto nextline;
 			    }
 			} else if (*cp == '/') {
 			    savedc = *endcp;
 			    *endcp = '\0';
 			    if (v4root_phase > 1) {
 				    if (dirp != NULL) {
 					getexp_err(ep, tgrp, "Multiple V4 dirs");
 					goto nextline;
 				    }
 			    }
 			    if (check_dirpath(cp) &&
 				statfs(cp, &fsb) >= 0) {
 				if ((fsb.f_flags & MNT_AUTOMOUNTED) != 0)
 				    syslog(LOG_ERR, "Warning: exporting of "
 					"automounted fs %s not supported", cp);
 				if (got_nondir) {
 				    getexp_err(ep, tgrp, "dirs must be first");
 				    goto nextline;
 				}
 				if (v4root_phase == 1) {
 				    if (dirp != NULL) {
 					getexp_err(ep, tgrp, "Multiple V4 dirs");
 					goto nextline;
 				    }
 				    if (strlen(v4root_dirpath) == 0) {
 					strlcpy(v4root_dirpath, cp,
 					    sizeof (v4root_dirpath));
 				    } else if (strcmp(v4root_dirpath, cp)
 					!= 0) {
 					syslog(LOG_ERR,
 					    "different V4 dirpath %s", cp);
 					getexp_err(ep, tgrp, NULL);
 					goto nextline;
 				    }
 				    dirp = cp;
 				    v4root_phase = 2;
 				    got_nondir = 1;
 				    ep = get_exp();
 				} else {
 				    if (ep) {
 					if (ep->ex_fs.val[0] !=
 					    fsb.f_fsid.val[0] ||
 					    ep->ex_fs.val[1] !=
 					    fsb.f_fsid.val[1]) {
 						getexp_err(ep, tgrp,
 						    "fsid mismatch");
 						goto nextline;
 					}
 				    } else {
 					/*
 					 * See if this directory is already
 					 * in the list.
 					 */
 					ep = ex_search(&fsb.f_fsid, &exphead);
 					if (ep == (struct exportlist *)NULL) {
 					    ep = get_exp();
 					    ep->ex_fs = fsb.f_fsid;
 					    ep->ex_fsdir = strdup(fsb.f_mntonname);
 					    if (ep->ex_fsdir == NULL)
 						out_of_mem();
 					    if (debug)
 						warnx(
 						  "making new ep fs=0x%x,0x%x",
 						  fsb.f_fsid.val[0],
 						  fsb.f_fsid.val[1]);
 					} else if (debug)
 					    warnx("found ep fs=0x%x,0x%x",
 						fsb.f_fsid.val[0],
 						fsb.f_fsid.val[1]);
 				    }
 
 				    /*
 				     * Add dirpath to export mount point.
 				     */
 				    dirp = add_expdir(&dirhead, cp, len);
 				    dirplen = len;
 				}
 			    } else {
 				getexp_err(ep, tgrp,
 				    "symbolic link in export path or statfs failed");
 				goto nextline;
 			    }
 			    *endcp = savedc;
 			} else {
 			    savedc = *endcp;
 			    *endcp = '\0';
 			    got_nondir = 1;
 			    if (ep == (struct exportlist *)NULL) {
 				getexp_err(ep, tgrp,
 				    "host(s) before export path definition");
 				goto nextline;
 			    }
 
 			    /*
 			     * Get the host or netgroup.
 			     */
 			    setnetgrent(cp);
 			    netgrp = getnetgrent(&hst, &usr, &dom);
 			    do {
 				if (has_host) {
 				    grp->gr_next = get_grp();
 				    grp = grp->gr_next;
 				}
 				if (netgrp) {
 				    if (hst == 0) {
 					syslog(LOG_ERR,
 				"null hostname in netgroup %s, skipping", cp);
 					grp->gr_type = GT_IGNORE;
 				    } else if (get_host(hst, grp, tgrp)) {
 					syslog(LOG_ERR,
 			"bad host %s in netgroup %s, skipping", hst, cp);
 					grp->gr_type = GT_IGNORE;
 				    }
 				} else if (get_host(cp, grp, tgrp)) {
 				    syslog(LOG_ERR, "bad host %s, skipping", cp);
 				    grp->gr_type = GT_IGNORE;
 				}
 				has_host = TRUE;
 			    } while (netgrp && getnetgrent(&hst, &usr, &dom));
 			    endnetgrent();
 			    *endcp = savedc;
 			}
 			cp = endcp;
 			nextfield(&cp, &endcp);
 			len = endcp - cp;
 		}
 		if (check_options(dirhead)) {
 			getexp_err(ep, tgrp, NULL);
 			goto nextline;
 		}
 		if (!has_host) {
 			grp->gr_type = GT_DEFAULT;
 			if (debug)
 				warnx("adding a default entry");
 
 		/*
 		 * Don't allow a network export coincide with a list of
 		 * host(s) on the same line.
 		 */
 		} else if ((opt_flags & OP_NET) && tgrp->gr_next) {
 			getexp_err(ep, tgrp, "network/host conflict");
 			goto nextline;
 
 		/*
 		 * If an export list was specified on this line, make sure
 		 * that we have at least one valid entry, otherwise skip it.
 		 */
 		} else {
 			grp = tgrp;
 			while (grp && grp->gr_type == GT_IGNORE)
 				grp = grp->gr_next;
 			if (! grp) {
 			    getexp_err(ep, tgrp, "no valid entries");
 			    goto nextline;
 			}
 		}
 
 		if (v4root_phase == 1) {
 			getexp_err(ep, tgrp, "V4:root, no dirp, ignored");
 			goto nextline;
 		}
 
 		/*
 		 * Loop through hosts, pushing the exports into the kernel.
 		 * After loop, tgrp points to the start of the list and
 		 * grp points to the last entry in the list.
 		 */
 		grp = tgrp;
 		do {
 			if (do_mount(ep, grp, exflags, &anon, dirp, dirplen,
 			    &fsb)) {
 				getexp_err(ep, tgrp, NULL);
 				goto nextline;
 			}
 		} while (grp->gr_next && (grp = grp->gr_next));
 
 		/*
 		 * For V4: don't enter in mount lists.
 		 */
 		if (v4root_phase > 0 && v4root_phase <= 2) {
 			/*
 			 * Since these structures aren't used by mountd,
 			 * free them up now.
 			 */
 			if (ep != NULL)
 				free_exp(ep);
 			while (tgrp != NULL) {
 				grp = tgrp;
 				tgrp = tgrp->gr_next;
 				free_grp(grp);
 			}
 			goto nextline;
 		}
 
 		/*
 		 * Success. Update the data structures.
 		 */
 		if (has_host) {
 			hang_dirp(dirhead, tgrp, ep, opt_flags);
-			grp->gr_next = grphead;
-			grphead = tgrp;
+			grp->gr_next = ep->ex_grphead;
+			ep->ex_grphead = tgrp;
 		} else {
 			hang_dirp(dirhead, (struct grouplist *)NULL, ep,
 				opt_flags);
 			free_grp(grp);
 		}
 		dirhead = (struct dirlist *)NULL;
 		if ((ep->ex_flag & EX_LINKED) == 0) {
 			insert_exports(ep, &exphead);
 
 			ep->ex_flag |= EX_LINKED;
 		}
 nextline:
 		v4root_phase = 0;
 		if (dirhead) {
 			free_dir(dirhead);
 			dirhead = (struct dirlist *)NULL;
 		}
 	}
 }
 
 /*
  * Get the export list from all specified files
  */
 static void
 get_exportlist(void)
 {
-	struct grouplist *grp, *tgrp;
 	struct export_args export;
 	struct iovec *iov;
 	struct statfs *mntbufp;
 	char errmsg[255];
 	int num, i;
 	int iovlen;
 	struct nfsex_args eargs;
 
 	if (suspend_nfsd != 0)
 		(void)nfssvc(NFSSVC_SUSPENDNFSD, NULL);
 	v4root_dirpath[0] = '\0';
 	bzero(&export, sizeof(export));
 	export.ex_flags = MNT_DELEXPORT;
 	iov = NULL;
 	iovlen = 0;
 	bzero(errmsg, sizeof(errmsg));
 
 	/*
 	 * First, get rid of the old list
 	 */
 	free_exports(&exphead);
 
-	grp = grphead;
-	while (grp) {
-		tgrp = grp;
-		grp = grp->gr_next;
-		free_grp(tgrp);
-	}
-	grphead = (struct grouplist *)NULL;
-
 	/*
 	 * and the old V4 root dir.
 	 */
 	bzero(&eargs, sizeof (eargs));
 	eargs.export.ex_flags = MNT_DELEXPORT;
 	if (nfssvc(NFSSVC_V4ROOTEXPORT, (caddr_t)&eargs) < 0 &&
 	    errno != ENOENT)
 		syslog(LOG_ERR, "Can't delete exports for V4:");
 
 	/*
 	 * and clear flag that notes if a public fh has been exported.
 	 */
 	has_publicfh = 0;
 
 	/*
 	 * And delete exports that are in the kernel for all local
 	 * filesystems.
 	 * XXX: Should know how to handle all local exportable filesystems.
 	 */
 	num = getmntinfo(&mntbufp, MNT_NOWAIT);
 
 	if (num > 0) {
 		build_iovec(&iov, &iovlen, "fstype", NULL, 0);
 		build_iovec(&iov, &iovlen, "fspath", NULL, 0);
 		build_iovec(&iov, &iovlen, "from", NULL, 0);
 		build_iovec(&iov, &iovlen, "update", NULL, 0);
 		build_iovec(&iov, &iovlen, "export", &export, sizeof(export));
 		build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg));
 	}
 
 	for (i = 0; i < num; i++)
 		delete_export(iov, iovlen, &mntbufp[i], errmsg);
 
 	if (iov != NULL) {
 		/* Free strings allocated by strdup() in getmntopts.c */
 		free(iov[0].iov_base); /* fstype */
 		free(iov[2].iov_base); /* fspath */
 		free(iov[4].iov_base); /* from */
 		free(iov[6].iov_base); /* update */
 		free(iov[8].iov_base); /* export */
 		free(iov[10].iov_base); /* errmsg */
 
 		/* free iov, allocated by realloc() */
 		free(iov);
 		iovlen = 0;
 	}
 
 	read_exportfile();
 
 	/*
 	 * If there was no public fh, clear any previous one set.
 	 */
 	if (has_publicfh == 0)
 		(void) nfssvc(NFSSVC_NOPUBLICFH, NULL);
 
 	/* Resume the nfsd. If they weren't suspended, this is harmless. */
 	(void)nfssvc(NFSSVC_RESUMENFSD, NULL);
 }
 
 /*
  * Insert an export entry in the appropriate list.
  */
 static void
 insert_exports(struct exportlist *ep, struct exportlisthead *exhp)
 {
 
 	SLIST_INSERT_HEAD(exhp, ep, entries);
 }
 
 /*
  * Free up the exports lists passed in as arguments.
  */
 static void
 free_exports(struct exportlisthead *exhp)
 {
 	struct exportlist *ep, *ep2;
 
 	SLIST_FOREACH_SAFE(ep, exhp, entries, ep2) {
 		SLIST_REMOVE(exhp, ep, exportlist, entries);
 		free_exp(ep);
 	}
 	SLIST_INIT(exhp);
 }
 
 /*
  * Read the exports file(s) and call get_exportlist_one() for each line.
  */
 static void
 read_exportfile(void)
 {
 	int done, i;
 
 	/*
 	 * Read in the exports file and build the list, calling
 	 * nmount() as we go along to push the export rules into the kernel.
 	 */
 	done = 0;
 	for (i = 0; exnames[i] != NULL; i++) {
 		if (debug)
 			warnx("reading exports from %s", exnames[i]);
 		if ((exp_file = fopen(exnames[i], "r")) == NULL) {
 			syslog(LOG_WARNING, "can't open %s", exnames[i]);
 			continue;
 		}
 		get_exportlist_one();
 		fclose(exp_file);
 		done++;
 	}
 	if (done == 0) {
 		syslog(LOG_ERR, "can't open any exports file");
 		exit(2);
 	}
 }
 
 /*
  * Delete an exports entry.
  */
 static void
 delete_export(struct iovec *iov, int iovlen, struct statfs *fsp, char *errmsg)
 {
 	struct xvfsconf vfc;
 
 	if (getvfsbyname(fsp->f_fstypename, &vfc) != 0) {
 		syslog(LOG_ERR, "getvfsbyname() failed for %s",
 		    fsp->f_fstypename);
 		return;
 	}
 	
 	/*
 	 * We do not need to delete "export" flag from
 	 * filesystems that do not have it set.
 	 */
 	if (!(fsp->f_flags & MNT_EXPORTED))
 		return;
 	/*
 	 * Do not delete export for network filesystem by
 	 * passing "export" arg to nmount().
 	 * It only makes sense to do this for local filesystems.
 	 */
 	if (vfc.vfc_flags & VFCF_NETWORK)
 		return;
 	
 	iov[1].iov_base = fsp->f_fstypename;
 	iov[1].iov_len = strlen(fsp->f_fstypename) + 1;
 	iov[3].iov_base = fsp->f_mntonname;
 	iov[3].iov_len = strlen(fsp->f_mntonname) + 1;
 	iov[5].iov_base = fsp->f_mntfromname;
 	iov[5].iov_len = strlen(fsp->f_mntfromname) + 1;
 	errmsg[0] = '\0';
 	
 	/*
 	 * EXDEV is returned when path exists but is not a
 	 * mount point.  May happens if raced with unmount.
 	 */
 	if (nmount(iov, iovlen, fsp->f_flags) < 0 && errno != ENOENT &&
 	    errno != ENOTSUP && errno != EXDEV) {
 		syslog(LOG_ERR,
 		    "can't delete exports for %s: %m %s",
 		    fsp->f_mntonname, errmsg);
 	}
 }
 
 /*
  * Allocate an export list element
  */
 static struct exportlist *
 get_exp(void)
 {
 	struct exportlist *ep;
 
 	ep = (struct exportlist *)calloc(1, sizeof (struct exportlist));
 	if (ep == (struct exportlist *)NULL)
 		out_of_mem();
 	return (ep);
 }
 
 /*
  * Allocate a group list element
  */
 static struct grouplist *
 get_grp(void)
 {
 	struct grouplist *gp;
 
 	gp = (struct grouplist *)calloc(1, sizeof (struct grouplist));
 	if (gp == (struct grouplist *)NULL)
 		out_of_mem();
 	return (gp);
 }
 
 /*
  * Clean up upon an error in get_exportlist().
  */
 static void
 getexp_err(struct exportlist *ep, struct grouplist *grp, const char *reason)
 {
 	struct grouplist *tgrp;
 
 	if (!(opt_flags & OP_QUIET)) {
 		if (reason != NULL)
 			syslog(LOG_ERR, "bad exports list line '%s': %s", line,
 			    reason);
 		else
 			syslog(LOG_ERR, "bad exports list line '%s'", line);
 	}
 	if (ep && (ep->ex_flag & EX_LINKED) == 0)
 		free_exp(ep);
 	while (grp) {
 		tgrp = grp;
 		grp = grp->gr_next;
 		free_grp(tgrp);
 	}
 }
 
 /*
  * Search the export list for a matching fs.
  */
 static struct exportlist *
 ex_search(fsid_t *fsid, struct exportlisthead *exhp)
 {
 	struct exportlist *ep;
 
 	SLIST_FOREACH(ep, exhp, entries) {
 		if (ep->ex_fs.val[0] == fsid->val[0] &&
 		    ep->ex_fs.val[1] == fsid->val[1])
 			return (ep);
 	}
 
 	return (ep);
 }
 
 /*
  * Add a directory path to the list.
  */
 static char *
 add_expdir(struct dirlist **dpp, char *cp, int len)
 {
 	struct dirlist *dp;
 
 	dp = malloc(sizeof (struct dirlist));
 	if (dp == (struct dirlist *)NULL)
 		out_of_mem();
 	dp->dp_left = *dpp;
 	dp->dp_right = (struct dirlist *)NULL;
 	dp->dp_flag = 0;
 	dp->dp_hosts = (struct hostlist *)NULL;
 	dp->dp_dirp = strndup(cp, len);
 	if (dp->dp_dirp == NULL)
 		out_of_mem();
 	*dpp = dp;
 	return (dp->dp_dirp);
 }
 
 /*
  * Hang the dir list element off the dirpath binary tree as required
  * and update the entry for host.
  */
 static void
 hang_dirp(struct dirlist *dp, struct grouplist *grp, struct exportlist *ep,
 	int flags)
 {
 	struct hostlist *hp;
 	struct dirlist *dp2;
 
 	if (flags & OP_ALLDIRS) {
 		if (ep->ex_defdir)
 			free((caddr_t)dp);
 		else
 			ep->ex_defdir = dp;
 		if (grp == (struct grouplist *)NULL) {
 			ep->ex_defdir->dp_flag |= DP_DEFSET;
 			/* Save the default security flavors list. */
 			ep->ex_defnumsecflavors = ep->ex_numsecflavors;
 			if (ep->ex_numsecflavors > 0)
 				memcpy(ep->ex_defsecflavors, ep->ex_secflavors,
 				    sizeof(ep->ex_secflavors));
 		} else while (grp) {
 			hp = get_ht();
 			hp->ht_grp = grp;
 			hp->ht_next = ep->ex_defdir->dp_hosts;
 			ep->ex_defdir->dp_hosts = hp;
 			/* Save the security flavors list for this host set. */
 			grp->gr_numsecflavors = ep->ex_numsecflavors;
 			if (ep->ex_numsecflavors > 0)
 				memcpy(grp->gr_secflavors, ep->ex_secflavors,
 				    sizeof(ep->ex_secflavors));
 			grp = grp->gr_next;
 		}
 	} else {
 
 		/*
 		 * Loop through the directories adding them to the tree.
 		 */
 		while (dp) {
 			dp2 = dp->dp_left;
 			add_dlist(&ep->ex_dirl, dp, grp, flags, ep);
 			dp = dp2;
 		}
 	}
 }
 
 /*
  * Traverse the binary tree either updating a node that is already there
  * for the new directory or adding the new node.
  */
 static void
 add_dlist(struct dirlist **dpp, struct dirlist *newdp, struct grouplist *grp,
 	int flags, struct exportlist *ep)
 {
 	struct dirlist *dp;
 	struct hostlist *hp;
 	int cmp;
 
 	dp = *dpp;
 	if (dp) {
 		cmp = strcmp(dp->dp_dirp, newdp->dp_dirp);
 		if (cmp > 0) {
 			add_dlist(&dp->dp_left, newdp, grp, flags, ep);
 			return;
 		} else if (cmp < 0) {
 			add_dlist(&dp->dp_right, newdp, grp, flags, ep);
 			return;
 		} else
 			free((caddr_t)newdp);
 	} else {
 		dp = newdp;
 		dp->dp_left = (struct dirlist *)NULL;
 		*dpp = dp;
 	}
 	if (grp) {
 
 		/*
 		 * Hang all of the host(s) off of the directory point.
 		 */
 		do {
 			hp = get_ht();
 			hp->ht_grp = grp;
 			hp->ht_next = dp->dp_hosts;
 			dp->dp_hosts = hp;
 			/* Save the security flavors list for this host set. */
 			grp->gr_numsecflavors = ep->ex_numsecflavors;
 			if (ep->ex_numsecflavors > 0)
 				memcpy(grp->gr_secflavors, ep->ex_secflavors,
 				    sizeof(ep->ex_secflavors));
 			grp = grp->gr_next;
 		} while (grp);
 	} else {
 		dp->dp_flag |= DP_DEFSET;
 		/* Save the default security flavors list. */
 		ep->ex_defnumsecflavors = ep->ex_numsecflavors;
 		if (ep->ex_numsecflavors > 0)
 			memcpy(ep->ex_defsecflavors, ep->ex_secflavors,
 			    sizeof(ep->ex_secflavors));
 	}
 }
 
 /*
  * Search for a dirpath on the export point.
  */
 static struct dirlist *
 dirp_search(struct dirlist *dp, char *dirp)
 {
 	int cmp;
 
 	if (dp) {
 		cmp = strcmp(dp->dp_dirp, dirp);
 		if (cmp > 0)
 			return (dirp_search(dp->dp_left, dirp));
 		else if (cmp < 0)
 			return (dirp_search(dp->dp_right, dirp));
 		else
 			return (dp);
 	}
 	return (dp);
 }
 
 /*
  * Scan for a host match in a directory tree.
  */
 static int
 chk_host(struct dirlist *dp, struct sockaddr *saddr, int *defsetp,
 	int *hostsetp, int *numsecflavors, int **secflavorsp)
 {
 	struct hostlist *hp;
 	struct grouplist *grp;
 	struct addrinfo *ai;
 
 	if (dp) {
 		if (dp->dp_flag & DP_DEFSET)
 			*defsetp = dp->dp_flag;
 		hp = dp->dp_hosts;
 		while (hp) {
 			grp = hp->ht_grp;
 			switch (grp->gr_type) {
 			case GT_HOST:
 				ai = grp->gr_ptr.gt_addrinfo;
 				for (; ai; ai = ai->ai_next) {
 					if (!sacmp(ai->ai_addr, saddr, NULL)) {
 						*hostsetp =
 						    (hp->ht_flag | DP_HOSTSET);
 						if (numsecflavors != NULL) {
 							*numsecflavors =
 							    grp->gr_numsecflavors;
 							*secflavorsp =
 							    grp->gr_secflavors;
 						}
 						return (1);
 					}
 				}
 				break;
 			case GT_NET:
 				if (!sacmp(saddr, (struct sockaddr *)
 				    &grp->gr_ptr.gt_net.nt_net,
 				    (struct sockaddr *)
 				    &grp->gr_ptr.gt_net.nt_mask)) {
 					*hostsetp = (hp->ht_flag | DP_HOSTSET);
 					if (numsecflavors != NULL) {
 						*numsecflavors =
 						    grp->gr_numsecflavors;
 						*secflavorsp =
 						    grp->gr_secflavors;
 					}
 					return (1);
 				}
 				break;
 			}
 			hp = hp->ht_next;
 		}
 	}
 	return (0);
 }
 
 /*
  * Scan tree for a host that matches the address.
  */
 static int
 scan_tree(struct dirlist *dp, struct sockaddr *saddr)
 {
 	int defset, hostset;
 
 	if (dp) {
 		if (scan_tree(dp->dp_left, saddr))
 			return (1);
 		if (chk_host(dp, saddr, &defset, &hostset, NULL, NULL))
 			return (1);
 		if (scan_tree(dp->dp_right, saddr))
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Traverse the dirlist tree and free it up.
  */
 static void
 free_dir(struct dirlist *dp)
 {
 
 	if (dp) {
 		free_dir(dp->dp_left);
 		free_dir(dp->dp_right);
 		free_host(dp->dp_hosts);
 		free(dp->dp_dirp);
 		free(dp);
 	}
 }
 
 /*
  * Parse a colon separated list of security flavors
  */
 static int
 parsesec(char *seclist, struct exportlist *ep)
 {
 	char *cp, savedc;
 	int flavor;
 
 	ep->ex_numsecflavors = 0;
 	for (;;) {
 		cp = strchr(seclist, ':');
 		if (cp) {
 			savedc = *cp;
 			*cp = '\0';
 		}
 
 		if (!strcmp(seclist, "sys"))
 			flavor = AUTH_SYS;
 		else if (!strcmp(seclist, "krb5"))
 			flavor = RPCSEC_GSS_KRB5;
 		else if (!strcmp(seclist, "krb5i"))
 			flavor = RPCSEC_GSS_KRB5I;
 		else if (!strcmp(seclist, "krb5p"))
 			flavor = RPCSEC_GSS_KRB5P;
 		else {
 			if (cp)
 				*cp = savedc;
 			syslog(LOG_ERR, "bad sec flavor: %s", seclist);
 			return (1);
 		}
 		if (ep->ex_numsecflavors == MAXSECFLAVORS) {
 			if (cp)
 				*cp = savedc;
 			syslog(LOG_ERR, "too many sec flavors: %s", seclist);
 			return (1);
 		}
 		ep->ex_secflavors[ep->ex_numsecflavors] = flavor;
 		ep->ex_numsecflavors++;
 		if (cp) {
 			*cp = savedc;
 			seclist = cp + 1;
 		} else {
 			break;
 		}
 	}
 	return (0);
 }
 
 /*
  * Parse the option string and update fields.
  * Option arguments may either be -<option>=<value> or
  * -<option> <value>
  */
 static int
 do_opt(char **cpp, char **endcpp, struct exportlist *ep, struct grouplist *grp,
 	int *has_hostp, int *exflagsp, struct xucred *cr)
 {
 	char *cpoptarg, *cpoptend;
 	char *cp, *endcp, *cpopt, savedc, savedc2;
 	int allflag, usedarg;
 
 	savedc2 = '\0';
 	cpopt = *cpp;
 	cpopt++;
 	cp = *endcpp;
 	savedc = *cp;
 	*cp = '\0';
 	while (cpopt && *cpopt) {
 		allflag = 1;
 		usedarg = -2;
 		if ((cpoptend = strchr(cpopt, ','))) {
 			*cpoptend++ = '\0';
 			if ((cpoptarg = strchr(cpopt, '=')))
 				*cpoptarg++ = '\0';
 		} else {
 			if ((cpoptarg = strchr(cpopt, '=')))
 				*cpoptarg++ = '\0';
 			else {
 				*cp = savedc;
 				nextfield(&cp, &endcp);
 				**endcpp = '\0';
 				if (endcp > cp && *cp != '-') {
 					cpoptarg = cp;
 					savedc2 = *endcp;
 					*endcp = '\0';
 					usedarg = 0;
 				}
 			}
 		}
 		if (!strcmp(cpopt, "ro") || !strcmp(cpopt, "o")) {
 			*exflagsp |= MNT_EXRDONLY;
 		} else if (cpoptarg && (!strcmp(cpopt, "maproot") ||
 		    !(allflag = strcmp(cpopt, "mapall")) ||
 		    !strcmp(cpopt, "root") || !strcmp(cpopt, "r"))) {
 			usedarg++;
 			parsecred(cpoptarg, cr);
 			if (allflag == 0) {
 				*exflagsp |= MNT_EXPORTANON;
 				opt_flags |= OP_MAPALL;
 			} else
 				opt_flags |= OP_MAPROOT;
 		} else if (cpoptarg && (!strcmp(cpopt, "mask") ||
 		    !strcmp(cpopt, "m"))) {
 			if (get_net(cpoptarg, &grp->gr_ptr.gt_net, 1)) {
 				syslog(LOG_ERR, "bad mask: %s", cpoptarg);
 				return (1);
 			}
 			usedarg++;
 			opt_flags |= OP_MASK;
 		} else if (cpoptarg && (!strcmp(cpopt, "network") ||
 			!strcmp(cpopt, "n"))) {
 			if (strchr(cpoptarg, '/') != NULL) {
 				if (debug)
 					fprintf(stderr, "setting OP_MASKLEN\n");
 				opt_flags |= OP_MASKLEN;
 			}
 			if (grp->gr_type != GT_NULL) {
 				syslog(LOG_ERR, "network/host conflict");
 				return (1);
 			} else if (get_net(cpoptarg, &grp->gr_ptr.gt_net, 0)) {
 				syslog(LOG_ERR, "bad net: %s", cpoptarg);
 				return (1);
 			}
 			grp->gr_type = GT_NET;
 			*has_hostp = 1;
 			usedarg++;
 			opt_flags |= OP_NET;
 		} else if (!strcmp(cpopt, "alldirs")) {
 			opt_flags |= OP_ALLDIRS;
 		} else if (!strcmp(cpopt, "public")) {
 			*exflagsp |= MNT_EXPUBLIC;
 		} else if (!strcmp(cpopt, "webnfs")) {
 			*exflagsp |= (MNT_EXPUBLIC|MNT_EXRDONLY|MNT_EXPORTANON);
 			opt_flags |= OP_MAPALL;
 		} else if (cpoptarg && !strcmp(cpopt, "index")) {
 			ep->ex_indexfile = strdup(cpoptarg);
 		} else if (!strcmp(cpopt, "quiet")) {
 			opt_flags |= OP_QUIET;
 		} else if (cpoptarg && !strcmp(cpopt, "sec")) {
 			if (parsesec(cpoptarg, ep))
 				return (1);
 			opt_flags |= OP_SEC;
 			usedarg++;
 		} else {
 			syslog(LOG_ERR, "bad opt %s", cpopt);
 			return (1);
 		}
 		if (usedarg >= 0) {
 			*endcp = savedc2;
 			**endcpp = savedc;
 			if (usedarg > 0) {
 				*cpp = cp;
 				*endcpp = endcp;
 			}
 			return (0);
 		}
 		cpopt = cpoptend;
 	}
 	**endcpp = savedc;
 	return (0);
 }
 
 /*
  * Translate a character string to the corresponding list of network
  * addresses for a hostname.
  */
 static int
 get_host(char *cp, struct grouplist *grp, struct grouplist *tgrp)
 {
 	struct grouplist *checkgrp;
 	struct addrinfo *ai, *tai, hints;
 	int ecode;
 	char host[NI_MAXHOST];
 
 	if (grp->gr_type != GT_NULL) {
 		syslog(LOG_ERR, "Bad netgroup type for ip host %s", cp);
 		return (1);
 	}
 	memset(&hints, 0, sizeof hints);
 	hints.ai_flags = AI_CANONNAME;
 	hints.ai_protocol = IPPROTO_UDP;
 	ecode = getaddrinfo(cp, NULL, &hints, &ai);
 	if (ecode != 0) {
 		syslog(LOG_ERR,"can't get address info for host %s", cp);
 		return 1;
 	}
 	grp->gr_ptr.gt_addrinfo = ai;
 	while (ai != NULL) {
 		if (ai->ai_canonname == NULL) {
 			if (getnameinfo(ai->ai_addr, ai->ai_addrlen, host,
 			    sizeof host, NULL, 0, NI_NUMERICHOST) != 0)
 				strlcpy(host, "?", sizeof(host));
 			ai->ai_canonname = strdup(host);
 			ai->ai_flags |= AI_CANONNAME;
 		}
 		if (debug)
 			fprintf(stderr, "got host %s\n", ai->ai_canonname);
 		/*
 		 * Sanity check: make sure we don't already have an entry
 		 * for this host in the grouplist.
 		 */
 		for (checkgrp = tgrp; checkgrp != NULL;
 		    checkgrp = checkgrp->gr_next) {
 			if (checkgrp->gr_type != GT_HOST)
 				continue;
 			for (tai = checkgrp->gr_ptr.gt_addrinfo; tai != NULL;
 			    tai = tai->ai_next) {
 				if (sacmp(tai->ai_addr, ai->ai_addr, NULL) != 0)
 					continue;
 				if (debug)
 					fprintf(stderr,
 					    "ignoring duplicate host %s\n",
 					    ai->ai_canonname);
 				grp->gr_type = GT_IGNORE;
 				return (0);
 			}
 		}
 		ai = ai->ai_next;
 	}
 	grp->gr_type = GT_HOST;
 	return (0);
 }
 
 /*
  * Free up an exports list component
  */
 static void
 free_exp(struct exportlist *ep)
 {
+	struct grouplist *grp, *tgrp;
 
 	if (ep->ex_defdir) {
 		free_host(ep->ex_defdir->dp_hosts);
 		free((caddr_t)ep->ex_defdir);
 	}
 	if (ep->ex_fsdir)
 		free(ep->ex_fsdir);
 	if (ep->ex_indexfile)
 		free(ep->ex_indexfile);
 	free_dir(ep->ex_dirl);
+	grp = ep->ex_grphead;
+	while (grp) {
+		tgrp = grp;
+		grp = grp->gr_next;
+		free_grp(tgrp);
+	}
 	free((caddr_t)ep);
 }
 
 /*
  * Free hosts.
  */
 static void
 free_host(struct hostlist *hp)
 {
 	struct hostlist *hp2;
 
 	while (hp) {
 		hp2 = hp;
 		hp = hp->ht_next;
 		free((caddr_t)hp2);
 	}
 }
 
 static struct hostlist *
 get_ht(void)
 {
 	struct hostlist *hp;
 
 	hp = (struct hostlist *)malloc(sizeof (struct hostlist));
 	if (hp == (struct hostlist *)NULL)
 		out_of_mem();
 	hp->ht_next = (struct hostlist *)NULL;
 	hp->ht_flag = 0;
 	return (hp);
 }
 
 /*
  * Out of memory, fatal
  */
 static void
 out_of_mem(void)
 {
 
 	syslog(LOG_ERR, "out of memory");
 	exit(2);
 }
 
 /*
  * Do the nmount() syscall with the update flag to push the export info into
  * the kernel.
  */
 static int
 do_mount(struct exportlist *ep, struct grouplist *grp, int exflags,
     struct xucred *anoncrp, char *dirp, int dirplen, struct statfs *fsb)
 {
 	struct statfs fsb1;
 	struct addrinfo *ai;
 	struct export_args *eap;
 	char errmsg[255];
 	char *cp;
 	int done;
 	char savedc;
 	struct iovec *iov;
 	int i, iovlen;
 	int ret;
 	struct nfsex_args nfsea;
 
 	eap = &nfsea.export;
 
 	cp = NULL;
 	savedc = '\0';
 	iov = NULL;
 	iovlen = 0;
 	ret = 0;
 
 	bzero(eap, sizeof (struct export_args));
 	bzero(errmsg, sizeof(errmsg));
 	eap->ex_flags = exflags;
 	eap->ex_anon = *anoncrp;
 	eap->ex_indexfile = ep->ex_indexfile;
 	if (grp->gr_type == GT_HOST)
 		ai = grp->gr_ptr.gt_addrinfo;
 	else
 		ai = NULL;
 	eap->ex_numsecflavors = ep->ex_numsecflavors;
 	for (i = 0; i < eap->ex_numsecflavors; i++)
 		eap->ex_secflavors[i] = ep->ex_secflavors[i];
 	if (eap->ex_numsecflavors == 0) {
 		eap->ex_numsecflavors = 1;
 		eap->ex_secflavors[0] = AUTH_SYS;
 	}
 	done = FALSE;
 
 	if (v4root_phase == 0) {
 		build_iovec(&iov, &iovlen, "fstype", NULL, 0);
 		build_iovec(&iov, &iovlen, "fspath", NULL, 0);
 		build_iovec(&iov, &iovlen, "from", NULL, 0);
 		build_iovec(&iov, &iovlen, "update", NULL, 0);
 		build_iovec(&iov, &iovlen, "export", eap,
 		    sizeof (struct export_args));
 		build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg));
 	}
 
 	while (!done) {
 		switch (grp->gr_type) {
 		case GT_HOST:
 			if (ai->ai_addr->sa_family == AF_INET6 && have_v6 == 0)
 				goto skip;
 			eap->ex_addr = ai->ai_addr;
 			eap->ex_addrlen = ai->ai_addrlen;
 			eap->ex_masklen = 0;
 			break;
 		case GT_NET:
 			if (grp->gr_ptr.gt_net.nt_net.ss_family == AF_INET6 &&
 			    have_v6 == 0)
 				goto skip;
 			eap->ex_addr =
 			    (struct sockaddr *)&grp->gr_ptr.gt_net.nt_net;
 			eap->ex_addrlen =
 			    ((struct sockaddr *)&grp->gr_ptr.gt_net.nt_net)->sa_len;
 			eap->ex_mask =
 			    (struct sockaddr *)&grp->gr_ptr.gt_net.nt_mask;
 			eap->ex_masklen = ((struct sockaddr *)&grp->gr_ptr.gt_net.nt_mask)->sa_len;
 			break;
 		case GT_DEFAULT:
 			eap->ex_addr = NULL;
 			eap->ex_addrlen = 0;
 			eap->ex_mask = NULL;
 			eap->ex_masklen = 0;
 			break;
 		case GT_IGNORE:
 			ret = 0;
 			goto error_exit;
 			break;
 		default:
 			syslog(LOG_ERR, "bad grouptype");
 			if (cp)
 				*cp = savedc;
 			ret = 1;
 			goto error_exit;
 		}
 
 		/*
 		 * For V4:, use the nfssvc() syscall, instead of mount().
 		 */
 		if (v4root_phase == 2) {
 			nfsea.fspec = v4root_dirpath;
 			if (nfssvc(NFSSVC_V4ROOTEXPORT, (caddr_t)&nfsea) < 0) {
 				syslog(LOG_ERR, "Exporting V4: failed");
 				return (2);
 			}
 		} else {
 			/*
 			 * XXX:
 			 * Maybe I should just use the fsb->f_mntonname path
 			 * instead of looping back up the dirp to the mount
 			 * point??
 			 * Also, needs to know how to export all types of local
 			 * exportable filesystems and not just "ufs".
 			 */
 			iov[1].iov_base = fsb->f_fstypename; /* "fstype" */
 			iov[1].iov_len = strlen(fsb->f_fstypename) + 1;
 			iov[3].iov_base = fsb->f_mntonname; /* "fspath" */
 			iov[3].iov_len = strlen(fsb->f_mntonname) + 1;
 			iov[5].iov_base = fsb->f_mntfromname; /* "from" */
 			iov[5].iov_len = strlen(fsb->f_mntfromname) + 1;
 			errmsg[0] = '\0';
 	
 			while (nmount(iov, iovlen, fsb->f_flags) < 0) {
 				if (cp)
 					*cp-- = savedc;
 				else
 					cp = dirp + dirplen - 1;
 				if (opt_flags & OP_QUIET) {
 					ret = 1;
 					goto error_exit;
 				}
 				if (errno == EPERM) {
 					if (debug)
 						warnx("can't change attributes for %s: %s",
 						    dirp, errmsg);
 					syslog(LOG_ERR,
 					   "can't change attributes for %s: %s",
 					    dirp, errmsg);
 					ret = 1;
 					goto error_exit;
 				}
 				if (opt_flags & OP_ALLDIRS) {
 					if (errno == EINVAL)
 						syslog(LOG_ERR,
 		"-alldirs requested but %s is not a filesystem mountpoint",
 						    dirp);
 					else
 						syslog(LOG_ERR,
 						    "could not remount %s: %m",
 						    dirp);
 					ret = 1;
 					goto error_exit;
 				}
 				/* back up over the last component */
 				while (*cp == '/' && cp > dirp)
 					cp--;
 				while (*(cp - 1) != '/' && cp > dirp)
 					cp--;
 				if (cp == dirp) {
 					if (debug)
 						warnx("mnt unsucc");
 					syslog(LOG_ERR, "can't export %s %s",
 					    dirp, errmsg);
 					ret = 1;
 					goto error_exit;
 				}
 				savedc = *cp;
 				*cp = '\0';
 				/*
 				 * Check that we're still on the same
 				 * filesystem.
 				 */
 				if (statfs(dirp, &fsb1) != 0 ||
 				    bcmp(&fsb1.f_fsid, &fsb->f_fsid,
 				    sizeof (fsb1.f_fsid)) != 0) {
 					*cp = savedc;
 					syslog(LOG_ERR,
 					    "can't export %s %s", dirp,
 					    errmsg);
 					ret = 1;
 					goto error_exit;
 				}
 			}
 		}
 
 		/*
 		 * For the experimental server:
 		 * If this is the public directory, get the file handle
 		 * and load it into the kernel via the nfssvc() syscall.
 		 */
 		if ((exflags & MNT_EXPUBLIC) != 0) {
 			fhandle_t fh;
 			char *public_name;
 
 			if (eap->ex_indexfile != NULL)
 				public_name = eap->ex_indexfile;
 			else
 				public_name = dirp;
 			if (getfh(public_name, &fh) < 0)
 				syslog(LOG_ERR,
 				    "Can't get public fh for %s", public_name);
 			else if (nfssvc(NFSSVC_PUBLICFH, (caddr_t)&fh) < 0)
 				syslog(LOG_ERR,
 				    "Can't set public fh for %s", public_name);
 			else
 				has_publicfh = 1;
 		}
 skip:
 		if (ai != NULL)
 			ai = ai->ai_next;
 		if (ai == NULL)
 			done = TRUE;
 	}
 	if (cp)
 		*cp = savedc;
 error_exit:
 	/* free strings allocated by strdup() in getmntopts.c */
 	if (iov != NULL) {
 		free(iov[0].iov_base); /* fstype */
 		free(iov[2].iov_base); /* fspath */
 		free(iov[4].iov_base); /* from */
 		free(iov[6].iov_base); /* update */
 		free(iov[8].iov_base); /* export */
 		free(iov[10].iov_base); /* errmsg */
 
 		/* free iov, allocated by realloc() */
 		free(iov);
 	}
 	return (ret);
 }
 
 /*
  * Translate a net address.
  *
  * If `maskflg' is nonzero, then `cp' is a netmask, not a network address.
  */
 static int
 get_net(char *cp, struct netmsk *net, int maskflg)
 {
 	struct netent *np = NULL;
 	char *name, *p, *prefp;
 	struct sockaddr_in sin;
 	struct sockaddr *sa = NULL;
 	struct addrinfo hints, *ai = NULL;
 	char netname[NI_MAXHOST];
 	long preflen;
 
 	p = prefp = NULL;
 	if ((opt_flags & OP_MASKLEN) && !maskflg) {
 		p = strchr(cp, '/');
 		*p = '\0';
 		prefp = p + 1;
 	}
 
 	/*
 	 * Check for a numeric address first. We wish to avoid
 	 * possible DNS lookups in getnetbyname().
 	 */
 	if (isxdigit(*cp) || *cp == ':') {
 		memset(&hints, 0, sizeof hints);
 		/* Ensure the mask and the network have the same family. */
 		if (maskflg && (opt_flags & OP_NET))
 			hints.ai_family = net->nt_net.ss_family;
 		else if (!maskflg && (opt_flags & OP_HAVEMASK))
 			hints.ai_family = net->nt_mask.ss_family;
 		else
 			hints.ai_family = AF_UNSPEC;
 		hints.ai_flags = AI_NUMERICHOST;
 		if (getaddrinfo(cp, NULL, &hints, &ai) == 0)
 			sa = ai->ai_addr;
 		if (sa != NULL && ai->ai_family == AF_INET) {
 			/*
 			 * The address in `cp' is really a network address, so
 			 * use inet_network() to re-interpret this correctly.
 			 * e.g. "127.1" means 127.1.0.0, not 127.0.0.1.
 			 */
 			bzero(&sin, sizeof sin);
 			sin.sin_family = AF_INET;
 			sin.sin_len = sizeof sin;
 			sin.sin_addr = inet_makeaddr(inet_network(cp), 0);
 			if (debug)
 				fprintf(stderr, "get_net: v4 addr %s\n",
 				    inet_ntoa(sin.sin_addr));
 			sa = (struct sockaddr *)&sin;
 		}
 	}
 	if (sa == NULL && (np = getnetbyname(cp)) != NULL) {
 		bzero(&sin, sizeof sin);
 		sin.sin_family = AF_INET;
 		sin.sin_len = sizeof sin;
 		sin.sin_addr = inet_makeaddr(np->n_net, 0);
 		sa = (struct sockaddr *)&sin;
 	}
 	if (sa == NULL)
 		goto fail;
 
 	if (maskflg) {
 		/* The specified sockaddr is a mask. */
 		if (checkmask(sa) != 0)
 			goto fail;
 		bcopy(sa, &net->nt_mask, sa->sa_len);
 		opt_flags |= OP_HAVEMASK;
 	} else {
 		/* The specified sockaddr is a network address. */
 		bcopy(sa, &net->nt_net, sa->sa_len);
 
 		/* Get a network name for the export list. */
 		if (np) {
 			name = np->n_name;
 		} else if (getnameinfo(sa, sa->sa_len, netname, sizeof netname,
 		   NULL, 0, NI_NUMERICHOST) == 0) {
 			name = netname;
 		} else {
 			goto fail;
 		}
 		if ((net->nt_name = strdup(name)) == NULL)
 			out_of_mem();
 
 		/*
 		 * Extract a mask from either a "/<masklen>" suffix, or
 		 * from the class of an IPv4 address.
 		 */
 		if (opt_flags & OP_MASKLEN) {
 			preflen = strtol(prefp, NULL, 10);
 			if (preflen < 0L || preflen == LONG_MAX)
 				goto fail;
 			bcopy(sa, &net->nt_mask, sa->sa_len);
 			if (makemask(&net->nt_mask, (int)preflen) != 0)
 				goto fail;
 			opt_flags |= OP_HAVEMASK;
 			*p = '/';
 		} else if (sa->sa_family == AF_INET &&
 		    (opt_flags & OP_MASK) == 0) {
 			in_addr_t addr;
 
 			addr = ((struct sockaddr_in *)sa)->sin_addr.s_addr;
 			if (IN_CLASSA(addr))
 				preflen = 8;
 			else if (IN_CLASSB(addr))
 				preflen = 16;
 			else if (IN_CLASSC(addr))
 				preflen = 24;
 			else if (IN_CLASSD(addr))
 				preflen = 28;
 			else
 				preflen = 32;	/* XXX */
 
 			bcopy(sa, &net->nt_mask, sa->sa_len);
 			makemask(&net->nt_mask, (int)preflen);
 			opt_flags |= OP_HAVEMASK;
 		}
 	}
 
 	if (ai)
 		freeaddrinfo(ai);
 	return 0;
 
 fail:
 	if (ai)
 		freeaddrinfo(ai);
 	return 1;
 }
 
 /*
  * Parse out the next white space separated field
  */
 static void
 nextfield(char **cp, char **endcp)
 {
 	char *p;
 	char quot = 0;
 
 	p = *cp;
 	while (*p == ' ' || *p == '\t')
 		p++;
 	*cp = p;
 	while (*p != '\0') {
 		if (quot) {
 			if (*p == quot)
 				quot = 0;
 		} else {
 			if (*p == '\\' && *(p + 1) != '\0')
 				p++;
 			else if (*p == '\'' || *p == '"')
 				quot = *p;
 			else if (*p == ' ' || *p == '\t')
 				break;
 		}
 		p++;
 	};
 	*endcp = p;
 }
 
 /*
  * Get an exports file line. Skip over blank lines and handle line
  * continuations.
  */
 static int
 get_line(void)
 {
 	char *p, *cp;
 	size_t len;
 	int totlen, cont_line;
 
 	/*
 	 * Loop around ignoring blank lines and getting all continuation lines.
 	 */
 	p = line;
 	totlen = 0;
 	do {
 		if ((p = fgetln(exp_file, &len)) == NULL)
 			return (0);
 		cp = p + len - 1;
 		cont_line = 0;
 		while (cp >= p &&
 		    (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\\')) {
 			if (*cp == '\\')
 				cont_line = 1;
 			cp--;
 			len--;
 		}
 		if (cont_line) {
 			*++cp = ' ';
 			len++;
 		}
 		if (linesize < len + totlen + 1) {
 			linesize = len + totlen + 1;
 			line = realloc(line, linesize);
 			if (line == NULL)
 				out_of_mem();
 		}
 		memcpy(line + totlen, p, len);
 		totlen += len;
 		line[totlen] = '\0';
 	} while (totlen == 0 || cont_line);
 	return (1);
 }
 
 /*
  * Parse a description of a credential.
  */
 static void
 parsecred(char *namelist, struct xucred *cr)
 {
 	char *name;
 	int cnt;
 	char *names;
 	struct passwd *pw;
 	struct group *gr;
 	gid_t groups[XU_NGROUPS + 1];
 	int ngroups;
 
 	cr->cr_version = XUCRED_VERSION;
 	/*
 	 * Set up the unprivileged user.
 	 */
 	cr->cr_uid = 65534;
 	cr->cr_groups[0] = 65533;
 	cr->cr_ngroups = 1;
 	/*
 	 * Get the user's password table entry.
 	 */
 	names = namelist;
 	name = strsep_quote(&names, ":");
 	/* Bug?  name could be NULL here */
 	if (isdigit(*name) || *name == '-')
 		pw = getpwuid(atoi(name));
 	else
 		pw = getpwnam(name);
 	/*
 	 * Credentials specified as those of a user.
 	 */
 	if (names == NULL) {
 		if (pw == NULL) {
 			syslog(LOG_ERR, "unknown user: %s", name);
 			return;
 		}
 		cr->cr_uid = pw->pw_uid;
 		ngroups = XU_NGROUPS + 1;
 		if (getgrouplist(pw->pw_name, pw->pw_gid, groups, &ngroups)) {
 			syslog(LOG_ERR, "too many groups");
 			ngroups = XU_NGROUPS + 1;
 		}
 
 		/*
 		 * Compress out duplicate.
 		 */
 		cr->cr_ngroups = ngroups - 1;
 		cr->cr_groups[0] = groups[0];
 		for (cnt = 2; cnt < ngroups; cnt++)
 			cr->cr_groups[cnt - 1] = groups[cnt];
 		return;
 	}
 	/*
 	 * Explicit credential specified as a colon separated list:
 	 *	uid:gid:gid:...
 	 */
 	if (pw != NULL)
 		cr->cr_uid = pw->pw_uid;
 	else if (isdigit(*name) || *name == '-')
 		cr->cr_uid = atoi(name);
 	else {
 		syslog(LOG_ERR, "unknown user: %s", name);
 		return;
 	}
 	cr->cr_ngroups = 0;
 	while (names != NULL && *names != '\0' && cr->cr_ngroups < XU_NGROUPS) {
 		name = strsep_quote(&names, ":");
 		if (isdigit(*name) || *name == '-') {
 			cr->cr_groups[cr->cr_ngroups++] = atoi(name);
 		} else {
 			if ((gr = getgrnam(name)) == NULL) {
 				syslog(LOG_ERR, "unknown group: %s", name);
 				continue;
 			}
 			cr->cr_groups[cr->cr_ngroups++] = gr->gr_gid;
 		}
 	}
 	if (names != NULL && *names != '\0' && cr->cr_ngroups == XU_NGROUPS)
 		syslog(LOG_ERR, "too many groups");
 }
 
 #define	STRSIZ	(MNTNAMLEN+MNTPATHLEN+50)
 /*
  * Routines that maintain the remote mounttab
  */
 static void
 get_mountlist(void)
 {
 	struct mountlist *mlp;
 	char *host, *dirp, *cp;
 	char str[STRSIZ];
 	FILE *mlfile;
 
 	if ((mlfile = fopen(_PATH_RMOUNTLIST, "r")) == NULL) {
 		if (errno == ENOENT)
 			return;
 		else {
 			syslog(LOG_ERR, "can't open %s", _PATH_RMOUNTLIST);
 			return;
 		}
 	}
 	while (fgets(str, STRSIZ, mlfile) != NULL) {
 		cp = str;
 		host = strsep(&cp, " \t\n");
 		dirp = strsep(&cp, " \t\n");
 		if (host == NULL || dirp == NULL)
 			continue;
 		mlp = (struct mountlist *)malloc(sizeof (*mlp));
 		if (mlp == (struct mountlist *)NULL)
 			out_of_mem();
 		strncpy(mlp->ml_host, host, MNTNAMLEN);
 		mlp->ml_host[MNTNAMLEN] = '\0';
 		strncpy(mlp->ml_dirp, dirp, MNTPATHLEN);
 		mlp->ml_dirp[MNTPATHLEN] = '\0';
 
 		SLIST_INSERT_HEAD(&mlhead, mlp, next);
 	}
 	fclose(mlfile);
 }
 
 static void
 del_mlist(char *hostp, char *dirp)
 {
 	struct mountlist *mlp, *mlp2;
 	FILE *mlfile;
 	int fnd = 0;
 
 	SLIST_FOREACH_SAFE(mlp, &mlhead, next, mlp2) {
 		if (!strcmp(mlp->ml_host, hostp) &&
 		    (!dirp || !strcmp(mlp->ml_dirp, dirp))) {
 			fnd = 1;
 			SLIST_REMOVE(&mlhead, mlp, mountlist, next);
 			free((caddr_t)mlp);
 		}
 	}
 	if (fnd) {
 		if ((mlfile = fopen(_PATH_RMOUNTLIST, "w")) == NULL) {
 			syslog(LOG_ERR,"can't update %s", _PATH_RMOUNTLIST);
 			return;
 		}
 		SLIST_FOREACH(mlp, &mlhead, next) {
 			fprintf(mlfile, "%s %s\n", mlp->ml_host, mlp->ml_dirp);
 		}
 		fclose(mlfile);
 	}
 }
 
 static void
 add_mlist(char *hostp, char *dirp)
 {
 	struct mountlist *mlp;
 	FILE *mlfile;
 
 	SLIST_FOREACH(mlp, &mlhead, next) {
 		if (!strcmp(mlp->ml_host, hostp) && !strcmp(mlp->ml_dirp, dirp))
 			return;
 	}
 
 	mlp = (struct mountlist *)malloc(sizeof (*mlp));
 	if (mlp == (struct mountlist *)NULL)
 		out_of_mem();
 	strncpy(mlp->ml_host, hostp, MNTNAMLEN);
 	mlp->ml_host[MNTNAMLEN] = '\0';
 	strncpy(mlp->ml_dirp, dirp, MNTPATHLEN);
 	mlp->ml_dirp[MNTPATHLEN] = '\0';
 	SLIST_INSERT_HEAD(&mlhead, mlp, next);
 	if ((mlfile = fopen(_PATH_RMOUNTLIST, "a")) == NULL) {
 		syslog(LOG_ERR, "can't update %s", _PATH_RMOUNTLIST);
 		return;
 	}
 	fprintf(mlfile, "%s %s\n", mlp->ml_host, mlp->ml_dirp);
 	fclose(mlfile);
 }
 
 /*
  * Free up a group list.
  */
 static void
 free_grp(struct grouplist *grp)
 {
 	if (grp->gr_type == GT_HOST) {
 		if (grp->gr_ptr.gt_addrinfo != NULL)
 			freeaddrinfo(grp->gr_ptr.gt_addrinfo);
 	} else if (grp->gr_type == GT_NET) {
 		if (grp->gr_ptr.gt_net.nt_name)
 			free(grp->gr_ptr.gt_net.nt_name);
 	}
 	free((caddr_t)grp);
 }
 
 #ifdef DEBUG
 static void
 SYSLOG(int pri, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 	vfprintf(stderr, fmt, ap);
 	va_end(ap);
 }
 #endif /* DEBUG */
 
 /*
  * Check options for consistency.
  */
 static int
 check_options(struct dirlist *dp)
 {
 
 	if (v4root_phase == 0 && dp == NULL)
 	    return (1);
 	if ((opt_flags & (OP_MAPROOT | OP_MAPALL)) == (OP_MAPROOT | OP_MAPALL)) {
 	    syslog(LOG_ERR, "-mapall and -maproot mutually exclusive");
 	    return (1);
 	}
 	if ((opt_flags & OP_MASK) && (opt_flags & OP_NET) == 0) {
 		syslog(LOG_ERR, "-mask requires -network");
 		return (1);
 	}
 	if ((opt_flags & OP_NET) && (opt_flags & OP_HAVEMASK) == 0) {
 		syslog(LOG_ERR, "-network requires mask specification");
 		return (1);
 	}
 	if ((opt_flags & OP_MASK) && (opt_flags & OP_MASKLEN)) {
 		syslog(LOG_ERR, "-mask and /masklen are mutually exclusive");
 		return (1);
 	}
 	if (v4root_phase > 0 &&
 	    (opt_flags &
 	     ~(OP_SEC | OP_MASK | OP_NET | OP_HAVEMASK | OP_MASKLEN)) != 0) {
 	    syslog(LOG_ERR,"only -sec,-net,-mask options allowed on V4:");
 	    return (1);
 	}
 	if ((opt_flags & OP_ALLDIRS) && dp->dp_left) {
 	    syslog(LOG_ERR, "-alldirs has multiple directories");
 	    return (1);
 	}
 	return (0);
 }
 
 /*
  * Check an absolute directory path for any symbolic links. Return true
  */
 static int
 check_dirpath(char *dirp)
 {
 	char *cp;
 	int ret = 1;
 	struct stat sb;
 
 	cp = dirp + 1;
 	while (*cp && ret) {
 		if (*cp == '/') {
 			*cp = '\0';
 			if (lstat(dirp, &sb) < 0 || !S_ISDIR(sb.st_mode))
 				ret = 0;
 			*cp = '/';
 		}
 		cp++;
 	}
 	if (lstat(dirp, &sb) < 0 || !S_ISDIR(sb.st_mode))
 		ret = 0;
 	return (ret);
 }
 
 /*
  * Make a netmask according to the specified prefix length. The ss_family
  * and other non-address fields must be initialised before calling this.
  */
 static int
 makemask(struct sockaddr_storage *ssp, int bitlen)
 {
 	u_char *p;
 	int bits, i, len;
 
 	if ((p = sa_rawaddr((struct sockaddr *)ssp, &len)) == NULL)
 		return (-1);
 	if (bitlen > len * CHAR_BIT)
 		return (-1);
 
 	for (i = 0; i < len; i++) {
 		bits = MIN(CHAR_BIT, bitlen);
 		*p++ = (u_char)~0 << (CHAR_BIT - bits);
 		bitlen -= bits;
 	}
 	return 0;
 }
 
 /*
  * Check that the sockaddr is a valid netmask. Returns 0 if the mask
  * is acceptable (i.e. of the form 1...10....0).
  */
 static int
 checkmask(struct sockaddr *sa)
 {
 	u_char *mask;
 	int i, len;
 
 	if ((mask = sa_rawaddr(sa, &len)) == NULL)
 		return (-1);
 
 	for (i = 0; i < len; i++)
 		if (mask[i] != 0xff)
 			break;
 	if (i < len) {
 		if (~mask[i] & (u_char)(~mask[i] + 1))
 			return (-1);
 		i++;
 	}
 	for (; i < len; i++)
 		if (mask[i] != 0)
 			return (-1);
 	return (0);
 }
 
 /*
  * Compare two sockaddrs according to a specified mask. Return zero if
  * `sa1' matches `sa2' when filtered by the netmask in `samask'.
  * If samask is NULL, perform a full comparison.
  */
 static int
 sacmp(struct sockaddr *sa1, struct sockaddr *sa2, struct sockaddr *samask)
 {
 	unsigned char *p1, *p2, *mask;
 	int len, i;
 
 	if (sa1->sa_family != sa2->sa_family ||
 	    (p1 = sa_rawaddr(sa1, &len)) == NULL ||
 	    (p2 = sa_rawaddr(sa2, NULL)) == NULL)
 		return (1);
 
 	switch (sa1->sa_family) {
 	case AF_INET6:
 		if (((struct sockaddr_in6 *)sa1)->sin6_scope_id !=
 		    ((struct sockaddr_in6 *)sa2)->sin6_scope_id)
 			return (1);
 		break;
 	}
 
 	/* Simple binary comparison if no mask specified. */
 	if (samask == NULL)
 		return (memcmp(p1, p2, len));
 
 	/* Set up the mask, and do a mask-based comparison. */
 	if (sa1->sa_family != samask->sa_family ||
 	    (mask = sa_rawaddr(samask, NULL)) == NULL)
 		return (1);
 
 	for (i = 0; i < len; i++)
 		if ((p1[i] & mask[i]) != (p2[i] & mask[i]))
 			return (1);
 	return (0);
 }
 
 /*
  * Return a pointer to the part of the sockaddr that contains the
  * raw address, and set *nbytes to its length in bytes. Returns
  * NULL if the address family is unknown.
  */
 static void *
 sa_rawaddr(struct sockaddr *sa, int *nbytes) {
 	void *p;
 	int len;
 
 	switch (sa->sa_family) {
 	case AF_INET:
 		len = sizeof(((struct sockaddr_in *)sa)->sin_addr);
 		p = &((struct sockaddr_in *)sa)->sin_addr;
 		break;
 	case AF_INET6:
 		len = sizeof(((struct sockaddr_in6 *)sa)->sin6_addr);
 		p = &((struct sockaddr_in6 *)sa)->sin6_addr;
 		break;
 	default:
 		p = NULL;
 		len = 0;
 	}
 
 	if (nbytes != NULL)
 		*nbytes = len;
 	return (p);
 }
 
 static void
 huphandler(int sig __unused)
 {
 
 	got_sighup = 1;
 }
 
 static void
 terminate(int sig __unused)
 {
 	pidfile_remove(pfh);
 	rpcb_unset(MOUNTPROG, MOUNTVERS, NULL);
 	rpcb_unset(MOUNTPROG, MOUNTVERS3, NULL);
 	exit (0);
 }
Index: projects/runtime-coverage-v2
===================================================================
--- projects/runtime-coverage-v2	(revision 347598)
+++ projects/runtime-coverage-v2	(revision 347599)

Property changes on: projects/runtime-coverage-v2
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r347560-347598