Index: head/sys/amd64/amd64/exception.S =================================================================== --- head/sys/amd64/amd64/exception.S (revision 338067) +++ head/sys/amd64/amd64/exception.S (revision 338068) @@ -1,1319 +1,1322 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. * Copyright (c) 2007-2018 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_atpic.h" #include "opt_hwpmc_hooks.h" #include "assym.inc" #include #include #include #include #ifdef KDTRACE_HOOKS .bss .globl dtrace_invop_jump_addr .align 8 .type dtrace_invop_jump_addr,@object .size dtrace_invop_jump_addr,8 dtrace_invop_jump_addr: .zero 8 .globl dtrace_invop_calltrap_addr .align 8 .type dtrace_invop_calltrap_addr,@object .size dtrace_invop_calltrap_addr,8 dtrace_invop_calltrap_addr: .zero 8 #endif .text #ifdef HWPMC_HOOKS ENTRY(start_exceptions) #endif /*****************************************************************************/ /* Trap handling */ /*****************************************************************************/ /* * Trap and fault vector routines. * * All traps are 'interrupt gates', SDT_SYSIGT. An interrupt gate pushes * state on the stack but also disables interrupts. This is important for * us for the use of the swapgs instruction. We cannot be interrupted * until the GS.base value is correct. For most traps, we automatically * then enable interrupts if the interrupted context had them enabled. * This is equivalent to the i386 port's use of SDT_SYS386TGT. * * The cpu will push a certain amount of state onto the kernel stack for * the current process. See amd64/include/frame.h. 
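As a rough C picture of the frame this comment describes (and continues to describe just below): the CPU pushes %rip/%cs/%rflags/%rsp/%ss (plus an error code for some vectors), and the entry stubs store everything else. This is only an illustration with made-up field names; the authoritative layout is struct trapframe in amd64/include/frame.h.

#include <stdint.h>

/*
 * Illustrative only -- not the real struct trapframe.  The hardware
 * pushes ss/rsp/rflags/cs/rip (and sometimes an error code); the entry
 * stubs push a dummy error code when needed, a contrived trap number,
 * the general registers, and the segment selectors.
 */
struct trapframe_sketch {
	/* Saved by software in the entry stubs. */
	uint64_t rdi, rsi, rdx, rcx, r8, r9, rax, rbx, rbp;
	uint64_t r10, r11, r12, r13, r14, r15;
	uint32_t trapno;		/* contrived trap number */
	uint16_t fs, gs, es, ds;	/* selectors (TF_HASSEGS) */
	uint64_t addr;			/* e.g. %cr2 for page faults */
	uint64_t err;			/* CPU error code or dummy */
	/* Pushed by the CPU at the interrupt gate. */
	uint64_t rip, cs, rflags, rsp, ss;
};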
* This includes the current RFLAGS (status register, which includes * the interrupt disable state prior to the trap), the code segment register, * and the return instruction pointer are pushed by the cpu. The cpu * will also push an 'error' code for certain traps. We push a dummy * error code for those traps where the cpu doesn't in order to maintain * a consistent frame. We also push a contrived 'trap number'. * * The CPU does not push the general registers, so we must do that, and we * must restore them prior to calling 'iret'. The CPU adjusts %cs and %ss * but does not mess with %ds, %es, %gs or %fs. We swap the %gs base for * for the kernel mode operation shortly, without changes to the selector * loaded. Since superuser long mode works with any selectors loaded into * segment registers other then %cs, which makes them mostly unused in long * mode, and kernel does not reference %fs, leave them alone. The segment * registers are reloaded on return to the usermode. */ MCOUNT_LABEL(user) MCOUNT_LABEL(btrap) /* Traps that we leave interrupts disabled for. */ .macro TRAP_NOEN l, trapno PTI_ENTRY \l,X\l .globl X\l .type X\l,@function X\l: subq $TF_RIP,%rsp movl $\trapno,TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) jmp alltraps_noen .endm TRAP_NOEN bpt, T_BPTFLT #ifdef KDTRACE_HOOKS TRAP_NOEN dtrace_ret, T_DTRACE_RET #endif /* Regular traps; The cpu does not supply tf_err for these. */ .macro TRAP l, trapno PTI_ENTRY \l,X\l .globl X\l .type X\l,@function X\l: subq $TF_RIP,%rsp movl $\trapno,TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) jmp alltraps .endm TRAP div, T_DIVIDE TRAP ofl, T_OFLOW TRAP bnd, T_BOUND TRAP ill, T_PRIVINFLT TRAP dna, T_DNA TRAP fpusegm, T_FPOPFLT TRAP rsvd, T_RESERVED TRAP fpu, T_ARITHTRAP TRAP xmm, T_XMMFLT /* This group of traps have tf_err already pushed by the cpu. */ .macro TRAP_ERR l, trapno PTI_ENTRY \l,X\l,has_err=1 .globl X\l .type X\l,@function X\l: subq $TF_ERR,%rsp movl $\trapno,TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) jmp alltraps .endm TRAP_ERR tss, T_TSSFLT TRAP_ERR align, T_ALIGNFLT /* * alltraps entry point. Use swapgs if this is the first time in the * kernel from userland. Reenable interrupts if they were enabled * before the trap. This approximates SDT_SYS386TGT on the i386 port. */ SUPERALIGN_TEXT .globl alltraps .type alltraps,@function alltraps: movq %rdi,TF_RDI(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) 1: SAVE_SEGS movq %rdx,TF_RDX(%rsp) movq %rax,TF_RAX(%rsp) movq %rcx,TF_RCX(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) jz 2f call handle_ibrs_entry 2: testl $PSL_I,TF_RFLAGS(%rsp) jz alltraps_pushregs_no_rax sti alltraps_pushregs_no_rax: movq %rsi,TF_RSI(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) movl $TF_HASSEGS,TF_FLAGS(%rsp) pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq FAKE_MCOUNT(TF_RIP(%rsp)) #ifdef KDTRACE_HOOKS /* * DTrace Function Boundary Trace (fbt) probes are triggered * by int3 (0xcc) which causes the #BP (T_BPTFLT) breakpoint * interrupt. For all other trap types, just handle them in * the usual way. */ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? 
*/ jnz calltrap /* ignore userland traps */ cmpl $T_BPTFLT,TF_TRAPNO(%rsp) jne calltrap /* Check if there is no DTrace hook registered. */ cmpq $0,dtrace_invop_jump_addr je calltrap /* * Set our jump address for the jump back in the event that * the breakpoint wasn't caused by DTrace at all. */ movq $calltrap,dtrace_invop_calltrap_addr(%rip) /* Jump to the code hooked in by DTrace. */ jmpq *dtrace_invop_jump_addr #endif .globl calltrap .type calltrap,@function calltrap: movq %rsp,%rdi call trap_check MEXITCOUNT jmp doreti /* Handle any pending ASTs */ /* * alltraps_noen entry point. Unlike alltraps above, we want to * leave the interrupts disabled. This corresponds to * SDT_SYS386IGT on the i386 port. */ SUPERALIGN_TEXT .globl alltraps_noen .type alltraps_noen,@function alltraps_noen: movq %rdi,TF_RDI(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) 1: SAVE_SEGS movq %rdx,TF_RDX(%rsp) movq %rax,TF_RAX(%rsp) movq %rcx,TF_RCX(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) jz alltraps_pushregs_no_rax call handle_ibrs_entry jmp alltraps_pushregs_no_rax IDTVEC(dblfault) subq $TF_ERR,%rsp movl $T_DOUBLEFLT,TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs 1: movq PCPU(KCR3),%rax cmpq $~0,%rax je 2f movq %rax,%cr3 2: movq %rsp,%rdi call dblfault_handler 3: hlt jmp 3b ALIGN_TEXT IDTVEC(page_pti) testb $SEL_RPL_MASK,PTI_CS-2*8(%rsp) jz Xpage swapgs pushq %rax movq %cr3,%rax movq %rax,PCPU(SAVED_UCR3) cmpq $~0,PCPU(UCR3) jne 1f popq %rax jmp 2f 1: pushq %rdx PTI_UUENTRY has_err=1 2: subq $TF_ERR,%rsp movq %rdi,TF_RDI(%rsp) movq %rax,TF_RAX(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) jmp page_u IDTVEC(page) subq $TF_ERR,%rsp movq %rdi,TF_RDI(%rsp) /* free up GP registers */ movq %rax,TF_RAX(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz page_cr2 /* already running with kernel GS.base */ swapgs page_u: movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) movq PCPU(SAVED_UCR3),%rax movq %rax,PCB_SAVED_UCR3(%rdi) call handle_ibrs_entry page_cr2: movq %cr2,%rdi /* preserve %cr2 before .. */ movq %rdi,TF_ADDR(%rsp) /* enabling interrupts. */ SAVE_SEGS movl $T_PAGEFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz alltraps_pushregs_no_rax sti jmp alltraps_pushregs_no_rax /* * We have to special-case this one. If we get a trap in doreti() at * the iretq stage, we'll reenter with the wrong gs state. We'll have * to do a special the swapgs in this case even coming from the kernel. * XXX linux has a trap handler for their equivalent of load_gs(). * * On the stack, we have the hardware interrupt frame to return * to usermode (faulted) and another frame with error code, for * fault. For PTI, copy both frames to the main thread stack. 
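For the PTI doreti case just described, both stacked frames are copied from the per-CPU trampoline stack onto the main thread stack before the fault is handled. A toy C model of that word-by-word copy follows; the frame size and names are invented stand-ins for PTI_SIZE/8 and the MOVE_STACKS macro.

#include <stddef.h>
#include <stdint.h>

#define	FRAME_WORDS	6	/* stand-in for PTI_SIZE / 8 */

/*
 * Sketch: copy two stacked frames, one 8-byte word at a time, from a
 * small trampoline stack to the thread's kernel stack.  The real code
 * also re-derives the second frame's position from the saved %rsp to
 * cope with possible 16-byte alignment padding.
 */
static void
copy_pti_frames(uint64_t *kstack, const uint64_t *tramp)
{
	size_t i;

	for (i = 0; i < 2 * FRAME_WORDS; i++)
		kstack[i] = tramp[i];
}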
* Handle the potential 16-byte alignment adjustment incurred * during the second fault by copying both frames independently * while unwinding the stack in between. */ .macro PROTF_ENTRY name,trapno \name\()_pti_doreti: swapgs cmpq $~0,PCPU(UCR3) je 1f pushq %rax pushq %rdx movq PCPU(KCR3),%rax movq %rax,%cr3 movq PCPU(RSP0),%rax subq $2*PTI_SIZE-3*8,%rax /* no err, %rax, %rdx in faulted frame */ MOVE_STACKS (PTI_SIZE / 8) addq $PTI_SIZE,%rax movq PTI_RSP(%rsp),%rsp MOVE_STACKS (PTI_SIZE / 8 - 3) subq $PTI_SIZE,%rax movq %rax,%rsp popq %rdx popq %rax 1: swapgs jmp X\name IDTVEC(\name\()_pti) cmpq $doreti_iret,PTI_RIP-2*8(%rsp) je \name\()_pti_doreti testb $SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */ jz X\name PTI_UENTRY has_err=1 swapgs IDTVEC(\name) subq $TF_ERR,%rsp movl $\trapno,TF_TRAPNO(%rsp) jmp prot_addrf .endm PROTF_ENTRY missing, T_SEGNPFLT PROTF_ENTRY stk, T_STKFLT PROTF_ENTRY prot, T_PROTFLT prot_addrf: movq $0,TF_ADDR(%rsp) movq %rdi,TF_RDI(%rsp) /* free up a GP register */ movq %rax,TF_RAX(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) movw %fs,TF_FS(%rsp) movw %gs,TF_GS(%rsp) leaq doreti_iret(%rip),%rdi cmpq %rdi,TF_RIP(%rsp) je 5f /* kernel but with user gsbase!! */ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 6f /* already running with kernel GS.base */ testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) jz 2f cmpw $KUF32SEL,TF_FS(%rsp) jne 1f rdfsbase %rax 1: cmpw $KUG32SEL,TF_GS(%rsp) jne 2f rdgsbase %rdx 2: swapgs movq PCPU(CURPCB),%rdi testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) jz 4f cmpw $KUF32SEL,TF_FS(%rsp) jne 3f movq %rax,PCB_FSBASE(%rdi) 3: cmpw $KUG32SEL,TF_GS(%rsp) jne 4f movq %rdx,PCB_GSBASE(%rdi) 4: call handle_ibrs_entry orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) /* always full iret from GPF */ movw %es,TF_ES(%rsp) movw %ds,TF_DS(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz alltraps_pushregs_no_rax sti jmp alltraps_pushregs_no_rax 5: swapgs 6: movq PCPU(CURPCB),%rdi jmp 4b /* * Fast syscall entry point. We enter here with just our new %cs/%ss set, * and the new privilige level. We are still running on the old user stack * pointer. We have to juggle a few things around to find our stack etc. * swapgs gives us access to our PCPU space only. * * We do not support invoking this from a custom segment registers, * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT. */ SUPERALIGN_TEXT IDTVEC(fast_syscall_pti) swapgs movq %rax,PCPU(SCRATCH_RAX) cmpq $~0,PCPU(UCR3) je fast_syscall_common movq PCPU(KCR3),%rax movq %rax,%cr3 jmp fast_syscall_common SUPERALIGN_TEXT IDTVEC(fast_syscall) swapgs movq %rax,PCPU(SCRATCH_RAX) fast_syscall_common: movq %rsp,PCPU(SCRATCH_RSP) movq PCPU(RSP0),%rsp /* Now emulate a trapframe. Make the 8 byte alignment odd for call. */ subq $TF_SIZE,%rsp /* defer TF_RSP till we have a spare register */ movq %r11,TF_RFLAGS(%rsp) movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */ movq PCPU(SCRATCH_RSP),%r11 /* %r11 already saved */ movq %r11,TF_RSP(%rsp) /* user stack pointer */ movq PCPU(SCRATCH_RAX),%rax /* * Save a few arg registers early to free them for use in * handle_ibrs_entry(). %r10 is especially tricky. It is not an * arg register, but it holds the arg register %rcx. Profiling * preserves %rcx, but may clobber %r10. Profiling may also * clobber %r11, but %r11 (original %eflags) has been saved. 
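The register juggling above exists because the SYSCALL instruction itself consumes %rcx (return %rip) and %r11 (saved %rflags), so userland passes the fourth argument in %r10. A minimal illustration of that convention from the caller's side; the wrapper name is invented, and real programs should go through the libc stubs.

/*
 * Sketch of the amd64 SYSCALL convention served by fast_syscall above:
 * number in %rax, args in %rdi/%rsi/%rdx/%r10/%r8/%r9, and the CPU
 * clobbers %rcx and %r11.  Illustrative only.
 */
static inline long
raw_syscall4(long num, long a1, long a2, long a3, long a4)
{
	long ret;
	register long r10 __asm__("r10") = a4;	/* 4th arg goes in %r10 */

	__asm__ __volatile__("syscall"
	    : "=a" (ret)
	    : "a" (num), "D" (a1), "S" (a2), "d" (a3), "r" (r10)
	    : "rcx", "r11", "memory");
	return (ret);
}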
*/ movq %rax,TF_RAX(%rsp) /* syscall number */ movq %rdx,TF_RDX(%rsp) /* arg 3 */ movq %r10,TF_RCX(%rsp) /* arg 4 */ SAVE_SEGS call handle_ibrs_entry movq PCPU(CURPCB),%r11 andl $~PCB_FULL_IRET,PCB_FLAGS(%r11) sti movq $KUDSEL,TF_SS(%rsp) movq $KUCSEL,TF_CS(%rsp) movq $2,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) /* arg 1 */ movq %rsi,TF_RSI(%rsp) /* arg 2 */ movq %r8,TF_R8(%rsp) /* arg 5 */ movq %r9,TF_R9(%rsp) /* arg 6 */ movq %rbx,TF_RBX(%rsp) /* C preserved */ movq %rbp,TF_RBP(%rsp) /* C preserved */ movq %r12,TF_R12(%rsp) /* C preserved */ movq %r13,TF_R13(%rsp) /* C preserved */ movq %r14,TF_R14(%rsp) /* C preserved */ movq %r15,TF_R15(%rsp) /* C preserved */ movl $TF_HASSEGS,TF_FLAGS(%rsp) FAKE_MCOUNT(TF_RIP(%rsp)) movq PCPU(CURTHREAD),%rdi movq %rsp,TD_FRAME(%rdi) movl TF_RFLAGS(%rsp),%esi andl $PSL_T,%esi call amd64_syscall 1: movq PCPU(CURPCB),%rax /* Disable interrupts before testing PCB_FULL_IRET. */ cli testl $PCB_FULL_IRET,PCB_FLAGS(%rax) jnz 4f /* Check for and handle AST's on return to userland. */ movq PCPU(CURTHREAD),%rax testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax) jne 3f call handle_ibrs_exit /* Restore preserved registers. */ MEXITCOUNT movq TF_RDI(%rsp),%rdi /* bonus; preserve arg 1 */ movq TF_RSI(%rsp),%rsi /* bonus: preserve arg 2 */ movq TF_RDX(%rsp),%rdx /* return value 2 */ movq TF_RAX(%rsp),%rax /* return value 1 */ movq TF_RFLAGS(%rsp),%r11 /* original %rflags */ movq TF_RIP(%rsp),%rcx /* original %rip */ movq TF_RSP(%rsp),%rsp /* user stack pointer */ cmpq $~0,PCPU(UCR3) je 2f movq PCPU(UCR3),%r9 movq %r9,%cr3 xorl %r9d,%r9d 2: swapgs sysretq 3: /* AST scheduled. */ sti movq %rsp,%rdi call ast jmp 1b 4: /* Requested full context restore, use doreti for that. */ MEXITCOUNT jmp doreti /* * Here for CYA insurance, in case a "syscall" instruction gets * issued from 32 bit compatibility mode. MSR_CSTAR has to point * to *something* if EFER_SCE is enabled. */ IDTVEC(fast_syscall32) sysret /* * DB# handler is very similar to NM#, because 'mov/pop %ss' delay * generation of exception until the next instruction is executed, * which might be a kernel entry. So we must execute the handler * on IST stack and be ready for non-kernel GSBASE. */ IDTVEC(dbg) subq $TF_RIP,%rsp movl $(T_TRCTRAP),TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq testb $SEL_RPL_MASK,TF_CS(%rsp) jnz dbg_fromuserspace /* * We've interrupted the kernel. Preserve GS.base in %r12, * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d. */ movl $MSR_GSBASE,%ecx rdmsr movq %rax,%r12 shlq $32,%rdx orq %rdx,%r12 /* Retrieve and load the canonical value for GS.base. */ movq TF_SIZE(%rsp),%rdx movl %edx,%eax shrq $32,%rdx wrmsr movq %cr3,%r13 movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je 2f movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr movl %eax,%r14d call handle_ibrs_entry 2: FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp,%rdi call trap MEXITCOUNT testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je 3f movl %r14d,%eax xorl %edx,%edx movl $MSR_IA32_SPEC_CTRL,%ecx wrmsr /* * Put back the preserved MSR_GSBASE value. 
*/ 3: movl $MSR_GSBASE,%ecx movq %r12,%rdx movl %edx,%eax shrq $32,%rdx wrmsr movq %r13,%cr3 RESTORE_REGS addq $TF_RIP,%rsp jmp doreti_iret dbg_fromuserspace: /* * Switch to kernel GSBASE and kernel page table, and copy frame * from the IST stack to the normal kernel stack, since trap() * re-enables interrupts, and since we might trap on DB# while * in trap(). */ swapgs movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: movq PCPU(RSP0),%rax movl $TF_SIZE,%ecx subq %rcx,%rax movq %rax,%rdi movq %rsp,%rsi rep;movsb movq %rax,%rsp call handle_ibrs_entry movq PCPU(CURPCB),%rdi orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) jz 3f cmpw $KUF32SEL,TF_FS(%rsp) jne 2f rdfsbase %rax movq %rax,PCB_FSBASE(%rdi) 2: cmpw $KUG32SEL,TF_GS(%rsp) jne 3f movl $MSR_KGSBASE,%ecx rdmsr shlq $32,%rdx orq %rdx,%rax movq %rax,PCB_GSBASE(%rdi) 3: jmp calltrap /* * NMI handling is special. * * First, NMIs do not respect the state of the processor's RFLAGS.IF * bit. The NMI handler may be entered at any time, including when * the processor is in a critical section with RFLAGS.IF == 0. * The processor's GS.base value could be invalid on entry to the * handler. * * Second, the processor treats NMIs specially, blocking further NMIs * until an 'iretq' instruction is executed. We thus need to execute * the NMI handler with interrupts disabled, to prevent a nested interrupt * from executing an 'iretq' instruction and inadvertently taking the * processor out of NMI mode. * * Third, the NMI handler runs on its own stack (tss_ist2). The canonical * GS.base value for the processor is stored just above the bottom of its * NMI stack. For NMIs taken from kernel mode, the current value in * the processor's GS.base is saved at entry to C-preserved register %r12, * the canonical value for GS.base is then loaded into the processor, and * the saved value is restored at exit time. For NMIs taken from user mode, * the cheaper 'SWAPGS' instructions are used for swapping GS.base. */ IDTVEC(nmi) subq $TF_RIP,%rsp movl $(T_NMI),TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq xorl %ebx,%ebx testb $SEL_RPL_MASK,TF_CS(%rsp) jnz nmi_fromuserspace /* * We've interrupted the kernel. Preserve GS.base in %r12, * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d. */ movl $MSR_GSBASE,%ecx rdmsr movq %rax,%r12 shlq $32,%rdx orq %rdx,%r12 /* Retrieve and load the canonical value for GS.base. 
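The rdmsr/wrmsr sequences here move the 64-bit GS.base value through the %edx:%eax pair. A small C sketch of the same split and recombine (helper names are mine, not a kernel API):

#include <stdint.h>

/* Split a 64-bit MSR value into the %edx:%eax halves used by wrmsr. */
static inline void
msr_split(uint64_t val, uint32_t *lo, uint32_t *hi)
{
	*lo = (uint32_t)val;		/* goes in %eax */
	*hi = (uint32_t)(val >> 32);	/* goes in %edx */
}

/* Recombine the %edx:%eax halves returned by rdmsr. */
static inline uint64_t
msr_join(uint32_t lo, uint32_t hi)
{
	return (((uint64_t)hi << 32) | lo);
}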
*/ movq TF_SIZE(%rsp),%rdx movl %edx,%eax shrq $32,%rdx wrmsr movq %cr3,%r13 movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je nmi_calltrap movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr movl %eax,%r14d call handle_ibrs_entry jmp nmi_calltrap nmi_fromuserspace: incl %ebx swapgs movq %cr3,%r13 movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: call handle_ibrs_entry movq PCPU(CURPCB),%rdi testq %rdi,%rdi jz 3f orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) jz 3f cmpw $KUF32SEL,TF_FS(%rsp) jne 2f rdfsbase %rax movq %rax,PCB_FSBASE(%rdi) 2: cmpw $KUG32SEL,TF_GS(%rsp) jne 3f movl $MSR_KGSBASE,%ecx rdmsr shlq $32,%rdx orq %rdx,%rax movq %rax,PCB_GSBASE(%rdi) 3: /* Note: this label is also used by ddb and gdb: */ nmi_calltrap: FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp,%rdi call trap MEXITCOUNT #ifdef HWPMC_HOOKS /* * Capture a userspace callchain if needed. * * - Check if the current trap was from user mode. * - Check if the current thread is valid. * - Check if the thread requires a user call chain to be * captured. * * We are still in NMI mode at this point. */ testl %ebx,%ebx jz nocallchain /* not from userspace */ movq PCPU(CURTHREAD),%rax orq %rax,%rax /* curthread present? */ jz nocallchain /* * Move execution to the regular kernel stack, because we * committed to return through doreti. */ movq %rsp,%rsi /* source stack pointer */ movq $TF_SIZE,%rcx movq PCPU(RSP0),%rdx subq %rcx,%rdx movq %rdx,%rdi /* destination stack pointer */ shrq $3,%rcx /* trap frame size in long words */ pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq rep movsq /* copy trapframe */ movq %rdx,%rsp /* we are on the regular kstack */ testl $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */ jz nocallchain /* * A user callchain is to be captured, so: * - Take the processor out of "NMI" mode by faking an "iret", * to allow for nested NMI interrupts. * - Enable interrupts, so that copyin() can work. */ movl %ss,%eax pushq %rax /* tf_ss */ pushq %rdx /* tf_rsp (on kernel stack) */ pushfq /* tf_rflags */ movl %cs,%eax pushq %rax /* tf_cs */ pushq $outofnmi /* tf_rip */ iretq outofnmi: /* * At this point the processor has exited NMI mode and is running * with interrupts turned off on the normal kernel stack. * * If a pending NMI gets recognized at or after this point, it * will cause a kernel callchain to be traced. * * We turn interrupts back on, and call the user callchain capture hook. */ movq pmc_hook,%rax orq %rax,%rax jz nocallchain movq PCPU(CURTHREAD),%rdi /* thread */ movq $PMC_FN_USER_CALLCHAIN,%rsi /* command */ movq %rsp,%rdx /* frame */ sti call *%rax cli nocallchain: #endif testl %ebx,%ebx /* %ebx == 0 => return to userland */ jnz doreti_exit /* * Restore speculation control MSR, if preserved. */ testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je 1f movl %r14d,%eax xorl %edx,%edx movl $MSR_IA32_SPEC_CTRL,%ecx wrmsr /* * Put back the preserved MSR_GSBASE value. */ 1: movl $MSR_GSBASE,%ecx movq %r12,%rdx movl %edx,%eax shrq $32,%rdx wrmsr - movq %r13,%cr3 + cmpb $0, nmi_flush_l1d_sw(%rip) + je 2f + call flush_l1d_sw /* bhyve L1TF assist */ +2: movq %r13,%cr3 RESTORE_REGS addq $TF_RIP,%rsp jmp doreti_iret /* * MC# handling is similar to NMI. * * As with NMIs, machine check exceptions do not respect RFLAGS.IF and * can occur at any time with a GS.base value that does not correspond * to the privilege level in CS. 
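The hunk above is the point of this commit: on the NMI return path, if nmi_flush_l1d_sw is non-zero, flush_l1d_sw is called before the saved %cr3 is restored, so an NMI taken while bhyve is entering a guest does not defeat its L1TF flush (the in-line comment calls it the "bhyve L1TF assist"). Below is a rough C model of what flush_l1d_sw (added to support.S later in this diff) does; the names here are invented, and the real routine also serializes between passes with cpuid and ends with lfence.

#include <stddef.h>
#include <stdint.h>

#define	FLUSH_SIZE	(64 * 1024)	/* matches L1D_FLUSH_SIZE */

static volatile uint8_t sink;	/* keeps the loads from being optimized out */

/*
 * Sketch of a software L1D flush: touch one byte per page to prime the
 * TLB, then read every 64-byte cache line of a 64KB window so stale
 * L1D contents are evicted.  'base' stands in for KERNBASE.
 */
static void
flush_l1d_model(const volatile uint8_t *base)
{
	size_t i;

	for (i = 0; i < FLUSH_SIZE; i += 4096)	/* pass 1: TLB preload */
		sink = base[i];
	for (i = 0; i < FLUSH_SIZE; i += 64)	/* pass 2: every line */
		sink = base[i];
}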
* * Machine checks are not unblocked by iretq, but it is best to run * the handler with interrupts disabled since the exception may have * interrupted a critical section. * * The MC# handler runs on its own stack (tss_ist3). The canonical * GS.base value for the processor is stored just above the bottom of * its MC# stack. For exceptions taken from kernel mode, the current * value in the processor's GS.base is saved at entry to C-preserved * register %r12, the canonical value for GS.base is then loaded into * the processor, and the saved value is restored at exit time. For * exceptions taken from user mode, the cheaper 'SWAPGS' instructions * are used for swapping GS.base. */ IDTVEC(mchk) subq $TF_RIP,%rsp movl $(T_MCHK),TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq xorl %ebx,%ebx testb $SEL_RPL_MASK,TF_CS(%rsp) jnz mchk_fromuserspace /* * We've interrupted the kernel. Preserve GS.base in %r12, * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d. */ movl $MSR_GSBASE,%ecx rdmsr movq %rax,%r12 shlq $32,%rdx orq %rdx,%r12 /* Retrieve and load the canonical value for GS.base. */ movq TF_SIZE(%rsp),%rdx movl %edx,%eax shrq $32,%rdx wrmsr movq %cr3,%r13 movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je mchk_calltrap movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr movl %eax,%r14d call handle_ibrs_entry jmp mchk_calltrap mchk_fromuserspace: incl %ebx swapgs movq %cr3,%r13 movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: call handle_ibrs_entry /* Note: this label is also used by ddb and gdb: */ mchk_calltrap: FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp,%rdi call mca_intr MEXITCOUNT testl %ebx,%ebx /* %ebx == 0 => return to userland */ jnz doreti_exit /* * Restore speculation control MSR, if preserved. */ testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je 1f movl %r14d,%eax xorl %edx,%edx movl $MSR_IA32_SPEC_CTRL,%ecx wrmsr /* * Put back the preserved MSR_GSBASE value. */ 1: movl $MSR_GSBASE,%ecx movq %r12,%rdx movl %edx,%eax shrq $32,%rdx wrmsr movq %r13,%cr3 RESTORE_REGS addq $TF_RIP,%rsp jmp doreti_iret ENTRY(fork_trampoline) movq %r12,%rdi /* function */ movq %rbx,%rsi /* arg1 */ movq %rsp,%rdx /* trapframe pointer */ call fork_exit MEXITCOUNT jmp doreti /* Handle any ASTs */ /* * To efficiently implement classification of trap and interrupt handlers * for profiling, there must be only trap handlers between the labels btrap * and bintr, and only interrupt handlers between the labels bintr and * eintr. This is implemented (partly) by including files that contain * some of the handlers. Before including the files, set up a normal asm * environment so that the included files doen't need to know that they are * included. */ #ifdef COMPAT_FREEBSD32 .data .p2align 4 .text SUPERALIGN_TEXT #include #endif .data .p2align 4 .text SUPERALIGN_TEXT MCOUNT_LABEL(bintr) #include #ifdef DEV_ATPIC .data .p2align 4 .text SUPERALIGN_TEXT #include #endif .text MCOUNT_LABEL(eintr) /* * void doreti(struct trapframe) * * Handle return from interrupts, traps and syscalls. 
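doreti, declared here, keeps processing ASTs until none are pending and only then restores the frame, as the doreti_ast code below spells out. A compact C rendering of that retry loop; the helpers are stand-in stubs, not kernel primitives.

#include <stdbool.h>

/* Stand-in stubs for cli/sti, the TDF_* check, and ast(). */
static int pending = 2;
static void disable_interrupts(void) { }	/* cli */
static void enable_interrupts(void) { }		/* sti */
static bool ast_pending(void) { return (pending-- > 0); }
static void run_ast(void) { }			/* ast(framep) */

/*
 * Sketch of doreti_ast: interrupts are disabled before the check so a
 * freshly posted AST cannot be missed; if one is found, interrupts are
 * re-enabled, ast() runs, and the test repeats.
 */
static void
doreti_ast_model(void)
{
	for (;;) {
		disable_interrupts();
		if (!ast_pending())
			return;		/* exit with interrupts off */
		enable_interrupts();
		run_ast();
	}
}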
*/ .text SUPERALIGN_TEXT .type doreti,@function .globl doreti doreti: FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */ /* * Check if ASTs can be handled now. */ testb $SEL_RPL_MASK,TF_CS(%rsp) /* are we returning to user mode? */ jz doreti_exit /* can't handle ASTs now if not */ doreti_ast: /* * Check for ASTs atomically with returning. Disabling CPU * interrupts provides sufficient locking even in the SMP case, * since we will be informed of any new ASTs by an IPI. */ cli movq PCPU(CURTHREAD),%rax testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax) je doreti_exit sti movq %rsp,%rdi /* pass a pointer to the trapframe */ call ast jmp doreti_ast /* * doreti_exit: pop registers, iret. * * The segment register pop is a special case, since it may * fault if (for example) a sigreturn specifies bad segment * registers. The fault is handled in trap.c. */ doreti_exit: MEXITCOUNT movq PCPU(CURPCB),%r8 /* * Do not reload segment registers for kernel. * Since we do not reload segments registers with sane * values on kernel entry, descriptors referenced by * segments registers might be not valid. This is fatal * for user mode, but is not a problem for the kernel. */ testb $SEL_RPL_MASK,TF_CS(%rsp) jz ld_regs testl $PCB_FULL_IRET,PCB_FLAGS(%r8) jz ld_regs andl $~PCB_FULL_IRET,PCB_FLAGS(%r8) testl $TF_HASSEGS,TF_FLAGS(%rsp) je set_segs do_segs: /* Restore %fs and fsbase */ movw TF_FS(%rsp),%ax .globl ld_fs ld_fs: movw %ax,%fs cmpw $KUF32SEL,%ax jne 1f movl $MSR_FSBASE,%ecx movl PCB_FSBASE(%r8),%eax movl PCB_FSBASE+4(%r8),%edx .globl ld_fsbase ld_fsbase: wrmsr 1: /* Restore %gs and gsbase */ movw TF_GS(%rsp),%si pushfq cli movl $MSR_GSBASE,%ecx /* Save current kernel %gs base into %r12d:%r13d */ rdmsr movl %eax,%r12d movl %edx,%r13d .globl ld_gs ld_gs: movw %si,%gs /* Save user %gs base into %r14d:%r15d */ rdmsr movl %eax,%r14d movl %edx,%r15d /* Restore kernel %gs base */ movl %r12d,%eax movl %r13d,%edx wrmsr popfq /* * Restore user %gs base, either from PCB if used for TLS, or * from the previously saved msr read. */ movl $MSR_KGSBASE,%ecx cmpw $KUG32SEL,%si jne 1f movl PCB_GSBASE(%r8),%eax movl PCB_GSBASE+4(%r8),%edx jmp ld_gsbase 1: movl %r14d,%eax movl %r15d,%edx .globl ld_gsbase ld_gsbase: wrmsr /* May trap if non-canonical, but only for TLS. */ .globl ld_es ld_es: movw TF_ES(%rsp),%es .globl ld_ds ld_ds: movw TF_DS(%rsp),%ds ld_regs: RESTORE_REGS testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 2f /* keep running with kernel GS.base */ cli call handle_ibrs_exit_rs cmpq $~0,PCPU(UCR3) je 1f pushq %rdx movq PCPU(PTI_RSP0),%rdx subq $PTI_SIZE,%rdx movq %rax,PTI_RAX(%rdx) popq %rax movq %rax,PTI_RDX(%rdx) movq TF_RIP(%rsp),%rax movq %rax,PTI_RIP(%rdx) movq TF_CS(%rsp),%rax movq %rax,PTI_CS(%rdx) movq TF_RFLAGS(%rsp),%rax movq %rax,PTI_RFLAGS(%rdx) movq TF_RSP(%rsp),%rax movq %rax,PTI_RSP(%rdx) movq TF_SS(%rsp),%rax movq %rax,PTI_SS(%rdx) movq PCPU(UCR3),%rax swapgs movq %rdx,%rsp movq %rax,%cr3 popq %rdx popq %rax addq $8,%rsp jmp doreti_iret 1: swapgs 2: addq $TF_RIP,%rsp .globl doreti_iret doreti_iret: iretq set_segs: movw $KUDSEL,%ax movw %ax,TF_DS(%rsp) movw %ax,TF_ES(%rsp) movw $KUF32SEL,TF_FS(%rsp) movw $KUG32SEL,TF_GS(%rsp) jmp do_segs /* * doreti_iret_fault. Alternative return code for * the case where we get a fault in the doreti_exit code * above. trap() (amd64/amd64/trap.c) catches this specific * case, sends the process a signal and continues in the * corresponding place in the code below. 
*/ ALIGN_TEXT .globl doreti_iret_fault doreti_iret_fault: subq $TF_RIP,%rsp /* space including tf_err, tf_trapno */ movq %rax,TF_RAX(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) call handle_ibrs_entry testb $SEL_RPL_MASK,TF_CS(%rsp) jz 1f sti 1: SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) movl $T_PROTFLT,TF_TRAPNO(%rsp) movq $0,TF_ERR(%rsp) /* XXX should be the error code */ movq $0,TF_ADDR(%rsp) FAKE_MCOUNT(TF_RIP(%rsp)) jmp calltrap ALIGN_TEXT .globl ds_load_fault ds_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) jz 1f sti 1: movq %rsp,%rdi call trap movw $KUDSEL,TF_DS(%rsp) jmp doreti ALIGN_TEXT .globl es_load_fault es_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti 1: movq %rsp,%rdi call trap movw $KUDSEL,TF_ES(%rsp) jmp doreti ALIGN_TEXT .globl fs_load_fault fs_load_fault: testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti 1: movl $T_PROTFLT,TF_TRAPNO(%rsp) movq %rsp,%rdi call trap movw $KUF32SEL,TF_FS(%rsp) jmp doreti ALIGN_TEXT .globl gs_load_fault gs_load_fault: popfq movl $T_PROTFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti 1: movq %rsp,%rdi call trap movw $KUG32SEL,TF_GS(%rsp) jmp doreti ALIGN_TEXT .globl fsbase_load_fault fsbase_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti 1: movq %rsp,%rdi call trap movq PCPU(CURTHREAD),%r8 movq TD_PCB(%r8),%r8 movq $0,PCB_FSBASE(%r8) jmp doreti ALIGN_TEXT .globl gsbase_load_fault gsbase_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti 1: movq %rsp,%rdi call trap movq PCPU(CURTHREAD),%r8 movq TD_PCB(%r8),%r8 movq $0,PCB_GSBASE(%r8) jmp doreti #ifdef HWPMC_HOOKS ENTRY(end_exceptions) #endif Index: head/sys/amd64/amd64/support.S =================================================================== --- head/sys/amd64/amd64/support.S (revision 338067) +++ head/sys/amd64/amd64/support.S (revision 338068) @@ -1,1227 +1,1260 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_ddb.h" #include #include #include #include "assym.inc" .text /* Address: %rdi */ ENTRY(pagezero) PUSH_FRAME_POINTER movq $PAGE_SIZE/8,%rcx xorl %eax,%eax rep stosq POP_FRAME_POINTER ret END(pagezero) /* * pagecopy(%rdi=from, %rsi=to) */ ENTRY(pagecopy) PUSH_FRAME_POINTER movq $PAGE_SIZE/8,%rcx movq %rdi,%r9 movq %rsi,%rdi movq %r9,%rsi rep movsq POP_FRAME_POINTER ret END(pagecopy) /* Address: %rdi */ ENTRY(sse2_pagezero) PUSH_FRAME_POINTER movq $-PAGE_SIZE,%rdx subq %rdx,%rdi xorl %eax,%eax jmp 1f /* * The loop takes 29 bytes. Ensure that it doesn't cross a 32-byte * cache line. */ .p2align 5,0x90 1: movnti %rax,(%rdi,%rdx) movnti %rax,8(%rdi,%rdx) movnti %rax,16(%rdi,%rdx) movnti %rax,24(%rdi,%rdx) addq $32,%rdx jne 1b sfence POP_FRAME_POINTER ret END(sse2_pagezero) /* * memmove(dst, src, cnt) * rdi, rsi, rdx * Adapted from bcopy written by: * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(memmove) PUSH_FRAME_POINTER movq %rdi,%r9 movq %rdx,%rcx movq %rdi,%rax subq %rsi,%rax cmpq %rcx,%rax /* overlapping && src < dst? */ jb 1f shrq $3,%rcx /* copy by 64-bit words */ rep movsq movq %rdx,%rcx andq $7,%rcx /* any bytes left? */ jne 2f movq %r9,%rax POP_FRAME_POINTER ret 2: rep movsb movq %r9,%rax POP_FRAME_POINTER ret /* ALIGN_TEXT */ 1: addq %rcx,%rdi /* copy backwards */ addq %rcx,%rsi decq %rdi decq %rsi andq $7,%rcx /* any fractional bytes? */ std rep movsb movq %rdx,%rcx /* copy remainder by 32-bit words */ shrq $3,%rcx subq $7,%rsi subq $7,%rdi rep movsq cld movq %r9,%rax POP_FRAME_POINTER ret END(memmove) /* * memcpy(dst, src, len) * rdi, rsi, rdx * * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) PUSH_FRAME_POINTER movq %rdi,%rax movq %rdx,%rcx shrq $3,%rcx /* copy by 64-bit words */ rep movsq movq %rdx,%rcx andq $7,%rcx /* any bytes left? */ jne 1f POP_FRAME_POINTER ret 1: rep movsb POP_FRAME_POINTER ret END(memcpy) /* * memset(dst, c, len) * rdi, rsi, rdx */ ENTRY(memset) PUSH_FRAME_POINTER movq %rdi,%r9 movq %rdx,%rcx movzbq %sil,%r8 movabs $0x0101010101010101,%rax imulq %r8,%rax shrq $3,%rcx rep stosq movq %rdx,%rcx andq $7,%rcx jne 1f movq %r9,%rax POP_FRAME_POINTER ret 1: rep stosb movq %r9,%rax POP_FRAME_POINTER ret END(memset) /* fillw(pat, base, cnt) */ /* %rdi,%rsi, %rdx */ ENTRY(fillw) PUSH_FRAME_POINTER movq %rdi,%rax movq %rsi,%rdi movq %rdx,%rcx rep stosw POP_FRAME_POINTER ret END(fillw) /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ /* * Access user memory from inside the kernel. These routines should be * the only places that do this. * * These routines set curpcb->pcb_onfault for the time they execute. When a * protection violation occurs inside the functions, the trap handler * returns to *curpcb->pcb_onfault instead of the function. 
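The pcb_onfault contract described above amounts to "if this routine faults, resume at a saved label and return an error" instead of panicking. A userland analogy using setjmp/longjmp, purely to illustrate the control flow; the kernel implements it with a saved code label and the trap handler, not setjmp.

#include <errno.h>
#include <setjmp.h>

static jmp_buf onfault;		/* analogue of curpcb->pcb_onfault */

/* The "trap handler" would do: if onfault is armed, longjmp(onfault, 1). */
static int
copyout_model(void (*do_copy)(void))
{
	if (setjmp(onfault) != 0)
		return (EFAULT);	/* faulted: unwind with an error */
	do_copy();			/* may "fault" via longjmp() */
	return (0);
}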
*/ /* * copyout(from_kernel, to_user, len) * %rdi, %rsi, %rdx */ ENTRY(copyout_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rax movq $copyout_fault,PCB_ONFAULT(%rax) testq %rdx,%rdx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. This check is essential * because it prevents usermode from writing into the kernel. We do * not verify anywhere else that the user did not specify a rogue * address. */ /* * First, prevent address wrapping. */ movq %rsi,%rax addq %rdx,%rax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax ja copyout_fault xchgq %rdi,%rsi /* bcopy(%rsi, %rdi, %rdx) */ movq %rdx,%rcx shrq $3,%rcx rep movsq movb %dl,%cl andb $7,%cl je done_copyout rep movsb jmp done_copyout END(copyout_nosmap) ENTRY(copyout_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rax /* Trap entry clears PSL.AC */ movq $copyout_fault,PCB_ONFAULT(%rax) testq %rdx,%rdx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. */ movq %rsi,%rax addq %rdx,%rax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax ja copyout_fault xchgq %rdi,%rsi /* bcopy(%rsi, %rdi, %rdx) */ movq %rdx,%rcx shrq $3,%rcx stac rep movsq movb %dl,%cl andb $7,%cl rep movsb clac done_copyout: xorl %eax,%eax movq PCPU(CURPCB),%rdx movq %rax,PCB_ONFAULT(%rdx) POP_FRAME_POINTER ret ALIGN_TEXT copyout_fault: movq PCPU(CURPCB),%rdx movq $0,PCB_ONFAULT(%rdx) movq $EFAULT,%rax POP_FRAME_POINTER ret END(copyout_smap) /* * copyin(from_user, to_kernel, len) * %rdi, %rsi, %rdx */ ENTRY(copyin_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rax movq $copyin_fault,PCB_ONFAULT(%rax) testq %rdx,%rdx /* anything to do? */ jz done_copyin /* * make sure address is valid */ movq %rdi,%rax addq %rdx,%rax jc copyin_fault movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax ja copyin_fault xchgq %rdi,%rsi movq %rdx,%rcx movb %cl,%al shrq $3,%rcx /* copy longword-wise */ rep movsq movb %al,%cl andb $7,%cl /* copy remaining bytes */ rep movsb jmp done_copyin END(copyin_nosmap) ENTRY(copyin_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rax movq $copyin_fault,PCB_ONFAULT(%rax) testq %rdx,%rdx /* anything to do? */ jz done_copyin /* * make sure address is valid */ movq %rdi,%rax addq %rdx,%rax jc copyin_fault movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax ja copyin_fault xchgq %rdi,%rsi movq %rdx,%rcx movb %cl,%al shrq $3,%rcx /* copy longword-wise */ stac rep movsq movb %al,%cl andb $7,%cl /* copy remaining bytes */ je done_copyin rep movsb clac done_copyin: xorl %eax,%eax movq PCPU(CURPCB),%rdx movq %rax,PCB_ONFAULT(%rdx) POP_FRAME_POINTER ret END(copyin_smap) ALIGN_TEXT copyin_fault: movq PCPU(CURPCB),%rdx movq $0,PCB_ONFAULT(%rdx) movq $EFAULT,%rax POP_FRAME_POINTER ret /* * casueword32. Compare and set user integer. Returns -1 on fault, * 0 if access was successful. Old value is written to *oldp. 
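casueword32, whose contract is stated above, is a compare-and-set on a 32-bit user word that reports the observed old value and distinguishes a fault (-1) from a completed access (0). Its semantics, minus the user-address check and fault handling that cannot be shown here, look like this C11 sketch:

#include <stdatomic.h>
#include <stdint.h>

/*
 * Sketch of casueword32() semantics without the fault path: returns 0
 * for a successful access and writes the value observed at *p into
 * *oldp; the caller compares *oldp with 'old' to learn whether the CAS
 * actually took effect.
 */
static int
casueword32_model(_Atomic uint32_t *p, uint32_t old, uint32_t *oldp,
    uint32_t new)
{
	uint32_t expected = old;

	(void)atomic_compare_exchange_strong(p, &expected, new);
	*oldp = expected;	/* old value, whether or not we won */
	return (0);		/* -1 would mean "faulted" */
}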
* dst = %rdi, old = %esi, oldp = %rdx, new = %ecx */ ENTRY(casueword32_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $fusufault,PCB_ONFAULT(%r8) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault movl %esi,%eax /* old */ #ifdef SMP lock #endif cmpxchgl %ecx,(%rdi) /* new = %ecx */ /* * The old value is in %eax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. Save %eax into %esi to prepare the return * value. */ movl %eax,%esi xorl %eax,%eax movq %rax,PCB_ONFAULT(%r8) /* * Access the oldp after the pcb_onfault is cleared, to correctly * catch corrupted pointer. */ movl %esi,(%rdx) /* oldp = %rdx */ POP_FRAME_POINTER ret END(casueword32_nosmap) ENTRY(casueword32_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $fusufault,PCB_ONFAULT(%r8) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault movl %esi,%eax /* old */ stac #ifdef SMP lock #endif cmpxchgl %ecx,(%rdi) /* new = %ecx */ clac /* * The old value is in %eax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. Save %eax into %esi to prepare the return * value. */ movl %eax,%esi xorl %eax,%eax movq %rax,PCB_ONFAULT(%r8) /* * Access the oldp after the pcb_onfault is cleared, to correctly * catch corrupted pointer. */ movl %esi,(%rdx) /* oldp = %rdx */ POP_FRAME_POINTER ret END(casueword32_smap) /* * casueword. Compare and set user long. Returns -1 on fault, * 0 if access was successful. Old value is written to *oldp. * dst = %rdi, old = %rsi, oldp = %rdx, new = %rcx */ ENTRY(casueword_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $fusufault,PCB_ONFAULT(%r8) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault movq %rsi,%rax /* old */ #ifdef SMP lock #endif cmpxchgq %rcx,(%rdi) /* new = %rcx */ /* * The old value is in %rax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. */ movq %rax,%rsi xorl %eax,%eax movq %rax,PCB_ONFAULT(%r8) movq %rsi,(%rdx) POP_FRAME_POINTER ret END(casueword_nosmap) ENTRY(casueword_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $fusufault,PCB_ONFAULT(%r8) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault movq %rsi,%rax /* old */ stac #ifdef SMP lock #endif cmpxchgq %rcx,(%rdi) /* new = %rcx */ clac /* * The old value is in %rax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. */ movq %rax,%rsi xorl %eax,%eax movq %rax,PCB_ONFAULT(%r8) movq %rsi,(%rdx) POP_FRAME_POINTER ret END(casueword_smap) /* * Fetch (load) a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit * byte from user memory. 
* addr = %rdi, valp = %rsi */ ENTRY(fueword_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault xorl %eax,%eax movq (%rdi),%r11 movq %rax,PCB_ONFAULT(%rcx) movq %r11,(%rsi) POP_FRAME_POINTER ret END(fueword_nosmap) ENTRY(fueword_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault xorl %eax,%eax stac movq (%rdi),%r11 clac movq %rax,PCB_ONFAULT(%rcx) movq %r11,(%rsi) POP_FRAME_POINTER ret END(fueword_smap) ENTRY(fueword32_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault xorl %eax,%eax movl (%rdi),%r11d movq %rax,PCB_ONFAULT(%rcx) movl %r11d,(%rsi) POP_FRAME_POINTER ret END(fueword32_nosmap) ENTRY(fueword32_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault xorl %eax,%eax stac movl (%rdi),%r11d clac movq %rax,PCB_ONFAULT(%rcx) movl %r11d,(%rsi) POP_FRAME_POINTER ret END(fueword32_smap) ENTRY(fuword16_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-2,%rax cmpq %rax,%rdi ja fusufault movzwl (%rdi),%eax movq $0,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(fuword16_nosmap) ENTRY(fuword16_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-2,%rax cmpq %rax,%rdi ja fusufault stac movzwl (%rdi),%eax clac movq $0,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(fuword16_smap) ENTRY(fubyte_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-1,%rax cmpq %rax,%rdi ja fusufault movzbl (%rdi),%eax movq $0,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(fubyte_nosmap) ENTRY(fubyte_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-1,%rax cmpq %rax,%rdi ja fusufault stac movzbl (%rdi),%eax clac movq $0,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(fubyte_smap) ALIGN_TEXT /* Fault entry clears PSL.AC */ fusufault: movq PCPU(CURPCB),%rcx xorl %eax,%eax movq %rax,PCB_ONFAULT(%rcx) decq %rax POP_FRAME_POINTER ret /* * Store a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit byte to * user memory. 
* addr = %rdi, value = %rsi */ ENTRY(suword_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movq %rsi,(%rdi) xorl %eax,%eax movq PCPU(CURPCB),%rcx movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword_nosmap) ENTRY(suword_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault stac movq %rsi,(%rdi) clac xorl %eax,%eax movq PCPU(CURPCB),%rcx movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword_smap) ENTRY(suword32_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movl %esi,(%rdi) xorl %eax,%eax movq PCPU(CURPCB),%rcx movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword32_nosmap) ENTRY(suword32_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault stac movl %esi,(%rdi) clac xorl %eax,%eax movq PCPU(CURPCB),%rcx movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword32_smap) ENTRY(suword16_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-2,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movw %si,(%rdi) xorl %eax,%eax movq PCPU(CURPCB),%rcx /* restore trashed register */ movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword16_nosmap) ENTRY(suword16_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-2,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault stac movw %si,(%rdi) clac xorl %eax,%eax movq PCPU(CURPCB),%rcx /* restore trashed register */ movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword16_smap) ENTRY(subyte_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-1,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movl %esi,%eax movb %al,(%rdi) xorl %eax,%eax movq PCPU(CURPCB),%rcx /* restore trashed register */ movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(subyte_nosmap) ENTRY(subyte_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-1,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movl %esi,%eax stac movb %al,(%rdi) clac xorl %eax,%eax movq PCPU(CURPCB),%rcx /* restore trashed register */ movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(subyte_smap) /* * copyinstr(from, to, maxlen, int *lencopied) * %rdi, %rsi, %rdx, %rcx * * copy a string from 'from' to 'to', stop when a 0 character is reached. * return ENAMETOOLONG if string is longer than maxlen, and * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. 
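The copyinstr contract described above (stop at the NUL, cap at maxlen, ENAMETOOLONG on overflow, report the copied length) can be modelled in plain C as follows; the fault/EFAULT path and the user-address clamp are omitted, and the function name is invented.

#include <errno.h>
#include <stddef.h>

/* Sketch of copyinstr()'s return contract; no fault handling here. */
static int
copystr_model(const char *from, char *to, size_t maxlen, size_t *lencopied)
{
	size_t i;
	int error = ENAMETOOLONG;

	for (i = 0; i < maxlen; i++) {
		to[i] = from[i];
		if (from[i] == '\0') {
			error = 0;
			i++;		/* count the terminating NUL */
			break;
		}
	}
	if (lencopied != NULL)
		*lencopied = i;
	return (error);
}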
*/ ENTRY(copyinstr_nosmap) PUSH_FRAME_POINTER movq %rdx,%r8 /* %r8 = maxlen */ movq %rcx,%r9 /* %r9 = *len */ xchgq %rdi,%rsi /* %rdi = from, %rsi = to */ movq PCPU(CURPCB),%rcx movq $cpystrflt,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS,%rax /* make sure 'from' is within bounds */ subq %rsi,%rax jbe cpystrflt /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpq %rdx,%rax jae 1f movq %rax,%rdx movq %rax,%r8 1: incq %rdx 2: decq %rdx jz copyinstr_toolong lodsb stosb orb %al,%al jnz 2b jmp copyinstr_succ END(copyinstr_nosmap) ENTRY(copyinstr_smap) PUSH_FRAME_POINTER movq %rdx,%r8 /* %r8 = maxlen */ movq %rcx,%r9 /* %r9 = *len */ xchgq %rdi,%rsi /* %rdi = from, %rsi = to */ movq PCPU(CURPCB),%rcx movq $cpystrflt,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS,%rax /* make sure 'from' is within bounds */ subq %rsi,%rax jbe cpystrflt /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpq %rdx,%rax jae 1f movq %rax,%rdx movq %rax,%r8 1: incq %rdx 2: decq %rdx jz copyinstr_toolong stac lodsb stosb clac orb %al,%al jnz 2b copyinstr_succ: /* Success -- 0 byte reached */ decq %rdx xorl %eax,%eax jmp cpystrflt_x copyinstr_toolong: /* rdx is zero - return ENAMETOOLONG or EFAULT */ movq $VM_MAXUSER_ADDRESS,%rax cmpq %rax,%rsi jae cpystrflt movq $ENAMETOOLONG,%rax jmp cpystrflt_x /* Fault entry clears PSL.AC */ cpystrflt: movq $EFAULT,%rax cpystrflt_x: /* set *lencopied and return %eax */ movq PCPU(CURPCB),%rcx movq $0,PCB_ONFAULT(%rcx) testq %r9,%r9 jz 1f subq %rdx,%r8 movq %r8,(%r9) 1: POP_FRAME_POINTER ret END(copyinstr_smap) /* * copystr(from, to, maxlen, int *lencopied) * %rdi, %rsi, %rdx, %rcx */ ENTRY(copystr) PUSH_FRAME_POINTER movq %rdx,%r8 /* %r8 = maxlen */ xchgq %rdi,%rsi incq %rdx 1: decq %rdx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decq %rdx xorl %eax,%eax jmp 6f 4: /* rdx is zero -- return ENAMETOOLONG */ movq $ENAMETOOLONG,%rax 6: testq %rcx,%rcx jz 7f /* set *lencopied and return %rax */ subq %rdx,%r8 movq %r8,(%rcx) 7: POP_FRAME_POINTER ret END(copystr) /* * Handling of special amd64 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ lgdt (%rdi) /* flush the prefetch q */ jmp 1f nop 1: movl $KDSEL,%eax movl %eax,%ds movl %eax,%es movl %eax,%fs /* Beware, use wrmsr to set 64 bit base */ movl %eax,%gs movl %eax,%ss /* reload code selector by turning return into intersegmental return */ popq %rax pushq $KCSEL pushq %rax MEXITCOUNT lretq END(lgdt) /*****************************************************************************/ /* setjump, longjump */ /*****************************************************************************/ ENTRY(setjmp) movq %rbx,0(%rdi) /* save rbx */ movq %rsp,8(%rdi) /* save rsp */ movq %rbp,16(%rdi) /* save rbp */ movq %r12,24(%rdi) /* save r12 */ movq %r13,32(%rdi) /* save r13 */ movq %r14,40(%rdi) /* save r14 */ movq %r15,48(%rdi) /* save r15 */ movq 0(%rsp),%rdx /* get rta */ movq %rdx,56(%rdi) /* save rip */ xorl %eax,%eax /* return(0); */ ret END(setjmp) ENTRY(longjmp) movq 0(%rdi),%rbx /* restore rbx */ movq 8(%rdi),%rsp /* restore rsp */ movq 16(%rdi),%rbp /* restore rbp */ movq 24(%rdi),%r12 /* restore r12 */ movq 32(%rdi),%r13 /* restore r13 */ movq 40(%rdi),%r14 /* restore r14 */ movq 48(%rdi),%r15 /* restore r15 */ movq 56(%rdi),%rdx /* get rta */ movq %rdx,0(%rsp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret END(longjmp) /* * Support for reading MSRs in the safe manner. (Instead of panic on #gp, * return an error.) 
*/ ENTRY(rdmsr_safe) /* int rdmsr_safe(u_int msr, uint64_t *data) */ PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $msr_onfault,PCB_ONFAULT(%r8) movl %edi,%ecx rdmsr /* Read MSR pointed by %ecx. Returns hi byte in edx, lo in %eax */ salq $32,%rdx /* sign-shift %rdx left */ movl %eax,%eax /* zero-extend %eax -> %rax */ orq %rdx,%rax movq %rax,(%rsi) xorq %rax,%rax movq %rax,PCB_ONFAULT(%r8) POP_FRAME_POINTER ret /* * Support for writing MSRs in the safe manner. (Instead of panic on #gp, * return an error.) */ ENTRY(wrmsr_safe) /* int wrmsr_safe(u_int msr, uint64_t data) */ PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $msr_onfault,PCB_ONFAULT(%r8) movl %edi,%ecx movl %esi,%eax sarq $32,%rsi movl %esi,%edx wrmsr /* Write MSR pointed by %ecx. Accepts hi byte in edx, lo in %eax. */ xorq %rax,%rax movq %rax,PCB_ONFAULT(%r8) POP_FRAME_POINTER ret /* * MSR operations fault handler */ ALIGN_TEXT msr_onfault: movq $0,PCB_ONFAULT(%r8) movl $EFAULT,%eax POP_FRAME_POINTER ret /* * void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3); * Invalidates address space addressed by ucr3, then returns to kcr3. * Done in assembler to ensure no other memory accesses happen while * on ucr3. */ ALIGN_TEXT ENTRY(pmap_pti_pcid_invalidate) pushfq cli movq %rdi,%cr3 /* to user page table */ movq %rsi,%cr3 /* back to kernel */ popfq retq /* * void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va); * Invalidates virtual address va in address space ucr3, then returns to kcr3. */ ALIGN_TEXT ENTRY(pmap_pti_pcid_invlpg) pushfq cli movq %rdi,%cr3 /* to user page table */ invlpg (%rdx) movq %rsi,%cr3 /* back to kernel */ popfq retq /* * void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva, * vm_offset_t eva); * Invalidates virtual addresses between sva and eva in address space ucr3, * then returns to kcr3. */ ALIGN_TEXT ENTRY(pmap_pti_pcid_invlrng) pushfq cli movq %rdi,%cr3 /* to user page table */ 1: invlpg (%rdx) addq $PAGE_SIZE,%rdx cmpq %rdx,%rcx ja 1b movq %rsi,%cr3 /* back to kernel */ popfq retq .altmacro .macro ibrs_seq_label l handle_ibrs_\l: .endm .macro ibrs_call_label l call handle_ibrs_\l .endm .macro ibrs_seq count ll=1 .rept \count ibrs_call_label %(ll) nop ibrs_seq_label %(ll) addq $8,%rsp ll=ll+1 .endr .endm /* all callers already saved %rax, %rdx, and %rcx */ ENTRY(handle_ibrs_entry) cmpb $0,hw_ibrs_active(%rip) je 1f movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr orl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax orl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32,%edx wrmsr movb $1,PCPU(IBPB_SET) testl $CPUID_STDEXT_SMEP,cpu_stdext_feature(%rip) jne 1f ibrs_seq 32 1: ret END(handle_ibrs_entry) ENTRY(handle_ibrs_exit) cmpb $0,PCPU(IBPB_SET) je 1f movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr andl $~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax andl $~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx wrmsr movb $0,PCPU(IBPB_SET) 1: ret END(handle_ibrs_exit) /* registers-neutral version, but needs stack */ ENTRY(handle_ibrs_exit_rs) cmpb $0,PCPU(IBPB_SET) je 1f pushq %rax pushq %rdx pushq %rcx movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr andl $~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax andl $~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx wrmsr popq %rcx popq %rdx popq %rax movb $0,PCPU(IBPB_SET) 1: ret END(handle_ibrs_exit_rs) .noaltmacro + +/* + * Flush L1D cache. Load enough of the data from the kernel text + * to flush existing L1D content. + * + * N.B. 
The function follows ABI calling conventions, but the vmm.ko
+ * caller expects that only %rax, %rcx, %r9, and %rflags registers
+ * are clobbered.
+ */
+ENTRY(flush_l1d_sw)
+#define	L1D_FLUSH_SIZE	(64 * 1024)
+	movq	$KERNBASE, %r9
+	movq	$-L1D_FLUSH_SIZE, %rcx
+	/*
+	 * pass 1: Preload TLB.
+	 * Kernel text is mapped using superpages.  TLB preload is
+	 * done for the benefit of older CPUs which split 2M page
+	 * into 4k TLB entries.
+	 */
+1:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
+	addq	$PAGE_SIZE, %rcx
+	jne	1b
+	xorl	%eax, %eax
+	cpuid
+	movq	$-L1D_FLUSH_SIZE, %rcx
+	/* pass 2: Read each cache line. */
+2:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
+	addq	$64, %rcx
+	jne	2b
+	lfence
+	ret
+#undef	L1D_FLUSH_SIZE
+END(flush_l1d_sw)
Index: head/sys/amd64/amd64/trap.c
===================================================================
--- head/sys/amd64/amd64/trap.c	(revision 338067)
+++ head/sys/amd64/amd64/trap.c	(revision 338068)
@@ -1,1066 +1,1080 @@
/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * AMD64 Trap and System call handling */ #include "opt_clock.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include #endif extern inthand_t IDTVEC(bpt), IDTVEC(bpt_pti), IDTVEC(dbg), IDTVEC(fast_syscall), IDTVEC(fast_syscall_pti), IDTVEC(fast_syscall32), IDTVEC(int0x80_syscall_pti), IDTVEC(int0x80_syscall); void __noinline trap(struct trapframe *frame); void trap_check(struct trapframe *frame); void dblfault_handler(struct trapframe *frame); static int trap_pfault(struct trapframe *, int); static void trap_fatal(struct trapframe *, vm_offset_t); #define MAX_TRAP_MSG 32 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "debug exception", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ "SIMD floating-point exception", /* 29 T_XMMFLT */ "reserved (unknown) fault", /* 30 T_RESERVED */ "", /* 31 unused (reserved) */ "DTrace pid return trap", /* 32 T_DTRACE_RET */ }; static int prot_fault_translation; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN, &prot_fault_translation, 0, "Select signal to deliver on protection fault"); static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* + * Control L1D flush on return from NMI. + * + * Tunable can be set to the following values: + * 0 - only enable flush on return from NMI if required by vmm.ko (default) + * >1 - always flush on return from NMI. + * + * Post-boot, the sysctl indicates if flushing is currently enabled. + */ +int nmi_flush_l1d_sw; +SYSCTL_INT(_machdep, OID_AUTO, nmi_flush_l1d_sw, CTLFLAG_RWTUN, + &nmi_flush_l1d_sw, 0, + "Flush L1 Data Cache on NMI exit, software bhyve L1TF mitigation assist"); + +/* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. 
*/ void trap(struct trapframe *frame) { ksiginfo_t ksi; struct thread *td; struct proc *p; register_t addr, dr6; int signo, ucode; u_int type; td = curthread; p = td->td_proc; signo = 0; ucode = 0; addr = 0; dr6 = 0; VM_CNT_INC(v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI && ipi_nmi_handler() == 0) return; #endif #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); return; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI. If the PMC module is * active, pass the 'rip' value to the PMC module's interrupt * handler. A non-zero return value from the handler means that * the NMI was consumed by it and we can return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(frame) != 0) return; #endif #ifdef STACK if (stack_nmi_handler(frame) != 0) return; #endif } if ((frame->tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (TRAPF_USERMODE(frame)) uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We shouldn't enable interrupts while holding a * spin lock. */ if (td->td_md.md_spinlock_count == 0) enable_intr(); } } if (TRAPF_USERMODE(frame)) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_rip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ signo = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ enable_intr(); #ifdef KDTRACE_HOOKS if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(frame) == 0) return; #endif signo = SIGTRAP; ucode = TRAP_BRKPT; break; case T_TRCTRAP: /* debug exception */ enable_intr(); signo = SIGTRAP; ucode = TRAP_TRACE; dr6 = rdr6(); if ((dr6 & DBREG_DR6_BS) != 0) { PROC_LOCK(td->td_proc); if ((td->td_dbgflags & TDB_STEP) != 0) { td->td_frame->tf_rflags &= ~PSL_T; td->td_dbgflags &= ~TDB_STEP; } PROC_UNLOCK(td->td_proc); } break; case T_ARITHTRAP: /* arithmetic trap */ ucode = fputrap_x87(); if (ucode == -1) return; signo = SIGFPE; break; case T_PROTFLT: /* general protection fault */ signo = SIGBUS; ucode = BUS_OBJERR; break; case T_STKFLT: /* stack fault */ case T_SEGNPFLT: /* segment not present fault */ signo = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ signo = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: signo = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: signo = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ /* * Emulator can take care about this trap? */ if (*p->p_sysent->sv_trap != NULL && (*p->p_sysent->sv_trap)(td) == 0) return; addr = frame->tf_addr; signo = trap_pfault(frame, TRUE); if (signo == -1) return; if (signo == 0) goto userret; if (signo == SIGSEGV) { ucode = SEGV_MAPERR; } else if (prot_fault_translation == 0) { /* * Autodetect. This check also covers * the images without the ABI-tag ELF * note. 
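The user-visible outcome of this signal selection can be observed from userland. Below is a minimal sketch (not part of this change; all names in it are mine) that provokes both kinds of page fault and records the delivered signal and si_code. On a binary built against a current FreeBSD ABI with machdep.prot_fault_translation left at its default of 0, the unmapped access should report SIGSEGV/SEGV_MAPERR and the write to a read-only page SIGSEGV/SEGV_ACCERR, while older binaries or prot_fault_translation=1 would see SIGBUS for the latter, per the autodetection continued just below.

#include <sys/mman.h>

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static sigjmp_buf env;
static volatile sig_atomic_t last_sig;
static volatile sig_atomic_t last_code;

static void
handler(int sig, siginfo_t *si, void *ctx)
{

	(void)ctx;
	last_sig = sig;
	last_code = si->si_code;
	siglongjmp(env, 1);
}

int
main(void)
{
	struct sigaction sa;
	char *ro;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGSEGV, &sa, NULL) != 0 ||
	    sigaction(SIGBUS, &sa, NULL) != 0)
		exit(1);

	/* Fault on an unmapped page: expect SIGSEGV with SEGV_MAPERR. */
	if (sigsetjmp(env, 1) == 0)
		*(volatile char *)16 = 1;
	printf("unmapped:  signal %d code %d\n", (int)last_sig,
	    (int)last_code);

	/* Write to a read-only mapping: a protection fault. */
	ro = mmap(NULL, (size_t)getpagesize(), PROT_READ,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (ro == MAP_FAILED)
		exit(1);
	if (sigsetjmp(env, 1) == 0)
		*(volatile char *)ro = 1;
	printf("read-only: signal %d code %d\n", (int)last_sig,
	    (int)last_code);
	return (0);
}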
*/ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { signo = SIGSEGV; ucode = SEGV_ACCERR; } else { signo = SIGBUS; ucode = T_PAGEFLT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ signo = SIGBUS; ucode = T_PAGEFLT; } else { /* * Always SIGSEGV mode. */ signo = SIGSEGV; ucode = SEGV_ACCERR; } break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; signo = SIGFPE; break; #ifdef DEV_ISA case T_NMI: nmi_handle_intr(type, frame); return; #endif case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; signo = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; signo = SIGFPE; break; case T_DNA: /* transparent fault (due to context switch "late") */ KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); fpudna(); return; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; signo = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = fputrap_sse(); if (ucode == -1) return; signo = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); if (dtrace_return_probe_ptr != NULL) dtrace_return_probe_ptr(frame); return; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE); return; case T_DNA: if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); fpudna(); return; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * For now, supporting kernel handler * registration for FPU traps is overkill. */ trap_fatal(frame, 0); return; case T_STKFLT: /* stack fault */ case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %rip's and %rsp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. * * In case of PTI, the IRETQ faulted while the * kernel used the pti stack, and exception * frame records %rsp value pointing to that * stack. If we return normally to * doreti_iret_fault, the trapframe is * reconstructed on pti stack, and calltrap() * called on it as well. Due to the very * limited pti stack size, kernel does not * survive for too long. Switch to the normal * thread stack for the trap handling. * * Magic '5' is the number of qwords occupied by * the hardware trap frame. 
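For readers counting along, here is a small self-contained sketch (the struct name is illustrative, not a kernel type) of the five-quadword frame that the faulting iretq was consuming on the PTI trampoline stack; pti_rsp0 minus the size of this frame is exactly the %rsp value matched by the fixup that follows, and the same offset below the regular rsp0 is where the frame is relocated.

#include <stdint.h>

/*
 * The frame iretq consumes, in ascending address order starting at the
 * stack pointer in effect when iretq executes.
 */
struct iretq_frame {
	uint64_t	rip;	/* return instruction pointer */
	uint64_t	cs;	/* return code segment selector */
	uint64_t	rflags;	/* restored flags */
	uint64_t	rsp;	/* return stack pointer */
	uint64_t	ss;	/* return stack segment selector */
};

_Static_assert(sizeof(struct iretq_frame) == 5 * sizeof(uint64_t),
    "the magic '5' above counts these five quadwords");

/* pti_rsp0 - sizeof(struct iretq_frame) is the %rsp value matched above. */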
*/ if (frame->tf_rip == (long)doreti_iret) { frame->tf_rip = (long)doreti_iret_fault; if ((PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3) && (frame->tf_rsp == (uintptr_t)PCPU_GET( pti_rsp0) - 5 * sizeof(register_t))) { frame->tf_rsp = PCPU_GET(rsp0) - 5 * sizeof(register_t); } return; } if (frame->tf_rip == (long)ld_ds) { frame->tf_rip = (long)ds_load_fault; return; } if (frame->tf_rip == (long)ld_es) { frame->tf_rip = (long)es_load_fault; return; } if (frame->tf_rip == (long)ld_fs) { frame->tf_rip = (long)fs_load_fault; return; } if (frame->tf_rip == (long)ld_gs) { frame->tf_rip = (long)gs_load_fault; return; } if (frame->tf_rip == (long)ld_gsbase) { frame->tf_rip = (long)gsbase_load_fault; return; } if (frame->tf_rip == (long)ld_fsbase) { frame->tf_rip = (long)fsbase_load_fault; return; } if (curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_rflags & PSL_NT) { frame->tf_rflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* debug exception */ /* Clear any pending debug events. */ dr6 = rdr6(); load_dr6(0); /* * Ignore debug register exceptions due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap(dr6)) return; /* * Malicious user code can configure a debug * register watchpoint to trap on data access * to the top of stack and then execute 'pop * %ss; int 3'. Due to exception deferral for * 'pop %ss', the CPU will not interrupt 'int * 3' to raise the DB# exception for the debug * register but will postpone the DB# until * execution of the first instruction of the * BP# handler (in kernel mode). Normally the * previous check would ignore DB# exceptions * for watchpoints on user addresses raised in * kernel mode. However, some CPU errata * include cases where DB# exceptions do not * properly set bits in %dr6, e.g. Haswell * HSD23 and Skylake-X SKZ24. * * A deferred DB# can also be raised on the * first instructions of system call entry * points or single-step traps via similar use * of 'pop %ss' or 'mov xxx, %ss'. */ if (pti) { if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall_pti) || #ifdef COMPAT_FREEBSD32 frame->tf_rip == (uintptr_t)IDTVEC(int0x80_syscall_pti) || #endif frame->tf_rip == (uintptr_t)IDTVEC(bpt_pti)) return; } else { if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall) || #ifdef COMPAT_FREEBSD32 frame->tf_rip == (uintptr_t)IDTVEC(int0x80_syscall) || #endif frame->tf_rip == (uintptr_t)IDTVEC(bpt)) return; } if (frame->tf_rip == (uintptr_t)IDTVEC(dbg) || /* Needed for AMD. */ frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32)) return; /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB if (kdb_trap(type, dr6, frame)) return; #endif break; #ifdef DEV_ISA case T_NMI: nmi_handle_intr(type, frame); return; #endif } trap_fatal(frame, 0); return; } /* Translate fault for emulators (e.g. 
Linux) */ if (*p->p_sysent->sv_transtrap != NULL) signo = (*p->p_sysent->sv_transtrap)(signo, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = signo; ksi.ksi_code = ucode; ksi.ksi_trapno = type; ksi.ksi_addr = (void *)addr; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %lx code %d type %d " "addr 0x%lx rsp 0x%lx rip 0x%lx " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type, addr, frame->tf_rsp, frame->tf_rip, fubyte((void *)(frame->tf_rip + 0)), fubyte((void *)(frame->tf_rip + 1)), fubyte((void *)(frame->tf_rip + 2)), fubyte((void *)(frame->tf_rip + 3)), fubyte((void *)(frame->tf_rip + 4)), fubyte((void *)(frame->tf_rip + 5)), fubyte((void *)(frame->tf_rip + 6)), fubyte((void *)(frame->tf_rip + 7))); } KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); /* * Clear any pending debug exceptions after allowing a * debugger to read DR6 while stopped in trapsignal(). */ if (type == T_TRCTRAP) load_dr6(0); userret: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); } /* * Ensure that we ignore any DTrace-induced faults. This function cannot * be instrumented, so it cannot generate such faults itself. */ void trap_check(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, frame->tf_trapno) != 0) return; #endif trap(frame); } static bool trap_is_smap(struct trapframe *frame) { /* * A page fault on a userspace address is classified as * SMAP-induced if: * - SMAP is supported; * - kernel mode accessed present data page; * - rflags.AC was cleared. * Kernel must never access user space with rflags.AC cleared * if SMAP is enabled. */ return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 && (frame->tf_err & (PGEX_P | PGEX_U | PGEX_I | PGEX_RSV)) == PGEX_P && (frame->tf_rflags & PSL_AC) == 0); } static int trap_pfault(struct trapframe *frame, int usermode) { struct thread *td; struct proc *p; vm_map_t map; vm_offset_t va; int rv; vm_prot_t ftype; vm_offset_t eva; td = curthread; p = td->td_proc; eva = frame->tf_addr; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. 
* If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } va = trunc_page(eva); if (va >= VM_MIN_KERNEL_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) return (SIGSEGV); map = kernel_map; } else { map = &p->p_vmspace->vm_map; /* * When accessing a usermode address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. * * If SMAP is enabled, filter SMAP faults also, * because illegal access might occur to the mapped * user address, causing infinite loop. */ if (!usermode && (td->td_intr_nesting_level != 0 || trap_is_smap(frame) || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * If nx protection of the usermode portion of kernel page * tables caused trap, panic. */ if (PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3 && usermode && pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W | PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) && (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK)== (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK)) panic("PTI: pid %d comm %s tf_err %#lx\n", p->p_pid, p->p_comm, frame->tf_err); /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, ss; u_int type; struct soft_segment_descriptor softseg; char *msg; #ifdef KDB bool handled; #endif code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)], &softseg); if (type <= MAX_TRAP_MSG) msg = trap_msg[type]; else msg = "UNKNOWN"; printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, TRAPF_USERMODE(frame) ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_I ? "instruction" : "data", code & PGEX_RSV ? "reserved bits in PTE" : code & PGEX_P ? 
"protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", frame->tf_cs & 0xffff, frame->tf_rip); ss = frame->tf_ss & 0xffff; printf("stack pointer = 0x%x:0x%lx\n", ss, frame->tf_rsp); printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_rflags & PSL_T) printf("trace trap, "); if (frame->tf_rflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_rflags & PSL_NT) printf("nested task, "); if (frame->tf_rflags & PSL_RF) printf("resume, "); printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_panic) { kdb_why = KDB_WHY_TRAP; handled = kdb_trap(type, 0, frame); kdb_why = KDB_WHY_UNSET; if (handled) return; } #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). */ void dblfault_handler(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault\n" "rip %#lx rsp %#lx rbp %#lx\n" "rax %#lx rdx %#lx rbx %#lx\n" "rcx %#lx rsi %#lx rdi %#lx\n" "r8 %#lx r9 %#lx r10 %#lx\n" "r11 %#lx r12 %#lx r13 %#lx\n" "r14 %#lx r15 %#lx rflags %#lx\n" "cs %#lx ss %#lx ds %#hx es %#hx fs %#hx gs %#hx\n" "fsbase %#lx gsbase %#lx kgsbase %#lx\n", frame->tf_rip, frame->tf_rsp, frame->tf_rbp, frame->tf_rax, frame->tf_rdx, frame->tf_rbx, frame->tf_rcx, frame->tf_rdi, frame->tf_rsi, frame->tf_r8, frame->tf_r9, frame->tf_r10, frame->tf_r11, frame->tf_r12, frame->tf_r13, frame->tf_r14, frame->tf_r15, frame->tf_rflags, frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es, frame->tf_fs, frame->tf_gs, rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; register_t *argp; struct syscall_args *sa; caddr_t params; int reg, regcnt, error; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; reg = 0; regcnt = 6; sa->code = frame->tf_rax; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = frame->tf_rdi; reg++; regcnt--; } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]), ("Too many syscall arguments!")); error = 0; argp = &frame->tf_rdi; argp += reg; memcpy(sa->args, argp, sizeof(sa->args[0]) * 6); if (sa->narg > regcnt) { params = (caddr_t)frame->tf_rsp + sizeof(register_t); error = copyin(params, &sa->args[regcnt], (sa->narg - regcnt) * sizeof(sa->args[0])); } if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; } return (error); } 
#include "../../kern/subr_syscall.c" /* * System call handler for native binaries. The trap frame is already * set up by the assembler trampoline and a pointer to it is saved in * td_frame. */ void amd64_syscall(struct thread *td, int traced) { int error; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!TRAPF_USERMODE(td->td_frame)) { panic("syscall"); /* NOT REACHED */ } #endif error = syscallenter(td); /* * Traced syscall. */ if (__predict_false(traced)) { td->td_frame->tf_rflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)td->td_frame->tf_rip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", syscallname(td->td_proc, td->td_sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, td->td_sa.code))); KASSERT(td->td_md.md_invl_gen.gen == 0, ("System call %s returning with leaked invl_gen %lu", syscallname(td->td_proc, td->td_sa.code), td->td_md.md_invl_gen.gen)); syscallret(td, error); /* * If the user-supplied value of %rip is not a canonical * address, then some CPUs will trigger a ring 0 #GP during * the sysret instruction. However, the fault handler would * execute in ring 0 with the user's %gs and %rsp which would * not be safe. Instead, use the full return path which * catches the problem safely. */ if (__predict_false(td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS)) set_pcb_flags(td->td_pcb, PCB_FULL_IRET); } Index: head/sys/amd64/include/md_var.h =================================================================== --- head/sys/amd64/include/md_var.h (revision 338067) +++ head/sys/amd64/include/md_var.h (revision 338068) @@ -1,78 +1,79 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1995 Bruce D. Evans. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _MACHINE_MD_VAR_H_ #define _MACHINE_MD_VAR_H_ #include extern uint64_t *vm_page_dump; extern int hw_lower_amd64_sharedpage; extern int hw_ibrs_disable; extern int hw_ssb_disable; +extern int nmi_flush_l1d_sw; /* * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its * value is the physical address at which the kernel is loaded. */ extern char kernphys[]; struct savefpu; struct sysentvec; void amd64_conf_fast_syscall(void); void amd64_db_resume_dbreg(void); void amd64_lower_shared_page(struct sysentvec *); void amd64_syscall(struct thread *td, int traced); void doreti_iret(void) __asm(__STRING(doreti_iret)); void doreti_iret_fault(void) __asm(__STRING(doreti_iret_fault)); void ld_ds(void) __asm(__STRING(ld_ds)); void ld_es(void) __asm(__STRING(ld_es)); void ld_fs(void) __asm(__STRING(ld_fs)); void ld_gs(void) __asm(__STRING(ld_gs)); void ld_fsbase(void) __asm(__STRING(ld_fsbase)); void ld_gsbase(void) __asm(__STRING(ld_gsbase)); void ds_load_fault(void) __asm(__STRING(ds_load_fault)); void es_load_fault(void) __asm(__STRING(es_load_fault)); void fs_load_fault(void) __asm(__STRING(fs_load_fault)); void gs_load_fault(void) __asm(__STRING(gs_load_fault)); void fsbase_load_fault(void) __asm(__STRING(fsbase_load_fault)); void gsbase_load_fault(void) __asm(__STRING(gsbase_load_fault)); void fpstate_drop(struct thread *td); void pagezero(void *addr); void setidt(int idx, alias_for_inthand_t *func, int typ, int dpl, int ist); void sse2_pagezero(void *addr); struct savefpu *get_pcb_user_save_td(struct thread *td); struct savefpu *get_pcb_user_save_pcb(struct pcb *pcb); #endif /* !_MACHINE_MD_VAR_H_ */ Index: head/sys/amd64/vmm/intel/vmx.c =================================================================== --- head/sys/amd64/vmm/intel/vmx.c (revision 338067) +++ head/sys/amd64/vmm/intel/vmx.c (revision 338068) @@ -1,3681 +1,3715 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_host.h" #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "ept.h" #include "vmx_cpufunc.h" #include "vmx.h" #include "vmx_msr.h" #include "x86.h" #include "vmx_controls.h" #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ PINBASED_NMI_EXITING | \ PINBASED_VIRTUAL_NMI) #define PINBASED_CTLS_ZERO_SETTING 0 #define PROCBASED_CTLS_WINDOW_SETTING \ (PROCBASED_INT_WINDOW_EXITING | \ PROCBASED_NMI_WINDOW_EXITING) #define PROCBASED_CTLS_ONE_SETTING \ (PROCBASED_SECONDARY_CONTROLS | \ PROCBASED_MWAIT_EXITING | \ PROCBASED_MONITOR_EXITING | \ PROCBASED_IO_EXITING | \ PROCBASED_MSR_BITMAPS | \ PROCBASED_CTLS_WINDOW_SETTING | \ PROCBASED_CR8_LOAD_EXITING | \ PROCBASED_CR8_STORE_EXITING) #define PROCBASED_CTLS_ZERO_SETTING \ (PROCBASED_CR3_LOAD_EXITING | \ PROCBASED_CR3_STORE_EXITING | \ PROCBASED_IO_BITMAPS) #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT #define PROCBASED_CTLS2_ZERO_SETTING 0 #define VM_EXIT_CTLS_ONE_SETTING \ (VM_EXIT_SAVE_DEBUG_CONTROLS | \ VM_EXIT_HOST_LMA | \ VM_EXIT_SAVE_EFER | \ VM_EXIT_LOAD_EFER | \ VM_EXIT_ACKNOWLEDGE_INTERRUPT) #define VM_EXIT_CTLS_ZERO_SETTING 0 #define VM_ENTRY_CTLS_ONE_SETTING \ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ VM_ENTRY_LOAD_EFER) #define VM_ENTRY_CTLS_ZERO_SETTING \ (VM_ENTRY_INTO_SMM | \ VM_ENTRY_DEACTIVATE_DUAL_MONITOR) #define HANDLED 1 #define UNHANDLED 0 static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; static uint32_t exit_ctls, entry_ctls; static uint64_t cr0_ones_mask, cr0_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, &cr0_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, &cr0_zeros_mask, 0, NULL); static uint64_t cr4_ones_mask, cr4_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, &cr4_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, &cr4_zeros_mask, 0, NULL); static int vmx_initialized; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, &vmx_initialized, 0, "Intel VMX initialized"); /* * Optional capabilities */ static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); static int cap_halt_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, "HLT triggers a VM-exit"); static int cap_pause_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 0, "PAUSE triggers a VM-exit"); static int cap_unrestricted_guest; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, &cap_unrestricted_guest, 0, "Unrestricted guests"); static int cap_monitor_trap; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, &cap_monitor_trap, 0, "Monitor trap flag"); static int cap_invpcid; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 0, "Guests are allowed to use INVPCID"); static int virtual_interrupt_delivery; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, 
virtual_interrupt_delivery, CTLFLAG_RD, &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); static int posted_interrupts; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, &posted_interrupts, 0, "APICv posted interrupt support"); static int pirvec = -1; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, &pirvec, 0, "APICv posted interrupt vector"); static struct unrhdr *vpid_unr; static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); static int guest_l1d_flush; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, &guest_l1d_flush, 0, NULL); +static int guest_l1d_flush_sw; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD, + &guest_l1d_flush_sw, 0, NULL); -uint64_t vmx_msr_flush_cmd; +static struct msr_entry msr_load_list[1] __aligned(16); /* * The definitions of SDT probes for VMX. */ SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, "struct vmx *", "int", "struct vm_exit *", "uint64_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, "struct vmx *", "int", "struct vm_exit *", "uint64_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, return, "struct vmx *", "int", "struct vm_exit *", "int"); /* * Use the last page below 4GB as the APIC access address. This address is * occupied by the boot firmware so it is guaranteed that it will not conflict * with a page in system memory. 
*/ #define APIC_ACCESS_ADDRESS 0xFFFFF000 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); static void vmx_inject_pir(struct vlapic *vlapic); #ifdef KTR static const char * exit_reason_to_str(int reason) { static char reasonbuf[32]; switch (reason) { case EXIT_REASON_EXCEPTION: return "exception"; case EXIT_REASON_EXT_INTR: return "extint"; case EXIT_REASON_TRIPLE_FAULT: return "triplefault"; case EXIT_REASON_INIT: return "init"; case EXIT_REASON_SIPI: return "sipi"; case EXIT_REASON_IO_SMI: return "iosmi"; case EXIT_REASON_SMI: return "smi"; case EXIT_REASON_INTR_WINDOW: return "intrwindow"; case EXIT_REASON_NMI_WINDOW: return "nmiwindow"; case EXIT_REASON_TASK_SWITCH: return "taskswitch"; case EXIT_REASON_CPUID: return "cpuid"; case EXIT_REASON_GETSEC: return "getsec"; case EXIT_REASON_HLT: return "hlt"; case EXIT_REASON_INVD: return "invd"; case EXIT_REASON_INVLPG: return "invlpg"; case EXIT_REASON_RDPMC: return "rdpmc"; case EXIT_REASON_RDTSC: return "rdtsc"; case EXIT_REASON_RSM: return "rsm"; case EXIT_REASON_VMCALL: return "vmcall"; case EXIT_REASON_VMCLEAR: return "vmclear"; case EXIT_REASON_VMLAUNCH: return "vmlaunch"; case EXIT_REASON_VMPTRLD: return "vmptrld"; case EXIT_REASON_VMPTRST: return "vmptrst"; case EXIT_REASON_VMREAD: return "vmread"; case EXIT_REASON_VMRESUME: return "vmresume"; case EXIT_REASON_VMWRITE: return "vmwrite"; case EXIT_REASON_VMXOFF: return "vmxoff"; case EXIT_REASON_VMXON: return "vmxon"; case EXIT_REASON_CR_ACCESS: return "craccess"; case EXIT_REASON_DR_ACCESS: return "draccess"; case EXIT_REASON_INOUT: return "inout"; case EXIT_REASON_RDMSR: return "rdmsr"; case EXIT_REASON_WRMSR: return "wrmsr"; case EXIT_REASON_INVAL_VMCS: return "invalvmcs"; case EXIT_REASON_INVAL_MSR: return "invalmsr"; case EXIT_REASON_MWAIT: return "mwait"; case EXIT_REASON_MTF: return "mtf"; case EXIT_REASON_MONITOR: return "monitor"; case EXIT_REASON_PAUSE: return "pause"; case EXIT_REASON_MCE_DURING_ENTRY: return "mce-during-entry"; case EXIT_REASON_TPR: return "tpr"; case EXIT_REASON_APIC_ACCESS: return "apic-access"; case EXIT_REASON_GDTR_IDTR: return "gdtridtr"; case EXIT_REASON_LDTR_TR: return "ldtrtr"; case EXIT_REASON_EPT_FAULT: return "eptfault"; case EXIT_REASON_EPT_MISCONFIG: return "eptmisconfig"; case EXIT_REASON_INVEPT: return "invept"; case EXIT_REASON_RDTSCP: return "rdtscp"; case EXIT_REASON_VMX_PREEMPT: return "vmxpreempt"; case EXIT_REASON_INVVPID: return "invvpid"; case EXIT_REASON_WBINVD: return "wbinvd"; case EXIT_REASON_XSETBV: return "xsetbv"; case EXIT_REASON_APIC_WRITE: return "apic-write"; default: snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); return (reasonbuf); } } #endif /* KTR */ static int vmx_allow_x2apic_msrs(struct vmx *vmx) { int i, error; error = 0; /* * Allow readonly access to the following x2APIC MSRs from the guest. 
*/ error += guest_msr_ro(vmx, MSR_APIC_ID); error += guest_msr_ro(vmx, MSR_APIC_VERSION); error += guest_msr_ro(vmx, MSR_APIC_LDR); error += guest_msr_ro(vmx, MSR_APIC_SVR); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); error += guest_msr_ro(vmx, MSR_APIC_ESR); error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_ICR); /* * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. * * These registers get special treatment described in the section * "Virtualizing MSR-Based APIC Accesses". */ error += guest_msr_rw(vmx, MSR_APIC_TPR); error += guest_msr_rw(vmx, MSR_APIC_EOI); error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); return (error); } u_long vmx_fix_cr0(u_long cr0) { return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); } u_long vmx_fix_cr4(u_long cr4) { return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); } static void vpid_free(int vpid) { if (vpid < 0 || vpid > 0xffff) panic("vpid_free: invalid vpid %d", vpid); /* * VPIDs [0,VM_MAXCPU] are special and are not allocated from * the unit number allocator. */ if (vpid > VM_MAXCPU) free_unr(vpid_unr, vpid); } static void vpid_alloc(uint16_t *vpid, int num) { int i, x; if (num <= 0 || num > VM_MAXCPU) panic("invalid number of vpids requested: %d", num); /* * If the "enable vpid" execution control is not enabled then the * VPID is required to be 0 for all vcpus. */ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { for (i = 0; i < num; i++) vpid[i] = 0; return; } /* * Allocate a unique VPID for each vcpu from the unit number allocator. */ for (i = 0; i < num; i++) { x = alloc_unr(vpid_unr); if (x == -1) break; else vpid[i] = x; } if (i < num) { atomic_add_int(&vpid_alloc_failed, 1); /* * If the unit number allocator does not have enough unique * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. * * These VPIDs are not be unique across VMs but this does not * affect correctness because the combined mappings are also * tagged with the EP4TA which is unique for each VM. * * It is still sub-optimal because the invvpid will invalidate * combined mappings for a particular VPID across all EP4TAs. */ while (i-- > 0) vpid_free(vpid[i]); for (i = 0; i < num; i++) vpid[i] = i + 1; } } static void vpid_init(void) { /* * VPID 0 is required when the "enable VPID" execution control is * disabled. * * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the * unit number allocator does not have sufficient unique VPIDs to * satisfy the allocation. * * The remaining VPIDs are managed by the unit number allocator. */ vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); } static void vmx_disable(void *arg __unused) { struct invvpid_desc invvpid_desc = { 0 }; struct invept_desc invept_desc = { 0 }; if (vmxon_enabled[curcpu]) { /* * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. * * VMXON or VMXOFF are not required to invalidate any TLB * caching structures. This prevents potential retention of * cached information in the TLB between distinct VMX episodes. 
*/ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); vmxoff(); } load_cr4(rcr4() & ~CR4_VMXE); } static int vmx_cleanup(void) { if (pirvec >= 0) lapic_ipi_free(pirvec); if (vpid_unr != NULL) { delete_unrhdr(vpid_unr); vpid_unr = NULL; } + if (nmi_flush_l1d_sw == 1) + nmi_flush_l1d_sw = 0; + smp_rendezvous(NULL, vmx_disable, NULL, NULL); return (0); } static void vmx_enable(void *arg __unused) { int error; uint64_t feature_control; feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | IA32_FEATURE_CONTROL_VMX_EN | IA32_FEATURE_CONTROL_LOCK); } load_cr4(rcr4() | CR4_VMXE); *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); error = vmxon(vmxon_region[curcpu]); if (error == 0) vmxon_enabled[curcpu] = 1; } static void vmx_restore(void) { if (vmxon_enabled[curcpu]) vmxon(vmxon_region[curcpu]); } static int vmx_init(int ipinum) { int error, use_tpr_shadow; uint64_t basic, fixed0, fixed1, feature_control; uint32_t tmp, procbased2_vid_bits; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } /* * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits * are set (bits 0 and 2 respectively). */ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { printf("vmx_init: VMX operation disabled by BIOS\n"); return (ENXIO); } /* * Verify capabilities MSR_VMX_BASIC: * - bit 54 indicates support for INS/OUTS decoding */ basic = rdmsr(MSR_VMX_BASIC); if ((basic & (1UL << 54)) == 0) { printf("vmx_init: processor does not support desired basic " "capabilities\n"); return (EINVAL); } /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); if (error) { printf("vmx_init: processor does not support desired primary " "processor-based controls\n"); return (error); } /* Clear the processor-based ctl bits that are set on demand */ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; /* Check support for secondary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); if (error) { printf("vmx_init: processor does not support desired secondary " "processor-based controls\n"); return (error); } /* Check support for VPID */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_VPID, 0, &tmp); if (error == 0) procbased_ctls2 |= PROCBASED2_ENABLE_VPID; /* Check support for pin-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); if (error) { printf("vmx_init: processor does not support desired " "pin-based controls\n"); return (error); } /* Check support for VM-exit controls */ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &exit_ctls); if (error) { printf("vmx_init: processor does not support desired " "exit controls\n"); return (error); } /* Check 
support for VM-entry controls */ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &entry_ctls); if (error) { printf("vmx_init: processor does not support desired " "entry controls\n"); return (error); } /* * Check support for optional features by testing them * as individual bits */ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_HLT_EXITING, 0, &tmp) == 0); cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_PROCBASED_CTLS, PROCBASED_MTF, 0, &tmp) == 0); cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_PAUSE_EXITING, 0, &tmp) == 0); cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp) == 0); cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, &tmp) == 0); /* * Check support for virtual interrupt delivery. */ procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | PROCBASED2_VIRTUALIZE_X2APIC_MODE | PROCBASED2_APIC_REGISTER_VIRTUALIZATION | PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, &tmp) == 0); error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, procbased2_vid_bits, 0, &tmp); if (error == 0 && use_tpr_shadow) { virtual_interrupt_delivery = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", &virtual_interrupt_delivery); } if (virtual_interrupt_delivery) { procbased_ctls |= PROCBASED_USE_TPR_SHADOW; procbased_ctls2 |= procbased2_vid_bits; procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; /* * No need to emulate accesses to %CR8 if virtual * interrupt delivery is enabled. */ procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; /* * Check for Posted Interrupts only if Virtual Interrupt * Delivery is enabled. */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, &tmp); if (error == 0) { pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (pirvec < 0) { if (bootverbose) { printf("vmx_init: unable to allocate " "posted interrupt vector\n"); } } else { posted_interrupts = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", &posted_interrupts); } } } if (posted_interrupts) pinbased_ctls |= PINBASED_POSTED_INTERRUPT; /* Initialize EPT */ error = ept_init(ipinum); if (error) { printf("vmx_init: ept initialization failed (%d)\n", error); return (error); } guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) == 0; TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); - if (guest_l1d_flush && - (cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) != 0) - vmx_msr_flush_cmd = IA32_FLUSH_CMD_L1D; /* + * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when + * available. Otherwise fall back to the software flush + * method which loads enough data from the kernel text to + * flush existing L1D content, both on VMX entry and on NMI + * return. 
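Which of these mitigation modes was actually selected can be checked at runtime through the sysctls this change adds. Below is a userland sketch (not part of the change) using sysctlbyname(3); note that the hw.vmm.vmx.* nodes only exist once vmm.ko is loaded, and that the boot-time tunable spelled hw.vmm.l1d_flush feeds the sysctl reported here as hw.vmm.vmx.l1d_flush.

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

static int
fetch(const char *name)
{
	int val;
	size_t len;

	len = sizeof(val);
	if (sysctlbyname(name, &val, &len, NULL, 0) != 0)
		return (-1);	/* node absent, e.g. vmm.ko not loaded */
	return (val);
}

int
main(void)
{
	int flush, flush_sw, nmi_sw;

	flush = fetch("hw.vmm.vmx.l1d_flush");
	flush_sw = fetch("hw.vmm.vmx.l1d_flush_sw");
	nmi_sw = fetch("machdep.nmi_flush_l1d_sw");

	if (flush <= 0)
		printf("guest L1D flush disabled or vmm.ko not loaded\n");
	else if (flush_sw > 0)
		printf("software flush (flush_l1d_sw), NMI assist %d\n",
		    nmi_sw);
	else
		printf("hardware IA32_FLUSH_CMD MSR load on VM entry\n");
	return (0);
}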
+ */ + if (guest_l1d_flush) { + if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { + guest_l1d_flush_sw = 1; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", + &guest_l1d_flush_sw); + } + if (guest_l1d_flush_sw) { + if (nmi_flush_l1d_sw <= 1) + nmi_flush_l1d_sw = 1; + } else { + msr_load_list[0].index = MSR_IA32_FLUSH_CMD; + msr_load_list[0].val = IA32_FLUSH_CMD_L1D; + } + } + + /* * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 */ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); cr0_ones_mask = fixed0 & fixed1; cr0_zeros_mask = ~fixed0 & ~fixed1; /* * CR0_PE and CR0_PG can be set to zero in VMX non-root operation * if unrestricted guest execution is allowed. */ if (cap_unrestricted_guest) cr0_ones_mask &= ~(CR0_PG | CR0_PE); /* * Do not allow the guest to set CR0_NW or CR0_CD. */ cr0_zeros_mask |= (CR0_NW | CR0_CD); fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); cr4_ones_mask = fixed0 & fixed1; cr4_zeros_mask = ~fixed0 & ~fixed1; vpid_init(); vmx_msr_init(); /* enable VMX operation */ smp_rendezvous(NULL, vmx_enable, NULL, NULL); vmx_initialized = 1; return (0); } static void vmx_trigger_hostintr(int vector) { uintptr_t func; struct gate_descriptor *gd; gd = &idt[vector]; KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " "invalid vector %d", vector)); KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", vector)); KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " "has invalid type %d", vector, gd->gd_type)); KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " "has invalid dpl %d", vector, gd->gd_dpl)); KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " "for vector %d has invalid selector %d", vector, gd->gd_selector)); KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " "IST %d", vector, gd->gd_ist)); func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); vmx_call_isr(func); } static int vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) { int error, mask_ident, shadow_ident; uint64_t mask_value; if (which != 0 && which != 4) panic("vmx_setup_cr_shadow: unknown cr%d", which); if (which == 0) { mask_ident = VMCS_CR0_MASK; mask_value = cr0_ones_mask | cr0_zeros_mask; shadow_ident = VMCS_CR0_SHADOW; } else { mask_ident = VMCS_CR4_MASK; mask_value = cr4_ones_mask | cr4_zeros_mask; shadow_ident = VMCS_CR4_SHADOW; } error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); if (error) return (error); error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); if (error) return (error); return (0); } #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) static void * vmx_vminit(struct vm *vm, pmap_t pmap) { uint16_t vpid[VM_MAXCPU]; int i, error; struct vmx *vmx; struct vmcs *vmcs; uint32_t exc_bitmap; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { panic("malloc of struct vmx not aligned on %d byte boundary", PAGE_SIZE); } vmx->vm = vm; vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); /* * Clean up EPTP-tagged guest physical and combined mappings * * VMX transitions are not required to invalidate any guest physical * mappings. So, it may be possible for stale guest physical mappings * to be present in the processor TLBs. * * Combined mappings for this EP4TA are also invalidated for all VPIDs. 
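Before the per-vCPU VMCS setup below wires msr_load_list into the VM-entry MSR-load area, it may help to recall that area's entry format from the Intel SDM, sketched here with illustrative names; struct msr_entry presumably mirrors this layout, which is also why the list is 16-byte aligned. The single entry programmed for the hardware-assisted mode asks the CPU to write the L1D flush command bit of IA32_FLUSH_CMD on every VM entry.

#include <stdint.h>

struct vmentry_msr_load_entry {
	uint32_t	msr_index;	/* bits 31:0, MSR to load */
	uint32_t	reserved;	/* bits 63:32, must be zero */
	uint64_t	msr_value;	/* bits 127:64, value written */
};

_Static_assert(sizeof(struct vmentry_msr_load_entry) == 16,
    "each MSR-load area entry occupies 128 bits");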
*/ ept_invalidate_mappings(vmx->eptp); msr_bitmap_initialize(vmx->msr_bitmap); /* * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. * The guest FSBASE and GSBASE are saved and restored during * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are * always restored from the vmcs host state area on vm-exit. * * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in * how they are saved/restored so can be directly accessed by the * guest. * * MSR_EFER is saved and restored in the guest VMCS area on a * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. * * The TSC MSR is exposed read-only. Writes are disallowed as * that will impact the host TSC. If the guest does a write * the "use TSC offsetting" execution control is enabled and the * difference between the host TSC and the guest TSC is written * into the TSC offset in the VMCS. */ if (guest_msr_rw(vmx, MSR_GSBASE) || guest_msr_rw(vmx, MSR_FSBASE) || guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_EFER) || guest_msr_ro(vmx, MSR_TSC)) panic("vmx_vminit: error setting guest msr access"); vpid_alloc(vpid, VM_MAXCPU); if (virtual_interrupt_delivery) { error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, APIC_ACCESS_ADDRESS); /* XXX this should really return an error to the caller */ KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); } for (i = 0; i < VM_MAXCPU; i++) { vmcs = &vmx->vmcs[i]; vmcs->identifier = vmx_revision(); error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", error, i); } vmx_msr_guest_init(vmx, i); error = vmcs_init(vmcs); KASSERT(error == 0, ("vmcs_init error %d", error)); VMPTRLD(vmcs); error = 0; error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); error += vmwrite(VMCS_EPTP, vmx->eptp); error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); error += vmwrite(VMCS_VPID, vpid[i]); + + if (guest_l1d_flush && !guest_l1d_flush_sw) { + vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( + (vm_offset_t)&msr_load_list[0])); + vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, + nitems(msr_load_list)); + vmcs_write(VMCS_EXIT_MSR_STORE, 0); + vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); + } /* exception bitmap */ if (vcpu_trace_exceptions(vm, i)) exc_bitmap = 0xffffffff; else exc_bitmap = 1 << IDT_MC; error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); if (virtual_interrupt_delivery) { error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); error += vmwrite(VMCS_VIRTUAL_APIC, vtophys(&vmx->apic_page[i])); error += vmwrite(VMCS_EOI_EXIT0, 0); error += vmwrite(VMCS_EOI_EXIT1, 0); error += vmwrite(VMCS_EOI_EXIT2, 0); error += vmwrite(VMCS_EOI_EXIT3, 0); } if (posted_interrupts) { error += vmwrite(VMCS_PIR_VECTOR, pirvec); error += vmwrite(VMCS_PIR_DESC, vtophys(&vmx->pir_desc[i])); } VMCLEAR(vmcs); KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); vmx->cap[i].set = 0; vmx->cap[i].proc_ctls = procbased_ctls; vmx->cap[i].proc_ctls2 = procbased_ctls2; vmx->state[i].nextrip = ~0; vmx->state[i].lastcpu = NOCPU; vmx->state[i].vpid = vpid[i]; /* * Set up 
the CR0/4 shadows, and init the read shadow * to the power-on register value from the Intel Sys Arch. * CR0 - 0x60000010 * CR4 - 0 */ error = vmx_setup_cr0_shadow(vmcs, 0x60000010); if (error != 0) panic("vmx_setup_cr0_shadow %d", error); error = vmx_setup_cr4_shadow(vmcs, 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); vmx->ctx[i].pmap = pmap; } return (vmx); } static int vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { int handled, func; func = vmxctx->guest_rax; handled = x86_emulate_cpuid(vm, vcpu, (uint32_t*)(&vmxctx->guest_rax), (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), (uint32_t*)(&vmxctx->guest_rdx)); return (handled); } static __inline void vmx_run_trace(struct vmx *vmx, int vcpu) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); #endif } static __inline void vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, int handled) { #ifdef KTR VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", handled ? "handled" : "unhandled", exit_reason_to_str(exit_reason), rip); #endif } static __inline void vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); #endif } static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); /* * Invalidate guest mappings identified by its vpid from the TLB. */ static __inline void vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) { struct vmxstate *vmxstate; struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; if (vmxstate->vpid == 0) return; if (!running) { /* * Set the 'lastcpu' to an invalid host cpu. * * This will invalidate TLB entries tagged with the vcpu's * vpid the next time it runs via vmx_set_pcpu_defaults(). */ vmxstate->lastcpu = NOCPU; return; } KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " "critical section", __func__, vcpu)); /* * Invalidate all mappings tagged with 'vpid' * * We do this because this vcpu was executing on a different host * cpu when it last ran. We do not track whether it invalidated * mappings associated with its 'vpid' during that run. So we must * assume that the mappings associated with 'vpid' on 'curcpu' are * stale and invalidate them. * * Note that we incur this penalty only when the scheduler chooses to * move the thread associated with this vcpu between host cpus. * * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. */ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { invvpid_desc._res1 = 0; invvpid_desc._res2 = 0; invvpid_desc.vpid = vmxstate->vpid; invvpid_desc.linear_addr = 0; invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); } else { /* * The invvpid can be skipped if an invept is going to * be performed before entering the guest. The invept * will invalidate combined mappings tagged with * 'vmx->eptp' for all vpids. 
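 * This case is detected above by comparing pmap->pm_eptgen against the
 * per-cpu snapshot in vmx->eptgen[curcpu]; when they differ, an invept
 * will be performed before the next VM entry, making the invvpid redundant.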
*/ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); } } static void vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) { struct vmxstate *vmxstate; vmxstate = &vmx->state[vcpu]; if (vmxstate->lastcpu == curcpu) return; vmxstate->lastcpu = curcpu; vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); vmx_invvpid(vmx, vcpu, pmap, 1); } /* * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. */ CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); static void __inline vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); } } static void __inline vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); } static void __inline vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); } } static void __inline vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); } int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) { int error; if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); } error = vmwrite(VMCS_TSC_OFFSET, offset); return (error); } #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) static void vmx_inject_nmi(struct vmx *vmx, int vcpu) { uint32_t gi, info; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " "interruptibility-state %#x", gi)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " "VM-entry interruption information %#x", info)); /* * Inject the virtual NMI. The vector must be the NMI IDT entry * or the VMCS entry check will fail. 
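 * Because the "virtual NMIs" pin-based control is in use, delivery of
 * the injected NMI also sets virtual-NMI blocking in the guest
 * interruptibility state; it remains set until the guest executes IRET.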
*/ info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; vmcs_write(VMCS_ENTRY_INTR_INFO, info); VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); /* Clear the request */ vm_nmi_clear(vmx->vm, vcpu); } static void vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, uint64_t guestrip) { int vector, need_nmi_exiting, extint_pending; uint64_t rflags, entryinfo; uint32_t gi, info; if (vmx->state[vcpu].nextrip != guestrip) { gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vmx->state[vcpu].nextrip, guestrip); gi &= ~HWINTR_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } } if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " "intinfo is not valid: %#lx", __func__, entryinfo)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " "pending exception: %#lx/%#x", __func__, entryinfo, info)); info = entryinfo; vector = info & 0xff; if (vector == IDT_BP || vector == IDT_OF) { /* * VT-x requires #BP and #OF to be injected as software * exceptions. */ info &= ~VMCS_INTR_T_MASK; info |= VMCS_INTR_T_SWEXCEPTION; } if (info & VMCS_INTR_DEL_ERRCODE) vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); vmcs_write(VMCS_ENTRY_INTR_INFO, info); } if (vm_nmi_pending(vmx->vm, vcpu)) { /* * If there are no conditions blocking NMI injection then * inject it directly here otherwise enable "NMI window * exiting" to inject it as soon as we can. * * We also check for STI_BLOCKING because some implementations * don't allow NMI injection in this case. If we are running * on a processor that doesn't have this restriction it will * immediately exit and the NMI will be injected in the * "NMI window exiting" handler. */ need_nmi_exiting = 1; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { info = vmcs_read(VMCS_ENTRY_INTR_INFO); if ((info & VMCS_INTR_VALID) == 0) { vmx_inject_nmi(vmx, vcpu); need_nmi_exiting = 0; } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " "due to VM-entry intr info %#x", info); } } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " "Guest Interruptibility-state %#x", gi); } if (need_nmi_exiting) vmx_set_nmi_window_exiting(vmx, vcpu); } extint_pending = vm_extint_pending(vmx->vm, vcpu); if (!extint_pending && virtual_interrupt_delivery) { vmx_inject_pir(vlapic); return; } /* * If interrupt-window exiting is already in effect then don't bother * checking for pending interrupts. This is just an optimization and * not needed for correctness. */ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " "pending int_window_exiting"); return; } if (!extint_pending) { /* Ask the local apic for a vector to inject */ if (!vlapic_pending_intr(vlapic, &vector)) return; /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [16,255] can be delivered * through the local APIC. */ KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(vmx->vm, &vector); /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [0,255] can be delivered * through the INTR pin. 
*/ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); } /* Check RFLAGS.IF and the interruptibility state of the guest */ rflags = vmcs_read(VMCS_GUEST_RFLAGS); if ((rflags & PSL_I) == 0) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, rflags); goto cantinject; } gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "Guest Interruptibility-state %#x", vector, gi); goto cantinject; } info = vmcs_read(VMCS_ENTRY_INTR_INFO); if (info & VMCS_INTR_VALID) { /* * This is expected and could happen for multiple reasons: * - A vectoring VM-entry was aborted due to astpending * - A VM-exit happened during event injection. * - An exception was injected above. * - An NMI was injected above or after "NMI window exiting" */ VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "VM-entry intr info %#x", vector, info); goto cantinject; } /* Inject the interrupt */ info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; info |= vector; vmcs_write(VMCS_ENTRY_INTR_INFO, info); if (!extint_pending) { /* Update the Local APIC ISR */ vlapic_intr_accepted(vlapic, vector); } else { vm_extint_clear(vmx->vm, vcpu); vatpic_intr_accepted(vmx->vm, vector); /* * After we accepted the current ExtINT the PIC may * have posted another one. If that is the case, set * the Interrupt Window Exiting execution control so * we can inject that one too. * * Also, interrupt window exiting allows us to inject any * pending APIC vector that was preempted by the ExtINT * as soon as possible. This applies both for the software * emulated vlapic and the hardware assisted virtual APIC. */ vmx_set_int_window_exiting(vmx, vcpu); } VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); return; cantinject: /* * Set the Interrupt Window Exiting execution control so we can inject * the interrupt as soon as blocking condition goes away. */ vmx_set_int_window_exiting(vmx, vcpu); } /* * If the Virtual NMIs execution control is '1' then the logical processor * tracks virtual-NMI blocking in the Guest Interruptibility-state field of * the VMCS. An IRET instruction in VMX non-root operation will remove any * virtual-NMI blocking. * * This unblocking occurs even if the IRET causes a fault. In this case the * hypervisor needs to restore virtual-NMI blocking before resuming the guest. */ static void vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, ("NMI blocking is not in effect %#x", gi)); } static int vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { struct vmxctx *vmxctx; uint64_t xcrval; const struct xsave_limits *limits; vmxctx = &vmx->ctx[vcpu]; limits = vmm_get_xsave_limits(); /* * Note that the processor raises a GP# fault on its own if * xsetbv is executed for CPL != 0, so we do not have to * emulate that fault here. 
*/ /* Only xcr0 is supported. */ if (vmxctx->guest_rcx != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { vm_inject_ud(vmx->vm, vcpu); return (HANDLED); } xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); if ((xcrval & ~limits->xcr0_allowed) != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } if (!(xcrval & XFEATURE_ENABLED_X87)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* AVX (YMM_Hi128) requires SSE. */ if (xcrval & XFEATURE_ENABLED_AVX && (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, * ZMM_Hi256, and Hi16_ZMM. */ if (xcrval & XFEATURE_AVX512 && (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != (XFEATURE_AVX512 | XFEATURE_AVX)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * Intel MPX requires both bound register state flags to be * set. */ if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * This runs "inside" vmrun() with the guest's FPU state, so * modifying xcr0 directly modifies the guest's xcr0, not the * host's. */ load_xcr(0, xcrval); return (HANDLED); } static uint64_t vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) { const struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: return (vmxctx->guest_rax); case 1: return (vmxctx->guest_rcx); case 2: return (vmxctx->guest_rdx); case 3: return (vmxctx->guest_rbx); case 4: return (vmcs_read(VMCS_GUEST_RSP)); case 5: return (vmxctx->guest_rbp); case 6: return (vmxctx->guest_rsi); case 7: return (vmxctx->guest_rdi); case 8: return (vmxctx->guest_r8); case 9: return (vmxctx->guest_r9); case 10: return (vmxctx->guest_r10); case 11: return (vmxctx->guest_r11); case 12: return (vmxctx->guest_r12); case 13: return (vmxctx->guest_r13); case 14: return (vmxctx->guest_r14); case 15: return (vmxctx->guest_r15); default: panic("invalid vmx register %d", ident); } } static void vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) { struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: vmxctx->guest_rax = regval; break; case 1: vmxctx->guest_rcx = regval; break; case 2: vmxctx->guest_rdx = regval; break; case 3: vmxctx->guest_rbx = regval; break; case 4: vmcs_write(VMCS_GUEST_RSP, regval); break; case 5: vmxctx->guest_rbp = regval; break; case 6: vmxctx->guest_rsi = regval; break; case 7: vmxctx->guest_rdi = regval; break; case 8: vmxctx->guest_r8 = regval; break; case 9: vmxctx->guest_r9 = regval; break; case 10: vmxctx->guest_r10 = regval; break; case 11: vmxctx->guest_r11 = regval; break; case 12: vmxctx->guest_r12 = regval; break; case 13: vmxctx->guest_r13 = regval; break; case 14: vmxctx->guest_r14 = regval; break; case 15: vmxctx->guest_r15 = regval; break; default: panic("invalid vmx register %d", ident); } } static int vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr0 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR0_SHADOW, regval); crval = regval | cr0_ones_mask; crval &= ~cr0_zeros_mask; vmcs_write(VMCS_GUEST_CR0, crval); if (regval & CR0_PG) { uint64_t efer, entry_ctls; /* * If CR0.PG is 1 and EFER.LME is 1 
then EFER.LMA and * the "IA-32e mode guest" bit in VM-entry control must be * equal. */ efer = vmcs_read(VMCS_GUEST_IA32_EFER); if (efer & EFER_LME) { efer |= EFER_LMA; vmcs_write(VMCS_GUEST_IA32_EFER, efer); entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); entry_ctls |= VM_ENTRY_GUEST_LMA; vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); } } return (HANDLED); } static int vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr4 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR4_SHADOW, regval); crval = regval | cr4_ones_mask; crval &= ~cr4_zeros_mask; vmcs_write(VMCS_GUEST_CR4, crval); return (HANDLED); } static int vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { struct vlapic *vlapic; uint64_t cr8; int regnum; /* We only handle mov %cr8 to/from a register at this time. */ if ((exitqual & 0xe0) != 0x00) { return (UNHANDLED); } vlapic = vm_lapic(vmx->vm, vcpu); regnum = (exitqual >> 8) & 0xf; if (exitqual & 0x10) { cr8 = vlapic_get_cr8(vlapic); vmx_set_guest_reg(vmx, vcpu, regnum, cr8); } else { cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); vlapic_set_cr8(vlapic, cr8); } return (HANDLED); } /* * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL */ static int vmx_cpl(void) { uint32_t ssar; ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); return ((ssar >> 5) & 0x3); } static enum vm_cpu_mode vmx_cpu_mode(void) { uint32_t csar; if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); if (csar & 0x2000) return (CPU_MODE_64BIT); /* CS.L = 1 */ else return (CPU_MODE_COMPATIBILITY); } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode vmx_paging_mode(void) { if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) return (PAGING_MODE_FLAT); if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) return (PAGING_MODE_32); if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } static uint64_t inout_str_index(struct vmx *vmx, int vcpuid, int in) { uint64_t val; int error; enum vm_reg_name reg; reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; error = vmx_getreg(vmx, vcpuid, reg, &val); KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); return (val); } static uint64_t inout_str_count(struct vmx *vmx, int vcpuid, int rep) { uint64_t val; int error; if (rep) { error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); } else { val = 1; } return (val); } static int inout_str_addrsize(uint32_t inst_info) { uint32_t size; size = (inst_info >> 7) & 0x7; switch (size) { case 0: return (2); /* 16 bit */ case 1: return (4); /* 32 bit */ case 2: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { s = (inst_info >> 15) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); } static void vmx_paging_info(struct vm_guest_paging *paging) { paging->cr3 = vmcs_guest_cr3(); paging->cpl = vmx_cpl(); paging->cpu_mode = vmx_cpu_mode(); paging->paging_mode = vmx_paging_mode(); } static void vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) { struct vm_guest_paging *paging; uint32_t csar; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->inst_length = 0; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = gla; vmx_paging_info(paging); switch (paging->cpu_mode) { case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); vmexit->u.inst_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); break; default: vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } vie_init(&vmexit->u.inst_emul.vie, NULL, 0); } static int ept_fault_type(uint64_t ept_qual) { int fault_type; if (ept_qual & EPT_VIOLATION_DATA_WRITE) fault_type = VM_PROT_WRITE; else if (ept_qual & EPT_VIOLATION_INST_FETCH) fault_type = VM_PROT_EXECUTE; else fault_type= VM_PROT_READ; return (fault_type); } static boolean_t ept_emulation_fault(uint64_t ept_qual) { int read, write; /* EPT fault on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) return (FALSE); /* EPT fault must be a read fault or a write fault */ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; if ((read | write) == 0) return (FALSE); /* * The EPT violation must have been caused by accessing a * guest-physical address that is a translation of a guest-linear * address. */ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { return (FALSE); } return (TRUE); } static __inline int apic_access_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); } static __inline int x2apic_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 
1 : 0); } static int vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, uint64_t qual) { int error, handled, offset; uint32_t *apic_regs, vector; bool retu; handled = HANDLED; offset = APIC_WRITE_OFFSET(qual); if (!apic_access_virtualization(vmx, vcpuid)) { /* * In general there should not be any APIC write VM-exits * unless APIC-access virtualization is enabled. * * However self-IPI virtualization can legitimately trigger * an APIC-write VM-exit so treat it specially. */ if (x2apic_virtualization(vmx, vcpuid) && offset == APIC_OFFSET_SELF_IPI) { apic_regs = (uint32_t *)(vlapic->apic_page); vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; vlapic_self_ipi_handler(vlapic, vector); return (HANDLED); } else return (UNHANDLED); } switch (offset) { case APIC_OFFSET_ID: vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_LDR: vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: retu = false; error = vlapic_icrlo_write_handler(vlapic, &retu); if (error != 0 || retu) handled = UNHANDLED; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: vlapic_dcr_write_handler(vlapic); break; default: handled = UNHANDLED; break; } return (handled); } static bool apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) { if (apic_access_virtualization(vmx, vcpuid) && (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) return (true); else return (false); } static int vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint64_t qual; int access_type, offset, allowed; if (!apic_access_virtualization(vmx, vcpuid)) return (UNHANDLED); qual = vmexit->u.vmx.exit_qualification; access_type = APIC_ACCESS_TYPE(qual); offset = APIC_ACCESS_OFFSET(qual); allowed = 0; if (access_type == 0) { /* * Read data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } else if (access_type == 1) { /* * Write data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } if (allowed) { vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, VIE_INVALID_GLA); } /* * Regardless of whether the APIC-access is allowed this handler * always returns UNHANDLED: * - if the access is allowed then it is handled by emulating the * instruction that caused the VM-exit (outside the critical section) * - if the access is not allowed then it will be converted to an * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
*/ return (UNHANDLED); } static enum task_switch_reason vmx_task_switch_reason(uint64_t qual) { int reason; reason = (qual >> 30) & 0x3; switch (reason) { case 0: return (TSR_CALL); case 1: return (TSR_IRET); case 2: return (TSR_JMP); case 3: return (TSR_IDT_GATE); default: panic("%s: invalid reason %d", __func__, reason); } } static int emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); else error = vmx_wrmsr(vmx, vcpuid, num, val, retu); return (error); } static int emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) { struct vmxctx *vmxctx; uint64_t result; uint32_t eax, edx; int error; if (lapic_msr(num)) error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); else error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); if (error == 0) { eax = result; vmxctx = &vmx->ctx[vcpuid]; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); edx = result >> 32; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); } return (error); } static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { int error, errcode, errcode_valid, handled, in; struct vmxctx *vmxctx; struct vlapic *vlapic; struct vm_inout_str *vis; struct vm_task_switch *ts; uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; uint32_t intr_type, intr_vec, reason; uint64_t exitintinfo, qual, gpa; bool retu; CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); handled = UNHANDLED; vmxctx = &vmx->ctx[vcpu]; qual = vmexit->u.vmx.exit_qualification; reason = vmexit->u.vmx.exit_reason; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); /* * VM-entry failures during or after loading guest state. * * These VM-exits are uncommon but must be handled specially * as most VM-exit fields are not populated as usual. */ if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); __asm __volatile("int $18"); return (1); } /* * VM exits that can be triggered during event delivery need to * be handled specially by re-injecting the event if the IDT * vectoring information field's valid bit is set. * * See "Information for VM Exits During Event Delivery" in Intel SDM * for details. */ idtvec_info = vmcs_idt_vectoring_info(); if (idtvec_info & VMCS_IDT_VEC_VALID) { idtvec_info &= ~(1 << 12); /* clear undefined bit */ exitintinfo = idtvec_info; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { idtvec_err = vmcs_idt_vectoring_err(); exitintinfo |= (uint64_t)idtvec_err << 32; } error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); KASSERT(error == 0, ("%s: vm_set_intinfo error %d", __func__, error)); /* * If 'virtual NMIs' are being used and the VM-exit * happened while injecting an NMI during the previous * VM-entry, then clear "blocking by NMI" in the * Guest Interruptibility-State so the NMI can be * reinjected on the subsequent VM-entry. * * However, if the NMI was being delivered through a task * gate, then the new task must start execution with NMIs * blocked so don't clear NMI blocking in this case. 
*/ intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type == VMCS_INTR_T_NMI) { if (reason != EXIT_REASON_TASK_SWITCH) vmx_clear_nmi_blocking(vmx, vcpu); else vmx_assert_nmi_blocking(vmx, vcpu); } /* * Update VM-entry instruction length if the event being * delivered was a software interrupt or software exception. */ if (intr_type == VMCS_INTR_T_SWINTR || intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || intr_type == VMCS_INTR_T_SWEXCEPTION) { vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); } } switch (reason) { case EXIT_REASON_TASK_SWITCH: ts = &vmexit->u.task_switch; ts->tsssel = qual & 0xffff; ts->reason = vmx_task_switch_reason(qual); ts->ext = 0; ts->errcode_valid = 0; vmx_paging_info(&ts->paging); /* * If the task switch was due to a CALL, JMP, IRET, software * interrupt (INT n) or software exception (INT3, INTO), * then the saved %rip references the instruction that caused * the task switch. The instruction length field in the VMCS * is valid in this case. * * In all other cases (e.g., NMI, hardware exception) the * saved %rip is one that would have been saved in the old TSS * had the task switch completed normally so the instruction * length field is not needed in this case and is explicitly * set to 0. */ if (ts->reason == TSR_IDT_GATE) { KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, ("invalid idtvec_info %#x for IDT task switch", idtvec_info)); intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type != VMCS_INTR_T_SWINTR && intr_type != VMCS_INTR_T_SWEXCEPTION && intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { /* Task switch triggered by external event */ ts->ext = 1; vmexit->inst_length = 0; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { ts->errcode_valid = 1; ts->errcode = vmcs_idt_vectoring_err(); } } } vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " "%s errcode 0x%016lx", ts->reason, ts->tsssel, ts->ext ? 
"external" : "internal", ((uint64_t)ts->errcode << 32) | ts->errcode_valid); break; case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); switch (qual & 0xf) { case 0: handled = vmx_emulate_cr0_access(vmx, vcpu, qual); break; case 4: handled = vmx_emulate_cr4_access(vmx, vcpu, qual); break; case 8: handled = vmx_emulate_cr8_access(vmx, vcpu, qual); break; } break; case EXIT_REASON_RDMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); retu = false; ecx = vmxctx->guest_rcx; VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); error = emulate_rdmsr(vmx, vcpu, ecx, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } break; case EXIT_REASON_WRMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); retu = false; eax = vmxctx->guest_rax; ecx = vmxctx->guest_rcx; edx = vmxctx->guest_rdx; VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", ecx, (uint64_t)edx << 32 | eax); SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, (uint64_t)edx << 32 | eax); error = emulate_wrmsr(vmx, vcpu, ecx, (uint64_t)edx << 32 | eax, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); if (virtual_interrupt_delivery) vmexit->u.hlt.intr_status = vmcs_read(VMCS_GUEST_INTR_STATUS); else vmexit->u.hlt.intr_status = 0; break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MTRAP; vmexit->inst_length = 0; break; case EXIT_REASON_PAUSE: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_PAUSE; break; case EXIT_REASON_INTR_WINDOW: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); vmx_clear_int_window_exiting(vmx, vcpu); return (1); case EXIT_REASON_EXT_INTR: /* * External interrupts serve only to cause VM exits and allow * the host interrupt handler to run. * * If this external interrupt triggers a virtual interrupt * to a VM, then that state will be recorded by the * host interrupt handler in the VM's softc. We will inject * this virtual interrupt during the subsequent VM enter. */ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); SDT_PROBE4(vmm, vmx, exit, interrupt, vmx, vcpu, vmexit, intr_info); /* * XXX: Ignore this exit if VMCS_INTR_VALID is not set. * This appears to be a bug in VMware Fusion? */ if (!(intr_info & VMCS_INTR_VALID)) return (1); KASSERT((intr_info & VMCS_INTR_VALID) != 0 && (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, ("VM exit interruption info invalid: %#x", intr_info)); vmx_trigger_hostintr(intr_info & 0xff); /* * This is special. 
We want to treat this as an 'handled' * VM-exit but not increment the instruction pointer. */ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); return (1); case EXIT_REASON_NMI_WINDOW: SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); /* Exit to allow the pending virtual NMI to be injected */ if (vm_nmi_pending(vmx->vm, vcpu)) vmx_inject_nmi(vmx, vcpu); vmx_clear_nmi_window_exiting(vmx, vcpu); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); return (1); case EXIT_REASON_INOUT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.bytes = (qual & 0x7) + 1; vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; vmexit->u.inout.port = (uint16_t)(qual >> 16); vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); if (vmexit->u.inout.string) { inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; vmx_paging_info(&vis->paging); vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); vis->cr0 = vmcs_read(VMCS_GUEST_CR0); vis->index = inout_str_index(vmx, vcpu, in); vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); vis->addrsize = inout_str_addrsize(inst_info); inout_str_seginfo(vmx, vcpu, inst_info, in, vis); } SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); break; case EXIT_REASON_CPUID: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); intr_vec = intr_info & 0xff; intr_type = intr_info & VMCS_INTR_T_MASK; /* * If Virtual NMIs control is 1 and the VM-exit is due to a * fault encountered during the execution of IRET then we must * restore the state of "virtual-NMI blocking" before resuming * the guest. * * See "Resuming Guest Software after Handling an Exception". * See "Information for VM Exits Due to Vectored Events". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (intr_vec != IDT_DF) && (intr_info & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); /* * The NMI has already been handled in vmx_exit_handle_nmi(). */ if (intr_type == VMCS_INTR_T_NMI) return (1); /* * Call the machine check handler by hand. Also don't reflect * the machine check back into the guest. */ if (intr_vec == IDT_MC) { VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); return (1); } if (intr_vec == IDT_PF) { error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", __func__, error)); } /* * Software exceptions exhibit trap-like behavior. This in * turn requires populating the VM-entry instruction length * so that the %rip in the trap frame is past the INT3/INTO * instruction. 
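 * Hardware exceptions and NMIs are reinjected fault-like, so no
 * instruction length is supplied for them.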
*/ if (intr_type == VMCS_INTR_T_SWEXCEPTION) vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); /* Reflect all other exceptions back into the guest */ errcode_valid = errcode = 0; if (intr_info & VMCS_INTR_DEL_ERRCODE) { errcode_valid = 1; errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); } VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " "the guest", intr_vec, errcode); SDT_PROBE5(vmm, vmx, exit, exception, vmx, vcpu, vmexit, intr_vec, errcode); error = vm_inject_exception(vmx->vm, vcpu, intr_vec, errcode_valid, errcode, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); return (1); case EXIT_REASON_EPT_FAULT: /* * If 'gpa' lies within the address space allocated to * memory then this must be a nested page fault otherwise * this must be an instruction that accesses MMIO space. */ gpa = vmcs_gpa(); if (vm_mem_allocated(vmx->vm, vcpu, gpa) || apic_access_fault(vmx, vcpu, gpa)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->inst_length = 0; vmexit->u.paging.gpa = gpa; vmexit->u.paging.fault_type = ept_fault_type(qual); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); SDT_PROBE5(vmm, vmx, exit, nestedfault, vmx, vcpu, vmexit, gpa, qual); } else if (ept_emulation_fault(qual)) { vmexit_inst_emul(vmexit, gpa, vmcs_gla()); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); SDT_PROBE4(vmm, vmx, exit, mmiofault, vmx, vcpu, vmexit, gpa); } /* * If Virtual NMIs control is 1 and the VM-exit is due to an * EPT fault during the execution of IRET then we must restore * the state of "virtual-NMI blocking" before resuming. * * See description of "NMI unblocking due to IRET" in * "Exit Qualification for EPT Violations". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (qual & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); break; case EXIT_REASON_VIRTUALIZED_EOI: vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; vmexit->u.ioapic_eoi.vector = qual & 0xFF; SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); vmexit->inst_length = 0; /* trap-like */ break; case EXIT_REASON_APIC_ACCESS: SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); handled = vmx_handle_apic_access(vmx, vcpu, vmexit); break; case EXIT_REASON_APIC_WRITE: /* * APIC-write VM exit is trap-like so the %rip is already * pointing to the next instruction. */ vmexit->inst_length = 0; vlapic = vm_lapic(vmx->vm, vcpu); SDT_PROBE4(vmm, vmx, exit, apicwrite, vmx, vcpu, vmexit, vlapic); handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); break; case EXIT_REASON_XSETBV: SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); break; case EXIT_REASON_MONITOR: SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MONITOR; break; case EXIT_REASON_MWAIT: SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MWAIT; break; default: SDT_PROBE4(vmm, vmx, exit, unknown, vmx, vcpu, vmexit, reason); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } if (handled) { /* * It is possible that control is returned to userland * even though we were able to handle the VM exit in the * kernel. * * In such a case we want to make sure that the userland * restarts guest execution at the instruction *after* * the one we just processed. Therefore we update the * guest rip in the VMCS and in 'vmexit'. 
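 * The instruction length is cleared below so that the advance is not
 * applied again when the exit is later examined in userland.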
*/ vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; vmcs_write(VMCS_GUEST_RIP, vmexit->rip); } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic VMX exit. */ vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = VM_SUCCESS; vmexit->u.vmx.inst_type = 0; vmexit->u.vmx.inst_error = 0; } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } SDT_PROBE4(vmm, vmx, exit, return, vmx, vcpu, vmexit, handled); return (handled); } static __inline void vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) { KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, ("vmx_exit_inst_error: invalid inst_fail_status %d", vmxctx->inst_fail_status)); vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = vmxctx->inst_fail_status; vmexit->u.vmx.inst_error = vmcs_instruction_error(); vmexit->u.vmx.exit_reason = ~0; vmexit->u.vmx.exit_qualification = ~0; switch (rc) { case VMX_VMRESUME_ERROR: case VMX_VMLAUNCH_ERROR: case VMX_INVEPT_ERROR: vmexit->u.vmx.inst_type = rc; break; default: panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); } } /* * If the NMI-exiting VM execution control is set to '1' then an NMI in * non-root operation causes a VM-exit. NMI blocking is in effect so it is * sufficient to simply vector to the NMI handler via a software interrupt. * However, this must be done before maskable interrupts are enabled * otherwise the "iret" issued by an interrupt handler will incorrectly * clear NMI blocking. */ static __inline void vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint32_t intr_info; KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) return; intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " "to NMI has invalid vector: %#x", intr_info)); VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); __asm __volatile("int $2"); } } static __inline void vmx_dr_enter_guest(struct vmxctx *vmxctx) { register_t rflags; /* Save host control debug registers. */ vmxctx->host_dr7 = rdr7(); vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); /* * Disable debugging in DR7 and DEBUGCTL to avoid triggering * exceptions in the host based on the guest DRx values. The * guest DR7 and DEBUGCTL are saved/restored in the VMCS. */ load_dr7(0); wrmsr(MSR_DEBUGCTLMSR, 0); /* * Disable single stepping the kernel to avoid corrupting the * guest DR6. A debugger might still be able to corrupt the * guest DR6 by setting a breakpoint after this point and then * single stepping. */ rflags = read_rflags(); vmxctx->host_tf = rflags & PSL_T; write_rflags(rflags & ~PSL_T); /* Save host debug registers. */ vmxctx->host_dr0 = rdr0(); vmxctx->host_dr1 = rdr1(); vmxctx->host_dr2 = rdr2(); vmxctx->host_dr3 = rdr3(); vmxctx->host_dr6 = rdr6(); /* Restore guest debug registers. */ load_dr0(vmxctx->guest_dr0); load_dr1(vmxctx->guest_dr1); load_dr2(vmxctx->guest_dr2); load_dr3(vmxctx->guest_dr3); load_dr6(vmxctx->guest_dr6); } static __inline void vmx_dr_leave_guest(struct vmxctx *vmxctx) { /* Save guest debug registers. 
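 * %dr7 and DEBUGCTL are saved and restored via the VMCS guest state,
 * so only %dr0-%dr3 and %dr6 need to be handled here.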
*/ vmxctx->guest_dr0 = rdr0(); vmxctx->guest_dr1 = rdr1(); vmxctx->guest_dr2 = rdr2(); vmxctx->guest_dr3 = rdr3(); vmxctx->guest_dr6 = rdr6(); /* * Restore host debug registers. Restore DR7, DEBUGCTL, and * PSL_T last. */ load_dr0(vmxctx->host_dr0); load_dr1(vmxctx->host_dr1); load_dr2(vmxctx->host_dr2); load_dr3(vmxctx->host_dr3); load_dr6(vmxctx->host_dr6); wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); load_dr7(vmxctx->host_dr7); write_rflags(read_rflags() | vmxctx->host_tf); } static int vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) { int rc, handled, launched; struct vmx *vmx; struct vm *vm; struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; struct vlapic *vlapic; uint32_t exit_reason; vmx = arg; vm = vmx->vm; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vlapic = vm_lapic(vm, vcpu); vmexit = vm_exitinfo(vm, vcpu); launched = 0; KASSERT(vmxctx->pmap == pmap, ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); vmx_msr_guest_enter(vmx, vcpu); VMPTRLD(vmcs); /* * XXX * We do this every time because we may setup the virtual machine * from a different process than the one that actually runs it. * * If the life of a virtual machine was spent entirely in the context * of a single process we could do this once in vmx_vminit(). */ vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, rip); vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); handled = UNHANDLED; /* * Interrupts are disabled from this point on until the * guest starts executing. This is done for the following * reasons: * * If an AST is asserted on this thread after the check below, * then the IPI_AST notification will not be lost, because it * will cause a VM exit due to external interrupt as soon as * the guest state is loaded. * * A posted interrupt after 'vmx_inject_interrupts()' will * not be "lost" because it will be held pending in the host * APIC because interrupts are disabled. The pending interrupt * will be recognized as soon as the guest state is loaded. * * The same reasoning applies to the IPI generated by * pmap_invalidate_ept(). */ disable_intr(); vmx_inject_interrupts(vmx, vcpu, vlapic, rip); /* * Check for vcpu suspension after injecting events because * vmx_inject_interrupts() can suspend the vcpu due to a * triple fault. 
*/ if (vcpu_suspended(evinfo)) { enable_intr(); vm_exit_suspended(vmx->vm, vcpu, rip); break; } if (vcpu_rendezvous_pending(evinfo)) { enable_intr(); vm_exit_rendezvous(vmx->vm, vcpu, rip); break; } if (vcpu_reqidle(evinfo)) { enable_intr(); vm_exit_reqidle(vmx->vm, vcpu, rip); break; } if (vcpu_should_yield(vm, vcpu)) { enable_intr(); vm_exit_astpending(vmx->vm, vcpu, rip); vmx_astpending_trace(vmx, vcpu, rip); handled = HANDLED; break; } if (vcpu_debugged(vm, vcpu)) { enable_intr(); vm_exit_debug(vmx->vm, vcpu, rip); break; } vmx_run_trace(vmx, vcpu); vmx_dr_enter_guest(vmxctx); rc = vmx_enter_guest(vmxctx, vmx, launched); vmx_dr_leave_guest(vmxctx); /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); /* Update 'nextrip' */ vmx->state[vcpu].nextrip = rip; if (rc == VMX_GUEST_VMEXIT) { vmx_exit_handle_nmi(vmx, vcpu, vmexit); enable_intr(); handled = vmx_exit_process(vmx, vcpu, vmexit); } else { enable_intr(); vmx_exit_inst_error(vmxctx, rc, vmexit); } launched = 1; vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); rip = vmexit->rip; } while (handled); /* * If a VM exit has been handled then the exitcode must be BOGUS * If a VM exit is not handled then the exitcode must not be BOGUS */ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { panic("Mismatch between handled (%d) and exitcode (%d)", handled, vmexit->exitcode); } if (!handled) vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); VMCLEAR(vmcs); vmx_msr_guest_exit(vmx, vcpu); return (0); } static void vmx_vmcleanup(void *arg) { int i; struct vmx *vmx = arg; if (apic_access_virtualization(vmx, 0)) vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); for (i = 0; i < VM_MAXCPU; i++) vpid_free(vmx->state[i].vpid); free(vmx, M_VMX); return; } static register_t * vmxctx_regptr(struct vmxctx *vmxctx, int reg) { switch (reg) { case VM_REG_GUEST_RAX: return (&vmxctx->guest_rax); case VM_REG_GUEST_RBX: return (&vmxctx->guest_rbx); case VM_REG_GUEST_RCX: return (&vmxctx->guest_rcx); case VM_REG_GUEST_RDX: return (&vmxctx->guest_rdx); case VM_REG_GUEST_RSI: return (&vmxctx->guest_rsi); case VM_REG_GUEST_RDI: return (&vmxctx->guest_rdi); case VM_REG_GUEST_RBP: return (&vmxctx->guest_rbp); case VM_REG_GUEST_R8: return (&vmxctx->guest_r8); case VM_REG_GUEST_R9: return (&vmxctx->guest_r9); case VM_REG_GUEST_R10: return (&vmxctx->guest_r10); case VM_REG_GUEST_R11: return (&vmxctx->guest_r11); case VM_REG_GUEST_R12: return (&vmxctx->guest_r12); case VM_REG_GUEST_R13: return (&vmxctx->guest_r13); case VM_REG_GUEST_R14: return (&vmxctx->guest_r14); case VM_REG_GUEST_R15: return (&vmxctx->guest_r15); case VM_REG_GUEST_CR2: return (&vmxctx->guest_cr2); case VM_REG_GUEST_DR0: return (&vmxctx->guest_dr0); case VM_REG_GUEST_DR1: return (&vmxctx->guest_dr1); case VM_REG_GUEST_DR2: return (&vmxctx->guest_dr2); case VM_REG_GUEST_DR3: return (&vmxctx->guest_dr3); case VM_REG_GUEST_DR6: return (&vmxctx->guest_dr6); default: break; } return (NULL); } static int vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *retval = *regp; return (0); } else return (EINVAL); } static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 
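/*
 * Store 'val' into the software-maintained copy of guest register 'reg'.
 * Registers kept in the VMCS instead (e.g. %rip, %rsp) are not handled
 * here; EINVAL is returned and the caller falls back to vmcs_setreg().
 */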
{ register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *regp = val; return (0); } else return (EINVAL); } static int vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) { uint64_t gi; int error; error = vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; return (error); } static int vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) { struct vmcs *vmcs; uint64_t gi; int error, ident; /* * Forcing the vcpu into an interrupt shadow is not supported. */ if (val) { error = EINVAL; goto done; } vmcs = &vmx->vmcs[vcpu]; ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); error = vmcs_getreg(vmcs, running, ident, &gi); if (error == 0) { gi &= ~HWINTR_BLOCKING; error = vmcs_setreg(vmcs, running, ident, gi); } done: VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, error ? "failed" : "succeeded"); return (error); } static int vmx_shadow_reg(int reg) { int shreg; shreg = -1; switch (reg) { case VM_REG_GUEST_CR0: shreg = VMCS_CR0_SHADOW; break; case VM_REG_GUEST_CR4: shreg = VMCS_CR4_SHADOW; break; default: break; } return (shreg); } static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) { int running, hostcpu; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) return (0); return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); } static int vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { int error, hostcpu, running, shadow; uint64_t ctls; pmap_t pmap; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) return (0); error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); if (error == 0) { /* * If the "load EFER" VM-entry control is 1 then the * value of EFER.LMA must be identical to "IA-32e mode guest" * bit in the VM-entry control. */ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && (reg == VM_REG_GUEST_EFER)) { vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); if (val & EFER_LMA) ctls |= VM_ENTRY_GUEST_LMA; else ctls &= ~VM_ENTRY_GUEST_LMA; vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); } shadow = vmx_shadow_reg(reg); if (shadow > 0) { /* * Store the unmodified value in the shadow */ error = vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(shadow), val); } if (reg == VM_REG_GUEST_CR3) { /* * Invalidate the guest vcpu's TLB mappings to emulate * the behavior of updating %cr3. * * XXX the processor retains global mappings when %cr3 * is updated but vmx_invvpid() does not. 
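 * The effect is over-invalidation, which costs TLB refills but does
 * not affect correctness.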
*/ pmap = vmx->ctx[vcpu].pmap; vmx_invvpid(vmx, vcpu, pmap, running); } } return (error); } static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_getcap(void *arg, int vcpu, int type, int *retval) { struct vmx *vmx = arg; int vcap; int ret; ret = ENOENT; vcap = vmx->cap[vcpu].set; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) ret = 0; break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) ret = 0; break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) ret = 0; break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) ret = 0; break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) ret = 0; break; default: break; } if (ret == 0) *retval = (vcap & (1 << type)) ? 1 : 0; return (ret); } static int vmx_setcap(void *arg, int vcpu, int type, int val) { struct vmx *vmx = arg; struct vmcs *vmcs = &vmx->vmcs[vcpu]; uint32_t baseval; uint32_t *pptr; int error; int flag; int reg; int retval; retval = ENOENT; pptr = NULL; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_HLT_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_MTF; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_PAUSE_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_UNRESTRICTED_GUEST; reg = VMCS_SEC_PROC_BASED_CTLS; } break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_ENABLE_INVPCID; reg = VMCS_SEC_PROC_BASED_CTLS; } break; default: break; } if (retval == 0) { if (val) { baseval |= flag; } else { baseval &= ~flag; } VMPTRLD(vmcs); error = vmwrite(reg, baseval); VMCLEAR(vmcs); if (error) { retval = error; } else { /* * Update optional stored flags, and record * setting */ if (pptr != NULL) { *pptr = baseval; } if (val) { vmx->cap[vcpu].set |= (1 << type); } else { vmx->cap[vcpu].set &= ~(1 << type); } } } return (retval); } struct vlapic_vtx { struct vlapic vlapic; struct pir_desc *pir_desc; struct vmx *vmx; }; #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ do { \ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ level ? "level" : "edge", vector); \ VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? 
"yes" : "no");\ } while (0) /* * vlapic->ops handlers that utilize the APICv hardware assist described in * Chapter 29 of the Intel SDM. */ static int vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; uint64_t mask; int idx, notify; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; /* * Keep track of interrupt requests in the PIR descriptor. This is * because the virtual APIC page pointed to by the VMCS cannot be * modified if the vcpu is running. */ idx = vector / 64; mask = 1UL << (vector % 64); atomic_set_long(&pir_desc->pir[idx], mask); notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, level, "vmx_set_intr_ready"); return (notify); } static int vmx_pending_intr(struct vlapic *vlapic, int *vecptr) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t pending, pirval; uint32_t ppr, vpr; int i; /* * This function is only expected to be called from the 'HLT' exit * handler which does not care about the vector that is pending. */ KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; pending = atomic_load_acq_long(&pir_desc->pending); if (!pending) { /* * While a virtual interrupt may have already been * processed the actual delivery maybe pending the * interruptibility of the guest. Recognize a pending * interrupt by reevaluating virtual interrupts * following Section 29.2.1 in the Intel SDM Volume 3. */ struct vm_exit *vmexit; uint8_t rvi, ppr; vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, ("vmx_pending_intr: exitcode not 'HLT'")); rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; lapic = vlapic->apic_page; ppr = lapic->ppr & APIC_TPR_INT; if (rvi > ppr) { return (1); } return (0); } /* * If there is an interrupt pending then it will be recognized only * if its priority is greater than the processor priority. * * Special case: if the processor priority is zero then any pending * interrupt will be recognized. 
*/ lapic = vlapic->apic_page; ppr = lapic->ppr & APIC_TPR_INT; if (ppr == 0) return (1); VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", lapic->ppr); for (i = 3; i >= 0; i--) { pirval = pir_desc->pir[i]; if (pirval != 0) { vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; return (vpr > ppr); } } return (0); } static void vmx_intr_accepted(struct vlapic *vlapic, int vector) { panic("vmx_intr_accepted: not expected to be called"); } static void vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct vmx *vmx; struct vmcs *vmcs; uint64_t mask, val; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), ("vmx_set_tmr: vcpu cannot be running")); vlapic_vtx = (struct vlapic_vtx *)vlapic; vmx = vlapic_vtx->vmx; vmcs = &vmx->vmcs[vlapic->vcpuid]; mask = 1UL << (vector % 64); VMPTRLD(vmcs); val = vmcs_read(VMCS_EOI_EXIT(vector)); if (level) val |= mask; else val &= ~mask; vmcs_write(VMCS_EOI_EXIT(vector), val); VMCLEAR(vmcs); } static void vmx_enable_x2apic_mode(struct vlapic *vlapic) { struct vmx *vmx; struct vmcs *vmcs; uint32_t proc_ctls2; int vcpuid, error; vcpuid = vlapic->vcpuid; vmx = ((struct vlapic_vtx *)vlapic)->vmx; vmcs = &vmx->vmcs[vcpuid]; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; VMPTRLD(vmcs); vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); VMCLEAR(vmcs); if (vlapic->vcpuid == 0) { /* * The nested page table mappings are shared by all vcpus * so unmap the APIC access page just once. */ error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", __func__, error)); /* * The MSR bitmap is shared by all vcpus so modify it only * once in the context of vcpu 0. */ error = vmx_allow_x2apic_msrs(vmx); KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", __func__, error)); } } static void vmx_post_intr(struct vlapic *vlapic, int hostcpu) { ipi_cpu(hostcpu, pirvec); } /* * Transfer the pending interrupts in the PIR descriptor to the IRR * in the virtual APIC page. 
*/ static void vmx_inject_pir(struct vlapic *vlapic) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t val, pirval; int rvi, pirbase = -1; uint16_t intr_status_old, intr_status_new; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "no posted interrupt pending"); return; } pirval = 0; pirbase = -1; lapic = vlapic->apic_page; val = atomic_readandclear_long(&pir_desc->pir[0]); if (val != 0) { lapic->irr0 |= val; lapic->irr1 |= val >> 32; pirbase = 0; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[1]); if (val != 0) { lapic->irr2 |= val; lapic->irr3 |= val >> 32; pirbase = 64; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[2]); if (val != 0) { lapic->irr4 |= val; lapic->irr5 |= val >> 32; pirbase = 128; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[3]); if (val != 0) { lapic->irr6 |= val; lapic->irr7 |= val >> 32; pirbase = 192; pirval = val; } VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); /* * Update RVI so the processor can evaluate pending virtual * interrupts on VM-entry. * * It is possible for pirval to be 0 here, even though the * pending bit has been set. The scenario is: * CPU-Y is sending a posted interrupt to CPU-X, which * is running a guest and processing posted interrupts in h/w. * CPU-X will eventually exit and the state seen in s/w is * the pending bit set, but no PIR bits set. * * CPU-X CPU-Y * (vm running) (host running) * rx posted interrupt * CLEAR pending bit * SET PIR bit * READ/CLEAR PIR bits * SET pending bit * (vm exit) * pending bit set, PIR 0 */ if (pirval != 0) { rvi = pirbase + flsl(pirval) - 1; intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); intr_status_new = (intr_status_old & 0xFF00) | rvi; if (intr_status_new > intr_status_old) { vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "guest_intr_status changed from 0x%04x to 0x%04x", intr_status_old, intr_status_new); } } } static struct vlapic * vmx_vlapic_init(void *arg, int vcpuid) { struct vmx *vmx; struct vlapic *vlapic; struct vlapic_vtx *vlapic_vtx; vmx = arg; vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vmx->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; vlapic_vtx = (struct vlapic_vtx *)vlapic; vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; vlapic_vtx->vmx = vmx; if (virtual_interrupt_delivery) { vlapic->ops.set_intr_ready = vmx_set_intr_ready; vlapic->ops.pending_intr = vmx_pending_intr; vlapic->ops.intr_accepted = vmx_intr_accepted; vlapic->ops.set_tmr = vmx_set_tmr; vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; } if (posted_interrupts) vlapic->ops.post_intr = vmx_post_intr; vlapic_init(vlapic); return (vlapic); } static void vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_VLAPIC); } struct vmm_ops vmm_ops_intel = { vmx_init, vmx_cleanup, vmx_restore, vmx_vminit, vmx_run, vmx_vmcleanup, vmx_getreg, vmx_setreg, vmx_getdesc, vmx_setdesc, vmx_getcap, vmx_setcap, ept_vmspace_alloc, ept_vmspace_free, vmx_vlapic_init, vmx_vlapic_cleanup, }; Index: head/sys/amd64/vmm/intel/vmx_support.S =================================================================== --- head/sys/amd64/vmm/intel/vmx_support.S (revision 338067) +++ head/sys/amd64/vmm/intel/vmx_support.S (revision 338068) @@ -1,366 +1,332 @@ 
/*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include "vmx_assym.h" #ifdef SMP #define LK lock ; #else #define LK #endif /* Be friendly to DTrace FBT's prologue/epilogue pattern matching */ #define VENTER push %rbp ; mov %rsp,%rbp #define VLEAVE pop %rbp /* * Save the guest context. */ #define VMX_GUEST_SAVE \ movq %rdi,VMXCTX_GUEST_RDI(%rsp); \ movq %rsi,VMXCTX_GUEST_RSI(%rsp); \ movq %rdx,VMXCTX_GUEST_RDX(%rsp); \ movq %rcx,VMXCTX_GUEST_RCX(%rsp); \ movq %r8,VMXCTX_GUEST_R8(%rsp); \ movq %r9,VMXCTX_GUEST_R9(%rsp); \ movq %rax,VMXCTX_GUEST_RAX(%rsp); \ movq %rbx,VMXCTX_GUEST_RBX(%rsp); \ movq %rbp,VMXCTX_GUEST_RBP(%rsp); \ movq %r10,VMXCTX_GUEST_R10(%rsp); \ movq %r11,VMXCTX_GUEST_R11(%rsp); \ movq %r12,VMXCTX_GUEST_R12(%rsp); \ movq %r13,VMXCTX_GUEST_R13(%rsp); \ movq %r14,VMXCTX_GUEST_R14(%rsp); \ movq %r15,VMXCTX_GUEST_R15(%rsp); \ movq %cr2,%rdi; \ movq %rdi,VMXCTX_GUEST_CR2(%rsp); \ movq %rsp,%rdi; /* * Assumes that %rdi holds a pointer to the 'vmxctx'. * * On "return" all registers are updated to reflect guest state. The two * exceptions are %rip and %rsp. These registers are atomically switched * by hardware from the guest area of the vmcs. * * We modify %rsp to point to the 'vmxctx' so we can use it to restore * host context in case of an error with 'vmlaunch' or 'vmresume'. */ #define VMX_GUEST_RESTORE \ movq %rdi,%rsp; \ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ movq %rsi,%cr2; \ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ movq VMXCTX_GUEST_R8(%rdi),%r8; \ movq VMXCTX_GUEST_R9(%rdi),%r9; \ movq VMXCTX_GUEST_RAX(%rdi),%rax; \ movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ movq VMXCTX_GUEST_R10(%rdi),%r10; \ movq VMXCTX_GUEST_R11(%rdi),%r11; \ movq VMXCTX_GUEST_R12(%rdi),%r12; \ movq VMXCTX_GUEST_R13(%rdi),%r13; \ movq VMXCTX_GUEST_R14(%rdi),%r14; \ movq VMXCTX_GUEST_R15(%rdi),%r15; \ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ /* * Clobber the remaining registers with guest contents so they can't * be misused. 
 */
#define	VMX_GUEST_CLOBBER					\
	xor	%rax, %rax;					\
	xor	%rcx, %rcx;					\
	xor	%rdx, %rdx;					\
	xor	%rsi, %rsi;					\
	xor	%r8, %r8;					\
	xor	%r9, %r9;					\
	xor	%r10, %r10;					\
	xor	%r11, %r11;

/*
 * Save and restore the host context.
 *
 * Assumes that %rdi holds a pointer to the 'vmxctx'.
 */
#define	VMX_HOST_SAVE						\
	movq	%r15, VMXCTX_HOST_R15(%rdi);			\
	movq	%r14, VMXCTX_HOST_R14(%rdi);			\
	movq	%r13, VMXCTX_HOST_R13(%rdi);			\
	movq	%r12, VMXCTX_HOST_R12(%rdi);			\
	movq	%rbp, VMXCTX_HOST_RBP(%rdi);			\
	movq	%rsp, VMXCTX_HOST_RSP(%rdi);			\
	movq	%rbx, VMXCTX_HOST_RBX(%rdi);			\

#define	VMX_HOST_RESTORE					\
	movq	VMXCTX_HOST_R15(%rdi), %r15;			\
	movq	VMXCTX_HOST_R14(%rdi), %r14;			\
	movq	VMXCTX_HOST_R13(%rdi), %r13;			\
	movq	VMXCTX_HOST_R12(%rdi), %r12;			\
	movq	VMXCTX_HOST_RBP(%rdi), %rbp;			\
	movq	VMXCTX_HOST_RSP(%rdi), %rsp;			\
	movq	VMXCTX_HOST_RBX(%rdi), %rbx;			\

/*
 * vmx_enter_guest(struct vmxctx *vmxctx, int launched)
 * %rdi: pointer to the 'vmxctx'
 * %rsi: pointer to the 'vmx'
 * %edx: launch state of the VMCS
 * Interrupts must be disabled on entry.
 */
ENTRY(vmx_enter_guest)
	VENTER
	/*
	 * Save host state before doing anything else.
	 */
	VMX_HOST_SAVE

	/*
	 * Activate guest pmap on this cpu.
	 */
	movq	VMXCTX_PMAP(%rdi), %r11
	movl	PCPU(CPUID), %eax
	LK btsl	%eax, PM_ACTIVE(%r11)

	/*
	 * If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen'
	 * then we must invalidate all mappings associated with this EPTP.
	 */
	movq	PM_EPTGEN(%r11), %r10
	cmpq	%r10, VMX_EPTGEN(%rsi, %rax, 8)
	je	guest_restore

	/* Refresh 'vmx->eptgen[curcpu]' */
	movq	%r10, VMX_EPTGEN(%rsi, %rax, 8)

	/* Setup the invept descriptor on the host stack */
	mov	%rsp, %r11
	movq	VMX_EPTP(%rsi), %rax
	movq	%rax, -16(%r11)
	movq	$0x0, -8(%r11)
	mov	$0x1, %eax		/* Single context invalidate */
	invept	-16(%r11), %rax
	jbe	invept_error		/* Check invept instruction error */

guest_restore:
-
-	/*
-	 * Flush L1D cache if requested.  Use IA32_FLUSH_CMD MSR if available,
-	 * otherwise load enough of the data from the zero_region to flush
-	 * existing L1D content.
-	 */
-#define	L1D_FLUSH_SIZE	(64 * 1024)
	movl	%edx, %r8d
-	cmpb	$0, guest_l1d_flush(%rip)
+	cmpb	$0, guest_l1d_flush_sw(%rip)
	je	after_l1d
-	movq	vmx_msr_flush_cmd(%rip), %rax
-	testq	%rax, %rax
-	jz	1f
-	movq	%rax, %rdx
-	shrq	$32, %rdx
-	movl	$MSR_IA32_FLUSH_CMD, %ecx
-	wrmsr
-	jmp	after_l1d
-1:	movq	$KERNBASE, %r9
-	movq	$-L1D_FLUSH_SIZE, %rcx
-	/*
-	 * pass 1: Preload TLB.
-	 * Kernel text is mapped using superpages.  TLB preload is
-	 * done for the benefit of older CPUs which split 2M page
-	 * into 4k TLB entries.
-	 */
-2:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
-	addq	$PAGE_SIZE, %rcx
-	jne	2b
-	xorl	%eax, %eax
-	cpuid
-	movq	$-L1D_FLUSH_SIZE, %rcx
-	/* pass 2: Read each cache line */
-3:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
-	addq	$64, %rcx
-	jne	3b
-	lfence
-#undef	L1D_FLUSH_SIZE
+	call	flush_l1d_sw
after_l1d:
	cmpl	$0, %r8d
	je	do_launch

	VMX_GUEST_RESTORE
	vmresume
	/*
	 * In the common case 'vmresume' returns back to the host through
	 * 'vmx_exit_guest' with %rsp pointing to 'vmxctx'.
	 *
	 * If there is an error we return VMX_VMRESUME_ERROR to the caller.
	 */
	movq	%rsp, %rdi		/* point %rdi back to 'vmxctx' */
	movl	$VMX_VMRESUME_ERROR, %eax
	jmp	decode_inst_error

do_launch:
	VMX_GUEST_RESTORE
	vmlaunch
	/*
	 * In the common case 'vmlaunch' returns back to the host through
	 * 'vmx_exit_guest' with %rsp pointing to 'vmxctx'.
	 *
	 * If there is an error we return VMX_VMLAUNCH_ERROR to the caller.
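Back in the guest_restore path above, this revision stops open-coding the L1D cache flush: the MSR-based IA32_FLUSH_CMD write disappears from this file and the remaining software fallback is reached through the new flush_l1d_sw helper, now gated by guest_l1d_flush_sw rather than guest_l1d_flush. The removed instructions show the idea; rendered here as an illustrative C sketch (the buffer argument stands in for the always-mapped kernel region the assembly read, and the real sequence additionally serializes with cpuid between the passes and ends with lfence):

#include <stddef.h>
#include <stdint.h>

#define	L1D_FLUSH_SPAN	(64 * 1024)	/* read enough data to displace L1D */
#define	PAGE_SPAN	4096
#define	LINE_SPAN	64

/*
 * Illustrative C version of the software L1D flush that the removed
 * assembly performed and that flush_l1d_sw now centralizes.
 */
static void
l1d_flush_sw_sketch(const volatile uint8_t *buf)
{
	size_t i;
	volatile uint8_t sink;

	/* Pass 1: touch one byte per page so the TLB is warm. */
	for (i = 0; i < L1D_FLUSH_SPAN; i += PAGE_SPAN)
		sink = buf[i];
	/* Pass 2: read every cache line to displace existing L1D contents. */
	for (i = 0; i < L1D_FLUSH_SPAN; i += LINE_SPAN)
		sink = buf[i];
	(void)sink;
}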
*/ movq %rsp, %rdi /* point %rdi back to 'vmxctx' */ movl $VMX_VMLAUNCH_ERROR, %eax jmp decode_inst_error invept_error: movl $VMX_INVEPT_ERROR, %eax jmp decode_inst_error decode_inst_error: movl $VM_FAIL_VALID, %r11d jz inst_error movl $VM_FAIL_INVALID, %r11d inst_error: movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi) /* * The return value is already populated in %eax so we cannot use * it as a scratch register beyond this point. */ /* * Deactivate guest pmap from this cpu. */ movq VMXCTX_PMAP(%rdi), %r11 movl PCPU(CPUID), %r10d LK btrl %r10d, PM_ACTIVE(%r11) VMX_HOST_RESTORE VLEAVE ret /* * Non-error VM-exit from the guest. Make this a label so it can * be used by C code when setting up the VMCS. * The VMCS-restored %rsp points to the struct vmxctx */ ALIGN_TEXT .globl vmx_exit_guest_flush_rsb vmx_exit_guest_flush_rsb: /* * Save guest state that is not automatically saved in the vmcs. */ VMX_GUEST_SAVE /* * Deactivate guest pmap from this cpu. */ movq VMXCTX_PMAP(%rdi), %r11 movl PCPU(CPUID), %r10d LK btrl %r10d, PM_ACTIVE(%r11) VMX_HOST_RESTORE VMX_GUEST_CLOBBER /* * To prevent malicious branch target predictions from * affecting the host, overwrite all entries in the RSB upon * exiting a guest. */ mov $16, %ecx /* 16 iterations, two calls per loop */ mov %rsp, %rax 0: call 2f /* create an RSB entry. */ 1: pause call 1b /* capture rogue speculation. */ 2: call 2f /* create an RSB entry. */ 1: pause call 1b /* capture rogue speculation. */ 2: sub $1, %ecx jnz 0b mov %rax, %rsp /* * This will return to the caller of 'vmx_enter_guest()' with a return * value of VMX_GUEST_VMEXIT. */ movl $VMX_GUEST_VMEXIT, %eax VLEAVE ret .globl vmx_exit_guest vmx_exit_guest: /* * Save guest state that is not automatically saved in the vmcs. */ VMX_GUEST_SAVE /* * Deactivate guest pmap from this cpu. */ movq VMXCTX_PMAP(%rdi), %r11 movl PCPU(CPUID), %r10d LK btrl %r10d, PM_ACTIVE(%r11) VMX_HOST_RESTORE VMX_GUEST_CLOBBER /* * This will return to the caller of 'vmx_enter_guest()' with a return * value of VMX_GUEST_VMEXIT. */ movl $VMX_GUEST_VMEXIT, %eax VLEAVE ret END(vmx_enter_guest) /* * %rdi = interrupt handler entry point * * Calling sequence described in the "Instruction Set Reference" for the "INT" * instruction in Intel SDM, Vol 2. */ ENTRY(vmx_call_isr) VENTER mov %rsp, %r11 /* save %rsp */ and $~0xf, %rsp /* align on 16-byte boundary */ pushq $KERNEL_SS /* %ss */ pushq %r11 /* %rsp */ pushfq /* %rflags */ pushq $KERNEL_CS /* %cs */ cli /* disable interrupts */ callq *%rdi /* push %rip and call isr */ VLEAVE ret END(vmx_call_isr)
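For reference, the EPT generation test that vmx_enter_guest() performs before guest_restore is easier to read in C: compare the pmap's pm_eptgen against the per-cpu value cached in vmx->eptgen[curcpu], and only when they differ refresh the cache and issue a single-context INVEPT for the EPTP. A sketch with hypothetical stand-in types:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-ins for the fields the assembly touches. */
struct pmap_sketch {
	long	pm_eptgen;	/* bumped whenever the EPT mappings change */
};

struct vmx_sketch {
	uint64_t eptp;		/* EPT pointer loaded into the VMCS */
	long	 eptgen[64];	/* per-cpu generation last seen on entry */
};

/* Returns true when a single-context INVEPT must be issued on 'cpu'. */
static bool
ept_needs_invalidation(struct vmx_sketch *vmx, struct pmap_sketch *pmap, int cpu)
{
	if (vmx->eptgen[cpu] == pmap->pm_eptgen)
		return (false);			/* mappings unchanged */
	vmx->eptgen[cpu] = pmap->pm_eptgen;	/* refresh before invalidating */
	return (true);
}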