Index: sys/amd64/vmm/amd/svm.c
===================================================================
--- sys/amd64/vmm/amd/svm.c
+++ sys/amd64/vmm/amd/svm.c
@@ -82,6 +82,11 @@
 #define	AMD_CPUID_SVM_PAUSE_FTH		BIT(12)	/* Pause filter threshold */
 #define	AMD_CPUID_SVM_AVIC		BIT(13)	/* AVIC present */
 
+/*
+ * Bitmap for all exceptions excluding unimplemented vectors 2 and 9.
+ */
+#define	ALL_EXCEPTIONS_BITMAP		0xFFFFFDFB
+
 #define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID	|	\
 				VMCB_CACHE_IOPM		|	\
 				VMCB_CACHE_I		|	\
@@ -126,6 +131,12 @@
 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
+static VMM_STAT_AMD(VMEXIT_EXCEPTION_DURING_IRET, "VM exits due to exceptions "
+    "during iret");
+static VMM_STAT_AMD(NMI_SPECULATIVE_UNBLOCKING, "Number of times vNMI "
+    "unblocked speculatively");
+static VMM_STAT_AMD(NMI_PRECISE_UNBLOCKING, "Number of times vNMI "
+    "unblocked precisely");
 
 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
 
@@ -401,6 +412,22 @@
 }
 
 static void
+set_exception_bitmap(struct svm_softc *sc, int vcpu, uint32_t newval)
+{
+	struct vmcb_ctrl *ctrl;
+	uint32_t oldval;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	oldval = ctrl->intercept[VMCB_EXC_INTCPT];
+	if (newval != oldval) {
+		ctrl->intercept[VMCB_EXC_INTCPT] = newval;
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
+		VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
+		    "from %#x to %#x", VMCB_EXC_INTCPT, oldval, newval);
+	}
+}
+
+static void
 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
     uint64_t msrpm_base_pa, uint64_t np_pml4)
 {
@@ -436,19 +463,11 @@
 	 * Intercept everything when tracing guest exceptions otherwise
 	 * just intercept machine check exception.
 	 */
-	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
-		for (n = 0; n < 32; n++) {
-			/*
-			 * Skip unimplemented vectors in the exception bitmap.
-			 */
-			if (n == 2 || n == 9) {
-				continue;
-			}
-			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
-		}
-	} else {
-		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
-	}
+	if (vcpu_trace_exceptions(sc->vm, vcpu))
+		mask = ALL_EXCEPTIONS_BITMAP;
+	else
+		mask = BIT(IDT_MC);
+	set_exception_bitmap(sc, vcpu, mask);
 
 	/* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
@@ -1027,48 +1046,37 @@
 	return (0);
 }
 
-/*
- * Once an NMI is injected it blocks delivery of further NMIs until the handler
- * executes an IRET. The IRET intercept is enabled when an NMI is injected to
- * to track when the vcpu is done handling the NMI.
- */
-static int
-nmi_blocked(struct svm_softc *sc, int vcpu)
-{
-	int blocked;
-
-	blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
-	    VMCB_INTCPT_IRET);
-	return (blocked);
-}
-
 static void
-enable_nmi_blocking(struct svm_softc *sc, int vcpu)
+nmi_enable_iret_intercept(struct svm_softc *sc, int vcpu)
 {
+	struct svm_vcpu *vcpustate;
 
-	KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked"));
-	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled");
+	vcpustate = svm_get_vcpu(sc, vcpu);
+	KASSERT(!vcpustate->nmi.blocking, ("invalid vNMI blocking state %d",
+	    vcpustate->nmi.blocking));
+
+	vcpustate->nmi.blocking = NMI_IRET_INTERCEPT;
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
+	VCPU_CTR0(sc->vm, vcpu, "vNMI iret intercept enabled");
 }
 
 static void
-clear_nmi_blocking(struct svm_softc *sc, int vcpu)
+nmi_enable_iret_tracing(struct svm_softc *sc, int vcpu)
 {
+	struct svm_vcpu *vcpustate;
+	struct vmcb_state *state;
+	struct vmcb_ctrl *ctrl;
+	struct vmcb *vmcb;
 	int error;
 
-	KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
-	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
-	/*
-	 * When the IRET intercept is cleared the vcpu will attempt to execute
-	 * the "iret" when it runs next. However, it is possible to inject
-	 * another NMI into the vcpu before the "iret" has actually executed.
-	 *
-	 * For e.g. if the "iret" encounters a #NPF when accessing the stack
-	 * it will trap back into the hypervisor. If an NMI is pending for
-	 * the vcpu it will be injected into the guest.
-	 *
-	 * XXX this needs to be fixed
-	 */
+	vcpustate = svm_get_vcpu(sc, vcpu);
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	vmcb = svm_get_vmcb(sc, vcpu);
+	state = &vmcb->state;
+
+	KASSERT(vcpustate->nmi.blocking == NMI_IRET_INTERCEPT,
+	    ("invalid vNMI blocking state %d", vcpustate->nmi.blocking));
+
 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
 
 	/*
@@ -1077,6 +1085,77 @@
 	 */
 	error = svm_modify_intr_shadow(sc, vcpu, 1);
 	KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error));
+
+	/*
+	 * XXX
+	 * Single stepping using the trap flag does not work across a task
+	 * switch, so we unblock vNMIs right here. A vNMI can be prematurely
+	 * injected into the vcpu if a #VMEXIT is triggered before the "iret"
+	 * can finish execution (e.g. #NPF).
+	 */
+	if (state->rflags & PSL_NT) {
+		vcpustate->nmi.blocking = 0;
+		vmm_stat_incr(sc->vm, vcpu, NMI_SPECULATIVE_UNBLOCKING, 1);
+		VCPU_CTR0(sc->vm, vcpu, "vNMI unblocked speculatively");
+		return;
+	}
+
+	/*
+	 * Single step "iret" which can trigger a #VMEXIT for the following
+	 * reasons:
+	 *
+	 * 1. The "iret" executes successfully, in which case the single step
+	 *    will trigger a VMEXIT_EXCEPTION (IDT_DB).
+	 * 2. The "iret" triggers an exception which in turn will cause a
+	 *    VMEXIT_EXCEPTION (IDT_GP, IDT_PF, IDT_SS, etc.).
+	 * 3. A #VMEXIT is triggered for reasons unrelated to the "iret".
+	 *    For e.g. nested page fault, hardware interrupt or NMI.
+	 *
+	 * Per the section "Handling Multiple NMIs" in the Intel SDM,
+	 * cases (1) and (2) will unblock vNMIs.
+	 */
+	vcpustate->nmi.blocking = NMI_IRET_TRACING;
+	vcpustate->nmi.rflags = state->rflags;
+	state->rflags |= PSL_RF | PSL_T;
+	vcpustate->nmi.exception_bitmap = ctrl->intercept[VMCB_EXC_INTCPT];
+	set_exception_bitmap(sc, vcpu, ALL_EXCEPTIONS_BITMAP);
+
+	VCPU_CTR4(sc->vm, vcpu, "vNMI iret tracing enabled: "
+	    "rflags (%#lx/%#lx) exception_bitmap (%#08x/%#08x)",
+	    vcpustate->nmi.rflags, state->rflags,
+	    vcpustate->nmi.exception_bitmap, ALL_EXCEPTIONS_BITMAP);
+}
+
+static void
+nmi_unblock(struct svm_softc *sc, int vcpu, bool restore_rflags)
+{
+	struct svm_vcpu *vcpustate;
+	struct vmcb_state *state;
+	struct vmcb *vmcb;
+
+	vcpustate = svm_get_vcpu(sc, vcpu);
+	vmcb = svm_get_vmcb(sc, vcpu);
+	state = &vmcb->state;
+
+	KASSERT(vcpustate->nmi.blocking == NMI_IRET_TRACING,
+	    ("invalid vNMI blocking state %d", vcpustate->nmi.blocking));
+
+	/*
+	 * If the "iret" execution triggered an exception then restore the
+	 * PSL_RF and PSL_T bits in %rflags before injecting the exception
+	 * into the guest.
+	 *
+	 * If the "iret" instruction completes successfully then %rflags has
+	 * already been restored from the NMI stack.
+	 */
+	if (restore_rflags) {
+		state->rflags &= ~(PSL_RF | PSL_T);
+		state->rflags |= (vcpustate->nmi.rflags & (PSL_RF | PSL_T));
+	}
+	set_exception_bitmap(sc, vcpu, vcpustate->nmi.exception_bitmap);
+	vcpustate->nmi.blocking = 0;
+	vmm_stat_incr(sc->vm, vcpu, NMI_PRECISE_UNBLOCKING, 1);
+	VCPU_CTR0(sc->vm, vcpu, "vNMIs unblocked precisely");
 }
 
 static int
@@ -1206,6 +1285,7 @@
 {
 	struct vmcb *vmcb;
 	struct vmcb_state *state;
+	struct svm_vcpu *vcpustate;
 	struct vmcb_ctrl *ctrl;
 	struct svm_regctx *ctx;
 	uint64_t code, info1, info2, val;
@@ -1214,6 +1294,7 @@
 	bool retu;
 
 	ctx = svm_get_guest_regctx(svm_sc, vcpu);
+	vcpustate = svm_get_vcpu(svm_sc, vcpu);
 	vmcb = svm_get_vmcb(svm_sc, vcpu);
 	state = &vmcb->state;
 	ctrl = &vmcb->ctrl;
@@ -1255,7 +1336,7 @@
 		 * Restart execution at "iret" but with the intercept cleared.
 		 */
 		vmexit->inst_length = 0;
-		clear_nmi_blocking(svm_sc, vcpu);
+		nmi_enable_iret_tracing(svm_sc, vcpu);
 		handled = 1;
 		break;
 	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
@@ -1273,6 +1354,25 @@
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
 		reflect = 1;
 		idtvec = code - 0x40;
+		if (vcpustate->nmi.blocking == NMI_IRET_TRACING) {
+			if (idtvec == IDT_DB) {
+				/* Don't reflect #DB into the guest */
+				reflect = 0;
+
+				/*
+				 * APMv2 Section 15.2.2 #DB (Debug):
+				 * The value saved for DR6 and DR7 matches
+				 * what would be visible to a #DB handler.
+				 */
+				KASSERT((state->dr6 & (1 << 14)) != 0,
+				    ("DR6.BS not set (%#lx)", state->dr6));
+			} else {
+				vmm_stat_incr(svm_sc->vm, vcpu,
+				    VMEXIT_EXCEPTION_DURING_IRET, 1);
+			}
+			nmi_unblock(svm_sc, vcpu, idtvec != IDT_DB);
+		}
+
 		switch (idtvec) {
 		case IDT_MC:
 			/*
@@ -1511,13 +1611,14 @@
 
 	/* NMI event has priority over interrupts. */
 	if (vm_nmi_pending(sc->vm, vcpu)) {
-		if (nmi_blocked(sc, vcpu)) {
+		if (vcpustate->nmi.blocking) {
 			/*
 			 * Can't inject another NMI if the guest has not
 			 * yet executed an "iret" after the last NMI.
 			 */
-			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due "
-			    "to NMI-blocking");
+			VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to %s",
+			    vcpustate->nmi.blocking == NMI_IRET_INTERCEPT ?
+			    "iret intercept" : "iret tracing");
 		} else if (ctrl->intr_shadow) {
 			/*
 			 * Can't inject an NMI if the vcpu is in an intr_shadow.
@@ -1553,7 +1654,7 @@
 		    IDT_NMI, 0, false);
 
 		/* virtual NMI blocking is now in effect */
-		enable_nmi_blocking(sc, vcpu);
+		nmi_enable_iret_intercept(sc, vcpu);
 
 		VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI");
 	}
@@ -1688,6 +1789,18 @@
 	} else {
 		disable_intr_window_exiting(sc, vcpu);
 	}
+
+#ifdef INVARIANTS
+	if (vcpustate->nmi.blocking == NMI_IRET_TRACING) {
+		KASSERT((state->rflags & (PSL_RF | PSL_T)) == (PSL_RF | PSL_T),
+		    ("invalid rflags value during iret tracing (%#lx)",
+		    state->rflags));
+		KASSERT(ctrl->intr_shadow, ("vcpu must be in interrupt "
+		    "shadow during iret tracing"));
+		KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
+		    ("event injection not expected during iret tracing"));
+	}
+#endif
 }
 
 static __inline void
Index: sys/amd64/vmm/amd/svm_softc.h
===================================================================
--- sys/amd64/vmm/amd/svm_softc.h
+++ sys/amd64/vmm/amd/svm_softc.h
@@ -37,6 +37,12 @@
 	uint32_t	num;	/* range is [1, nasid - 1] */
 };
 
+enum nmi_blocking {
+	NMI_UNBLOCKED = 0,
+	NMI_IRET_INTERCEPT,	/* iret intercept is enabled */
+	NMI_IRET_TRACING,	/* iret tracing is enabled */
+};
+
 /*
  * XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space
  * due to VMCB alignment requirements.
@@ -50,6 +56,11 @@
 	uint32_t	dirty;		/* state cache bits that must be cleared */
 	long		eptgen;		/* pmap->pm_eptgen when the vcpu last ran */
 	struct asid	asid;
+	struct {
+		enum nmi_blocking	blocking;
+		uint32_t		exception_bitmap;
+		uint64_t		rflags;
+	} nmi;
 } __aligned(PAGE_SIZE);
 
 /*
Index: sys/amd64/vmm/vmm.c
===================================================================
--- sys/amd64/vmm/vmm.c
+++ sys/amd64/vmm/vmm.c
@@ -1539,6 +1539,9 @@
 	if (error == 0 && retu == false)
 		goto restart;
 
+	VCPU_CTR2(vm, vcpuid, "returning from vm_run with "
+	    "error %d and exitcode %d", error, vme->exitcode);
+
 	/* copy the exit information */
 	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
 	return (error);
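
For review context, below is a minimal standalone C sketch (not part of the patch) that models the vNMI blocking state machine the hunks above introduce. The enum nmi_blocking values mirror svm_softc.h; the event names and the nmi_step()/main() helpers are hypothetical illustrations and assume the transitions match the svm.c changes: injecting a vNMI enables the iret intercept, the VMCB_EXIT_IRET handler either unblocks speculatively (PSL_NT set) or switches to iret tracing, and a traced #DB or any other exception during the "iret" unblocks precisely.

/*
 * Standalone model of the vNMI blocking states added in svm_softc.h.
 * The event names and helpers below are illustrative only.
 */
#include <stdio.h>

enum nmi_blocking {
	NMI_UNBLOCKED = 0,
	NMI_IRET_INTERCEPT,	/* iret intercept is enabled */
	NMI_IRET_TRACING,	/* iret tracing is enabled */
};

/* Hypothetical events corresponding to the transitions in the patch. */
enum nmi_event {
	EV_NMI_INJECTED,	 /* vNMI injected; iret intercept enabled */
	EV_IRET_INTERCEPTED,	 /* VMCB_EXIT_IRET; switch to single-step tracing */
	EV_IRET_INTERCEPTED_NT,	 /* VMCB_EXIT_IRET with PSL_NT set; speculative unblock */
	EV_DB_TRACED,		 /* #DB after "iret" retired; precise unblock */
	EV_EXCEPTION_DURING_IRET /* other exception while tracing; precise unblock */
};

static enum nmi_blocking
nmi_step(enum nmi_blocking cur, enum nmi_event ev)
{
	switch (cur) {
	case NMI_UNBLOCKED:
		/* A new NMI can only be injected while unblocked. */
		return (ev == EV_NMI_INJECTED ? NMI_IRET_INTERCEPT : cur);
	case NMI_IRET_INTERCEPT:
		if (ev == EV_IRET_INTERCEPTED)
			return (NMI_IRET_TRACING);
		if (ev == EV_IRET_INTERCEPTED_NT)
			return (NMI_UNBLOCKED);	/* speculative unblocking */
		return (cur);
	case NMI_IRET_TRACING:
		if (ev == EV_DB_TRACED || ev == EV_EXCEPTION_DURING_IRET)
			return (NMI_UNBLOCKED);	/* precise unblocking */
		return (cur);
	}
	return (cur);
}

int
main(void)
{
	/* Walk the common path: inject, intercept iret, trace to completion. */
	enum nmi_event path[] = {
		EV_NMI_INJECTED, EV_IRET_INTERCEPTED, EV_DB_TRACED,
	};
	const char *names[] = {
		"NMI_UNBLOCKED", "NMI_IRET_INTERCEPT", "NMI_IRET_TRACING",
	};
	enum nmi_blocking s = NMI_UNBLOCKED;
	unsigned int i;

	for (i = 0; i < sizeof(path) / sizeof(path[0]); i++) {
		s = nmi_step(s, path[i]);
		printf("after event %u: %s\n", i, names[s]);
	}
	return (0);
}

Compiled on its own (for example as a scratch file nmi_model.c), this walks the common NMI_UNBLOCKED -> NMI_IRET_INTERCEPT -> NMI_IRET_TRACING -> NMI_UNBLOCKED path; the PSL_NT case would instead return to NMI_UNBLOCKED directly from NMI_IRET_INTERCEPT, matching the speculative-unblocking stat in the patch.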