Index: lib/libvmmapi/vmmapi.h
===================================================================
--- lib/libvmmapi/vmmapi.h
+++ lib/libvmmapi/vmmapi.h
@@ -32,6 +32,12 @@
 #include <sys/param.h>
 #include <sys/cpuset.h>
 
+/*
+ * API version for out-of-tree consumers like grub-bhyve for making compile
+ * time decisions.
+ */
+#define VMMAPI_VERSION  0101    /* 2 digit major followed by 2 digit minor */
+
 struct iovec;
 struct vmctx;
 enum x2apic_state;
@@ -70,13 +76,12 @@
             struct seg_desc *seg_desc);
 int     vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
 int     vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
-int     vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
-            struct vm_exit *ret_vmexit);
+int     vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit);
 int     vm_suspend(struct vmctx *ctx, enum vm_suspend_how how);
 int     vm_reinit(struct vmctx *ctx);
 int     vm_apicid2vcpu(struct vmctx *ctx, int apicid);
-int     vm_inject_exception(struct vmctx *ctx, int vcpu, int vec);
-int     vm_inject_exception2(struct vmctx *ctx, int vcpu, int vec, int errcode);
+int     vm_inject_exception(struct vmctx *ctx, int vcpu, int vector,
+            int errcode_valid, uint32_t errcode, int restart_instruction);
 int     vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
 int     vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector);
 int     vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg);
Index: lib/libvmmapi/vmmapi.c
===================================================================
--- lib/libvmmapi/vmmapi.c
+++ lib/libvmmapi/vmmapi.c
@@ -368,14 +368,13 @@
 }
 
 int
-vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
+vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit)
 {
        int error;
        struct vm_run vmrun;
 
        bzero(&vmrun, sizeof(vmrun));
        vmrun.cpuid = vcpu;
-       vmrun.rip = rip;
 
        error = ioctl(ctx->fd, VM_RUN, &vmrun);
        bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
@@ -399,36 +398,22 @@
        return (ioctl(ctx->fd, VM_REINIT, 0));
 }
 
-static int
-vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector,
-    int error_code, int error_code_valid)
+int
+vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid,
+    uint32_t errcode, int restart_instruction)
 {
        struct vm_exception exc;
 
-       bzero(&exc, sizeof(exc));
        exc.cpuid = vcpu;
        exc.vector = vector;
-       exc.error_code = error_code;
-       exc.error_code_valid = error_code_valid;
+       exc.error_code = errcode;
+       exc.error_code_valid = errcode_valid;
+       exc.restart_instruction = restart_instruction;
 
        return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc));
 }
 
 int
-vm_inject_exception(struct vmctx *ctx, int vcpu, int vector)
-{
-
-       return (vm_inject_exception_real(ctx, vcpu, vector, 0, 0));
-}
-
-int
-vm_inject_exception2(struct vmctx *ctx, int vcpu, int vector, int errcode)
-{
-
-       return (vm_inject_exception_real(ctx, vcpu, vector, errcode, 1));
-}
-
-int
 vm_apicid2vcpu(struct vmctx *ctx, int apicid)
 {
        /*
@@ -1198,3 +1183,11 @@
        *secs = rtctime.secs;
        return (error);
 }
+
+int
+vm_restart_instruction(void *arg, int vcpu)
+{
+       struct vmctx *ctx = arg;
+
+       return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu));
+}
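Note: the vm_run() and vm_inject_exception() changes above break source
compatibility for out-of-tree consumers, which is exactly what the new
VMMAPI_VERSION define is for. Below is a minimal sketch (not part of the
patch) of how a consumer such as grub-bhyve might key off it at compile
time; the run_vcpu() wrapper and its arguments are hypothetical:

#include <sys/param.h>
#include <machine/vmm.h>
#include <vmmapi.h>

/* Hypothetical wrapper showing both sides of the interface change. */
static int
run_vcpu(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
{
        int error;

#if VMMAPI_VERSION >= 0101
        /* New interface: %rip is kernel state; seed it once, then run. */
        error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip);
        if (error != 0)
                return (error);
        error = vm_run(ctx, vcpu, vmexit);
#else
        /* Old interface: the starting %rip was passed on every vm_run(). */
        error = vm_run(ctx, vcpu, rip, vmexit);
#endif
        return (error);
}

An undefined VMMAPI_VERSION evaluates to 0 in the preprocessor, so builds
against pre-patch headers fall through to the old call automatically.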
Index: sys/amd64/include/vmm_dev.h
===================================================================
--- sys/amd64/include/vmm_dev.h
+++ sys/amd64/include/vmm_dev.h
@@ -54,7 +54,6 @@
 
 struct vm_run {
        int             cpuid;
-       uint64_t        rip;            /* start running here */
        struct vm_exit  vm_exit;
 };
 
@@ -238,6 +237,7 @@
        IOCNUM_LAPIC_MSI = 36,
        IOCNUM_LAPIC_LOCAL_IRQ = 37,
        IOCNUM_IOAPIC_PINCOUNT = 38,
+       IOCNUM_RESTART_INSTRUCTION = 39,
 
        /* PCI pass-thru */
        IOCNUM_BIND_PPTDEV = 40,
@@ -360,4 +360,6 @@
        _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time)
 #define VM_RTC_GETTIME \
        _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time)
+#define VM_RESTART_INSTRUCTION \
+       _IOW('v', IOCNUM_RESTART_INSTRUCTION, int)
 #endif
Index: sys/amd64/vmm/intel/vmx.c
===================================================================
--- sys/amd64/vmm/intel/vmx.c
+++ sys/amd64/vmm/intel/vmx.c
@@ -2412,6 +2412,7 @@
        if (vm_mem_allocated(vmx->vm, gpa) ||
            apic_access_fault(vmx, vcpu, gpa)) {
                vmexit->exitcode = VM_EXITCODE_PAGING;
+               vmexit->inst_length = 0;
                vmexit->u.paging.gpa = gpa;
                vmexit->u.paging.fault_type = ept_fault_type(qual);
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
Index: sys/amd64/vmm/vmm.c
===================================================================
--- sys/amd64/vmm/vmm.c
+++ sys/amd64/vmm/vmm.c
@@ -109,6 +109,7 @@
        uint64_t        guest_xcr0;     /* (i) guest %xcr0 register */
        void            *stats;         /* (a,i) statistics */
        struct vm_exit  exitinfo;       /* (x) exit reason and collateral */
+       uint64_t        nextrip;        /* (x) next instruction to execute */
 };
 
 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
@@ -850,16 +851,26 @@
 }
 
 int
-vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
+vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
 {
+       struct vcpu *vcpu;
+       int error;
 
-       if (vcpu < 0 || vcpu >= VM_MAXCPU)
+       if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
                return (EINVAL);
 
        if (reg >= VM_REG_LAST)
                return (EINVAL);
 
-       return (VMSETREG(vm->cookie, vcpu, reg, val));
+       error = VMSETREG(vm->cookie, vcpuid, reg, val);
+       if (error || reg != VM_REG_GUEST_RIP)
+               return (error);
+
+       /* Set 'nextrip' to match the value of %rip */
+       VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
+       vcpu = &vm->vcpu[vcpuid];
+       vcpu->nextrip = val;
+       return (0);
 }
 
 static boolean_t
@@ -1199,6 +1210,9 @@
        vcpu = &vm->vcpu[vcpuid];
        vme = &vcpu->exitinfo;
 
+       KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
+           __func__, vme->inst_length));
+
        ftype = vme->u.paging.fault_type;
        KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE ||
            ftype == VM_PROT_EXECUTE,
@@ -1224,9 +1238,6 @@
        if (rv != KERN_SUCCESS)
                return (EFAULT);
 done:
-       /* restart execution at the faulting instruction */
-       vm_restart_instruction(vm, vcpuid);
-
        return (0);
 }
@@ -1281,10 +1292,13 @@
                return (EFAULT);
 
        /*
-        * If the instruction length is not specified the update it now.
+        * If the instruction length was not specified then update it now
+        * along with 'nextrip'.
         */
-       if (vme->inst_length == 0)
+       if (vme->inst_length == 0) {
                vme->inst_length = vie->num_processed;
+               vcpu->nextrip += vie->num_processed;
+       }
 
        /* return to userland unless this is an in-kernel emulated device */
        if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
@@ -1433,7 +1447,7 @@
        int error, vcpuid;
        struct vcpu *vcpu;
        struct pcb *pcb;
-       uint64_t tscval, rip;
+       uint64_t tscval;
        struct vm_exit *vme;
        bool retu, intr_disabled;
        pmap_t pmap;
@@ -1455,7 +1469,6 @@
        pmap = vmspace_pmap(vm->vmspace);
        vcpu = &vm->vcpu[vcpuid];
        vme = &vcpu->exitinfo;
-       rip = vmrun->rip;
 restart:
        critical_enter();
 
@@ -1470,7 +1483,7 @@
        restore_guest_fpustate(vcpu);
 
        vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
-       error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
+       error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, rptr, sptr);
        vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
 
        save_guest_fpustate(vcpu);
@@ -1481,6 +1494,7 @@
 
        if (error == 0) {
                retu = false;
+               vcpu->nextrip = vme->rip + vme->inst_length;
                switch (vme->exitcode) {
                case VM_EXITCODE_SUSPENDED:
                        error = vm_handle_suspend(vm, vcpuid, &retu);
@@ -1517,10 +1531,8 @@
                }
        }
 
-       if (error == 0 && retu == false) {
-               rip = vme->rip + vme->inst_length;
+       if (error == 0 && retu == false)
                goto restart;
-       }
 
        /* copy the exit information */
        bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
@@ -1530,14 +1542,43 @@
 
 int
 vm_restart_instruction(void *arg, int vcpuid)
 {
+       struct vm *vm;
        struct vcpu *vcpu;
-       struct vm *vm = arg;
+       enum vcpu_state state;
+       uint64_t rip;
+       int error;
 
+       vm = arg;
        if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
                return (EINVAL);
 
        vcpu = &vm->vcpu[vcpuid];
-       vcpu->exitinfo.inst_length = 0;
+       state = vcpu_get_state(vm, vcpuid, NULL);
+       if (state == VCPU_RUNNING) {
+               /*
+                * When a vcpu is "running" the next instruction is determined
+                * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
+                * Thus setting 'inst_length' to zero will cause the current
+                * instruction to be restarted.
+                */
+               vcpu->exitinfo.inst_length = 0;
+               VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
+                   "setting inst_length to zero", vcpu->exitinfo.rip);
+       } else if (state == VCPU_FROZEN) {
+               /*
+                * When a vcpu is "frozen" it is outside the critical section
+                * around VMRUN() and 'nextrip' points to the next instruction.
+                * Thus instruction restart is achieved by setting 'nextrip'
+                * to the vcpu's %rip.
+                */
+               error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
+               KASSERT(!error, ("%s: error %d getting rip", __func__, error));
+               VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
+                   "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
+               vcpu->nextrip = rip;
+       } else {
+               panic("%s: invalid state %d", __func__, state);
+       }
        return (0);
 }
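Note: the two comments in vm_restart_instruction() above carry the key
invariant of this patch: while a vcpu is RUNNING, a restart is expressed by
zeroing 'inst_length' in the exitinfo; while it is FROZEN (e.g. when
userland calls the new VM_RESTART_INSTRUCTION ioctl), it is expressed by
rewinding 'nextrip' to %rip. A sketch of the userland side, assuming a
hypothetical emulation helper that can fail transiently:

#include <assert.h>
#include <errno.h>
#include <machine/vmm.h>
#include <vmmapi.h>

/* Hypothetical device-model helper; returns EAGAIN to request a retry. */
static int
emulate_access(struct vmctx *ctx, int vcpu, struct vm_exit *vme)
{
        return (EAGAIN);
}

static int
handle_exit(struct vmctx *ctx, int vcpu, struct vm_exit *vme)
{
        int error;

        error = emulate_access(ctx, vcpu, vme);
        if (error == EAGAIN) {
                /* Rewind so the vcpu re-executes the same instruction. */
                error = vm_restart_instruction(ctx, vcpu);
                assert(error == 0);
        }
        return (error);
}

This is the same mechanism inout.c uses below for partially completed REP
string instructions.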
Index: sys/amd64/vmm/vmm_dev.c
===================================================================
--- sys/amd64/vmm/vmm_dev.c
+++ sys/amd64/vmm/vmm_dev.c
@@ -205,6 +205,7 @@
        case VM_ACTIVATE_CPU:
        case VM_SET_INTINFO:
        case VM_GET_INTINFO:
+       case VM_RESTART_INSTRUCTION:
                /*
                 * XXX fragile, handle with care
                 * Assumes that the first field of the ioctl data is the vcpu.
@@ -506,6 +507,9 @@
                rtctime = (struct vm_rtc_time *)data;
                rtctime->secs = vrtc_get_time(sc->vm);
                break;
+       case VM_RESTART_INSTRUCTION:
+               error = vm_restart_instruction(sc->vm, vcpu);
+               break;
        default:
                error = ENOTTY;
                break;
Index: usr.sbin/bhyve/bhyverun.h
===================================================================
--- usr.sbin/bhyve/bhyverun.h
+++ usr.sbin/bhyve/bhyverun.h
@@ -35,9 +35,8 @@
 #define __CTASSERT(x, y)        typedef char __assert ## y[(x) ? 1 : -1]
 #endif
 
-#define VMEXIT_CONTINUE         1       /* continue from next instruction */
-#define VMEXIT_RESTART          2       /* restart current instruction */
-#define VMEXIT_ABORT            3       /* abort the vm run loop */
+#define VMEXIT_CONTINUE         (0)
+#define VMEXIT_ABORT            (-1)
 
 struct vmctx;
 extern int guest_ncpus;
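Note: with VMEXIT_RESTART gone, a handler's return value no longer selects
the next %rip; that choice is now kernel state ('nextrip'). A handler that
needs the current instruction re-executed asks the kernel for it instead of
returning a special code. A sketch of the new convention (vmexit_example is
a made-up name; the assert mirrors vmexit_bogus/vmexit_mtrap below):

#include <assert.h>
#include <machine/vmm.h>
#include "bhyverun.h"

static int
vmexit_example(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
        /* Trap-style exits must not advance %rip. */
        assert(vme->inst_length == 0);
        return (VMEXIT_CONTINUE);
}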
Index: usr.sbin/bhyve/bhyverun.c
===================================================================
--- usr.sbin/bhyve/bhyverun.c
+++ usr.sbin/bhyve/bhyverun.c
@@ -185,20 +185,14 @@
     int errcode)
 {
        struct vmctx *ctx;
-       int error;
+       int error, restart_instruction;
 
        ctx = arg;
-       if (errcode_valid)
-               error = vm_inject_exception2(ctx, vcpu, vector, errcode);
-       else
-               error = vm_inject_exception(ctx, vcpu, vector);
-       assert(error == 0);
+       restart_instruction = 1;
 
-       /*
-        * Set the instruction length to 0 to ensure that the instruction is
-        * restarted when the fault handler returns.
-        */
-       vmexit[vcpu].inst_length = 0;
+       error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
+           restart_instruction);
+       assert(error == 0);
 }
 
 void *
@@ -329,12 +323,6 @@
        }
 
        error = emulate_inout(ctx, vcpu, vme, strictio);
-       if (!error && in && !string) {
-               error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
-                   vme->u.inout.eax);
-               assert(error == 0);
-       }
-
        if (error) {
                fprintf(stderr, "Unhandled %s%c 0x%04x\n", in ? "in" : "out",
                    bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
@@ -358,7 +346,7 @@
                    vme->u.msr.code, *pvcpu);
                if (strictmsr) {
                        vm_inject_gp(ctx, *pvcpu);
-                       return (VMEXIT_RESTART);
+                       return (VMEXIT_CONTINUE);
                }
        }
 
@@ -384,7 +372,7 @@
                    vme->u.msr.code, vme->u.msr.wval, *pvcpu);
                if (strictmsr) {
                        vm_inject_gp(ctx, *pvcpu);
-                       return (VMEXIT_RESTART);
+                       return (VMEXIT_CONTINUE);
                }
        }
        return (VMEXIT_CONTINUE);
@@ -462,9 +450,11 @@
 vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
+       assert(vmexit->inst_length == 0);
+
        stats.vmexit_bogus++;
 
-       return (VMEXIT_RESTART);
+       return (VMEXIT_CONTINUE);
 }
 
 static int
@@ -494,9 +484,11 @@
 vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
+       assert(vmexit->inst_length == 0);
+
        stats.vmexit_mtrap++;
 
-       return (VMEXIT_RESTART);
+       return (VMEXIT_CONTINUE);
 }
 
 static int
@@ -581,7 +573,7 @@
 };
 
 static void
-vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
+vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
 {
        int error, rc, prevcpu;
        enum vm_exitcode exitcode;
@@ -596,8 +588,11 @@
        error = vm_active_cpus(ctx, &active_cpus);
        assert(CPU_ISSET(vcpu, &active_cpus));
 
+       error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
+       assert(error == 0);
+
        while (1) {
-               error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
+               error = vm_run(ctx, vcpu, &vmexit[vcpu]);
                if (error != 0)
                        break;
 
@@ -614,10 +609,6 @@
 
                switch (rc) {
                case VMEXIT_CONTINUE:
-                       rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
-                       break;
-               case VMEXIT_RESTART:
-                       rip = vmexit[vcpu].rip;
                        break;
                case VMEXIT_ABORT:
                        abort();
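Note: the strictmsr paths above can now return VMEXIT_CONTINUE after
injecting #GP because vm_inject_exception() handles the restart in the
kernel. A sketch of raising #GP(0) the way a device model would under the
new interface (IDT_GP is vector 13, from <machine/segments.h>; the
inject_gp0() wrapper is made up for illustration):

#include <assert.h>
#include <machine/segments.h>
#include <machine/vmm.h>
#include <vmmapi.h>

static void
inject_gp0(struct vmctx *ctx, int vcpu)
{
        int error;

        /* errcode_valid = 1, errcode = 0, restart_instruction = 1 */
        error = vm_inject_exception(ctx, vcpu, IDT_GP, 1, 0, 1);
        assert(error == 0);
}

The single call replaces the old vm_inject_exception2() plus the manual
'vmexit[vcpu].inst_length = 0' that bhyve's fault-injection callback used
to do.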
Index: usr.sbin/bhyve/inout.c
===================================================================
--- usr.sbin/bhyve/inout.c
+++ usr.sbin/bhyve/inout.c
@@ -104,7 +104,7 @@
 emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
 {
        int addrsize, bytes, flags, in, port, prot, rep;
-       uint32_t val;
+       uint32_t eax, val;
        inout_func_t handler;
        void *arg;
        int error, retval;
@@ -214,16 +214,20 @@
        }
 
        /* Restart the instruction if more iterations remain */
-       if (retval == 0 && count != 0)
-               vmexit->inst_length = 0;
-       } else {
-       if (!in) {
-               val = vmexit->u.inout.eax & vie_size2mask(bytes);
+       if (retval == 0 && count != 0) {
+               error = vm_restart_instruction(ctx, vcpu);
+               assert(error == 0);
        }
+       } else {
+               eax = vmexit->u.inout.eax;
+               val = eax & vie_size2mask(bytes);
                retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
                if (retval == 0 && in) {
-                       vmexit->u.inout.eax &= ~vie_size2mask(bytes);
-                       vmexit->u.inout.eax |= val & vie_size2mask(bytes);
+                       eax &= ~vie_size2mask(bytes);
+                       eax |= val & vie_size2mask(bytes);
+                       error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
+                           eax);
+                       assert(error == 0);
                }
        }
        return (retval);
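Note: the non-string in/out path now folds the %eax update into
emulate_inout() itself, as a size-masked read-modify-write pushed through
vm_set_register() (the old bhyverun.c code above did the write-back only
for a non-string 'in'). A self-contained illustration of the merge, with
size2mask() standing in for vie_size2mask() from the instruction emulator:

#include <assert.h>
#include <stdint.h>

/* Stand-in for vie_size2mask(), for illustration only. */
static uint32_t
size2mask(int bytes)
{
        switch (bytes) {
        case 1:  return (0xff);
        case 2:  return (0xffff);
        default: return (0xffffffff);
        }
}

int
main(void)
{
        uint32_t eax = 0x12345678;      /* guest %eax at exit time */
        uint32_t val = 0xabcd;          /* value produced by an inw handler */
        int bytes = 2;

        /* Merge exactly as emulate_inout() does before vm_set_register(). */
        eax &= ~size2mask(bytes);
        eax |= val & size2mask(bytes);
        assert(eax == 0x1234abcd);
        return (0);
}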
Index: usr.sbin/bhyve/task_switch.c
===================================================================
--- usr.sbin/bhyve/task_switch.c
+++ usr.sbin/bhyve/task_switch.c
@@ -725,21 +725,11 @@
        assert(paging->cpu_mode == CPU_MODE_PROTECTED);
 
        /*
-        * Calculate the %eip to store in the old TSS before modifying the
-        * 'inst_length'.
+        * Calculate the instruction pointer to store in the old TSS.
         */
        eip = vmexit->rip + vmexit->inst_length;
 
        /*
-        * Set the 'inst_length' to '0'.
-        *
-        * If an exception is triggered during emulation of the task switch
-        * then the exception handler should return to the instruction that
-        * caused the task switch as opposed to the subsequent instruction.
-        */
-       vmexit->inst_length = 0;
-
-       /*
         * Section 4.6, "Access Rights" in Intel SDM Vol 3.
         * The following page table accesses are implicitly supervisor mode:
         * - accesses to GDT or LDT to load segment descriptors
@@ -883,8 +873,8 @@
         * after this point will be handled in the context of the new task and
         * the saved instruction pointer will belong to the new task.
         */
-       vmexit->rip = newtss.tss_eip;
-       assert(vmexit->inst_length == 0);
+       error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
+       assert(error == 0);
 
        /* Load processor state from new TSS */
        error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);
Index: usr.sbin/bhyvectl/bhyvectl.c
===================================================================
--- usr.sbin/bhyvectl/bhyvectl.c
+++ usr.sbin/bhyvectl/bhyvectl.c
@@ -2118,10 +2118,7 @@
        }
 
        if (!error && run) {
-               error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
-               assert(error == 0);
-
-               error = vm_run(ctx, vcpu, rip, &vmexit);
+               error = vm_run(ctx, vcpu, &vmexit);
                if (error == 0)
                        dump_vm_run_exitcode(&vmexit, vcpu);
                else