Index: sys/amd64/include/vmm.h =================================================================== --- sys/amd64/include/vmm.h +++ sys/amd64/include/vmm.h @@ -3,6 +3,7 @@ * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -132,7 +133,7 @@ struct pmap; struct vm_eventinfo { - void *rptr; /* rendezvous cookie */ + u_int *rptr; /* runblock cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; @@ -247,38 +248,21 @@ struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); -void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); #ifdef _SYS__CPUSET_H_ -/* - * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. - * The rendezvous 'func(arg)' is not allowed to do anything that will - * cause the thread to be put to sleep. - * - * If the rendezvous is being initiated from a vcpu context then the - * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. - * - * The caller cannot hold any locks when initiating the rendezvous. - * - * The implementation of this API may cause vcpus other than those specified - * by 'dest' to be stalled. The caller should not rely on any vcpus making - * forward progress when the rendezvous is in progress. - */ -typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); -void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, - vm_rendezvous_func_t func, void *arg); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); #endif /* _SYS__CPUSET_H_ */ static __inline int -vcpu_rendezvous_pending(struct vm_eventinfo *info) +vcpu_runblocked(struct vm_eventinfo *info) { - return (*((uintptr_t *)(info->rptr)) != 0); + return (*info->rptr != 0); } static __inline int @@ -317,6 +301,8 @@ int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); +void vcpu_block_run(struct vm *, int); +void vcpu_unblock_run(struct vm *, int); static int __inline vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) @@ -548,7 +534,7 @@ VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ - VM_EXITCODE_RENDEZVOUS, + VM_EXITCODE_RUNBLOCK, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, Index: sys/amd64/vmm/amd/svm.c =================================================================== --- sys/amd64/vmm/amd/svm.c +++ sys/amd64/vmm/amd/svm.c @@ -1573,6 +1573,8 @@ need_intr_window = 0; + vlapic_tmr_update(vlapic); + if (vcpustate->nextrip != state->rip) { ctrl->intr_shadow = 0; VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " @@ -1971,8 +1973,8 @@ * XXX * Setting 'vcpustate->lastcpu' here is bit premature because * we may return from this function without actually executing - * the VMRUN instruction. This could happen if a rendezvous - * or an AST is pending on the first time through the loop. + * the VMRUN instruction. 
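For review purposes, the new event-info plumbing can be exercised in a standalone userland sketch. The struct layout and vcpu_runblocked() below are copied from this patch; the main() driver, the spelled-out "unsigned int", and the printed message are illustrative only.

#include <stdio.h>

/* Mirrors the updated struct vm_eventinfo (u_int spelled out for userland). */
struct vm_eventinfo {
	unsigned int *rptr;	/* runblock cookie */
	int *sptr;		/* suspend cookie */
	int *iptr;		/* reqidle cookie */
};

/* Same logic as the new vcpu_runblocked() inline. */
static int
vcpu_runblocked(struct vm_eventinfo *info)
{
	return (*info->rptr != 0);
}

int
main(void)
{
	unsigned int runblock = 0;
	int suspend = 0, reqidle = 0;
	struct vm_eventinfo evinfo = { &runblock, &suspend, &reqidle };

	/*
	 * vm_run() points rptr at the per-vcpu 'runblock' counter; when
	 * vcpu_block_run() raises it, the entry path bails out with
	 * VM_EXITCODE_RUNBLOCK instead of entering the guest.
	 */
	runblock = 1;
	printf("runblocked: %d\n", vcpu_runblocked(&evinfo));
	return (0);
}

The switch from dereferencing the rendezvous function pointer to reading a plain counter is what allows vm_run() to aim rptr at the per-vcpu runblock field later in this change.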
This could happen if an AST or yield + * condition is pending on the first time through the loop. * * This works for now but any new side-effects of vcpu * migration should take this case into account. @@ -2002,9 +2004,9 @@ break; } - if (vcpu_rendezvous_pending(evinfo)) { + if (vcpu_runblocked(evinfo)) { enable_gintr(); - vm_exit_rendezvous(vm, vcpu, state->rip); + vm_exit_runblock(vm, vcpu, state->rip); break; } Index: sys/amd64/vmm/intel/vmx.c =================================================================== --- sys/amd64/vmm/intel/vmx.c +++ sys/amd64/vmm/intel/vmx.c @@ -1324,6 +1324,8 @@ uint64_t rflags, entryinfo; uint32_t gi, info; + vlapic_tmr_update(vlapic); + if (vmx->state[vcpu].nextrip != guestrip) { gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { @@ -2904,9 +2906,9 @@ break; } - if (vcpu_rendezvous_pending(evinfo)) { + if (vcpu_runblocked(evinfo)) { enable_intr(); - vm_exit_rendezvous(vmx->vm, vcpu, rip); + vm_exit_runblock(vmx->vm, vcpu, rip); break; } @@ -3577,30 +3579,12 @@ } static void -vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) +vmx_set_tmr(struct vlapic *vlapic, const uint32_t *masks) { - struct vlapic_vtx *vlapic_vtx; - struct vmx *vmx; - struct vmcs *vmcs; - uint64_t mask, val; - - KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); - KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), - ("vmx_set_tmr: vcpu cannot be running")); - - vlapic_vtx = (struct vlapic_vtx *)vlapic; - vmx = vlapic_vtx->vmx; - vmcs = &vmx->vmcs[vlapic->vcpuid]; - mask = 1UL << (vector % 64); - - VMPTRLD(vmcs); - val = vmcs_read(VMCS_EOI_EXIT(vector)); - if (level) - val |= mask; - else - val &= ~mask; - vmcs_write(VMCS_EOI_EXIT(vector), val); - VMCLEAR(vmcs); + vmcs_write(VMCS_EOI_EXIT0, ((uint64_t)masks[1] << 32) | masks[0]); + vmcs_write(VMCS_EOI_EXIT1, ((uint64_t)masks[3] << 32) | masks[2]); + vmcs_write(VMCS_EOI_EXIT2, ((uint64_t)masks[5] << 32) | masks[4]); + vmcs_write(VMCS_EOI_EXIT3, ((uint64_t)masks[7] << 32) | masks[6]); } static void Index: sys/amd64/vmm/io/vioapic.c =================================================================== --- sys/amd64/vmm/io/vioapic.c +++ sys/amd64/vmm/io/vioapic.c @@ -4,6 +4,7 @@ * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -39,6 +40,7 @@ #include #include #include +#include #include #include @@ -223,48 +225,139 @@ return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } +#define REDIR_IS_PHYS(reg) (((reg) & IOART_DESTMOD) == IOART_DESTPHY) +#define REDIR_IS_LOWPRIO(reg) (((reg) & IOART_DELMOD) == IOART_DELLOPRI) +/* Level-triggered interrupts only valid in fixed and low-priority modes */ +#define REDIR_IS_LVLTRIG(reg) \ + (((reg) & IOART_TRGRLVL) != 0 && \ + (((reg) & IOART_DELMOD) == IOART_DELFIXED || REDIR_IS_LOWPRIO(reg))) +#define REDIR_DEST(reg) ((reg) >> (32 + APIC_ID_SHIFT)) +#define REDIR_VECTOR(reg) ((reg) & IOART_INTVEC) + /* - * Reset the vlapic's trigger-mode register to reflect the ioapic pin - * configuration. + * Given a redirection entry, determine which vCPUs would be targeted. 
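The REDIR_* helpers can be checked in isolation. In this sketch the IOART_* constants and APIC_ID_SHIFT are reproduced from the standard I/O APIC redirection-entry layout (as in x86/apicreg.h) rather than pulled in via kernel headers, and the sample entry in main() is made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* Constants mirroring the standard I/O APIC redirection-entry layout. */
#define	APIC_ID_SHIFT		24
#define	IOART_INTVEC		0x000000ff
#define	IOART_DELMOD		0x00000700
#define	IOART_DELFIXED		0x00000000
#define	IOART_DELLOPRI		0x00000100
#define	IOART_DESTMOD		0x00000800
#define	IOART_DESTPHY		0x00000000
#define	IOART_TRGRLVL		0x00008000

/* The decode macros added to vioapic.c. */
#define	REDIR_IS_PHYS(reg)	(((reg) & IOART_DESTMOD) == IOART_DESTPHY)
#define	REDIR_IS_LOWPRIO(reg)	(((reg) & IOART_DELMOD) == IOART_DELLOPRI)
#define	REDIR_IS_LVLTRIG(reg)						\
	(((reg) & IOART_TRGRLVL) != 0 &&				\
	(((reg) & IOART_DELMOD) == IOART_DELFIXED || REDIR_IS_LOWPRIO(reg)))
#define	REDIR_DEST(reg)		((reg) >> (32 + APIC_ID_SHIFT))
#define	REDIR_VECTOR(reg)	((reg) & IOART_INTVEC)

int
main(void)
{
	/* Level-triggered, fixed delivery, physical APIC ID 2, vector 0x51. */
	uint64_t ent = ((uint64_t)2 << (32 + APIC_ID_SHIFT)) |
	    IOART_TRGRLVL | IOART_DELFIXED | 0x51;

	printf("level=%d phys=%d dest=%u vector=%#x\n",
	    REDIR_IS_LVLTRIG(ent) ? 1 : 0, REDIR_IS_PHYS(ent) ? 1 : 0,
	    (unsigned)REDIR_DEST(ent), (unsigned)REDIR_VECTOR(ent));
	return (0);
}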
*/ static void -vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg) +vioapic_calcdest(struct vioapic *vioapic, uint64_t redir_ent, cpuset_t *dmask) { - struct vioapic *vioapic; - struct vlapic *vlapic; - uint32_t low, high, dest; - int delmode, pin, vector; - bool level, phys; - vlapic = vm_lapic(vm, vcpuid); - vioapic = vm_ioapic(vm); + /* + * When calculating interrupt destinations with vlapic_calcdest(), the + * legacy xAPIC format is assumed, since the system lacks interrupt + * redirection hardware. + * See vlapic_deliver_intr() for more details. + */ + vlapic_calcdest(vioapic->vm, dmask, REDIR_DEST(redir_ent), + REDIR_IS_PHYS(redir_ent), REDIR_IS_LOWPRIO(redir_ent), false); +} + +/* + * Across all redirection entries utilizing a specified vector, determine the + * set of vCPUs which would be targeted by a level-triggered interrupt. + */ +static void +vioapic_tmr_active(struct vioapic *vioapic, uint8_t vec, cpuset_t *result) +{ + u_int i; + + CPU_ZERO(result); + if (vec == 0) { + return; + } + + for (i = 0; i < REDIR_ENTRIES; i++) { + cpuset_t dest; + const uint64_t val = vioapic->rtbl[i].reg; + + if (!REDIR_IS_LVLTRIG(val) || REDIR_VECTOR(val) != vec) { + continue; + } + + CPU_ZERO(&dest); + vioapic_calcdest(vioapic, val, &dest); + CPU_OR(result, &dest); + } +} + +/* + * Update TMR state in vLAPICs after changes to vIOAPIC pin configuration + */ +static void +vioapic_update_tmrs(struct vioapic *vioapic, int vcpuid, uint64_t oldval, + uint64_t newval) +{ + cpuset_t active, allset, newset, oldset; + struct vm *vm; + uint8_t newvec, oldvec; + + vm = vioapic->vm; + CPU_ZERO(&allset); + CPU_ZERO(&newset); + CPU_ZERO(&oldset); + newvec = oldvec = 0; + + if (REDIR_IS_LVLTRIG(oldval)) { + vioapic_calcdest(vioapic, oldval, &oldset); + CPU_OR(&allset, &oldset); + oldvec = REDIR_VECTOR(oldval); + } + + if (REDIR_IS_LVLTRIG(newval)) { + vioapic_calcdest(vioapic, newval, &newset); + CPU_OR(&allset, &newset); + newvec = REDIR_VECTOR(newval); + } + + if (CPU_EMPTY(&allset) || + (CPU_CMP(&oldset, &newset) == 0 && oldvec == newvec)) { + return; + } - VIOAPIC_LOCK(vioapic); /* - * Reset all vectors to be edge-triggered. + * Since the write to the redirection table has already occurred, a + * scan of level-triggered entries referencing the old vector will find + * only entries which are now currently valid. */ - vlapic_reset_tmr(vlapic); - for (pin = 0; pin < REDIR_ENTRIES; pin++) { - low = vioapic->rtbl[pin].reg; - high = vioapic->rtbl[pin].reg >> 32; + vioapic_tmr_active(vioapic, oldvec, &active); - level = low & IOART_TRGRLVL ? true : false; - if (!level) + while (!CPU_EMPTY(&allset)) { + struct vlapic *vlapic; + u_int i; + + i = CPU_FFS(&allset) - 1; + CPU_CLR(i, &allset); + + if (oldvec == newvec && + CPU_ISSET(i, &oldset) && CPU_ISSET(i, &newset)) { continue; + } - /* - * For a level-triggered 'pin' let the vlapic figure out if - * an assertion on this 'pin' would result in an interrupt - * being delivered to it. If yes, then it will modify the - * TMR bit associated with this vector to level-triggered. 
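A simplified model of the reconciliation performed by vioapic_update_tmrs(), using plain uint32_t bitmasks in place of cpuset_t and omitting the early-return checks and the vcpu_block_run()/vcpu_unblock_run() handshake; the scenario in main() is hypothetical.

#include <stdint.h>
#include <stdio.h>

static void
reconcile(uint32_t oldset, uint32_t newset, uint32_t active,
    uint8_t oldvec, uint8_t newvec)
{
	uint32_t allset = oldset | newset;
	int i;

	for (i = 0; i < 32; i++) {
		if ((allset & (1u << i)) == 0)
			continue;
		/* Nothing to do if this vCPU keeps the same vector. */
		if (oldvec == newvec && (oldset & newset & (1u << i)) != 0)
			continue;
		/* Deassert only if no other entry still uses the old vector. */
		if ((oldset & (1u << i)) != 0 && (active & (1u << i)) == 0)
			printf("vcpu %d: deassert TMR vector %#x\n", i, oldvec);
		if ((newset & (1u << i)) != 0)
			printf("vcpu %d: assert TMR vector %#x\n", i, newvec);
	}
}

int
main(void)
{
	/* Entry retargeted from vCPU0 to vCPU1, vector 0x20 -> 0x21. */
	reconcile(0x1, 0x2, 0x0, 0x20, 0x21);
	return (0);
}

The 'active' set is what prevents a deassertion when some other level-triggered entry still routes the old vector to that vCPU.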
- */ - phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); - delmode = low & IOART_DELMOD; - vector = low & IOART_INTVEC; - dest = high >> APIC_ID_SHIFT; - vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); + if (i != vcpuid) { + vcpu_block_run(vm, i); + } + + vlapic = vm_lapic(vm, i); + if (CPU_ISSET(i, &oldset)) { + /* + * Perform the deassertion if no other level-triggered + * IOAPIC entries target this vCPU with the old vector + * + * Note: Sharing of vectors like that should be + * extremely rare in modern operating systems and was + * previously unsupported by the bhyve vIOAPIC. + */ + if (!CPU_ISSET(i, &active)) { + vlapic_tmr_set(vlapic, oldvec, false); + } + } + if (CPU_ISSET(i, &newset)) { + vlapic_tmr_set(vlapic, newvec, true); + } + + if (i != vcpuid) { + vcpu_unblock_run(vm, i); + } } - VIOAPIC_UNLOCK(vioapic); } static uint32_t @@ -308,7 +401,6 @@ uint64_t data64, mask64; uint64_t last, changed; int regnum, pin, lshift; - cpuset_t allvcpus; regnum = addr & 0xff; switch (regnum) { @@ -344,18 +436,15 @@ /* * If any fields in the redirection table entry (except mask - * or polarity) have changed then rendezvous all the vcpus - * to update their vlapic trigger-mode registers. + * or polarity) have changed then update the trigger-mode + * registers on all the vlapics. */ changed = last ^ vioapic->rtbl[pin].reg; if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " "vlapic trigger-mode register", pin); - VIOAPIC_UNLOCK(vioapic); - allvcpus = vm_active_cpus(vioapic->vm); - vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus, - vioapic_update_tmr, NULL); - VIOAPIC_LOCK(vioapic); + vioapic_update_tmrs(vioapic, vcpuid, last, + vioapic->rtbl[pin].reg); } /* Index: sys/amd64/vmm/io/vlapic.h =================================================================== --- sys/amd64/vmm/io/vlapic.h +++ sys/amd64/vmm/io/vlapic.h @@ -83,16 +83,11 @@ void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec); -/* Reset the trigger-mode bits for all vectors to be edge-triggered */ -void vlapic_reset_tmr(struct vlapic *vlapic); +void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest); -/* - * Set the trigger-mode bit associated with 'vector' to level-triggered if - * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to - * this 'vlapic'. - */ -void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, - int delmode, int vector); +void vlapic_tmr_update(struct vlapic *vlapic); +void vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active); void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); uint64_t vlapic_get_cr8(struct vlapic *vlapic); Index: sys/amd64/vmm/io/vlapic.c =================================================================== --- sys/amd64/vmm/io/vlapic.c +++ sys/amd64/vmm/io/vlapic.c @@ -3,6 +3,7 @@ * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -78,6 +79,8 @@ */ #define VLAPIC_BUS_FREQ (128 * 1024 * 1024) +static void vlapic_tmr_reset(struct vlapic *); + static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) { @@ -809,11 +812,11 @@ /* * This function populates 'dmask' with the set of vcpus that match the * addressing specified by the (dest, phys, lowprio) tuple. 
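The write path above only recomputes TMR state when a meaningful field changes. A small check of that filter, with the IOART_* bit positions reproduced from the standard redirection-entry layout and hypothetical sample values:

#include <stdint.h>
#include <stdio.h>

#define	IOART_INTPOL	0x00002000	/* polarity */
#define	IOART_TRGRLVL	0x00008000	/* trigger mode */
#define	IOART_INTMASK	0x00010000	/* mask bit */

/* Returns non-zero when a register write must recalculate vlapic TMR state. */
static int
tmr_update_needed(uint64_t last, uint64_t new)
{
	uint64_t changed = last ^ new;

	return ((changed & ~(uint64_t)(IOART_INTMASK | IOART_INTPOL)) != 0);
}

int
main(void)
{
	uint64_t ent = 0x51;	/* edge-triggered, vector 0x51 */

	/* Masking or polarity flips alone do not touch the TMR. */
	printf("mask toggle:    %d\n", tmr_update_needed(ent, ent ^ IOART_INTMASK));
	/* Switching to level trigger does. */
	printf("trigger toggle: %d\n", tmr_update_needed(ent, ent ^ IOART_TRGRLVL));
	return (0);
}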
- * + * * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) * or xAPIC (8-bit) destination field. */ -static void +void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest) { @@ -1432,7 +1435,7 @@ lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); - vlapic_reset_tmr(vlapic); + vlapic_tmr_reset(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); @@ -1600,60 +1603,77 @@ } static void -vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) +vlapic_tmr_reset(struct vlapic *vlapic) { struct LAPIC *lapic; - uint32_t *tmrptr, mask; - int idx; lapic = vlapic->apic_page; - tmrptr = &lapic->tmr0; - idx = (vector / 32) * 4; - mask = 1 << (vector % 32); - if (level) - tmrptr[idx] |= mask; - else - tmrptr[idx] &= ~mask; - - if (vlapic->ops.set_tmr != NULL) - (*vlapic->ops.set_tmr)(vlapic, vector, level); + lapic->tmr0 = lapic->tmr1 = lapic->tmr2 = lapic->tmr3 = 0; + lapic->tmr4 = lapic->tmr5 = lapic->tmr6 = lapic->tmr7 = 0; + vlapic->tmr_pending = 1; } +/* + * Synchronize TMR designations into the LAPIC state. + * The vCPU must be in the VCPU_RUNNING state. + */ void -vlapic_reset_tmr(struct vlapic *vlapic) +vlapic_tmr_update(struct vlapic *vlapic) { - int vector; + struct LAPIC *lapic; + uint32_t *tmrptr; + uint32_t result[VLAPIC_TMR_CNT]; + u_int i, tmr_idx; - VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); + if (vlapic->tmr_pending == 0) { + return; + } + + lapic = vlapic->apic_page; + tmrptr = &lapic->tmr0; - for (vector = 0; vector <= 255; vector++) - vlapic_set_tmr(vlapic, vector, false); + VLAPIC_CTR0(vlapic, "synchronizing TMR"); + for (i = 0; i < VLAPIC_TMR_CNT; i++) { + tmr_idx = i * 4; + + tmrptr[tmr_idx] &= ~vlapic->tmr_vec_deassert[i]; + tmrptr[tmr_idx] |= vlapic->tmr_vec_assert[i]; + vlapic->tmr_vec_deassert[i] = 0; + vlapic->tmr_vec_assert[i] = 0; + result[i] = tmrptr[tmr_idx]; + } + vlapic->tmr_pending = 0; + + if (vlapic->ops.set_tmr != NULL) { + (*vlapic->ops.set_tmr)(vlapic, result); + } } +/* + * Designate the TMR state for a given interrupt vector. + * The caller must hold the vIOAPIC lock and prevent the vCPU corresponding to + * this vLAPIC instance from being-in or entering the VCPU_RUNNING state. + */ void -vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, - int delmode, int vector) +vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active) { - cpuset_t dmask; - bool lowprio; - - KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + const uint32_t idx = vector / 32; + const uint32_t mask = 1 << (vector % 32); + + VLAPIC_CTR2(vlapic, "TMR for vector %u %sasserted", vector, + active ? "" : "de"); + if (active) { + vlapic->tmr_vec_assert[idx] |= mask; + vlapic->tmr_vec_deassert[idx] &= ~mask; + } else { + vlapic->tmr_vec_deassert[idx] |= mask; + vlapic->tmr_vec_assert[idx] &= ~mask; + } /* - * A level trigger is valid only for fixed and lowprio delivery modes. + * Track the number of TMR changes between calls to vlapic_tmr_update. + * While a simple boolean would suffice, this count may be useful when + * tracing or debugging, and is cheap to calculate. 
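A distilled userland restatement of the vlapic_tmr_set() bookkeeping; struct tmr_shadow is a stand-in for the relevant vlapic fields and the vector used in main() is arbitrary.

#include <stdint.h>
#include <stdio.h>
#include <sys/param.h>	/* MIN() */

#define	VLAPIC_TMR_CNT	8

struct tmr_shadow {
	uint32_t tmr_pending;
	uint32_t tmr_vec_deassert[VLAPIC_TMR_CNT];
	uint32_t tmr_vec_assert[VLAPIC_TMR_CNT];
};

/* Same bookkeeping as vlapic_tmr_set(): record the intent, bump the counter. */
static void
tmr_set(struct tmr_shadow *s, uint8_t vector, int active)
{
	const uint32_t idx = vector / 32;
	const uint32_t mask = 1u << (vector % 32);

	if (active) {
		s->tmr_vec_assert[idx] |= mask;
		s->tmr_vec_deassert[idx] &= ~mask;
	} else {
		s->tmr_vec_deassert[idx] |= mask;
		s->tmr_vec_assert[idx] &= ~mask;
	}
	/* Saturating increment: never wraps back to 0 ("nothing pending"). */
	s->tmr_pending = MIN(UINT32_MAX - 1, s->tmr_pending) + 1;
}

int
main(void)
{
	struct tmr_shadow s = { 0 };

	tmr_set(&s, 0x51, 1);
	tmr_set(&s, 0x51, 0);	/* later deassertion supersedes the assert */
	printf("pending=%u assert[2]=%#x deassert[2]=%#x\n",
	    s.tmr_pending, s.tmr_vec_assert[2], s.tmr_vec_deassert[2]);
	return (0);
}

A later call for the same vector always supersedes the earlier one, so at most one of the assert/deassert bits can be set for a vector by the time vlapic_tmr_update() runs.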
*/ - if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { - VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " - "delivery-mode %d", delmode); - return; - } - - lowprio = (delmode == APIC_DELMODE_LOWPRIO); - vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); - - if (!CPU_ISSET(vlapic->vcpuid, &dmask)) - return; - - VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); - vlapic_set_tmr(vlapic, vector, true); + vlapic->tmr_pending = MIN(UINT32_MAX - 1, vlapic->tmr_pending) + 1; } Index: sys/amd64/vmm/io/vlapic_priv.h =================================================================== --- sys/amd64/vmm/io/vlapic_priv.h +++ sys/amd64/vmm/io/vlapic_priv.h @@ -3,6 +3,7 @@ * * Copyright (c) 2013 Neel Natu * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -138,6 +139,8 @@ #define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI +#define VLAPIC_TMR_CNT 8 + struct vlapic; struct vlapic_ops { @@ -145,7 +148,7 @@ int (*pending_intr)(struct vlapic *vlapic, int *vecptr); void (*intr_accepted)(struct vlapic *vlapic, int vector); void (*post_intr)(struct vlapic *vlapic, int hostcpu); - void (*set_tmr)(struct vlapic *vlapic, int vector, bool level); + void (*set_tmr)(struct vlapic *vlapic, const uint32_t *result); void (*enable_x2apic_mode)(struct vlapic *vlapic); }; @@ -157,6 +160,7 @@ uint32_t esr_pending; int esr_firing; + uint32_t tmr_pending; struct callout callout; /* vlapic timer */ struct bintime timer_fire_bt; /* callout expiry time */ @@ -184,6 +188,19 @@ */ uint32_t svr_last; uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; + + /* + * Store intended modifications to the trigger-mode register state. + * Along with the tmr_pending counter above, these are protected by the + * vIOAPIC lock and can only be modified under specific conditions: + * + * 1. When holding the vIOAPIC lock, and the vCPU to which the vLAPIC + * belongs is prevented from entering the VCPU_RUNNING state. + * 2. When the owning vCPU is in the VCPU_RUNNING state, and is + * applying the TMR modifications prior to interrupt injection. + */ + uint32_t tmr_vec_deassert[VLAPIC_TMR_CNT]; + uint32_t tmr_vec_assert[VLAPIC_TMR_CNT]; }; void vlapic_init(struct vlapic *vlapic); Index: sys/amd64/vmm/vmm.c =================================================================== --- sys/amd64/vmm/vmm.c +++ sys/amd64/vmm/vmm.c @@ -3,6 +3,7 @@ * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -96,6 +97,7 @@ struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int hostcpu; /* (o) vcpu's host cpu */ + u_int runblock; /* (i) block vcpu from run state */ int reqidle; /* (i) request vcpu to idle */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ @@ -156,11 +158,6 @@ int suspend; /* (i) stop VM execution */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ - cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ - cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ - void *rendezvous_arg; /* (x) rendezvous func/arg */ - vm_rendezvous_func_t rendezvous_func; - struct mtx rendezvous_mtx; /* (o) rendezvous lock */ struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ struct vmspace *vmspace; /* (o) guest's address space */ @@ -293,6 +290,7 @@ vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vcpu->runblock = 0; vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; @@ -461,7 +459,6 @@ vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; - mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm->sockets = 1; vm->cores = cores_per_package; /* XXX backwards compatibility */ @@ -1201,6 +1198,12 @@ break; } + if (newstate == VCPU_RUNNING) { + while (vcpu->runblock != 0) { + msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); + } + } + if (error) return (EBUSY); @@ -1213,8 +1216,10 @@ else vcpu->hostcpu = NOCPU; - if (newstate == VCPU_IDLE) + if (newstate == VCPU_IDLE || + (newstate == VCPU_FROZEN && vcpu->runblock != 0)) { wakeup(&vcpu->state); + } return (0); } @@ -1237,63 +1242,6 @@ panic("Error %d setting state to %d", error, newstate); } -static void -vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func) -{ - - KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked")); - - /* - * Update 'rendezvous_func' and execute a write memory barrier to - * ensure that it is visible across all host cpus. This is not needed - * for correctness but it does ensure that all the vcpus will notice - * that the rendezvous is requested immediately. 
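The gating added to vcpu_set_state_locked() boils down to two predicates, restated here in a standalone sketch (the enum is a local stand-in for the kernel's vcpu_state; the values and driver output are illustrative only):

#include <stdio.h>

enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING };

/*
 * A transition to VCPU_RUNNING must wait until 'runblock' drains, and
 * waiters on the state word are woken when the vcpu goes idle or leaves
 * the running state while a runblock is pending.
 */
static int
must_wait_for_runblock(enum vcpu_state newstate, unsigned int runblock)
{
	return (newstate == VCPU_RUNNING && runblock != 0);
}

static int
wakeup_needed(enum vcpu_state newstate, unsigned int runblock)
{
	return (newstate == VCPU_IDLE ||
	    (newstate == VCPU_FROZEN && runblock != 0));
}

int
main(void)
{
	printf("RUNNING with runblock=1: wait=%d\n",
	    must_wait_for_runblock(VCPU_RUNNING, 1));
	printf("FROZEN with runblock=1: wakeup=%d\n",
	    wakeup_needed(VCPU_FROZEN, 1));
	printf("FROZEN with runblock=0: wakeup=%d\n",
	    wakeup_needed(VCPU_FROZEN, 0));
	return (0);
}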
- */ - vm->rendezvous_func = func; - wmb(); -} - -#define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ - do { \ - if (vcpuid >= 0) \ - VCPU_CTR0(vm, vcpuid, fmt); \ - else \ - VM_CTR0(vm, fmt); \ - } while (0) - -static void -vm_handle_rendezvous(struct vm *vm, int vcpuid) -{ - - KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus), - ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); - - mtx_lock(&vm->rendezvous_mtx); - while (vm->rendezvous_func != NULL) { - /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ - CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); - - if (vcpuid != -1 && - CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && - !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { - VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); - (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); - CPU_SET(vcpuid, &vm->rendezvous_done_cpus); - } - if (CPU_CMP(&vm->rendezvous_req_cpus, - &vm->rendezvous_done_cpus) == 0) { - VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); - vm_set_rendezvous_func(vm, NULL); - wakeup(&vm->rendezvous_func); - break; - } - RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); - mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, - "vmrndv", 0); - } - mtx_unlock(&vm->rendezvous_mtx); -} - /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ @@ -1321,7 +1269,7 @@ * vcpu returned from VMRUN() and before it acquired the * vcpu lock above. */ - if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) + if (vm->suspend || vcpu->reqidle) break; if (vm_nmi_pending(vm, vcpuid)) break; @@ -1515,10 +1463,6 @@ /* * Wait until all 'active_cpus' have suspended themselves. - * - * Since a VM may be suspended at any time including when one or - * more vcpus are doing a rendezvous we need to call the rendezvous - * handler while we are waiting to prevent a deadlock. 
*/ vcpu_lock(vcpu); while (1) { @@ -1527,17 +1471,10 @@ break; } - if (vm->rendezvous_func == NULL) { - VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); - vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); - msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); - vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); - } else { - VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); - vcpu_unlock(vcpu); - vm_handle_rendezvous(vm, vcpuid); - vcpu_lock(vcpu); - } + VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); + msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); } vcpu_unlock(vcpu); @@ -1621,17 +1558,15 @@ } void -vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) +vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; - KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); - vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; - vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); + vmexit->exitcode = VM_EXITCODE_RUNBLOCK; + vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1); } void @@ -1684,7 +1619,7 @@ pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; - evinfo.rptr = &vm->rendezvous_func; + evinfo.rptr = &vcpu->runblock; evinfo.sptr = &vm->suspend; evinfo.iptr = &vcpu->reqidle; restart: @@ -1724,9 +1659,7 @@ vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector); break; - case VM_EXITCODE_RENDEZVOUS: - vm_handle_rendezvous(vm, vcpuid); - error = 0; + case VM_EXITCODE_RUNBLOCK: break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); @@ -2321,6 +2254,46 @@ return (state); } +void +vcpu_block_run(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vcpu_block_run: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu->runblock++; + if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) { + vcpu_notify_event_locked(vcpu, false); + } + while (vcpu->state == VCPU_RUNNING) { + msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); + } + vcpu_unlock(vcpu); +} + +void +vcpu_unblock_run(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vcpu_block_run: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + KASSERT(vcpu->runblock != 0, ("expected non-zero runblock")); + vcpu->runblock--; + if (vcpu->runblock == 0) { + wakeup(&vcpu->state); + } + vcpu_unlock(vcpu); +} + int vm_activate_cpu(struct vm *vm, int vcpuid) { @@ -2504,54 +2477,6 @@ return (apicid); } -void -vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, - vm_rendezvous_func_t func, void *arg) -{ - int i; - - /* - * Enforce that this function is called without any locks - */ - WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); - KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus), - ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); - -restart: - mtx_lock(&vm->rendezvous_mtx); - if (vm->rendezvous_func != NULL) { - /* - * If a rendezvous is already in progress then we need to - * call the rendezvous handler in case this 'vcpuid' is one - * of the targets of the rendezvous. 
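The block/unblock handshake can be modeled in userland with a mutex and condition variable standing in for the vcpu spinlock, msleep_spin(), and wakeup(); everything below is a simplified analogue, not the kernel code, and the thread driver is hypothetical.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Userland stand-ins for the per-vcpu lock, state, and runblock counter. */
static pthread_mutex_t vcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t vcpu_cv = PTHREAD_COND_INITIALIZER;
static int vcpu_running;
static unsigned int runblock;

/* Modeled on vcpu_block_run(): force the vcpu out of "running" and hold it. */
static void
block_run(void)
{
	pthread_mutex_lock(&vcpu_mtx);
	runblock++;
	/* vcpu_notify_event_locked() would kick the guest out here. */
	while (vcpu_running)
		pthread_cond_wait(&vcpu_cv, &vcpu_mtx);
	pthread_mutex_unlock(&vcpu_mtx);
}

/* Modeled on vcpu_unblock_run(). */
static void
unblock_run(void)
{
	pthread_mutex_lock(&vcpu_mtx);
	runblock--;
	if (runblock == 0)
		pthread_cond_broadcast(&vcpu_cv);
	pthread_mutex_unlock(&vcpu_mtx);
}

/* The vcpu loop: cannot (re)enter "running" while a runblock is pending. */
static void *
vcpu_thread(void *arg)
{
	int i;

	for (i = 0; i < 3; i++) {
		pthread_mutex_lock(&vcpu_mtx);
		while (runblock != 0)
			pthread_cond_wait(&vcpu_cv, &vcpu_mtx);
		vcpu_running = 1;
		pthread_mutex_unlock(&vcpu_mtx);

		usleep(1000);			/* "guest executes" */

		pthread_mutex_lock(&vcpu_mtx);
		vcpu_running = 0;
		pthread_cond_broadcast(&vcpu_cv);	/* wake blockers */
		pthread_mutex_unlock(&vcpu_mtx);
	}
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, vcpu_thread, NULL);
	block_run();
	printf("vcpu held outside the running state; safe to touch TMR\n");
	unblock_run();
	pthread_join(tid, NULL);
	return (0);
}

As in the kernel version, a single wait channel serves both directions: blockers wait for the vcpu to leave the running state, and the vcpu waits for the runblock count to drain.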
- */ - RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); - mtx_unlock(&vm->rendezvous_mtx); - vm_handle_rendezvous(vm, vcpuid); - goto restart; - } - KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " - "rendezvous is still in progress")); - - RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); - vm->rendezvous_req_cpus = dest; - CPU_ZERO(&vm->rendezvous_done_cpus); - vm->rendezvous_arg = arg; - vm_set_rendezvous_func(vm, func); - mtx_unlock(&vm->rendezvous_mtx); - - /* - * Wake up any sleeping vcpus and trigger a VM-exit in any running - * vcpus so they handle the rendezvous as soon as possible. - */ - for (i = 0; i < vm->maxcpus; i++) { - if (CPU_ISSET(i, &dest)) - vcpu_notify_event(vm, i, false); - } - - vm_handle_rendezvous(vm, vcpuid); -} - struct vatpic * vm_atpic(struct vm *vm) { Index: sys/amd64/vmm/vmm_stat.h =================================================================== --- sys/amd64/vmm/vmm_stat.h +++ sys/amd64/vmm/vmm_stat.h @@ -157,7 +157,7 @@ VMM_STAT_DECLARE(VMEXIT_UNKNOWN); VMM_STAT_DECLARE(VMEXIT_ASTPENDING); VMM_STAT_DECLARE(VMEXIT_USERSPACE); -VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); +VMM_STAT_DECLARE(VMEXIT_RUNBLOCK); VMM_STAT_DECLARE(VMEXIT_EXCEPTION); VMM_STAT_DECLARE(VMEXIT_REQIDLE); #endif Index: sys/amd64/vmm/vmm_stat.c =================================================================== --- sys/amd64/vmm/vmm_stat.c +++ sys/amd64/vmm/vmm_stat.c @@ -168,5 +168,5 @@ VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); -VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); +VMM_STAT(VMEXIT_RUNBLOCK, "number of times runblock at exit"); VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions");
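Putting the pieces together, this end-to-end userland sketch follows a vector from vlapic_tmr_set() through a vlapic_tmr_update()-style sync into the packed words that vmx_set_tmr() writes to the EOI-exit bitmap. The mini_vlapic structure is a stand-in (the real APIC page spaces tmr0..tmr7 at 16-byte intervals, which is why vlapic_tmr_update() steps through the page with a stride of four uint32_ts), and the saturating tmr_pending increment and per-backend ops hook are omitted for brevity.

#include <stdint.h>
#include <stdio.h>

#define	VLAPIC_TMR_CNT	8

/* Stand-in for the vlapic fields involved in TMR tracking. */
struct mini_vlapic {
	uint32_t tmr_pending;
	uint32_t tmr[VLAPIC_TMR_CNT];		/* lapic->tmr0 .. tmr7 */
	uint32_t tmr_vec_deassert[VLAPIC_TMR_CNT];
	uint32_t tmr_vec_assert[VLAPIC_TMR_CNT];
};

/* vIOAPIC side: record the desired trigger-mode state for a vector. */
static void
tmr_set(struct mini_vlapic *v, uint8_t vec, int active)
{
	const uint32_t idx = vec / 32, mask = 1u << (vec % 32);

	if (active) {
		v->tmr_vec_assert[idx] |= mask;
		v->tmr_vec_deassert[idx] &= ~mask;
	} else {
		v->tmr_vec_deassert[idx] |= mask;
		v->tmr_vec_assert[idx] &= ~mask;
	}
	v->tmr_pending++;
}

/* vCPU side: fold the pending changes in, then hand the result to the CPU. */
static void
tmr_update(struct mini_vlapic *v)
{
	uint64_t eoi_exit[4];
	unsigned int i;

	if (v->tmr_pending == 0)
		return;
	for (i = 0; i < VLAPIC_TMR_CNT; i++) {
		v->tmr[i] &= ~v->tmr_vec_deassert[i];
		v->tmr[i] |= v->tmr_vec_assert[i];
		v->tmr_vec_deassert[i] = v->tmr_vec_assert[i] = 0;
	}
	v->tmr_pending = 0;
	/* Equivalent of the four VMCS_EOI_EXITn writes in vmx_set_tmr(). */
	for (i = 0; i < 4; i++)
		eoi_exit[i] = ((uint64_t)v->tmr[i * 2 + 1] << 32) | v->tmr[i * 2];
	printf("EOI_EXIT1 = %#llx\n", (unsigned long long)eoi_exit[1]);
}

int
main(void)
{
	struct mini_vlapic v = { 0 };

	tmr_set(&v, 0x51, 1);	/* vIOAPIC marks vector 0x51 level-triggered */
	tmr_update(&v);		/* vCPU syncs the state on its next entry */
	return (0);
}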