diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c index 5ece5d433244..d959909d7cdc 100644 --- a/sys/amd64/vmm/io/vatpic.c +++ b/sys/amd64/vmm/io/vatpic.c @@ -1,852 +1,853 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vioapic.h" #include "vatpic.h" static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); #define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx)) #define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx)) #define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx)) enum irqstate { IRQSTATE_ASSERT, IRQSTATE_DEASSERT, IRQSTATE_PULSE }; struct atpic { bool ready; int icw_num; int rd_cmd_reg; bool aeoi; bool poll; bool rotate; bool sfn; /* special fully-nested mode */ int irq_base; uint8_t request; /* Interrupt Request Register (IIR) */ uint8_t service; /* Interrupt Service (ISR) */ uint8_t mask; /* Interrupt Mask Register (IMR) */ uint8_t smm; /* special mask mode */ int acnt[8]; /* sum of pin asserts and deasserts */ int lowprio; /* lowest priority irq */ bool intr_raised; }; struct vatpic { struct vm *vm; struct mtx mtx; struct atpic atpic[2]; uint8_t elc[2]; }; #define VATPIC_CTR0(vatpic, fmt) \ VM_CTR0((vatpic)->vm, fmt) #define VATPIC_CTR1(vatpic, fmt, a1) \ VM_CTR1((vatpic)->vm, fmt, a1) #define VATPIC_CTR2(vatpic, fmt, a1, a2) \ VM_CTR2((vatpic)->vm, fmt, a1, a2) #define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) #define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) /* * Loop over all the pins in priority order from highest to lowest. */ #define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ tmpvar < 8; \ tmpvar++, pinvar = (pinvar + 1) & 0x7) static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); static __inline bool master_atpic(struct vatpic *vatpic, struct atpic *atpic) { if (atpic == &vatpic->atpic[0]) return (true); else return (false); } static __inline int vatpic_get_highest_isrpin(struct atpic *atpic) { int bit, pin; int i; ATPIC_PIN_FOREACH(pin, atpic, i) { bit = (1 << pin); if (atpic->service & bit) { /* * An IS bit that is masked by an IMR bit will not be * cleared by a non-specific EOI in Special Mask Mode. */ if (atpic->smm && (atpic->mask & bit) != 0) continue; else return (pin); } } return (-1); } static __inline int vatpic_get_highest_irrpin(struct atpic *atpic) { int serviced; int bit, pin, tmp; /* * In 'Special Fully-Nested Mode' when an interrupt request from * a slave is in service, the slave is not locked out from the * master's priority logic. */ serviced = atpic->service; if (atpic->sfn) serviced &= ~(1 << 2); /* * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits * further interrupts at that level and enables interrupts from all * other levels that are not masked. In other words the ISR has no * bearing on the levels that can generate interrupts. */ if (atpic->smm) serviced = 0; ATPIC_PIN_FOREACH(pin, atpic, tmp) { bit = 1 << pin; /* * If there is already an interrupt in service at the same * or higher priority then bail. */ if ((serviced & bit) != 0) break; /* * If an interrupt is asserted and not masked then return * the corresponding 'pin' to the caller. */ if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) return (pin); } return (-1); } static void vatpic_notify_intr(struct vatpic *vatpic) { struct atpic *atpic; int pin; KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked")); /* * First check the slave. */ atpic = &vatpic->atpic[1]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * Cascade the request from the slave to the master. */ atpic->intr_raised = true; vatpic_set_pinstate(vatpic, 2, true); vatpic_set_pinstate(vatpic, 2, false); } else { VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } /* * Then check the master. */ atpic = &vatpic->atpic[0]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic master notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * From Section 3.6.2, "Interrupt Modes", in the * MPtable Specification, Version 1.4 * * PIC interrupts are routed to both the Local APIC * and the I/O APIC to support operation in 1 of 3 * modes. * * 1. Legacy PIC Mode: the PIC effectively bypasses * all APIC components. In this mode the local APIC is * disabled and LINT0 is reconfigured as INTR to * deliver the PIC interrupt directly to the CPU. * * 2. Virtual Wire Mode: the APIC is treated as a * virtual wire which delivers interrupts from the PIC * to the CPU. In this mode LINT0 is programmed as * ExtINT to indicate that the PIC is the source of * the interrupt. * * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are * fielded by the I/O APIC and delivered to the appropriate * CPU. In this mode the I/O APIC input 0 is programmed * as ExtINT to indicate that the PIC is the source of the * interrupt. */ atpic->intr_raised = true; lapic_set_local_intr(vatpic->vm, NULL, APIC_LVT_LINT0); vioapic_pulse_irq(vatpic->vm, 0); } else { VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } } static int vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); atpic->ready = false; atpic->icw_num = 1; atpic->request = 0; atpic->mask = 0; atpic->lowprio = 7; atpic->rd_cmd_reg = 0; atpic->poll = 0; atpic->smm = 0; if ((val & ICW1_SNGL) != 0) { VATPIC_CTR0(vatpic, "vatpic cascade mode required"); return (-1); } if ((val & ICW1_IC4) == 0) { VATPIC_CTR0(vatpic, "vatpic icw4 required"); return (-1); } atpic->icw_num++; return (0); } static int vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); atpic->irq_base = val & 0xf8; atpic->icw_num++; return (0); } static int vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); atpic->icw_num++; return (0); } static int vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); if ((val & ICW4_8086) == 0) { VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); return (-1); } if ((val & ICW4_AEOI) != 0) atpic->aeoi = true; if ((val & ICW4_SFNM) != 0) { if (master_atpic(vatpic, atpic)) { atpic->sfn = true; } else { VATPIC_CTR1(vatpic, "Ignoring special fully nested " "mode on slave atpic: %#x", val); } } atpic->icw_num = 0; atpic->ready = true; return (0); } static int vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); atpic->mask = val & 0xff; return (0); } static int vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); atpic->rotate = ((val & OCW2_R) != 0); if ((val & OCW2_EOI) != 0) { int isr_bit; if ((val & OCW2_SL) != 0) { /* specific EOI */ isr_bit = val & 0x7; } else { /* non-specific EOI */ isr_bit = vatpic_get_highest_isrpin(atpic); } if (isr_bit != -1) { atpic->service &= ~(1 << isr_bit); if (atpic->rotate) atpic->lowprio = isr_bit; } } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { /* specific priority */ atpic->lowprio = val & 0x7; } return (0); } static int vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); if (val & OCW3_ESMM) { atpic->smm = val & OCW3_SMM ? 1 : 0; VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", master_atpic(vatpic, atpic) ? "master" : "slave", atpic->smm ? "enabled" : "disabled"); } if (val & OCW3_RR) { /* read register command */ atpic->rd_cmd_reg = val & OCW3_RIS; /* Polling mode */ atpic->poll = ((val & OCW3_P) != 0); } return (0); } static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) { struct atpic *atpic; int oldcnt, newcnt; bool level; KASSERT(pin >= 0 && pin < 16, ("vatpic_set_pinstate: invalid pin number %d", pin)); KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_set_pinstate: vatpic is not locked")); atpic = &vatpic->atpic[pin >> 3]; oldcnt = atpic->acnt[pin & 0x7]; if (newstate) atpic->acnt[pin & 0x7]++; else atpic->acnt[pin & 0x7]--; newcnt = atpic->acnt[pin & 0x7]; if (newcnt < 0) { VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); } level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { /* rising edge or level */ VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); atpic->request |= (1 << (pin & 0x7)); } else if (oldcnt == 1 && newcnt == 0) { /* falling edge */ VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); if (level) atpic->request &= ~(1 << (pin & 0x7)); } else { VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", pin, newstate ? "asserted" : "deasserted", newcnt); } vatpic_notify_intr(vatpic); } static int vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) { struct vatpic *vatpic; struct atpic *atpic; if (irq < 0 || irq > 15) return (EINVAL); vatpic = vm_atpic(vm); atpic = &vatpic->atpic[irq >> 3]; if (atpic->ready == false) return (0); VATPIC_LOCK(vatpic); switch (irqstate) { case IRQSTATE_ASSERT: vatpic_set_pinstate(vatpic, irq, true); break; case IRQSTATE_DEASSERT: vatpic_set_pinstate(vatpic, irq, false); break; case IRQSTATE_PULSE: vatpic_set_pinstate(vatpic, irq, true); vatpic_set_pinstate(vatpic, irq, false); break; default: panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); } VATPIC_UNLOCK(vatpic); return (0); } int vatpic_assert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); } int vatpic_deassert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); } int vatpic_pulse_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) { struct vatpic *vatpic; if (irq < 0 || irq > 15) return (EINVAL); /* * See comment in vatpic_elc_handler. These IRQs must be * edge triggered. */ if (trigger == LEVEL_TRIGGER) { switch (irq) { case 0: case 1: case 2: case 8: case 13: return (EINVAL); } } vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); if (trigger == LEVEL_TRIGGER) vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); else vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); VATPIC_UNLOCK(vatpic); return (0); } void vatpic_pending_intr(struct vm *vm, int *vecptr) { struct vatpic *vatpic; struct atpic *atpic; int pin; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; VATPIC_LOCK(vatpic); pin = vatpic_get_highest_irrpin(atpic); if (pin == 2) { atpic = &vatpic->atpic[1]; pin = vatpic_get_highest_irrpin(atpic); } /* * If there are no pins active at this moment then return the spurious * interrupt vector instead. */ if (pin == -1) pin = 7; KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); *vecptr = atpic->irq_base + pin; VATPIC_UNLOCK(vatpic); } static void vatpic_pin_accepted(struct atpic *atpic, int pin) { atpic->intr_raised = false; if (atpic->acnt[pin] == 0) atpic->request &= ~(1 << pin); if (atpic->aeoi == true) { if (atpic->rotate == true) atpic->lowprio = pin; } else { atpic->service |= (1 << pin); } } void vatpic_intr_accepted(struct vm *vm, int vector) { struct vatpic *vatpic; int pin; vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); pin = vector & 0x7; if ((vector & ~0x7) == vatpic->atpic[1].irq_base) { vatpic_pin_accepted(&vatpic->atpic[1], pin); /* * If this vector originated from the slave, * accept the cascaded interrupt too. */ vatpic_pin_accepted(&vatpic->atpic[0], 2); } else { vatpic_pin_accepted(&vatpic->atpic[0], pin); } vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); } static int vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int pin; VATPIC_LOCK(vatpic); if (atpic->poll) { atpic->poll = 0; pin = vatpic_get_highest_irrpin(atpic); if (pin >= 0) { vatpic_pin_accepted(atpic, pin); *eax = 0x80 | pin; } else { *eax = 0; } } else { if (port & ICU_IMR_OFFSET) { /* read interrrupt mask register */ *eax = atpic->mask; } else { if (atpic->rd_cmd_reg == OCW3_RIS) { /* read interrupt service register */ *eax = atpic->service; } else { /* read interrupt request register */ *eax = atpic->request; } } } VATPIC_UNLOCK(vatpic); return (0); } static int vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int error; uint8_t val; error = 0; val = *eax; VATPIC_LOCK(vatpic); if (port & ICU_IMR_OFFSET) { switch (atpic->icw_num) { case 2: error = vatpic_icw2(vatpic, atpic, val); break; case 3: error = vatpic_icw3(vatpic, atpic, val); break; case 4: error = vatpic_icw4(vatpic, atpic, val); break; default: error = vatpic_ocw1(vatpic, atpic, val); break; } } else { if (val & (1 << 4)) error = vatpic_icw1(vatpic, atpic, val); if (atpic->ready) { if (val & (1 << 3)) error = vatpic_ocw3(vatpic, atpic, val); else error = vatpic_ocw2(vatpic, atpic, val); } } if (atpic->ready) vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); return (error); } int vatpic_master_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_slave_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[1]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_elc_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; bool is_master; vatpic = vm_atpic(vm); is_master = (port == IO_ELCR1); if (bytes != 1) return (-1); VATPIC_LOCK(vatpic); if (in) { if (is_master) *eax = vatpic->elc[0]; else *eax = vatpic->elc[1]; } else { /* * For the master PIC the cascade channel (IRQ2), the * heart beat timer (IRQ0), and the keyboard * controller (IRQ1) cannot be programmed for level * mode. * * For the slave PIC the real time clock (IRQ8) and * the floating point error interrupt (IRQ13) cannot * be programmed for level mode. */ if (is_master) vatpic->elc[0] = (*eax & 0xf8); else vatpic->elc[1] = (*eax & 0xde); } VATPIC_UNLOCK(vatpic); return (0); } struct vatpic * vatpic_init(struct vm *vm) { struct vatpic *vatpic; vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); vatpic->vm = vm; mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN); return (vatpic); } void vatpic_cleanup(struct vatpic *vatpic) { + mtx_destroy(&vatpic->mtx); free(vatpic, M_VATPIC); } #ifdef BHYVE_SNAPSHOT int vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta) { int ret; int i; struct atpic *atpic; for (i = 0; i < nitems(vatpic->atpic); i++) { atpic = &vatpic->atpic[i]; SNAPSHOT_VAR_OR_LEAVE(atpic->ready, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->icw_num, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->rd_cmd_reg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->aeoi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->poll, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->rotate, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->sfn, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->irq_base, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->request, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->service, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->mask, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->smm, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(atpic->acnt, sizeof(atpic->acnt), meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->lowprio, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->intr_raised, meta, ret, done); } SNAPSHOT_BUF_OR_LEAVE(vatpic->elc, sizeof(vatpic->elc), meta, ret, done); done: return (ret); } #endif diff --git a/sys/amd64/vmm/io/vatpit.c b/sys/amd64/vmm/io/vatpit.c index 7626b4dc4cc2..c58a6da66c1a 100644 --- a/sys/amd64/vmm/io/vatpit.c +++ b/sys/amd64/vmm/io/vatpit.c @@ -1,513 +1,514 @@ /*- * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2018 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include "vmm_ktr.h" #include "vatpic.h" #include "vioapic.h" #include "vatpit.h" static MALLOC_DEFINE(M_VATPIT, "atpit", "bhyve virtual atpit (8254)"); #define VATPIT_LOCK(vatpit) mtx_lock_spin(&((vatpit)->mtx)) #define VATPIT_UNLOCK(vatpit) mtx_unlock_spin(&((vatpit)->mtx)) #define VATPIT_LOCKED(vatpit) mtx_owned(&((vatpit)->mtx)) #define TIMER_SEL_MASK 0xc0 #define TIMER_RW_MASK 0x30 #define TIMER_MODE_MASK 0x0f #define TIMER_SEL_READBACK 0xc0 #define TIMER_STS_OUT 0x80 #define TIMER_STS_NULLCNT 0x40 #define TIMER_RB_LCTR 0x20 #define TIMER_RB_LSTATUS 0x10 #define TIMER_RB_CTR_2 0x08 #define TIMER_RB_CTR_1 0x04 #define TIMER_RB_CTR_0 0x02 #define TMR2_OUT_STS 0x20 #define PIT_8254_FREQ 1193182 #define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) struct vatpit_callout_arg { struct vatpit *vatpit; int channel_num; }; struct channel { int mode; uint16_t initial; /* initial counter value */ struct bintime now_bt; /* uptime when counter was loaded */ uint8_t cr[2]; uint8_t ol[2]; bool slatched; /* status latched */ uint8_t status; int crbyte; int olbyte; int frbyte; struct callout callout; struct bintime callout_bt; /* target time */ struct vatpit_callout_arg callout_arg; }; struct vatpit { struct vm *vm; struct mtx mtx; struct bintime freq_bt; struct channel channel[3]; }; static void pit_timer_start_cntr0(struct vatpit *vatpit); static uint64_t vatpit_delta_ticks(struct vatpit *vatpit, struct channel *c) { struct bintime delta; uint64_t result; binuptime(&delta); bintime_sub(&delta, &c->now_bt); result = delta.sec * PIT_8254_FREQ; result += delta.frac / vatpit->freq_bt.frac; return (result); } static int vatpit_get_out(struct vatpit *vatpit, int channel) { struct channel *c; uint64_t delta_ticks; int out; c = &vatpit->channel[channel]; switch (c->mode) { case TIMER_INTTC: delta_ticks = vatpit_delta_ticks(vatpit, c); out = (delta_ticks >= c->initial); break; default: out = 0; break; } return (out); } static void vatpit_callout_handler(void *a) { struct vatpit_callout_arg *arg = a; struct vatpit *vatpit; struct callout *callout; struct channel *c; vatpit = arg->vatpit; c = &vatpit->channel[arg->channel_num]; callout = &c->callout; VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num); VATPIT_LOCK(vatpit); if (callout_pending(callout)) /* callout was reset */ goto done; if (!callout_active(callout)) /* callout was stopped */ goto done; callout_deactivate(callout); if (c->mode == TIMER_RATEGEN) { pit_timer_start_cntr0(vatpit); } vatpic_pulse_irq(vatpit->vm, 0); vioapic_pulse_irq(vatpit->vm, 2); done: VATPIT_UNLOCK(vatpit); return; } static void pit_timer_start_cntr0(struct vatpit *vatpit) { struct channel *c; struct bintime now, delta; sbintime_t precision; c = &vatpit->channel[0]; if (c->initial != 0) { delta.sec = 0; delta.frac = vatpit->freq_bt.frac * c->initial; bintime_add(&c->callout_bt, &delta); precision = bttosbt(delta) >> tc_precexp; /* * Reset 'callout_bt' if the time that the callout * was supposed to fire is more than 'c->initial' * ticks in the past. */ binuptime(&now); if (bintime_cmp(&c->callout_bt, &now, <)) { c->callout_bt = now; bintime_add(&c->callout_bt, &delta); } callout_reset_sbt(&c->callout, bttosbt(c->callout_bt), precision, vatpit_callout_handler, &c->callout_arg, C_ABSOLUTE); } } static uint16_t pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) { uint16_t lval; uint64_t delta_ticks; /* cannot latch a new value until the old one has been consumed */ if (latch && c->olbyte != 0) return (0); if (c->initial == 0) { /* * This is possibly an o/s bug - reading the value of * the timer without having set up the initial value. * * The original user-space version of this code set * the timer to 100hz in this condition; do the same * here. */ c->initial = TIMER_DIV(PIT_8254_FREQ, 100); binuptime(&c->now_bt); c->status &= ~TIMER_STS_NULLCNT; } delta_ticks = vatpit_delta_ticks(vatpit, c); lval = c->initial - delta_ticks % c->initial; if (latch) { c->olbyte = 2; c->ol[1] = lval; /* LSB */ c->ol[0] = lval >> 8; /* MSB */ } return (lval); } static int pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd) { struct channel *c; c = &vatpit->channel[channel]; /* * Latch the count/status of the timer if not already latched. * N.B. that the count/status latch-select bits are active-low. */ if (!(cmd & TIMER_RB_LCTR) && !c->olbyte) { (void) pit_update_counter(vatpit, c, true); } if (!(cmd & TIMER_RB_LSTATUS) && !c->slatched) { c->slatched = true; /* * For mode 0, see if the elapsed time is greater * than the initial value - this results in the * output pin being set to 1 in the status byte. */ if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel)) c->status |= TIMER_STS_OUT; else c->status &= ~TIMER_STS_OUT; } return (0); } static int pit_readback(struct vatpit *vatpit, uint8_t cmd) { int error; /* * The readback command can apply to all timers. */ error = 0; if (cmd & TIMER_RB_CTR_0) error = pit_readback1(vatpit, 0, cmd); if (!error && cmd & TIMER_RB_CTR_1) error = pit_readback1(vatpit, 1, cmd); if (!error && cmd & TIMER_RB_CTR_2) error = pit_readback1(vatpit, 2, cmd); return (error); } static int vatpit_update_mode(struct vatpit *vatpit, uint8_t val) { struct channel *c; int sel, rw, mode; sel = val & TIMER_SEL_MASK; rw = val & TIMER_RW_MASK; mode = val & TIMER_MODE_MASK; if (sel == TIMER_SEL_READBACK) return (pit_readback(vatpit, val)); if (rw != TIMER_LATCH && rw != TIMER_16BIT) return (-1); if (rw != TIMER_LATCH) { /* * Counter mode is not affected when issuing a * latch command. */ if (mode != TIMER_INTTC && mode != TIMER_RATEGEN && mode != TIMER_SQWAVE && mode != TIMER_SWSTROBE) return (-1); } c = &vatpit->channel[sel >> 6]; if (rw == TIMER_LATCH) pit_update_counter(vatpit, c, true); else { c->mode = mode; c->olbyte = 0; /* reset latch after reprogramming */ c->status |= TIMER_STS_NULLCNT; } return (0); } int vatpit_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *eax) { struct vatpit *vatpit; struct channel *c; uint8_t val; int error; vatpit = vm_atpit(vm); if (bytes != 1) return (-1); val = *eax; if (port == TIMER_MODE) { if (in) { VM_CTR0(vatpit->vm, "vatpit attempt to read mode"); return (-1); } VATPIT_LOCK(vatpit); error = vatpit_update_mode(vatpit, val); VATPIT_UNLOCK(vatpit); return (error); } /* counter ports */ KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2, ("invalid port 0x%x", port)); c = &vatpit->channel[port - TIMER_CNTR0]; VATPIT_LOCK(vatpit); if (in && c->slatched) { /* * Return the status byte if latched */ *eax = c->status; c->slatched = false; c->status = 0; } else if (in) { /* * The spec says that once the output latch is completely * read it should revert to "following" the counter. Use * the free running counter for this case (i.e. Linux * TSC calibration). Assuming the access mode is 16-bit, * toggle the MSB/LSB bit on each read. */ if (c->olbyte == 0) { uint16_t tmp; tmp = pit_update_counter(vatpit, c, false); if (c->frbyte) tmp >>= 8; tmp &= 0xff; *eax = tmp; c->frbyte ^= 1; } else *eax = c->ol[--c->olbyte]; } else { c->cr[c->crbyte++] = *eax; if (c->crbyte == 2) { c->status &= ~TIMER_STS_NULLCNT; c->frbyte = 0; c->crbyte = 0; c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; binuptime(&c->now_bt); /* Start an interval timer for channel 0 */ if (port == TIMER_CNTR0) { c->callout_bt = c->now_bt; pit_timer_start_cntr0(vatpit); } if (c->initial == 0) c->initial = 0xffff; } } VATPIT_UNLOCK(vatpit); return (0); } int vatpit_nmisc_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *eax) { struct vatpit *vatpit; vatpit = vm_atpit(vm); if (in) { VATPIT_LOCK(vatpit); if (vatpit_get_out(vatpit, 2)) *eax = TMR2_OUT_STS; else *eax = 0; VATPIT_UNLOCK(vatpit); } return (0); } struct vatpit * vatpit_init(struct vm *vm) { struct vatpit *vatpit; struct vatpit_callout_arg *arg; int i; vatpit = malloc(sizeof(struct vatpit), M_VATPIT, M_WAITOK | M_ZERO); vatpit->vm = vm; mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN); FREQ2BT(PIT_8254_FREQ, &vatpit->freq_bt); for (i = 0; i < 3; i++) { callout_init(&vatpit->channel[i].callout, 1); arg = &vatpit->channel[i].callout_arg; arg->vatpit = vatpit; arg->channel_num = i; } return (vatpit); } void vatpit_cleanup(struct vatpit *vatpit) { int i; for (i = 0; i < 3; i++) callout_drain(&vatpit->channel[i].callout); + mtx_destroy(&vatpit->mtx); free(vatpit, M_VATPIT); } #ifdef BHYVE_SNAPSHOT int vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta) { int ret; int i; struct channel *channel; SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.sec, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.frac, meta, ret, done); /* properly restore timers; they will NOT work currently */ printf("%s: snapshot restore does not reset timers!\r\n", __func__); for (i = 0; i < nitems(vatpit->channel); i++) { channel = &vatpit->channel[i]; SNAPSHOT_VAR_OR_LEAVE(channel->mode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(channel->initial, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.sec, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.frac, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(channel->cr, sizeof(channel->cr), meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(channel->ol, sizeof(channel->ol), meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(channel->slatched, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(channel->status, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(channel->crbyte, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(channel->frbyte, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.sec, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.frac, meta, ret, done); } done: return (ret); } #endif diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c index dd409cde188f..7e73684ebad2 100644 --- a/sys/amd64/vmm/io/vhpet.c +++ b/sys/amd64/vmm/io/vhpet.c @@ -1,812 +1,813 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vatpic.h" #include "vioapic.h" #include "vhpet.h" #include "vmm_ktr.h" static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); #define HPET_FREQ 16777216 /* 16.7 (2^24) Mhz */ #define FS_PER_S 1000000000000000ul /* Timer N Configuration and Capabilities Register */ #define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \ HPET_TCAP_FSB_INT_DEL | \ HPET_TCAP_SIZE | \ HPET_TCAP_PER_INT) /* * HPET requires at least 3 timers and up to 32 timers per block. */ #define VHPET_NUM_TIMERS 8 CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32); struct vhpet_callout_arg { struct vhpet *vhpet; int timer_num; }; struct vhpet { struct vm *vm; struct mtx mtx; sbintime_t freq_sbt; uint64_t config; /* Configuration */ uint64_t isr; /* Interrupt Status */ uint32_t countbase; /* HPET counter base value */ sbintime_t countbase_sbt; /* uptime corresponding to base value */ struct { uint64_t cap_config; /* Configuration */ uint64_t msireg; /* FSB interrupt routing */ uint32_t compval; /* Comparator */ uint32_t comprate; struct callout callout; sbintime_t callout_sbt; /* time when counter==compval */ struct vhpet_callout_arg arg; } timer[VHPET_NUM_TIMERS]; }; #define VHPET_LOCK(vhp) mtx_lock(&((vhp)->mtx)) #define VHPET_UNLOCK(vhp) mtx_unlock(&((vhp)->mtx)) static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now); static uint64_t vhpet_capabilities(void) { uint64_t cap = 0; cap |= 0x8086 << 16; /* vendor id */ cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ cap |= 1; /* revision */ cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ cap &= 0xffffffff; cap |= (FS_PER_S / HPET_FREQ) << 32; /* tick period in fs */ return (cap); } static __inline bool vhpet_counter_enabled(struct vhpet *vhpet) { return ((vhpet->config & HPET_CNF_ENABLE) ? true : false); } static __inline bool vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) { const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) return (true); else return (false); } static __inline int vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) { /* * If the timer is configured to use MSI then treat it as if the * timer is not connected to the ioapic. */ if (vhpet_timer_msi_enabled(vhpet, n)) return (0); return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); } static uint32_t vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) { uint32_t val; sbintime_t now, delta; val = vhpet->countbase; if (vhpet_counter_enabled(vhpet)) { now = sbinuptime(); delta = now - vhpet->countbase_sbt; KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: " "%#lx to %#lx", vhpet->countbase_sbt, now)); val += delta / vhpet->freq_sbt; if (nowptr != NULL) *nowptr = now; } else { /* * The sbinuptime corresponding to the 'countbase' is * meaningless when the counter is disabled. Make sure * that the caller doesn't want to use it. */ KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL")); } return (val); } static void vhpet_timer_clear_isr(struct vhpet *vhpet, int n) { int pin; if (vhpet->isr & (1 << n)) { pin = vhpet_timer_ioapic_pin(vhpet, n); KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); vioapic_deassert_irq(vhpet->vm, pin); vhpet->isr &= ~(1 << n); } } static __inline bool vhpet_periodic_timer(struct vhpet *vhpet, int n) { return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0); } static __inline bool vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n) { return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0); } static __inline bool vhpet_timer_edge_trig(struct vhpet *vhpet, int n) { KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " "timer %d is using MSI", n)); if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) return (true); else return (false); } static void vhpet_timer_interrupt(struct vhpet *vhpet, int n) { int pin; /* If interrupts are not enabled for this timer then just return. */ if (!vhpet_timer_interrupt_enabled(vhpet, n)) return; /* * If a level triggered interrupt is already asserted then just return. */ if ((vhpet->isr & (1 << n)) != 0) { VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n); return; } if (vhpet_timer_msi_enabled(vhpet, n)) { lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32, vhpet->timer[n].msireg & 0xffffffff); return; } pin = vhpet_timer_ioapic_pin(vhpet, n); if (pin == 0) { VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n); return; } if (vhpet_timer_edge_trig(vhpet, n)) { vioapic_pulse_irq(vhpet->vm, pin); } else { vhpet->isr |= 1 << n; vioapic_assert_irq(vhpet->vm, pin); } } static void vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter) { uint32_t compval, comprate, compnext; KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n)); compval = vhpet->timer[n].compval; comprate = vhpet->timer[n].comprate; /* * Calculate the comparator value to be used for the next periodic * interrupt. * * This function is commonly called from the callout handler. * In this scenario the 'counter' is ahead of 'compval'. To find * the next value to program into the accumulator we divide the * number space between 'compval' and 'counter' into 'comprate' * sized units. The 'compval' is rounded up such that is "ahead" * of 'counter'. */ compnext = compval + ((counter - compval) / comprate + 1) * comprate; vhpet->timer[n].compval = compnext; } static void vhpet_handler(void *a) { int n; uint32_t counter; sbintime_t now; struct vhpet *vhpet; struct callout *callout; struct vhpet_callout_arg *arg; arg = a; vhpet = arg->vhpet; n = arg->timer_num; callout = &vhpet->timer[n].callout; VM_CTR1(vhpet->vm, "hpet t%d fired", n); VHPET_LOCK(vhpet); if (callout_pending(callout)) /* callout was reset */ goto done; if (!callout_active(callout)) /* callout was stopped */ goto done; callout_deactivate(callout); if (!vhpet_counter_enabled(vhpet)) panic("vhpet(%p) callout with counter disabled", vhpet); counter = vhpet_counter(vhpet, &now); vhpet_start_timer(vhpet, n, counter, now); vhpet_timer_interrupt(vhpet, n); done: VHPET_UNLOCK(vhpet); return; } static void vhpet_stop_timer(struct vhpet *vhpet, int n, sbintime_t now) { VM_CTR1(vhpet->vm, "hpet t%d stopped", n); callout_stop(&vhpet->timer[n].callout); /* * If the callout was scheduled to expire in the past but hasn't * had a chance to execute yet then trigger the timer interrupt * here. Failing to do so will result in a missed timer interrupt * in the guest. This is especially bad in one-shot mode because * the next interrupt has to wait for the counter to wrap around. */ if (vhpet->timer[n].callout_sbt < now) { VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after " "stopping timer", n); vhpet_timer_interrupt(vhpet, n); } } static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) { sbintime_t delta, precision; if (vhpet->timer[n].comprate != 0) vhpet_adjust_compval(vhpet, n, counter); else { /* * In one-shot mode it is the guest's responsibility to make * sure that the comparator value is not in the "past". The * hardware doesn't have any belt-and-suspenders to deal with * this so we don't either. */ } delta = (vhpet->timer[n].compval - counter) * vhpet->freq_sbt; precision = delta >> tc_precexp; vhpet->timer[n].callout_sbt = now + delta; callout_reset_sbt(&vhpet->timer[n].callout, vhpet->timer[n].callout_sbt, precision, vhpet_handler, &vhpet->timer[n].arg, C_ABSOLUTE); } static void vhpet_start_counting(struct vhpet *vhpet) { int i; vhpet->countbase_sbt = sbinuptime(); for (i = 0; i < VHPET_NUM_TIMERS; i++) { /* * Restart the timers based on the value of the main counter * when it stopped counting. */ vhpet_start_timer(vhpet, i, vhpet->countbase, vhpet->countbase_sbt); } } static void vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, sbintime_t now) { int i; vhpet->countbase = counter; for (i = 0; i < VHPET_NUM_TIMERS; i++) vhpet_stop_timer(vhpet, i, now); } static __inline void update_register(uint64_t *regptr, uint64_t data, uint64_t mask) { *regptr &= ~mask; *regptr |= (data & mask); } static void vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data, uint64_t mask) { bool clear_isr; int old_pin, new_pin; uint32_t allowed_irqs; uint64_t oldval, newval; if (vhpet_timer_msi_enabled(vhpet, n) || vhpet_timer_edge_trig(vhpet, n)) { if (vhpet->isr & (1 << n)) panic("vhpet timer %d isr should not be asserted", n); } old_pin = vhpet_timer_ioapic_pin(vhpet, n); oldval = vhpet->timer[n].cap_config; newval = oldval; update_register(&newval, data, mask); newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE); newval |= oldval & HPET_TCAP_RO_MASK; if (newval == oldval) return; vhpet->timer[n].cap_config = newval; VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016x", n, newval); /* * Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field. * If it does not match the bits set in HPET_TCAP_INT_ROUTE then set * it to the default value of 0. */ allowed_irqs = vhpet->timer[n].cap_config >> 32; new_pin = vhpet_timer_ioapic_pin(vhpet, n); if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) { VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, " "allowed_irqs 0x%08x", n, new_pin, allowed_irqs); new_pin = 0; vhpet->timer[n].cap_config &= ~HPET_TCNF_INT_ROUTE; } if (!vhpet_periodic_timer(vhpet, n)) vhpet->timer[n].comprate = 0; /* * If the timer's ISR bit is set then clear it in the following cases: * - interrupt is disabled * - interrupt type is changed from level to edge or fsb. * - interrupt routing is changed * * This is to ensure that this timer's level triggered interrupt does * not remain asserted forever. */ if (vhpet->isr & (1 << n)) { KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d", n, old_pin)); if (!vhpet_timer_interrupt_enabled(vhpet, n)) clear_isr = true; else if (vhpet_timer_msi_enabled(vhpet, n)) clear_isr = true; else if (vhpet_timer_edge_trig(vhpet, n)) clear_isr = true; else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin) clear_isr = true; else clear_isr = false; if (clear_isr) { VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to " "configuration change", n); vioapic_deassert_irq(vhpet->vm, old_pin); vhpet->isr &= ~(1 << n); } } } int vhpet_mmio_write(struct vcpu *vcpu, uint64_t gpa, uint64_t val, int size, void *arg) { struct vhpet *vhpet; uint64_t data, mask, oldval, val64; uint32_t isr_clear_mask, old_compval, old_comprate, counter; sbintime_t now, *nowptr; int i, offset; vhpet = vm_hpet(vcpu_vm(vcpu)); offset = gpa - VHPET_BASE; VHPET_LOCK(vhpet); /* Accesses to the HPET should be 4 or 8 bytes wide */ switch (size) { case 8: mask = 0xffffffffffffffff; data = val; break; case 4: mask = 0xffffffff; data = val; if ((offset & 0x4) != 0) { mask <<= 32; data <<= 32; } break; default: VM_CTR2(vhpet->vm, "hpet invalid mmio write: " "offset 0x%08x, size %d", offset, size); goto done; } /* Access to the HPET should be naturally aligned to its width */ if (offset & (size - 1)) { VM_CTR2(vhpet->vm, "hpet invalid mmio write: " "offset 0x%08x, size %d", offset, size); goto done; } if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { /* * Get the most recent value of the counter before updating * the 'config' register. If the HPET is going to be disabled * then we need to update 'countbase' with the value right * before it is disabled. */ nowptr = vhpet_counter_enabled(vhpet) ? &now : NULL; counter = vhpet_counter(vhpet, nowptr); oldval = vhpet->config; update_register(&vhpet->config, data, mask); /* * LegacyReplacement Routing is not supported so clear the * bit explicitly. */ vhpet->config &= ~HPET_CNF_LEG_RT; if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { if (vhpet_counter_enabled(vhpet)) { vhpet_start_counting(vhpet); VM_CTR0(vhpet->vm, "hpet enabled"); } else { vhpet_stop_counting(vhpet, counter, now); VM_CTR0(vhpet->vm, "hpet disabled"); } } goto done; } if (offset == HPET_ISR || offset == HPET_ISR + 4) { isr_clear_mask = vhpet->isr & data; for (i = 0; i < VHPET_NUM_TIMERS; i++) { if ((isr_clear_mask & (1 << i)) != 0) { VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i); vhpet_timer_clear_isr(vhpet, i); } } goto done; } if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { /* Zero-extend the counter to 64-bits before updating it */ val64 = vhpet_counter(vhpet, NULL); update_register(&val64, data, mask); vhpet->countbase = val64; if (vhpet_counter_enabled(vhpet)) vhpet_start_counting(vhpet); goto done; } for (i = 0; i < VHPET_NUM_TIMERS; i++) { if (offset == HPET_TIMER_CAP_CNF(i) || offset == HPET_TIMER_CAP_CNF(i) + 4) { vhpet_timer_update_config(vhpet, i, data, mask); break; } if (offset == HPET_TIMER_COMPARATOR(i) || offset == HPET_TIMER_COMPARATOR(i) + 4) { old_compval = vhpet->timer[i].compval; old_comprate = vhpet->timer[i].comprate; if (vhpet_periodic_timer(vhpet, i)) { /* * In periodic mode writes to the comparator * change the 'compval' register only if the * HPET_TCNF_VAL_SET bit is set in the config * register. */ val64 = vhpet->timer[i].comprate; update_register(&val64, data, mask); vhpet->timer[i].comprate = val64; if ((vhpet->timer[i].cap_config & HPET_TCNF_VAL_SET) != 0) { vhpet->timer[i].compval = val64; } } else { KASSERT(vhpet->timer[i].comprate == 0, ("vhpet one-shot timer %d has invalid " "rate %u", i, vhpet->timer[i].comprate)); val64 = vhpet->timer[i].compval; update_register(&val64, data, mask); vhpet->timer[i].compval = val64; } vhpet->timer[i].cap_config &= ~HPET_TCNF_VAL_SET; if (vhpet->timer[i].compval != old_compval || vhpet->timer[i].comprate != old_comprate) { if (vhpet_counter_enabled(vhpet)) { counter = vhpet_counter(vhpet, &now); vhpet_start_timer(vhpet, i, counter, now); } } break; } if (offset == HPET_TIMER_FSB_VAL(i) || offset == HPET_TIMER_FSB_ADDR(i)) { update_register(&vhpet->timer[i].msireg, data, mask); break; } } done: VHPET_UNLOCK(vhpet); return (0); } int vhpet_mmio_read(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) { int i, offset; struct vhpet *vhpet; uint64_t data; vhpet = vm_hpet(vcpu_vm(vcpu)); offset = gpa - VHPET_BASE; VHPET_LOCK(vhpet); /* Accesses to the HPET should be 4 or 8 bytes wide */ if (size != 4 && size != 8) { VM_CTR2(vhpet->vm, "hpet invalid mmio read: " "offset 0x%08x, size %d", offset, size); data = 0; goto done; } /* Access to the HPET should be naturally aligned to its width */ if (offset & (size - 1)) { VM_CTR2(vhpet->vm, "hpet invalid mmio read: " "offset 0x%08x, size %d", offset, size); data = 0; goto done; } if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) { data = vhpet_capabilities(); goto done; } if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { data = vhpet->config; goto done; } if (offset == HPET_ISR || offset == HPET_ISR + 4) { data = vhpet->isr; goto done; } if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { data = vhpet_counter(vhpet, NULL); goto done; } for (i = 0; i < VHPET_NUM_TIMERS; i++) { if (offset == HPET_TIMER_CAP_CNF(i) || offset == HPET_TIMER_CAP_CNF(i) + 4) { data = vhpet->timer[i].cap_config; break; } if (offset == HPET_TIMER_COMPARATOR(i) || offset == HPET_TIMER_COMPARATOR(i) + 4) { data = vhpet->timer[i].compval; break; } if (offset == HPET_TIMER_FSB_VAL(i) || offset == HPET_TIMER_FSB_ADDR(i)) { data = vhpet->timer[i].msireg; break; } } if (i >= VHPET_NUM_TIMERS) data = 0; done: VHPET_UNLOCK(vhpet); if (size == 4) { if (offset & 0x4) data >>= 32; } *rval = data; return (0); } struct vhpet * vhpet_init(struct vm *vm) { int i, pincount; struct vhpet *vhpet; uint64_t allowed_irqs; struct vhpet_callout_arg *arg; struct bintime bt; vhpet = malloc(sizeof(struct vhpet), M_VHPET, M_WAITOK | M_ZERO); vhpet->vm = vm; mtx_init(&vhpet->mtx, "vhpet lock", NULL, MTX_DEF); FREQ2BT(HPET_FREQ, &bt); vhpet->freq_sbt = bttosbt(bt); pincount = vioapic_pincount(vm); if (pincount >= 32) allowed_irqs = 0xff000000; /* irqs 24-31 */ else if (pincount >= 20) allowed_irqs = 0xf << (pincount - 4); /* 4 upper irqs */ else allowed_irqs = 0; /* * Initialize HPET timer hardware state. */ for (i = 0; i < VHPET_NUM_TIMERS; i++) { vhpet->timer[i].cap_config = allowed_irqs << 32; vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT; vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL; vhpet->timer[i].compval = 0xffffffff; callout_init(&vhpet->timer[i].callout, 1); arg = &vhpet->timer[i].arg; arg->vhpet = vhpet; arg->timer_num = i; } return (vhpet); } void vhpet_cleanup(struct vhpet *vhpet) { int i; for (i = 0; i < VHPET_NUM_TIMERS; i++) callout_drain(&vhpet->timer[i].callout); + mtx_destroy(&vhpet->mtx); free(vhpet, M_VHPET); } int vhpet_getcap(struct vm_hpet_cap *cap) { cap->capabilities = vhpet_capabilities(); return (0); } #ifdef BHYVE_SNAPSHOT int vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta) { int i, ret; uint32_t countbase; SNAPSHOT_VAR_OR_LEAVE(vhpet->freq_sbt, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vhpet->config, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vhpet->isr, meta, ret, done); /* at restore time the countbase should have the value it had when the * snapshot was created; since the value is not directly kept in * vhpet->countbase, but rather computed relative to the current system * uptime using countbase_sbt, save the value retured by vhpet_counter */ if (meta->op == VM_SNAPSHOT_SAVE) countbase = vhpet_counter(vhpet, NULL); SNAPSHOT_VAR_OR_LEAVE(countbase, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) vhpet->countbase = countbase; for (i = 0; i < nitems(vhpet->timer); i++) { SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].cap_config, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].msireg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].compval, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].comprate, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].callout_sbt, meta, ret, done); } done: return (ret); } int vhpet_restore_time(struct vhpet *vhpet) { if (vhpet_counter_enabled(vhpet)) vhpet_start_counting(vhpet); return (0); } #endif diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c index e41b5acac920..00a206fa604d 100644 --- a/sys/amd64/vmm/io/vioapic.c +++ b/sys/amd64/vmm/io/vioapic.c @@ -1,538 +1,539 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vlapic.h" #include "vioapic.h" #define IOREGSEL 0x00 #define IOWIN 0x10 #define REDIR_ENTRIES 32 #define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) struct vioapic { struct vm *vm; struct mtx mtx; uint32_t id; uint32_t ioregsel; struct { uint64_t reg; int acnt; /* sum of pin asserts (+1) and deasserts (-1) */ } rtbl[REDIR_ENTRIES]; }; #define VIOAPIC_LOCK(vioapic) mtx_lock_spin(&((vioapic)->mtx)) #define VIOAPIC_UNLOCK(vioapic) mtx_unlock_spin(&((vioapic)->mtx)) #define VIOAPIC_LOCKED(vioapic) mtx_owned(&((vioapic)->mtx)) static MALLOC_DEFINE(M_VIOAPIC, "vioapic", "bhyve virtual ioapic"); #define VIOAPIC_CTR1(vioapic, fmt, a1) \ VM_CTR1((vioapic)->vm, fmt, a1) #define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \ VM_CTR2((vioapic)->vm, fmt, a1, a2) #define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \ VM_CTR3((vioapic)->vm, fmt, a1, a2, a3) #define VIOAPIC_CTR4(vioapic, fmt, a1, a2, a3, a4) \ VM_CTR4((vioapic)->vm, fmt, a1, a2, a3, a4) #ifdef KTR static const char * pinstate_str(bool asserted) { if (asserted) return ("asserted"); else return ("deasserted"); } #endif static void vioapic_send_intr(struct vioapic *vioapic, int pin) { int vector, delmode; uint32_t low, high, dest; bool level, phys; KASSERT(pin >= 0 && pin < REDIR_ENTRIES, ("vioapic_set_pinstate: invalid pin number %d", pin)); KASSERT(VIOAPIC_LOCKED(vioapic), ("vioapic_set_pinstate: vioapic is not locked")); low = vioapic->rtbl[pin].reg; high = vioapic->rtbl[pin].reg >> 32; if ((low & IOART_INTMASK) == IOART_INTMSET) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin); return; } phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); delmode = low & IOART_DELMOD; level = low & IOART_TRGRLVL ? true : false; if (level) { if ((low & IOART_REM_IRR) != 0) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: irr pending", pin); return; } vioapic->rtbl[pin].reg |= IOART_REM_IRR; } vector = low & IOART_INTVEC; dest = high >> APIC_ID_SHIFT; vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector); } static void vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate) { int oldcnt, newcnt; bool needintr; KASSERT(pin >= 0 && pin < REDIR_ENTRIES, ("vioapic_set_pinstate: invalid pin number %d", pin)); KASSERT(VIOAPIC_LOCKED(vioapic), ("vioapic_set_pinstate: vioapic is not locked")); oldcnt = vioapic->rtbl[pin].acnt; if (newstate) vioapic->rtbl[pin].acnt++; else vioapic->rtbl[pin].acnt--; newcnt = vioapic->rtbl[pin].acnt; if (newcnt < 0) { VIOAPIC_CTR2(vioapic, "ioapic pin%d: bad acnt %d", pin, newcnt); } needintr = false; if (oldcnt == 0 && newcnt == 1) { needintr = true; VIOAPIC_CTR1(vioapic, "ioapic pin%d: asserted", pin); } else if (oldcnt == 1 && newcnt == 0) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: deasserted", pin); } else { VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s, ignored, acnt %d", pin, pinstate_str(newstate), newcnt); } if (needintr) vioapic_send_intr(vioapic, pin); } enum irqstate { IRQSTATE_ASSERT, IRQSTATE_DEASSERT, IRQSTATE_PULSE }; static int vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) { struct vioapic *vioapic; if (irq < 0 || irq >= REDIR_ENTRIES) return (EINVAL); vioapic = vm_ioapic(vm); VIOAPIC_LOCK(vioapic); switch (irqstate) { case IRQSTATE_ASSERT: vioapic_set_pinstate(vioapic, irq, true); break; case IRQSTATE_DEASSERT: vioapic_set_pinstate(vioapic, irq, false); break; case IRQSTATE_PULSE: vioapic_set_pinstate(vioapic, irq, true); vioapic_set_pinstate(vioapic, irq, false); break; default: panic("vioapic_set_irqstate: invalid irqstate %d", irqstate); } VIOAPIC_UNLOCK(vioapic); return (0); } int vioapic_assert_irq(struct vm *vm, int irq) { return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); } int vioapic_deassert_irq(struct vm *vm, int irq) { return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); } int vioapic_pulse_irq(struct vm *vm, int irq) { return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } /* * Reset the vlapic's trigger-mode register to reflect the ioapic pin * configuration. */ static void vioapic_update_tmr(struct vcpu *vcpu, void *arg) { struct vioapic *vioapic; struct vlapic *vlapic; uint32_t low, high, dest; int delmode, pin, vector; bool level, phys; vlapic = vm_lapic(vcpu); vioapic = vm_ioapic(vcpu_vm(vcpu)); VIOAPIC_LOCK(vioapic); /* * Reset all vectors to be edge-triggered. */ vlapic_reset_tmr(vlapic); for (pin = 0; pin < REDIR_ENTRIES; pin++) { low = vioapic->rtbl[pin].reg; high = vioapic->rtbl[pin].reg >> 32; level = low & IOART_TRGRLVL ? true : false; if (!level) continue; /* * For a level-triggered 'pin' let the vlapic figure out if * an assertion on this 'pin' would result in an interrupt * being delivered to it. If yes, then it will modify the * TMR bit associated with this vector to level-triggered. */ phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); delmode = low & IOART_DELMOD; vector = low & IOART_INTVEC; dest = high >> APIC_ID_SHIFT; vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); } VIOAPIC_UNLOCK(vioapic); } static uint32_t vioapic_read(struct vioapic *vioapic, struct vcpu *vcpu, uint32_t addr) { int regnum, pin, rshift; regnum = addr & 0xff; switch (regnum) { case IOAPIC_ID: return (vioapic->id); break; case IOAPIC_VER: return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11); break; case IOAPIC_ARB: return (vioapic->id); break; default: break; } /* redirection table entries */ if (regnum >= IOAPIC_REDTBL && regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { pin = (regnum - IOAPIC_REDTBL) / 2; if ((regnum - IOAPIC_REDTBL) % 2) rshift = 32; else rshift = 0; return (vioapic->rtbl[pin].reg >> rshift); } return (0); } static void vioapic_write(struct vioapic *vioapic, struct vcpu *vcpu, uint32_t addr, uint32_t data) { uint64_t data64, mask64; uint64_t last, changed; int regnum, pin, lshift; cpuset_t allvcpus; regnum = addr & 0xff; switch (regnum) { case IOAPIC_ID: vioapic->id = data & APIC_ID_MASK; break; case IOAPIC_VER: case IOAPIC_ARB: /* readonly */ break; default: break; } /* redirection table entries */ if (regnum >= IOAPIC_REDTBL && regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { pin = (regnum - IOAPIC_REDTBL) / 2; if ((regnum - IOAPIC_REDTBL) % 2) lshift = 32; else lshift = 0; last = vioapic->rtbl[pin].reg; data64 = (uint64_t)data << lshift; mask64 = (uint64_t)0xffffffff << lshift; vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS; /* * Switching from level to edge triggering will clear the IRR * bit. This is what FreeBSD will do in order to EOI an * interrupt when the IO-APIC doesn't support targeted EOI (see * _ioapic_eoi_source). */ if ((vioapic->rtbl[pin].reg & IOART_TRGRMOD) == IOART_TRGREDG && (vioapic->rtbl[pin].reg & IOART_REM_IRR) != 0) vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx", pin, vioapic->rtbl[pin].reg); /* * If any fields in the redirection table entry (except mask * or polarity) have changed then rendezvous all the vcpus * to update their vlapic trigger-mode registers. */ changed = last ^ vioapic->rtbl[pin].reg; if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " "vlapic trigger-mode register", pin); VIOAPIC_UNLOCK(vioapic); allvcpus = vm_active_cpus(vioapic->vm); (void)vm_smp_rendezvous(vcpu, allvcpus, vioapic_update_tmr, NULL); VIOAPIC_LOCK(vioapic); } /* * Generate an interrupt if the following conditions are met: * - pin trigger mode is level * - pin level is asserted */ if ((vioapic->rtbl[pin].reg & IOART_TRGRMOD) == IOART_TRGRLVL && (vioapic->rtbl[pin].acnt > 0)) { VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl " "write, acnt %d", pin, vioapic->rtbl[pin].acnt); vioapic_send_intr(vioapic, pin); } } } static int vioapic_mmio_rw(struct vioapic *vioapic, struct vcpu *vcpu, uint64_t gpa, uint64_t *data, int size, bool doread) { uint64_t offset; offset = gpa - VIOAPIC_BASE; /* * The IOAPIC specification allows 32-bit wide accesses to the * IOREGSEL (offset 0) and IOWIN (offset 16) registers. */ if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { if (doread) *data = 0; return (0); } VIOAPIC_LOCK(vioapic); if (offset == IOREGSEL) { if (doread) *data = vioapic->ioregsel; else vioapic->ioregsel = *data; } else { if (doread) { *data = vioapic_read(vioapic, vcpu, vioapic->ioregsel); } else { vioapic_write(vioapic, vcpu, vioapic->ioregsel, *data); } } VIOAPIC_UNLOCK(vioapic); return (0); } int vioapic_mmio_read(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) { int error; struct vioapic *vioapic; vioapic = vm_ioapic(vcpu_vm(vcpu)); error = vioapic_mmio_rw(vioapic, vcpu, gpa, rval, size, true); return (error); } int vioapic_mmio_write(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) { int error; struct vioapic *vioapic; vioapic = vm_ioapic(vcpu_vm(vcpu)); error = vioapic_mmio_rw(vioapic, vcpu, gpa, &wval, size, false); return (error); } void vioapic_process_eoi(struct vm *vm, int vector) { struct vioapic *vioapic; int pin; KASSERT(vector >= 0 && vector < 256, ("vioapic_process_eoi: invalid vector %d", vector)); vioapic = vm_ioapic(vm); VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector); /* * XXX keep track of the pins associated with this vector instead * of iterating on every single pin each time. */ VIOAPIC_LOCK(vioapic); for (pin = 0; pin < REDIR_ENTRIES; pin++) { if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0) continue; if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector) continue; vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; if (vioapic->rtbl[pin].acnt > 0) { VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, " "acnt %d", pin, vioapic->rtbl[pin].acnt); vioapic_send_intr(vioapic, pin); } } VIOAPIC_UNLOCK(vioapic); } struct vioapic * vioapic_init(struct vm *vm) { int i; struct vioapic *vioapic; vioapic = malloc(sizeof(struct vioapic), M_VIOAPIC, M_WAITOK | M_ZERO); vioapic->vm = vm; mtx_init(&vioapic->mtx, "vioapic lock", NULL, MTX_SPIN); /* Initialize all redirection entries to mask all interrupts */ for (i = 0; i < REDIR_ENTRIES; i++) vioapic->rtbl[i].reg = 0x0001000000010000UL; return (vioapic); } void vioapic_cleanup(struct vioapic *vioapic) { + mtx_destroy(&vioapic->mtx); free(vioapic, M_VIOAPIC); } int vioapic_pincount(struct vm *vm) { return (REDIR_ENTRIES); } #ifdef BHYVE_SNAPSHOT int vioapic_snapshot(struct vioapic *vioapic, struct vm_snapshot_meta *meta) { int ret; int i; SNAPSHOT_VAR_OR_LEAVE(vioapic->ioregsel, meta, ret, done); for (i = 0; i < nitems(vioapic->rtbl); i++) { SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].reg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].acnt, meta, ret, done); } done: return (ret); } #endif diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 9c61726e77d7..1a8b54bba3bf 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -1,1928 +1,1929 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vlapic.h" #include "vlapic_priv.h" #include "vioapic.h" #define PRIO(x) ((x) >> 4) #define VLAPIC_VERSION (0x14) #define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) /* * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the * vlapic_callout_handler() and vcpu accesses to: * - timer_freq_bt, timer_period_bt, timer_fire_bt * - timer LVT register */ #define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) /* * APIC timer frequency: * - arbitrary but chosen to be in the ballpark of contemporary hardware. * - power-of-two to avoid loss of precision when converted to a bintime. */ #define VLAPIC_BUS_FREQ (128 * 1024 * 1024) static void vlapic_set_error(struct vlapic *, uint32_t, bool); static void vlapic_callout_handler(void *arg); static void vlapic_reset(struct vlapic *vlapic); static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) { if (x2apic(vlapic)) return (vlapic->vcpuid); else return (vlapic->vcpuid << 24); } static uint32_t x2apic_ldr(struct vlapic *vlapic) { int apicid; uint32_t ldr; apicid = vlapic_get_id(vlapic); ldr = 1 << (apicid & 0xf); ldr |= (apicid & 0xffff0) << 12; return (ldr); } void vlapic_dfr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; if (x2apic(vlapic)) { VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", lapic->dfr); lapic->dfr = 0; return; } lapic->dfr &= APIC_DFR_MODEL_MASK; lapic->dfr |= APIC_DFR_RESERVED; if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); else VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); } void vlapic_ldr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; /* LDR is read-only in x2apic mode */ if (x2apic(vlapic)) { VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", lapic->ldr); lapic->ldr = x2apic_ldr(vlapic); } else { lapic->ldr &= ~APIC_LDR_RESERVED; VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); } } void vlapic_id_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; /* * We don't allow the ID register to be modified so reset it back to * its default value. */ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); } static int vlapic_timer_divisor(uint32_t dcr) { switch (dcr & 0xB) { case APIC_TDCR_1: return (1); case APIC_TDCR_2: return (2); case APIC_TDCR_4: return (4); case APIC_TDCR_8: return (8); case APIC_TDCR_16: return (16); case APIC_TDCR_32: return (32); case APIC_TDCR_64: return (64); case APIC_TDCR_128: return (128); default: panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); } } #if 0 static inline void vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) { printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, *lvt & APIC_LVTT_M); } #endif static uint32_t vlapic_get_ccr(struct vlapic *vlapic) { struct bintime bt_now, bt_rem; struct LAPIC *lapic __diagused; uint32_t ccr; ccr = 0; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); if (callout_active(&vlapic->callout)) { /* * If the timer is scheduled to expire in the future then * compute the value of 'ccr' based on the remaining time. */ binuptime(&bt_now); if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { bt_rem = vlapic->timer_fire_bt; bintime_sub(&bt_rem, &bt_now); ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; } } KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " "icr_timer is %#x", ccr, lapic->icr_timer)); VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", ccr, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); return (ccr); } void vlapic_dcr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; int divisor; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); divisor = vlapic_timer_divisor(lapic->dcr_timer); VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", lapic->dcr_timer, divisor); /* * Update the timer frequency and the timer period. * * XXX changes to the frequency divider will not take effect until * the timer is reloaded. */ FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_esr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->esr = vlapic->esr_pending; vlapic->esr_pending = 0; } int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *irrptr, *tmrptr, mask; int idx; KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); lapic = vlapic->apic_page; if (!(lapic->svr & APIC_SVR_ENABLE)) { VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " "interrupt %d", vector); return (0); } if (vector < 16) { vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", vector); return (1); } if (vlapic->ops.set_intr_ready) return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); idx = (vector / 32) * 4; mask = 1 << (vector % 32); irrptr = &lapic->irr0; atomic_set_int(&irrptr[idx], mask); /* * Verify that the trigger-mode of the interrupt matches with * the vlapic TMR registers. */ tmrptr = &lapic->tmr0; if ((tmrptr[idx] & mask) != (level ? mask : 0)) { VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " "interrupt is %s-triggered", idx / 4, tmrptr[idx], level ? "level" : "edge"); } VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); return (1); } static __inline uint32_t * vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) { struct LAPIC *lapic = vlapic->apic_page; int i; switch (offset) { case APIC_OFFSET_CMCI_LVT: return (&lapic->lvt_cmci); case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; return ((&lapic->lvt_timer) + i); default: panic("vlapic_get_lvt: invalid LVT\n"); } } static __inline int lvt_off_to_idx(uint32_t offset) { int index; switch (offset) { case APIC_OFFSET_CMCI_LVT: index = APIC_LVT_CMCI; break; case APIC_OFFSET_TIMER_LVT: index = APIC_LVT_TIMER; break; case APIC_OFFSET_THERM_LVT: index = APIC_LVT_THERMAL; break; case APIC_OFFSET_PERF_LVT: index = APIC_LVT_PMC; break; case APIC_OFFSET_LINT0_LVT: index = APIC_LVT_LINT0; break; case APIC_OFFSET_LINT1_LVT: index = APIC_LVT_LINT1; break; case APIC_OFFSET_ERROR_LVT: index = APIC_LVT_ERROR; break; default: index = -1; break; } KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " "invalid lvt index %d for offset %#x", index, offset)); return (index); } static __inline uint32_t vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) { int idx; uint32_t val; idx = lvt_off_to_idx(offset); val = atomic_load_acq_32(&vlapic->lvt_last[idx]); return (val); } void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) { uint32_t *lvtptr, mask, val; struct LAPIC *lapic; int idx; lapic = vlapic->apic_page; lvtptr = vlapic_get_lvtptr(vlapic, offset); val = *lvtptr; idx = lvt_off_to_idx(offset); if (!(lapic->svr & APIC_SVR_ENABLE)) val |= APIC_LVT_M; mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; switch (offset) { case APIC_OFFSET_TIMER_LVT: mask |= APIC_LVTT_TM; break; case APIC_OFFSET_ERROR_LVT: break; case APIC_OFFSET_LINT0_LVT: case APIC_OFFSET_LINT1_LVT: mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; /* FALLTHROUGH */ default: mask |= APIC_LVT_DM; break; } val &= mask; *lvtptr = val; atomic_store_rel_32(&vlapic->lvt_last[idx], val); } static void vlapic_mask_lvts(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; lapic->lvt_cmci |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); lapic->lvt_timer |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); lapic->lvt_thermal |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); lapic->lvt_pcint |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); lapic->lvt_lint0 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); lapic->lvt_lint1 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); lapic->lvt_error |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); } static int vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) { uint32_t mode, reg, vec; reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); if (reg & APIC_LVT_M) return (0); vec = reg & APIC_LVT_VECTOR; mode = reg & APIC_LVT_DM; switch (mode) { case APIC_LVT_DM_FIXED: if (vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, lvt == APIC_LVT_ERROR); return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) vcpu_notify_event(vlapic->vcpu, true); break; case APIC_LVT_DM_NMI: vm_inject_nmi(vlapic->vcpu); break; case APIC_LVT_DM_EXTINT: vm_inject_extint(vlapic->vcpu); break; default: // Other modes ignored return (0); } return (1); } #if 1 static void dump_isrvec_stk(struct vlapic *vlapic) { int i; uint32_t *isrptr; isrptr = &vlapic->apic_page->isr0; for (i = 0; i < 8; i++) printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); for (i = 0; i <= vlapic->isrvec_stk_top; i++) printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); } #endif /* * Algorithm adopted from section "Interrupt, Task and Processor Priority" * in Intel Architecture Manual Vol 3a. */ static void vlapic_update_ppr(struct vlapic *vlapic) { int isrvec, tpr, ppr; /* * Note that the value on the stack at index 0 is always 0. * * This is a placeholder for the value of ISRV when none of the * bits is set in the ISRx registers. */ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; tpr = vlapic->apic_page->tpr; #if 1 { int i, lastprio, curprio, vector, idx; uint32_t *isrptr; if (vlapic->isrvec_stk_top == 0 && isrvec != 0) panic("isrvec_stk is corrupted: %d", isrvec); /* * Make sure that the priority of the nested interrupts is * always increasing. */ lastprio = -1; for (i = 1; i <= vlapic->isrvec_stk_top; i++) { curprio = PRIO(vlapic->isrvec_stk[i]); if (curprio <= lastprio) { dump_isrvec_stk(vlapic); panic("isrvec_stk does not satisfy invariant"); } lastprio = curprio; } /* * Make sure that each bit set in the ISRx registers has a * corresponding entry on the isrvec stack. */ i = 1; isrptr = &vlapic->apic_page->isr0; for (vector = 0; vector < 256; vector++) { idx = (vector / 32) * 4; if (isrptr[idx] & (1 << (vector % 32))) { if (i > vlapic->isrvec_stk_top || vlapic->isrvec_stk[i] != vector) { dump_isrvec_stk(vlapic); panic("ISR and isrvec_stk out of sync"); } i++; } } } #endif if (PRIO(tpr) >= PRIO(isrvec)) ppr = tpr; else ppr = isrvec & 0xf0; vlapic->apic_page->ppr = ppr; VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } void vlapic_sync_tpr(struct vlapic *vlapic) { vlapic_update_ppr(vlapic); } static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); static void vlapic_process_eoi(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *isrptr, *tmrptr; int i, idx, bitpos, vector; isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; for (i = 7; i >= 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { if (vlapic->isrvec_stk_top <= 0) { panic("invalid vlapic isrvec_stk_top %d", vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); vector = i * 32 + bitpos; VLAPIC_CTR1(vlapic, "EOI vector %d", vector); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { vioapic_process_eoi(vlapic->vm, vector); } return; } } VLAPIC_CTR0(vlapic, "Gratuitous EOI"); vmm_stat_incr(vlapic->vcpu, VLAPIC_GRATUITOUS_EOI, 1); } static __inline int vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) { return (lvt & mask); } static __inline int vlapic_periodic_timer(struct vlapic *vlapic) { uint32_t lvt; lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); } static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); static void vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error) { vlapic->esr_pending |= mask; /* * Avoid infinite recursion if the error LVT itself is configured with * an illegal vector. */ if (lvt_error) return; if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) { vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_ERROR, 1); } } static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); static void vlapic_fire_timer(struct vlapic *vlapic) { KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) { VLAPIC_CTR0(vlapic, "vlapic timer fired"); vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_TIMER, 1); } } static VMM_STAT(VLAPIC_INTR_CMC, "corrected machine check interrupts generated by vlapic"); void vlapic_fire_cmci(struct vlapic *vlapic) { if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) { vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_CMC, 1); } } static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, "lvts triggered"); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector) { if (vlapic_enabled(vlapic) == false) { /* * When the local APIC is global/hardware disabled, * LINT[1:0] pins are configured as INTR and NMI pins, * respectively. */ switch (vector) { case APIC_LVT_LINT0: vm_inject_extint(vlapic->vcpu); break; case APIC_LVT_LINT1: vm_inject_nmi(vlapic->vcpu); break; default: break; } return (0); } switch (vector) { case APIC_LVT_LINT0: case APIC_LVT_LINT1: case APIC_LVT_TIMER: case APIC_LVT_ERROR: case APIC_LVT_PMC: case APIC_LVT_THERMAL: case APIC_LVT_CMCI: if (vlapic_fire_lvt(vlapic, vector)) { vmm_stat_array_incr(vlapic->vcpu, LVTS_TRIGGERRED, vector, 1); } break; default: return (EINVAL); } return (0); } static void vlapic_callout_reset(struct vlapic *vlapic, sbintime_t t) { callout_reset_sbt_curcpu(&vlapic->callout, t, 0, vlapic_callout_handler, vlapic, 0); } static void vlapic_callout_handler(void *arg) { struct vlapic *vlapic; struct bintime bt, btnow; sbintime_t rem_sbt; vlapic = arg; VLAPIC_TIMER_LOCK(vlapic); if (callout_pending(&vlapic->callout)) /* callout was reset */ goto done; if (!callout_active(&vlapic->callout)) /* callout was stopped */ goto done; callout_deactivate(&vlapic->callout); vlapic_fire_timer(vlapic); if (vlapic_periodic_timer(vlapic)) { binuptime(&btnow); KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx", btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, vlapic->timer_fire_bt.frac)); /* * Compute the delta between when the timer was supposed to * fire and the present time. */ bt = btnow; bintime_sub(&bt, &vlapic->timer_fire_bt); rem_sbt = bttosbt(vlapic->timer_period_bt); if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { /* * Adjust the time until the next countdown downward * to account for the lost time. */ rem_sbt -= bttosbt(bt); } else { /* * If the delta is greater than the timer period then * just reset our time base instead of trying to catch * up. */ vlapic->timer_fire_bt = btnow; VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " "usecs, period is %lu usecs - resetting time base", bttosbt(bt) / SBT_1US, bttosbt(vlapic->timer_period_bt) / SBT_1US); } bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); vlapic_callout_reset(vlapic, rem_sbt); } done: VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_icrtmr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; sbintime_t sbt; uint32_t icr_timer; VLAPIC_TIMER_LOCK(vlapic); lapic = vlapic->apic_page; icr_timer = lapic->icr_timer; vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, icr_timer); if (icr_timer != 0) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); sbt = bttosbt(vlapic->timer_period_bt); vlapic_callout_reset(vlapic, sbt); } else callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); } /* * This function populates 'dmask' with the set of vcpus that match the * addressing specified by the (dest, phys, lowprio) tuple. * * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) * or xAPIC (8-bit) destination field. */ static void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest) { struct vlapic *vlapic; uint32_t dfr, ldr, ldest, cluster; uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; cpuset_t amask; int vcpuid; if ((x2apic_dest && dest == 0xffffffff) || (!x2apic_dest && dest == 0xff)) { /* * Broadcast in both logical and physical modes. */ *dmask = vm_active_cpus(vm); return; } if (phys) { /* * Physical mode: destination is APIC ID. */ CPU_ZERO(dmask); vcpuid = vm_apicid2vcpuid(vm, dest); amask = vm_active_cpus(vm); if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask)) CPU_SET(vcpuid, dmask); } else { /* * In the "Flat Model" the MDA is interpreted as an 8-bit wide * bitmask. This model is only available in the xAPIC mode. */ mda_flat_ldest = dest & 0xff; /* * In the "Cluster Model" the MDA is used to identify a * specific cluster and a set of APICs in that cluster. */ if (x2apic_dest) { mda_cluster_id = dest >> 16; mda_cluster_ldest = dest & 0xffff; } else { mda_cluster_id = (dest >> 4) & 0xf; mda_cluster_ldest = dest & 0xf; } /* * Logical mode: match each APIC that has a bit set * in its LDR that matches a bit in the ldest. */ CPU_ZERO(dmask); amask = vm_active_cpus(vm); CPU_FOREACH_ISSET(vcpuid, &amask) { vlapic = vm_lapic(vm_vcpu(vm, vcpuid)); dfr = vlapic->apic_page->dfr; ldr = vlapic->apic_page->ldr; if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) { ldest = ldr >> 24; mda_ldest = mda_flat_ldest; } else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) { if (x2apic(vlapic)) { cluster = ldr >> 16; ldest = ldr & 0xffff; } else { cluster = ldr >> 28; ldest = (ldr >> 24) & 0xf; } if (cluster != mda_cluster_id) continue; mda_ldest = mda_cluster_ldest; } else { /* * Guest has configured a bad logical * model for this vcpu - skip it. */ VLAPIC_CTR1(vlapic, "vlapic has bad logical " "model %x - cannot deliver interrupt", dfr); continue; } if ((mda_ldest & ldest) != 0) { CPU_SET(vcpuid, dmask); if (lowprio) break; } } } } static VMM_STAT_ARRAY(IPIS_SENT, VMM_STAT_NELEMS_VCPU, "ipis sent to vcpu"); static void vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) { struct LAPIC *lapic = vlapic->apic_page; if (lapic->tpr != val) { VLAPIC_CTR2(vlapic, "vlapic TPR changed from %#x to %#x", lapic->tpr, val); lapic->tpr = val; vlapic_update_ppr(vlapic); } } static uint8_t vlapic_get_tpr(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; return (lapic->tpr); } void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) { uint8_t tpr; if (val & ~0xf) { vm_inject_gp(vlapic->vcpu); return; } tpr = val << 4; vlapic_set_tpr(vlapic, tpr); } uint64_t vlapic_get_cr8(struct vlapic *vlapic) { uint8_t tpr; tpr = vlapic_get_tpr(vlapic); return (tpr >> 4); } static bool vlapic_is_icr_valid(uint64_t icrval) { uint32_t mode = icrval & APIC_DELMODE_MASK; uint32_t level = icrval & APIC_LEVEL_MASK; uint32_t trigger = icrval & APIC_TRIGMOD_MASK; uint32_t shorthand = icrval & APIC_DEST_MASK; switch (mode) { case APIC_DELMODE_FIXED: if (trigger == APIC_TRIGMOD_EDGE) return (true); /* * AMD allows a level assert IPI and Intel converts a level * assert IPI into an edge IPI. */ if (trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_ASSERT) return (true); break; case APIC_DELMODE_LOWPRIO: case APIC_DELMODE_SMI: case APIC_DELMODE_NMI: case APIC_DELMODE_INIT: if (trigger == APIC_TRIGMOD_EDGE && (shorthand == APIC_DEST_DESTFLD || shorthand == APIC_DEST_ALLESELF)) return (true); /* * AMD allows a level assert IPI and Intel converts a level * assert IPI into an edge IPI. */ if (trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_ASSERT && (shorthand == APIC_DEST_DESTFLD || shorthand == APIC_DEST_ALLESELF)) return (true); /* * An level triggered deassert INIT is defined in the Intel * Multiprocessor Specification and the Intel Software Developer * Manual. Due to the MPS it's required to send a level assert * INIT to a cpu and then a level deassert INIT. Some operating * systems e.g. FreeBSD or Linux use that algorithm. According * to the SDM a level deassert INIT is only supported by Pentium * and P6 processors. It's always send to all cpus regardless of * the destination or shorthand field. It resets the arbitration * id register. This register is not software accessible and * only required for the APIC bus arbitration. So, the level * deassert INIT doesn't need any emulation and we should ignore * it. The SDM also defines that newer processors don't support * the level deassert INIT and it's not valid any more. As it's * defined for older systems, it can't be invalid per se. * Otherwise, backward compatibility would be broken. However, * when returning false here, it'll be ignored which is the * desired behaviour. */ if (mode == APIC_DELMODE_INIT && trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_DEASSERT) return (false); break; case APIC_DELMODE_STARTUP: if (shorthand == APIC_DEST_DESTFLD || shorthand == APIC_DEST_ALLESELF) return (true); break; case APIC_DELMODE_RR: /* Only available on AMD! */ if (trigger == APIC_TRIGMOD_EDGE && shorthand == APIC_DEST_DESTFLD) return (true); break; case APIC_DELMODE_RESV: return (false); default: __assert_unreachable(); } return (false); } int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) { int i; bool phys; cpuset_t dmask, ipimask; uint64_t icrval; uint32_t dest, vec, mode, shorthand; struct vlapic *vlapic2; struct vcpu *vcpu; struct vm_exit *vmexit; struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; if (x2apic(vlapic)) dest = icrval >> 32; else dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; phys = (icrval & APIC_DESTMODE_LOG) == 0; shorthand = icrval & APIC_DEST_MASK; VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); switch (shorthand) { case APIC_DEST_DESTFLD: vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, x2apic(vlapic)); break; case APIC_DEST_SELF: CPU_SETOF(vlapic->vcpuid, &dmask); break; case APIC_DEST_ALLISELF: dmask = vm_active_cpus(vlapic->vm); break; case APIC_DEST_ALLESELF: dmask = vm_active_cpus(vlapic->vm); CPU_CLR(vlapic->vcpuid, &dmask); break; default: __assert_unreachable(); } /* * Ignore invalid combinations of the icr. */ if (!vlapic_is_icr_valid(icrval)) { VLAPIC_CTR1(vlapic, "Ignoring invalid ICR %016lx", icrval); return (0); } /* * ipimask is a set of vCPUs needing userland handling of the current * IPI. */ CPU_ZERO(&ipimask); switch (mode) { case APIC_DELMODE_FIXED: if (vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } CPU_FOREACH_ISSET(i, &dmask) { vcpu = vm_vcpu(vlapic->vm, i); lapic_intr_edge(vcpu, vec); vmm_stat_array_incr(vlapic->vcpu, IPIS_SENT, i, 1); VLAPIC_CTR2(vlapic, "vlapic sending ipi %d to vcpuid %d", vec, i); } break; case APIC_DELMODE_NMI: CPU_FOREACH_ISSET(i, &dmask) { vcpu = vm_vcpu(vlapic->vm, i); vm_inject_nmi(vcpu); VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi to vcpuid %d", i); } break; case APIC_DELMODE_INIT: if (!vlapic->ipi_exit) { if (!phys) break; i = vm_apicid2vcpuid(vlapic->vm, dest); if (i >= vm_get_maxcpus(vlapic->vm) || i == vlapic->vcpuid) break; /* * Userland which doesn't support the IPI exit * requires that the boot state is set to SIPI * here. */ vcpu = vm_vcpu(vlapic->vm, i); vlapic2 = vm_lapic(vcpu); vlapic2->boot_state = BS_SIPI; break; } CPU_COPY(&dmask, &ipimask); break; case APIC_DELMODE_STARTUP: if (!vlapic->ipi_exit) { if (!phys) break; /* * Old bhyve versions don't support the IPI * exit. Translate it into the old style. */ i = vm_apicid2vcpuid(vlapic->vm, dest); if (i >= vm_get_maxcpus(vlapic->vm) || i == vlapic->vcpuid) break; /* * Ignore SIPIs in any state other than wait-for-SIPI */ vcpu = vm_vcpu(vlapic->vm, i); vlapic2 = vm_lapic(vcpu); if (vlapic2->boot_state != BS_SIPI) break; vlapic2->boot_state = BS_RUNNING; vmexit = vm_exitinfo(vlapic->vcpu); vmexit->exitcode = VM_EXITCODE_SPINUP_AP; vmexit->u.spinup_ap.vcpu = i; vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; *retu = true; break; } CPU_FOREACH_ISSET(i, &dmask) { vcpu = vm_vcpu(vlapic->vm, i); vlapic2 = vm_lapic(vcpu); /* * Ignore SIPIs in any state other than wait-for-SIPI */ if (vlapic2->boot_state != BS_SIPI) continue; vlapic2->boot_state = BS_RUNNING; CPU_SET(i, &ipimask); } break; default: return (1); } if (!CPU_EMPTY(&ipimask)) { vmexit = vm_exitinfo(vlapic->vcpu); vmexit->exitcode = VM_EXITCODE_IPI; vmexit->u.ipi.mode = mode; vmexit->u.ipi.vector = vec; vmexit->u.ipi.dmask = dmask; *retu = true; } return (0); } static void vlapic_handle_init(struct vcpu *vcpu, void *arg) { struct vlapic *vlapic = vm_lapic(vcpu); vlapic_reset(vlapic); /* vlapic_reset modifies the boot state. */ vlapic->boot_state = BS_SIPI; } int vm_handle_ipi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) { *retu = true; switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: vm_smp_rendezvous(vcpu, vme->u.ipi.dmask, vlapic_handle_init, NULL); break; case APIC_DELMODE_STARTUP: break; default: return (1); } return (0); } void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val) { int vec; KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode")); vec = val & 0xff; lapic_intr_edge(vlapic->vcpu, vec); vmm_stat_array_incr(vlapic->vcpu, IPIS_SENT, vlapic->vcpuid, 1); VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); } int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) { struct LAPIC *lapic = vlapic->apic_page; int idx, i, bitpos, vector; uint32_t *irrptr, val; vlapic_update_ppr(vlapic); if (vlapic->ops.pending_intr) return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); irrptr = &lapic->irr0; for (i = 7; i >= 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); if (bitpos != 0) { vector = i * 32 + (bitpos - 1); if (PRIO(vector) > PRIO(lapic->ppr)) { VLAPIC_CTR1(vlapic, "pending intr %d", vector); if (vecptr != NULL) *vecptr = vector; return (1); } else break; } } return (0); } void vlapic_intr_accepted(struct vlapic *vlapic, int vector) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *irrptr, *isrptr; int idx, stk_top; if (vlapic->ops.intr_accepted) return ((*vlapic->ops.intr_accepted)(vlapic, vector)); /* * clear the ready bit for vector being accepted in irr * and set the vector as in service in isr. */ idx = (vector / 32) * 4; irrptr = &lapic->irr0; atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); isrptr = &lapic->isr0; isrptr[idx] |= 1 << (vector % 32); VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); /* * Update the PPR */ vlapic->isrvec_stk_top++; stk_top = vlapic->isrvec_stk_top; if (stk_top >= ISRVEC_STK_SIZE) panic("isrvec_stk_top overflow %d", stk_top); vlapic->isrvec_stk[stk_top] = vector; } void vlapic_svr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; uint32_t old, new, changed; lapic = vlapic->apic_page; new = lapic->svr; old = vlapic->svr_last; vlapic->svr_last = new; changed = old ^ new; if ((changed & APIC_SVR_ENABLE) != 0) { if ((new & APIC_SVR_ENABLE) == 0) { /* * The apic is now disabled so stop the apic timer * and mask all the LVT entries. */ VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); VLAPIC_TIMER_LOCK(vlapic); callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); vlapic_mask_lvts(vlapic); } else { /* * The apic is now enabled so restart the apic timer * if it is configured in periodic mode. */ VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); if (vlapic_periodic_timer(vlapic)) vlapic_icrtmr_write_handler(vlapic); } } } int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t *data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *reg; int i; /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode", offset); *data = 0; goto done; } if (!x2apic(vlapic) && !mmio_access) { /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in " "xAPIC mode", offset); *data = 0; goto done; } if (offset > sizeof(*lapic)) { *data = 0; goto done; } offset &= ~3; switch(offset) { case APIC_OFFSET_ID: *data = lapic->id; break; case APIC_OFFSET_VER: *data = lapic->version; break; case APIC_OFFSET_TPR: *data = vlapic_get_tpr(vlapic); break; case APIC_OFFSET_APR: *data = lapic->apr; break; case APIC_OFFSET_PPR: *data = lapic->ppr; break; case APIC_OFFSET_EOI: *data = lapic->eoi; break; case APIC_OFFSET_LDR: *data = lapic->ldr; break; case APIC_OFFSET_DFR: *data = lapic->dfr; break; case APIC_OFFSET_SVR: *data = lapic->svr; break; case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: i = (offset - APIC_OFFSET_ISR0) >> 2; reg = &lapic->isr0; *data = *(reg + i); break; case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: i = (offset - APIC_OFFSET_TMR0) >> 2; reg = &lapic->tmr0; *data = *(reg + i); break; case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: i = (offset - APIC_OFFSET_IRR0) >> 2; reg = &lapic->irr0; *data = atomic_load_acq_int(reg + i); break; case APIC_OFFSET_ESR: *data = lapic->esr; break; case APIC_OFFSET_ICR_LOW: *data = lapic->icr_lo; if (x2apic(vlapic)) *data |= (uint64_t)lapic->icr_hi << 32; break; case APIC_OFFSET_ICR_HI: *data = lapic->icr_hi; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: *data = vlapic_get_lvt(vlapic, offset); #ifdef INVARIANTS reg = vlapic_get_lvtptr(vlapic, offset); KASSERT(*data == *reg, ("inconsistent lvt value at " "offset %#lx: %#lx/%#x", offset, *data, *reg)); #endif break; case APIC_OFFSET_TIMER_ICR: *data = lapic->icr_timer; break; case APIC_OFFSET_TIMER_CCR: *data = vlapic_get_ccr(vlapic); break; case APIC_OFFSET_TIMER_DCR: *data = lapic->dcr_timer; break; case APIC_OFFSET_SELF_IPI: /* * XXX generate a GP fault if vlapic is in x2apic mode */ *data = 0; break; case APIC_OFFSET_RRR: default: *data = 0; break; } done: VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); return 0; } int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *regptr; int retval; KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, ("vlapic_write: invalid offset %#lx", offset)); VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx", offset, data); if (offset > sizeof(*lapic)) return (0); /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx " "in x2APIC mode", data, offset); return (0); } /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ if (!x2apic(vlapic) && !mmio_access) { VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx " "in xAPIC mode", data, offset); return (0); } retval = 0; switch(offset) { case APIC_OFFSET_ID: lapic->id = data; vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_TPR: vlapic_set_tpr(vlapic, data & 0xff); break; case APIC_OFFSET_EOI: vlapic_process_eoi(vlapic); break; case APIC_OFFSET_LDR: lapic->ldr = data; vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: lapic->dfr = data; vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: lapic->svr = data; vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: lapic->icr_lo = data; if (x2apic(vlapic)) lapic->icr_hi = data >> 32; retval = vlapic_icrlo_write_handler(vlapic, retu); break; case APIC_OFFSET_ICR_HI: lapic->icr_hi = data; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: regptr = vlapic_get_lvtptr(vlapic, offset); *regptr = data; vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: lapic->icr_timer = data; vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: lapic->dcr_timer = data; vlapic_dcr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_SELF_IPI: if (x2apic(vlapic)) vlapic_self_ipi_handler(vlapic, data); break; case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_TIMER_CCR: default: // Read only. break; } return (retval); } static void vlapic_reset(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; bzero(lapic, sizeof(struct LAPIC)); lapic->id = vlapic_get_id(vlapic); lapic->version = VLAPIC_VERSION; lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); vlapic_reset_tmr(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); if (vlapic->vcpuid == 0) vlapic->boot_state = BS_RUNNING; /* BSP */ else vlapic->boot_state = BS_INIT; /* AP */ vlapic->svr_last = lapic->svr; } void vlapic_init(struct vlapic *vlapic) { KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < vm_get_maxcpus(vlapic->vm), ("vlapic_init: vcpuid is not initialized")); KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " "initialized")); /* * If the vlapic is configured in x2apic mode then it will be * accessed in the critical section via the MSR emulation code. * * Therefore the timer mutex must be a spinlock because blockable * mutexes cannot be acquired in a critical section. */ mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); callout_init(&vlapic->callout, 1); vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; if (vlapic->vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; vlapic->ipi_exit = false; vlapic_reset(vlapic); } void vlapic_cleanup(struct vlapic *vlapic) { callout_drain(&vlapic->callout); + mtx_destroy(&vlapic->timer_mtx); } uint64_t vlapic_get_apicbase(struct vlapic *vlapic) { return (vlapic->msr_apicbase); } int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) { if (vlapic->msr_apicbase != new) { VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " "not supported", vlapic->msr_apicbase, new); return (-1); } return (0); } void vlapic_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) { struct vlapic *vlapic; struct LAPIC *lapic; vlapic = vm_lapic(vcpu); if (state == X2APIC_DISABLED) vlapic->msr_apicbase &= ~APICBASE_X2APIC; else vlapic->msr_apicbase |= APICBASE_X2APIC; /* * Reset the local APIC registers whose values are mode-dependent. * * XXX this works because the APIC mode can be changed only at vcpu * initialization time. */ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); if (x2apic(vlapic)) { lapic->ldr = x2apic_ldr(vlapic); lapic->dfr = 0; } else { lapic->ldr = 0; lapic->dfr = 0xffffffff; } if (state == X2APIC_ENABLED) { if (vlapic->ops.enable_x2apic_mode) (*vlapic->ops.enable_x2apic_mode)(vlapic); } } void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec) { struct vcpu *vcpu; bool lowprio; int vcpuid; cpuset_t dmask; if (delmode != IOART_DELFIXED && delmode != IOART_DELLOPRI && delmode != IOART_DELEXINT) { VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); return; } lowprio = (delmode == IOART_DELLOPRI); /* * We don't provide any virtual interrupt redirection hardware so * all interrupts originating from the ioapic or MSI specify the * 'dest' in the legacy xAPIC format. */ vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); CPU_FOREACH_ISSET(vcpuid, &dmask) { vcpu = vm_vcpu(vm, vcpuid); if (delmode == IOART_DELEXINT) { vm_inject_extint(vcpu); } else { lapic_set_intr(vcpu, vec, level); } } } void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) { /* * Post an interrupt to the vcpu currently running on 'hostcpu'. * * This is done by leveraging features like Posted Interrupts (Intel) * Doorbell MSR (AMD AVIC) that avoid a VM exit. * * If neither of these features are available then fallback to * sending an IPI to 'hostcpu'. */ if (vlapic->ops.post_intr) (*vlapic->ops.post_intr)(vlapic, hostcpu); else ipi_cpu(hostcpu, ipinum); } bool vlapic_enabled(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && (lapic->svr & APIC_SVR_ENABLE) != 0) return (true); else return (false); } static void vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *tmrptr, mask; int idx; lapic = vlapic->apic_page; tmrptr = &lapic->tmr0; idx = (vector / 32) * 4; mask = 1 << (vector % 32); if (level) tmrptr[idx] |= mask; else tmrptr[idx] &= ~mask; if (vlapic->ops.set_tmr != NULL) (*vlapic->ops.set_tmr)(vlapic, vector, level); } void vlapic_reset_tmr(struct vlapic *vlapic) { int vector; VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); for (vector = 0; vector <= 255; vector++) vlapic_set_tmr(vlapic, vector, false); } void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, int delmode, int vector) { cpuset_t dmask; bool lowprio; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); /* * A level trigger is valid only for fixed and lowprio delivery modes. */ if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " "delivery-mode %d", delmode); return; } lowprio = (delmode == APIC_DELMODE_LOWPRIO); vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); if (!CPU_ISSET(vlapic->vcpuid, &dmask)) return; VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); vlapic_set_tmr(vlapic, vector, true); } #ifdef BHYVE_SNAPSHOT static void vlapic_reset_callout(struct vlapic *vlapic, uint32_t ccr) { /* The implementation is similar to the one in the * `vlapic_icrtmr_write_handler` function */ sbintime_t sbt; struct bintime bt; VLAPIC_TIMER_LOCK(vlapic); bt = vlapic->timer_freq_bt; bintime_mul(&bt, ccr); if (ccr != 0) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &bt); sbt = bttosbt(bt); vlapic_callout_reset(vlapic, sbt); } else { /* even if the CCR was 0, periodic timers should be reset */ if (vlapic_periodic_timer(vlapic)) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); sbt = bttosbt(vlapic->timer_period_bt); callout_stop(&vlapic->callout); vlapic_callout_reset(vlapic, sbt); } } VLAPIC_TIMER_UNLOCK(vlapic); } int vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta) { int ret; struct vlapic *vlapic; struct LAPIC *lapic; uint32_t ccr; uint16_t i, maxcpus; KASSERT(vm != NULL, ("%s: arg was NULL", __func__)); ret = 0; maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vlapic = vm_lapic(vm_vcpu(vm, i)); /* snapshot the page first; timer period depends on icr_timer */ lapic = vlapic->apic_page; SNAPSHOT_BUF_OR_LEAVE(lapic, PAGE_SIZE, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->esr_pending, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.sec, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.frac, meta, ret, done); /* * Timer period is equal to 'icr_timer' ticks at a frequency of * 'timer_freq_bt'. */ if (meta->op == VM_SNAPSHOT_RESTORE) { vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); } SNAPSHOT_BUF_OR_LEAVE(vlapic->isrvec_stk, sizeof(vlapic->isrvec_stk), meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->boot_state, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last, sizeof(vlapic->lvt_last), meta, ret, done); if (meta->op == VM_SNAPSHOT_SAVE) ccr = vlapic_get_ccr(vlapic); SNAPSHOT_VAR_OR_LEAVE(ccr, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE && vlapic_enabled(vlapic) && lapic->icr_timer != 0) { /* Reset the value of the 'timer_fire_bt' and the vlapic * callout based on the value of the current count * register saved when the VM snapshot was created. * If initial count register is 0, timer is not used. * Look at "10.5.4 APIC Timer" in Software Developer Manual. */ vlapic_reset_callout(vlapic, ccr); } } done: return (ret); } #endif diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c index d950bd68ad04..9cf70027f3d8 100644 --- a/sys/amd64/vmm/io/vrtc.c +++ b/sys/amd64/vmm/io/vrtc.c @@ -1,1064 +1,1065 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ktr.h" #include "vatpic.h" #include "vioapic.h" #include "vrtc.h" /* Register layout of the RTC */ struct rtcdev { uint8_t sec; uint8_t alarm_sec; uint8_t min; uint8_t alarm_min; uint8_t hour; uint8_t alarm_hour; uint8_t day_of_week; uint8_t day_of_month; uint8_t month; uint8_t year; uint8_t reg_a; uint8_t reg_b; uint8_t reg_c; uint8_t reg_d; uint8_t nvram[36]; uint8_t century; uint8_t nvram2[128 - 51]; } __packed; CTASSERT(sizeof(struct rtcdev) == 128); CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); struct vrtc { struct vm *vm; struct mtx mtx; struct callout callout; u_int addr; /* RTC register to read or write */ sbintime_t base_uptime; time_t base_rtctime; struct rtcdev rtcdev; }; #define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) #define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) #define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) /* * RTC time is considered "broken" if: * - RTC updates are halted by the guest * - RTC date/time fields have invalid values */ #define VRTC_BROKEN_TIME ((time_t)-1) #define RTC_IRQ 8 #define RTCSB_BIN 0x04 #define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) #define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) #define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) #define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) #define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) static void vrtc_callout_handler(void *arg); static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); static int rtc_flag_broken_time = 1; SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); static __inline bool divider_enabled(int reg_a) { /* * The RTC is counting only when dividers are not held in reset. */ return ((reg_a & 0x70) == 0x20); } static __inline bool update_enabled(struct vrtc *vrtc) { /* * RTC date/time can be updated only if: * - divider is not held in reset * - guest has not disabled updates * - the date/time fields have valid contents */ if (!divider_enabled(vrtc->rtcdev.reg_a)) return (false); if (rtc_halted(vrtc)) return (false); if (vrtc->base_rtctime == VRTC_BROKEN_TIME) return (false); return (true); } static time_t vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) { sbintime_t now, delta; time_t t, secs; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); t = vrtc->base_rtctime; *basetime = vrtc->base_uptime; if (update_enabled(vrtc)) { now = sbinuptime(); delta = now - vrtc->base_uptime; KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " "%#lx to %#lx", vrtc->base_uptime, now)); secs = delta / SBT_1S; t += secs; *basetime += secs * SBT_1S; } return (t); } static __inline uint8_t rtcset(struct rtcdev *rtc, int val) { KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", __func__, val)); return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]); } static void secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) { struct clocktime ct; struct timespec ts; struct rtcdev *rtc; int hour; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); if (rtctime < 0) { KASSERT(rtctime == VRTC_BROKEN_TIME, ("%s: invalid vrtc time %#lx", __func__, rtctime)); return; } /* * If the RTC is halted then the guest has "ownership" of the * date/time fields. Don't update the RTC date/time fields in * this case (unless forced). */ if (rtc_halted(vrtc) && !force_update) return; ts.tv_sec = rtctime; ts.tv_nsec = 0; clock_ts_to_ct(&ts, &ct); KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", ct.sec)); KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", ct.min)); KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", ct.hour)); KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", ct.dow)); KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", ct.day)); KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", ct.mon)); KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", ct.year)); rtc = &vrtc->rtcdev; rtc->sec = rtcset(rtc, ct.sec); rtc->min = rtcset(rtc, ct.min); if (rtc->reg_b & RTCSB_24HR) { hour = ct.hour; } else { /* * Convert to the 12-hour format. */ switch (ct.hour) { case 0: /* 12 AM */ case 12: /* 12 PM */ hour = 12; break; default: /* * The remaining 'ct.hour' values are interpreted as: * [1 - 11] -> 1 - 11 AM * [13 - 23] -> 1 - 11 PM */ hour = ct.hour % 12; break; } } rtc->hour = rtcset(rtc, hour); if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) rtc->hour |= 0x80; /* set MSB to indicate PM */ rtc->day_of_week = rtcset(rtc, ct.dow + 1); rtc->day_of_month = rtcset(rtc, ct.day); rtc->month = rtcset(rtc, ct.mon); rtc->year = rtcset(rtc, ct.year % 100); rtc->century = rtcset(rtc, ct.year / 100); } static int rtcget(struct rtcdev *rtc, int val, int *retval) { uint8_t upper, lower; if (rtc->reg_b & RTCSB_BIN) { *retval = val; return (0); } lower = val & 0xf; upper = (val >> 4) & 0xf; if (lower > 9 || upper > 9) return (-1); *retval = upper * 10 + lower; return (0); } static time_t rtc_to_secs(struct vrtc *vrtc) { struct clocktime ct; struct timespec ts; struct rtcdev *rtc; #ifdef KTR struct vm *vm = vrtc->vm; #endif int century, error, hour, pm, year; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; bzero(&ct, sizeof(struct clocktime)); error = rtcget(rtc, rtc->sec, &ct.sec); if (error || ct.sec < 0 || ct.sec > 59) { VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); goto fail; } error = rtcget(rtc, rtc->min, &ct.min); if (error || ct.min < 0 || ct.min > 59) { VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); goto fail; } pm = 0; hour = rtc->hour; if ((rtc->reg_b & RTCSB_24HR) == 0) { if (hour & 0x80) { hour &= ~0x80; pm = 1; } } error = rtcget(rtc, hour, &ct.hour); if ((rtc->reg_b & RTCSB_24HR) == 0) { if (ct.hour >= 1 && ct.hour <= 12) { /* * Convert from 12-hour format to internal 24-hour * representation as follows: * * 12-hour format ct.hour * 12 AM 0 * 1 - 11 AM 1 - 11 * 12 PM 12 * 1 - 11 PM 13 - 23 */ if (ct.hour == 12) ct.hour = 0; if (pm) ct.hour += 12; } else { VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", rtc->hour, ct.hour); goto fail; } } if (error || ct.hour < 0 || ct.hour > 23) { VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); goto fail; } /* * Ignore 'rtc->dow' because some guests like Linux don't bother * setting it at all while others like OpenBSD/i386 set it incorrectly. * * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. */ ct.dow = -1; error = rtcget(rtc, rtc->day_of_month, &ct.day); if (error || ct.day < 1 || ct.day > 31) { VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, ct.day); goto fail; } error = rtcget(rtc, rtc->month, &ct.mon); if (error || ct.mon < 1 || ct.mon > 12) { VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); goto fail; } error = rtcget(rtc, rtc->year, &year); if (error || year < 0 || year > 99) { VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); goto fail; } error = rtcget(rtc, rtc->century, ¢ury); ct.year = century * 100 + year; if (error || ct.year < POSIX_BASE_YEAR) { VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, ct.year); goto fail; } error = clock_ct_to_ts(&ct, &ts); if (error || ts.tv_sec < 0) { VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", ct.year, ct.mon, ct.day); VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", ct.hour, ct.min, ct.sec); goto fail; } return (ts.tv_sec); /* success */ fail: /* * Stop updating the RTC if the date/time fields programmed by * the guest are invalid. */ VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); return (VRTC_BROKEN_TIME); } static int vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) { struct rtcdev *rtc; time_t oldtime; uint8_t alarm_sec, alarm_min, alarm_hour; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; alarm_sec = rtc->alarm_sec; alarm_min = rtc->alarm_min; alarm_hour = rtc->alarm_hour; oldtime = vrtc->base_rtctime; VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", oldtime, newtime); VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", vrtc->base_uptime, newbase); vrtc->base_uptime = newbase; if (newtime == oldtime) return (0); /* * If 'newtime' indicates that RTC updates are disabled then just * record that and return. There is no need to do alarm interrupt * processing in this case. */ if (newtime == VRTC_BROKEN_TIME) { vrtc->base_rtctime = VRTC_BROKEN_TIME; return (0); } /* * Return an error if RTC updates are halted by the guest. */ if (rtc_halted(vrtc)) { VM_CTR0(vrtc->vm, "RTC update halted by guest"); return (EBUSY); } do { /* * If the alarm interrupt is enabled and 'oldtime' is valid * then visit all the seconds between 'oldtime' and 'newtime' * to check for the alarm condition. * * Otherwise move the RTC time forward directly to 'newtime'. */ if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) vrtc->base_rtctime++; else vrtc->base_rtctime = newtime; if (aintr_enabled(vrtc)) { /* * Update the RTC date/time fields before checking * if the alarm conditions are satisfied. */ secs_to_rtc(vrtc->base_rtctime, vrtc, 0); if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && (alarm_min >= 0xC0 || alarm_min == rtc->min) && (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); } } } while (vrtc->base_rtctime != newtime); if (uintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); return (0); } static sbintime_t vrtc_freq(struct vrtc *vrtc) { int ratesel; static sbintime_t pf[16] = { 0, SBT_1S / 256, SBT_1S / 128, SBT_1S / 8192, SBT_1S / 4096, SBT_1S / 2048, SBT_1S / 1024, SBT_1S / 512, SBT_1S / 256, SBT_1S / 128, SBT_1S / 64, SBT_1S / 32, SBT_1S / 16, SBT_1S / 8, SBT_1S / 4, SBT_1S / 2, }; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); /* * If both periodic and alarm interrupts are enabled then use the * periodic frequency to drive the callout. The minimum periodic * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so * piggyback the alarm on top of it. The same argument applies to * the update interrupt. */ if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { ratesel = vrtc->rtcdev.reg_a & 0xf; return (pf[ratesel]); } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { return (SBT_1S); } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { return (SBT_1S); } else { return (0); } } static void vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) { KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); if (freqsbt == 0) { if (callout_active(&vrtc->callout)) { VM_CTR0(vrtc->vm, "RTC callout stopped"); callout_stop(&vrtc->callout); } return; } VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, vrtc, 0); } static void vrtc_callout_handler(void *arg) { struct vrtc *vrtc = arg; sbintime_t freqsbt, basetime; time_t rtctime; int error __diagused; VM_CTR0(vrtc->vm, "vrtc callout fired"); VRTC_LOCK(vrtc); if (callout_pending(&vrtc->callout)) /* callout was reset */ goto done; if (!callout_active(&vrtc->callout)) /* callout was stopped */ goto done; callout_deactivate(&vrtc->callout); KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, ("gratuitous vrtc callout")); if (pintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { rtctime = vrtc_curtime(vrtc, &basetime); error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("%s: vrtc_time_update error %d", __func__, error)); } freqsbt = vrtc_freq(vrtc); KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); vrtc_callout_reset(vrtc, freqsbt); done: VRTC_UNLOCK(vrtc); } static __inline void vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) { int active __diagused; active = callout_active(&vrtc->callout) ? 1 : 0; KASSERT((freq == 0 && !active) || (freq != 0 && active), ("vrtc callout %s with frequency %#lx", active ? "active" : "inactive", freq)); } static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; int oldirqf, newirqf; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; oldirqf = rtc->reg_c & RTCIR_INT; if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { newirqf = RTCIR_INT; } else { newirqf = 0; } oldval = rtc->reg_c; rtc->reg_c = newirqf | newval; changed = oldval ^ rtc->reg_c; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", oldval, rtc->reg_c); } if (!oldirqf && newirqf) { VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); vatpic_pulse_irq(vrtc->vm, RTC_IRQ); vioapic_pulse_irq(vrtc->vm, RTC_IRQ); } else if (oldirqf && !newirqf) { VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); } } static int vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; sbintime_t oldfreq, newfreq, basetime; time_t curtime, rtctime; int error __diagused; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; oldval = rtc->reg_b; oldfreq = vrtc_freq(vrtc); rtc->reg_b = newval; changed = oldval ^ newval; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", oldval, newval); } if (changed & RTCSB_HALT) { if ((newval & RTCSB_HALT) == 0) { rtctime = rtc_to_secs(vrtc); basetime = sbinuptime(); if (rtctime == VRTC_BROKEN_TIME) { if (rtc_flag_broken_time) return (-1); } } else { curtime = vrtc_curtime(vrtc, &basetime); KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " "between vrtc basetime (%#lx) and curtime (%#lx)", __func__, vrtc->base_rtctime, curtime)); /* * Force a refresh of the RTC date/time fields so * they reflect the time right before the guest set * the HALT bit. */ secs_to_rtc(curtime, vrtc, 1); /* * Updates are halted so mark 'base_rtctime' to denote * that the RTC date/time is in flux. */ rtctime = VRTC_BROKEN_TIME; rtc->reg_b &= ~RTCSB_UINTR; } error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("vrtc_time_update error %d", error)); } /* * Side effect of changes to the interrupt enable bits. */ if (changed & RTCSB_ALL_INTRS) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); /* * Change the callout frequency if it has changed. */ newfreq = vrtc_freq(vrtc); if (newfreq != oldfreq) vrtc_callout_reset(vrtc, newfreq); else vrtc_callout_check(vrtc, newfreq); /* * The side effect of bits that control the RTC date/time format * is handled lazily when those fields are actually read. */ return (0); } static void vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) { sbintime_t oldfreq, newfreq; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); newval &= ~RTCSA_TUP; oldval = vrtc->rtcdev.reg_a; oldfreq = vrtc_freq(vrtc); if (divider_enabled(oldval) && !divider_enabled(newval)) { VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", vrtc->base_rtctime, vrtc->base_uptime); } else if (!divider_enabled(oldval) && divider_enabled(newval)) { /* * If the dividers are coming out of reset then update * 'base_uptime' before this happens. This is done to * maintain the illusion that the RTC date/time was frozen * while the dividers were disabled. */ vrtc->base_uptime = sbinuptime(); VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", vrtc->base_rtctime, vrtc->base_uptime); } else { /* NOTHING */ } vrtc->rtcdev.reg_a = newval; changed = oldval ^ newval; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", oldval, newval); } /* * Side effect of changes to rate select and divider enable bits. */ newfreq = vrtc_freq(vrtc); if (newfreq != oldfreq) vrtc_callout_reset(vrtc, newfreq); else vrtc_callout_check(vrtc, newfreq); } int vrtc_set_time(struct vm *vm, time_t secs) { struct vrtc *vrtc; int error; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); error = vrtc_time_update(vrtc, secs, sbinuptime()); VRTC_UNLOCK(vrtc); if (error) { VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, secs); } else { VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); } return (error); } time_t vrtc_get_time(struct vm *vm) { struct vrtc *vrtc; sbintime_t basetime; time_t t; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); t = vrtc_curtime(vrtc, &basetime); VRTC_UNLOCK(vrtc); return (t); } int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) { struct vrtc *vrtc; uint8_t *ptr; vrtc = vm_rtc(vm); /* * Don't allow writes to RTC control registers or the date/time fields. */ if (offset < offsetof(struct rtcdev, nvram[0]) || offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", offset); return (EINVAL); } VRTC_LOCK(vrtc); ptr = (uint8_t *)(&vrtc->rtcdev); ptr[offset] = value; VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); VRTC_UNLOCK(vrtc); return (0); } int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) { struct vrtc *vrtc; sbintime_t basetime; time_t curtime; uint8_t *ptr; /* * Allow all offsets in the RTC to be read. */ if (offset < 0 || offset >= sizeof(struct rtcdev)) return (EINVAL); vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); /* * Update RTC date/time fields if necessary. */ if (offset < 10 || offset == RTC_CENTURY) { curtime = vrtc_curtime(vrtc, &basetime); secs_to_rtc(curtime, vrtc, 0); } ptr = (uint8_t *)(&vrtc->rtcdev); *retval = ptr[offset]; VRTC_UNLOCK(vrtc); return (0); } int vrtc_addr_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *val) { struct vrtc *vrtc; vrtc = vm_rtc(vm); if (bytes != 1) return (-1); if (in) { *val = 0xff; return (0); } VRTC_LOCK(vrtc); vrtc->addr = *val & 0x7f; VRTC_UNLOCK(vrtc); return (0); } int vrtc_data_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *val) { struct vrtc *vrtc; struct rtcdev *rtc; sbintime_t basetime; time_t curtime; int error, offset; vrtc = vm_rtc(vm); rtc = &vrtc->rtcdev; if (bytes != 1) return (-1); VRTC_LOCK(vrtc); offset = vrtc->addr; if (offset >= sizeof(struct rtcdev)) { VRTC_UNLOCK(vrtc); return (-1); } error = 0; curtime = vrtc_curtime(vrtc, &basetime); vrtc_time_update(vrtc, curtime, basetime); /* * Update RTC date/time fields if necessary. * * This is not just for reads of the RTC. The side-effect of writing * the century byte requires other RTC date/time fields (e.g. sec) * to be updated here. */ if (offset < 10 || offset == RTC_CENTURY) secs_to_rtc(curtime, vrtc, 0); if (in) { if (offset == 12) { /* * XXX * reg_c interrupt flags are updated only if the * corresponding interrupt enable bit in reg_b is set. */ *val = vrtc->rtcdev.reg_c; vrtc_set_reg_c(vrtc, 0); } else { *val = *((uint8_t *)rtc + offset); } VM_CTR2(vm, "Read value %#x from RTC offset %#x", *val, offset); } else { switch (offset) { case 10: VM_CTR1(vm, "RTC reg_a set to %#x", *val); vrtc_set_reg_a(vrtc, *val); break; case 11: VM_CTR1(vm, "RTC reg_b set to %#x", *val); error = vrtc_set_reg_b(vrtc, *val); break; case 12: VM_CTR1(vm, "RTC reg_c set to %#x (ignored)", *val); break; case 13: VM_CTR1(vm, "RTC reg_d set to %#x (ignored)", *val); break; case 0: /* * High order bit of 'seconds' is readonly. */ *val &= 0x7f; /* FALLTHRU */ default: VM_CTR2(vm, "RTC offset %#x set to %#x", offset, *val); *((uint8_t *)rtc + offset) = *val; break; } /* * XXX some guests (e.g. OpenBSD) write the century byte * outside of RTCSB_HALT so re-calculate the RTC date/time. */ if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { curtime = rtc_to_secs(vrtc); error = vrtc_time_update(vrtc, curtime, sbinuptime()); KASSERT(!error, ("vrtc_time_update error %d", error)); if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) error = -1; } } VRTC_UNLOCK(vrtc); return (error); } void vrtc_reset(struct vrtc *vrtc) { struct rtcdev *rtc; VRTC_LOCK(vrtc); rtc = &vrtc->rtcdev; vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); vrtc_set_reg_c(vrtc, 0); KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); VRTC_UNLOCK(vrtc); } struct vrtc * vrtc_init(struct vm *vm) { struct vrtc *vrtc; struct rtcdev *rtc; time_t curtime; vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); vrtc->vm = vm; mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); callout_init(&vrtc->callout, 1); /* Allow dividers to keep time but disable everything else */ rtc = &vrtc->rtcdev; rtc->reg_a = 0x20; rtc->reg_b = RTCSB_24HR; rtc->reg_c = 0; rtc->reg_d = RTCSD_PWR; /* Reset the index register to a safe value. */ vrtc->addr = RTC_STATUSD; /* * Initialize RTC time to 00:00:00 Jan 1, 1970. */ curtime = 0; VRTC_LOCK(vrtc); vrtc->base_rtctime = VRTC_BROKEN_TIME; vrtc_time_update(vrtc, curtime, sbinuptime()); secs_to_rtc(curtime, vrtc, 0); VRTC_UNLOCK(vrtc); return (vrtc); } void vrtc_cleanup(struct vrtc *vrtc) { callout_drain(&vrtc->callout); + mtx_destroy(&vrtc->mtx); free(vrtc, M_VRTC); } #ifdef BHYVE_SNAPSHOT int vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta) { int ret; VRTC_LOCK(vrtc); SNAPSHOT_VAR_OR_LEAVE(vrtc->addr, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) vrtc->base_uptime = sbinuptime(); SNAPSHOT_VAR_OR_LEAVE(vrtc->base_rtctime, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.sec, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_sec, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.min, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_min, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.hour, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_hour, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_week, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_month, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.month, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.year, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_a, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_b, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_c, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_d, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram, sizeof(vrtc->rtcdev.nvram), meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.century, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram2, sizeof(vrtc->rtcdev.nvram2), meta, ret, done); vrtc_callout_reset(vrtc, vrtc_freq(vrtc)); VRTC_UNLOCK(vrtc); done: return (ret); } #endif diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 825b3383b787..4659495ca5a2 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1,2854 +1,2858 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; /* * Initialization: * (a) allocated when vcpu is created * (i) initialized when vcpu is created and when it is reinitialized * (o) initialized the first time the vcpu is created * (x) initialized before use */ struct vcpu { struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int vcpuid; /* (o) */ int hostcpu; /* (o) vcpu's host cpu */ int reqidle; /* (i) request vcpu to idle */ struct vm *vm; /* (o) */ void *cookie; /* (i) cpu-specific data */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ int exception_pending; /* (i) exception pending */ int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ uint64_t tsc_offset; /* (o) TSC offsetting */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct mem_seg { size_t len; bool sysmem; struct vm_object *object; }; #define VM_MAX_MEMSEGS 4 struct mem_map { vm_paddr_t gpa; size_t len; vm_ooffset_t segoff; int segid; int prot; int flags; }; #define VM_MAX_MEMMAPS 8 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use */ struct vm { void *cookie; /* (i) cpu-specific data */ void *iommu; /* (x) iommu-specific data */ struct vhpet *vhpet; /* (i) virtual HPET */ struct vioapic *vioapic; /* (i) virtual ioapic */ struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ struct vrtc *vrtc; /* (o) virtual RTC */ volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ int suspend; /* (i) stop VM execution */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ void *rendezvous_arg; /* (x) rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ struct vmspace *vmspace; /* (o) guest's address space */ char name[VM_MAX_NAMELEN+1]; /* (o) virtual machine name */ struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ /* The following describe the vm cpu topology */ uint16_t sockets; /* (o) num of sockets */ uint16_t cores; /* (o) num of cores/socket */ uint16_t threads; /* (o) num of threads/core */ uint16_t maxcpus; /* (o) max pluggable cpus */ }; #define VMM_CTR0(vcpu, format) \ VCPU_CTR0((vcpu)->vm, (vcpu)->vcpuid, format) #define VMM_CTR1(vcpu, format, p1) \ VCPU_CTR1((vcpu)->vm, (vcpu)->vcpuid, format, p1) #define VMM_CTR2(vcpu, format, p1, p2) \ VCPU_CTR2((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2) #define VMM_CTR3(vcpu, format, p1, p2, p3) \ VCPU_CTR3((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3) #define VMM_CTR4(vcpu, format, p1, p2, p3, p4) \ VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4) static int vmm_initialized; static void vmmops_panic(void); static void vmmops_panic(void) { panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()"); } #define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ DEFINE_IFUNC(static, ret_type, vmmops_##opname, args) \ { \ if (vmm_is_intel()) \ return (vmm_ops_intel.opname); \ else if (vmm_is_svm()) \ return (vmm_ops_amd.opname); \ else \ return ((ret_type (*)args)vmmops_panic); \ } DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) DEFINE_VMMOPS_IFUNC(void, modresume, (void)) DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t rip, struct pmap *pmap, struct vm_eventinfo *info)) DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, int vcpu_id)) DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vcpui, int num, struct seg_desc *desc)) DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vcpui, int num, struct seg_desc *desc)) DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, vm_offset_t max)) DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vcpui)) DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (struct vlapic *vlapic)) #ifdef BHYVE_SNAPSHOT DEFINE_VMMOPS_IFUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta *meta)) DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui, struct vm_snapshot_meta *meta)) DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now)) #endif #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() SDT_PROVIDER_DEFINE(vmm); static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); /* * Halt the guest if all vcpus are executing a HLT instruction with * interrupts disabled. */ static int halt_detection_enabled = 1; SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, &halt_detection_enabled, 0, "Halt VM if all vcpus execute HLT with interrupts disabled"); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static int trace_guest_exceptions; SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, &trace_guest_exceptions, 0, "Trap into hypervisor on all guest exceptions and reflect them back"); static int trap_wbinvd; SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0, "WBINVD triggers a VM-exit"); static void vm_free_memmap(struct vm *vm, int ident); static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) { switch (state) { case VCPU_IDLE: return ("idle"); case VCPU_FROZEN: return ("frozen"); case VCPU_RUNNING: return ("running"); case VCPU_SLEEPING: return ("sleeping"); default: return ("unknown"); } } #endif static void vcpu_cleanup(struct vm *vm, int i, bool destroy) { struct vcpu *vcpu = &vm->vcpu[i]; vmmops_vlapic_cleanup(vcpu->vlapic); vmmops_vcpu_cleanup(vcpu->cookie); vcpu->cookie = NULL; if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); + vcpu_lock_destroy(vcpu); } } static void vcpu_init(struct vm *vm, int vcpu_id, bool create) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, ("vcpu_init: invalid vcpu %d", vcpu_id)); vcpu = &vm->vcpu[vcpu_id]; if (create) { KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " "initialized", vcpu_id)); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vm = vm; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); vcpu->tsc_offset = 0; } vcpu->cookie = vmmops_vcpu_init(vm->cookie, vcpu, vcpu_id); vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie); vm_set_x2apic_state(vcpu, X2APIC_DISABLED); vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } int vcpu_trace_exceptions(struct vcpu *vcpu) { return (trace_guest_exceptions); } int vcpu_trap_wbinvd(struct vcpu *vcpu) { return (trap_wbinvd); } struct vm_exit * vm_exitinfo(struct vcpu *vcpu) { return (&vcpu->exitinfo); } static int vmm_init(void) { int error; if (!vmm_is_hw_supported()) return (ENXIO); vmm_host_state_init(); vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; error = vmm_mem_init(); if (error) return (error); vmm_resume_p = vmmops_modresume; return (vmmops_modinit(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: if (vmm_is_hw_supported()) { vmmdev_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; } else { error = ENXIO; } break; case MOD_UNLOAD: if (vmm_is_hw_supported()) { error = vmmdev_cleanup(); if (error == 0) { vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) lapic_ipi_free(vmm_ipinum); error = vmmops_modcleanup(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } } else { error = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). */ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { int i; vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); vm->vpmtmr = vpmtmr_init(vm); if (create) vm->vrtc = vrtc_init(vm); CPU_ZERO(&vm->active_cpus); CPU_ZERO(&vm->debug_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); for (i = 0; i < vm->maxcpus; i++) vcpu_init(vm, i, create); } /* * The default CPU topology is a single thread per package. */ u_int cores_per_package = 1; u_int threads_per_core = 1; int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) == VM_MAX_NAMELEN + 1) return (EINVAL); vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm->sockets = 1; vm->cores = cores_per_package; /* XXX backwards compatibility */ vm->threads = threads_per_core; /* XXX backwards compatibility */ vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ vm_init(vm, true); *retvm = vm; return (0); } void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { *sockets = vm->sockets; *cores = vm->cores; *threads = vm->threads; *maxcpus = vm->maxcpus; } uint16_t vm_get_maxcpus(struct vm *vm) { return (vm->maxcpus); } int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { if (maxcpus != 0) return (EINVAL); /* XXX remove when supported */ if ((sockets * cores * threads) > vm->maxcpus) return (EINVAL); /* XXX need to check sockets * cores * threads == vCPU, how? */ vm->sockets = sockets; vm->cores = cores; vm->threads = threads; vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ return(0); } static void vm_cleanup(struct vm *vm, bool destroy) { struct mem_map *mm; int i; ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); if (destroy) vrtc_cleanup(vm->vrtc); else vrtc_reset(vm->vrtc); vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); for (i = 0; i < vm->maxcpus; i++) vcpu_cleanup(vm, i, destroy); vmmops_cleanup(vm->cookie); /* * System memory is removed from the guest address space only when * the VM is destroyed. This is because the mapping remains the same * across VM reset. * * Device memory can be relocated by the guest (e.g. using PCI BARs) * so those mappings are removed on a VM reset. */ for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (destroy || !sysmem_mapping(vm, mm)) vm_free_memmap(vm, i); } if (destroy) { for (i = 0; i < VM_MAX_MEMSEGS; i++) vm_free_memseg(vm, i); vmmops_vmspace_free(vm->vmspace); vm->vmspace = NULL; + + mtx_destroy(&vm->rendezvous_mtx); } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } /* * Return 'true' if 'gpa' is allocated in the guest address space. * * This function is called in the context of a running vcpu which acts as * an implicit lock on 'vm->mem_maps[]'. */ bool vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa) { struct vm *vm = vcpu->vm; struct mem_map *mm; int i; #ifdef INVARIANTS int hostcpu, state; state = vcpu_get_state(vcpu, &hostcpu); KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); #endif for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) return (true); /* 'gpa' is sysmem or devmem */ } if (ppt_is_mmio(vm, gpa)) return (true); /* 'gpa' is pci passthru mmio */ return (false); } int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { struct mem_seg *seg; vm_object_t obj; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); if (len == 0 || (len & PAGE_MASK)) return (EINVAL); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { if (seg->len == len && seg->sysmem == sysmem) return (EEXIST); else return (EINVAL); } obj = vm_object_allocate(OBJT_SWAP, len >> PAGE_SHIFT); if (obj == NULL) return (ENOMEM); seg->len = len; seg->object = obj; seg->sysmem = sysmem; return (0); } int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, vm_object_t *objptr) { struct mem_seg *seg; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[ident]; if (len) *len = seg->len; if (sysmem) *sysmem = seg->sysmem; if (objptr) *objptr = seg->object; return (0); } void vm_free_memseg(struct vm *vm, int ident) { struct mem_seg *seg; KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, ("%s: invalid memseg ident %d", __func__, ident)); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { vm_object_deallocate(seg->object); bzero(seg, sizeof(struct mem_seg)); } } int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, size_t len, int prot, int flags) { struct mem_seg *seg; struct mem_map *m, *map; vm_ooffset_t last; int i, error; if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) return (EINVAL); if (flags & ~VM_MEMMAP_F_WIRED) return (EINVAL); if (segid < 0 || segid >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[segid]; if (seg->object == NULL) return (EINVAL); last = first + len; if (first < 0 || first >= last || last > seg->len) return (EINVAL); if ((gpa | first | last) & PAGE_MASK) return (EINVAL); map = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { m = &vm->mem_maps[i]; if (m->len == 0) { map = m; break; } } if (map == NULL) return (ENOSPC); error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, len, 0, VMFS_NO_SPACE, prot, prot, 0); if (error != KERN_SUCCESS) return (EFAULT); vm_object_reference(seg->object); if (flags & VM_MEMMAP_F_WIRED) { error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM : EFAULT); } } map->gpa = gpa; map->len = len; map->segoff = first; map->segid = segid; map->prot = prot; map->flags = flags; return (0); } int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len) { struct mem_map *m; int i; for (i = 0; i < VM_MAX_MEMMAPS; i++) { m = &vm->mem_maps[i]; if (m->gpa == gpa && m->len == len && (m->flags & VM_MEMMAP_F_IOMMU) == 0) { vm_free_memmap(vm, i); return (0); } } return (EINVAL); } int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct mem_map *mm, *mmnext; int i; mmnext = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len == 0 || mm->gpa < *gpa) continue; if (mmnext == NULL || mm->gpa < mmnext->gpa) mmnext = mm; } if (mmnext != NULL) { *gpa = mmnext->gpa; if (segid) *segid = mmnext->segid; if (segoff) *segoff = mmnext->segoff; if (len) *len = mmnext->len; if (prot) *prot = mmnext->prot; if (flags) *flags = mmnext->flags; return (0); } else { return (ENOENT); } } static void vm_free_memmap(struct vm *vm, int ident) { struct mem_map *mm; int error __diagused; mm = &vm->mem_maps[ident]; if (mm->len) { error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, mm->gpa + mm->len); KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", __func__, error)); bzero(mm, sizeof(struct mem_map)); } } static __inline bool sysmem_mapping(struct vm *vm, struct mem_map *mm) { if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) return (true); else return (false); } vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm) { struct mem_map *mm; vm_paddr_t maxaddr; int i; maxaddr = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (sysmem_mapping(vm, mm)) { if (maxaddr < mm->gpa + mm->len) maxaddr = mm->gpa + mm->len; } } return (maxaddr); } static void vm_iommu_modify(struct vm *vm, bool map) { int i, sz; vm_paddr_t gpa, hpa; struct mem_map *mm; void *vp, *cookie, *host_domain; sz = PAGE_SIZE; host_domain = iommu_host_domain(); for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (!sysmem_mapping(vm, mm)) continue; if (map) { KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, ("iommu map found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) continue; mm->flags |= VM_MEMMAP_F_IOMMU; } else { if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) continue; mm->flags &= ~VM_MEMMAP_F_IOMMU; KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, ("iommu unmap found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); } gpa = mm->gpa; while (gpa < mm->gpa + mm->len) { vp = vm_gpa_hold_global(vm, gpa, PAGE_SIZE, VM_PROT_WRITE, &cookie); KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", vm_name(vm), gpa)); vm_gpa_release(cookie); hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); } else { iommu_remove_mapping(vm->iommu, gpa, sz); } gpa += PAGE_SIZE; } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. */ if (map) iommu_invalidate_tlb(host_domain); else iommu_invalidate_tlb(vm->iommu); } #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false) #define vm_iommu_map(vm) vm_iommu_modify((vm), true) int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) vm_iommu_unmap(vm); return (0); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = vmm_sysmem_maxaddr(vm); vm->iommu = iommu_create_domain(maxaddr); if (vm->iommu == NULL) return (ENXIO); vm_iommu_map(vm); } error = ppt_assign_device(vm, bus, slot, func); return (error); } static void * _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { int i, count, pageoff; struct mem_map *mm; vm_page_t m; pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); count = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) { count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); break; } } if (count == 1) { *cookie = m; return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); } else { *cookie = NULL; return (NULL); } } void * vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { #ifdef INVARIANTS /* * The current vcpu should be frozen to ensure 'vm_memmap[]' * stability. */ int state = vcpu_get_state(vcpu, NULL); KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", __func__, state)); #endif return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie)); } void * vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { #ifdef INVARIANTS /* * All vcpus are frozen by ioctls that modify the memory map * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is * guaranteed if at least one vcpu is in the VCPU_FROZEN state. */ int state; for (int i = 0; i < vm->maxcpus; i++) { state = vcpu_get_state(vm_vcpu(vm, i), NULL); KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", __func__, state)); } #endif return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie)); } void vm_gpa_release(void *cookie) { vm_page_t m = cookie; vm_page_unwire(m, PQ_ACTIVE); } int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) { if (reg >= VM_REG_LAST) return (EINVAL); return (vmmops_getreg(vcpu->cookie, reg, retval)); } int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; if (reg >= VM_REG_LAST) return (EINVAL); error = vmmops_setreg(vcpu->cookie, reg, val); if (error || reg != VM_REG_GUEST_RIP) return (error); /* Set 'nextrip' to match the value of %rip */ VMM_CTR1(vcpu, "Setting nextrip to %#lx", val); vcpu->nextrip = val; return (0); } static bool is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (true); default: return (false); } } static bool is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (true); default: return (false); } } int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) { if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (vmmops_getdesc(vcpu->cookie, reg, desc)); } int vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) { if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (vmmops_setdesc(vcpu->cookie, reg, desc)); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. */ fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu->reqidle = 1; vcpu_notify_event_locked(vcpu, false); VMM_CTR1(vcpu, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); VMM_CTR2(vcpu, "vcpu state changed from %s to %s", vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } static int vm_handle_rendezvous(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; struct thread *td; int error, vcpuid; error = 0; vcpuid = vcpu->vcpuid; td = curthread; mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus); if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { VMM_CTR0(vcpu, "Calling rendezvous func"); (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) == 0) { VMM_CTR0(vcpu, "Rendezvous completed"); vm->rendezvous_func = NULL; wakeup(&vm->rendezvous_func); break; } VMM_CTR0(vcpu, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", hz); if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { mtx_unlock(&vm->rendezvous_mtx); error = thread_check_susp(td, true); if (error != 0) return (error); mtx_lock(&vm->rendezvous_mtx); } } mtx_unlock(&vm->rendezvous_mtx); return (0); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int vm_handle_hlt(struct vcpu *vcpu, bool intr_disabled, bool *retu) { struct vm *vm = vcpu->vm; const char *wmesg; struct thread *td; int error, t, vcpuid, vcpu_halted, vm_halted; vcpuid = vcpu->vcpuid; vcpu_halted = 0; vm_halted = 0; error = 0; td = curthread; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu_lock(vcpu); while (1) { /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. Also check for * software events that would cause this vcpu to wakeup. * * These interrupts/events could have happened after the * vcpu returned from vmmops_run() and before it acquired the * vcpu lock above. */ if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) break; if (vm_nmi_pending(vcpu)) break; if (!intr_disabled) { if (vm_extint_pending(vcpu) || vlapic_pending_intr(vcpu->vlapic, NULL)) { break; } } /* Don't go to sleep if the vcpu thread needs to yield */ if (vcpu_should_yield(vcpu)) break; if (vcpu_debugged(vcpu)) break; /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps * track of the vcpus that have entered this state. When all * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { wmesg = "vmhalt"; VMM_CTR0(vcpu, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); } if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { vm_halted = 1; break; } } else { wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. */ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vcpu, VCPU_IDLE_TICKS, ticks - t); if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); if (error != 0) { if (vcpu_halted) { CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); } return (error); } vcpu_lock(vcpu); } } if (vcpu_halted) CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); vcpu_unlock(vcpu); if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); return (0); } static int vm_handle_paging(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; int rv, ftype; struct vm_map *map; struct vm_exit *vme; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) { VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx", ftype == VM_PROT_READ ? "accessed" : "dirty", vme->u.paging.gpa); goto done; } } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: return (0); } static int vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) { struct vie *vie; struct vm_exit *vme; uint64_t gla, gpa, cs_base; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; int cs_d, error, fault; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; cpu_mode = paging->cpu_mode; VMM_CTR1(vcpu, "inst_emul fault accessing gpa %#lx", gpa); /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { error = vmm_fetch_instruction(vcpu, paging, vme->rip + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ error = fault = 0; } if (error || fault) return (error); if (vmm_decode_instruction(vcpu, gla, cpu_mode, cs_d, vie) != 0) { VMM_CTR1(vcpu, "Error decoding instruction at %#lx", vme->rip + cs_base); *retu = true; /* dump instruction bytes in userspace */ return (0); } /* * Update 'nextrip' based on the length of the emulated instruction. */ vme->inst_length = vie->num_processed; vcpu->nextrip += vie->num_processed; VMM_CTR1(vcpu, "nextrip updated to %#lx after instruction decoding", vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vcpu, gpa, vie, paging, mread, mwrite, retu); return (error); } static int vm_handle_suspend(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; int error, i; struct thread *td; error = 0; td = curthread; CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (error == 0) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { VMM_CTR0(vcpu, "All vcpus suspended"); break; } if (vm->rendezvous_func == NULL) { VMM_CTR0(vcpu, "Sleeping during suspend"); vcpu_require_state_locked(vcpu, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); vcpu_lock(vcpu); } } else { VMM_CTR0(vcpu, "Rendezvous during suspend"); vcpu_unlock(vcpu); error = vm_handle_rendezvous(vcpu); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm_vcpu(vm, i), false); } } *retu = true; return (error); } static int vm_handle_reqidle(struct vcpu *vcpu, bool *retu) { vcpu_lock(vcpu); KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); vcpu->reqidle = 0; vcpu_unlock(vcpu); *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i), false); } return (0); } void vm_exit_suspended(struct vcpu *vcpu, uint64_t rip) { struct vm *vm = vcpu->vm; struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_debug(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_DEBUG; } void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; vmm_stat_incr(vcpu, VMEXIT_RENDEZVOUS, 1); } void vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_REQIDLE; vmm_stat_incr(vcpu, VMEXIT_REQIDLE, 1); } void vm_exit_astpending(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vcpu, VMEXIT_ASTPENDING, 1); } int vm_run(struct vcpu *vcpu, struct vm_exit *vme_user) { struct vm *vm = vcpu->vm; struct vm_eventinfo evinfo; int error, vcpuid; struct pcb *pcb; uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; vcpuid = vcpu->vcpuid; if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); vme = &vcpu->exitinfo; evinfo.rptr = &vm->rendezvous_func; evinfo.sptr = &vm->suspend; evinfo.iptr = &vcpu->reqidle; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_fpustate(vcpu); vcpu_require_state(vcpu, VCPU_RUNNING); error = vmmops_run(vcpu->cookie, vcpu->nextrip, pmap, &evinfo); vcpu_require_state(vcpu, VCPU_FROZEN); save_guest_fpustate(vcpu); vmm_stat_incr(vcpu, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { case VM_EXITCODE_REQIDLE: error = vm_handle_reqidle(vcpu, &retu); break; case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vcpu, &retu); break; case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vme->u.ioapic_eoi.vector); break; case VM_EXITCODE_RENDEZVOUS: error = vm_handle_rendezvous(vcpu); break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vcpu, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vcpu, &retu); break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vcpu, &retu); break; case VM_EXITCODE_INOUT: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vcpu, vme, &retu); break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: case VM_EXITCODE_VMINSN: vm_inject_ud(vcpu); break; default: retu = true; /* handled in userland */ break; } } /* * VM_EXITCODE_INST_EMUL could access the apic which could transform the * exit code into VM_EXITCODE_IPI. */ if (error == 0 && vme->exitcode == VM_EXITCODE_IPI) { retu = false; error = vm_handle_ipi(vcpu, vme, &retu); } if (error == 0 && retu == false) goto restart; vmm_stat_incr(vcpu, VMEXIT_USERSPACE, 1); VMM_CTR2(vcpu, "retu %d/%d", error, vme->exitcode); /* copy the exit information */ *vme_user = *vme; return (error); } int vm_restart_instruction(struct vcpu *vcpu) { enum vcpu_state state; uint64_t rip; int error __diagused; state = vcpu_get_state(vcpu, NULL); if (state == VCPU_RUNNING) { /* * When a vcpu is "running" the next instruction is determined * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. * Thus setting 'inst_length' to zero will cause the current * instruction to be restarted. */ vcpu->exitinfo.inst_length = 0; VMM_CTR1(vcpu, "restarting instruction at %#lx by " "setting inst_length to zero", vcpu->exitinfo.rip); } else if (state == VCPU_FROZEN) { /* * When a vcpu is "frozen" it is outside the critical section * around vmmops_run() and 'nextrip' points to the next * instruction. Thus instruction restart is achieved by setting * 'nextrip' to the vcpu's %rip. */ error = vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip); KASSERT(!error, ("%s: error %d getting rip", __func__, error)); VMM_CTR2(vcpu, "restarting instruction by updating " "nextrip from %#lx to %#lx", vcpu->nextrip, rip); vcpu->nextrip = rip; } else { panic("%s: invalid state %d", __func__, state); } return (0); } int vm_exit_intinfo(struct vcpu *vcpu, uint64_t info) { int type, vector; if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; vector = info & 0xff; if (type == VM_INTINFO_NMI && vector != IDT_NMI) return (EINVAL); if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) return (EINVAL); if (info & VM_INTINFO_RSVD) return (EINVAL); } else { info = 0; } VMM_CTR2(vcpu, "%s: info1(%#lx)", __func__, info); vcpu->exitintinfo = info; return (0); } enum exc_class { EXC_BENIGN, EXC_CONTRIBUTORY, EXC_PAGEFAULT }; #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ static enum exc_class exception_class(uint64_t info) { int type, vector; KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); type = info & VM_INTINFO_TYPE; vector = info & 0xff; /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ switch (type) { case VM_INTINFO_HWINTR: case VM_INTINFO_SWINTR: case VM_INTINFO_NMI: return (EXC_BENIGN); default: /* * Hardware exception. * * SVM and VT-x use identical type values to represent NMI, * hardware interrupt and software interrupt. * * SVM uses type '3' for all exceptions. VT-x uses type '3' * for exceptions except #BP and #OF. #BP and #OF use a type * value of '5' or '6'. Therefore we don't check for explicit * values of 'type' to classify 'intinfo' into a hardware * exception. */ break; } switch (vector) { case IDT_PF: case IDT_VE: return (EXC_PAGEFAULT); case IDT_DE: case IDT_TS: case IDT_NP: case IDT_SS: case IDT_GP: return (EXC_CONTRIBUTORY); default: return (EXC_BENIGN); } } static int nested_fault(struct vcpu *vcpu, uint64_t info1, uint64_t info2, uint64_t *retinfo) { enum exc_class exc1, exc2; int type1, vector1; KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); /* * If an exception occurs while attempting to call the double-fault * handler the processor enters shutdown mode (aka triple fault). */ type1 = info1 & VM_INTINFO_TYPE; vector1 = info1 & 0xff; if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VMM_CTR2(vcpu, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); vm_suspend(vcpu->vm, VM_SUSPEND_TRIPLEFAULT); *retinfo = 0; return (0); } /* * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 */ exc1 = exception_class(info1); exc2 = exception_class(info2); if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { /* Convert nested fault into a double fault. */ *retinfo = IDT_DF; *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; *retinfo |= VM_INTINFO_DEL_ERRCODE; } else { /* Handle exceptions serially */ *retinfo = info2; } return (1); } static uint64_t vcpu_exception_intinfo(struct vcpu *vcpu) { uint64_t info = 0; if (vcpu->exception_pending) { info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); } int vm_entry_intinfo(struct vcpu *vcpu, uint64_t *retinfo) { uint64_t info1, info2; int valid; info1 = vcpu->exitintinfo; vcpu->exitintinfo = 0; info2 = 0; if (vcpu->exception_pending) { info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VMM_CTR2(vcpu, "Exception %d delivered: %#lx", vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { valid = nested_fault(vcpu, info1, info2, retinfo); } else if (info1 & VM_INTINFO_VALID) { *retinfo = info1; valid = 1; } else if (info2 & VM_INTINFO_VALID) { *retinfo = info2; valid = 1; } else { valid = 0; } if (valid) { VMM_CTR4(vcpu, "%s: info1(%#lx), info2(%#lx), " "retinfo(%#lx)", __func__, info1, info2, *retinfo); } return (valid); } int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) { *info1 = vcpu->exitintinfo; *info2 = vcpu_exception_intinfo(vcpu); return (0); } int vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { uint64_t regval; int error __diagused; if (vector < 0 || vector >= 32) return (EINVAL); /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. */ if (vector == IDT_DF) return (EINVAL); if (vcpu->exception_pending) { VMM_CTR2(vcpu, "Unable to inject exception %d due to " "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } if (errcode_valid) { /* * Exceptions don't deliver an error code in real mode. */ error = vm_get_register(vcpu, VM_REG_GUEST_CR0, ®val); KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); if (!(regval & CR0_PE)) errcode_valid = 0; } /* * From section 26.6.1 "Interruptibility State" in Intel SDM: * * Event blocking by "STI" or "MOV SS" is cleared after guest executes * one instruction or incurs an exception. */ error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, 0); KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", __func__, error)); if (restart_instruction) vm_restart_instruction(vcpu); vcpu->exception_pending = 1; vcpu->exc_vector = vector; vcpu->exc_errcode = errcode; vcpu->exc_errcode_valid = errcode_valid; VMM_CTR1(vcpu, "Exception %d pending", vector); return (0); } void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error __diagused, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); } void vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2) { int error __diagused; VMM_CTR2(vcpu, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vcpu, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); vm_inject_fault(vcpu, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vcpu *vcpu) { vcpu->nmi_pending = 1; vcpu_notify_event(vcpu, false); return (0); } int vm_nmi_pending(struct vcpu *vcpu) { return (vcpu->nmi_pending); } void vm_nmi_clear(struct vcpu *vcpu) { if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vcpu, VCPU_NMI_COUNT, 1); } static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); int vm_inject_extint(struct vcpu *vcpu) { vcpu->extint_pending = 1; vcpu_notify_event(vcpu, false); return (0); } int vm_extint_pending(struct vcpu *vcpu) { return (vcpu->extint_pending); } void vm_extint_clear(struct vcpu *vcpu) { if (vcpu->extint_pending == 0) panic("vm_extint_clear: inconsistent extint_pending state"); vcpu->extint_pending = 0; vmm_stat_incr(vcpu, VCPU_EXTINT_COUNT, 1); } int vm_get_capability(struct vcpu *vcpu, int type, int *retval) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_getcap(vcpu->cookie, type, retval)); } int vm_set_capability(struct vcpu *vcpu, int type, int val) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_setcap(vcpu->cookie, type, val)); } struct vm * vcpu_vm(struct vcpu *vcpu) { return (vcpu->vm); } int vcpu_vcpuid(struct vcpu *vcpu) { return (vcpu->vcpuid); } struct vcpu * vm_vcpu(struct vm *vm, int vcpuid) { return (&vm->vcpu[vcpuid]); } struct vlapic * vm_lapic(struct vcpu *vcpu) { return (vcpu->vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } bool vmm_is_pptdev(int bus, int slot, int func) { int b, f, i, n, s; char *val, *cp, *cp2; bool found; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck! */ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = false; for (i = 0; names[i] != NULL && !found; i++) { cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = true; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_lock(vcpu); error = vcpu_set_state_locked(vcpu, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu) { enum vcpu_state state; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_activate_cpu(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EBUSY); VMM_CTR0(vcpu, "activated"); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); return (0); } int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { vm->debug_cpus = vm->active_cpus; for (int i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i), false); } } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); vcpu_notify_event(vcpu, false); } return (0); } int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { CPU_ZERO(&vm->debug_cpus); } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) return (EINVAL); CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); } return (0); } int vcpu_debugged(struct vcpu *vcpu) { return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_debug_cpus(struct vm *vm) { return (vm->debug_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } void * vcpu_stats(struct vcpu *vcpu) { return (vcpu->stats); } int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state) { *state = vcpu->x2apic_state; return (0); } int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) { if (state >= X2APIC_STATE_LAST) return (EINVAL); vcpu->x2apic_state = state; vlapic_set_x2apic_state(vcpu, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (lapic_intr) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { ipi_cpu(hostcpu, vmm_ipinum); } } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. */ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) { vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } struct vmspace * vm_get_vmspace(struct vm *vm) { return (vm->vmspace); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } int vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { struct vm *vm = vcpu->vm; int error, i; /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to * call the rendezvous handler in case this 'vcpu' is one * of the targets of the rendezvous. */ VMM_CTR0(vcpu, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); error = vm_handle_rendezvous(vcpu); if (error != 0) return (error); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); VMM_CTR0(vcpu, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm->rendezvous_func = func; mtx_unlock(&vm->rendezvous_mtx); /* * Wake up any sleeping vcpus and trigger a VM-exit in any running * vcpus so they handle the rendezvous as soon as possible. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &dest)) vcpu_notify_event(vm_vcpu(vm, i), false); } return (vm_handle_rendezvous(vcpu)); } struct vatpic * vm_atpic(struct vm *vm) { return (vm->vatpic); } struct vatpit * vm_atpit(struct vm *vm) { return (vm->vatpit); } struct vpmtmr * vm_pmtmr(struct vm *vm) { return (vm->vpmtmr); } struct vrtc * vm_rtc(struct vm *vm) { return (vm->vrtc); } enum vm_reg_name vm_segment_name(int seg) { static enum vm_reg_name seg_names[] = { VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; KASSERT(seg >= 0 && seg < nitems(seg_names), ("%s: invalid segment encoding %d", __func__, seg)); return (seg_names[seg]); } void vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo) { int idx; for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; void *hva, *cookie; uint64_t gpa; bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; gla += n; nused++; } for (idx = 0; idx < nused; idx++) { hva = vm_gpa_hold(vcpu, copyinfo[idx].gpa, copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; copyinfo[idx].cookie = cookie; } if (idx != nused) { vm_copy_teardown(copyinfo, num_copyinfo); return (EFAULT); } else { *fault = 0; return (0); } } void vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len) { char *dst; int idx; dst = kaddr; idx = 0; while (len > 0) { bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; idx++; } } void vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) { const char *src; int idx; src = kaddr; idx = 0; while (len > 0) { bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; idx++; } } /* * Return the amount of in-use and wired memory for the VM. Since * these are global stats, only return the values with for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); VMM_STAT_DECLARE(VMM_MEM_WIRED); static void vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat) { if (vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * vmspace_resident_count(vcpu->vm->vmspace)); } } static void vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat) { if (vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE * pmap_wired_count(vmspace_pmap(vcpu->vm->vmspace))); } } VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); #ifdef BHYVE_SNAPSHOT static int vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta) { uint64_t tsc, now; int ret; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = &vm->vcpu[i]; SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); /* * Save the absolute TSC value by adding now to tsc_offset. * * It will be turned turned back into an actual offset when the * TSC restore function is called */ tsc = now + vcpu->tsc_offset; SNAPSHOT_VAR_OR_LEAVE(tsc, meta, ret, done); } done: return (ret); } static int vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta) { int ret; ret = vm_snapshot_vcpus(vm, meta); if (ret != 0) goto done; done: return (ret); } static int vm_snapshot_vcpu(struct vm *vm, struct vm_snapshot_meta *meta) { int error; struct vcpu *vcpu; uint16_t i, maxcpus; error = 0; maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = &vm->vcpu[i]; error = vmmops_vcpu_snapshot(vcpu->cookie, meta); if (error != 0) { printf("%s: failed to snapshot vmcs/vmcb data for " "vCPU: %d; error: %d\n", __func__, i, error); goto done; } } done: return (error); } /* * Save kernel-side structures to user-space for snapshotting. */ int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta) { int ret = 0; switch (meta->dev_req) { case STRUCT_VMX: ret = vmmops_snapshot(vm->cookie, meta); break; case STRUCT_VMCX: ret = vm_snapshot_vcpu(vm, meta); break; case STRUCT_VM: ret = vm_snapshot_vm(vm, meta); break; case STRUCT_VIOAPIC: ret = vioapic_snapshot(vm_ioapic(vm), meta); break; case STRUCT_VLAPIC: ret = vlapic_snapshot(vm, meta); break; case STRUCT_VHPET: ret = vhpet_snapshot(vm_hpet(vm), meta); break; case STRUCT_VATPIC: ret = vatpic_snapshot(vm_atpic(vm), meta); break; case STRUCT_VATPIT: ret = vatpit_snapshot(vm_atpit(vm), meta); break; case STRUCT_VPMTMR: ret = vpmtmr_snapshot(vm_pmtmr(vm), meta); break; case STRUCT_VRTC: ret = vrtc_snapshot(vm_rtc(vm), meta); break; default: printf("%s: failed to find the requested type %#x\n", __func__, meta->dev_req); ret = (EINVAL); } return (ret); } void vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset) { vcpu->tsc_offset = offset; } int vm_restore_time(struct vm *vm) { int error; uint64_t now; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); error = vhpet_restore_time(vm_hpet(vm)); if (error) return (error); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = &vm->vcpu[i]; error = vmmops_restore_tsc(vcpu->cookie, vcpu->tsc_offset - now); if (error) return (error); } return (0); } #endif diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index a007a25aa9b5..249131b16464 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -1,1330 +1,1330 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_mem.h" #include "io/ppt.h" #include "io/vatpic.h" #include "io/vioapic.h" #include "io/vhpet.h" #include "io/vrtc.h" #ifdef COMPAT_FREEBSD13 struct vm_stats_old { int cpuid; /* in */ int num_entries; /* out */ struct timeval tv; uint64_t statbuf[MAX_VM_STATS]; }; #define VM_STATS_OLD \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats_old) #endif struct devmem_softc { int segid; char *name; struct cdev *cdev; struct vmmdev_softc *sc; SLIST_ENTRY(devmem_softc) link; }; struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; struct ucred *ucred; SLIST_ENTRY(vmmdev_softc) link; SLIST_HEAD(, devmem_softc) devmem; int flags; }; #define VSC_LINKED 0x01 static SLIST_HEAD(, vmmdev_softc) head; static unsigned pr_allow_flag; static struct mtx vmmdev_mtx; +MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF); static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); static int vmm_priv_check(struct ucred *ucred); static int devmem_create_cdev(const char *vmname, int id, char *devmem); static void devmem_destroy(void *arg); static int vmm_priv_check(struct ucred *ucred) { if (jailed(ucred) && !(ucred->cr_prison->pr_allow & pr_allow_flag)) return (EPERM); return (0); } static int vcpu_lock_one(struct vmmdev_softc *sc, int vcpu) { int error; if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm)) return (EINVAL); error = vcpu_set_state(vm_vcpu(sc->vm, vcpu), VCPU_FROZEN, true); return (error); } static void vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid) { struct vcpu *vcpu; enum vcpu_state state; vcpu = vm_vcpu(sc->vm, vcpuid); state = vcpu_get_state(vcpu, NULL); if (state != VCPU_FROZEN) { panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm), vcpuid, state); } vcpu_set_state(vcpu, VCPU_IDLE, false); } static int vcpu_lock_all(struct vmmdev_softc *sc) { int error, vcpu; uint16_t maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (vcpu = 0; vcpu < maxcpus; vcpu++) { error = vcpu_lock_one(sc, vcpu); if (error) break; } if (error) { while (--vcpu >= 0) vcpu_unlock_one(sc, vcpu); } return (error); } static void vcpu_unlock_all(struct vmmdev_softc *sc) { int vcpu; uint16_t maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (vcpu = 0; vcpu < maxcpus; vcpu++) vcpu_unlock_one(sc, vcpu); } static struct vmmdev_softc * vmmdev_lookup(const char *name) { struct vmmdev_softc *sc; #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } if (sc == NULL) return (NULL); if (cr_cansee(curthread->td_ucred, sc->ucred)) return (NULL); return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { return (cdev->si_drv1); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c, prot; vm_paddr_t gpa, maxaddr; void *hpa, *cookie; struct vmmdev_softc *sc; struct vcpu *vcpu; uint16_t lastcpu; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); /* * Get a read lock on the guest memory map by freezing any vcpu. */ lastcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, lastcpu); if (error) return (error); vcpu = vm_vcpu(sc->vm, lastcpu); prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); maxaddr = vmm_sysmem_maxaddr(sc->vm); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. * * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa_hold(vcpu, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ && gpa < maxaddr) error = uiomove(__DECONST(void *, zero_region), c, uio); else error = EFAULT; } else { error = uiomove(hpa, c, uio); vm_gpa_release(cookie); } } vcpu_unlock_one(sc, lastcpu); return (error); } CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1); static int get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { struct devmem_softc *dsc; int error; bool sysmem; error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); if (error || mseg->len == 0) return (error); if (!sysmem) { SLIST_FOREACH(dsc, &sc->devmem, link) { if (dsc->segid == mseg->segid) break; } KASSERT(dsc != NULL, ("%s: devmem segment %d not found", __func__, mseg->segid)); error = copystr(dsc->name, mseg->name, len, NULL); } else { bzero(mseg->name, len); } return (error); } static int alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { char *name; int error; bool sysmem; error = 0; name = NULL; sysmem = true; /* * The allocation is lengthened by 1 to hold a terminating NUL. It'll * by stripped off when devfs processes the full string. */ if (VM_MEMSEG_NAME(mseg)) { sysmem = false; name = malloc(len, M_VMMDEV, M_WAITOK); error = copystr(mseg->name, name, len, NULL); if (error) goto done; } error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); if (error) goto done; if (VM_MEMSEG_NAME(mseg)) { error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name); if (error) vm_free_memseg(sc->vm, mseg->segid); else name = NULL; /* freed when 'cdev' is destroyed */ } done: free(name, M_VMMDEV); return (error); } static int vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_get_register(vcpu, regnum[i], ®val[i]); if (error) break; } return (error); } static int vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_set_register(vcpu, regnum[i], regval[i]); if (error) break; } return (error); } static struct vcpu * lookup_vcpu(struct vm *vm, int vcpuid) { if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) return (NULL); return (vm_vcpu(vm, vcpuid)); } static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error, vcpuid, state_changed, size; cpuset_t *cpuset; struct vmmdev_softc *sc; struct vcpu *vcpu; struct vm_register *vmreg; struct vm_seg_desc *vmsegdesc; struct vm_register_set *vmregset; struct vm_run *vmrun; struct vm_exception *vmexc; struct vm_lapic_irq *vmirq; struct vm_lapic_msi *vmmsi; struct vm_ioapic_irq *ioapic_irq; struct vm_isa_irq *isa_irq; struct vm_isa_irq_trigger *isa_irq_trigger; struct vm_capability *vmcap; struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; struct vm_pptdev_msix *pptmsix; #ifdef COMPAT_FREEBSD13 struct vm_stats_old *vmstats_old; #endif struct vm_stats *vmstats; struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; struct vm_gpa_pte *gpapte; struct vm_suspend *vmsuspend; struct vm_gla2gpa *gg; struct vm_cpuset *vm_cpuset; struct vm_intinfo *vmii; struct vm_rtc_time *rtctime; struct vm_rtc_data *rtcdata; struct vm_memmap *mm; struct vm_munmap *mu; struct vm_cpu_topology *topology; struct vm_readwrite_kernemu_device *kernemu; uint64_t *regvals; int *regnums; #ifdef BHYVE_SNAPSHOT struct vm_snapshot_meta *snapshot_meta; #endif error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); vcpuid = -1; vcpu = NULL; state_changed = 0; /* * For VMM ioctls that operate on a single vCPU, lookup the * vcpu. For VMM ioctls which require one or more vCPUs to * not be running, lock necessary vCPUs. * * XXX fragile, handle with care * Most of these assume that the first field of the ioctl data * is the vcpuid. */ switch (cmd) { case VM_RUN: case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: case VM_GET_REGISTER_SET: case VM_SET_REGISTER_SET: case VM_INJECT_EXCEPTION: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_PPTDEV_MSI: case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: case VM_GLA2GPA: case VM_GLA2GPA_NOFAULT: case VM_ACTIVATE_CPU: case VM_SET_INTINFO: case VM_GET_INTINFO: case VM_RESTART_INSTRUCTION: /* * ioctls that can operate only on vcpus that are not running. */ vcpuid = *(int *)data; error = vcpu_lock_one(sc, vcpuid); if (error) goto done; state_changed = 1; vcpu = vm_vcpu(sc->vm, vcpuid); break; case VM_MAP_PPTDEV_MMIO: case VM_UNMAP_PPTDEV_MMIO: case VM_BIND_PPTDEV: case VM_UNBIND_PPTDEV: #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_FBSD12: #endif case VM_ALLOC_MEMSEG: case VM_MMAP_MEMSEG: case VM_MUNMAP_MEMSEG: case VM_REINIT: /* * ioctls that operate on the entire virtual machine must * prevent all vcpus from running. */ error = vcpu_lock_all(sc); if (error) goto done; state_changed = 2; break; #ifdef COMPAT_FREEBSD12 case VM_GET_MEMSEG_FBSD12: #endif case VM_GET_MEMSEG: case VM_MMAP_GETNEXT: /* * Lock a vcpu to make sure that the memory map cannot be * modified while it is being inspected. */ vcpuid = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, vcpuid); if (error) goto done; state_changed = 1; break; #ifdef COMPAT_FREEBSD13 case VM_STATS_OLD: #endif case VM_STATS: case VM_INJECT_NMI: case VM_LAPIC_IRQ: case VM_GET_X2APIC_STATE: /* * These do not need the vCPU locked but do operate on * a specific vCPU. */ vcpuid = *(int *)data; vcpu = lookup_vcpu(sc->vm, vcpuid); if (vcpu == NULL) { error = EINVAL; goto done; } break; case VM_LAPIC_LOCAL_IRQ: case VM_SUSPEND_CPU: case VM_RESUME_CPU: /* * These can either operate on all CPUs via a vcpuid of * -1 or on a specific vCPU. */ vcpuid = *(int *)data; if (vcpuid == -1) break; vcpu = lookup_vcpu(sc->vm, vcpuid); if (vcpu == NULL) { error = EINVAL; goto done; } break; default: break; } switch(cmd) { case VM_RUN: vmrun = (struct vm_run *)data; error = vm_run(vcpu, &vmrun->vm_exit); break; case VM_SUSPEND: vmsuspend = (struct vm_suspend *)data; error = vm_suspend(sc->vm, vmsuspend->how); break; case VM_REINIT: error = vm_reinit(sc->vm); break; case VM_STAT_DESC: { statdesc = (struct vm_stat_desc *)data; error = vmm_stat_desc_copy(statdesc->index, statdesc->desc, sizeof(statdesc->desc)); break; } #ifdef COMPAT_FREEBSD13 case VM_STATS_OLD: vmstats_old = (struct vm_stats_old *)data; getmicrotime(&vmstats_old->tv); error = vmm_stat_copy(vcpu, 0, nitems(vmstats_old->statbuf), &vmstats_old->num_entries, vmstats_old->statbuf); break; #endif case VM_STATS: { vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); error = vmm_stat_copy(vcpu, vmstats->index, nitems(vmstats->statbuf), &vmstats->num_entries, vmstats->statbuf); break; } case VM_PPTDEV_MSI: pptmsi = (struct vm_pptdev_msi *)data; error = ppt_setup_msi(sc->vm, pptmsi->bus, pptmsi->slot, pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec); break; case VM_PPTDEV_MSIX: pptmsix = (struct vm_pptdev_msix *)data; error = ppt_setup_msix(sc->vm, pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg, pptmsix->vector_control); break; case VM_PPTDEV_DISABLE_MSIX: pptdev = (struct vm_pptdev *)data; error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa); break; case VM_UNMAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len); break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_UNBIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_INJECT_EXCEPTION: vmexc = (struct vm_exception *)data; error = vm_inject_exception(vcpu, vmexc->vector, vmexc->error_code_valid, vmexc->error_code, vmexc->restart_instruction); break; case VM_INJECT_NMI: error = vm_inject_nmi(vcpu); break; case VM_LAPIC_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_intr_edge(vcpu, vmirq->vector); break; case VM_LAPIC_LOCAL_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_set_local_intr(sc->vm, vcpu, vmirq->vector); break; case VM_LAPIC_MSI: vmmsi = (struct vm_lapic_msi *)data; error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); break; case VM_IOAPIC_ASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_DEASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PULSE_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PINCOUNT: *(int *)data = vioapic_pincount(sc->vm); break; case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: { mem_region_write_t mwrite; mem_region_read_t mread; bool arg; kernemu = (void *)data; if (kernemu->access_width > 0) size = (1u << kernemu->access_width); else size = 1; if (kernemu->gpa >= DEFAULT_APIC_BASE && kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (kernemu->gpa >= VIOAPIC_BASE && kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (kernemu->gpa >= VHPET_BASE && kernemu->gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { error = EINVAL; break; } if (cmd == VM_SET_KERNEMU_DEV) error = mwrite(vcpu, kernemu->gpa, kernemu->value, size, &arg); else error = mread(vcpu, kernemu->gpa, &kernemu->value, size, &arg); break; } case VM_ISA_ASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_assert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_DEASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_deassert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_PULSE_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_SET_IRQ_TRIGGER: isa_irq_trigger = (struct vm_isa_irq_trigger *)data; error = vatpic_set_irq_trigger(sc->vm, isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); break; case VM_MMAP_GETNEXT: mm = (struct vm_memmap *)data; error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, &mm->segoff, &mm->len, &mm->prot, &mm->flags); break; case VM_MMAP_MEMSEG: mm = (struct vm_memmap *)data; error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, mm->len, mm->prot, mm->flags); break; case VM_MUNMAP_MEMSEG: mu = (struct vm_munmap *)data; error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len); break; #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_FBSD12: error = alloc_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg_fbsd12 *)0)->name)); break; #endif case VM_ALLOC_MEMSEG: error = alloc_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; #ifdef COMPAT_FREEBSD12 case VM_GET_MEMSEG_FBSD12: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg_fbsd12 *)0)->name)); break; #endif case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; case VM_GET_REGISTER: vmreg = (struct vm_register *)data; error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval); break; case VM_SET_REGISTER: vmreg = (struct vm_register *)data; error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval); break; case VM_SET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_set_seg_desc(vcpu, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_get_seg_desc(vcpu, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = vm_get_register_set(vcpu, vmregset->count, regnums, regvals); if (error == 0) error = copyout(regvals, vmregset->regvals, sizeof(regvals[0]) * vmregset->count); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_SET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = copyin(vmregset->regvals, regvals, sizeof(regvals[0]) * vmregset->count); if (error == 0) error = vm_set_register_set(vcpu, vmregset->count, regnums, regvals); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_GET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval); break; case VM_SET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval); break; case VM_SET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_set_x2apic_state(vcpu, x2apic->state); break; case VM_GET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_get_x2apic_state(vcpu, &x2apic->state); break; case VM_GET_GPA_PMAP: gpapte = (struct vm_gpa_pte *)data; pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), gpapte->gpa, gpapte->pte, &gpapte->ptenum); error = 0; break; case VM_GET_HPET_CAPABILITIES: error = vhpet_getcap((struct vm_hpet_cap *)data); break; case VM_GLA2GPA: { CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa(vcpu, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; } case VM_GLA2GPA_NOFAULT: gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; case VM_ACTIVATE_CPU: error = vm_activate_cpu(vcpu); break; case VM_GET_CPUS: error = 0; vm_cpuset = (struct vm_cpuset *)data; size = vm_cpuset->cpusetsize; if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { error = ERANGE; break; } cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); if (vm_cpuset->which == VM_ACTIVE_CPUS) *cpuset = vm_active_cpus(sc->vm); else if (vm_cpuset->which == VM_SUSPENDED_CPUS) *cpuset = vm_suspended_cpus(sc->vm); else if (vm_cpuset->which == VM_DEBUG_CPUS) *cpuset = vm_debug_cpus(sc->vm); else error = EINVAL; if (error == 0) error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; case VM_SUSPEND_CPU: error = vm_suspend_cpu(sc->vm, vcpu); break; case VM_RESUME_CPU: error = vm_resume_cpu(sc->vm, vcpu); break; case VM_SET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_exit_intinfo(vcpu, vmii->info1); break; case VM_GET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2); break; case VM_RTC_WRITE: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_write(sc->vm, rtcdata->offset, rtcdata->value); break; case VM_RTC_READ: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_read(sc->vm, rtcdata->offset, &rtcdata->value); break; case VM_RTC_SETTIME: rtctime = (struct vm_rtc_time *)data; error = vrtc_set_time(sc->vm, rtctime->secs); break; case VM_RTC_GETTIME: error = 0; rtctime = (struct vm_rtc_time *)data; rtctime->secs = vrtc_get_time(sc->vm); break; case VM_RESTART_INSTRUCTION: error = vm_restart_instruction(vcpu); break; case VM_SET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; error = vm_set_topology(sc->vm, topology->sockets, topology->cores, topology->threads, topology->maxcpus); break; case VM_GET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; vm_get_topology(sc->vm, &topology->sockets, &topology->cores, &topology->threads, &topology->maxcpus); error = 0; break; #ifdef BHYVE_SNAPSHOT case VM_SNAPSHOT_REQ: snapshot_meta = (struct vm_snapshot_meta *)data; error = vm_snapshot_req(sc->vm, snapshot_meta); break; case VM_RESTORE_TIME: error = vm_restore_time(sc->vm); break; #endif default: error = ENOTTY; break; } if (state_changed == 1) vcpu_unlock_one(sc, vcpuid); else if (state_changed == 2) vcpu_unlock_all(sc); done: /* * Make sure that no handler returns a kernel-internal * error value to userspace. */ KASSERT(error == ERESTART || error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); } static int vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, struct vm_object **objp, int nprot) { struct vmmdev_softc *sc; vm_paddr_t gpa; size_t len; vm_ooffset_t segoff, first, last; int error, found, segid; uint16_t lastcpu; bool sysmem; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); first = *offset; last = first + mapsize; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); sc = vmmdev_lookup2(cdev); if (sc == NULL) { /* virtual machine is in the process of being created */ return (EINVAL); } /* * Get a read lock on the guest memory map by freezing any vcpu. */ lastcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, lastcpu); if (error) return (error); gpa = 0; found = 0; while (!found) { error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, NULL, NULL); if (error) break; if (first >= gpa && last <= gpa + len) found = 1; else gpa += len; } if (found) { error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); KASSERT(error == 0 && *objp != NULL, ("%s: invalid memory segment %d", __func__, segid)); if (sysmem) { vm_object_reference(*objp); *offset = segoff + (first - gpa); } else { error = EINVAL; } } vcpu_unlock_one(sc, lastcpu); return (error); } static void vmmdev_destroy(void *arg) { struct vmmdev_softc *sc = arg; struct devmem_softc *dsc; int error __diagused; error = vcpu_lock_all(sc); KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); SLIST_REMOVE_HEAD(&sc->devmem, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } if (sc->cdev != NULL) destroy_dev(sc->cdev); if (sc->vm != NULL) vm_destroy(sc->vm); if (sc->ucred != NULL) crfree(sc->ucred); if ((sc->flags & VSC_LINKED) != 0) { mtx_lock(&vmmdev_mtx); SLIST_REMOVE(&head, sc, vmmdev_softc, link); mtx_unlock(&vmmdev_mtx); } free(sc, M_VMMDEV); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc == NULL || sc->cdev == NULL) { mtx_unlock(&vmmdev_mtx); error = EINVAL; goto out; } /* * Setting 'sc->cdev' to NULL is used to indicate that the VM * is scheduled for destruction. */ cdev = sc->cdev; sc->cdev = NULL; mtx_unlock(&vmmdev_mtx); /* * Destroy all cdevs: * * - any new operations on the 'cdev' will return an error (ENXIO). * * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' */ SLIST_FOREACH(dsc, &sc->devmem, link) { KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); destroy_dev(dsc->cdev); devmem_destroy(dsc); } destroy_dev(cdev); vmmdev_destroy(sc); error = 0; out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_destroy, "A", NULL); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_ioctl = vmmdev_ioctl, .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { struct vm *vm; struct cdev *cdev; struct vmmdev_softc *sc, *sc2; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); mtx_unlock(&vmmdev_mtx); if (sc != NULL) { error = EEXIST; goto out; } error = vm_create(buf, &vm); if (error != 0) goto out; sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->ucred = crhold(curthread->td_ucred); sc->vm = vm; SLIST_INIT(&sc->devmem); /* * Lookup the name again just in case somebody sneaked in when we * dropped the lock. */ mtx_lock(&vmmdev_mtx); sc2 = vmmdev_lookup(buf); if (sc2 == NULL) { SLIST_INSERT_HEAD(&head, sc, link); sc->flags |= VSC_LINKED; } mtx_unlock(&vmmdev_mtx); if (sc2 != NULL) { vmmdev_destroy(sc); error = EEXIST; goto out; } error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred, UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); if (error != 0) { vmmdev_destroy(sc); goto out; } mtx_lock(&vmmdev_mtx); sc->cdev = cdev; sc->cdev->si_drv1 = sc; mtx_unlock(&vmmdev_mtx); out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_create, "A", NULL); void vmmdev_init(void) { - mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, "Allow use of vmm in a jail."); } int vmmdev_cleanup(void) { int error; if (SLIST_EMPTY(&head)) error = 0; else error = EBUSY; return (error); } static int devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, struct vm_object **objp, int nprot) { struct devmem_softc *dsc; vm_ooffset_t first, last; size_t seglen; int error; uint16_t lastcpu; bool sysmem; dsc = cdev->si_drv1; if (dsc == NULL) { /* 'cdev' has been created but is not ready for use */ return (ENXIO); } first = *offset; last = *offset + len; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1; error = vcpu_lock_one(dsc->sc, lastcpu); if (error) return (error); error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); KASSERT(error == 0 && !sysmem && *objp != NULL, ("%s: invalid devmem segment %d", __func__, dsc->segid)); vcpu_unlock_one(dsc->sc, lastcpu); if (seglen >= last) { vm_object_reference(*objp); return (0); } else { return (EINVAL); } } static struct cdevsw devmemsw = { .d_name = "devmem", .d_version = D_VERSION, .d_mmap_single = devmem_mmap_single, }; static int devmem_create_cdev(const char *vmname, int segid, char *devname) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; int error; error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname); if (error) return (error); dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(vmname); KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname)); if (sc->cdev == NULL) { /* virtual machine is being created or destroyed */ mtx_unlock(&vmmdev_mtx); free(dsc, M_VMMDEV); destroy_dev_sched_cb(cdev, NULL, 0); return (ENODEV); } dsc->segid = segid; dsc->name = devname; dsc->cdev = cdev; dsc->sc = sc; SLIST_INSERT_HEAD(&sc->devmem, dsc, link); mtx_unlock(&vmmdev_mtx); /* The 'cdev' is ready for use after 'si_drv1' is initialized */ cdev->si_drv1 = dsc; return (0); } static void devmem_destroy(void *arg) { struct devmem_softc *dsc = arg; KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__)); dsc->cdev = NULL; dsc->sc = NULL; }