diff --git a/sys/amd64/vmm/amd/amdvi_hw.c b/sys/amd64/vmm/amd/amdvi_hw.c index 87283325600c..831c31277570 100644 --- a/sys/amd64/vmm/amd/amdvi_hw.c +++ b/sys/amd64/vmm/amd/amdvi_hw.c @@ -1,1382 +1,1387 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ivhd_if.h" #include "pcib_if.h" #include "io/iommu.h" #include "amdvi_priv.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); #define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s))) #define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s))) /* Print RID or device ID in PCI string format. */ #define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d) static void amdvi_dump_cmds(struct amdvi_softc *softc, int count); static void amdvi_print_dev_cap(struct amdvi_softc *softc); MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi"); extern device_t *ivhd_devs; extern int ivhd_count; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count, 0, NULL); static int amdvi_enable_user = 0; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN, &amdvi_enable_user, 0, NULL); TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user); #ifdef AMDVI_ATS_ENABLE /* XXX: ATS is not tested. */ static int amdvi_enable_iotlb = 1; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN, &amdvi_enable_iotlb, 0, NULL); TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb); #endif static int amdvi_host_ptp = 1; /* Use page tables for host. */ SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN, &amdvi_host_ptp, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp); /* Page table level used <= supported by h/w[v1=7]. */ int amdvi_ptp_level = 4; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN, &amdvi_ptp_level, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level); /* Disable fault event reporting. */ static int amdvi_disable_io_fault = 0; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN, &amdvi_disable_io_fault, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault); static uint32_t amdvi_dom_id = 0; /* 0 is reserved for host. */ SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD, &amdvi_dom_id, 0, NULL); /* * Device table entry. * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes). * = 256 * 2 * PAGE_SIZE. */ static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE); CTASSERT(PCI_NUM_DEV_MAX == 0x10000); CTASSERT(sizeof(amdvi_dte) == 0x200000); static SLIST_HEAD (, amdvi_domain) dom_head; static inline uint32_t amdvi_pci_read(struct amdvi_softc *softc, int off) { return (pci_cfgregread(softc->pci_seg, PCI_RID2BUS(softc->pci_rid), PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), off, 4)); } #ifdef AMDVI_ATS_ENABLE /* XXX: Should be in pci.c */ /* * Check if device has ATS capability and its enabled. * If ATS is absent or disabled, return (-1), otherwise ATS * queue length. */ static int amdvi_find_ats_qlen(uint16_t devid) { device_t dev; uint32_t off, cap; int qlen = -1; dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid), PCI_RID2FUNC(devid)); if (!dev) { return (-1); } #define PCIM_ATS_EN BIT(31) if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) { cap = pci_read_config(dev, off + 4, 4); qlen = (cap & 0x1F); qlen = qlen ? qlen : 32; printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n", RID2PCI_STR(devid), (cap & PCIM_ATS_EN) ? "enabled" : "Disabled", qlen); qlen = (cap & PCIM_ATS_EN) ? qlen : -1; } return (qlen); } /* * Check if an endpoint device support device IOTLB or ATS. */ static inline bool amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid) { struct ivhd_dev_cfg *cfg; int qlen, i; bool pci_ats, ivhd_ats; qlen = amdvi_find_ats_qlen(devid); if (qlen < 0) return (false); KASSERT(softc, ("softc is NULL")); cfg = softc->dev_cfg; ivhd_ats = false; for (i = 0; i < softc->dev_cfg_cnt; i++) { if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) { ivhd_ats = cfg->enable_ats; break; } cfg++; } pci_ats = (qlen < 0) ? false : true; if (pci_ats != ivhd_ats) device_printf(softc->dev, "BIOS bug: mismatch in ATS setting for %d.%d.%d," "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen); /* Ignore IVRS setting and respect PCI setting. */ return (pci_ats); } #endif /* Enable IOTLB support for IOMMU if its supported. */ static inline void amdvi_hw_enable_iotlb(struct amdvi_softc *softc) { #ifndef AMDVI_ATS_ENABLE softc->iotlb = false; #else bool supported; supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? true : false; if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) { if (!supported) device_printf(softc->dev, "IOTLB disabled by BIOS.\n"); if (supported && !amdvi_enable_iotlb) { device_printf(softc->dev, "IOTLB disabled by user.\n"); supported = false; } } else supported = false; softc->iotlb = supported; #endif } static int amdvi_init_cmd(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl = softc->ctrl; ctrl->cmd.len = 8; /* Use 256 command buffer entries. */ softc->cmd_max = 1 << ctrl->cmd.len; softc->cmd = malloc(sizeof(struct amdvi_cmd) * softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO); if ((uintptr_t)softc->cmd & PAGE_MASK) panic("AMDVi: Command buffer not aligned on page boundary."); ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE; /* * XXX: Reset the h/w pointers in case IOMMU is restarting, * h/w doesn't clear these pointers based on empirical data. */ ctrl->cmd_tail = 0; ctrl->cmd_head = 0; return (0); } /* * Note: Update tail pointer after we have written the command since tail * pointer update cause h/w to execute new commands, see section 3.3 * of AMD IOMMU spec ver 2.0. */ /* Get the command tail pointer w/o updating it. */ static struct amdvi_cmd * amdvi_get_cmd_tail(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; struct amdvi_cmd *tail; KASSERT(softc, ("softc is NULL")); KASSERT(softc->cmd != NULL, ("cmd is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd + ctrl->cmd_tail); return (tail); } /* * Update the command tail pointer which will start command execution. */ static void amdvi_update_cmd_tail(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; int size; size = sizeof(struct amdvi_cmd); KASSERT(softc->cmd != NULL, ("cmd is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max); softc->total_cmd++; #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "cmd_tail: %s Tail:0x%x, Head:0x%x.\n", ctrl->cmd_tail, ctrl->cmd_head); #endif } /* * Various commands supported by IOMMU. */ /* Completion wait command. */ static void amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data) { struct amdvi_cmd *cmd; uint64_t pa; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); pa = vtophys(&softc->cmp_data); cmd->opcode = AMDVI_CMP_WAIT_OPCODE; cmd->word0 = (pa & 0xFFFFFFF8) | AMDVI_CMP_WAIT_STORE; cmd->word1 = (pa >> 32) & 0xFFFFF; cmd->addr = data; amdvi_update_cmd_tail(softc); } /* Invalidate device table entry. */ static void amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_DTE_OPCODE; cmd->word0 = devid; amdvi_update_cmd_tail(softc); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid); #endif } /* Invalidate IOMMU page, use for invalidation of domain. */ static void amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, uint64_t addr, bool guest_nested, bool pde, bool page) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_PAGE_OPCODE; cmd->word1 = domain_id; /* * Invalidate all addresses for this domain. */ cmd->addr = addr; cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0; cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0; amdvi_update_cmd_tail(softc); } #ifdef AMDVI_ATS_ENABLE /* Invalidate device IOTLB. */ static void amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; int qlen; if (!softc->iotlb) return; qlen = amdvi_find_ats_qlen(devid); if (qlen < 0) { panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n", qlen, RID2PCI_STR(devid)); } cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate IOTLB devID 0x%x" " Qlen:%d\n", devid, qlen); #endif cmd->opcode = AMDVI_INVD_IOTLB_OPCODE; cmd->word0 = devid; cmd->word1 = qlen; cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR | AMDVI_INVD_IOTLB_S; amdvi_update_cmd_tail(softc); } #endif #ifdef notyet /* For Interrupt Remap. */ static void amdvi_cmd_inv_intr_map(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_INTR_OPCODE; cmd->word0 = devid; amdvi_update_cmd_tail(softc); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid); #endif } #endif /* Invalidate domain using INVALIDATE_IOMMU_PAGES command. */ static void amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id) { struct amdvi_cmd *cmd __diagused; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); /* * See section 3.3.3 of IOMMU spec rev 2.0, software note * for invalidating domain. */ amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR, false, true, true); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id); #endif } static bool amdvi_cmp_wait(struct amdvi_softc *softc) { #ifdef AMDVI_DEBUG_CMD struct amdvi_ctrl *ctrl = softc->ctrl; #endif const uint64_t VERIFY = 0xA5A5; volatile uint64_t *read; int i; bool status; read = &softc->cmp_data; *read = 0; amdvi_cmd_cmp(softc, VERIFY); /* Wait for h/w to update completion data. */ for (i = 0; i < 100 && (*read != VERIFY); i++) { DELAY(1000); /* 1 ms */ } status = (VERIFY == softc->cmp_data) ? true : false; #ifdef AMDVI_DEBUG_CMD if (status) device_printf(softc->dev, "CMD completion DONE Tail:0x%x, " "Head:0x%x, loop:%d.\n", ctrl->cmd_tail, ctrl->cmd_head, loop); #endif return (status); } static void amdvi_wait(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; int i; KASSERT(softc, ("softc is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); /* Don't wait if h/w is not enabled. */ if ((ctrl->control & AMDVI_CTRL_EN) == 0) return; for (i = 0; i < 10; i++) { if (amdvi_cmp_wait(softc)) return; } device_printf(softc->dev, "Error: completion failed" " tail:0x%x, head:0x%x.\n", ctrl->cmd_tail, ctrl->cmd_head); /* Dump the last command. */ amdvi_dump_cmds(softc, 1); } static void amdvi_dump_cmds(struct amdvi_softc *softc, int count) { struct amdvi_ctrl *ctrl; struct amdvi_cmd *cmd; int off, i; ctrl = softc->ctrl; device_printf(softc->dev, "Dump last %d command(s):\n", count); /* * If h/w is stuck in completion, it is the previous command, * start dumping from previous command onward. */ off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd), softc->cmd_max); for (i = 0; off != ctrl->cmd_tail && i < count; i++) { cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off); printf(" [CMD%d, off:0x%x] opcode= 0x%x 0x%x" " 0x%x 0x%lx\n", i, off, cmd->opcode, cmd->word0, cmd->word1, cmd->addr); off = MOD_INC(off, sizeof(struct amdvi_cmd), softc->cmd_max); } } static int amdvi_init_event(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; ctrl = softc->ctrl; ctrl->event.len = 8; softc->event_max = 1 << ctrl->event.len; softc->event = malloc(sizeof(struct amdvi_event) * softc->event_max, M_AMDVI, M_WAITOK | M_ZERO); if ((uintptr_t)softc->event & PAGE_MASK) { device_printf(softc->dev, "Event buffer not aligned on page."); return (false); } ctrl->event.base = vtophys(softc->event) / PAGE_SIZE; /* Reset the pointers. */ ctrl->evt_head = 0; ctrl->evt_tail = 0; return (0); } static inline void amdvi_decode_evt_flag(uint16_t flag) { flag &= AMDVI_EVENT_FLAG_MASK; printf(" 0x%b]\n", flag, "\020" "\001GN" "\002NX" "\003US" "\004I" "\005PR" "\006RW" "\007PE" "\010RZ" "\011TR" ); } /* See section 2.5.4 of AMD IOMMU spec ver 2.62.*/ static inline void amdvi_decode_evt_flag_type(uint8_t type) { switch (AMDVI_EVENT_FLAG_TYPE(type)) { case 0: printf("RSVD\n"); break; case 1: printf("Master Abort\n"); break; case 2: printf("Target Abort\n"); break; case 3: printf("Data Err\n"); break; default: break; } } static void amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); } static void amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); } static void amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); amdvi_decode_evt_flag_type(flag); } static void amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); amdvi_decode_evt_flag_type(AMDVI_EVENT_FLAG_TYPE(flag)); } static void amdvi_decode_evt(struct amdvi_event *evt) { struct amdvi_cmd *cmd; switch (evt->opcode) { case AMDVI_EVENT_INVALID_DTE: amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_PFAULT: amdvi_decode_pf_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_DTE_HW_ERROR: amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_PAGE_HW_ERROR: amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_ILLEGAL_CMD: /* FALL THROUGH */ case AMDVI_EVENT_CMD_HW_ERROR: printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ? "ILLEGAL CMD" : "CMD HW ERR"); cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr); printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n", cmd->opcode, cmd->word0, cmd->word1, cmd->addr); break; case AMDVI_EVENT_IOTLB_TIMEOUT: printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n", evt->devid, evt->addr); break; case AMDVI_EVENT_INVALID_DTE_REQ: printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n", evt->devid, evt->addr, evt->flag >> 9, (evt->flag >> 8) & 1); break; case AMDVI_EVENT_INVALID_PPR_REQ: case AMDVI_EVENT_COUNTER_ZERO: printf("AMD-Vi: v2 events.\n"); break; default: printf("Unsupported AMD-Vi event:%d\n", evt->opcode); } } static void amdvi_print_events(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; struct amdvi_event *event; int i, size; ctrl = softc->ctrl; size = sizeof(struct amdvi_event); for (i = 0; i < softc->event_max; i++) { event = &softc->event[ctrl->evt_head / size]; if (!event->opcode) break; device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n", i, ctrl->evt_head, ctrl->evt_tail); amdvi_decode_evt(event); ctrl->evt_head = MOD_INC(ctrl->evt_head, size, softc->event_max); } } static int amdvi_init_dte(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; ctrl = softc->ctrl; ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE; ctrl->dte.size = 0x1FF; /* 2MB device table. */ return (0); } /* * Not all capabilities of IOMMU are available in ACPI IVHD flag * or EFR entry, read directly from device. */ static int amdvi_print_pci_cap(device_t dev) { struct amdvi_softc *softc; uint32_t off, cap; softc = device_get_softc(dev); off = softc->cap_off; /* * Section 3.7.1 of IOMMU sepc rev 2.0. * Read capability from device. */ cap = amdvi_pci_read(softc, off); /* Make sure capability type[18:16] is 3. */ KASSERT((((cap >> 16) & 0x7) == 0x3), ("Not a IOMMU capability 0x%x@0x%x", cap, off)); softc->pci_cap = cap >> 24; device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n", cap, off, softc->pci_cap, "\20\1IOTLB\2HT\3NPCache\4EFR\5CapExt"); return (0); } static void amdvi_event_intr(void *arg) { struct amdvi_softc *softc; struct amdvi_ctrl *ctrl; softc = (struct amdvi_softc *)arg; ctrl = softc->ctrl; device_printf(softc->dev, "EVT INTR %ld Status:0x%x" " EVT Head:0x%x Tail:0x%x]\n", softc->event_intr_cnt++, ctrl->status, ctrl->evt_head, ctrl->evt_tail); printf(" [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n", softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head); amdvi_print_events(softc); ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; } static void amdvi_free_evt_intr_res(device_t dev) { struct amdvi_softc *softc; device_t mmio_dev; softc = device_get_softc(dev); mmio_dev = softc->pci_dev; IVHD_TEARDOWN_INTR(mmio_dev); } static bool amdvi_alloc_intr_resources(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; device_t dev, mmio_dev; int err; dev = softc->dev; mmio_dev = softc->pci_dev; /* Clear interrupt status bits. */ ctrl = softc->ctrl; ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; err = IVHD_SETUP_INTR(mmio_dev, amdvi_event_intr, softc, "fault"); if (err) device_printf(dev, "Interrupt setup failed on %s\n", device_get_nameunit(mmio_dev)); return (err); } static void amdvi_print_dev_cap(struct amdvi_softc *softc) { struct ivhd_dev_cfg *cfg; int i; cfg = softc->dev_cfg; for (i = 0; i < softc->dev_cfg_cnt; i++) { device_printf(softc->dev, "device [0x%x - 0x%x] " "config:%b%s\n", cfg->start_id, cfg->end_id, cfg->data, "\020\001INIT\002ExtInt\003NMI" "\007LINT0\010LINT1", cfg->enable_ats ? "ATS enabled" : ""); cfg++; } } static int amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS) { struct amdvi_softc *softc; int result, type, error = 0; softc = (struct amdvi_softc *)arg1; type = arg2; switch (type) { case 0: result = softc->ctrl->cmd_head; error = sysctl_handle_int(oidp, &result, 0, req); break; case 1: result = softc->ctrl->cmd_tail; error = sysctl_handle_int(oidp, &result, 0, req); break; case 2: result = softc->ctrl->evt_head; error = sysctl_handle_int(oidp, &result, 0, req); break; case 3: result = softc->ctrl->evt_tail; error = sysctl_handle_int(oidp, &result, 0, req); break; default: device_printf(softc->dev, "Unknown sysctl:%d\n", type); } return (error); } static void amdvi_add_sysctl(struct amdvi_softc *softc) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev; dev = softc->dev; ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD, &softc->event_intr_cnt, "Event interrupt count"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD, &softc->total_cmd, "Command submitted count"); SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD, &softc->pci_rid, 0, "IOMMU RID"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 0, amdvi_handle_sysctl, "IU", "Command head"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 1, amdvi_handle_sysctl, "IU", "Command tail"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 2, amdvi_handle_sysctl, "IU", "Command head"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 3, amdvi_handle_sysctl, "IU", "Command tail"); } int amdvi_setup_hw(struct amdvi_softc *softc) { device_t dev; int status; dev = softc->dev; amdvi_hw_enable_iotlb(softc); amdvi_print_dev_cap(softc); if ((status = amdvi_print_pci_cap(dev)) != 0) { device_printf(dev, "PCI capability.\n"); return (status); } if ((status = amdvi_init_cmd(softc)) != 0) { device_printf(dev, "Couldn't configure command buffer.\n"); return (status); } if ((status = amdvi_init_event(softc)) != 0) { device_printf(dev, "Couldn't configure event buffer.\n"); return (status); } if ((status = amdvi_init_dte(softc)) != 0) { device_printf(dev, "Couldn't configure device table.\n"); return (status); } if ((status = amdvi_alloc_intr_resources(softc)) != 0) { return (status); } amdvi_add_sysctl(softc); return (0); } int amdvi_teardown_hw(struct amdvi_softc *softc) { device_t dev; dev = softc->dev; /* * Called after disable, h/w is stopped by now, free all the resources. */ amdvi_free_evt_intr_res(dev); if (softc->cmd) free(softc->cmd, M_AMDVI); if (softc->event) free(softc->event, M_AMDVI); return (0); } /*********** bhyve interfaces *********************/ static int amdvi_init(void) { if (!ivhd_count) { return (EIO); } if (!amdvi_enable_user && ivhd_count) { printf("bhyve: Found %d AMD-Vi/IOMMU device(s), " "use hw.vmm.amdvi.enable=1 to enable pass-through.\n", ivhd_count); return (EINVAL); } return (0); } static void amdvi_cleanup(void) { /* Nothing. */ } static uint16_t amdvi_domainId(void) { /* * If we hit maximum domain limit, rollover leaving host * domain(0). * XXX: make sure that this domain is not used. */ if (amdvi_dom_id == AMDVI_MAX_DOMAIN) amdvi_dom_id = 1; return ((uint16_t)amdvi_dom_id++); } static void amdvi_do_inv_domain(uint16_t domain_id, bool create) { struct amdvi_softc *softc; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL")); /* * If not present pages are cached, invalidate page after * creating domain. */ #if 0 if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0)) continue; #endif amdvi_inv_domain(softc, domain_id); amdvi_wait(softc); } } static void * amdvi_create_domain(vm_paddr_t maxaddr) { struct amdvi_domain *dom; dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK); dom->id = amdvi_domainId(); //dom->maxaddr = maxaddr; #ifdef AMDVI_DEBUG_CMD printf("Created domain #%d\n", dom->id); #endif /* * Host domain(#0) don't create translation table. */ if (dom->id || amdvi_host_ptp) dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); dom->ptp_level = amdvi_ptp_level; amdvi_do_inv_domain(dom->id, true); SLIST_INSERT_HEAD(&dom_head, dom, next); return (dom); } static void amdvi_free_ptp(uint64_t *ptp, int level) { int i; if (level < 1) return; for (i = 0; i < NPTEPG ; i++) { if ((ptp[i] & AMDVI_PT_PRESENT) == 0) continue; /* XXX: Add super-page or PTE mapping > 4KB. */ #ifdef notyet /* Super-page mapping. */ if (AMDVI_PD_SUPER(ptp[i])) continue; #endif amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i] & AMDVI_PT_MASK), level - 1); } free(ptp, M_AMDVI); } static void amdvi_destroy_domain(void *arg) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; KASSERT(domain, ("domain is NULL")); #ifdef AMDVI_DEBUG_CMD printf("Destroying domain %d\n", domain->id); #endif if (domain->ptp) amdvi_free_ptp(domain->ptp, domain->ptp_level); amdvi_do_inv_domain(domain->id, false); SLIST_REMOVE(&dom_head, domain, amdvi_domain, next); free(domain, M_AMDVI); } static uint64_t amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t pg_size, bool create) { uint64_t *page, pa; int shift, index; const int PT_SHIFT = 9; const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1; /* Based on PT_SHIFT */ if (!pg_size) return (0); if (hpa & (pg_size - 1)) { printf("HPA is not size aligned.\n"); return (0); } if (gpa & (pg_size - 1)) { printf("HPA is not size aligned.\n"); return (0); } shift = PML4SHIFT; while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) { index = (gpa >> shift) & PT_INDEX_MASK; if ((pt[index] == 0) && create) { page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); pa = vtophys(page); pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW | ((level - 1) << AMDVI_PD_LEVEL_SHIFT); } #ifdef AMDVI_DEBUG_PTE if ((gpa % 0x1000000) == 0) printf("[level%d, shift = %d]PTE:0x%lx\n", level, shift, pt[index]); #endif #define PTE2PA(x) ((uint64_t)(x) & AMDVI_PT_MASK) pa = PTE2PA(pt[index]); pt = (uint64_t *)PHYS_TO_DMAP(pa); shift -= PT_SHIFT; level--; } /* Leaf entry. */ index = (gpa >> shift) & PT_INDEX_MASK; if (create) { pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT; } else pt[index] = 0; #ifdef AMDVI_DEBUG_PTE if ((gpa % 0x1000000) == 0) printf("[Last level%d, shift = %d]PTE:0x%lx\n", level, shift, pt[index]); #endif return (1ULL << shift); } static uint64_t amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t size, bool create) { uint64_t mapped, *ptp, len; int level; KASSERT(domain, ("domain is NULL")); level = domain->ptp_level; KASSERT(level, ("Page table level is 0")); ptp = domain->ptp; KASSERT(ptp, ("PTP is NULL")); mapped = 0; while (mapped < size) { len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped, PAGE_SIZE, create); if (!len) { printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n", hpa, gpa); return (0); } mapped += len; } return (mapped); } -static uint64_t +static int amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, - uint64_t len) + uint64_t len, uint64_t *res_len) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; if (domain->id && !domain->ptp) { printf("ptp is NULL"); - return (-1); + return (EINVAL); } /* * If host domain is created w/o page table, skip IOMMU page * table set-up. */ if (domain->ptp) - return (amdvi_update_mapping(domain, gpa, hpa, len, true)); + *res_len = amdvi_update_mapping(domain, gpa, hpa, len, true); else - return (len); + *res_len = len; + return (0); } -static uint64_t -amdvi_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +static int +amdvi_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len, uint64_t *res_len) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; /* * If host domain is created w/o page table, skip IOMMU page * table set-up. */ if (domain->ptp) - return (amdvi_update_mapping(domain, gpa, 0, len, false)); - return - (len); + *res_len = amdvi_update_mapping(domain, gpa, 0, len, false); + else + *res_len = len; + return (0); } static struct amdvi_softc * amdvi_find_iommu(uint16_t devid) { struct amdvi_softc *softc; int i, j; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); for (j = 0; j < softc->dev_cfg_cnt; j++) if ((devid >= softc->dev_cfg[j].start_id) && (devid <= softc->dev_cfg[j].end_id)) return (softc); } return (NULL); } /* * Set-up device table entry. * IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must * be set concurrently, e.g. read and write bits. */ static void amdvi_set_dte(struct amdvi_domain *domain, struct amdvi_softc *softc, uint16_t devid, bool enable) { struct amdvi_dte* temp; KASSERT(domain, ("domain is NULL for pci_rid:0x%x\n", devid)); KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid)); temp = &amdvi_dte[devid]; #ifdef AMDVI_ATS_ENABLE /* If IOMMU and device support IOTLB, enable it. */ if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb) temp->iotlb_enable = 1; #endif /* Avoid duplicate I/O faults. */ temp->sup_second_io_fault = 1; temp->sup_all_io_fault = amdvi_disable_io_fault; temp->dt_valid = 1; temp->domain_id = domain->id; if (enable) { if (domain->ptp) { temp->pt_base = vtophys(domain->ptp) >> 12; temp->pt_level = amdvi_ptp_level; } /* * XXX: Page table valid[TV] bit must be set even if host domain * page tables are not enabled. */ temp->pt_valid = 1; temp->read_allow = 1; temp->write_allow = 1; } } static void amdvi_inv_device(struct amdvi_softc *softc, uint16_t devid) { KASSERT(softc, ("softc is NULL")); amdvi_cmd_inv_dte(softc, devid); #ifdef AMDVI_ATS_ENABLE if (amdvi_dev_support_iotlb(softc, devid)) amdvi_cmd_inv_iotlb(softc, devid); #endif amdvi_wait(softc); } -static void -amdvi_add_device(void *arg, uint16_t devid) +static int +amdvi_add_device(void *arg, device_t dev __unused, uint16_t devid) { struct amdvi_domain *domain; struct amdvi_softc *softc; domain = (struct amdvi_domain *)arg; KASSERT(domain != NULL, ("domain is NULL")); #ifdef AMDVI_DEBUG_CMD printf("Assigning device(%d.%d.%d) to domain:%d\n", RID2PCI_STR(devid), domain->id); #endif softc = amdvi_find_iommu(devid); if (softc == NULL) - return; + return (ENXIO); amdvi_set_dte(domain, softc, devid, true); amdvi_inv_device(softc, devid); + return (0); } -static void -amdvi_remove_device(void *arg, uint16_t devid) +static int +amdvi_remove_device(void *arg, device_t dev __unused, uint16_t devid) { struct amdvi_domain *domain; struct amdvi_softc *softc; domain = (struct amdvi_domain *)arg; #ifdef AMDVI_DEBUG_CMD printf("Remove device(0x%x) from domain:%d\n", devid, domain->id); #endif softc = amdvi_find_iommu(devid); if (softc == NULL) - return; + return (ENXIO); amdvi_set_dte(domain, softc, devid, false); amdvi_inv_device(softc, devid); + return (0); } static void amdvi_enable(void) { struct amdvi_ctrl *ctrl; struct amdvi_softc *softc; uint64_t val; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL\n")); ctrl = softc->ctrl; KASSERT(ctrl, ("ctrl is NULL\n")); val = ( AMDVI_CTRL_EN | AMDVI_CTRL_CMD | AMDVI_CTRL_ELOG | AMDVI_CTRL_ELOGINT | AMDVI_CTRL_INV_TO_1S); if (softc->ivhd_flag & IVHD_FLAG_COH) val |= AMDVI_CTRL_COH; if (softc->ivhd_flag & IVHD_FLAG_HTT) val |= AMDVI_CTRL_HTT; if (softc->ivhd_flag & IVHD_FLAG_RPPW) val |= AMDVI_CTRL_RPPW; if (softc->ivhd_flag & IVHD_FLAG_PPW) val |= AMDVI_CTRL_PPW; if (softc->ivhd_flag & IVHD_FLAG_ISOC) val |= AMDVI_CTRL_ISOC; ctrl->control = val; } } static void amdvi_disable(void) { struct amdvi_ctrl *ctrl; struct amdvi_softc *softc; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL\n")); ctrl = softc->ctrl; KASSERT(ctrl, ("ctrl is NULL\n")); ctrl->control = 0; } } -static void +static int amdvi_invalidate_tlb(void *arg) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; KASSERT(domain, ("domain is NULL")); amdvi_do_inv_domain(domain->id, false); + return (0); } const struct iommu_ops iommu_ops_amd = { .init = amdvi_init, .cleanup = amdvi_cleanup, .enable = amdvi_enable, .disable = amdvi_disable, .create_domain = amdvi_create_domain, .destroy_domain = amdvi_destroy_domain, .create_mapping = amdvi_create_mapping, .remove_mapping = amdvi_remove_mapping, .add_device = amdvi_add_device, .remove_device = amdvi_remove_device, .invalidate_tlb = amdvi_invalidate_tlb, }; diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c index 72cedeca6ec1..b56541290a9d 100644 --- a/sys/amd64/vmm/intel/vtd.c +++ b/sys/amd64/vmm/intel/vtd.c @@ -1,773 +1,779 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include "io/iommu.h" /* * Documented in the "Intel Virtualization Technology for Directed I/O", * Architecture Spec, September 2008. */ #define VTD_DRHD_INCLUDE_PCI_ALL(Flags) (((Flags) >> 0) & 0x1) /* Section 10.4 "Register Descriptions" */ struct vtdmap { volatile uint32_t version; volatile uint32_t res0; volatile uint64_t cap; volatile uint64_t ext_cap; volatile uint32_t gcr; volatile uint32_t gsr; volatile uint64_t rta; volatile uint64_t ccr; }; #define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) #define VTD_CAP_ND(cap) ((cap) & 0x7) #define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) #define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) #define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) #define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) #define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) #define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) #define VTD_GCR_WBF (1 << 27) #define VTD_GCR_SRTP (1 << 30) #define VTD_GCR_TE (1U << 31) #define VTD_GSR_WBFS (1 << 27) #define VTD_GSR_RTPS (1 << 30) #define VTD_GSR_TES (1U << 31) #define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ #define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ #define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ #define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ #define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ #define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ #define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ #define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ #define VTD_IIR_DOMAIN_P 32 #define VTD_ROOT_PRESENT 0x1 #define VTD_CTX_PRESENT 0x1 #define VTD_CTX_TT_ALL (1UL << 2) #define VTD_PTE_RD (1UL << 0) #define VTD_PTE_WR (1UL << 1) #define VTD_PTE_SUPERPAGE (1UL << 7) #define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) #define VTD_RID2IDX(rid) (((rid) & 0xff) * 2) struct domain { uint64_t *ptp; /* first level page table page */ int pt_levels; /* number of page table levels */ int addrwidth; /* 'AW' field in context entry */ int spsmask; /* supported super page sizes */ u_int id; /* domain id */ vm_paddr_t maxaddr; /* highest address to be mapped */ SLIST_ENTRY(domain) next; }; static SLIST_HEAD(, domain) domhead; #define DRHD_MAX_UNITS 16 static ACPI_DMAR_HARDWARE_UNIT *drhds[DRHD_MAX_UNITS]; static int drhd_num; static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; static int max_domains; typedef int (*drhd_ident_func_t)(void); static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); static int vtd_max_domains(struct vtdmap *vtdmap) { int nd; nd = VTD_CAP_ND(vtdmap->cap); switch (nd) { case 0: return (16); case 1: return (64); case 2: return (256); case 3: return (1024); case 4: return (4 * 1024); case 5: return (16 * 1024); case 6: return (64 * 1024); default: panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); } } static u_int domain_id(void) { u_int id; struct domain *dom; /* Skip domain id 0 - it is reserved when Caching Mode field is set */ for (id = 1; id < max_domains; id++) { SLIST_FOREACH(dom, &domhead, next) { if (dom->id == id) break; } if (dom == NULL) break; /* found it */ } if (id >= max_domains) panic("domain ids exhausted"); return (id); } static struct vtdmap * vtd_device_scope(uint16_t rid) { int i, remaining, pathremaining; char *end, *pathend; struct vtdmap *vtdmap; ACPI_DMAR_HARDWARE_UNIT *drhd; ACPI_DMAR_DEVICE_SCOPE *device_scope; ACPI_DMAR_PCI_PATH *path; for (i = 0; i < drhd_num; i++) { drhd = drhds[i]; if (VTD_DRHD_INCLUDE_PCI_ALL(drhd->Flags)) { /* * From Intel VT-d arch spec, version 3.0: * If a DRHD structure with INCLUDE_PCI_ALL flag Set is reported * for a Segment, it must be enumerated by BIOS after all other * DRHD structures for the same Segment. */ vtdmap = vtdmaps[i]; return(vtdmap); } end = (char *)drhd + drhd->Header.Length; remaining = drhd->Header.Length - sizeof(ACPI_DMAR_HARDWARE_UNIT); while (remaining > sizeof(ACPI_DMAR_DEVICE_SCOPE)) { device_scope = (ACPI_DMAR_DEVICE_SCOPE *)(end - remaining); remaining -= device_scope->Length; switch (device_scope->EntryType){ /* 0x01 and 0x02 are PCI device entries */ case 0x01: case 0x02: break; default: continue; } if (PCI_RID2BUS(rid) != device_scope->Bus) continue; pathend = (char *)device_scope + device_scope->Length; pathremaining = device_scope->Length - sizeof(ACPI_DMAR_DEVICE_SCOPE); while (pathremaining >= sizeof(ACPI_DMAR_PCI_PATH)) { path = (ACPI_DMAR_PCI_PATH *)(pathend - pathremaining); pathremaining -= sizeof(ACPI_DMAR_PCI_PATH); if (PCI_RID2SLOT(rid) != path->Device) continue; if (PCI_RID2FUNC(rid) != path->Function) continue; vtdmap = vtdmaps[i]; return (vtdmap); } } } /* No matching scope */ return (NULL); } static void vtd_wbflush(struct vtdmap *vtdmap) { if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) pmap_invalidate_cache(); if (VTD_CAP_RWBF(vtdmap->cap)) { vtdmap->gcr = VTD_GCR_WBF; while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) ; } } static void vtd_ctx_global_invalidate(struct vtdmap *vtdmap) { vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; while ((vtdmap->ccr & VTD_CCR_ICC) != 0) ; } static void vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) { int offset; volatile uint64_t *iotlb_reg, val; vtd_wbflush(vtdmap); offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; while (1) { val = *iotlb_reg; if ((val & VTD_IIR_IVT) == 0) break; } } static void vtd_translation_enable(struct vtdmap *vtdmap) { vtdmap->gcr = VTD_GCR_TE; while ((vtdmap->gsr & VTD_GSR_TES) == 0) ; } static void vtd_translation_disable(struct vtdmap *vtdmap) { vtdmap->gcr = 0; while ((vtdmap->gsr & VTD_GSR_TES) != 0) ; } static int vtd_init(void) { int i, units, remaining, tmp; struct vtdmap *vtdmap; vm_paddr_t ctx_paddr; char *end, envname[32]; unsigned long mapaddr; ACPI_STATUS status; ACPI_TABLE_DMAR *dmar; ACPI_DMAR_HEADER *hdr; ACPI_DMAR_HARDWARE_UNIT *drhd; /* * Allow the user to override the ACPI DMAR table by specifying the * physical address of each remapping unit. * * The following example specifies two remapping units at * physical addresses 0xfed90000 and 0xfeda0000 respectively. * set vtd.regmap.0.addr=0xfed90000 * set vtd.regmap.1.addr=0xfeda0000 */ for (units = 0; units < DRHD_MAX_UNITS; units++) { snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units); if (getenv_ulong(envname, &mapaddr) == 0) break; vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr); } if (units > 0) goto skip_dmar; /* Search for DMAR table. */ status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar); if (ACPI_FAILURE(status)) return (ENXIO); end = (char *)dmar + dmar->Header.Length; remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR); while (remaining > sizeof(ACPI_DMAR_HEADER)) { hdr = (ACPI_DMAR_HEADER *)(end - remaining); if (hdr->Length > remaining) break; /* * From Intel VT-d arch spec, version 1.3: * BIOS implementations must report mapping structures * in numerical order, i.e. All remapping structures of * type 0 (DRHD) enumerated before remapping structures of * type 1 (RMRR) and so forth. */ if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT) break; drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr; drhds[units] = drhd; vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address); if (++units >= DRHD_MAX_UNITS) break; remaining -= hdr->Length; } if (units <= 0) return (ENXIO); skip_dmar: drhd_num = units; max_domains = 64 * 1024; /* maximum valid value */ for (i = 0; i < drhd_num; i++){ vtdmap = vtdmaps[i]; if (VTD_CAP_CM(vtdmap->cap) != 0) panic("vtd_init: invalid caching mode"); /* take most compatible (minimum) value */ if ((tmp = vtd_max_domains(vtdmap)) < max_domains) max_domains = tmp; } /* * Set up the root-table to point to the context-entry tables */ for (i = 0; i < 256; i++) { ctx_paddr = vtophys(ctx_tables[i]); if (ctx_paddr & PAGE_MASK) panic("ctx table (0x%0lx) not page aligned", ctx_paddr); root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT; } return (0); } static void vtd_cleanup(void) { } static void vtd_enable(void) { int i; struct vtdmap *vtdmap; for (i = 0; i < drhd_num; i++) { vtdmap = vtdmaps[i]; vtd_wbflush(vtdmap); /* Update the root table address */ vtdmap->rta = vtophys(root_table); vtdmap->gcr = VTD_GCR_SRTP; while ((vtdmap->gsr & VTD_GSR_RTPS) == 0) ; vtd_ctx_global_invalidate(vtdmap); vtd_iotlb_global_invalidate(vtdmap); vtd_translation_enable(vtdmap); } } static void vtd_disable(void) { int i; struct vtdmap *vtdmap; for (i = 0; i < drhd_num; i++) { vtdmap = vtdmaps[i]; vtd_translation_disable(vtdmap); } } -static void -vtd_add_device(void *arg, uint16_t rid) +static int +vtd_add_device(void *arg, device_t dev __unused, uint16_t rid) { int idx; uint64_t *ctxp; struct domain *dom = arg; vm_paddr_t pt_paddr; struct vtdmap *vtdmap; uint8_t bus; KASSERT(dom != NULL, ("domain is NULL")); bus = PCI_RID2BUS(rid); ctxp = ctx_tables[bus]; pt_paddr = vtophys(dom->ptp); idx = VTD_RID2IDX(rid); if (ctxp[idx] & VTD_CTX_PRESENT) { panic("vtd_add_device: device %x is already owned by " "domain %d", rid, (uint16_t)(ctxp[idx + 1] >> 8)); } if ((vtdmap = vtd_device_scope(rid)) == NULL) panic("vtd_add_device: device %x is not in scope for " "any DMA remapping unit", rid); /* * Order is important. The 'present' bit is set only after all fields * of the context pointer are initialized. */ ctxp[idx + 1] = dom->addrwidth | (dom->id << 8); if (VTD_ECAP_DI(vtdmap->ext_cap)) ctxp[idx] = VTD_CTX_TT_ALL; else ctxp[idx] = 0; ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT; /* * 'Not Present' entries are not cached in either the Context Cache * or in the IOTLB, so there is no need to invalidate either of them. */ + return (0); } -static void -vtd_remove_device(void *arg, uint16_t rid) +static int +vtd_remove_device(void *arg, device_t dev __unused, uint16_t rid) { int i, idx; uint64_t *ctxp; struct vtdmap *vtdmap; uint8_t bus; bus = PCI_RID2BUS(rid); ctxp = ctx_tables[bus]; idx = VTD_RID2IDX(rid); /* * Order is important. The 'present' bit is must be cleared first. */ ctxp[idx] = 0; ctxp[idx + 1] = 0; /* * Invalidate the Context Cache and the IOTLB. * * XXX use device-selective invalidation for Context Cache * XXX use domain-selective invalidation for IOTLB */ for (i = 0; i < drhd_num; i++) { vtdmap = vtdmaps[i]; vtd_ctx_global_invalidate(vtdmap); vtd_iotlb_global_invalidate(vtdmap); } + return (0); } #define CREATE_MAPPING 0 #define REMOVE_MAPPING 1 static uint64_t vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, int remove) { struct domain *dom; int i, spshift, ptpshift, ptpindex, nlevels; uint64_t spsize, *ptp; dom = arg; ptpindex = 0; ptpshift = 0; KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__, gpa, len)); KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond " "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr)); if (gpa & PAGE_MASK) panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); if (hpa & PAGE_MASK) panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); if (len & PAGE_MASK) panic("vtd_create_mapping: unaligned len 0x%0lx", len); /* * Compute the size of the mapping that we can accommodate. * * This is based on three factors: * - supported super page size * - alignment of the region starting at 'gpa' and 'hpa' * - length of the region 'len' */ spshift = 48; for (i = 3; i >= 0; i--) { spsize = 1UL << spshift; if ((dom->spsmask & (1 << i)) != 0 && (gpa & (spsize - 1)) == 0 && (hpa & (spsize - 1)) == 0 && (len >= spsize)) { break; } spshift -= 9; } ptp = dom->ptp; nlevels = dom->pt_levels; while (--nlevels >= 0) { ptpshift = 12 + nlevels * 9; ptpindex = (gpa >> ptpshift) & 0x1FF; /* We have reached the leaf mapping */ if (spshift >= ptpshift) { break; } /* * We are working on a non-leaf page table page. * * Create a downstream page table page if necessary and point * to it from the current page table. */ if (ptp[ptpindex] == 0) { void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; } ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); } if ((gpa & ((1UL << ptpshift) - 1)) != 0) panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); /* * Update the 'gpa' -> 'hpa' mapping */ if (remove) { ptp[ptpindex] = 0; } else { ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; if (nlevels > 0) ptp[ptpindex] |= VTD_PTE_SUPERPAGE; } return (1UL << ptpshift); } -static uint64_t -vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +static int +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, + uint64_t *res_len) { - return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING)); + *res_len = vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING); + return (0); } -static uint64_t -vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +static int +vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len, uint64_t *res_len) { - return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING)); + *res_len = vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING); + return (0); } -static void +static int vtd_invalidate_tlb(void *dom) { int i; struct vtdmap *vtdmap; /* * Invalidate the IOTLB. * XXX use domain-selective invalidation for IOTLB */ for (i = 0; i < drhd_num; i++) { vtdmap = vtdmaps[i]; vtd_iotlb_global_invalidate(vtdmap); } + return (0); } static void * vtd_create_domain(vm_paddr_t maxaddr) { struct domain *dom; vm_paddr_t addr; int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; struct vtdmap *vtdmap; if (drhd_num <= 0) panic("vtd_create_domain: no dma remapping hardware available"); /* * Calculate AGAW. * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. */ addr = 0; for (gaw = 0; addr < maxaddr; gaw++) addr = 1ULL << gaw; res = (gaw - 12) % 9; if (res == 0) agaw = gaw; else agaw = gaw + 9 - res; if (agaw > 64) agaw = 64; /* * Select the smallest Supported AGAW and the corresponding number * of page table levels. */ pt_levels = 2; sagaw = 30; addrwidth = 0; tmp = ~0; for (i = 0; i < drhd_num; i++) { vtdmap = vtdmaps[i]; /* take most compatible value */ tmp &= VTD_CAP_SAGAW(vtdmap->cap); } for (i = 0; i < 5; i++) { if ((tmp & (1 << i)) != 0 && sagaw >= agaw) break; pt_levels++; addrwidth++; sagaw += 9; if (sagaw > 64) sagaw = 64; } if (i >= 5) { panic("vtd_create_domain: SAGAW 0x%x does not support AGAW %d", tmp, agaw); } dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); dom->pt_levels = pt_levels; dom->addrwidth = addrwidth; dom->id = domain_id(); dom->maxaddr = maxaddr; dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); if ((uintptr_t)dom->ptp & PAGE_MASK) panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); #ifdef notyet /* * XXX superpage mappings for the iommu do not work correctly. * * By default all physical memory is mapped into the host_domain. * When a VM is allocated wired memory the pages belonging to it * are removed from the host_domain and added to the vm's domain. * * If the page being removed was mapped using a superpage mapping * in the host_domain then we need to demote the mapping before * removing the page. * * There is not any code to deal with the demotion at the moment * so we disable superpage mappings altogether. */ dom->spsmask = ~0; for (i = 0; i < drhd_num; i++) { vtdmap = vtdmaps[i]; /* take most compatible value */ dom->spsmask &= VTD_CAP_SPS(vtdmap->cap); } #endif SLIST_INSERT_HEAD(&domhead, dom, next); return (dom); } static void vtd_free_ptp(uint64_t *ptp, int level) { int i; uint64_t *nlp; if (level > 1) { for (i = 0; i < 512; i++) { if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) continue; if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) continue; nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); vtd_free_ptp(nlp, level - 1); } } bzero(ptp, PAGE_SIZE); free(ptp, M_VTD); } static void vtd_destroy_domain(void *arg) { struct domain *dom; dom = arg; SLIST_REMOVE(&domhead, dom, domain, next); vtd_free_ptp(dom->ptp, dom->pt_levels); free(dom, M_VTD); } const struct iommu_ops iommu_ops_intel = { .init = vtd_init, .cleanup = vtd_cleanup, .enable = vtd_enable, .disable = vtd_disable, .create_domain = vtd_create_domain, .destroy_domain = vtd_destroy_domain, .create_mapping = vtd_create_mapping, .remove_mapping = vtd_remove_mapping, .add_device = vtd_add_device, .remove_device = vtd_remove_device, .invalidate_tlb = vtd_invalidate_tlb, }; diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c index dc4a0de94bb6..bd86ac37a83e 100644 --- a/sys/amd64/vmm/io/iommu.c +++ b/sys/amd64/vmm/io/iommu.c @@ -1,338 +1,359 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include "vmm_util.h" #include "vmm_mem.h" #include "iommu.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, iommu, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "bhyve iommu parameters"); static int iommu_avail; SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, initialized, CTLFLAG_RD, &iommu_avail, 0, "bhyve iommu initialized?"); static int iommu_enable = 1; SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, enable, CTLFLAG_RDTUN, &iommu_enable, 0, "Enable use of I/O MMU (required for PCI passthrough)."); static const struct iommu_ops *ops; static void *host_domain; static eventhandler_tag add_tag, delete_tag; +static void iommu_cleanup_int(bool iommu_disable); + static __inline int IOMMU_INIT(void) { if (ops != NULL) return ((*ops->init)()); else return (ENXIO); } static __inline void IOMMU_CLEANUP(void) { if (ops != NULL && iommu_avail) (*ops->cleanup)(); } static __inline void * IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr) { if (ops != NULL && iommu_avail) return ((*ops->create_domain)(maxaddr)); else return (NULL); } static __inline void IOMMU_DESTROY_DOMAIN(void *dom) { if (ops != NULL && iommu_avail) (*ops->destroy_domain)(dom); } -static __inline uint64_t -IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +static __inline int +IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len, uint64_t *res_len) { if (ops != NULL && iommu_avail) - return ((*ops->create_mapping)(domain, gpa, hpa, len)); - else - return (len); /* XXX */ + return ((*ops->create_mapping)(domain, gpa, hpa, len, res_len)); + return (EOPNOTSUPP); } static __inline uint64_t -IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len) +IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len, + uint64_t *res_len) { if (ops != NULL && iommu_avail) - return ((*ops->remove_mapping)(domain, gpa, len)); - else - return (len); /* XXX */ + return ((*ops->remove_mapping)(domain, gpa, len, res_len)); + return (EOPNOTSUPP); } -static __inline void -IOMMU_ADD_DEVICE(void *domain, uint16_t rid) +static __inline int +IOMMU_ADD_DEVICE(void *domain, device_t dev, uint16_t rid) { if (ops != NULL && iommu_avail) - (*ops->add_device)(domain, rid); + return ((*ops->add_device)(domain, dev, rid)); + return (EOPNOTSUPP); } -static __inline void -IOMMU_REMOVE_DEVICE(void *domain, uint16_t rid) +static __inline int +IOMMU_REMOVE_DEVICE(void *domain, device_t dev, uint16_t rid) { if (ops != NULL && iommu_avail) - (*ops->remove_device)(domain, rid); + return ((*ops->remove_device)(domain, dev, rid)); + return (EOPNOTSUPP); } -static __inline void +static __inline int IOMMU_INVALIDATE_TLB(void *domain) { if (ops != NULL && iommu_avail) - (*ops->invalidate_tlb)(domain); + return ((*ops->invalidate_tlb)(domain)); + return (0); } static __inline void IOMMU_ENABLE(void) { if (ops != NULL && iommu_avail) (*ops->enable)(); } static __inline void IOMMU_DISABLE(void) { if (ops != NULL && iommu_avail) (*ops->disable)(); } static void iommu_pci_add(void *arg, device_t dev) { /* Add new devices to the host domain. */ - iommu_add_device(host_domain, pci_get_rid(dev)); + iommu_add_device(host_domain, dev, pci_get_rid(dev)); } static void iommu_pci_delete(void *arg, device_t dev) { - iommu_remove_device(host_domain, pci_get_rid(dev)); + iommu_remove_device(host_domain, dev, pci_get_rid(dev)); } static void iommu_init(void) { int error, bus, slot, func; vm_paddr_t maxaddr; devclass_t dc; device_t dev; if (!iommu_enable) return; if (vmm_is_intel()) ops = &iommu_ops_intel; else if (vmm_is_svm()) ops = &iommu_ops_amd; else ops = NULL; error = IOMMU_INIT(); if (error) return; iommu_avail = 1; /* * Create a domain for the devices owned by the host */ maxaddr = vmm_mem_maxaddr(); host_domain = IOMMU_CREATE_DOMAIN(maxaddr); if (host_domain == NULL) { printf("iommu_init: unable to create a host domain"); IOMMU_CLEANUP(); ops = NULL; iommu_avail = 0; return; } /* * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to * the host */ iommu_create_mapping(host_domain, 0, 0, maxaddr); add_tag = EVENTHANDLER_REGISTER(pci_add_device, iommu_pci_add, NULL, 0); delete_tag = EVENTHANDLER_REGISTER(pci_delete_device, iommu_pci_delete, NULL, 0); dc = devclass_find("ppt"); for (bus = 0; bus <= PCI_BUSMAX; bus++) { for (slot = 0; slot <= PCI_SLOTMAX; slot++) { for (func = 0; func <= PCI_FUNCMAX; func++) { dev = pci_find_dbsf(0, bus, slot, func); if (dev == NULL) continue; /* Skip passthrough devices. */ if (dc != NULL && device_get_devclass(dev) == dc) continue; /* * Everything else belongs to the host * domain. */ - iommu_add_device(host_domain, + error = iommu_add_device(host_domain, dev, pci_get_rid(dev)); + if (error != 0) { + iommu_cleanup_int(false); + return; + } } } } IOMMU_ENABLE(); - } -void -iommu_cleanup(void) +static void +iommu_cleanup_int(bool iommu_disable) { if (add_tag != NULL) { EVENTHANDLER_DEREGISTER(pci_add_device, add_tag); add_tag = NULL; } if (delete_tag != NULL) { EVENTHANDLER_DEREGISTER(pci_delete_device, delete_tag); delete_tag = NULL; } - IOMMU_DISABLE(); + if (iommu_disable) + IOMMU_DISABLE(); IOMMU_DESTROY_DOMAIN(host_domain); host_domain = NULL; IOMMU_CLEANUP(); } +void +iommu_cleanup(void) +{ + iommu_cleanup_int(true); +} + void * iommu_create_domain(vm_paddr_t maxaddr) { static volatile int iommu_initted; if (iommu_initted < 2) { if (atomic_cmpset_int(&iommu_initted, 0, 1)) { iommu_init(); atomic_store_rel_int(&iommu_initted, 2); } else while (iommu_initted == 1) cpu_spinwait(); } return (IOMMU_CREATE_DOMAIN(maxaddr)); } void iommu_destroy_domain(void *dom) { IOMMU_DESTROY_DOMAIN(dom); } -void +int iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) { uint64_t mapped, remaining; - - remaining = len; - - while (remaining > 0) { - mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining); - gpa += mapped; - hpa += mapped; - remaining -= mapped; + int error; + + for (remaining = len; remaining > 0; gpa += mapped, hpa += mapped, + remaining -= mapped) { + error = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining, + &mapped); + if (error != 0) { + /* XXXKIB rollback */ + return (error); + } } + return (0); } -void +int iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) { uint64_t unmapped, remaining; - - remaining = len; - - while (remaining > 0) { - unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining); - gpa += unmapped; - remaining -= unmapped; + int error; + + for (remaining = len; remaining > 0; gpa += unmapped, + remaining -= unmapped) { + error = IOMMU_REMOVE_MAPPING(dom, gpa, remaining, &unmapped); + if (error != 0) { + /* XXXKIB ? */ + return (error); + } } + return (0); } void * iommu_host_domain(void) { return (host_domain); } -void -iommu_add_device(void *dom, uint16_t rid) +int +iommu_add_device(void *dom, device_t dev, uint16_t rid) { - IOMMU_ADD_DEVICE(dom, rid); + return (IOMMU_ADD_DEVICE(dom, dev, rid)); } -void -iommu_remove_device(void *dom, uint16_t rid) +int +iommu_remove_device(void *dom, device_t dev, uint16_t rid) { - IOMMU_REMOVE_DEVICE(dom, rid); + return (IOMMU_REMOVE_DEVICE(dom, dev, rid)); } -void +int iommu_invalidate_tlb(void *domain) { - IOMMU_INVALIDATE_TLB(domain); + return (IOMMU_INVALIDATE_TLB(domain)); } diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h index c2891b62b5f2..5294a9d92a6b 100644 --- a/sys/amd64/vmm/io/iommu.h +++ b/sys/amd64/vmm/io/iommu.h @@ -1,74 +1,74 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _IO_IOMMU_H_ #define _IO_IOMMU_H_ typedef int (*iommu_init_func_t)(void); typedef void (*iommu_cleanup_func_t)(void); typedef void (*iommu_enable_func_t)(void); typedef void (*iommu_disable_func_t)(void); typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); typedef void (*iommu_destroy_domain_t)(void *domain); -typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, - vm_paddr_t hpa, uint64_t len); -typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, - uint64_t len); -typedef void (*iommu_add_device_t)(void *domain, uint16_t rid); -typedef void (*iommu_remove_device_t)(void *dom, uint16_t rid); -typedef void (*iommu_invalidate_tlb_t)(void *dom); +typedef int (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len, uint64_t *res_len); +typedef int (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, + uint64_t len, uint64_t *res_len); +typedef int (*iommu_add_device_t)(void *domain, device_t dev, uint16_t rid); +typedef int (*iommu_remove_device_t)(void *dom, device_t dev, uint16_t rid); +typedef int (*iommu_invalidate_tlb_t)(void *dom); struct iommu_ops { iommu_init_func_t init; /* module wide */ iommu_cleanup_func_t cleanup; iommu_enable_func_t enable; iommu_disable_func_t disable; iommu_create_domain_t create_domain; /* domain-specific */ iommu_destroy_domain_t destroy_domain; iommu_create_mapping_t create_mapping; iommu_remove_mapping_t remove_mapping; iommu_add_device_t add_device; iommu_remove_device_t remove_device; iommu_invalidate_tlb_t invalidate_tlb; }; extern const struct iommu_ops iommu_ops_intel; extern const struct iommu_ops iommu_ops_amd; void iommu_cleanup(void); void *iommu_host_domain(void); void *iommu_create_domain(vm_paddr_t maxaddr); void iommu_destroy_domain(void *dom); -void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, - size_t len); -void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len); -void iommu_add_device(void *dom, uint16_t rid); -void iommu_remove_device(void *dom, uint16_t rid); -void iommu_invalidate_tlb(void *domain); +int iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +int iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len); +int iommu_add_device(void *dom, device_t dev, uint16_t rid); +int iommu_remove_device(void *dom, device_t dev, uint16_t rid); +int iommu_invalidate_tlb(void *domain); #endif diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index 3b043c64fbde..c3b2b57da988 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -1,786 +1,801 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "iommu.h" #include "ppt.h" /* XXX locking */ #define MAX_MSIMSGS 32 /* * If the MSI-X table is located in the middle of a BAR then that MMIO * region gets split into two segments - one segment above the MSI-X table * and the other segment below the MSI-X table - with a hole in place of * the MSI-X table so accesses to it can be trapped and emulated. * * So, allocate a MMIO segment for each BAR register + 1 additional segment. */ #define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1) MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); struct pptintr_arg { /* pptintr(pptintr_arg) */ struct pptdev *pptdev; uint64_t addr; uint64_t msg_data; }; struct pptseg { vm_paddr_t gpa; size_t len; int wired; }; struct pptdev { device_t dev; struct vm *vm; /* owner of this device */ TAILQ_ENTRY(pptdev) next; struct pptseg mmio[MAX_MMIOSEGS]; struct { int num_msgs; /* guest state */ int startrid; /* host state */ struct resource *res[MAX_MSIMSGS]; void *cookie[MAX_MSIMSGS]; struct pptintr_arg arg[MAX_MSIMSGS]; } msi; struct { int num_msgs; int startrid; int msix_table_rid; int msix_pba_rid; struct resource *msix_table_res; struct resource *msix_pba_res; struct resource **res; void **cookie; struct pptintr_arg *arg; } msix; }; SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "bhyve passthru devices"); static int num_pptdevs; SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0, "number of pci passthru devices"); static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list); static int ppt_probe(device_t dev) { int bus, slot, func; struct pci_devinfo *dinfo; dinfo = (struct pci_devinfo *)device_get_ivars(dev); bus = pci_get_bus(dev); slot = pci_get_slot(dev); func = pci_get_function(dev); /* * To qualify as a pci passthrough device a device must: * - be allowed by administrator to be used in this role * - be an endpoint device */ if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL) return (ENXIO); else if (vmm_is_pptdev(bus, slot, func)) return (0); else /* * Returning BUS_PROBE_NOWILDCARD here matches devices that the * SR-IOV infrastructure specified as "ppt" passthrough devices. * All normal devices that did not have "ppt" specified as their * driver will not be matched by this. */ return (BUS_PROBE_NOWILDCARD); } static int ppt_attach(device_t dev) { struct pptdev *ppt; - uint16_t cmd; + uint16_t cmd, cmd1; + int error; ppt = device_get_softc(dev); - cmd = pci_read_config(dev, PCIR_COMMAND, 2); + cmd1 = cmd = pci_read_config(dev, PCIR_COMMAND, 2); cmd &= ~(PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); pci_write_config(dev, PCIR_COMMAND, cmd, 2); - iommu_remove_device(iommu_host_domain(), pci_get_rid(dev)); + error = iommu_remove_device(iommu_host_domain(), dev, pci_get_rid(dev)); + if (error != 0) { + pci_write_config(dev, PCIR_COMMAND, cmd1, 2); + return (error); + } num_pptdevs++; TAILQ_INSERT_TAIL(&pptdev_list, ppt, next); ppt->dev = dev; if (bootverbose) device_printf(dev, "attached\n"); return (0); } static int ppt_detach(device_t dev) { struct pptdev *ppt; + int error; ppt = device_get_softc(dev); if (ppt->vm != NULL) return (EBUSY); + if (iommu_host_domain() != NULL) { + error = iommu_add_device(iommu_host_domain(), dev, + pci_get_rid(dev)); + } else { + error = 0; + } + if (error != 0) + return (error); num_pptdevs--; TAILQ_REMOVE(&pptdev_list, ppt, next); - if (iommu_host_domain() != NULL) - iommu_add_device(iommu_host_domain(), pci_get_rid(dev)); - return (0); } static device_method_t ppt_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ppt_probe), DEVMETHOD(device_attach, ppt_attach), DEVMETHOD(device_detach, ppt_detach), {0, 0} }; DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev)); DRIVER_MODULE(ppt, pci, ppt_driver, NULL, NULL); static int ppt_find(struct vm *vm, int bus, int slot, int func, struct pptdev **pptp) { device_t dev; struct pptdev *ppt; int b, s, f; TAILQ_FOREACH(ppt, &pptdev_list, next) { dev = ppt->dev; b = pci_get_bus(dev); s = pci_get_slot(dev); f = pci_get_function(dev); if (bus == b && slot == s && func == f) break; } if (ppt == NULL) return (ENOENT); if (ppt->vm != vm) /* Make sure we own this device */ return (EBUSY); *pptp = ppt; return (0); } static void ppt_unmap_all_mmio(struct vm *vm, struct pptdev *ppt) { int i; struct pptseg *seg; for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) continue; (void)vm_unmap_mmio(vm, seg->gpa, seg->len); bzero(seg, sizeof(struct pptseg)); } } static void ppt_teardown_msi(struct pptdev *ppt) { int i, rid; void *cookie; struct resource *res; if (ppt->msi.num_msgs == 0) return; for (i = 0; i < ppt->msi.num_msgs; i++) { rid = ppt->msi.startrid + i; res = ppt->msi.res[i]; cookie = ppt->msi.cookie[i]; if (cookie != NULL) bus_teardown_intr(ppt->dev, res, cookie); if (res != NULL) bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); ppt->msi.res[i] = NULL; ppt->msi.cookie[i] = NULL; } if (ppt->msi.startrid == 1) pci_release_msi(ppt->dev); ppt->msi.num_msgs = 0; } static void ppt_teardown_msix_intr(struct pptdev *ppt, int idx) { int rid; struct resource *res; void *cookie; rid = ppt->msix.startrid + idx; res = ppt->msix.res[idx]; cookie = ppt->msix.cookie[idx]; if (cookie != NULL) bus_teardown_intr(ppt->dev, res, cookie); if (res != NULL) bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); ppt->msix.res[idx] = NULL; ppt->msix.cookie[idx] = NULL; } static void ppt_teardown_msix(struct pptdev *ppt) { int i; if (ppt->msix.num_msgs == 0) return; for (i = 0; i < ppt->msix.num_msgs; i++) ppt_teardown_msix_intr(ppt, i); free(ppt->msix.res, M_PPTMSIX); free(ppt->msix.cookie, M_PPTMSIX); free(ppt->msix.arg, M_PPTMSIX); pci_release_msi(ppt->dev); if (ppt->msix.msix_table_res) { bus_release_resource(ppt->dev, SYS_RES_MEMORY, ppt->msix.msix_table_rid, ppt->msix.msix_table_res); ppt->msix.msix_table_res = NULL; ppt->msix.msix_table_rid = 0; } if (ppt->msix.msix_pba_res) { bus_release_resource(ppt->dev, SYS_RES_MEMORY, ppt->msix.msix_pba_rid, ppt->msix.msix_pba_res); ppt->msix.msix_pba_res = NULL; ppt->msix.msix_pba_rid = 0; } ppt->msix.num_msgs = 0; } int ppt_avail_devices(void) { return (num_pptdevs); } int ppt_assigned_devices(struct vm *vm) { struct pptdev *ppt; int num; num = 0; TAILQ_FOREACH(ppt, &pptdev_list, next) { if (ppt->vm == vm) num++; } return (num); } bool ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) { int i; struct pptdev *ppt; struct pptseg *seg; TAILQ_FOREACH(ppt, &pptdev_list, next) { if (ppt->vm != vm) continue; for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) continue; if (gpa >= seg->gpa && gpa < seg->gpa + seg->len) return (true); } } return (false); } static void ppt_pci_reset(device_t dev) { if (pcie_flr(dev, max(pcie_get_max_completion_timeout(dev) / 1000, 10), true)) return; pci_power_reset(dev); } static uint16_t ppt_bar_enables(struct pptdev *ppt) { struct pci_map *pm; uint16_t cmd; cmd = 0; for (pm = pci_first_bar(ppt->dev); pm != NULL; pm = pci_next_bar(pm)) { if (PCI_BAR_IO(pm->pm_value)) cmd |= PCIM_CMD_PORTEN; if (PCI_BAR_MEM(pm->pm_value)) cmd |= PCIM_CMD_MEMEN; } return (cmd); } int ppt_assign_device(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; int error; uint16_t cmd; /* Passing NULL requires the device to be unowned. */ error = ppt_find(NULL, bus, slot, func, &ppt); if (error) return (error); pci_save_state(ppt->dev); ppt_pci_reset(ppt->dev); pci_restore_state(ppt->dev); + error = iommu_add_device(vm_iommu_domain(vm), ppt->dev, + pci_get_rid(ppt->dev)); + if (error != 0) + return (error); ppt->vm = vm; - iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev)); cmd = pci_read_config(ppt->dev, PCIR_COMMAND, 2); cmd |= PCIM_CMD_BUSMASTEREN | ppt_bar_enables(ppt); pci_write_config(ppt->dev, PCIR_COMMAND, cmd, 2); return (0); } int ppt_unassign_device(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; int error; uint16_t cmd; error = ppt_find(vm, bus, slot, func, &ppt); if (error) return (error); cmd = pci_read_config(ppt->dev, PCIR_COMMAND, 2); cmd &= ~(PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); pci_write_config(ppt->dev, PCIR_COMMAND, cmd, 2); pci_save_state(ppt->dev); ppt_pci_reset(ppt->dev); pci_restore_state(ppt->dev); ppt_unmap_all_mmio(vm, ppt); ppt_teardown_msi(ppt); ppt_teardown_msix(ppt); - iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev)); + error = iommu_remove_device(vm_iommu_domain(vm), ppt->dev, + pci_get_rid(ppt->dev)); ppt->vm = NULL; - return (0); + return (error); } int ppt_unassign_all(struct vm *vm) { struct pptdev *ppt; int bus, slot, func; device_t dev; TAILQ_FOREACH(ppt, &pptdev_list, next) { if (ppt->vm == vm) { dev = ppt->dev; bus = pci_get_bus(dev); slot = pci_get_slot(dev); func = pci_get_function(dev); vm_unassign_pptdev(vm, bus, slot, func); } } return (0); } static bool ppt_valid_bar_mapping(struct pptdev *ppt, vm_paddr_t hpa, size_t len) { struct pci_map *pm; pci_addr_t base, size; for (pm = pci_first_bar(ppt->dev); pm != NULL; pm = pci_next_bar(pm)) { if (!PCI_BAR_MEM(pm->pm_value)) continue; base = pm->pm_value & PCIM_BAR_MEM_BASE; size = (pci_addr_t)1 << pm->pm_size; if (hpa >= base && hpa + len <= base + size) return (true); } return (false); } int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { int i, error; struct pptseg *seg; struct pptdev *ppt; if (len % PAGE_SIZE != 0 || len == 0 || gpa % PAGE_SIZE != 0 || hpa % PAGE_SIZE != 0 || gpa + len < gpa || hpa + len < hpa) return (EINVAL); error = ppt_find(vm, bus, slot, func, &ppt); if (error) return (error); if (!ppt_valid_bar_mapping(ppt, hpa, len)) return (EINVAL); for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) { error = vm_map_mmio(vm, gpa, len, hpa); if (error == 0) { seg->gpa = gpa; seg->len = len; } return (error); } } return (ENOSPC); } int ppt_unmap_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, size_t len) { int i, error; struct pptseg *seg; struct pptdev *ppt; error = ppt_find(vm, bus, slot, func, &ppt); if (error) return (error); for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->gpa == gpa && seg->len == len) { error = vm_unmap_mmio(vm, seg->gpa, seg->len); if (error == 0) { seg->gpa = 0; seg->len = 0; } return (error); } } return (ENOENT); } static int pptintr(void *arg) { struct pptdev *ppt; struct pptintr_arg *pptarg; pptarg = arg; ppt = pptarg->pptdev; if (ppt->vm != NULL) lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data); else { /* * XXX * This is not expected to happen - panic? */ } /* * For legacy interrupts give other filters a chance in case * the interrupt was not generated by the passthrough device. */ if (ppt->msi.startrid == 0) return (FILTER_STRAY); else return (FILTER_HANDLED); } int ppt_setup_msi(struct vm *vm, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { int i, rid, flags; int msi_count, startrid, error, tmp; struct pptdev *ppt; if (numvec < 0 || numvec > MAX_MSIMSGS) return (EINVAL); error = ppt_find(vm, bus, slot, func, &ppt); if (error) return (error); /* Reject attempts to enable MSI while MSI-X is active. */ if (ppt->msix.num_msgs != 0 && numvec != 0) return (EBUSY); /* Free any allocated resources */ ppt_teardown_msi(ppt); if (numvec == 0) /* nothing more to do */ return (0); flags = RF_ACTIVE; msi_count = pci_msi_count(ppt->dev); if (msi_count == 0) { startrid = 0; /* legacy interrupt */ msi_count = 1; flags |= RF_SHAREABLE; } else startrid = 1; /* MSI */ /* * The device must be capable of supporting the number of vectors * the guest wants to allocate. */ if (numvec > msi_count) return (EINVAL); /* * Make sure that we can allocate all the MSI vectors that are needed * by the guest. */ if (startrid == 1) { tmp = numvec; error = pci_alloc_msi(ppt->dev, &tmp); if (error) return (error); else if (tmp != numvec) { pci_release_msi(ppt->dev); return (ENOSPC); } else { /* success */ } } ppt->msi.startrid = startrid; /* * Allocate the irq resource and attach it to the interrupt handler. */ for (i = 0; i < numvec; i++) { ppt->msi.num_msgs = i + 1; ppt->msi.cookie[i] = NULL; rid = startrid + i; ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, &rid, flags); if (ppt->msi.res[i] == NULL) break; ppt->msi.arg[i].pptdev = ppt; ppt->msi.arg[i].addr = addr; ppt->msi.arg[i].msg_data = msg + i; error = bus_setup_intr(ppt->dev, ppt->msi.res[i], INTR_TYPE_NET | INTR_MPSAFE, pptintr, NULL, &ppt->msi.arg[i], &ppt->msi.cookie[i]); if (error != 0) break; } if (i < numvec) { ppt_teardown_msi(ppt); return (ENXIO); } return (0); } int ppt_setup_msix(struct vm *vm, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct pptdev *ppt; struct pci_devinfo *dinfo; int numvec, alloced, rid, error; size_t res_size, cookie_size, arg_size; error = ppt_find(vm, bus, slot, func, &ppt); if (error) return (error); /* Reject attempts to enable MSI-X while MSI is active. */ if (ppt->msi.num_msgs != 0) return (EBUSY); dinfo = device_get_ivars(ppt->dev); if (!dinfo) return (ENXIO); /* * First-time configuration: * Allocate the MSI-X table * Allocate the IRQ resources * Set up some variables in ppt->msix */ if (ppt->msix.num_msgs == 0) { numvec = pci_msix_count(ppt->dev); if (numvec <= 0) return (EINVAL); ppt->msix.startrid = 1; ppt->msix.num_msgs = numvec; res_size = numvec * sizeof(ppt->msix.res[0]); cookie_size = numvec * sizeof(ppt->msix.cookie[0]); arg_size = numvec * sizeof(ppt->msix.arg[0]); ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO); ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, M_WAITOK | M_ZERO); ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO); rid = dinfo->cfg.msix.msix_table_bar; ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (ppt->msix.msix_table_res == NULL) { ppt_teardown_msix(ppt); return (ENOSPC); } ppt->msix.msix_table_rid = rid; if (dinfo->cfg.msix.msix_table_bar != dinfo->cfg.msix.msix_pba_bar) { rid = dinfo->cfg.msix.msix_pba_bar; ppt->msix.msix_pba_res = bus_alloc_resource_any( ppt->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (ppt->msix.msix_pba_res == NULL) { ppt_teardown_msix(ppt); return (ENOSPC); } ppt->msix.msix_pba_rid = rid; } alloced = numvec; error = pci_alloc_msix(ppt->dev, &alloced); if (error || alloced != numvec) { ppt_teardown_msix(ppt); return (error == 0 ? ENOSPC: error); } } if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* Tear down the IRQ if it's already set up */ ppt_teardown_msix_intr(ppt, idx); /* Allocate the IRQ resource */ ppt->msix.cookie[idx] = NULL; rid = ppt->msix.startrid + idx; ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (ppt->msix.res[idx] == NULL) return (ENXIO); ppt->msix.arg[idx].pptdev = ppt; ppt->msix.arg[idx].addr = addr; ppt->msix.arg[idx].msg_data = msg; /* Setup the MSI-X interrupt */ error = bus_setup_intr(ppt->dev, ppt->msix.res[idx], INTR_TYPE_NET | INTR_MPSAFE, pptintr, NULL, &ppt->msix.arg[idx], &ppt->msix.cookie[idx]); if (error != 0) { bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]); ppt->msix.cookie[idx] = NULL; ppt->msix.res[idx] = NULL; return (ENXIO); } } else { /* Masked, tear it down if it's already been set up */ ppt_teardown_msix_intr(ppt, idx); } return (0); } int ppt_disable_msix(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; int error; error = ppt_find(vm, bus, slot, func, &ppt); if (error) return (error); ppt_teardown_msix(ppt); return (0); } diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 1d410835be88..bd703f63b557 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1,2694 +1,2698 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ioport.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; /* * Initialization: * (a) allocated when vcpu is created * (i) initialized when vcpu is created and when it is reinitialized * (o) initialized the first time the vcpu is created * (x) initialized before use */ struct vcpu { struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int vcpuid; /* (o) */ int hostcpu; /* (o) vcpu's host cpu */ int reqidle; /* (i) request vcpu to idle */ struct vm *vm; /* (o) */ void *cookie; /* (i) cpu-specific data */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ int exception_pending; /* (i) exception pending */ int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ cpuset_t exitinfo_cpuset; /* (x) storage for vmexit handlers */ uint64_t nextrip; /* (x) next instruction to execute */ uint64_t tsc_offset; /* (o) TSC offsetting */ }; #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use * * Locking: * [m] mem_segs_lock * [r] rendezvous_mtx * [v] reads require one frozen vcpu, writes require freezing all vcpus */ struct vm { void *cookie; /* (i) cpu-specific data */ void *iommu; /* (x) iommu-specific data */ struct vhpet *vhpet; /* (i) virtual HPET */ struct vioapic *vioapic; /* (i) virtual ioapic */ struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ struct vrtc *vrtc; /* (o) virtual RTC */ volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ cpuset_t startup_cpus; /* (i) [r] waiting for startup */ int suspend; /* (i) stop VM execution */ bool dying; /* (o) is dying */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ cpuset_t rendezvous_req_cpus; /* (x) [r] rendezvous requested */ cpuset_t rendezvous_done_cpus; /* (x) [r] rendezvous finished */ void *rendezvous_arg; /* (x) [r] rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ struct vmspace *vmspace; /* (o) guest's address space */ struct vm_mem mem; /* (i) [m+v] guest memory */ char name[VM_MAX_NAMELEN+1]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (o) guest vcpus */ /* The following describe the vm cpu topology */ uint16_t sockets; /* (o) num of sockets */ uint16_t cores; /* (o) num of cores/socket */ uint16_t threads; /* (o) num of threads/core */ uint16_t maxcpus; /* (o) max pluggable cpus */ struct sx vcpus_init_lock; /* (o) */ }; #define VMM_CTR0(vcpu, format) \ VCPU_CTR0((vcpu)->vm, (vcpu)->vcpuid, format) #define VMM_CTR1(vcpu, format, p1) \ VCPU_CTR1((vcpu)->vm, (vcpu)->vcpuid, format, p1) #define VMM_CTR2(vcpu, format, p1, p2) \ VCPU_CTR2((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2) #define VMM_CTR3(vcpu, format, p1, p2, p3) \ VCPU_CTR3((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3) #define VMM_CTR4(vcpu, format, p1, p2, p3, p4) \ VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4) static int vmm_initialized; static void vmmops_panic(void); static void vmmops_panic(void) { panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()"); } #define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ DEFINE_IFUNC(static, ret_type, vmmops_##opname, args) \ { \ if (vmm_is_intel()) \ return (vmm_ops_intel.opname); \ else if (vmm_is_svm()) \ return (vmm_ops_amd.opname); \ else \ return ((ret_type (*)args)vmmops_panic); \ } DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) DEFINE_VMMOPS_IFUNC(void, modsuspend, (void)) DEFINE_VMMOPS_IFUNC(void, modresume, (void)) DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t rip, struct pmap *pmap, struct vm_eventinfo *info)) DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, int vcpu_id)) DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vcpui, int num, struct seg_desc *desc)) DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vcpui, int num, struct seg_desc *desc)) DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, vm_offset_t max)) DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vcpui)) DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (struct vlapic *vlapic)) #ifdef BHYVE_SNAPSHOT DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui, struct vm_snapshot_meta *meta)) DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now)) #endif SDT_PROVIDER_DEFINE(vmm); static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); /* * Halt the guest if all vcpus are executing a HLT instruction with * interrupts disabled. */ static int halt_detection_enabled = 1; SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, &halt_detection_enabled, 0, "Halt VM if all vcpus execute HLT with interrupts disabled"); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static int trace_guest_exceptions; SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, &trace_guest_exceptions, 0, "Trap into hypervisor on all guest exceptions and reflect them back"); static int trap_wbinvd; SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0, "WBINVD triggers a VM-exit"); u_int vm_maxcpu; SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &vm_maxcpu, 0, "Maximum number of vCPUs"); static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); /* global statistics */ VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation"); VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); /* * Upper limit on vm_maxcpu. Limited by use of uint16_t types for CPU * counts as well as range of vpid values for VT-x and by the capacity * of cpuset_t masks. The call to new_unrhdr() in vpid_init() in * vmx.c requires 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below. */ #define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) { switch (state) { case VCPU_IDLE: return ("idle"); case VCPU_FROZEN: return ("frozen"); case VCPU_RUNNING: return ("running"); case VCPU_SLEEPING: return ("sleeping"); default: return ("unknown"); } } #endif static void vcpu_cleanup(struct vcpu *vcpu, bool destroy) { vmmops_vlapic_cleanup(vcpu->vlapic); vmmops_vcpu_cleanup(vcpu->cookie); vcpu->cookie = NULL; if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); vcpu_lock_destroy(vcpu); free(vcpu, M_VM); } } static struct vcpu * vcpu_alloc(struct vm *vm, int vcpu_id) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, ("vcpu_init: invalid vcpu %d", vcpu_id)); vcpu = malloc(sizeof(*vcpu), M_VM, M_WAITOK | M_ZERO); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vm = vm; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); vcpu->tsc_offset = 0; return (vcpu); } static void vcpu_init(struct vcpu *vcpu) { vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid); vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie); vm_set_x2apic_state(vcpu, X2APIC_DISABLED); vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } int vcpu_trace_exceptions(struct vcpu *vcpu) { return (trace_guest_exceptions); } int vcpu_trap_wbinvd(struct vcpu *vcpu) { return (trap_wbinvd); } struct vm_exit * vm_exitinfo(struct vcpu *vcpu) { return (&vcpu->exitinfo); } cpuset_t * vm_exitinfo_cpuset(struct vcpu *vcpu) { return (&vcpu->exitinfo_cpuset); } static int vmm_init(void) { if (!vmm_is_hw_supported()) return (ENXIO); vm_maxcpu = mp_ncpus; TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); if (vm_maxcpu > VM_MAXCPU) { printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); vm_maxcpu = VM_MAXCPU; } if (vm_maxcpu == 0) vm_maxcpu = 1; vmm_host_state_init(); vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; vmm_suspend_p = vmmops_modsuspend; vmm_resume_p = vmmops_modresume; return (vmmops_modinit(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: if (vmm_is_hw_supported()) { error = vmmdev_init(); if (error != 0) break; error = vmm_init(); if (error == 0) vmm_initialized = 1; else (void)vmmdev_cleanup(); } else { error = ENXIO; } break; case MOD_UNLOAD: if (vmm_is_hw_supported()) { error = vmmdev_cleanup(); if (error == 0) { vmm_suspend_p = NULL; vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) lapic_ipi_free(vmm_ipinum); error = vmmops_modcleanup(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } } else { error = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). * - vmm device initialization requires an initialized devfs. */ DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); vm->vpmtmr = vpmtmr_init(vm); if (create) vm->vrtc = vrtc_init(vm); CPU_ZERO(&vm->active_cpus); CPU_ZERO(&vm->debug_cpus); CPU_ZERO(&vm->startup_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); if (!create) { for (int i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_init(vm->vcpu[i]); } } } void vm_disable_vcpu_creation(struct vm *vm) { sx_xlock(&vm->vcpus_init_lock); vm->dying = true; sx_xunlock(&vm->vcpus_init_lock); } struct vcpu * vm_alloc_vcpu(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) return (NULL); vcpu = (struct vcpu *) atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]); if (__predict_true(vcpu != NULL)) return (vcpu); sx_xlock(&vm->vcpus_init_lock); vcpu = vm->vcpu[vcpuid]; if (vcpu == NULL && !vm->dying) { vcpu = vcpu_alloc(vm, vcpuid); vcpu_init(vcpu); /* * Ensure vCPU is fully created before updating pointer * to permit unlocked reads above. */ atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid], (uintptr_t)vcpu); } sx_xunlock(&vm->vcpus_init_lock); return (vcpu); } void vm_slock_vcpus(struct vm *vm) { sx_slock(&vm->vcpus_init_lock); } void vm_unlock_vcpus(struct vm *vm) { sx_unlock(&vm->vcpus_init_lock); } /* * The default CPU topology is a single thread per package. */ u_int cores_per_package = 1; u_int threads_per_core = 1; int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) == VM_MAX_NAMELEN + 1) return (EINVAL); vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; vm_mem_init(&vm->mem); mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->vcpu = malloc(sizeof(*vm->vcpu) * vm_maxcpu, M_VM, M_WAITOK | M_ZERO); vm->sockets = 1; vm->cores = cores_per_package; /* XXX backwards compatibility */ vm->threads = threads_per_core; /* XXX backwards compatibility */ vm->maxcpus = vm_maxcpu; vm_init(vm, true); *retvm = vm; return (0); } void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { *sockets = vm->sockets; *cores = vm->cores; *threads = vm->threads; *maxcpus = vm->maxcpus; } uint16_t vm_get_maxcpus(struct vm *vm) { return (vm->maxcpus); } int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus __unused) { /* Ignore maxcpus. */ if ((sockets * cores * threads) > vm->maxcpus) return (EINVAL); vm->sockets = sockets; vm->cores = cores; vm->threads = threads; return(0); } static void vm_cleanup(struct vm *vm, bool destroy) { if (destroy) vm_xlock_memsegs(vm); else vm_assert_memseg_xlocked(vm); ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); if (destroy) vrtc_cleanup(vm->vrtc); else vrtc_reset(vm->vrtc); vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); for (int i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_cleanup(vm->vcpu[i], destroy); } vmmops_cleanup(vm->cookie); vm_mem_cleanup(vm); if (destroy) { vm_mem_destroy(vm); vmmops_vmspace_free(vm->vmspace); vm->vmspace = NULL; free(vm->vcpu, M_VM); sx_destroy(&vm->vcpus_init_lock); mtx_destroy(&vm->rendezvous_mtx); } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } -static void +static int vm_iommu_map(struct vm *vm) { vm_paddr_t gpa, hpa; struct vm_mem_map *mm; - int i; + int error, i; sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED); for (i = 0; i < VM_MAX_MEMMAPS; i++) { if (!vm_memseg_sysmem(vm, i)) continue; mm = &vm->mem.mem_maps[i]; KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, ("iommu map found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) continue; mm->flags |= VM_MEMMAP_F_IOMMU; for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { hpa = pmap_extract(vmspace_pmap(vm->vmspace), gpa); /* * All mappings in the vmm vmspace must be * present since they are managed by vmm in this way. * Because we are in pass-through mode, the * mappings must also be wired. This implies * that all pages must be mapped and wired, * allowing to use pmap_extract() and avoiding the * need to use vm_gpa_hold_global(). * * This could change if/when we start * supporting page faults on IOMMU maps. */ KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(hpa)), ("vm_iommu_map: vm %p gpa %jx hpa %jx not wired", vm, (uintmax_t)gpa, (uintmax_t)hpa)); iommu_create_mapping(vm->iommu, gpa, hpa, PAGE_SIZE); } } - iommu_invalidate_tlb(iommu_host_domain()); + error = iommu_invalidate_tlb(iommu_host_domain()); + return (error); } -static void +static int vm_iommu_unmap(struct vm *vm) { vm_paddr_t gpa; struct vm_mem_map *mm; - int i; + int error, i; sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED); for (i = 0; i < VM_MAX_MEMMAPS; i++) { if (!vm_memseg_sysmem(vm, i)) continue; mm = &vm->mem.mem_maps[i]; if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) continue; mm->flags &= ~VM_MEMMAP_F_IOMMU; KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, ("iommu unmap found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(pmap_extract( vmspace_pmap(vm->vmspace), gpa))), ("vm_iommu_unmap: vm %p gpa %jx not wired", vm, (uintmax_t)gpa)); iommu_remove_mapping(vm->iommu, gpa, PAGE_SIZE); } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. */ - iommu_invalidate_tlb(vm->iommu); + error = iommu_invalidate_tlb(vm->iommu); + return (error); } int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) - vm_iommu_unmap(vm); + error = vm_iommu_unmap(vm); - return (0); + return (error); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = vmm_sysmem_maxaddr(vm); vm->iommu = iommu_create_domain(maxaddr); if (vm->iommu == NULL) return (ENXIO); - vm_iommu_map(vm); } error = ppt_assign_device(vm, bus, slot, func); + if (error != 0) + return (error); + error = vm_iommu_map(vm); return (error); } int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) { if (reg >= VM_REG_LAST) return (EINVAL); return (vmmops_getreg(vcpu->cookie, reg, retval)); } int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; if (reg >= VM_REG_LAST) return (EINVAL); error = vmmops_setreg(vcpu->cookie, reg, val); if (error || reg != VM_REG_GUEST_RIP) return (error); /* Set 'nextrip' to match the value of %rip */ VMM_CTR1(vcpu, "Setting nextrip to %#lx", val); vcpu->nextrip = val; return (0); } static bool is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (true); default: return (false); } } static bool is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (true); default: return (false); } } int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) { if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (vmmops_getdesc(vcpu->cookie, reg, desc)); } int vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) { if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (vmmops_setdesc(vcpu->cookie, reg, desc)); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_enable(); fpurestore(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so disable * the FPU to trap any access by the host. */ fpu_disable(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_enable(); fpusave(vcpu->guestfpu); fpu_disable(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu->reqidle = 1; vcpu_notify_event_locked(vcpu, false); VMM_CTR1(vcpu, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); VMM_CTR2(vcpu, "vcpu state changed from %s to %s", vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } static int vm_handle_rendezvous(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; struct thread *td; int error, vcpuid; error = 0; vcpuid = vcpu->vcpuid; td = curthread; mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus); if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { VMM_CTR0(vcpu, "Calling rendezvous func"); (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) == 0) { VMM_CTR0(vcpu, "Rendezvous completed"); CPU_ZERO(&vm->rendezvous_req_cpus); vm->rendezvous_func = NULL; wakeup(&vm->rendezvous_func); break; } VMM_CTR0(vcpu, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", hz); if (td_ast_pending(td, TDA_SUSPEND)) { mtx_unlock(&vm->rendezvous_mtx); error = thread_check_susp(td, true); if (error != 0) return (error); mtx_lock(&vm->rendezvous_mtx); } } mtx_unlock(&vm->rendezvous_mtx); return (0); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int vm_handle_hlt(struct vcpu *vcpu, bool intr_disabled, bool *retu) { struct vm *vm = vcpu->vm; const char *wmesg; struct thread *td; int error, t, vcpuid, vcpu_halted, vm_halted; vcpuid = vcpu->vcpuid; vcpu_halted = 0; vm_halted = 0; error = 0; td = curthread; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu_lock(vcpu); while (1) { /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. Also check for * software events that would cause this vcpu to wakeup. * * These interrupts/events could have happened after the * vcpu returned from vmmops_run() and before it acquired the * vcpu lock above. */ if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) break; if (vm_nmi_pending(vcpu)) break; if (!intr_disabled) { if (vm_extint_pending(vcpu) || vlapic_pending_intr(vcpu->vlapic, NULL)) { break; } } /* Don't go to sleep if the vcpu thread needs to yield */ if (vcpu_should_yield(vcpu)) break; if (vcpu_debugged(vcpu)) break; /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps * track of the vcpus that have entered this state. When all * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { wmesg = "vmhalt"; VMM_CTR0(vcpu, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); } if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { vm_halted = 1; break; } } else { wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. */ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vcpu, VCPU_IDLE_TICKS, ticks - t); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); if (error != 0) { if (vcpu_halted) { CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); } return (error); } vcpu_lock(vcpu); } } if (vcpu_halted) CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); vcpu_unlock(vcpu); if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); return (0); } static int vm_handle_paging(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; int rv, ftype; struct vm_map *map; struct vm_exit *vme; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) { VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx", ftype == VM_PROT_READ ? "accessed" : "dirty", vme->u.paging.gpa); goto done; } } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: return (0); } static int vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) { struct vie *vie; struct vm_exit *vme; uint64_t gla, gpa, cs_base; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; int cs_d, error, fault; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; cpu_mode = paging->cpu_mode; VMM_CTR1(vcpu, "inst_emul fault accessing gpa %#lx", gpa); /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { error = vmm_fetch_instruction(vcpu, paging, vme->rip + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ error = fault = 0; } if (error || fault) return (error); if (vmm_decode_instruction(vcpu, gla, cpu_mode, cs_d, vie) != 0) { VMM_CTR1(vcpu, "Error decoding instruction at %#lx", vme->rip + cs_base); *retu = true; /* dump instruction bytes in userspace */ return (0); } /* * Update 'nextrip' based on the length of the emulated instruction. */ vme->inst_length = vie->num_processed; vcpu->nextrip += vie->num_processed; VMM_CTR1(vcpu, "nextrip updated to %#lx after instruction decoding", vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vcpu, gpa, vie, paging, mread, mwrite, retu); return (error); } static int vm_handle_suspend(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; int error, i; struct thread *td; error = 0; td = curthread; CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (error == 0) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { VMM_CTR0(vcpu, "All vcpus suspended"); break; } if (vm->rendezvous_func == NULL) { VMM_CTR0(vcpu, "Sleeping during suspend"); vcpu_require_state_locked(vcpu, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); vcpu_lock(vcpu); } } else { VMM_CTR0(vcpu, "Rendezvous during suspend"); vcpu_unlock(vcpu); error = vm_handle_rendezvous(vcpu); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm_vcpu(vm, i), false); } } *retu = true; return (error); } static int vm_handle_reqidle(struct vcpu *vcpu, bool *retu) { vcpu_lock(vcpu); KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); vcpu->reqidle = 0; vcpu_unlock(vcpu); *retu = true; return (0); } static int vm_handle_db(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) { int error, fault; uint64_t rsp; uint64_t rflags; struct vm_copyinfo copyinfo[2]; *retu = true; if (!vme->u.dbg.pushf_intercept || vme->u.dbg.tf_shadow_val != 0) { return (0); } vm_get_register(vcpu, VM_REG_GUEST_RSP, &rsp); error = vm_copy_setup(vcpu, &vme->u.dbg.paging, rsp, sizeof(uint64_t), VM_PROT_RW, copyinfo, nitems(copyinfo), &fault); if (error != 0 || fault != 0) { *retu = false; return (EINVAL); } /* Read pushed rflags value from top of stack. */ vm_copyin(copyinfo, &rflags, sizeof(uint64_t)); /* Clear TF bit. */ rflags &= ~(PSL_T); /* Write updated value back to memory. */ vm_copyout(&rflags, copyinfo, sizeof(uint64_t)); vm_copy_teardown(copyinfo, nitems(copyinfo)); return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i), false); } return (0); } void vm_exit_suspended(struct vcpu *vcpu, uint64_t rip) { struct vm *vm = vcpu->vm; struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_debug(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_DEBUG; } void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; vmm_stat_incr(vcpu, VMEXIT_RENDEZVOUS, 1); } void vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_REQIDLE; vmm_stat_incr(vcpu, VMEXIT_REQIDLE, 1); } void vm_exit_astpending(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vcpu, VMEXIT_ASTPENDING, 1); } int vm_run(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; struct vm_eventinfo evinfo; int error, vcpuid; struct pcb *pcb; uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; vcpuid = vcpu->vcpuid; if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); vme = &vcpu->exitinfo; evinfo.rptr = &vm->rendezvous_req_cpus; evinfo.sptr = &vm->suspend; evinfo.iptr = &vcpu->reqidle; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_fpustate(vcpu); vcpu_require_state(vcpu, VCPU_RUNNING); error = vmmops_run(vcpu->cookie, vcpu->nextrip, pmap, &evinfo); vcpu_require_state(vcpu, VCPU_FROZEN); save_guest_fpustate(vcpu); vmm_stat_incr(vcpu, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { case VM_EXITCODE_REQIDLE: error = vm_handle_reqidle(vcpu, &retu); break; case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vcpu, &retu); break; case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vme->u.ioapic_eoi.vector); break; case VM_EXITCODE_RENDEZVOUS: error = vm_handle_rendezvous(vcpu); break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vcpu, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vcpu, &retu); break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vcpu, &retu); break; case VM_EXITCODE_INOUT: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vcpu, vme, &retu); break; case VM_EXITCODE_DB: error = vm_handle_db(vcpu, vme, &retu); break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: case VM_EXITCODE_VMINSN: vm_inject_ud(vcpu); break; default: retu = true; /* handled in userland */ break; } } /* * VM_EXITCODE_INST_EMUL could access the apic which could transform the * exit code into VM_EXITCODE_IPI. */ if (error == 0 && vme->exitcode == VM_EXITCODE_IPI) error = vm_handle_ipi(vcpu, vme, &retu); if (error == 0 && retu == false) goto restart; vmm_stat_incr(vcpu, VMEXIT_USERSPACE, 1); VMM_CTR2(vcpu, "retu %d/%d", error, vme->exitcode); return (error); } int vm_restart_instruction(struct vcpu *vcpu) { enum vcpu_state state; uint64_t rip; int error __diagused; state = vcpu_get_state(vcpu, NULL); if (state == VCPU_RUNNING) { /* * When a vcpu is "running" the next instruction is determined * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. * Thus setting 'inst_length' to zero will cause the current * instruction to be restarted. */ vcpu->exitinfo.inst_length = 0; VMM_CTR1(vcpu, "restarting instruction at %#lx by " "setting inst_length to zero", vcpu->exitinfo.rip); } else if (state == VCPU_FROZEN) { /* * When a vcpu is "frozen" it is outside the critical section * around vmmops_run() and 'nextrip' points to the next * instruction. Thus instruction restart is achieved by setting * 'nextrip' to the vcpu's %rip. */ error = vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip); KASSERT(!error, ("%s: error %d getting rip", __func__, error)); VMM_CTR2(vcpu, "restarting instruction by updating " "nextrip from %#lx to %#lx", vcpu->nextrip, rip); vcpu->nextrip = rip; } else { panic("%s: invalid state %d", __func__, state); } return (0); } int vm_exit_intinfo(struct vcpu *vcpu, uint64_t info) { int type, vector; if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; vector = info & 0xff; if (type == VM_INTINFO_NMI && vector != IDT_NMI) return (EINVAL); if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) return (EINVAL); if (info & VM_INTINFO_RSVD) return (EINVAL); } else { info = 0; } VMM_CTR2(vcpu, "%s: info1(%#lx)", __func__, info); vcpu->exitintinfo = info; return (0); } enum exc_class { EXC_BENIGN, EXC_CONTRIBUTORY, EXC_PAGEFAULT }; #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ static enum exc_class exception_class(uint64_t info) { int type, vector; KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); type = info & VM_INTINFO_TYPE; vector = info & 0xff; /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ switch (type) { case VM_INTINFO_HWINTR: case VM_INTINFO_SWINTR: case VM_INTINFO_NMI: return (EXC_BENIGN); default: /* * Hardware exception. * * SVM and VT-x use identical type values to represent NMI, * hardware interrupt and software interrupt. * * SVM uses type '3' for all exceptions. VT-x uses type '3' * for exceptions except #BP and #OF. #BP and #OF use a type * value of '5' or '6'. Therefore we don't check for explicit * values of 'type' to classify 'intinfo' into a hardware * exception. */ break; } switch (vector) { case IDT_PF: case IDT_VE: return (EXC_PAGEFAULT); case IDT_DE: case IDT_TS: case IDT_NP: case IDT_SS: case IDT_GP: return (EXC_CONTRIBUTORY); default: return (EXC_BENIGN); } } static int nested_fault(struct vcpu *vcpu, uint64_t info1, uint64_t info2, uint64_t *retinfo) { enum exc_class exc1, exc2; int type1, vector1; KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); /* * If an exception occurs while attempting to call the double-fault * handler the processor enters shutdown mode (aka triple fault). */ type1 = info1 & VM_INTINFO_TYPE; vector1 = info1 & 0xff; if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VMM_CTR2(vcpu, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); vm_suspend(vcpu->vm, VM_SUSPEND_TRIPLEFAULT); *retinfo = 0; return (0); } /* * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 */ exc1 = exception_class(info1); exc2 = exception_class(info2); if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { /* Convert nested fault into a double fault. */ *retinfo = IDT_DF; *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; *retinfo |= VM_INTINFO_DEL_ERRCODE; } else { /* Handle exceptions serially */ *retinfo = info2; } return (1); } static uint64_t vcpu_exception_intinfo(struct vcpu *vcpu) { uint64_t info = 0; if (vcpu->exception_pending) { info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); } int vm_entry_intinfo(struct vcpu *vcpu, uint64_t *retinfo) { uint64_t info1, info2; int valid; info1 = vcpu->exitintinfo; vcpu->exitintinfo = 0; info2 = 0; if (vcpu->exception_pending) { info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VMM_CTR2(vcpu, "Exception %d delivered: %#lx", vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { valid = nested_fault(vcpu, info1, info2, retinfo); } else if (info1 & VM_INTINFO_VALID) { *retinfo = info1; valid = 1; } else if (info2 & VM_INTINFO_VALID) { *retinfo = info2; valid = 1; } else { valid = 0; } if (valid) { VMM_CTR4(vcpu, "%s: info1(%#lx), info2(%#lx), " "retinfo(%#lx)", __func__, info1, info2, *retinfo); } return (valid); } int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) { *info1 = vcpu->exitintinfo; *info2 = vcpu_exception_intinfo(vcpu); return (0); } int vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { uint64_t regval; int error __diagused; if (vector < 0 || vector >= 32) return (EINVAL); /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. */ if (vector == IDT_DF) return (EINVAL); if (vcpu->exception_pending) { VMM_CTR2(vcpu, "Unable to inject exception %d due to " "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } if (errcode_valid) { /* * Exceptions don't deliver an error code in real mode. */ error = vm_get_register(vcpu, VM_REG_GUEST_CR0, ®val); KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); if (!(regval & CR0_PE)) errcode_valid = 0; } /* * From section 26.6.1 "Interruptibility State" in Intel SDM: * * Event blocking by "STI" or "MOV SS" is cleared after guest executes * one instruction or incurs an exception. */ error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, 0); KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", __func__, error)); if (restart_instruction) vm_restart_instruction(vcpu); vcpu->exception_pending = 1; vcpu->exc_vector = vector; vcpu->exc_errcode = errcode; vcpu->exc_errcode_valid = errcode_valid; VMM_CTR1(vcpu, "Exception %d pending", vector); return (0); } void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error __diagused, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); } void vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2) { int error __diagused; VMM_CTR2(vcpu, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vcpu, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); vm_inject_fault(vcpu, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vcpu *vcpu) { vcpu->nmi_pending = 1; vcpu_notify_event(vcpu, false); return (0); } int vm_nmi_pending(struct vcpu *vcpu) { return (vcpu->nmi_pending); } void vm_nmi_clear(struct vcpu *vcpu) { if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vcpu, VCPU_NMI_COUNT, 1); } static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); int vm_inject_extint(struct vcpu *vcpu) { vcpu->extint_pending = 1; vcpu_notify_event(vcpu, false); return (0); } int vm_extint_pending(struct vcpu *vcpu) { return (vcpu->extint_pending); } void vm_extint_clear(struct vcpu *vcpu) { if (vcpu->extint_pending == 0) panic("vm_extint_clear: inconsistent extint_pending state"); vcpu->extint_pending = 0; vmm_stat_incr(vcpu, VCPU_EXTINT_COUNT, 1); } int vm_get_capability(struct vcpu *vcpu, int type, int *retval) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_getcap(vcpu->cookie, type, retval)); } int vm_set_capability(struct vcpu *vcpu, int type, int val) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_setcap(vcpu->cookie, type, val)); } struct vm * vcpu_vm(struct vcpu *vcpu) { return (vcpu->vm); } int vcpu_vcpuid(struct vcpu *vcpu) { return (vcpu->vcpuid); } struct vcpu * vm_vcpu(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid]); } struct vlapic * vm_lapic(struct vcpu *vcpu) { return (vcpu->vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } bool vmm_is_pptdev(int bus, int slot, int func) { int b, f, i, n, s; char *val, *cp, *cp2; bool found; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck! */ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = false; for (i = 0; names[i] != NULL && !found; i++) { cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = true; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_lock(vcpu); error = vcpu_set_state_locked(vcpu, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu) { enum vcpu_state state; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_activate_cpu(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EBUSY); VMM_CTR0(vcpu, "activated"); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); return (0); } int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { vm->debug_cpus = vm->active_cpus; for (int i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i), false); } } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); vcpu_notify_event(vcpu, false); } return (0); } int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { CPU_ZERO(&vm->debug_cpus); } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) return (EINVAL); CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); } return (0); } int vcpu_debugged(struct vcpu *vcpu) { return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_debug_cpus(struct vm *vm) { return (vm->debug_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } /* * Returns the subset of vCPUs in tostart that are awaiting startup. * These vCPUs are also marked as no longer awaiting startup. */ cpuset_t vm_start_cpus(struct vm *vm, const cpuset_t *tostart) { cpuset_t set; mtx_lock(&vm->rendezvous_mtx); CPU_AND(&set, &vm->startup_cpus, tostart); CPU_ANDNOT(&vm->startup_cpus, &vm->startup_cpus, &set); mtx_unlock(&vm->rendezvous_mtx); return (set); } void vm_await_start(struct vm *vm, const cpuset_t *waiting) { mtx_lock(&vm->rendezvous_mtx); CPU_OR(&vm->startup_cpus, &vm->startup_cpus, waiting); mtx_unlock(&vm->rendezvous_mtx); } void * vcpu_stats(struct vcpu *vcpu) { return (vcpu->stats); } int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state) { *state = vcpu->x2apic_state; return (0); } int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) { if (state >= X2APIC_STATE_LAST) return (EINVAL); vcpu->x2apic_state = state; vlapic_set_x2apic_state(vcpu, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (lapic_intr) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { ipi_cpu(hostcpu, vmm_ipinum); } } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. */ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) { vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } struct vmspace * vm_vmspace(struct vm *vm) { return (vm->vmspace); } struct vm_mem * vm_mem(struct vm *vm) { return (&vm->mem); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } int vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { struct vm *vm = vcpu->vm; int error, i; /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to * call the rendezvous handler in case this 'vcpu' is one * of the targets of the rendezvous. */ VMM_CTR0(vcpu, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); error = vm_handle_rendezvous(vcpu); if (error != 0) return (error); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); VMM_CTR0(vcpu, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm->rendezvous_func = func; mtx_unlock(&vm->rendezvous_mtx); /* * Wake up any sleeping vcpus and trigger a VM-exit in any running * vcpus so they handle the rendezvous as soon as possible. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &dest)) vcpu_notify_event(vm_vcpu(vm, i), false); } return (vm_handle_rendezvous(vcpu)); } struct vatpic * vm_atpic(struct vm *vm) { return (vm->vatpic); } struct vatpit * vm_atpit(struct vm *vm) { return (vm->vatpit); } struct vpmtmr * vm_pmtmr(struct vm *vm) { return (vm->vpmtmr); } struct vrtc * vm_rtc(struct vm *vm) { return (vm->vrtc); } enum vm_reg_name vm_segment_name(int seg) { static enum vm_reg_name seg_names[] = { VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; KASSERT(seg >= 0 && seg < nitems(seg_names), ("%s: invalid segment encoding %d", __func__, seg)); return (seg_names[seg]); } void vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo) { int idx; for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; void *hva, *cookie; uint64_t gpa; bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { if (nused >= num_copyinfo) return (EFAULT); error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; gla += n; nused++; } for (idx = 0; idx < nused; idx++) { hva = vm_gpa_hold(vcpu, copyinfo[idx].gpa, copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; copyinfo[idx].cookie = cookie; } if (idx != nused) { vm_copy_teardown(copyinfo, num_copyinfo); return (EFAULT); } else { *fault = 0; return (0); } } void vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len) { char *dst; int idx; dst = kaddr; idx = 0; while (len > 0) { bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; idx++; } } void vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) { const char *src; int idx; src = kaddr; idx = 0; while (len > 0) { bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; idx++; } } /* * Return the amount of in-use and wired memory for the VM. Since * these are global stats, only return the values with for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); VMM_STAT_DECLARE(VMM_MEM_WIRED); static void vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat) { if (vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * vmspace_resident_count(vcpu->vm->vmspace)); } } static void vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat) { if (vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE * pmap_wired_count(vmspace_pmap(vcpu->vm->vmspace))); } } VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); #ifdef BHYVE_SNAPSHOT static int vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta) { uint64_t tsc, now; int ret; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = vm->vcpu[i]; if (vcpu == NULL) continue; SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); /* * Save the absolute TSC value by adding now to tsc_offset. * * It will be turned turned back into an actual offset when the * TSC restore function is called */ tsc = now + vcpu->tsc_offset; SNAPSHOT_VAR_OR_LEAVE(tsc, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) vcpu->tsc_offset = tsc; } done: return (ret); } static int vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta) { int ret; ret = vm_snapshot_vcpus(vm, meta); if (ret != 0) goto done; SNAPSHOT_VAR_OR_LEAVE(vm->startup_cpus, meta, ret, done); done: return (ret); } static int vm_snapshot_vcpu(struct vm *vm, struct vm_snapshot_meta *meta) { int error; struct vcpu *vcpu; uint16_t i, maxcpus; error = 0; maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = vm->vcpu[i]; if (vcpu == NULL) continue; error = vmmops_vcpu_snapshot(vcpu->cookie, meta); if (error != 0) { printf("%s: failed to snapshot vmcs/vmcb data for " "vCPU: %d; error: %d\n", __func__, i, error); goto done; } } done: return (error); } /* * Save kernel-side structures to user-space for snapshotting. */ int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta) { int ret = 0; switch (meta->dev_req) { case STRUCT_VMCX: ret = vm_snapshot_vcpu(vm, meta); break; case STRUCT_VM: ret = vm_snapshot_vm(vm, meta); break; case STRUCT_VIOAPIC: ret = vioapic_snapshot(vm_ioapic(vm), meta); break; case STRUCT_VLAPIC: ret = vlapic_snapshot(vm, meta); break; case STRUCT_VHPET: ret = vhpet_snapshot(vm_hpet(vm), meta); break; case STRUCT_VATPIC: ret = vatpic_snapshot(vm_atpic(vm), meta); break; case STRUCT_VATPIT: ret = vatpit_snapshot(vm_atpit(vm), meta); break; case STRUCT_VPMTMR: ret = vpmtmr_snapshot(vm_pmtmr(vm), meta); break; case STRUCT_VRTC: ret = vrtc_snapshot(vm_rtc(vm), meta); break; default: printf("%s: failed to find the requested type %#x\n", __func__, meta->dev_req); ret = (EINVAL); } return (ret); } void vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset) { vcpu->tsc_offset = offset; } int vm_restore_time(struct vm *vm) { int error; uint64_t now; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); error = vhpet_restore_time(vm_hpet(vm)); if (error) return (error); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = vm->vcpu[i]; if (vcpu == NULL) continue; error = vmmops_restore_tsc(vcpu->cookie, vcpu->tsc_offset - now); if (error) return (error); } return (0); } #endif