Index: head/sys/amd64/vmm/amd/amdvi_hw.c =================================================================== --- head/sys/amd64/vmm/amd/amdvi_hw.c (revision 335029) +++ head/sys/amd64/vmm/amd/amdvi_hw.c (revision 335030) @@ -1,1459 +1,1461 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pcib_if.h" #include "io/iommu.h" #include "amdvi_priv.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW, NULL, NULL); #define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s))) #define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s))) /* Print RID or device ID in PCI string format. */ #define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d) static void amdvi_dump_cmds(struct amdvi_softc *softc); static void amdvi_print_dev_cap(struct amdvi_softc *softc); MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi"); extern device_t *ivhd_devs; extern int ivhd_count; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count, 0, NULL); static int amdvi_enable_user = 0; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN, &amdvi_enable_user, 0, NULL); TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user); #ifdef AMDVI_ATS_ENABLE /* XXX: ATS is not tested. */ static int amdvi_enable_iotlb = 1; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN, &amdvi_enable_iotlb, 0, NULL); TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb); #endif static int amdvi_host_ptp = 1; /* Use page tables for host. */ SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN, &amdvi_host_ptp, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp); /* Page table level used <= supported by h/w[v1=7]. */ static int amdvi_ptp_level = 4; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN, &amdvi_ptp_level, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level); /* Disable fault event reporting. 
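* (All of the amdvi knobs above are boot-time tunables; an illustrative * loader.conf(5) fragment, with names taken from the TUNABLE_INT() calls above * and example values only: hw.vmm.amdvi_enable=1, hw.vmm.amdvi.ptp_level=4.)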
*/ static int amdvi_disable_io_fault = 0; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN, &amdvi_disable_io_fault, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault); static uint32_t amdvi_dom_id = 0; /* 0 is reserved for host. */ SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD, &amdvi_dom_id, 0, NULL); /* * Device table entry. * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes). * = 256 * 2 * PAGE_SIZE. */ static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE); CTASSERT(PCI_NUM_DEV_MAX == 0x10000); CTASSERT(sizeof(amdvi_dte) == 0x200000); static SLIST_HEAD (, amdvi_domain) dom_head; static inline uint32_t amdvi_pci_read(struct amdvi_softc *softc, int off) { return (pci_cfgregread(PCI_RID2BUS(softc->pci_rid), PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), off, 4)); } #ifdef AMDVI_ATS_ENABLE /* XXX: Should be in pci.c */ /* * Check if the device has the ATS capability and it is enabled. * If ATS is absent or disabled, return (-1); otherwise return the ATS * queue length. */ static int amdvi_find_ats_qlen(uint16_t devid) { device_t dev; uint32_t off, cap; int qlen = -1; dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid), PCI_RID2FUNC(devid)); if (!dev) { return (-1); } #define PCIM_ATS_EN BIT(31) if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) { cap = pci_read_config(dev, off + 4, 4); qlen = (cap & 0x1F); qlen = qlen ? qlen : 32; printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n", RID2PCI_STR(devid), (cap & PCIM_ATS_EN) ? "enabled" : "disabled", qlen); qlen = (cap & PCIM_ATS_EN) ? qlen : -1; } return (qlen); } /* * Check if an endpoint device supports device IOTLB or ATS. */ static inline bool amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid) { struct ivhd_dev_cfg *cfg; int qlen, i; bool pci_ats, ivhd_ats; qlen = amdvi_find_ats_qlen(devid); if (qlen < 0) return (false); KASSERT(softc, ("softc is NULL")); cfg = softc->dev_cfg; ivhd_ats = false; for (i = 0; i < softc->dev_cfg_cnt; i++) { if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) { ivhd_ats = cfg->enable_ats; break; } cfg++; } pci_ats = (qlen < 0) ? false : true; if (pci_ats != ivhd_ats) device_printf(softc->dev, "BIOS bug: mismatch in ATS setting for %d.%d.%d, " "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen); /* Ignore IVRS setting and respect PCI setting. */ return (pci_ats); } #endif /* Enable IOTLB support for the IOMMU if it is supported. */ static inline void amdvi_hw_enable_iotlb(struct amdvi_softc *softc) { #ifndef AMDVI_ATS_ENABLE softc->iotlb = false; #else bool supported; supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? true : false; if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) { if (!supported) device_printf(softc->dev, "IOTLB disabled by BIOS.\n"); if (supported && !amdvi_enable_iotlb) { device_printf(softc->dev, "IOTLB disabled by user.\n"); supported = false; } } else supported = false; softc->iotlb = supported; #endif } static int amdvi_init_cmd(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl = softc->ctrl; ctrl->cmd.len = 8; /* Use 256 command buffer entries. */ softc->cmd_max = 1 << ctrl->cmd.len; softc->cmd = malloc(sizeof(struct amdvi_cmd) * softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO); if ((uintptr_t)softc->cmd & PAGE_MASK) panic("AMDVi: Command buffer not aligned on page boundary."); ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE; /* * XXX: Reset the h/w pointers in case the IOMMU is restarting; * empirically, the h/w does not clear these pointers on its own. 
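* * For reference, a sketch of the ring arithmetic used throughout (illustrative * only): MOD_INC()/MOD_DEC() step a byte offset through a power-of-two ring of * 16-byte struct amdvi_cmd entries, so with cmd.len = 8 (256 entries): * * MOD_INC(off, sizeof(struct amdvi_cmd), softc->cmd_max) * == (off + 16) % (256 * 16) * * i.e. offsets always wrap within the 4KB command buffer.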
*/ ctrl->cmd_tail = 0; ctrl->cmd_head = 0; return (0); } /* * Note: Update the tail pointer only after the command has been written, * since the tail pointer update causes the h/w to execute new commands; * see section 3.3 of AMD IOMMU spec ver 2.0. */ /* Get the command tail pointer w/o updating it. */ static struct amdvi_cmd * amdvi_get_cmd_tail(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; struct amdvi_cmd *tail; KASSERT(softc, ("softc is NULL")); KASSERT(softc->cmd != NULL, ("cmd is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd + ctrl->cmd_tail); return (tail); } /* * Update the command tail pointer, which will start command execution. */ static void amdvi_update_cmd_tail(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; int size; size = sizeof(struct amdvi_cmd); KASSERT(softc->cmd != NULL, ("cmd is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max); softc->total_cmd++; #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "cmd_tail: Tail:0x%x, Head:0x%x.\n", ctrl->cmd_tail, ctrl->cmd_head); #endif } /* * Various commands supported by IOMMU. */ /* Completion wait command. */ static void amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data) { struct amdvi_cmd *cmd; uint64_t pa; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); pa = vtophys(&softc->cmp_data); cmd->opcode = AMDVI_CMP_WAIT_OPCODE; cmd->word0 = (pa & 0xFFFFFFF8) | (AMDVI_CMP_WAIT_STORE); //(AMDVI_CMP_WAIT_FLUSH | AMDVI_CMP_WAIT_STORE); cmd->word1 = (pa >> 32) & 0xFFFFF; cmd->addr = data; amdvi_update_cmd_tail(softc); } /* Invalidate device table entry. */ static void amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_DTE_OPCODE; cmd->word0 = devid; amdvi_update_cmd_tail(softc); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid); #endif } /* Invalidate an IOMMU page; used to invalidate a domain. */ static void amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, uint64_t addr, bool guest_nested, bool pde, bool page) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_PAGE_OPCODE; cmd->word1 = domain_id; /* * Invalidate all addresses for this domain. */ cmd->addr = addr; cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0; cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0; amdvi_update_cmd_tail(softc); } #ifdef AMDVI_ATS_ENABLE /* Invalidate device IOTLB. */ static void amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; int qlen; if (!softc->iotlb) return; qlen = amdvi_find_ats_qlen(devid); if (qlen < 0) { panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n", qlen, RID2PCI_STR(devid)); } cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate IOTLB devID 0x%x" " Qlen:%d\n", devid, qlen); #endif cmd->opcode = AMDVI_INVD_IOTLB_OPCODE; cmd->word0 = devid; cmd->word1 = qlen; cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR | AMDVI_INVD_IOTLB_S; amdvi_update_cmd_tail(softc); } #endif #ifdef notyet /* For Interrupt Remap. 
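* (Would invalidate a device's interrupt-remapping table entries; kept under * "notyet" because interrupt remapping is not wired up in this driver.)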
*/ static void amdvi_cmd_inv_intr_map(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_INTR_OPCODE; cmd->word0 = devid; amdvi_update_cmd_tail(softc); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid); #endif } #endif /* Invalidate domain using INVALIDATE_IOMMU_PAGES command. */ static void amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); /* * See section 3.3.3 of IOMMU spec rev 2.0, software note * for invalidating domain. */ amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR, false, true, true); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id); #endif } static bool amdvi_cmp_wait(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; const uint64_t VERIFY = 0xA5A5; volatile uint64_t *read; int i; bool status; ctrl = softc->ctrl; read = &softc->cmp_data; *read = 0; amdvi_cmd_cmp(softc, VERIFY); /* Wait for h/w to update completion data. */ for (i = 0; i < 100 && (*read != VERIFY); i++) { DELAY(1000); /* 1 ms */ } status = (VERIFY == softc->cmp_data) ? true : false; #ifdef AMDVI_DEBUG_CMD if (status) device_printf(softc->dev, "CMD completion DONE Tail:0x%x, " "Head:0x%x, loop:%d.\n", ctrl->cmd_tail, ctrl->cmd_head, i); #endif return (status); } static void amdvi_wait(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; int i; KASSERT(softc, ("softc is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); /* Don't wait if h/w is not enabled. */ if ((ctrl->control & AMDVI_CTRL_EN) == 0) return; for (i = 0; i < 10; i++) { if (amdvi_cmp_wait(softc)) return; } device_printf(softc->dev, "Error: completion failed" " tail:0x%x, head:0x%x.\n", ctrl->cmd_tail, ctrl->cmd_head); amdvi_dump_cmds(softc); } static void amdvi_dump_cmds(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; struct amdvi_cmd *cmd; int off, i; ctrl = softc->ctrl; device_printf(softc->dev, "Dump all the commands:\n"); /* * If the h/w is stuck on a completion, the culprit is likely the * previous command; start dumping from the previous command onward. */ off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd), softc->cmd_max); for (i = 0; off != ctrl->cmd_tail && i < softc->cmd_max; i++) { cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off); printf(" [CMD%d, off:0x%x] opcode= 0x%x 0x%x" " 0x%x 0x%lx\n", i, off, cmd->opcode, cmd->word0, cmd->word1, cmd->addr); off = (off + sizeof(struct amdvi_cmd)) % (softc->cmd_max * sizeof(struct amdvi_cmd)); } } static int amdvi_init_event(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; ctrl = softc->ctrl; ctrl->event.len = 8; softc->event_max = 1 << ctrl->event.len; softc->event = malloc(sizeof(struct amdvi_event) * softc->event_max, M_AMDVI, M_WAITOK | M_ZERO); if ((uintptr_t)softc->event & PAGE_MASK) { device_printf(softc->dev, "Event buffer not aligned on page.\n"); return (ENOMEM); } ctrl->event.base = vtophys(softc->event) / PAGE_SIZE; /* Reset the pointers. 
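* (Same empirical caveat as for the command ring above: the h/w does not clear * the event-log head/tail registers when the log is re-enabled.)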
*/ ctrl->evt_head = 0; ctrl->evt_tail = 0; return (0); } static inline void amdvi_decode_evt_flag(uint16_t flag) { flag &= AMDVI_EVENT_FLAG_MASK; printf(" 0x%b]\n", flag, "\020" "\001GN" "\002NX" "\003US" "\004I" "\005PR" "\006RW" "\007PE" "\010RZ" "\011TR" ); } /* See section 2.5.4 of AMD IOMMU spec ver 2.62. */ static inline void amdvi_decode_evt_flag_type(uint8_t type) { switch (AMDVI_EVENT_FLAG_TYPE(type)) { case 0: printf("RSVD\n"); break; case 1: printf("Master Abort\n"); break; case 2: printf("Target Abort\n"); break; case 3: printf("Data Err\n"); break; default: break; } } static void amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[INVALID_DTE EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); } static void amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); } static void amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); amdvi_decode_evt_flag_type(flag); } static void amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); amdvi_decode_evt_flag_type(flag); } static void amdvi_decode_evt(struct amdvi_event *evt) { struct amdvi_cmd *cmd; switch (evt->opcode) { case AMDVI_EVENT_INVALID_DTE: amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_PFAULT: amdvi_decode_pf_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_DTE_HW_ERROR: amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_PAGE_HW_ERROR: amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_ILLEGAL_CMD: /* FALL THROUGH */ case AMDVI_EVENT_CMD_HW_ERROR: printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ? "ILLEGAL CMD" : "CMD HW ERR"); cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr); printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n", cmd->opcode, cmd->word0, cmd->word1, cmd->addr); break; case AMDVI_EVENT_IOTLB_TIMEOUT: printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n", evt->devid, evt->addr); break; case AMDVI_EVENT_INVALID_DTE_REQ: printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n", evt->devid, evt->addr, evt->flag >> 9, (evt->flag >> 8) & 1); break; case AMDVI_EVENT_INVALID_PPR_REQ: case AMDVI_EVENT_COUNTER_ZERO: printf("AMD-Vi: v2 events.\n"); break; default: printf("Unsupported AMD-Vi event:%d\n", evt->opcode); } } static void amdvi_print_events(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; struct amdvi_event *event; int i, size; ctrl = softc->ctrl; size = sizeof(struct amdvi_event); for (i = 0; i < softc->event_max; i++) { event = &softc->event[ctrl->evt_head / size]; if (!event->opcode) break; device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n", i, ctrl->evt_head, ctrl->evt_tail); amdvi_decode_evt(event); ctrl->evt_head = MOD_INC(ctrl->evt_head, size, softc->event_max); } } static int amdvi_init_dte(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; ctrl = softc->ctrl; ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE; ctrl->dte.size = 0x1FF; /* 2MB device table. */ return (0); } /* * Not all capabilities of the IOMMU are available in the ACPI IVHD flag * or EFR entry; read them directly from the device. */ static int amdvi_print_pci_cap(device_t dev) { struct amdvi_softc *softc; uint32_t off, cap; softc = device_get_softc(dev); off = softc->cap_off; /* * Section 3.7.1 of IOMMU spec rev 2.0. * Read capability from device. */ cap = amdvi_pci_read(softc, off); /* Make sure capability type[18:16] is 3. */ KASSERT((((cap >> 16) & 0x7) == 0x3), ("Not an IOMMU capability 0x%x@0x%x", cap, off)); softc->pci_cap = cap >> 24; device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n", cap, off, softc->pci_cap, "\20\1IOTLB\2HT\3NPCache\4EFR\5CapExt"); return (0); } static void amdvi_event_intr(void *arg) { struct amdvi_softc *softc; struct amdvi_ctrl *ctrl; softc = (struct amdvi_softc *)arg; ctrl = softc->ctrl; device_printf(softc->dev, "EVT INTR %ld Status:0x%x" " [EVT Head:0x%x Tail:0x%x]\n", softc->event_intr_cnt++, ctrl->status, ctrl->evt_head, ctrl->evt_tail); printf(" [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n", softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head); amdvi_print_events(softc); ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; } static void amdvi_free_evt_intr_res(device_t dev) { struct amdvi_softc *softc; softc = device_get_softc(dev); if (softc->event_tag != NULL) { bus_teardown_intr(dev, softc->event_res, softc->event_tag); } if (softc->event_res != NULL) { bus_release_resource(dev, SYS_RES_IRQ, softc->event_rid, softc->event_res); } bus_delete_resource(dev, SYS_RES_IRQ, softc->event_rid); PCIB_RELEASE_MSI(device_get_parent(device_get_parent(dev)), dev, 1, &softc->event_irq); } static int amdvi_alloc_intr_resources(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; device_t dev, pcib; device_t mmio_dev; uint64_t msi_addr; uint32_t msi_data; int err; dev = softc->dev; pcib = device_get_parent(device_get_parent(dev)); mmio_dev = pci_find_bsf(PCI_RID2BUS(softc->pci_rid), PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid)); if (mmio_dev != NULL && device_is_attached(mmio_dev)) { device_printf(dev, "warning: IOMMU device is claimed by another driver %s\n", device_get_driver(mmio_dev)->name); } softc->event_irq = -1; softc->event_rid = 0; /* * Section 3.7.1 of IOMMU 
rev 2.0. With MSI, there is only one * interrupt. XXX: Enable MSI/X support. */ err = PCIB_ALLOC_MSI(pcib, dev, 1, 1, &softc->event_irq); if (err) { device_printf(dev, "Couldn't find event MSI IRQ resource.\n"); return (ENOENT); } err = bus_set_resource(dev, SYS_RES_IRQ, softc->event_rid, softc->event_irq, 1); if (err) { device_printf(dev, "Couldn't set event MSI resource.\n"); return (ENXIO); } softc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &softc->event_rid, RF_ACTIVE); if (!softc->event_res) { device_printf(dev, "Unable to allocate event INTR resource.\n"); return (ENOMEM); } if (bus_setup_intr(dev, softc->event_res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, amdvi_event_intr, softc, &softc->event_tag)) { device_printf(dev, "Failed to set up event interrupt.\n"); bus_release_resource(softc->dev, SYS_RES_IRQ, softc->event_rid, softc->event_res); softc->event_res = NULL; return (ENXIO); } bus_describe_intr(dev, softc->event_res, softc->event_tag, "fault"); err = PCIB_MAP_MSI(pcib, dev, softc->event_irq, &msi_addr, &msi_data); if (err) { device_printf(dev, "Event interrupt config failed, err=%d.\n", err); amdvi_free_evt_intr_res(softc->dev); return (err); } /* Clear interrupt status bits. */ ctrl = softc->ctrl; ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; /* Now enable MSI interrupt. */ pci_enable_msi(mmio_dev, msi_addr, msi_data); return (0); } static void amdvi_print_dev_cap(struct amdvi_softc *softc) { struct ivhd_dev_cfg *cfg; int i; cfg = softc->dev_cfg; for (i = 0; i < softc->dev_cfg_cnt; i++) { device_printf(softc->dev, "device [0x%x - 0x%x] " "config:%b %s\n", cfg->start_id, cfg->end_id, cfg->data, "\020\001INIT\002ExtInt\003NMI" "\007LINT0\010LINT1", cfg->enable_ats ? "ATS enabled" : ""); cfg++; } } static int amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS) { struct amdvi_softc *softc; int result, type, error = 0; softc = (struct amdvi_softc *)arg1; type = arg2; switch (type) { case 0: result = softc->ctrl->cmd_head; error = sysctl_handle_int(oidp, &result, 0, req); break; case 1: result = softc->ctrl->cmd_tail; error = sysctl_handle_int(oidp, &result, 0, req); break; case 2: result = softc->ctrl->evt_head; error = sysctl_handle_int(oidp, &result, 0, req); break; case 3: result = softc->ctrl->evt_tail; error = sysctl_handle_int(oidp, &result, 0, req); break; default: device_printf(softc->dev, "Unknown sysctl:%d\n", type); } return (error); } static void amdvi_add_sysctl(struct amdvi_softc *softc) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev; dev = softc->dev; ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD, &softc->event_intr_cnt, "Event interrupt count"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD, &softc->total_cmd, "Command submitted count"); SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD, &softc->pci_rid, 0, "IOMMU RID"); SYSCTL_ADD_U16(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD, &softc->start_dev_rid, 0, "Start of device under this IOMMU"); SYSCTL_ADD_U16(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD, &softc->end_dev_rid, 0, "End of device under this IOMMU"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head", CTLTYPE_UINT | CTLFLAG_RD, softc, 0, amdvi_handle_sysctl, "IU", "Command head"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail", CTLTYPE_UINT | CTLFLAG_RD, softc, 1, amdvi_handle_sysctl, "IU", "Command tail"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head", CTLTYPE_UINT | 
CTLFLAG_RD, softc, 2, amdvi_handle_sysctl, "IU", "Event head"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail", CTLTYPE_UINT | CTLFLAG_RD, softc, 3, amdvi_handle_sysctl, "IU", "Event tail"); } int amdvi_setup_hw(struct amdvi_softc *softc) { device_t dev; int status; dev = softc->dev; amdvi_hw_enable_iotlb(softc); amdvi_print_dev_cap(softc); if ((status = amdvi_print_pci_cap(dev)) != 0) { device_printf(dev, "Couldn't read PCI capability.\n"); return (status); } if ((status = amdvi_init_cmd(softc)) != 0) { device_printf(dev, "Couldn't configure command buffer.\n"); return (status); } if ((status = amdvi_init_event(softc)) != 0) { device_printf(dev, "Couldn't configure event buffer.\n"); return (status); } if ((status = amdvi_init_dte(softc)) != 0) { device_printf(dev, "Couldn't configure device table.\n"); return (status); } if ((status = amdvi_alloc_intr_resources(softc)) != 0) { return (status); } amdvi_add_sysctl(softc); return (0); } int amdvi_teardown_hw(struct amdvi_softc *softc) { device_t dev; dev = softc->dev; /* * Called after disable; the h/w is stopped by now, so free all the resources. */ amdvi_free_evt_intr_res(dev); if (softc->cmd) free(softc->cmd, M_AMDVI); if (softc->event) free(softc->event, M_AMDVI); return (0); } /*********** bhyve interfaces *********************/ static int amdvi_init(void) { if (!ivhd_count) { return (EIO); } if (!amdvi_enable_user && ivhd_count) { printf("bhyve: Found %d AMD-Vi/IOMMU device(s), " "use hw.vmm.amdvi.enable=1 to enable pass-through.\n", ivhd_count); return (EINVAL); } return (0); } static void amdvi_cleanup(void) { /* Nothing. */ } static uint16_t amdvi_domainId(void) { /* * If we hit the maximum domain limit, roll over, skipping the host * domain (0). * XXX: make sure that this domain is not used. */ if (amdvi_dom_id == AMDVI_MAX_DOMAIN) amdvi_dom_id = 1; return ((uint16_t)amdvi_dom_id++); } static void amdvi_do_inv_domain(uint16_t domain_id, bool create) { struct amdvi_softc *softc; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL")); /* * If not present pages are cached, invalidate page after * creating domain. */ #if 0 if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0)) continue; #endif amdvi_inv_domain(softc, domain_id); amdvi_wait(softc); } } static void * amdvi_create_domain(vm_paddr_t maxaddr) { struct amdvi_domain *dom; dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK); dom->id = amdvi_domainId(); //dom->maxaddr = maxaddr; #ifdef AMDVI_DEBUG_CMD printf("Created domain #%d\n", dom->id); #endif /* * The host domain (#0) gets a translation table only if amdvi_host_ptp * is set. */ if (dom->id || amdvi_host_ptp) dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); dom->ptp_level = amdvi_ptp_level; amdvi_do_inv_domain(dom->id, true); SLIST_INSERT_HEAD(&dom_head, dom, next); return (dom); } static void amdvi_free_ptp(uint64_t *ptp, int level) { int i; if (level < 1) return; for (i = 0; i < NPTEPG ; i++) { if ((ptp[i] & AMDVI_PT_PRESENT) == 0) continue; /* XXX: Add super-page or PTE mapping > 4KB. */ #ifdef notyet /* Super-page mapping. 
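* (A PDE whose next-level field, bits [11:9], is 7 maps a region larger than * 4KB directly and has no child table to recurse into; see AMDVI_PD_SUPER() * in amdvi_priv.h.)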
*/ if (AMDVI_PD_SUPER(ptp[i])) continue; #endif amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i] & AMDVI_PT_MASK), level - 1); } free(ptp, M_AMDVI); } static void amdvi_destroy_domain(void *arg) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; KASSERT(domain, ("domain is NULL")); #ifdef AMDVI_DEBUG_CMD printf("Destroying domain %d\n", domain->id); #endif if (domain->ptp) amdvi_free_ptp(domain->ptp, domain->ptp_level); amdvi_do_inv_domain(domain->id, false); SLIST_REMOVE(&dom_head, domain, amdvi_domain, next); free(domain, M_AMDVI); } static uint64_t amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t pg_size, bool create) { uint64_t *page, pa; int shift, index; const int PT_SHIFT = 9; const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1; /* Based on PT_SHIFT */ if (!pg_size) return (0); if (hpa & (pg_size - 1)) { printf("HPA is not size aligned.\n"); return (0); } if (gpa & (pg_size - 1)) { printf("GPA is not size aligned.\n"); return (0); } shift = PML4SHIFT; while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) { index = (gpa >> shift) & PT_INDEX_MASK; if ((pt[index] == 0) && create) { page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); pa = vtophys(page); pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW | ((level - 1) << AMDVI_PD_LEVEL_SHIFT); } #ifdef AMDVI_DEBUG_PTE if ((gpa % 0x1000000) == 0) printf("[level%d, shift = %d]PTE:0x%lx\n", level, shift, pt[index]); #endif #define PTE2PA(x) ((uint64_t)(x) & AMDVI_PT_MASK) pa = PTE2PA(pt[index]); pt = (uint64_t *)PHYS_TO_DMAP(pa); shift -= PT_SHIFT; level--; } /* Leaf entry. */ index = (gpa >> shift) & PT_INDEX_MASK; if (create) { pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT; } else pt[index] = 0; #ifdef AMDVI_DEBUG_PTE if ((gpa % 0x1000000) == 0) printf("[Last level%d, shift = %d]PTE:0x%lx\n", level, shift, pt[index]); #endif return (1ULL << shift); } static uint64_t amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t size, bool create) { uint64_t mapped, *ptp, len; int level; KASSERT(domain, ("domain is NULL")); level = domain->ptp_level; KASSERT(level, ("Page table level is 0")); ptp = domain->ptp; KASSERT(ptp, ("PTP is NULL")); mapped = 0; while (mapped < size) { len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped, PAGE_SIZE, create); if (!len) { printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n", hpa, gpa); return (0); } mapped += len; } return (mapped); } static uint64_t amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; if (domain->id && !domain->ptp) { printf("ptp is NULL\n"); return (-1); } /* * If the host domain was created w/o a page table, skip the IOMMU page * table set-up. */ if (domain->ptp) return (amdvi_update_mapping(domain, gpa, hpa, len, true)); else return (len); } static uint64_t amdvi_destroy_mapping(void *arg, vm_paddr_t gpa, uint64_t len) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; /* * If the host domain was created w/o a page table, skip the IOMMU page * table set-up. */ if (domain->ptp) return (amdvi_update_mapping(domain, gpa, 0, len, false)); return (len); } static struct amdvi_softc * amdvi_find_iommu(uint16_t devid) { struct amdvi_softc *softc; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); if ((devid >= softc->start_dev_rid) && (devid <= softc->end_dev_rid)) return (softc); } /* * XXX: BIOS bug: the device is not in the IVRS table; assume it is from the first IOMMU. 
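* (RID layout reminder: devid[15:8] is the bus, devid[7:3] the slot and * devid[2:0] the function; RID2PCI_STR() unpacks these for the message below.)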
*/ printf("BIOS bug device(%d.%d.%d) doesn't have IVHD entry.\n", RID2PCI_STR(devid)); return (device_get_softc(ivhd_devs[0])); } /* * Set-up device table entry. * IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must * be set concurrently, e.g. read and write bits. */ static void amdvi_set_dte(struct amdvi_domain *domain, uint16_t devid, bool enable) { struct amdvi_softc *softc; struct amdvi_dte* temp; KASSERT(domain, ("domain is NULL for pci_rid:0x%x\n", devid)); softc = amdvi_find_iommu(devid); KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid)); temp = &amdvi_dte[devid]; #ifdef AMDVI_ATS_ENABLE /* If IOMMU and device support IOTLB, enable it. */ if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb) temp->iotlb_enable = 1; #endif /* Avoid duplicate I/O faults. */ temp->sup_second_io_fault = 1; temp->sup_all_io_fault = amdvi_disable_io_fault; temp->dt_valid = 1; temp->domain_id = domain->id; if (enable) { if (domain->ptp) { temp->pt_base = vtophys(domain->ptp) >> 12; temp->pt_level = amdvi_ptp_level; } /* * XXX: Page table valid[TV] bit must be set even if host domain * page tables are not enabled. */ temp->pt_valid = 1; temp->read_allow = 1; temp->write_allow = 1; } } static void amdvi_inv_device(uint16_t devid) { struct amdvi_softc *softc; softc = amdvi_find_iommu(devid); KASSERT(softc, ("softc is NULL")); amdvi_cmd_inv_dte(softc, devid); #ifdef AMDVI_ATS_ENABLE if (amdvi_dev_support_iotlb(softc, devid)) amdvi_cmd_inv_iotlb(softc, devid); #endif amdvi_wait(softc); } static void amdvi_add_device(void *arg, uint16_t devid) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; KASSERT(domain != NULL, ("domain is NULL")); #ifdef AMDVI_DEBUG_CMD printf("Assigning device(%d.%d.%d) to domain:%d\n", RID2PCI_STR(devid), domain->id); #endif amdvi_set_dte(domain, devid, true); amdvi_inv_device(devid); } static void amdvi_remove_device(void *arg, uint16_t devid) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; #ifdef AMDVI_DEBUG_CMD printf("Remove device(0x%x) from domain:%d\n", devid, domain->id); #endif amdvi_set_dte(domain, devid, false); amdvi_inv_device(devid); } static void amdvi_enable(void) { struct amdvi_ctrl *ctrl; struct amdvi_softc *softc; uint64_t val; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL\n")); ctrl = softc->ctrl; KASSERT(ctrl, ("ctrl is NULL\n")); val = ( AMDVI_CTRL_EN | AMDVI_CTRL_CMD | AMDVI_CTRL_ELOG | AMDVI_CTRL_ELOGINT | AMDVI_CTRL_INV_TO_1S); if (softc->ivhd_flag & IVHD_FLAG_COH) val |= AMDVI_CTRL_COH; if (softc->ivhd_flag & IVHD_FLAG_HTT) val |= AMDVI_CTRL_HTT; if (softc->ivhd_flag & IVHD_FLAG_RPPW) val |= AMDVI_CTRL_RPPW; if (softc->ivhd_flag & IVHD_FLAG_PPW) val |= AMDVI_CTRL_PPW; if (softc->ivhd_flag & IVHD_FLAG_ISOC) val |= AMDVI_CTRL_ISOC; ctrl->control = val; } } static void amdvi_disable(void) { struct amdvi_ctrl *ctrl; struct amdvi_softc *softc; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL\n")); ctrl = softc->ctrl; KASSERT(ctrl, ("ctrl is NULL\n")); ctrl->control = 0; } } static void amdvi_inv_tlb(void *arg) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; KASSERT(domain, ("domain is NULL")); amdvi_do_inv_domain(domain->id, false); } struct iommu_ops iommu_ops_amd = { amdvi_init, amdvi_cleanup, amdvi_enable, amdvi_disable, amdvi_create_domain, amdvi_destroy_domain, amdvi_create_mapping, amdvi_destroy_mapping, amdvi_add_device, amdvi_remove_device, 
amdvi_inv_tlb }; Index: head/sys/amd64/vmm/amd/amdvi_priv.h =================================================================== --- head/sys/amd64/vmm/amd/amdvi_priv.h (revision 335029) +++ head/sys/amd64/vmm/amd/amdvi_priv.h (revision 335030) @@ -1,429 +1,431 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2016 Anish Gupta (anish@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _AMDVI_PRIV_H_ #define _AMDVI_PRIV_H_ #include #define BIT(n) (1ULL << (n)) /* Return value of bits[n:m] where n and (n >= ) m are bit positions. */ #define REG_BITS(x, n, m) (((x) >> (m)) & \ ((1 << (((n) - (m)) + 1)) - 1)) /* * IOMMU PCI capability. */ #define AMDVI_PCI_CAP_IOTLB BIT(0) /* IOTLB is supported. */ #define AMDVI_PCI_CAP_HT BIT(1) /* HyperTransport tunnel support. */ #define AMDVI_PCI_CAP_NPCACHE BIT(2) /* Not present page cached. */ #define AMDVI_PCI_CAP_EFR BIT(3) /* Extended features. */ #define AMDVI_PCI_CAP_EXT BIT(4) /* Miscellaneous information reg. */ /* * IOMMU extended features. */ #define AMDVI_EX_FEA_PREFSUP BIT(0) /* Prefetch command support. */ #define AMDVI_EX_FEA_PPRSUP BIT(1) /* PPR support */ #define AMDVI_EX_FEA_XTSUP BIT(2) /* Reserved */ #define AMDVI_EX_FEA_NXSUP BIT(3) /* No-execute. */ #define AMDVI_EX_FEA_GTSUP BIT(4) /* Guest translation support. */ #define AMDVI_EX_FEA_EFRW BIT(5) /* Reserved */ #define AMDVI_EX_FEA_IASUP BIT(6) /* Invalidate all command supp. */ #define AMDVI_EX_FEA_GASUP BIT(7) /* Guest APIC or AVIC support. */ #define AMDVI_EX_FEA_HESUP BIT(8) /* Hardware Error. */ #define AMDVI_EX_FEA_PCSUP BIT(9) /* Performance counters support. */ /* XXX: add more EFER bits. */ /* * Device table entry or DTE * NOTE: Must be 256-bits/32 bytes aligned. */ struct amdvi_dte { uint32_t dt_valid:1; /* Device Table valid. */ uint32_t pt_valid:1; /* Page translation valid. */ uint16_t :7; /* Reserved[8:2] */ uint8_t pt_level:3; /* Paging level, 0 to disable. */ uint64_t pt_base:40; /* Page table root pointer. */ uint8_t :3; /* Reserved[54:52] */ uint8_t gv_valid:1; /* Revision 2, GVA to SPA. */ uint8_t gv_level:2; /* Revision 2, GLX level. */ uint8_t gv_cr3_lsb:3; /* Revision 2, GCR3[14:12] */ uint8_t read_allow:1; /* I/O read enabled. */ uint8_t write_allow:1; /* I/O write enabled. 
*/ uint8_t :1; /* Reserved[63] */ uint16_t domain_id:16; /* Domain ID */ uint16_t gv_cr3_lsb2:16; /* Revision 2, GCR3[30:15] */ uint8_t iotlb_enable:1; /* Device support IOTLB */ uint8_t sup_second_io_fault:1; /* Suppress subsequent I/O faults. */ uint8_t sup_all_io_fault:1; /* Suppress all I/O page faults. */ uint8_t IOctl:2; /* Port I/O control. */ uint8_t iotlb_cache_disable:1; /* IOTLB cache hints. */ uint8_t snoop_disable:1; /* Snoop disable. */ uint8_t allow_ex:1; /* Allow exclusion. */ uint8_t sysmgmt:2; /* System management message.*/ uint8_t :1; /* Reserved[106] */ uint32_t gv_cr3_msb:21; /* Revision 2, GCR3[51:31] */ uint8_t intmap_valid:1; /* Interrupt map valid. */ uint8_t intmap_len:4; /* Interrupt map table length. */ uint8_t intmap_ign:1; /* Ignore unmapped interrupts. */ uint64_t intmap_base:46; /* IntMap base. */ uint8_t :4; /* Reserved[183:180] */ uint8_t init_pass:1; /* INIT pass through or PT */ uint8_t extintr_pass:1; /* External Interrupt PT */ uint8_t nmi_pass:1; /* NMI PT */ uint8_t :1; /* Reserved[187] */ uint8_t intr_ctrl:2; /* Interrupt control */ uint8_t lint0_pass:1; /* LINT0 PT */ uint8_t lint1_pass:1; /* LINT1 PT */ uint64_t :64; /* Reserved[255:192] */ } __attribute__((__packed__)); CTASSERT(sizeof(struct amdvi_dte) == 32); /* * IOMMU command entry. */ struct amdvi_cmd { uint32_t word0; uint32_t word1:28; uint8_t opcode:4; uint64_t addr; } __attribute__((__packed__)); /* Command opcodes. */ #define AMDVI_CMP_WAIT_OPCODE 0x1 /* Completion wait. */ #define AMDVI_INVD_DTE_OPCODE 0x2 /* Invalidate device table entry. */ #define AMDVI_INVD_PAGE_OPCODE 0x3 /* Invalidate pages. */ #define AMDVI_INVD_IOTLB_OPCODE 0x4 /* Invalidate IOTLB pages. */ #define AMDVI_INVD_INTR_OPCODE 0x5 /* Invalidate Interrupt table. */ #define AMDVI_PREFETCH_PAGES_OPCODE 0x6 /* Prefetch IOMMU pages. */ #define AMDVI_COMP_PPR_OPCODE 0x7 /* Complete PPR request. */ #define AMDVI_INV_ALL_OPCODE 0x8 /* Invalidate all. */ /* Completion wait attributes. */ #define AMDVI_CMP_WAIT_STORE BIT(0) /* Write back data. */ #define AMDVI_CMP_WAIT_INTR BIT(1) /* Completion wait interrupt. */ #define AMDVI_CMP_WAIT_FLUSH BIT(2) /* Flush queue. */ /* Invalidate page. */ #define AMDVI_INVD_PAGE_S BIT(0) /* Invalidation size. */ #define AMDVI_INVD_PAGE_PDE BIT(1) /* Invalidate PDE. */ #define AMDVI_INVD_PAGE_GN_GVA BIT(2) /* GPA or GVA. */ #define AMDVI_INVD_PAGE_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) /* Invalidate IOTLB. */ #define AMDVI_INVD_IOTLB_S BIT(0) /* Invalidation size 4k or addr */ #define AMDVI_INVD_IOTLB_GN_GVA BIT(2) /* GPA or GVA. */ #define AMDVI_INVD_IOTLB_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) /* XXX: add more command entries. */ /* * IOMMU event entry. */ struct amdvi_event { uint16_t devid; uint16_t pasid_hi; uint16_t pasid_domid; /* PASID low or DomainID */ uint16_t flag:12; uint8_t opcode:4; uint64_t addr; } __attribute__((__packed__)); CTASSERT(sizeof(struct amdvi_event) == 16); /* Various event types. */ #define AMDVI_EVENT_INVALID_DTE 0x1 #define AMDVI_EVENT_PFAULT 0x2 #define AMDVI_EVENT_DTE_HW_ERROR 0x3 #define AMDVI_EVENT_PAGE_HW_ERROR 0x4 #define AMDVI_EVENT_ILLEGAL_CMD 0x5 #define AMDVI_EVENT_CMD_HW_ERROR 0x6 #define AMDVI_EVENT_IOTLB_TIMEOUT 0x7 #define AMDVI_EVENT_INVALID_DTE_REQ 0x8 #define AMDVI_EVENT_INVALID_PPR_REQ 0x9 #define AMDVI_EVENT_COUNTER_ZERO 0xA #define AMDVI_EVENT_FLAG_MASK 0x1FF /* Mask for event flags. */ #define AMDVI_EVENT_FLAG_TYPE(x) (((x) >> 9) & 0x3) /* * IOMMU control block. 
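* This structure overlays the memory-mapped register file at the IVHD * BaseAddress (mapped via PHYS_TO_DMAP() in ivhd_attach()), so the bitfield * layout must match the MMIO register offsets exactly; the CTASSERTs below * check the known offsets.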
*/ struct amdvi_ctrl { struct { uint16_t size:9; uint16_t :3; uint64_t base:40; /* Devtable register base. */ uint16_t :12; } dte; struct { uint16_t :12; uint64_t base:40; uint8_t :4; uint8_t len:4; uint8_t :4; } cmd; struct { uint16_t :12; uint64_t base:40; uint8_t :4; uint8_t len:4; uint8_t :4; } event; uint16_t control:13; uint64_t :51; struct { uint8_t enable:1; uint8_t allow:1; uint16_t :10; uint64_t base:40; uint16_t :12; uint16_t :12; uint64_t limit:40; uint16_t :12; } excl; /* * Revision 2 only. */ uint64_t ex_feature; struct { uint16_t :12; uint64_t base:40; uint8_t :4; uint8_t len:4; uint8_t :4; } ppr; uint64_t first_event; uint64_t second_event; uint64_t event_status; /* Revision 2 only, end. */ uint8_t pad1[0x1FA8]; /* Padding. */ uint32_t cmd_head:19; uint64_t :45; uint32_t cmd_tail:19; uint64_t :45; uint32_t evt_head:19; uint64_t :45; uint32_t evt_tail:19; uint64_t :45; uint32_t status:19; uint64_t :45; uint64_t pad2; uint8_t :4; uint16_t ppr_head:15; uint64_t :45; uint8_t :4; uint16_t ppr_tail:15; uint64_t :45; uint8_t pad3[0x1FC0]; /* Padding. */ /* XXX: More for rev2. */ } __attribute__((__packed__)); CTASSERT(offsetof(struct amdvi_ctrl, pad1) == 0x58); CTASSERT(offsetof(struct amdvi_ctrl, pad2) == 0x2028); CTASSERT(offsetof(struct amdvi_ctrl, pad3) == 0x2040); #define AMDVI_MMIO_V1_SIZE (4 * PAGE_SIZE) /* v1 size */ /* * AMD IOMMU v2 size including event counters */ #define AMDVI_MMIO_V2_SIZE (8 * PAGE_SIZE) CTASSERT(sizeof(struct amdvi_ctrl) == 0x4000); CTASSERT(sizeof(struct amdvi_ctrl) == AMDVI_MMIO_V1_SIZE); /* IVHD flag */ #define IVHD_FLAG_HTT BIT(0) /* Hypertransport Tunnel. */ #define IVHD_FLAG_PPW BIT(1) /* Pass posted write. */ #define IVHD_FLAG_RPPW BIT(2) /* Response pass posted write. */ #define IVHD_FLAG_ISOC BIT(3) /* Isoc support. */ #define IVHD_FLAG_IOTLB BIT(4) /* IOTLB support. */ #define IVHD_FLAG_COH BIT(5) /* Coherent control, default 1 */ #define IVHD_FLAG_PFS BIT(6) /* Prefetch IOMMU pages. */ #define IVHD_FLAG_PPRS BIT(7) /* Peripheral page request support. */ /* IVHD device entry data setting. */ #define IVHD_DEV_LINT0_PASS BIT(6) /* LINT0 interrupts. */ #define IVHD_DEV_LINT1_PASS BIT(7) /* LINT1 interrupts. */ /* Bit[5:4] for System Mgmt. Bit3 is reserved. */ #define IVHD_DEV_INIT_PASS BIT(0) /* INIT */ #define IVHD_DEV_EXTINTR_PASS BIT(1) /* ExtInt */ #define IVHD_DEV_NMI_PASS BIT(2) /* NMI */ /* IVHD 8-byte extended data settings. */ #define IVHD_DEV_EXT_ATS_DISABLE BIT(31) /* Disable ATS */ /* IOMMU control register. */ #define AMDVI_CTRL_EN BIT(0) /* IOMMU enable. */ #define AMDVI_CTRL_HTT BIT(1) /* Hypertransport tunnel enable. */ #define AMDVI_CTRL_ELOG BIT(2) /* Event log enable. */ #define AMDVI_CTRL_ELOGINT BIT(3) /* Event log interrupt. */ #define AMDVI_CTRL_COMINT BIT(4) /* Completion wait interrupt. */ #define AMDVI_CTRL_PPW BIT(8) #define AMDVI_CTRL_RPPW BIT(9) #define AMDVI_CTRL_COH BIT(10) #define AMDVI_CTRL_ISOC BIT(11) #define AMDVI_CTRL_CMD BIT(12) /* Command buffer enable. */ #define AMDVI_CTRL_PPRLOG BIT(13) #define AMDVI_CTRL_PPRINT BIT(14) #define AMDVI_CTRL_PPREN BIT(15) #define AMDVI_CTRL_GTE BIT(16) /* Guest translation enable. */ #define AMDVI_CTRL_GAE BIT(17) /* Guest APIC enable. */ /* Invalidation timeout. */ #define AMDVI_CTRL_INV_NO_TO 0 /* No timeout. 
*/ #define AMDVI_CTRL_INV_TO_1ms 1 /* 1 ms */ #define AMDVI_CTRL_INV_TO_10ms 2 /* 10 ms */ #define AMDVI_CTRL_INV_TO_100ms 3 /* 100 ms */ #define AMDVI_CTRL_INV_TO_1S 4 /* 1 second */ #define AMDVI_CTRL_INV_TO_10S 5 /* 10 seconds */ #define AMDVI_CTRL_INV_TO_100S 6 /* 100 seconds */ /* * Max number of PCI devices. * 256 buses x 32 slots/devices x 8 functions. */ #define PCI_NUM_DEV_MAX 0x10000 /* Maximum number of domains supported by IOMMU. */ #define AMDVI_MAX_DOMAIN (BIT(16) - 1) /* * IOMMU Page Table attributes. */ #define AMDVI_PT_PRESENT BIT(0) #define AMDVI_PT_COHERENT BIT(60) #define AMDVI_PT_READ BIT(61) #define AMDVI_PT_WRITE BIT(62) #define AMDVI_PT_RW (AMDVI_PT_READ | AMDVI_PT_WRITE) #define AMDVI_PT_MASK 0xFFFFFFFFFF000UL /* Only [51:12] for PA */ #define AMDVI_PD_LEVEL_SHIFT 9 #define AMDVI_PD_SUPER(x) (((x) >> AMDVI_PD_LEVEL_SHIFT) == 7) /* * IOMMU Status, offset 0x2020 */ #define AMDVI_STATUS_EV_OF BIT(0) /* Event overflow. */ #define AMDVI_STATUS_EV_INTR BIT(1) /* Event interrupt. */ /* Completion wait command completed. */ #define AMDVI_STATUS_CMP BIT(2) #define IVRS_CTRL_RID 1 /* MMIO RID */ /* ACPI IVHD */ struct ivhd_dev_cfg { uint32_t start_id; uint32_t end_id; uint8_t data; /* Device configuration. */ bool enable_ats; /* ATS enabled for the device. */ int ats_qlen; /* ATS invalidation queue depth. */ }; struct amdvi_domain { uint64_t *ptp; /* Highest level page table */ int ptp_level; /* Level of page tables */ u_int id; /* Domain id */ SLIST_ENTRY (amdvi_domain) next; }; /* * The I/O Virtualization Hardware Definition Block (IVHD) type 0x10 (legacy) * uses the ACPI_IVRS_HARDWARE define in contrib/dev/acpica/include/actbl2.h. * The new IVHD types 0x11 and 0x40, as defined in the AMD IOMMU spec[48882], * are missing in the ACPI code. These new types add the extra field * EFR (Extended Feature Register). * XXX: Use the definition from ACPI when it is available. */ typedef struct acpi_ivrs_hardware_efr_sup { ACPI_IVRS_HEADER Header; UINT16 CapabilityOffset; /* Offset for IOMMU control fields */ UINT64 BaseAddress; /* IOMMU control registers */ UINT16 PciSegmentGroup; UINT16 Info; /* MSI number and unit ID */ UINT32 Attr; /* IOMMU Feature */ UINT64 ExtFR; /* IOMMU Extended Feature */ UINT64 Reserved; /* v1 feature or v2 attribute */ } __attribute__ ((__packed__)) ACPI_IVRS_HARDWARE_EFRSUP; CTASSERT(sizeof(ACPI_IVRS_HARDWARE_EFRSUP) == 40); /* * Different types of IVHD. * XXX: Use AcpiIvrsType once the new IVHD types are available there. */ enum IvrsType { IVRS_TYPE_HARDWARE_LEGACY = 0x10, /* Legacy without EFR support. */ IVRS_TYPE_HARDWARE_EFR = 0x11, /* With EFR support. */ IVRS_TYPE_HARDWARE_MIXED = 0x40, /* Mixed with EFR support. */ }; /* * AMD IOMMU softc. */ struct amdvi_softc { struct amdvi_ctrl *ctrl; /* Control area. */ device_t dev; /* IOMMU device. */ enum IvrsType ivhd_type; /* IOMMU IVHD type. */ bool iotlb; /* IOTLB supported by IOMMU */ struct amdvi_cmd *cmd; /* Command descriptor area. */ int cmd_max; /* Max number of commands. */ uint64_t cmp_data; /* Command completion write back. */ struct amdvi_event *event; /* Event descriptor area. */ struct resource *event_res; /* Event interrupt resource. */ void *event_tag; /* Event interrupt tag. */ int event_max; /* Max number of events. */ int event_irq; int event_rid; /* ACPI various flags. */ uint32_t ivhd_flag; /* ACPI IVHD flag. */ uint32_t ivhd_feature; /* ACPI v1 Reserved or v2 attribute. */ uint64_t ext_feature; /* IVHD EFR */ /* PCI related. */ uint16_t cap_off; /* PCI Capability offset. */ uint8_t pci_cap; /* PCI capability. 
*/ uint16_t pci_seg; /* IOMMU PCI domain/segment. */ uint16_t pci_rid; /* PCI BDF of IOMMU */ /* Device range under this IOMMU. */ uint16_t start_dev_rid; /* First device under this IOMMU. */ uint16_t end_dev_rid; /* Last device under this IOMMU. */ /* BIOS provided device configuration for end points. */ struct ivhd_dev_cfg dev_cfg[10]; int dev_cfg_cnt; /* Software statistics. */ uint64_t event_intr_cnt; /* Total event INTR count. */ uint64_t total_cmd; /* Total number of commands. */ }; int amdvi_setup_hw(struct amdvi_softc *softc); int amdvi_teardown_hw(struct amdvi_softc *softc); #endif /* _AMDVI_PRIV_H_ */ Index: head/sys/amd64/vmm/amd/ivrs_drv.c =================================================================== --- head/sys/amd64/vmm/amd/ivrs_drv.c (revision 335029) +++ head/sys/amd64/vmm/amd/ivrs_drv.c (revision 335030) @@ -1,733 +1,735 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include "io/iommu.h" #include "amdvi_priv.h" device_t *ivhd_devs; /* IVHD or AMD-Vi device list. */ int ivhd_count; /* Number of IVHD header. */ /* * Cached IVHD header list. * Single entry for each IVHD, filtered the legacy one. */ ACPI_IVRS_HARDWARE *ivhd_hdrs[10]; extern int amdvi_ptp_level; /* Page table levels. */ typedef int (*ivhd_iter_t)(ACPI_IVRS_HEADER *ptr, void *arg); /* * Iterate IVRS table for IVHD and IVMD device type. 
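* The iterator callback returns nonzero to continue the walk and 0 to stop it.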
*/ static void ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg) { ACPI_TABLE_IVRS *ivrs; ACPI_IVRS_HEADER *ivrs_hdr, *end; ACPI_STATUS status; status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); if (ACPI_FAILURE(status)) return; if (ivrs->Header.Length == 0) { return; } ivrs_hdr = (ACPI_IVRS_HEADER *)(ivrs + 1); end = (ACPI_IVRS_HEADER *)((char *)ivrs + ivrs->Header.Length); while (ivrs_hdr < end) { if ((uint8_t *)ivrs_hdr + ivrs_hdr->Length > (uint8_t *)end) { printf("AMD-Vi:IVHD/IVMD is corrupted, length : %d\n", ivrs_hdr->Length); break; } switch (ivrs_hdr->Type) { case IVRS_TYPE_HARDWARE_LEGACY: /* Legacy */ case IVRS_TYPE_HARDWARE_EFR: case IVRS_TYPE_HARDWARE_MIXED: if (!iter(ivrs_hdr, arg)) return; break; case ACPI_IVRS_TYPE_MEMORY1: case ACPI_IVRS_TYPE_MEMORY2: case ACPI_IVRS_TYPE_MEMORY3: if (!iter(ivrs_hdr, arg)) return; break; default: printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type); } ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr + ivrs_hdr->Length); } } static bool ivrs_is_ivhd(UINT8 type) { switch(type) { case IVRS_TYPE_HARDWARE_LEGACY: case IVRS_TYPE_HARDWARE_EFR: case IVRS_TYPE_HARDWARE_MIXED: return (true); default: return (false); } } /* Count the number of AMD-Vi devices in the system. */ static int ivhd_count_iter(ACPI_IVRS_HEADER * ivrs_he, void *arg) { if (ivrs_is_ivhd(ivrs_he->Type)) ivhd_count++; return (1); } struct find_ivrs_hdr_args { int i; ACPI_IVRS_HEADER *ptr; }; static int ivrs_hdr_find_iter(ACPI_IVRS_HEADER * ivrs_hdr, void *args) { struct find_ivrs_hdr_args *fi; fi = (struct find_ivrs_hdr_args *)args; if (ivrs_is_ivhd(ivrs_hdr->Type)) { if (fi->i == 0) { fi->ptr = ivrs_hdr; return (0); } fi->i--; } return (1); } static ACPI_IVRS_HARDWARE * ivhd_find_by_index(int idx) { struct find_ivrs_hdr_args fi; fi.i = idx; fi.ptr = NULL; ivrs_hdr_iterate_tbl(ivrs_hdr_find_iter, &fi); return ((ACPI_IVRS_HARDWARE *)fi.ptr); } static void ivhd_dev_add_entry(struct amdvi_softc *softc, uint32_t start_id, uint32_t end_id, uint8_t cfg, bool ats) { struct ivhd_dev_cfg *dev_cfg; /* If device doesn't have special data, don't add it. */ if (!cfg) return; dev_cfg = &softc->dev_cfg[softc->dev_cfg_cnt++]; dev_cfg->start_id = start_id; dev_cfg->end_id = end_id; dev_cfg->data = cfg; dev_cfg->enable_ats = ats; } /* * Record device attributes as suggested by BIOS. */ static int ivhd_dev_parse(ACPI_IVRS_HARDWARE* ivhd, struct amdvi_softc *softc) { ACPI_IVRS_DE_HEADER *de; uint8_t *p, *end; int range_start_id = 0, range_end_id = 0; uint32_t *extended; uint8_t all_data = 0, range_data = 0; bool range_enable_ats = false, enable_ats; softc->start_dev_rid = ~0; softc->end_dev_rid = 0; switch (ivhd->Header.Type) { case IVRS_TYPE_HARDWARE_LEGACY: p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE); break; case IVRS_TYPE_HARDWARE_EFR: case IVRS_TYPE_HARDWARE_MIXED: p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE_EFRSUP); break; default: device_printf(softc->dev, "unknown type: 0x%x\n", ivhd->Header.Type); return (-1); } end = (uint8_t *)ivhd + ivhd->Header.Length; while (p < end) { de = (ACPI_IVRS_DE_HEADER *)p; softc->start_dev_rid = MIN(softc->start_dev_rid, de->Id); softc->end_dev_rid = MAX(softc->end_dev_rid, de->Id); switch (de->Type) { case ACPI_IVRS_TYPE_ALL: all_data = de->DataSetting; break; case ACPI_IVRS_TYPE_SELECT: case ACPI_IVRS_TYPE_ALIAS_SELECT: case ACPI_IVRS_TYPE_EXT_SELECT: enable_ats = false; if (de->Type == ACPI_IVRS_TYPE_EXT_SELECT) { extended = (uint32_t *)(de + 1); enable_ats = (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? 
false : true; } ivhd_dev_add_entry(softc, de->Id, de->Id, de->DataSetting | all_data, enable_ats); break; case ACPI_IVRS_TYPE_START: case ACPI_IVRS_TYPE_ALIAS_START: case ACPI_IVRS_TYPE_EXT_START: range_start_id = de->Id; range_data = de->DataSetting; if (de->Type == ACPI_IVRS_TYPE_EXT_START) { extended = (uint32_t *)(de + 1); range_enable_ats = (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? false : true; } break; case ACPI_IVRS_TYPE_END: range_end_id = de->Id; ivhd_dev_add_entry(softc, range_start_id, range_end_id, range_data | all_data, range_enable_ats); range_start_id = range_end_id = 0; range_data = 0; all_data = 0; break; case ACPI_IVRS_TYPE_PAD4: break; case ACPI_IVRS_TYPE_SPECIAL: /* HPET or IOAPIC */ break; default: if ((de->Type < 5) || (de->Type >= ACPI_IVRS_TYPE_PAD8)) device_printf(softc->dev, "Unknown dev entry:0x%x\n", de->Type); } if (softc->dev_cfg_cnt >= (sizeof(softc->dev_cfg) / sizeof(softc->dev_cfg[0]))) { device_printf(softc->dev, "WARN Too many device entries.\n"); return (EINVAL); } if (de->Type < 0x40) p += sizeof(ACPI_IVRS_DEVICE4); else if (de->Type < 0x80) p += sizeof(ACPI_IVRS_DEVICE8A); else { printf("Variable size IVHD type 0x%x not supported\n", de->Type); break; } } KASSERT((softc->end_dev_rid >= softc->start_dev_rid), ("Device end[0x%x] < start[0x%x]\n", softc->end_dev_rid, softc->start_dev_rid)); return (0); } static bool ivhd_is_newer(ACPI_IVRS_HEADER *old, ACPI_IVRS_HEADER *new) { /* * A newer IVRS header type takes precedence. */ if ((old->DeviceId == new->DeviceId) && (old->Type == IVRS_TYPE_HARDWARE_LEGACY) && ((new->Type == IVRS_TYPE_HARDWARE_EFR) || (new->Type == IVRS_TYPE_HARDWARE_MIXED))) { return (true); } return (false); } static void ivhd_identify(driver_t *driver, device_t parent) { ACPI_TABLE_IVRS *ivrs; ACPI_IVRS_HARDWARE *ivhd; ACPI_STATUS status; int i, count = 0; uint32_t ivrs_ivinfo; if (acpi_disabled("ivhd")) return; status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); if (ACPI_FAILURE(status)) return; if (ivrs->Header.Length == 0) { return; } ivrs_ivinfo = ivrs->Info; printf("AMD-Vi: IVRS Info VAsize = %d PAsize = %d GVAsize = %d" " flags:%b\n", REG_BITS(ivrs_ivinfo, 21, 15), REG_BITS(ivrs_ivinfo, 14, 8), REG_BITS(ivrs_ivinfo, 7, 5), REG_BITS(ivrs_ivinfo, 22, 22), "\020\001EFRSup"); ivrs_hdr_iterate_tbl(ivhd_count_iter, NULL); if (!ivhd_count) return; for (i = 0; i < ivhd_count; i++) { ivhd = ivhd_find_by_index(i); KASSERT(ivhd, ("ivhd%d is NULL\n", i)); ivhd_hdrs[i] = ivhd; } /* * Scan for both legacy and non-legacy IVHD types for the same AMD-Vi * device and let the newer one override the older. */ for (i = ivhd_count - 1; i > 0; i--) { if (ivhd_is_newer(&ivhd_hdrs[i-1]->Header, &ivhd_hdrs[i]->Header)) { ivhd_hdrs[i-1] = ivhd_hdrs[i]; ivhd_count--; } } ivhd_devs = malloc(sizeof(device_t) * ivhd_count, M_DEVBUF, M_WAITOK | M_ZERO); for (i = 0; i < ivhd_count; i++) { ivhd = ivhd_hdrs[i]; KASSERT(ivhd, ("ivhd%d is NULL\n", i)); /* * Use a high order to ensure that this driver is probed after * the Host-PCI bridge and the root PCI bus. */ ivhd_devs[i] = BUS_ADD_CHILD(parent, ACPI_DEV_BASE_ORDER + 10 * 10, "ivhd", i); /* * XXX: If the device was not destroyed earlier, the add will fail; * locate the old device instance. */ if (ivhd_devs[i] == NULL) { ivhd_devs[i] = device_find_child(parent, "ivhd", i); if (ivhd_devs[i] == NULL) { printf("AMD-Vi: can't find ivhd%d\n", i); break; } } count++; } /* * Update the device count in case some devices failed to attach. 
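* (If both BUS_ADD_CHILD() and the device_find_child() fallback fail, the * loop above stops and the remaining units are dropped from the count.)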
*/ ivhd_count = count; } static int ivhd_probe(device_t dev) { ACPI_IVRS_HARDWARE *ivhd; int unit; if (acpi_get_handle(dev) != NULL) return (ENXIO); unit = device_get_unit(dev); KASSERT((unit < ivhd_count), ("ivhd unit %d >= count %d", unit, ivhd_count)); ivhd = ivhd_hdrs[unit]; KASSERT(ivhd, ("ivhd is NULL")); switch (ivhd->Header.Type) { case IVRS_TYPE_HARDWARE_EFR: device_set_desc(dev, "AMD-Vi/IOMMU ivhd with EFR"); break; case IVRS_TYPE_HARDWARE_MIXED: device_set_desc(dev, "AMD-Vi/IOMMU ivhd in mixed format"); break; case IVRS_TYPE_HARDWARE_LEGACY: default: device_set_desc(dev, "AMD-Vi/IOMMU ivhd"); break; } return (BUS_PROBE_NOWILDCARD); } static void ivhd_print_flag(device_t dev, enum IvrsType ivhd_type, uint8_t flag) { /* * The legacy IVHD type has two extra high bits in the flag field, which * have been moved to the EFR for non-legacy devices. */ switch (ivhd_type) { case IVRS_TYPE_HARDWARE_LEGACY: device_printf(dev, "Flag:%b\n", flag, "\020" "\001HtTunEn" "\002PassPW" "\003ResPassPW" "\004Isoc" "\005IotlbSup" "\006Coherent" "\007PreFSup" "\010PPRSup"); break; case IVRS_TYPE_HARDWARE_EFR: case IVRS_TYPE_HARDWARE_MIXED: device_printf(dev, "Flag:%b\n", flag, "\020" "\001HtTunEn" "\002PassPW" "\003ResPassPW" "\004Isoc" "\005IotlbSup" "\006Coherent"); break; default: device_printf(dev, "Can't decode flag of ivhd type: 0x%x\n", ivhd_type); break; } } /* * This field is a feature register in the legacy IVHD type (0x10) and an * attribute in the newer types (0x11 and 0x40). */ static void ivhd_print_feature(device_t dev, enum IvrsType ivhd_type, uint32_t feature) { switch (ivhd_type) { case IVRS_TYPE_HARDWARE_LEGACY: device_printf(dev, "Features(type:0x%x) HATS = %d GATS = %d" " MsiNumPPR = %d PNBanks= %d PNCounters= %d\n", ivhd_type, REG_BITS(feature, 31, 30), REG_BITS(feature, 29, 28), REG_BITS(feature, 27, 23), REG_BITS(feature, 22, 17), REG_BITS(feature, 16, 13)); device_printf(dev, "max PASID = %d GLXSup = %d Feature:%b\n", REG_BITS(feature, 12, 8), REG_BITS(feature, 4, 3), feature, "\020" "\002NXSup" "\003GTSup" "\004" "\005IASup" "\006GASup" "\007HESup"); break; /* Fewer features or attributes are reported in non-legacy types. */ case IVRS_TYPE_HARDWARE_EFR: case IVRS_TYPE_HARDWARE_MIXED: device_printf(dev, "Features(type:0x%x) MsiNumPPR = %d" " PNBanks= %d PNCounters= %d\n", ivhd_type, REG_BITS(feature, 27, 23), REG_BITS(feature, 22, 17), REG_BITS(feature, 16, 13)); break; default: /* Other ivhd type features are not decoded. */ device_printf(dev, "Can't decode ivhd type: 0x%x\n", ivhd_type); } } /* Print extended features of IOMMU. 
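* The 64-bit EFR is printed as two 32-bit halves because the kernel's %b * format decodes only an int.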
/* Print the extended features of the IOMMU. */ static void ivhd_print_ext_feature(device_t dev, uint64_t ext_feature) { uint32_t ext_low, ext_high; if (!ext_feature) return; ext_low = ext_feature; device_printf(dev, "Extended features[31:0]:%b " "HATS = 0x%x GATS = 0x%x " "GLXSup = 0x%x SmiFSup = 0x%x SmiFRC = 0x%x " "GAMSup = 0x%x DualPortLogSup = 0x%x DualEventLogSup = 0x%x\n", (int)ext_low, "\020" "\001PreFSup" "\002PPRSup" "\003" "\004NXSup" "\005GTSup" "\006" "\007IASup" "\010GASup" "\011HESup" "\012PCSup", REG_BITS(ext_low, 11, 10), REG_BITS(ext_low, 13, 12), REG_BITS(ext_low, 15, 14), REG_BITS(ext_low, 17, 16), REG_BITS(ext_low, 20, 18), REG_BITS(ext_low, 23, 21), REG_BITS(ext_low, 25, 24), REG_BITS(ext_low, 29, 28)); ext_high = ext_feature >> 32; device_printf(dev, "Extended features[62:32]:%b " "Max PASID: 0x%x DevTblSegSup = 0x%x " "MarcSup = 0x%x\n", (int)(ext_high), "\020" "\006USSup" "\011PprOvrflwEarlySup" "\012PPRAutoRspSup" "\015BlKStopMrkSup" "\016PerfOptSup" "\017MsiCapMmioSup" "\021GIOSup" "\022HASup" "\023EPHSup" "\024AttrFWSup" "\025HDSup" "\027InvIotlbSup", REG_BITS(ext_high, 5, 0), REG_BITS(ext_high, 8, 7), REG_BITS(ext_high, 11, 10)); }
static int ivhd_print_cap(struct amdvi_softc *softc, ACPI_IVRS_HARDWARE * ivhd) { device_t dev; int max_ptp_level; dev = softc->dev; ivhd_print_flag(dev, softc->ivhd_type, softc->ivhd_flag); ivhd_print_feature(dev, softc->ivhd_type, softc->ivhd_feature); ivhd_print_ext_feature(dev, softc->ext_feature); max_ptp_level = 7; /* Make sure the device supports the minimum page level requested by the user. */ if (max_ptp_level < amdvi_ptp_level) { device_printf(dev, "insufficient PTP level:%d\n", max_ptp_level); return (EINVAL); } else { device_printf(softc->dev, "supported paging level:%d, will use only: %d\n", max_ptp_level, amdvi_ptp_level); } device_printf(softc->dev, "device range: 0x%x - 0x%x\n", softc->start_dev_rid, softc->end_dev_rid); return (0); }
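/*
 * REG_BITS(x, hi, lo) above extracts the inclusive bit field x[hi:lo];
 * assuming the usual definition, ((x) >> (lo)) & ((1 << ((hi) - (lo) + 1)) - 1),
 * REG_BITS(feature, 31, 30) yields the 2-bit HATS field and
 * REG_BITS(ext_high, 5, 0) the 6-bit maximum-PASID count.
 */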
static int ivhd_attach(device_t dev) { ACPI_IVRS_HARDWARE *ivhd; ACPI_IVRS_HARDWARE_EFRSUP *ivhd_efr; struct amdvi_softc *softc; int status, unit; unit = device_get_unit(dev); KASSERT((unit < ivhd_count), ("ivhd unit %d >= count %d", unit, ivhd_count)); /* Make sure it is the same device for which attach was called. */ KASSERT((ivhd_devs[unit] == dev), ("Not the same device, old %p new %p", ivhd_devs[unit], dev)); softc = device_get_softc(dev); softc->dev = dev; ivhd = ivhd_hdrs[unit]; KASSERT(ivhd, ("ivhd is NULL")); softc->ivhd_type = ivhd->Header.Type; softc->pci_seg = ivhd->PciSegmentGroup; softc->pci_rid = ivhd->Header.DeviceId; softc->ivhd_flag = ivhd->Header.Flags; /* * On the legacy IVHD type (0x10) this is documented as a feature, * but in newer types it is an attribute. */ softc->ivhd_feature = ivhd->Reserved; /* * The PCI capability block has additional capabilities that are * not part of the IVRS. */ softc->cap_off = ivhd->CapabilityOffset; #ifdef notyet /* IVHD Info bit[4:0] is the event MSI/X number. */ softc->event_msix = ivhd->Info & 0x1F; #endif switch (ivhd->Header.Type) { case IVRS_TYPE_HARDWARE_EFR: case IVRS_TYPE_HARDWARE_MIXED: ivhd_efr = (ACPI_IVRS_HARDWARE_EFRSUP *)ivhd; softc->ext_feature = ivhd_efr->ExtFR; break; } softc->ctrl = (struct amdvi_ctrl *) PHYS_TO_DMAP(ivhd->BaseAddress); status = ivhd_dev_parse(ivhd, softc); if (status != 0) { device_printf(dev, "endpoint device parsing error=%d\n", status); } status = ivhd_print_cap(softc, ivhd); if (status != 0) { return (status); } status = amdvi_setup_hw(softc); if (status != 0) { device_printf(dev, "couldn't be initialised, error=%d\n", status); return (status); } return (0); }
static int ivhd_detach(device_t dev) { struct amdvi_softc *softc; softc = device_get_softc(dev); amdvi_teardown_hw(softc); /* * XXX: delete the device; don't allow detach, return EBUSY. */ return (0); } static int ivhd_suspend(device_t dev) { return (0); } static int ivhd_resume(device_t dev) { return (0); }
static device_method_t ivhd_methods[] = { DEVMETHOD(device_identify, ivhd_identify), DEVMETHOD(device_probe, ivhd_probe), DEVMETHOD(device_attach, ivhd_attach), DEVMETHOD(device_detach, ivhd_detach), DEVMETHOD(device_suspend, ivhd_suspend), DEVMETHOD(device_resume, ivhd_resume), DEVMETHOD_END }; static driver_t ivhd_driver = { "ivhd", ivhd_methods, sizeof(struct amdvi_softc), }; static devclass_t ivhd_devclass; /* * Load this module last, after PCI re-probing, to configure interrupts. */ DRIVER_MODULE_ORDERED(ivhd, acpi, ivhd_driver, ivhd_devclass, 0, 0, SI_ORDER_ANY); MODULE_DEPEND(ivhd, acpi, 1, 1, 1); MODULE_DEPEND(ivhd, pci, 1, 1, 1);
Index: head/sys/amd64/vmm/amd/npt.c =================================================================== --- head/sys/amd64/vmm/amd/npt.c (revision 335029) +++ head/sys/amd64/vmm/amd/npt.c (revision 335030) @@ -1,85 +1,87 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "npt.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, npt, CTLFLAG_RW, NULL, NULL); static int npt_flags; SYSCTL_INT(_hw_vmm_npt, OID_AUTO, pmap_flags, CTLFLAG_RD, &npt_flags, 0, NULL); #define NPT_IPIMASK 0xFF /* * AMD nested page table init. */ int svm_npt_init(int ipinum) { int enable_superpage = 1; npt_flags = ipinum & NPT_IPIMASK; TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &enable_superpage); if (enable_superpage) npt_flags |= PMAP_PDE_SUPERPAGE; return (0); } static int npt_pinit(pmap_t pmap) { return (pmap_pinit_type(pmap, PT_RVI, npt_flags)); } struct vmspace * svm_npt_alloc(vm_offset_t min, vm_offset_t max) { return (vmspace_alloc(min, max, npt_pinit)); } void svm_npt_free(struct vmspace *vmspace) { vmspace_free(vmspace); } Index: head/sys/amd64/vmm/amd/npt.h =================================================================== --- head/sys/amd64/vmm/amd/npt.h (revision 335029) +++ head/sys/amd64/vmm/amd/npt.h (revision 335030) @@ -1,36 +1,38 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SVM_NPT_H_ #define _SVM_NPT_H_ int svm_npt_init(int ipinum); struct vmspace *svm_npt_alloc(vm_offset_t min, vm_offset_t max); void svm_npt_free(struct vmspace *vmspace); #endif /* _SVM_NPT_H_ */ Index: head/sys/amd64/vmm/amd/svm.c =================================================================== --- head/sys/amd64/vmm/amd/svm.c (revision 335029) +++ head/sys/amd64/vmm/amd/svm.c (revision 335030) @@ -1,2285 +1,2287 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_ktr.h" #include "vmm_ioport.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "x86.h" #include "vmcb.h" #include "svm.h" #include "svm_softc.h" #include "svm_msr.h" #include "npt.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL); /* * SVM CPUID function 0x8000_000A, edx bit decoding. */ #define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ #define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ #define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ #define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ #define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ #define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ #define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ #define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ #define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ #define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ #define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ #define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ VMCB_CACHE_IOPM | \ VMCB_CACHE_I | \ VMCB_CACHE_TPR | \ VMCB_CACHE_CR2 | \ VMCB_CACHE_CR | \ VMCB_CACHE_DR | \ VMCB_CACHE_DT | \ VMCB_CACHE_SEG | \ VMCB_CACHE_NP) static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, 0, NULL); static MALLOC_DEFINE(M_SVM, "svm", "svm"); static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); /* Per-CPU context area. */ extern struct pcpu __pcpu[]; static uint32_t svm_feature = ~0U; /* AMD SVM features. */ SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, "SVM features advertised by CPUID.8000000AH:EDX"); static int disable_npf_assist; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, &disable_npf_assist, 0, NULL); /* Maximum ASIDs supported by the processor */ static uint32_t nasid; SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, "Number of ASIDs supported by this processor"); /* Current ASID generation for each host cpu */ static struct asid asid[MAXCPU]; /* * SVM host state saved area of size 4KB for each core. 
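 * VMRUN saves the host state in this area; svm_enable() below points the
 * per-cpu MSR VM_HSAVE_PA at its page, which is why one page-aligned page
 * is reserved per core.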
*/ static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); static __inline int flush_by_asid(void) { return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); } static __inline int decode_assist(void) { return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); } static void svm_disable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer &= ~EFER_SVM; wrmsr(MSR_EFER, efer); } /* * Disable SVM on all CPUs. */ static int svm_cleanup(void) { smp_rendezvous(NULL, svm_disable, NULL, NULL); return (0); } /* * Verify that all the features required by bhyve are available. */ static int check_svm_features(void) { u_int regs[4]; /* CPUID Fn8000_000A is for SVM */ do_cpuid(0x8000000A, regs); svm_feature &= regs[3]; /* * The number of ASIDs can be configured to be less than what is * supported by the hardware but not more. */ if (nasid == 0 || nasid > regs[1]) nasid = regs[1]; KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); /* bhyve requires the Nested Paging feature */ if (!(svm_feature & AMD_CPUID_SVM_NP)) { printf("SVM: Nested Paging feature not available.\n"); return (ENXIO); } /* bhyve requires the NRIP Save feature */ if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { printf("SVM: NRIP Save feature not available.\n"); return (ENXIO); } return (0); } static void svm_enable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer |= EFER_SVM; wrmsr(MSR_EFER, efer); wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); } /* * Return 1 if SVM is enabled on this processor and 0 otherwise. */ static int svm_available(void) { uint64_t msr; /* Section 15.4 Enabling SVM from APM2. */ if ((amd_feature2 & AMDID2_SVM) == 0) { printf("SVM: not available.\n"); return (0); } msr = rdmsr(MSR_VM_CR); if ((msr & VM_CR_SVMDIS) != 0) { printf("SVM: disabled by BIOS.\n"); return (0); } return (1); } static int svm_init(int ipinum) { int error, cpu; if (!svm_available()) return (ENXIO); error = check_svm_features(); if (error) return (error); vmcb_clean &= VMCB_CACHE_DEFAULT; for (cpu = 0; cpu < MAXCPU; cpu++) { /* * Initialize the host ASIDs to their "highest" valid values. * * The next ASID allocation will rollover both 'gen' and 'num' * and start off the sequence at {1,1}. */ asid[cpu].gen = ~0UL; asid[cpu].num = nasid - 1; } svm_msr_init(); svm_npt_init(ipinum); /* Enable SVM on all CPUs */ smp_rendezvous(NULL, svm_enable, NULL, NULL); return (0); } static void svm_restore(void) { svm_enable(NULL); } /* Pentium compatible MSRs */ #define MSR_PENTIUM_START 0 #define MSR_PENTIUM_END 0x1FFF /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000UL #define MSR_AMD6TH_END 0xC0001FFFUL /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000UL #define MSR_AMD7TH_END 0xC0011FFFUL /* * Get the index and bit position for a MSR in permission bitmap. * Two bits are used for each MSR: lower bit for read and higher bit for write. 
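 * For example, MSR_LSTAR (0xC0000082) lands in the AMD 6th-generation
 * range: off = 0x82 and base = 0x2000 (the size of the Pentium range), so
 * the function below yields index = (0x82 + 0x2000) / 4 = 0x820 and
 * bit = (0x82 % 4) * 2 = 4; bit 4 of that byte then gates reads and bit 5
 * gates writes.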
*/ static int svm_msr_index(uint64_t msr, int *index, int *bit) { uint32_t base, off; *index = -1; *bit = (msr % 4) * 2; base = 0; if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) { *index = msr / 4; return (0); } base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { off = (msr - MSR_AMD6TH_START); *index = (off + base) / 4; return (0); } base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { off = (msr - MSR_AMD7TH_START); *index = (off + base) / 4; return (0); } return (EINVAL); } /* * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. */ static void svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) { int index, bit, error; error = svm_msr_index(msr, &index, &bit); KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, ("%s: invalid index %d for msr %#lx", __func__, index, msr)); KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " "msr %#lx", __func__, bit, msr)); if (read) perm_bitmap[index] &= ~(1UL << bit); if (write) perm_bitmap[index] &= ~(2UL << bit); } static void svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, true); } static void svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, false); } static __inline int svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) { struct vmcb_ctrl *ctrl; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); return (ctrl->intercept[idx] & bitmask ? 1 : 0); } static __inline void svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, int enabled) { struct vmcb_ctrl *ctrl; uint32_t oldval; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intercept[idx]; if (enabled) ctrl->intercept[idx] |= bitmask; else ctrl->intercept[idx] &= ~bitmask; if (ctrl->intercept[idx] != oldval) { svm_set_dirty(sc, vcpu, VMCB_CACHE_I); VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " "from %#x to %#x", idx, oldval, ctrl->intercept[idx]); } } static __inline void svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 0); } static __inline void svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 1); } static void vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, uint64_t msrpm_base_pa, uint64_t np_pml4) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; uint32_t mask; int n; ctrl = svm_get_vmcb_ctrl(sc, vcpu); state = svm_get_vmcb_state(sc, vcpu); ctrl->iopm_base_pa = iopm_base_pa; ctrl->msrpm_base_pa = msrpm_base_pa; /* Enable nested paging */ ctrl->np_enable = 1; ctrl->n_cr3 = np_pml4; /* * Intercept accesses to the control registers that are not shadowed * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. */ for (n = 0; n < 16; n++) { mask = (BIT(n) << 16) | BIT(n); if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); else svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); } /* * Intercept everything when tracing guest exceptions otherwise * just intercept machine check exception. 
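 * (Vectors 2 and 9 are the ones skipped below: vector 2 is NMI, which is
 * handled through its own intercept, and vector 9, the legacy
 * coprocessor-segment-overrun, is not generated by modern processors.)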
*/ if (vcpu_trace_exceptions(sc->vm, vcpu)) { for (n = 0; n < 32; n++) { /* * Skip unimplemented vectors in the exception bitmap. */ if (n == 2 || n == 9) { continue; } svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); } } else { svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); } /* Intercept various events (e.g. I/O, MSR and CPUID accesses) */ svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_FERR_FREEZE); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); /* * From section "Canonicalization and Consistency Checks" in APMv2, * the VMRUN intercept bit must be set to pass the consistency check. */ svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); /* * The ASID will be set to a non-zero value just before VMRUN. */ ctrl->asid = 0; /* * Section 15.21.1, Interrupt Masking in EFLAGS * Section 15.21.2, Virtualizing APIC.TPR * * This must be set for %rflags and %cr8 isolation of guest and host. */ ctrl->v_intr_masking = 1; /* Enable Last Branch Record aka LBR for debugging */ ctrl->lbr_virt_en = 1; state->dbgctl = BIT(0); /* EFER_SVM must always be set when the guest is executing */ state->efer = EFER_SVM; /* Set up the PAT to power-on state */ state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); /* Set up DR6/7 to power-on state */ state->dr6 = DBREG_DR6_RESERVED1; state->dr7 = DBREG_DR7_RESERVED1; }
/* * Initialize a virtual machine. */ static void * svm_vminit(struct vm *vm, pmap_t pmap) { struct svm_softc *svm_sc; struct svm_vcpu *vcpu; vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; int i; svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO); if (((uintptr_t)svm_sc & PAGE_MASK) != 0) panic("malloc of svm_softc not aligned on page boundary"); svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM, M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); if (svm_sc->msr_bitmap == NULL) panic("contigmalloc of SVM MSR bitmap failed"); svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM, M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); if (svm_sc->iopm_bitmap == NULL) panic("contigmalloc of SVM IO bitmap failed"); svm_sc->vm = vm; svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); /* * Intercept read and write accesses to all MSRs. */ memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE); /* * Access to the following MSRs is redirected to the VMCB when the * guest is executing. Therefore it is safe to allow the guest to * read/write these MSRs directly without hypervisor involvement.
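 * (These MSRs -- the FS/GS bases, the SYSCALL/SYSENTER MSRs and the PAT --
 * are part of the VMCB state-save area, so VMRUN and #VMEXIT swap them
 * automatically.)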
*/ svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); /* * Intercept writes to make sure that the EFER_SVM bit is not cleared. */ svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); /* Intercept access to all I/O ports. */ memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE); iopm_pa = vtophys(svm_sc->iopm_bitmap); msrpm_pa = vtophys(svm_sc->msr_bitmap); pml4_pa = svm_sc->nptp; for (i = 0; i < VM_MAXCPU; i++) { vcpu = svm_get_vcpu(svm_sc, i); vcpu->nextrip = ~0; vcpu->lastcpu = NOCPU; vcpu->vmcb_pa = vtophys(&vcpu->vmcb); vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); svm_msr_guest_init(svm_sc, i); } return (svm_sc); } /* * Collateral for a generic SVM VM-exit. */ static void vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) { vme->exitcode = VM_EXITCODE_SVM; vme->u.svm.exitcode = code; vme->u.svm.exitinfo1 = info1; vme->u.svm.exitinfo2 = info2; } static int svm_cpl(struct vmcb_state *state) { /* * From APMv2: * "Retrieve the CPL from the CPL field in the VMCB, not * from any segment DPL" */ return (state->cpl); } static enum vm_cpu_mode svm_vcpu_mode(struct vmcb *vmcb) { struct vmcb_segment seg; struct vmcb_state *state; int error; state = &vmcb->state; if (state->efer & EFER_LMA) { error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, error)); /* * Section 4.8.1 for APM2, check if Code Segment has * Long attribute set in descriptor. */ if (seg.attrib & VMCB_CS_ATTRIB_L) return (CPU_MODE_64BIT); else return (CPU_MODE_COMPATIBILITY); } else if (state->cr0 & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) { if ((cr0 & CR0_PG) == 0) return (PAGING_MODE_FLAT); if ((cr4 & CR4_PAE) == 0) return (PAGING_MODE_32); if (efer & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } /* * ins/outs utility routines */ static uint64_t svm_inout_str_index(struct svm_regctx *regs, int in) { uint64_t val; val = in ? regs->sctx_rdi : regs->sctx_rsi; return (val); } static uint64_t svm_inout_str_count(struct svm_regctx *regs, int rep) { uint64_t val; val = rep ? 
regs->sctx_rcx : 1; return (val); } static void svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { /* The segment field has standard encoding */ s = (info1 >> 10) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); } static int svm_inout_str_addrsize(uint64_t info1) { uint32_t size; size = (info1 >> 7) & 0x7; switch (size) { case 1: return (2); /* 16 bit */ case 2: return (4); /* 32 bit */ case 4: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) { struct vmcb_state *state; state = &vmcb->state; paging->cr3 = state->cr3; paging->cpl = svm_cpl(state); paging->cpu_mode = svm_vcpu_mode(vmcb); paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, state->efer); } #define UNHANDLED 0 /* * Handle guest I/O intercept. */ static int svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_regctx *regs; struct vm_inout_str *vis; uint64_t info1; int inout_string; state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); regs = svm_get_guest_regctx(svm_sc, vcpu); info1 = ctrl->exitinfo1; inout_string = info1 & BIT(2) ? 1 : 0; /* * The effective segment number in EXITINFO1[12:10] is populated * only if the processor has the DecodeAssist capability. * * XXX this is not specified explicitly in APMv2 but can be verified * empirically. */ if (inout_string && !decode_assist()) return (UNHANDLED); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; vmexit->u.inout.string = inout_string; vmexit->u.inout.rep = (info1 & BIT(3)) ? 
1 : 0; vmexit->u.inout.bytes = (info1 >> 4) & 0x7; vmexit->u.inout.port = (uint16_t)(info1 >> 16); vmexit->u.inout.eax = (uint32_t)(state->rax); if (inout_string) { vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); vis->rflags = state->rflags; vis->cr0 = state->cr0; vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); vis->addrsize = svm_inout_str_addrsize(info1); svm_inout_str_seginfo(svm_sc, vcpu, info1, vmexit->u.inout.in, vis); } return (UNHANDLED); } static int npf_fault_type(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_W) return (VM_PROT_WRITE); else if (exitinfo1 & VMCB_NPF_INFO1_ID) return (VM_PROT_EXECUTE); else return (VM_PROT_READ); } static bool svm_npf_emul_fault(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_ID) { return (false); } if (exitinfo1 & VMCB_NPF_INFO1_GPT) { return (false); } if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { return (false); } return (true); } static void svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) { struct vm_guest_paging *paging; struct vmcb_segment seg; struct vmcb_ctrl *ctrl; char *inst_bytes; int error, inst_len; ctrl = &vmcb->ctrl; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = VIE_INVALID_GLA; svm_paging_info(vmcb, paging); error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); switch(paging->cpu_mode) { case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = seg.base; vmexit->u.inst_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = seg.base; /* * Section 4.8.1 of APM2, Default Operand Size or D bit. */ vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? 1 : 0; break; default: vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } /* * Copy the instruction bytes into 'vie' if available. */ if (decode_assist() && !disable_npf_assist) { inst_len = ctrl->inst_len; inst_bytes = ctrl->inst_bytes; } else { inst_len = 0; inst_bytes = NULL; } vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); } #ifdef KTR static const char * intrtype_to_str(int intr_type) { switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: return ("hwintr"); case VMCB_EVENTINJ_TYPE_NMI: return ("nmi"); case VMCB_EVENTINJ_TYPE_INTn: return ("swintr"); case VMCB_EVENTINJ_TYPE_EXCEPTION: return ("exception"); default: panic("%s: unknown intr_type %d", __func__, intr_type); } } #endif /* * Inject an event to vcpu as described in section 15.20, "Event injection". 
*/ static void svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, uint32_t error, bool ec_valid) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event already pending %#lx", __func__, ctrl->eventinj)); KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", __func__, vector)); switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: case VMCB_EVENTINJ_TYPE_NMI: case VMCB_EVENTINJ_TYPE_INTn: break; case VMCB_EVENTINJ_TYPE_EXCEPTION: if (vector >= 0 && vector <= 31 && vector != 2) break; /* FALLTHROUGH */ default: panic("%s: invalid intr_type/vector: %d/%d", __func__, intr_type, vector); } ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; if (ec_valid) { ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; ctrl->eventinj |= (uint64_t)error << 32; VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x", intrtype_to_str(intr_type), vector, error); } else { VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", intrtype_to_str(intr_type), vector); } } static void svm_update_virqinfo(struct svm_softc *sc, int vcpu) { struct vm *vm; struct vlapic *vlapic; struct vmcb_ctrl *ctrl; vm = sc->vm; vlapic = vm_lapic(vm, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); /* Update %cr8 in the emulated vlapic */ vlapic_set_cr8(vlapic, ctrl->v_tpr); /* Virtual interrupt injection is not used. */ KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid " "v_intr_vector %d", __func__, ctrl->v_intr_vector)); } static void svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) { struct vmcb_ctrl *ctrl; uint64_t intinfo; ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); intinfo = ctrl->exitintinfo; if (!VMCB_EXITINTINFO_VALID(intinfo)) return; /* * From APMv2, Section "Intercepts during IDT interrupt delivery" * * If a #VMEXIT happened during event delivery then record the event * that was being delivered. 
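 * For example, if delivering a timer interrupt through the guest IDT takes
 * a nested page fault, EXITINTINFO identifies that interrupt so it can be
 * re-injected once the fault has been serviced.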
*/ VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", intinfo, VMCB_EXITINTINFO_VECTOR(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1); vm_exit_intinfo(svm_sc->vm, vcpu, intinfo); }
#ifdef INVARIANTS static __inline int vintr_intercept_enabled(struct svm_softc *sc, int vcpu) { return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR)); } #endif
static __inline void enable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); KASSERT(vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be enabled", __func__)); return; } VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting"); ctrl->v_irq = 1; ctrl->v_ign_tpr = 1; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); }
static __inline void disable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (!ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(!vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be disabled", __func__)); return; } VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting"); ctrl->v_irq = 0; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); }
static int svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val) { struct vmcb_ctrl *ctrl; int oldval, newval; ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intr_shadow; newval = val ? 1 : 0; if (newval != oldval) { ctrl->intr_shadow = newval; VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval); } return (0); } static int svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); *val = ctrl->intr_shadow; return (0); }
/* * Once an NMI is injected, it blocks delivery of further NMIs until the * handler executes an IRET. The IRET intercept is enabled when an NMI is * injected to track when the vcpu is done handling the NMI. */ static int nmi_blocked(struct svm_softc *sc, int vcpu) { int blocked; blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); return (blocked); }
static void enable_nmi_blocking(struct svm_softc *sc, int vcpu) { KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled"); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); }
static void clear_nmi_blocking(struct svm_softc *sc, int vcpu) { int error; KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared"); /* * When the IRET intercept is cleared the vcpu will attempt to execute * the "iret" when it runs next. However, it is possible to inject * another NMI into the vcpu before the "iret" has actually executed. * * For example, if the "iret" encounters a #NPF when accessing the * stack it will trap back into the hypervisor. If an NMI is pending * for the vcpu it will be injected into the guest. * * XXX this needs to be fixed */ svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); /* * Set 'intr_shadow' to prevent an NMI from being injected on the * immediate VMRUN.
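 * The interrupt shadow blocks injection for a single guest instruction,
 * which is intended to let the pending "iret" retire before another NMI
 * can be injected.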
*/ error = svm_modify_intr_shadow(sc, vcpu, 1); KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); } #define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL static int svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu) { struct vm_exit *vme; struct vmcb_state *state; uint64_t changed, lma, oldval; int error; state = svm_get_vmcb_state(sc, vcpu); oldval = state->efer; VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval); newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ changed = oldval ^ newval; if (newval & EFER_MBZ_BITS) goto gpf; /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ if (changed & EFER_LME) { if (state->cr0 & CR0_PG) goto gpf; } /* EFER.LMA = EFER.LME & CR0.PG */ if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) lma = EFER_LMA; else lma = 0; if ((newval & EFER_LMA) != lma) goto gpf; if (newval & EFER_NXE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) goto gpf; } /* * XXX bhyve does not enforce segment limits in 64-bit mode. Until * this is fixed flag guest attempt to set EFER_LMSLE as an error. */ if (newval & EFER_LMSLE) { vme = vm_exitinfo(sc->vm, vcpu); vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0); *retu = true; return (0); } if (newval & EFER_FFXSR) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) goto gpf; } if (newval & EFER_TCE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) goto gpf; } error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); KASSERT(error == 0, ("%s: error %d updating efer", __func__, error)); return (0); gpf: vm_inject_gp(sc->vm, vcpu); return (0); } static int emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(sc->vm, vcpu, num, val, retu); else if (num == MSR_EFER) error = svm_write_efer(sc, vcpu, val, retu); else error = svm_wrmsr(sc, vcpu, num, val, retu); return (error); } static int emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu) { struct vmcb_state *state; struct svm_regctx *ctx; uint64_t result; int error; if (lapic_msr(num)) error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu); else error = svm_rdmsr(sc, vcpu, num, &result, retu); if (error == 0) { state = svm_get_vmcb_state(sc, vcpu); ctx = svm_get_guest_regctx(sc, vcpu); state->rax = result & 0xffffffff; ctx->sctx_rdx = result >> 32; } return (error); } #ifdef KTR static const char * exit_reason_to_str(uint64_t reason) { static char reasonbuf[32]; switch (reason) { case VMCB_EXIT_INVALID: return ("invalvmcb"); case VMCB_EXIT_SHUTDOWN: return ("shutdown"); case VMCB_EXIT_NPF: return ("nptfault"); case VMCB_EXIT_PAUSE: return ("pause"); case VMCB_EXIT_HLT: return ("hlt"); case VMCB_EXIT_CPUID: return ("cpuid"); case VMCB_EXIT_IO: return ("inout"); case VMCB_EXIT_MC: return ("mchk"); case VMCB_EXIT_INTR: return ("extintr"); case VMCB_EXIT_NMI: return ("nmi"); case VMCB_EXIT_VINTR: return ("vintr"); case VMCB_EXIT_MSR: return ("msr"); case VMCB_EXIT_IRET: return ("iret"); case VMCB_EXIT_MONITOR: return ("monitor"); case VMCB_EXIT_MWAIT: return ("mwait"); default: snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); return (reasonbuf); } } #endif /* KTR */ /* * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs * that are due to instruction intercepts as well as MSR and IOIO intercepts * and exceptions caused by INT3, INTO and BOUND instructions. * * Return 1 if the nRIP is valid and 0 otherwise. */ static int nrip_valid(uint64_t exitcode) { switch (exitcode) { case 0x00 ... 
0x0F: /* read of CR0 through CR15 */ case 0x10 ... 0x1F: /* write of CR0 through CR15 */ case 0x20 ... 0x2F: /* read of DR0 through DR15 */ case 0x30 ... 0x3F: /* write of DR0 through DR15 */ case 0x43: /* INT3 */ case 0x44: /* INTO */ case 0x45: /* BOUND */ case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ return (1); default: return (0); } } static int svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct svm_regctx *ctx; uint64_t code, info1, info2, val; uint32_t eax, ecx, edx; int error, errcode_valid, handled, idtvec, reflect; bool retu; ctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb = svm_get_vmcb(svm_sc, vcpu); state = &vmcb->state; ctrl = &vmcb->ctrl; handled = 0; code = ctrl->exitcode; info1 = ctrl->exitinfo1; info2 = ctrl->exitinfo2; vmexit->exitcode = VM_EXITCODE_BOGUS; vmexit->rip = state->rip; vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); /* * #VMEXIT(INVALID) needs to be handled early because the VMCB is * in an inconsistent state and can trigger assertions that would * never happen otherwise. */ if (code == VMCB_EXIT_INVALID) { vm_exit_svm(vmexit, code, info1, info2); return (0); } KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " "injection valid bit is set %#lx", __func__, ctrl->eventinj)); KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", vmexit->inst_length, code, info1, info2)); svm_update_virqinfo(svm_sc, vcpu); svm_save_intinfo(svm_sc, vcpu); switch (code) { case VMCB_EXIT_IRET: /* * Restart execution at "iret" but with the intercept cleared. */ vmexit->inst_length = 0; clear_nmi_blocking(svm_sc, vcpu); handled = 1; break; case VMCB_EXIT_VINTR: /* interrupt window exiting */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); handled = 1; break; case VMCB_EXIT_INTR: /* external interrupt */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); handled = 1; break; case VMCB_EXIT_NMI: /* external NMI */ handled = 1; break; case 0x40 ... 0x5F: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); reflect = 1; idtvec = code - 0x40; switch (idtvec) { case IDT_MC: /* * Call the machine check handler by hand. Also don't * reflect the machine check back into the guest. */ reflect = 0; VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); break; case IDT_PF: error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, info2); KASSERT(error == 0, ("%s: error %d updating cr2", __func__, error)); /* fallthru */ case IDT_NP: case IDT_SS: case IDT_GP: case IDT_AC: case IDT_TS: errcode_valid = 1; break; case IDT_DF: errcode_valid = 1; info1 = 0; break; case IDT_BP: case IDT_OF: case IDT_BR: /* * The 'nrip' field is populated for INT3, INTO and * BOUND exceptions and this also implies that * 'inst_length' is non-zero. * * Reset 'inst_length' to zero so the guest %rip at * event injection is identical to what it was when * the exception originally happened. 
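 * For example, the nRIP for an INT3 at %rip X is X + 1, giving an
 * inst_length of 1; forcing it back to zero keeps the VMCB %rip at X
 * when the #BP is injected.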
*/ VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " "to zero before injecting exception %d", vmexit->inst_length, idtvec); vmexit->inst_length = 0; /* fallthru */ default: errcode_valid = 0; info1 = 0; break; } KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " "when reflecting exception %d into guest", vmexit->inst_length, idtvec)); if (reflect) { /* Reflect the exception back into the guest */ VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " "%d/%#x into the guest", idtvec, (int)info1); error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, errcode_valid, info1, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); } handled = 1; break; case VMCB_EXIT_MSR: /* MSR access. */ eax = state->rax; ecx = ctx->sctx_rcx; edx = ctx->sctx_rdx; retu = false; if (info1) { vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); val = (uint64_t)edx << 32 | eax; VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx", ecx, val); if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = val; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } } else { VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } } break; case VMCB_EXIT_IO: handled = svm_handle_io(svm_sc, vcpu, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); break; case VMCB_EXIT_CPUID: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); handled = x86_emulate_cpuid(svm_sc->vm, vcpu, (uint32_t *)&state->rax, (uint32_t *)&ctx->sctx_rbx, (uint32_t *)&ctx->sctx_rcx, (uint32_t *)&ctx->sctx_rdx); break; case VMCB_EXIT_HLT: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = state->rflags; break; case VMCB_EXIT_PAUSE: vmexit->exitcode = VM_EXITCODE_PAUSE; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); break; case VMCB_EXIT_NPF: /* EXITINFO2 contains the faulting guest physical address */ if (info1 & VMCB_NPF_INFO1_RSV) { VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " "reserved bits set: info1(%#lx) info2(%#lx)", info1, info2); } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.gpa = info2; vmexit->u.paging.fault_type = npf_fault_type(info1); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " "on gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } else if (svm_npf_emul_fault(info1)) { svm_handle_inst_emul(vmcb, info2, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " "for gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } break; case VMCB_EXIT_MONITOR: vmexit->exitcode = VM_EXITCODE_MONITOR; break; case VMCB_EXIT_MWAIT: vmexit->exitcode = VM_EXITCODE_MWAIT; break; default: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", handled ? 
"handled" : "unhandled", exit_reason_to_str(code), vmexit->rip, vmexit->inst_length); if (handled) { vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; state->rip = vmexit->rip; } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic SVM exit. */ vm_exit_svm(vmexit, code, info1, info2); } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } return (handled); } static void svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) { uint64_t intinfo; if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) return; KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " "valid: %#lx", __func__, intinfo)); svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), VMCB_EXITINTINFO_VECTOR(intinfo), VMCB_EXITINTINFO_EC(intinfo), VMCB_EXITINTINFO_EC_VALID(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); } /* * Inject event to virtual cpu. */ static void svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_vcpu *vcpustate; uint8_t v_tpr; int vector, need_intr_window; int extint_pending; state = svm_get_vmcb_state(sc, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); vcpustate = svm_get_vcpu(sc, vcpu); need_intr_window = 0; if (vcpustate->nextrip != state->rip) { ctrl->intr_shadow = 0; VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vcpustate->nextrip, state->rip); } /* * Inject pending events or exceptions for this vcpu. * * An event might be pending because the previous #VMEXIT happened * during event delivery (i.e. ctrl->exitintinfo). * * An event might also be pending because an exception was injected * by the hypervisor (e.g. #PF during instruction emulation). */ svm_inj_intinfo(sc, vcpu); /* NMI event has priority over interrupts. */ if (vm_nmi_pending(sc->vm, vcpu)) { if (nmi_blocked(sc, vcpu)) { /* * Can't inject another NMI if the guest has not * yet executed an "iret" after the last NMI. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " "to NMI-blocking"); } else if (ctrl->intr_shadow) { /* * Can't inject an NMI if the vcpu is in an intr_shadow. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " "interrupt shadow"); need_intr_window = 1; goto done; } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { /* * If there is already an exception/interrupt pending * then defer the NMI until after that. */ VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " "eventinj %#lx", ctrl->eventinj); /* * Use self-IPI to trigger a VM-exit as soon as * possible after the event injection is completed. * * This works only if the external interrupt exiting * is at a lower priority than the event injection. * * Although not explicitly specified in APMv2 the * relative priorities were verified empirically. */ ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? 
*/ } else { vm_nmi_clear(sc->vm, vcpu); /* Inject NMI, vector number is not used */ svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, IDT_NMI, 0, false); /* virtual NMI blocking is now in effect */ enable_nmi_blocking(sc, vcpu); VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); } } extint_pending = vm_extint_pending(sc->vm, vcpu); if (!extint_pending) { if (!vlapic_pending_intr(vlapic, &vector)) goto done; KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(sc->vm, &vector); KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); } /* * If the guest has disabled interrupts or is in an interrupt shadow * then we cannot inject the pending interrupt. */ if ((state->rflags & PSL_I) == 0) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, state->rflags); need_intr_window = 1; goto done; } if (ctrl->intr_shadow) { VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " "interrupt shadow", vector); need_intr_window = 1; goto done; } if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "eventinj %#lx", vector, ctrl->eventinj); need_intr_window = 1; goto done; } svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); if (!extint_pending) { vlapic_intr_accepted(vlapic, vector); } else { vm_extint_clear(sc->vm, vcpu); vatpic_intr_accepted(sc->vm, vector); } /* * Force a VM-exit as soon as the vcpu is ready to accept another * interrupt. This is done because the PIC might have another vector * that it wants to inject. Also, if the APIC has a pending interrupt * that was preempted by the ExtInt then it allows us to inject the * APIC vector as soon as possible. */ need_intr_window = 1; done: /* * The guest can modify the TPR by writing to %CR8. In guest mode * the processor reflects this write to V_TPR without hypervisor * intervention. * * The guest can also modify the TPR by writing to it via the memory * mapped APIC page. In this case, the write will be emulated by the * hypervisor. For this reason V_TPR must be updated before every * VMRUN. */ v_tpr = vlapic_get_cr8(vlapic); KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); if (ctrl->v_tpr != v_tpr) { VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", ctrl->v_tpr, v_tpr); ctrl->v_tpr = v_tpr; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); } if (need_intr_window) { /* * We use V_IRQ in conjunction with the VINTR intercept to * trap into the hypervisor as soon as a virtual interrupt * can be delivered. * * Since injected events are not subject to intercept checks * we need to ensure that the V_IRQ is not actually going to * be delivered on VM entry. The KASSERT below enforces this. */ KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, ("Bogus intr_window_exiting: eventinj (%#lx), " "intr_shadow (%u), rflags (%#lx)", ctrl->eventinj, ctrl->intr_shadow, state->rflags)); enable_intr_window_exiting(sc, vcpu); } else { disable_intr_window_exiting(sc, vcpu); } } static __inline void restore_host_tss(void) { struct system_segment_descriptor *tss_sd; /* * The TSS descriptor was in use prior to launching the guest so it * has been marked busy. * * 'ltr' requires the descriptor to be marked available so change the * type to "64-bit available TSS". 
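 * (Loading a busy TSS with 'ltr' raises #GP, hence the type is reset to
 * SDT_SYSTSS, the available variant, before the reload below.)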
*/ tss_sd = PCPU_GET(tss); tss_sd->sd_type = SDT_SYSTSS; ltr(GSEL(GPROC0_SEL, SEL_KPL)); } static void check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) { struct svm_vcpu *vcpustate; struct vmcb_ctrl *ctrl; long eptgen; bool alloc_asid; KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not " "active on cpu %u", __func__, thiscpu)); vcpustate = svm_get_vcpu(sc, vcpuid); ctrl = svm_get_vmcb_ctrl(sc, vcpuid); /* * The TLB entries associated with the vcpu's ASID are not valid * if either of the following conditions is true: * * 1. The vcpu's ASID generation is different than the host cpu's * ASID generation. This happens when the vcpu migrates to a new * host cpu. It can also happen when the number of vcpus executing * on a host cpu is greater than the number of ASIDs available. * * 2. The pmap generation number is different than the value cached in * the 'vcpustate'. This happens when the host invalidates pages * belonging to the guest. * * asidgen eptgen Action * mismatch mismatch * 0 0 (a) * 0 1 (b1) or (b2) * 1 0 (c) * 1 1 (d) * * (a) There is no mismatch in eptgen or ASID generation and therefore * no further action is needed. * * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is * retained and the TLB entries associated with this ASID * are flushed by VMRUN. * * (b2) If the cpu does not support FlushByAsid then a new ASID is * allocated. * * (c) A new ASID is allocated. * * (d) A new ASID is allocated. */ alloc_asid = false; eptgen = pmap->pm_eptgen; ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; if (vcpustate->asid.gen != asid[thiscpu].gen) { alloc_asid = true; /* (c) and (d) */ } else if (vcpustate->eptgen != eptgen) { if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ else alloc_asid = true; /* (b2) */ } else { /* * This is the common case (a). */ KASSERT(!alloc_asid, ("ASID allocation not necessary")); KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); } if (alloc_asid) { if (++asid[thiscpu].num >= nasid) { asid[thiscpu].num = 1; if (++asid[thiscpu].gen == 0) asid[thiscpu].gen = 1; /* * If this cpu does not support "flush-by-asid" * then flush the entire TLB on a generation * bump. Subsequent ASID allocation in this * generation can be done without a TLB flush. */ if (!flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; } vcpustate->asid.gen = asid[thiscpu].gen; vcpustate->asid.num = asid[thiscpu].num; ctrl->asid = vcpustate->asid.num; svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); /* * If this cpu supports "flush-by-asid" then the TLB * was not flushed after the generation bump. The TLB * is flushed selectively after every new ASID allocation. */ if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; } vcpustate->eptgen = eptgen; KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); KASSERT(ctrl->asid == vcpustate->asid.num, ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); } static __inline void disable_gintr(void) { __asm __volatile("clgi"); } static __inline void enable_gintr(void) { __asm __volatile("stgi"); } static __inline void svm_dr_enter_guest(struct svm_regctx *gctx) { /* Save host control debug registers. */ gctx->host_dr7 = rdr7(); gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); /* * Disable debugging in DR7 and DEBUGCTL to avoid triggering * exceptions in the host based on the guest DRx values. The * guest DR6, DR7, and DEBUGCTL are saved/restored in the * VMCB. */ load_dr7(0); wrmsr(MSR_DEBUGCTLMSR, 0); /* Save host debug registers. 
*/ gctx->host_dr0 = rdr0(); gctx->host_dr1 = rdr1(); gctx->host_dr2 = rdr2(); gctx->host_dr3 = rdr3(); gctx->host_dr6 = rdr6(); /* Restore guest debug registers. */ load_dr0(gctx->sctx_dr0); load_dr1(gctx->sctx_dr1); load_dr2(gctx->sctx_dr2); load_dr3(gctx->sctx_dr3); }
static __inline void svm_dr_leave_guest(struct svm_regctx *gctx) { /* Save guest debug registers. */ gctx->sctx_dr0 = rdr0(); gctx->sctx_dr1 = rdr1(); gctx->sctx_dr2 = rdr2(); gctx->sctx_dr3 = rdr3(); /* * Restore host debug registers. Restore DR7 and DEBUGCTL * last. */ load_dr0(gctx->host_dr0); load_dr1(gctx->host_dr1); load_dr2(gctx->host_dr2); load_dr3(gctx->host_dr3); load_dr6(gctx->host_dr6); wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl); load_dr7(gctx->host_dr7); }
/* * Start vcpu with specified RIP. */ static int svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) { struct svm_regctx *gctx; struct svm_softc *svm_sc; struct svm_vcpu *vcpustate; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct vm_exit *vmexit; struct vlapic *vlapic; struct vm *vm; uint64_t vmcb_pa; int handled; svm_sc = arg; vm = svm_sc->vm; vcpustate = svm_get_vcpu(svm_sc, vcpu); state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); vmexit = vm_exitinfo(vm, vcpu); vlapic = vm_lapic(vm, vcpu); gctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; if (vcpustate->lastcpu != curcpu) { /* * Force new ASID allocation by invalidating the generation. */ vcpustate->asid.gen = 0; /* * Invalidate the VMCB state cache by marking all fields dirty. */ svm_set_dirty(svm_sc, vcpu, 0xffffffff); /* * XXX * Setting 'vcpustate->lastcpu' here is a bit premature because * we may return from this function without actually executing * the VMRUN instruction. This could happen if a rendezvous * or an AST is pending on the first time through the loop. * * This works for now but any new side-effects of vcpu * migration should take this case into account. */ vcpustate->lastcpu = curcpu; vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); } svm_msr_guest_enter(svm_sc, vcpu); /* Update Guest RIP */ state->rip = rip; do { /* * Disable global interrupts to guarantee atomicity during * loading of guest state. This includes not only the state * loaded by the "vmrun" instruction but also software state * maintained by the hypervisor: suspended and rendezvous * state, NPT generation number, vlapic interrupts etc. */ disable_gintr(); if (vcpu_suspended(evinfo)) { enable_gintr(); vm_exit_suspended(vm, vcpu, state->rip); break; } if (vcpu_rendezvous_pending(evinfo)) { enable_gintr(); vm_exit_rendezvous(vm, vcpu, state->rip); break; } if (vcpu_reqidle(evinfo)) { enable_gintr(); vm_exit_reqidle(vm, vcpu, state->rip); break; } /* The scheduler has asked us to yield the cpu. */ if (vcpu_should_yield(vm, vcpu)) { enable_gintr(); vm_exit_astpending(vm, vcpu, state->rip); break; } if (vcpu_debugged(vm, vcpu)) { enable_gintr(); vm_exit_debug(vm, vcpu, state->rip); break; } svm_inj_interrupts(svm_sc, vcpu, vlapic); /* Activate the nested pmap on 'curcpu' */ CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); /* * Check the pmap generation and the ASID generation to * ensure that the vcpu does not use stale TLB mappings. */ check_asid(svm_sc, vcpu, pmap, curcpu); ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; vcpustate->dirty = 0; VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean); /* Launch Virtual Machine.
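 * svm_launch() switches to the guest register context and executes VMRUN
 * on the VMCB at 'vmcb_pa'; global interrupts are still disabled (the
 * "clgi" issued above), so the world switch cannot be interrupted.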
*/ VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); svm_dr_enter_guest(gctx); svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]); svm_dr_leave_guest(gctx); CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); /* * The host GDTR and IDTR is saved by VMRUN and restored * automatically on #VMEXIT. However, the host TSS needs * to be restored explicitly. */ restore_host_tss(); /* #VMEXIT disables interrupts so re-enable them here. */ enable_gintr(); /* Update 'nextrip' */ vcpustate->nextrip = state->rip; /* Handle #VMEXIT and if required return to user space. */ handled = svm_vmexit(svm_sc, vcpu, vmexit); } while (handled); svm_msr_guest_exit(svm_sc, vcpu); return (0); } static void svm_vmcleanup(void *arg) { struct svm_softc *sc = arg; contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM); contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM); free(sc, M_SVM); } static register_t * swctx_regptr(struct svm_regctx *regctx, int reg) { switch (reg) { case VM_REG_GUEST_RBX: return (®ctx->sctx_rbx); case VM_REG_GUEST_RCX: return (®ctx->sctx_rcx); case VM_REG_GUEST_RDX: return (®ctx->sctx_rdx); case VM_REG_GUEST_RDI: return (®ctx->sctx_rdi); case VM_REG_GUEST_RSI: return (®ctx->sctx_rsi); case VM_REG_GUEST_RBP: return (®ctx->sctx_rbp); case VM_REG_GUEST_R8: return (®ctx->sctx_r8); case VM_REG_GUEST_R9: return (®ctx->sctx_r9); case VM_REG_GUEST_R10: return (®ctx->sctx_r10); case VM_REG_GUEST_R11: return (®ctx->sctx_r11); case VM_REG_GUEST_R12: return (®ctx->sctx_r12); case VM_REG_GUEST_R13: return (®ctx->sctx_r13); case VM_REG_GUEST_R14: return (®ctx->sctx_r14); case VM_REG_GUEST_R15: return (®ctx->sctx_r15); case VM_REG_GUEST_DR0: return (®ctx->sctx_dr0); case VM_REG_GUEST_DR1: return (®ctx->sctx_dr1); case VM_REG_GUEST_DR2: return (®ctx->sctx_dr2); case VM_REG_GUEST_DR3: return (®ctx->sctx_dr3); default: return (NULL); } } static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_get_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_read(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *val = *reg; return (0); } VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident); return (EINVAL); } static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_modify_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_write(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *reg = val; return (0); } /* * XXX deal with CR3 and invalidate TLB entries tagged with the * vcpu's ASID. This needs to be treated differently depending on * whether 'running' is true/false. 
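 *
 * svm_getreg()/svm_setreg() above use a two-tier lookup: registers the
 * hardware saves in the VMCB go through vmcb_read()/vmcb_write(), and
 * everything else falls back to the software context.  A self-contained
 * toy version of that dispatch (hypothetical register identifiers):
 */

#include <errno.h>
#include <stddef.h>
#include <stdint.h>

enum { REG_RAX, REG_RBX };		/* toy identifiers */

static uint64_t toy_vmcb_rax;		/* hardware-saved via VMRUN */
static uint64_t toy_sctx_rbx;		/* software-saved by svm_launch */

static int
toy_vmcb_read(int ident, uint64_t *val)
{
	if (ident != REG_RAX)
		return (-1);		/* not part of the VMCB */
	*val = toy_vmcb_rax;
	return (0);
}

static uint64_t *
toy_swctx_ptr(int ident)
{
	return (ident == REG_RBX ? &toy_sctx_rbx : NULL);
}

static int
toy_getreg(int ident, uint64_t *val)
{
	uint64_t *reg;

	if (toy_vmcb_read(ident, val) == 0)		/* tier 1: VMCB */
		return (0);
	if ((reg = toy_swctx_ptr(ident)) != NULL) {	/* tier 2: swctx */
		*val = *reg;
		return (0);
	}
	return (EINVAL);				/* unknown register */
}

/*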
*/ VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident); return (EINVAL); } static int svm_setcap(void *arg, int vcpu, int type, int val) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT, val); break; case VM_CAP_PAUSE_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE, val); break; case VM_CAP_UNRESTRICTED_GUEST: /* Unrestricted guest execution cannot be disabled in SVM */ if (val == 0) error = EINVAL; break; default: error = ENOENT; break; } return (error); } static int svm_getcap(void *arg, int vcpu, int type, int *retval) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT); break; case VM_CAP_PAUSE_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE); break; case VM_CAP_UNRESTRICTED_GUEST: *retval = 1; /* unrestricted guest is always enabled */ break; default: error = ENOENT; break; } return (error); } static struct vlapic * svm_vlapic_init(void *arg, int vcpuid) { struct svm_softc *svm_sc; struct vlapic *vlapic; svm_sc = arg; vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = svm_sc->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; vlapic_init(vlapic); return (vlapic); } static void svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_SVM_VLAPIC); } struct vmm_ops vmm_ops_amd = { svm_init, svm_cleanup, svm_restore, svm_vminit, svm_vmrun, svm_vmcleanup, svm_getreg, svm_setreg, vmcb_getdesc, vmcb_setdesc, svm_getcap, svm_setcap, svm_npt_alloc, svm_npt_free, svm_vlapic_init, svm_vlapic_cleanup }; Index: head/sys/amd64/vmm/amd/svm.h =================================================================== --- head/sys/amd64/vmm/amd/svm.h (revision 335029) +++ head/sys/amd64/vmm/amd/svm.h (revision 335030) @@ -1,68 +1,70 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SVM_H_ #define _SVM_H_ struct pcpu; /* * Guest register state that is saved outside the VMCB. 
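 * VMRUN itself only saves and loads a subset of guest state (e.g. %rax,
 * %rsp, %rip, %rflags and the segment registers live in the VMCB); the
 * remaining general-purpose and debug registers must be swapped by
 * software in svm_launch().  An illustrative view of that split (these
 * structs are explanatory only, not part of this header):
 */

#include <stdint.h>

struct hw_saved {	/* kept in the VMCB state area by VMRUN/#VMEXIT */
	uint64_t rax, rsp, rip, rflags;
};

struct sw_saved {	/* kept in struct svm_regctx, swapped by hand */
	uint64_t rbx, rcx, rdx, rdi, rsi, rbp;
	uint64_t r8, r9, r10, r11, r12, r13, r14, r15;
	uint64_t dr0, dr1, dr2, dr3;
};

/*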
*/ struct svm_regctx { register_t sctx_rbp; register_t sctx_rbx; register_t sctx_rcx; register_t sctx_rdx; register_t sctx_rdi; register_t sctx_rsi; register_t sctx_r8; register_t sctx_r9; register_t sctx_r10; register_t sctx_r11; register_t sctx_r12; register_t sctx_r13; register_t sctx_r14; register_t sctx_r15; register_t sctx_dr0; register_t sctx_dr1; register_t sctx_dr2; register_t sctx_dr3; register_t host_dr0; register_t host_dr1; register_t host_dr2; register_t host_dr3; register_t host_dr6; register_t host_dr7; uint64_t host_debugctl; }; void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu); #endif /* _SVM_H_ */ Index: head/sys/amd64/vmm/amd/svm_genassym.c =================================================================== --- head/sys/amd64/vmm/amd/svm_genassym.c (revision 335029) +++ head/sys/amd64/vmm/amd/svm_genassym.c (revision 335030) @@ -1,50 +1,52 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include "svm.h" ASSYM(SCTX_RBX, offsetof(struct svm_regctx, sctx_rbx)); ASSYM(SCTX_RCX, offsetof(struct svm_regctx, sctx_rcx)); ASSYM(SCTX_RBP, offsetof(struct svm_regctx, sctx_rbp)); ASSYM(SCTX_RDX, offsetof(struct svm_regctx, sctx_rdx)); ASSYM(SCTX_RDI, offsetof(struct svm_regctx, sctx_rdi)); ASSYM(SCTX_RSI, offsetof(struct svm_regctx, sctx_rsi)); ASSYM(SCTX_R8, offsetof(struct svm_regctx, sctx_r8)); ASSYM(SCTX_R9, offsetof(struct svm_regctx, sctx_r9)); ASSYM(SCTX_R10, offsetof(struct svm_regctx, sctx_r10)); ASSYM(SCTX_R11, offsetof(struct svm_regctx, sctx_r11)); ASSYM(SCTX_R12, offsetof(struct svm_regctx, sctx_r12)); ASSYM(SCTX_R13, offsetof(struct svm_regctx, sctx_r13)); ASSYM(SCTX_R14, offsetof(struct svm_regctx, sctx_r14)); ASSYM(SCTX_R15, offsetof(struct svm_regctx, sctx_r15)); ASSYM(MSR_GSBASE, MSR_GSBASE); Index: head/sys/amd64/vmm/amd/svm_msr.c =================================================================== --- head/sys/amd64/vmm/amd/svm_msr.c (revision 335029) +++ head/sys/amd64/vmm/amd/svm_msr.c (revision 335030) @@ -1,170 +1,172 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "svm.h" #include "vmcb.h" #include "svm_softc.h" #include "svm_msr.h" #ifndef MSR_AMDK8_IPM #define MSR_AMDK8_IPM 0xc0010055 #endif enum { IDX_MSR_LSTAR, IDX_MSR_CSTAR, IDX_MSR_STAR, IDX_MSR_SF_MASK, HOST_MSR_NUM /* must be the last enumeration */ }; static uint64_t host_msrs[HOST_MSR_NUM]; void svm_msr_init(void) { /* * It is safe to cache the values of the following MSRs because they * don't change based on curcpu, curproc or curthread. */ host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); } void svm_msr_guest_init(struct svm_softc *sc, int vcpu) { /* * All the MSRs accessible to the guest are either saved/restored by * hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored * by VMSAVE/VMLOAD (e.g., MSR_GSBASE). * * There are no guest MSRs that are saved/restored "by hand" so nothing * more to do here. */ return; } void svm_msr_guest_enter(struct svm_softc *sc, int vcpu) { /* * Save host MSRs (if any) and restore guest MSRs (if any). */ } void svm_msr_guest_exit(struct svm_softc *sc, int vcpu) { /* * Save guest MSRs (if any) and restore host MSRs. */ wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); /* MSR_KGSBASE will be restored on the way back to userspace */ } int svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, bool *retu) { int error = 0; switch (num) { case MSR_MCG_CAP: case MSR_MCG_STATUS: *result = 0; break; case MSR_MTRRcap: case MSR_MTRRdefType: case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: case MSR_MTRR64kBase: case MSR_SYSCFG: *result = 0; break; case MSR_AMDK8_IPM: *result = 0; break; default: error = EINVAL; break; } return (error); } int svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) { int error = 0; switch (num) { case MSR_MCG_CAP: case MSR_MCG_STATUS: break; /* ignore writes */ case MSR_MTRRcap: vm_inject_gp(sc->vm, vcpu); break; case MSR_MTRRdefType: case MSR_MTRR4kBase ... 
MSR_MTRR4kBase + 8: case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: case MSR_MTRR64kBase: case MSR_SYSCFG: break; /* Ignore writes */ case MSR_AMDK8_IPM: /* * Ignore writes to the "Interrupt Pending Message" MSR. */ break; case MSR_K8_UCODE_UPDATE: /* * Ignore writes to microcode update register. */ break; default: error = EINVAL; break; } return (error); } Index: head/sys/amd64/vmm/amd/svm_msr.h =================================================================== --- head/sys/amd64/vmm/amd/svm_msr.h (revision 335029) +++ head/sys/amd64/vmm/amd/svm_msr.h (revision 335030) @@ -1,44 +1,46 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SVM_MSR_H_ #define _SVM_MSR_H_ struct svm_softc; void svm_msr_init(void); void svm_msr_guest_init(struct svm_softc *sc, int vcpu); void svm_msr_guest_enter(struct svm_softc *sc, int vcpu); void svm_msr_guest_exit(struct svm_softc *sc, int vcpu); int svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu); int svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, bool *retu); #endif /* _SVM_MSR_H_ */ Index: head/sys/amd64/vmm/amd/svm_softc.h =================================================================== --- head/sys/amd64/vmm/amd/svm_softc.h (revision 335029) +++ head/sys/amd64/vmm/amd/svm_softc.h (revision 335030) @@ -1,114 +1,116 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SVM_SOFTC_H_ #define _SVM_SOFTC_H_ #define SVM_IO_BITMAP_SIZE (3 * PAGE_SIZE) #define SVM_MSR_BITMAP_SIZE (2 * PAGE_SIZE) struct asid { uint64_t gen; /* range is [1, ~0UL] */ uint32_t num; /* range is [1, nasid - 1] */ }; /* * XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space * due to VMCB alignment requirements. */ struct svm_vcpu { struct vmcb vmcb; /* hardware saved vcpu context */ struct svm_regctx swctx; /* software saved vcpu context */ uint64_t vmcb_pa; /* VMCB physical address */ uint64_t nextrip; /* next instruction to be executed by guest */ int lastcpu; /* host cpu that the vcpu last ran on */ uint32_t dirty; /* state cache bits that must be cleared */ long eptgen; /* pmap->pm_eptgen when the vcpu last ran */ struct asid asid; } __aligned(PAGE_SIZE); /* * SVM softc, one per virtual machine. */ struct svm_softc { uint8_t apic_page[VM_MAXCPU][PAGE_SIZE]; struct svm_vcpu vcpu[VM_MAXCPU]; vm_offset_t nptp; /* nested page table */ uint8_t *iopm_bitmap; /* shared by all vcpus */ uint8_t *msr_bitmap; /* shared by all vcpus */ struct vm *vm; }; CTASSERT((offsetof(struct svm_softc, nptp) & PAGE_MASK) == 0); static __inline struct svm_vcpu * svm_get_vcpu(struct svm_softc *sc, int vcpu) { return (&(sc->vcpu[vcpu])); } static __inline struct vmcb * svm_get_vmcb(struct svm_softc *sc, int vcpu) { return (&(sc->vcpu[vcpu].vmcb)); } static __inline struct vmcb_state * svm_get_vmcb_state(struct svm_softc *sc, int vcpu) { return (&(sc->vcpu[vcpu].vmcb.state)); } static __inline struct vmcb_ctrl * svm_get_vmcb_ctrl(struct svm_softc *sc, int vcpu) { return (&(sc->vcpu[vcpu].vmcb.ctrl)); } static __inline struct svm_regctx * svm_get_guest_regctx(struct svm_softc *sc, int vcpu) { return (&(sc->vcpu[vcpu].swctx)); } static __inline void svm_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits) { struct svm_vcpu *vcpustate; vcpustate = svm_get_vcpu(sc, vcpu); vcpustate->dirty |= dirtybits; } #endif /* _SVM_SOFTC_H_ */ Index: head/sys/amd64/vmm/amd/svm_support.S =================================================================== --- head/sys/amd64/vmm/amd/svm_support.S (revision 335029) +++ head/sys/amd64/vmm/amd/svm_support.S (revision 335030) @@ -1,161 +1,163 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include "svm_assym.h" /* * Be friendly to DTrace FBT's prologue/epilogue pattern matching. * * They are also responsible for saving/restoring the host %rbp across VMRUN. */ #define VENTER push %rbp ; mov %rsp,%rbp #define VLEAVE pop %rbp #define VMLOAD .byte 0x0f, 0x01, 0xda #define VMRUN .byte 0x0f, 0x01, 0xd8 #define VMSAVE .byte 0x0f, 0x01, 0xdb /* * svm_launch(uint64_t vmcb, struct svm_regctx *gctx, struct pcpu *pcpu) * %rdi: physical address of VMCB * %rsi: pointer to guest context * %rdx: pointer to the pcpu data */ ENTRY(svm_launch) VENTER /* save pointer to the pcpu data */ push %rdx /* * Host register state saved across a VMRUN. * * All "callee saved registers" except: * %rsp: because it is preserved by the processor across VMRUN. * %rbp: because it is saved/restored by the function prologue/epilogue. */ push %rbx push %r12 push %r13 push %r14 push %r15 /* Save the physical address of the VMCB in %rax */ movq %rdi, %rax push %rsi /* push guest context pointer on the stack */ /* * Restore guest state. */ movq SCTX_R8(%rsi), %r8 movq SCTX_R9(%rsi), %r9 movq SCTX_R10(%rsi), %r10 movq SCTX_R11(%rsi), %r11 movq SCTX_R12(%rsi), %r12 movq SCTX_R13(%rsi), %r13 movq SCTX_R14(%rsi), %r14 movq SCTX_R15(%rsi), %r15 movq SCTX_RBP(%rsi), %rbp movq SCTX_RBX(%rsi), %rbx movq SCTX_RCX(%rsi), %rcx movq SCTX_RDX(%rsi), %rdx movq SCTX_RDI(%rsi), %rdi movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ VMLOAD VMRUN VMSAVE pop %rax /* pop guest context pointer from the stack */ /* * Save guest state. */ movq %r8, SCTX_R8(%rax) movq %r9, SCTX_R9(%rax) movq %r10, SCTX_R10(%rax) movq %r11, SCTX_R11(%rax) movq %r12, SCTX_R12(%rax) movq %r13, SCTX_R13(%rax) movq %r14, SCTX_R14(%rax) movq %r15, SCTX_R15(%rax) movq %rbp, SCTX_RBP(%rax) movq %rbx, SCTX_RBX(%rax) movq %rcx, SCTX_RCX(%rax) movq %rdx, SCTX_RDX(%rax) movq %rdi, SCTX_RDI(%rax) movq %rsi, SCTX_RSI(%rax) /* * To prevent malicious branch target predictions from * affecting the host, overwrite all entries in the RSB upon * exiting a guest. */ mov $16, %ecx /* 16 iterations, two calls per loop */ mov %rsp, %rax 0: call 2f /* create an RSB entry. */ 1: pause call 1b /* capture rogue speculation. */ 2: call 2f /* create an RSB entry. */ 1: pause call 1b /* capture rogue speculation. */ 2: sub $1, %ecx jnz 0b mov %rax, %rsp /* Restore host state */ pop %r15 pop %r14 pop %r13 pop %r12 pop %rbx /* Restore %GS.base to point to the host's pcpu data */ pop %rdx mov %edx, %eax shr $32, %rdx mov $MSR_GSBASE, %rcx wrmsr /* * Clobber the remaining registers with guest contents so they * can't be misused. 
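 *
 * The %GS.base restore just above splits the 64-bit MSR value across
 * %edx:%eax because that is the operand convention of WRMSR.  The same
 * split written out in C (privileged instruction, sketch only):
 */

#include <stdint.h>

static inline void
wrmsr_split(uint32_t msr, uint64_t val)
{
	uint32_t lo = (uint32_t)val;		/* low half -> %eax */
	uint32_t hi = (uint32_t)(val >> 32);	/* high half -> %edx */

	__asm __volatile("wrmsr" : : "c" (msr), "a" (lo), "d" (hi));
}

/*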
*/ xor %rbp, %rbp xor %rdi, %rdi xor %rsi, %rsi xor %r8, %r8 xor %r9, %r9 xor %r10, %r10 xor %r11, %r11 VLEAVE ret END(svm_launch) Index: head/sys/amd64/vmm/amd/vmcb.c =================================================================== --- head/sys/amd64/vmm/amd/vmcb.c (revision 335029) +++ head/sys/amd64/vmm/amd/vmcb.c (revision 335030) @@ -1,452 +1,454 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "vmm_ktr.h" #include "vmcb.h" #include "svm.h" #include "svm_softc.h" /* * The VMCB aka Virtual Machine Control Block is a 4KB aligned page * in memory that describes the virtual machine. * * The VMCB contains: * - instructions or events in the guest to intercept * - control bits that modify execution environment of the guest * - guest processor state (e.g. general purpose registers) */ /* * Return VMCB segment area. */ static struct vmcb_segment * vmcb_segptr(struct vmcb *vmcb, int type) { struct vmcb_state *state; struct vmcb_segment *seg; state = &vmcb->state; switch (type) { case VM_REG_GUEST_CS: seg = &state->cs; break; case VM_REG_GUEST_DS: seg = &state->ds; break; case VM_REG_GUEST_ES: seg = &state->es; break; case VM_REG_GUEST_FS: seg = &state->fs; break; case VM_REG_GUEST_GS: seg = &state->gs; break; case VM_REG_GUEST_SS: seg = &state->ss; break; case VM_REG_GUEST_GDTR: seg = &state->gdt; break; case VM_REG_GUEST_IDTR: seg = &state->idt; break; case VM_REG_GUEST_LDTR: seg = &state->ldt; break; case VM_REG_GUEST_TR: seg = &state->tr; break; default: seg = NULL; break; } return (seg); } static int vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident, uint64_t *val) { struct vmcb *vmcb; int off, bytes; char *ptr; vmcb = svm_get_vmcb(softc, vcpu); off = VMCB_ACCESS_OFFSET(ident); bytes = VMCB_ACCESS_BYTES(ident); if ((off + bytes) >= sizeof (struct vmcb)) return (EINVAL); ptr = (char *)vmcb; if (!write) *val = 0; switch (bytes) { case 8: case 4: case 2: if (write) memcpy(ptr + off, val, bytes); else memcpy(val, ptr + off, bytes); break; default: VCPU_CTR1(softc->vm, vcpu, "Invalid size %d for VMCB access: %d", bytes); return (EINVAL); } /* Invalidate all VMCB state cached by h/w. 
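 *
 * vmcb_access() above takes an 'ident' produced by the VMCB_ACCESS()
 * macro (defined in vmcb.h further below), which packs a byte count and
 * a byte offset into the VMCB behind a marker bit.  A runnable check of
 * that packing, using local copies of the macros:
 */

#include <assert.h>
#include <stdint.h>

#define ACCESS(o, w)	(0x80000000u | (((w) & 0xF) << 16) | ((o) & 0xFFF))
#define ACCESS_OK(v)	((v) & 0x80000000u)
#define ACCESS_BYTES(v)	(((v) >> 16) & 0xF)
#define ACCESS_OFF(v)	((v) & 0xFFF)

int
main(void)
{
	/* e.g. the 8-byte exit code at control-area offset 0x70 */
	uint32_t ident = ACCESS(0x70, 8);

	assert(ACCESS_OK(ident));
	assert(ACCESS_BYTES(ident) == 8);
	assert(ACCESS_OFF(ident) == 0x70);
	return (0);
}

/*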
*/ if (write) svm_set_dirty(softc, vcpu, 0xffffffff); return (0); } /* * Read from segment selector, control and general purpose register of VMCB. */ int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_segment *seg; int err; vmcb = svm_get_vmcb(sc, vcpu); state = &vmcb->state; err = 0; if (VMCB_ACCESS_OK(ident)) return (vmcb_access(sc, vcpu, 0, ident, retval)); switch (ident) { case VM_REG_GUEST_CR0: *retval = state->cr0; break; case VM_REG_GUEST_CR2: *retval = state->cr2; break; case VM_REG_GUEST_CR3: *retval = state->cr3; break; case VM_REG_GUEST_CR4: *retval = state->cr4; break; case VM_REG_GUEST_DR6: *retval = state->dr6; break; case VM_REG_GUEST_DR7: *retval = state->dr7; break; case VM_REG_GUEST_EFER: *retval = state->efer; break; case VM_REG_GUEST_RAX: *retval = state->rax; break; case VM_REG_GUEST_RFLAGS: *retval = state->rflags; break; case VM_REG_GUEST_RIP: *retval = state->rip; break; case VM_REG_GUEST_RSP: *retval = state->rsp; break; case VM_REG_GUEST_CS: case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_SS: case VM_REG_GUEST_LDTR: case VM_REG_GUEST_TR: seg = vmcb_segptr(vmcb, ident); KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", __func__, ident)); *retval = seg->selector; break; case VM_REG_GUEST_GDTR: case VM_REG_GUEST_IDTR: /* GDTR and IDTR don't have segment selectors */ err = EINVAL; break; default: err = EINVAL; break; } return (err); } /* * Write to segment selector, control and general purpose register of VMCB. */ int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_segment *seg; int err, dirtyseg; vmcb = svm_get_vmcb(sc, vcpu); state = &vmcb->state; dirtyseg = 0; err = 0; if (VMCB_ACCESS_OK(ident)) return (vmcb_access(sc, vcpu, 1, ident, &val)); switch (ident) { case VM_REG_GUEST_CR0: state->cr0 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_CR2: state->cr2 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR2); break; case VM_REG_GUEST_CR3: state->cr3 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_CR4: state->cr4 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_DR6: state->dr6 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); break; case VM_REG_GUEST_DR7: state->dr7 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); break; case VM_REG_GUEST_EFER: /* EFER_SVM must always be set when the guest is executing */ state->efer = val | EFER_SVM; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_RAX: state->rax = val; break; case VM_REG_GUEST_RFLAGS: state->rflags = val; break; case VM_REG_GUEST_RIP: state->rip = val; break; case VM_REG_GUEST_RSP: state->rsp = val; break; case VM_REG_GUEST_CS: case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_SS: dirtyseg = 1; /* FALLTHROUGH */ case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_LDTR: case VM_REG_GUEST_TR: seg = vmcb_segptr(vmcb, ident); KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", __func__, ident)); seg->selector = val; if (dirtyseg) svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); break; case VM_REG_GUEST_GDTR: case VM_REG_GUEST_IDTR: /* GDTR and IDTR don't have segment selectors */ err = EINVAL; break; default: err = EINVAL; break; } return (err); } int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2) { struct vmcb_segment *seg; seg = vmcb_segptr(vmcb, ident); if (seg != 
NULL) { bcopy(seg, seg2, sizeof(struct vmcb_segment)); return (0); } else { return (EINVAL); } } int vmcb_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { struct vmcb *vmcb; struct svm_softc *sc; struct vmcb_segment *seg; uint16_t attrib; sc = arg; vmcb = svm_get_vmcb(sc, vcpu); seg = vmcb_segptr(vmcb, reg); KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", __func__, reg)); seg->base = desc->base; seg->limit = desc->limit; if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { /* * Map seg_desc access to VMCB attribute format. * * SVM uses the 'P' bit in the segment attributes to indicate a * NULL segment so clear it if the segment is marked unusable. */ attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF); if (SEG_DESC_UNUSABLE(desc->access)) { attrib &= ~0x80; } seg->attrib = attrib; } VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), " "attrib (%#x)", reg, seg->base, seg->limit, seg->attrib); switch (reg) { case VM_REG_GUEST_CS: case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_SS: svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); break; case VM_REG_GUEST_GDTR: case VM_REG_GUEST_IDTR: svm_set_dirty(sc, vcpu, VMCB_CACHE_DT); break; default: break; } return (0); } int vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { struct vmcb *vmcb; struct svm_softc *sc; struct vmcb_segment *seg; sc = arg; vmcb = svm_get_vmcb(sc, vcpu); seg = vmcb_segptr(vmcb, reg); KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", __func__, reg)); desc->base = seg->base; desc->limit = seg->limit; desc->access = 0; if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { /* Map seg_desc access to VMCB attribute format */ desc->access = ((seg->attrib & 0xF00) << 4) | (seg->attrib & 0xFF); /* * VT-x uses bit 16 to indicate a segment that has been loaded * with a NULL selector (aka unusable). The 'desc->access' * field is interpreted in the VT-x format by the * processor-independent code. * * SVM uses the 'P' bit to convey the same information so * convert it into the VT-x format. For more details refer to * section "Segment State in the VMCB" in APMv2. */ if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) { if ((desc->access & 0x80) == 0) desc->access |= 0x10000; /* Unusable segment */ } } return (0); } Index: head/sys/amd64/vmm/amd/vmcb.h =================================================================== --- head/sys/amd64/vmm/amd/vmcb.h (revision 335029) +++ head/sys/amd64/vmm/amd/vmcb.h (revision 335030) @@ -1,334 +1,336 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMCB_H_ #define _VMCB_H_ struct svm_softc; #define BIT(n) (1ULL << n) /* * Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15 * Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B */ /* vmcb_ctrl->intercept[] array indices */ #define VMCB_CR_INTCPT 0 #define VMCB_DR_INTCPT 1 #define VMCB_EXC_INTCPT 2 #define VMCB_CTRL1_INTCPT 3 #define VMCB_CTRL2_INTCPT 4 /* intercept[VMCB_CTRL1_INTCPT] fields */ #define VMCB_INTCPT_INTR BIT(0) #define VMCB_INTCPT_NMI BIT(1) #define VMCB_INTCPT_SMI BIT(2) #define VMCB_INTCPT_INIT BIT(3) #define VMCB_INTCPT_VINTR BIT(4) #define VMCB_INTCPT_CR0_WRITE BIT(5) #define VMCB_INTCPT_IDTR_READ BIT(6) #define VMCB_INTCPT_GDTR_READ BIT(7) #define VMCB_INTCPT_LDTR_READ BIT(8) #define VMCB_INTCPT_TR_READ BIT(9) #define VMCB_INTCPT_IDTR_WRITE BIT(10) #define VMCB_INTCPT_GDTR_WRITE BIT(11) #define VMCB_INTCPT_LDTR_WRITE BIT(12) #define VMCB_INTCPT_TR_WRITE BIT(13) #define VMCB_INTCPT_RDTSC BIT(14) #define VMCB_INTCPT_RDPMC BIT(15) #define VMCB_INTCPT_PUSHF BIT(16) #define VMCB_INTCPT_POPF BIT(17) #define VMCB_INTCPT_CPUID BIT(18) #define VMCB_INTCPT_RSM BIT(19) #define VMCB_INTCPT_IRET BIT(20) #define VMCB_INTCPT_INTn BIT(21) #define VMCB_INTCPT_INVD BIT(22) #define VMCB_INTCPT_PAUSE BIT(23) #define VMCB_INTCPT_HLT BIT(24) #define VMCB_INTCPT_INVPG BIT(25) #define VMCB_INTCPT_INVPGA BIT(26) #define VMCB_INTCPT_IO BIT(27) #define VMCB_INTCPT_MSR BIT(28) #define VMCB_INTCPT_TASK_SWITCH BIT(29) #define VMCB_INTCPT_FERR_FREEZE BIT(30) #define VMCB_INTCPT_SHUTDOWN BIT(31) /* intercept[VMCB_CTRL2_INTCPT] fields */ #define VMCB_INTCPT_VMRUN BIT(0) #define VMCB_INTCPT_VMMCALL BIT(1) #define VMCB_INTCPT_VMLOAD BIT(2) #define VMCB_INTCPT_VMSAVE BIT(3) #define VMCB_INTCPT_STGI BIT(4) #define VMCB_INTCPT_CLGI BIT(5) #define VMCB_INTCPT_SKINIT BIT(6) #define VMCB_INTCPT_RDTSCP BIT(7) #define VMCB_INTCPT_ICEBP BIT(8) #define VMCB_INTCPT_WBINVD BIT(9) #define VMCB_INTCPT_MONITOR BIT(10) #define VMCB_INTCPT_MWAIT BIT(11) #define VMCB_INTCPT_MWAIT_ARMED BIT(12) #define VMCB_INTCPT_XSETBV BIT(13) /* VMCB TLB control */ #define VMCB_TLB_FLUSH_NOTHING 0 /* Flush nothing */ #define VMCB_TLB_FLUSH_ALL 1 /* Flush entire TLB */ #define VMCB_TLB_FLUSH_GUEST 3 /* Flush all guest entries */ #define VMCB_TLB_FLUSH_GUEST_NONGLOBAL 7 /* Flush guest non-PG entries */ /* VMCB state caching */ #define VMCB_CACHE_NONE 0 /* No caching */ #define VMCB_CACHE_I BIT(0) /* Intercept, TSC off, Pause filter */ #define VMCB_CACHE_IOPM BIT(1) /* I/O and MSR permission */ #define VMCB_CACHE_ASID BIT(2) /* ASID */ #define VMCB_CACHE_TPR BIT(3) /* V_TPR to V_INTR_VECTOR */ #define VMCB_CACHE_NP BIT(4) /* Nested Paging */ #define VMCB_CACHE_CR BIT(5) /* CR0, CR3, CR4 & EFER */ #define VMCB_CACHE_DR BIT(6) /* Debug registers */ #define VMCB_CACHE_DT BIT(7) /* GDT/IDT */ #define VMCB_CACHE_SEG BIT(8) /* User segments, CPL */ #define VMCB_CACHE_CR2 BIT(9) /* page fault address */ #define VMCB_CACHE_LBR BIT(10) /* Last branch */ /* VMCB control event injection */ #define 
VMCB_EVENTINJ_EC_VALID BIT(11) /* Error Code valid */ #define VMCB_EVENTINJ_VALID BIT(31) /* Event valid */ /* Event types that can be injected */ #define VMCB_EVENTINJ_TYPE_INTR 0 #define VMCB_EVENTINJ_TYPE_NMI 2 #define VMCB_EVENTINJ_TYPE_EXCEPTION 3 #define VMCB_EVENTINJ_TYPE_INTn 4 /* VMCB exit code, APM vol2 Appendix C */ #define VMCB_EXIT_MC 0x52 #define VMCB_EXIT_INTR 0x60 #define VMCB_EXIT_NMI 0x61 #define VMCB_EXIT_VINTR 0x64 #define VMCB_EXIT_PUSHF 0x70 #define VMCB_EXIT_POPF 0x71 #define VMCB_EXIT_CPUID 0x72 #define VMCB_EXIT_IRET 0x74 #define VMCB_EXIT_PAUSE 0x77 #define VMCB_EXIT_HLT 0x78 #define VMCB_EXIT_IO 0x7B #define VMCB_EXIT_MSR 0x7C #define VMCB_EXIT_SHUTDOWN 0x7F #define VMCB_EXIT_VMSAVE 0x83 #define VMCB_EXIT_MONITOR 0x8A #define VMCB_EXIT_MWAIT 0x8B #define VMCB_EXIT_NPF 0x400 #define VMCB_EXIT_INVALID -1 /* * Nested page fault. * Bit definitions to decode EXITINFO1. */ #define VMCB_NPF_INFO1_P BIT(0) /* Nested page present. */ #define VMCB_NPF_INFO1_W BIT(1) /* Access was write. */ #define VMCB_NPF_INFO1_U BIT(2) /* Access was user access. */ #define VMCB_NPF_INFO1_RSV BIT(3) /* Reserved bits present. */ #define VMCB_NPF_INFO1_ID BIT(4) /* Code read. */ #define VMCB_NPF_INFO1_GPA BIT(32) /* Guest physical address. */ #define VMCB_NPF_INFO1_GPT BIT(33) /* Guest page table. */ /* * EXITINTINFO, Interrupt exit info for all intrecepts. * Section 15.7.2, Intercepts during IDT Interrupt Delivery. */ #define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF) #define VMCB_EXITINTINFO_TYPE(x) (((x) >> 8) & 0x7) #define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) ? 1 : 0) #define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) ? 1 : 0) #define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF) /* Offset of various VMCB fields. */ #define VMCB_OFF_CTRL(x) (x) #define VMCB_OFF_STATE(x) ((x) + 0x400) #define VMCB_OFF_CR_INTERCEPT VMCB_OFF_CTRL(0x0) #define VMCB_OFF_DR_INTERCEPT VMCB_OFF_CTRL(0x4) #define VMCB_OFF_EXC_INTERCEPT VMCB_OFF_CTRL(0x8) #define VMCB_OFF_INST1_INTERCEPT VMCB_OFF_CTRL(0xC) #define VMCB_OFF_INST2_INTERCEPT VMCB_OFF_CTRL(0x10) #define VMCB_OFF_IO_PERM VMCB_OFF_CTRL(0x40) #define VMCB_OFF_MSR_PERM VMCB_OFF_CTRL(0x48) #define VMCB_OFF_TSC_OFFSET VMCB_OFF_CTRL(0x50) #define VMCB_OFF_ASID VMCB_OFF_CTRL(0x58) #define VMCB_OFF_TLB_CTRL VMCB_OFF_CTRL(0x5C) #define VMCB_OFF_VIRQ VMCB_OFF_CTRL(0x60) #define VMCB_OFF_EXIT_REASON VMCB_OFF_CTRL(0x70) #define VMCB_OFF_EXITINFO1 VMCB_OFF_CTRL(0x78) #define VMCB_OFF_EXITINFO2 VMCB_OFF_CTRL(0x80) #define VMCB_OFF_EXITINTINFO VMCB_OFF_CTRL(0x88) #define VMCB_OFF_AVIC_BAR VMCB_OFF_CTRL(0x98) #define VMCB_OFF_NPT_BASE VMCB_OFF_CTRL(0xB0) #define VMCB_OFF_AVIC_PAGE VMCB_OFF_CTRL(0xE0) #define VMCB_OFF_AVIC_LT VMCB_OFF_CTRL(0xF0) #define VMCB_OFF_AVIC_PT VMCB_OFF_CTRL(0xF8) #define VMCB_OFF_SYSENTER_CS VMCB_OFF_STATE(0x228) #define VMCB_OFF_SYSENTER_ESP VMCB_OFF_STATE(0x230) #define VMCB_OFF_SYSENTER_EIP VMCB_OFF_STATE(0x238) #define VMCB_OFF_GUEST_PAT VMCB_OFF_STATE(0x268) /* * Encode the VMCB offset and bytes that we want to read from VMCB. 
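 *
 * (An aside on the EXITINTINFO accessors defined just above: they pull
 * apart a single 64-bit field.  A standalone decoder using the same
 * layout, illustrative only:)
 */

#include <stdint.h>
#include <stdio.h>

static void
decode_exitintinfo(uint64_t x)
{
	if (((x >> 31) & 1) == 0) {		/* VALID, bit 31 */
		printf("no event was in flight at #VMEXIT\n");
		return;
	}
	printf("vector %u, type %u",
	    (unsigned)(x & 0xFF),		/* VECTOR, bits 7:0 */
	    (unsigned)((x >> 8) & 0x7));	/* TYPE, bits 10:8 */
	if ((x >> 11) & 1)			/* EC_VALID, bit 11 */
		printf(", error code %#x", (unsigned)(x >> 32));
	printf("\n");
}

/*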
*/ #define VMCB_ACCESS(o, w) (0x80000000 | (((w) & 0xF) << 16) | \ ((o) & 0xFFF)) #define VMCB_ACCESS_OK(v) ((v) & 0x80000000 ) #define VMCB_ACCESS_BYTES(v) (((v) >> 16) & 0xF) #define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF) #ifdef _KERNEL /* VMCB save state area segment format */ struct vmcb_segment { uint16_t selector; uint16_t attrib; uint32_t limit; uint64_t base; } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb_segment) == 16); /* Code segment descriptor attribute in 12 bit format as saved by VMCB. */ #define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */ #define VMCB_CS_ATTRIB_D BIT(10) /* OPerand size bit. */ /* * The VMCB is divided into two areas - the first one contains various * control bits including the intercept vector and the second one contains * the guest state. */ /* VMCB control area - padded up to 1024 bytes */ struct vmcb_ctrl { uint32_t intercept[5]; /* all intercepts */ uint8_t pad1[0x28]; /* Offsets 0x14-0x3B are reserved. */ uint16_t pause_filthresh; /* Offset 0x3C, PAUSE filter threshold */ uint16_t pause_filcnt; /* Offset 0x3E, PAUSE filter count */ uint64_t iopm_base_pa; /* 0x40: IOPM_BASE_PA */ uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */ uint64_t tsc_offset; /* 0x50: TSC_OFFSET */ uint32_t asid; /* 0x58: Guest ASID */ uint8_t tlb_ctrl; /* 0x5C: TLB_CONTROL */ uint8_t pad2[3]; /* 0x5D-0x5F: Reserved. */ uint8_t v_tpr; /* 0x60: V_TPR, guest CR8 */ uint8_t v_irq:1; /* Is virtual interrupt pending? */ uint8_t :7; /* Padding */ uint8_t v_intr_prio:4; /* 0x62: Priority for virtual interrupt. */ uint8_t v_ign_tpr:1; uint8_t :3; uint8_t v_intr_masking:1; /* Guest and host sharing of RFLAGS. */ uint8_t :7; uint8_t v_intr_vector; /* 0x64: Vector for virtual interrupt. */ uint8_t pad3[3]; /* 0x65-0x67 Reserved. */ uint64_t intr_shadow:1; /* 0x68: Interrupt shadow, section15.2.1 APM2 */ uint64_t :63; uint64_t exitcode; /* 0x70, Exitcode */ uint64_t exitinfo1; /* 0x78, EXITINFO1 */ uint64_t exitinfo2; /* 0x80, EXITINFO2 */ uint64_t exitintinfo; /* 0x88, Interrupt exit value. */ uint64_t np_enable:1; /* 0x90, Nested paging enable. */ uint64_t :63; uint8_t pad4[0x10]; /* 0x98-0xA7 reserved. */ uint64_t eventinj; /* 0xA8, Event injection. */ uint64_t n_cr3; /* B0, Nested page table. */ uint64_t lbr_virt_en:1; /* Enable LBR virtualization. */ uint64_t :63; uint32_t vmcb_clean; /* 0xC0: VMCB clean bits for caching */ uint32_t :32; /* 0xC4: Reserved */ uint64_t nrip; /* 0xC8: Guest next nRIP. 
*/ uint8_t inst_len; /* 0xD0: #NPF decode assist */ uint8_t inst_bytes[15]; uint8_t padd6[0x320]; } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb_ctrl) == 1024); struct vmcb_state { struct vmcb_segment es; struct vmcb_segment cs; struct vmcb_segment ss; struct vmcb_segment ds; struct vmcb_segment fs; struct vmcb_segment gs; struct vmcb_segment gdt; struct vmcb_segment ldt; struct vmcb_segment idt; struct vmcb_segment tr; uint8_t pad1[0x2b]; /* Reserved: 0xA0-0xCA */ uint8_t cpl; uint8_t pad2[4]; uint64_t efer; uint8_t pad3[0x70]; /* Reserved: 0xd8-0x147 */ uint64_t cr4; uint64_t cr3; /* Guest CR3 */ uint64_t cr0; uint64_t dr7; uint64_t dr6; uint64_t rflags; uint64_t rip; uint8_t pad4[0x58]; /* Reserved: 0x180-0x1D7 */ uint64_t rsp; uint8_t pad5[0x18]; /* Reserved 0x1E0-0x1F7 */ uint64_t rax; uint64_t star; uint64_t lstar; uint64_t cstar; uint64_t sfmask; uint64_t kernelgsbase; uint64_t sysenter_cs; uint64_t sysenter_esp; uint64_t sysenter_eip; uint64_t cr2; uint8_t pad6[0x20]; uint64_t g_pat; uint64_t dbgctl; uint64_t br_from; uint64_t br_to; uint64_t int_from; uint64_t int_to; uint8_t pad7[0x968]; /* Reserved up to end of VMCB */ } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb_state) == 0xC00); struct vmcb { struct vmcb_ctrl ctrl; struct vmcb_state state; } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb) == PAGE_SIZE); CTASSERT(offsetof(struct vmcb, state) == 0x400); int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval); int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val); int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg); #endif /* _KERNEL */ #endif /* _VMCB_H_ */ Index: head/sys/amd64/vmm/intel/vmx_support.S =================================================================== --- head/sys/amd64/vmm/intel/vmx_support.S (revision 335029) +++ head/sys/amd64/vmm/intel/vmx_support.S (revision 335030) @@ -1,325 +1,327 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include "vmx_assym.h" #ifdef SMP #define LK lock ; #else #define LK #endif /* Be friendly to DTrace FBT's prologue/epilogue pattern matching */ #define VENTER push %rbp ; mov %rsp,%rbp #define VLEAVE pop %rbp /* * Save the guest context. */ #define VMX_GUEST_SAVE \ movq %rdi,VMXCTX_GUEST_RDI(%rsp); \ movq %rsi,VMXCTX_GUEST_RSI(%rsp); \ movq %rdx,VMXCTX_GUEST_RDX(%rsp); \ movq %rcx,VMXCTX_GUEST_RCX(%rsp); \ movq %r8,VMXCTX_GUEST_R8(%rsp); \ movq %r9,VMXCTX_GUEST_R9(%rsp); \ movq %rax,VMXCTX_GUEST_RAX(%rsp); \ movq %rbx,VMXCTX_GUEST_RBX(%rsp); \ movq %rbp,VMXCTX_GUEST_RBP(%rsp); \ movq %r10,VMXCTX_GUEST_R10(%rsp); \ movq %r11,VMXCTX_GUEST_R11(%rsp); \ movq %r12,VMXCTX_GUEST_R12(%rsp); \ movq %r13,VMXCTX_GUEST_R13(%rsp); \ movq %r14,VMXCTX_GUEST_R14(%rsp); \ movq %r15,VMXCTX_GUEST_R15(%rsp); \ movq %cr2,%rdi; \ movq %rdi,VMXCTX_GUEST_CR2(%rsp); \ movq %rsp,%rdi; /* * Assumes that %rdi holds a pointer to the 'vmxctx'. * * On "return" all registers are updated to reflect guest state. The two * exceptions are %rip and %rsp. These registers are atomically switched * by hardware from the guest area of the vmcs. * * We modify %rsp to point to the 'vmxctx' so we can use it to restore * host context in case of an error with 'vmlaunch' or 'vmresume'. */ #define VMX_GUEST_RESTORE \ movq %rdi,%rsp; \ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ movq %rsi,%cr2; \ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ movq VMXCTX_GUEST_R8(%rdi),%r8; \ movq VMXCTX_GUEST_R9(%rdi),%r9; \ movq VMXCTX_GUEST_RAX(%rdi),%rax; \ movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ movq VMXCTX_GUEST_R10(%rdi),%r10; \ movq VMXCTX_GUEST_R11(%rdi),%r11; \ movq VMXCTX_GUEST_R12(%rdi),%r12; \ movq VMXCTX_GUEST_R13(%rdi),%r13; \ movq VMXCTX_GUEST_R14(%rdi),%r14; \ movq VMXCTX_GUEST_R15(%rdi),%r15; \ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ /* * Clobber the remaining registers with guest contents so they can't * be misused. */ #define VMX_GUEST_CLOBBER \ xor %rax, %rax; \ xor %rcx, %rcx; \ xor %rdx, %rdx; \ xor %rsi, %rsi; \ xor %r8, %r8; \ xor %r9, %r9; \ xor %r10, %r10; \ xor %r11, %r11; /* * Save and restore the host context. * * Assumes that %rdi holds a pointer to the 'vmxctx'. */ #define VMX_HOST_SAVE \ movq %r15, VMXCTX_HOST_R15(%rdi); \ movq %r14, VMXCTX_HOST_R14(%rdi); \ movq %r13, VMXCTX_HOST_R13(%rdi); \ movq %r12, VMXCTX_HOST_R12(%rdi); \ movq %rbp, VMXCTX_HOST_RBP(%rdi); \ movq %rsp, VMXCTX_HOST_RSP(%rdi); \ movq %rbx, VMXCTX_HOST_RBX(%rdi); \ #define VMX_HOST_RESTORE \ movq VMXCTX_HOST_R15(%rdi), %r15; \ movq VMXCTX_HOST_R14(%rdi), %r14; \ movq VMXCTX_HOST_R13(%rdi), %r13; \ movq VMXCTX_HOST_R12(%rdi), %r12; \ movq VMXCTX_HOST_RBP(%rdi), %rbp; \ movq VMXCTX_HOST_RSP(%rdi), %rsp; \ movq VMXCTX_HOST_RBX(%rdi), %rbx; \ /* * vmx_enter_guest(struct vmxctx *vmxctx, int launched) * %rdi: pointer to the 'vmxctx' * %rsi: pointer to the 'vmx' * %edx: launch state of the VMCS * Interrupts must be disabled on entry. */ ENTRY(vmx_enter_guest) VENTER /* * Save host state before doing anything else. */ VMX_HOST_SAVE /* * Activate guest pmap on this cpu. */ movq VMXCTX_PMAP(%rdi), %r11 movl PCPU(CPUID), %eax LK btsl %eax, PM_ACTIVE(%r11) /* * If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen' * then we must invalidate all mappings associated with this EPTP. 
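 *
 * In C, the comparison the assembly below performs looks like this
 * (sketch only; 'invept_single' stands in for the INVEPT single-context
 * invalidation):
 */

#include <stdint.h>

static void
maybe_invept(uint64_t *cached_eptgen, uint64_t pm_eptgen,
    void (*invept_single)(uint64_t), uint64_t eptp)
{
	if (*cached_eptgen == pm_eptgen)
		return;			/* cached mappings still valid */
	*cached_eptgen = pm_eptgen;	/* refresh vmx->eptgen[curcpu] */
	(*invept_single)(eptp);		/* drop stale EPT-tagged entries */
}

/*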
*/ movq PM_EPTGEN(%r11), %r10 cmpq %r10, VMX_EPTGEN(%rsi, %rax, 8) je guest_restore /* Refresh 'vmx->eptgen[curcpu]' */ movq %r10, VMX_EPTGEN(%rsi, %rax, 8) /* Setup the invept descriptor on the host stack */ mov %rsp, %r11 movq VMX_EPTP(%rsi), %rax movq %rax, -16(%r11) movq $0x0, -8(%r11) mov $0x1, %eax /* Single context invalidate */ invept -16(%r11), %rax jbe invept_error /* Check invept instruction error */ guest_restore: cmpl $0, %edx je do_launch VMX_GUEST_RESTORE vmresume /* * In the common case 'vmresume' returns back to the host through * 'vmx_exit_guest' with %rsp pointing to 'vmxctx'. * * If there is an error we return VMX_VMRESUME_ERROR to the caller. */ movq %rsp, %rdi /* point %rdi back to 'vmxctx' */ movl $VMX_VMRESUME_ERROR, %eax jmp decode_inst_error do_launch: VMX_GUEST_RESTORE vmlaunch /* * In the common case 'vmlaunch' returns back to the host through * 'vmx_exit_guest' with %rsp pointing to 'vmxctx'. * * If there is an error we return VMX_VMLAUNCH_ERROR to the caller. */ movq %rsp, %rdi /* point %rdi back to 'vmxctx' */ movl $VMX_VMLAUNCH_ERROR, %eax jmp decode_inst_error invept_error: movl $VMX_INVEPT_ERROR, %eax jmp decode_inst_error decode_inst_error: movl $VM_FAIL_VALID, %r11d jz inst_error movl $VM_FAIL_INVALID, %r11d inst_error: movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi) /* * The return value is already populated in %eax so we cannot use * it as a scratch register beyond this point. */ /* * Deactivate guest pmap from this cpu. */ movq VMXCTX_PMAP(%rdi), %r11 movl PCPU(CPUID), %r10d LK btrl %r10d, PM_ACTIVE(%r11) VMX_HOST_RESTORE VLEAVE ret /* * Non-error VM-exit from the guest. Make this a label so it can * be used by C code when setting up the VMCS. * The VMCS-restored %rsp points to the struct vmxctx */ ALIGN_TEXT .globl vmx_exit_guest_flush_rsb vmx_exit_guest_flush_rsb: /* * Save guest state that is not automatically saved in the vmcs. */ VMX_GUEST_SAVE /* * Deactivate guest pmap from this cpu. */ movq VMXCTX_PMAP(%rdi), %r11 movl PCPU(CPUID), %r10d LK btrl %r10d, PM_ACTIVE(%r11) VMX_HOST_RESTORE VMX_GUEST_CLOBBER /* * To prevent malicious branch target predictions from * affecting the host, overwrite all entries in the RSB upon * exiting a guest. */ mov $16, %ecx /* 16 iterations, two calls per loop */ mov %rsp, %rax 0: call 2f /* create an RSB entry. */ 1: pause call 1b /* capture rogue speculation. */ 2: call 2f /* create an RSB entry. */ 1: pause call 1b /* capture rogue speculation. */ 2: sub $1, %ecx jnz 0b mov %rax, %rsp /* * This will return to the caller of 'vmx_enter_guest()' with a return * value of VMX_GUEST_VMEXIT. */ movl $VMX_GUEST_VMEXIT, %eax VLEAVE ret .globl vmx_exit_guest vmx_exit_guest: /* * Save guest state that is not automatically saved in the vmcs. */ VMX_GUEST_SAVE /* * Deactivate guest pmap from this cpu. */ movq VMXCTX_PMAP(%rdi), %r11 movl PCPU(CPUID), %r10d LK btrl %r10d, PM_ACTIVE(%r11) VMX_HOST_RESTORE VMX_GUEST_CLOBBER /* * This will return to the caller of 'vmx_enter_guest()' with a return * value of VMX_GUEST_VMEXIT. */ movl $VMX_GUEST_VMEXIT, %eax VLEAVE ret END(vmx_enter_guest) /* * %rdi = interrupt handler entry point * * Calling sequence described in the "Instruction Set Reference" for the "INT" * instruction in Intel SDM, Vol 2. 
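 *
 * The handler is entered with the same stack image the CPU would build
 * for an interrupt through an IDT gate; the final callq supplies %rip.
 * Laid out from the lowest address upward (illustrative struct, not a
 * kernel type):
 */

#include <stdint.h>

struct intr_frame {
	uint64_t rip;		/* pushed by "callq *%rdi" */
	uint64_t cs;		/* pushq $KERNEL_CS */
	uint64_t rflags;	/* pushfq */
	uint64_t rsp;		/* pushq %r11 (the saved stack pointer) */
	uint64_t ss;		/* pushq $KERNEL_SS */
};

/*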
*/ ENTRY(vmx_call_isr) VENTER mov %rsp, %r11 /* save %rsp */ and $~0xf, %rsp /* align on 16-byte boundary */ pushq $KERNEL_SS /* %ss */ pushq %r11 /* %rsp */ pushfq /* %rflags */ pushq $KERNEL_CS /* %cs */ cli /* disable interrupts */ callq *%rdi /* push %rip and call isr */ VLEAVE ret END(vmx_call_isr) Index: head/sys/amd64/vmm/io/vatpic.c =================================================================== --- head/sys/amd64/vmm/io/vatpic.c (revision 335029) +++ head/sys/amd64/vmm/io/vatpic.c (revision 335030) @@ -1,808 +1,810 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vioapic.h" #include "vatpic.h" static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); #define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx)) #define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx)) #define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx)) enum irqstate { IRQSTATE_ASSERT, IRQSTATE_DEASSERT, IRQSTATE_PULSE }; struct atpic { bool ready; int icw_num; int rd_cmd_reg; bool aeoi; bool poll; bool rotate; bool sfn; /* special fully-nested mode */ int irq_base; uint8_t request; /* Interrupt Request Register (IIR) */ uint8_t service; /* Interrupt Service (ISR) */ uint8_t mask; /* Interrupt Mask Register (IMR) */ uint8_t smm; /* special mask mode */ int acnt[8]; /* sum of pin asserts and deasserts */ int lowprio; /* lowest priority irq */ bool intr_raised; }; struct vatpic { struct vm *vm; struct mtx mtx; struct atpic atpic[2]; uint8_t elc[2]; }; #define VATPIC_CTR0(vatpic, fmt) \ VM_CTR0((vatpic)->vm, fmt) #define VATPIC_CTR1(vatpic, fmt, a1) \ VM_CTR1((vatpic)->vm, fmt, a1) #define VATPIC_CTR2(vatpic, fmt, a1, a2) \ VM_CTR2((vatpic)->vm, fmt, a1, a2) #define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) #define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) /* * Loop over all the pins in priority order from highest to lowest. 
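 *
 * With lowprio = 5 the walk below visits the pins in the order
 * 6 7 0 1 2 3 4 5, i.e. highest priority first.  A runnable expansion
 * of the same loop:
 */

#include <stdio.h>

int
main(void)
{
	int lowprio = 5, pin, tmp;

	for (tmp = 0, pin = (lowprio + 1) & 0x7; tmp < 8;
	    tmp++, pin = (pin + 1) & 0x7)
		printf("%d ", pin);	/* prints: 6 7 0 1 2 3 4 5 */
	printf("\n");
	return (0);
}

/*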
*/ #define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ tmpvar < 8; \ tmpvar++, pinvar = (pinvar + 1) & 0x7) static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); static __inline bool master_atpic(struct vatpic *vatpic, struct atpic *atpic) { if (atpic == &vatpic->atpic[0]) return (true); else return (false); } static __inline int vatpic_get_highest_isrpin(struct atpic *atpic) { int bit, pin; int i; ATPIC_PIN_FOREACH(pin, atpic, i) { bit = (1 << pin); if (atpic->service & bit) { /* * An IS bit that is masked by an IMR bit will not be * cleared by a non-specific EOI in Special Mask Mode. */ if (atpic->smm && (atpic->mask & bit) != 0) continue; else return (pin); } } return (-1); } static __inline int vatpic_get_highest_irrpin(struct atpic *atpic) { int serviced; int bit, pin, tmp; /* * In 'Special Fully-Nested Mode' when an interrupt request from * a slave is in service, the slave is not locked out from the * master's priority logic. */ serviced = atpic->service; if (atpic->sfn) serviced &= ~(1 << 2); /* * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits * further interrupts at that level and enables interrupts from all * other levels that are not masked. In other words the ISR has no * bearing on the levels that can generate interrupts. */ if (atpic->smm) serviced = 0; ATPIC_PIN_FOREACH(pin, atpic, tmp) { bit = 1 << pin; /* * If there is already an interrupt in service at the same * or higher priority then bail. */ if ((serviced & bit) != 0) break; /* * If an interrupt is asserted and not masked then return * the corresponding 'pin' to the caller. */ if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) return (pin); } return (-1); } static void vatpic_notify_intr(struct vatpic *vatpic) { struct atpic *atpic; int pin; KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked")); /* * First check the slave. */ atpic = &vatpic->atpic[1]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * Cascade the request from the slave to the master. */ atpic->intr_raised = true; vatpic_set_pinstate(vatpic, 2, true); vatpic_set_pinstate(vatpic, 2, false); } else { VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } /* * Then check the master. */ atpic = &vatpic->atpic[0]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic master notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * From Section 3.6.2, "Interrupt Modes", in the * MPtable Specification, Version 1.4 * * PIC interrupts are routed to both the Local APIC * and the I/O APIC to support operation in 1 of 3 * modes. * * 1. Legacy PIC Mode: the PIC effectively bypasses * all APIC components. In this mode the local APIC is * disabled and LINT0 is reconfigured as INTR to * deliver the PIC interrupt directly to the CPU. * * 2. Virtual Wire Mode: the APIC is treated as a * virtual wire which delivers interrupts from the PIC * to the CPU. In this mode LINT0 is programmed as * ExtINT to indicate that the PIC is the source of * the interrupt. * * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are * fielded by the I/O APIC and delivered to the appropriate * CPU. 
In this mode the I/O APIC input 0 is programmed * as ExtINT to indicate that the PIC is the source of the * interrupt. */ atpic->intr_raised = true; lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); vioapic_pulse_irq(vatpic->vm, 0); } else { VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } } static int vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); atpic->ready = false; atpic->icw_num = 1; atpic->request = 0; atpic->mask = 0; atpic->lowprio = 7; atpic->rd_cmd_reg = 0; atpic->poll = 0; atpic->smm = 0; if ((val & ICW1_SNGL) != 0) { VATPIC_CTR0(vatpic, "vatpic cascade mode required"); return (-1); } if ((val & ICW1_IC4) == 0) { VATPIC_CTR0(vatpic, "vatpic icw4 required"); return (-1); } atpic->icw_num++; return (0); } static int vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); atpic->irq_base = val & 0xf8; atpic->icw_num++; return (0); } static int vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); atpic->icw_num++; return (0); } static int vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); if ((val & ICW4_8086) == 0) { VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); return (-1); } if ((val & ICW4_AEOI) != 0) atpic->aeoi = true; if ((val & ICW4_SFNM) != 0) { if (master_atpic(vatpic, atpic)) { atpic->sfn = true; } else { VATPIC_CTR1(vatpic, "Ignoring special fully nested " "mode on slave atpic: %#x", val); } } atpic->icw_num = 0; atpic->ready = true; return (0); } static int vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); atpic->mask = val & 0xff; return (0); } static int vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); atpic->rotate = ((val & OCW2_R) != 0); if ((val & OCW2_EOI) != 0) { int isr_bit; if ((val & OCW2_SL) != 0) { /* specific EOI */ isr_bit = val & 0x7; } else { /* non-specific EOI */ isr_bit = vatpic_get_highest_isrpin(atpic); } if (isr_bit != -1) { atpic->service &= ~(1 << isr_bit); if (atpic->rotate) atpic->lowprio = isr_bit; } } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { /* specific priority */ atpic->lowprio = val & 0x7; } return (0); } static int vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); if (val & OCW3_ESMM) { atpic->smm = val & OCW3_SMM ? 1 : 0; VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", master_atpic(vatpic, atpic) ? "master" : "slave", atpic->smm ? 
"enabled" : "disabled"); } if (val & OCW3_RR) { /* read register command */ atpic->rd_cmd_reg = val & OCW3_RIS; /* Polling mode */ atpic->poll = ((val & OCW3_P) != 0); } return (0); } static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) { struct atpic *atpic; int oldcnt, newcnt; bool level; KASSERT(pin >= 0 && pin < 16, ("vatpic_set_pinstate: invalid pin number %d", pin)); KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_set_pinstate: vatpic is not locked")); atpic = &vatpic->atpic[pin >> 3]; oldcnt = atpic->acnt[pin & 0x7]; if (newstate) atpic->acnt[pin & 0x7]++; else atpic->acnt[pin & 0x7]--; newcnt = atpic->acnt[pin & 0x7]; if (newcnt < 0) { VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); } level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { /* rising edge or level */ VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); atpic->request |= (1 << (pin & 0x7)); } else if (oldcnt == 1 && newcnt == 0) { /* falling edge */ VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); if (level) atpic->request &= ~(1 << (pin & 0x7)); } else { VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", pin, newstate ? "asserted" : "deasserted", newcnt); } vatpic_notify_intr(vatpic); } static int vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) { struct vatpic *vatpic; struct atpic *atpic; if (irq < 0 || irq > 15) return (EINVAL); vatpic = vm_atpic(vm); atpic = &vatpic->atpic[irq >> 3]; if (atpic->ready == false) return (0); VATPIC_LOCK(vatpic); switch (irqstate) { case IRQSTATE_ASSERT: vatpic_set_pinstate(vatpic, irq, true); break; case IRQSTATE_DEASSERT: vatpic_set_pinstate(vatpic, irq, false); break; case IRQSTATE_PULSE: vatpic_set_pinstate(vatpic, irq, true); vatpic_set_pinstate(vatpic, irq, false); break; default: panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); } VATPIC_UNLOCK(vatpic); return (0); } int vatpic_assert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); } int vatpic_deassert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); } int vatpic_pulse_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) { struct vatpic *vatpic; if (irq < 0 || irq > 15) return (EINVAL); /* * See comment in vatpic_elc_handler. These IRQs must be * edge triggered. */ if (trigger == LEVEL_TRIGGER) { switch (irq) { case 0: case 1: case 2: case 8: case 13: return (EINVAL); } } vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); if (trigger == LEVEL_TRIGGER) vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); else vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); VATPIC_UNLOCK(vatpic); return (0); } void vatpic_pending_intr(struct vm *vm, int *vecptr) { struct vatpic *vatpic; struct atpic *atpic; int pin; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; VATPIC_LOCK(vatpic); pin = vatpic_get_highest_irrpin(atpic); if (pin == 2) { atpic = &vatpic->atpic[1]; pin = vatpic_get_highest_irrpin(atpic); } /* * If there are no pins active at this moment then return the spurious * interrupt vector instead. 
*/ if (pin == -1) pin = 7; KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); *vecptr = atpic->irq_base + pin; VATPIC_UNLOCK(vatpic); } static void vatpic_pin_accepted(struct atpic *atpic, int pin) { atpic->intr_raised = false; if (atpic->acnt[pin] == 0) atpic->request &= ~(1 << pin); if (atpic->aeoi == true) { if (atpic->rotate == true) atpic->lowprio = pin; } else { atpic->service |= (1 << pin); } } void vatpic_intr_accepted(struct vm *vm, int vector) { struct vatpic *vatpic; int pin; vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); pin = vector & 0x7; if ((vector & ~0x7) == vatpic->atpic[1].irq_base) { vatpic_pin_accepted(&vatpic->atpic[1], pin); /* * If this vector originated from the slave, * accept the cascaded interrupt too. */ vatpic_pin_accepted(&vatpic->atpic[0], 2); } else { vatpic_pin_accepted(&vatpic->atpic[0], pin); } vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); } static int vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int pin; VATPIC_LOCK(vatpic); if (atpic->poll) { atpic->poll = 0; pin = vatpic_get_highest_irrpin(atpic); if (pin >= 0) { vatpic_pin_accepted(atpic, pin); *eax = 0x80 | pin; } else { *eax = 0; } } else { if (port & ICU_IMR_OFFSET) { /* read interrupt mask register */ *eax = atpic->mask; } else { if (atpic->rd_cmd_reg == OCW3_RIS) { /* read interrupt service register */ *eax = atpic->service; } else { /* read interrupt request register */ *eax = atpic->request; } } } VATPIC_UNLOCK(vatpic); return (0); } static int vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int error; uint8_t val; error = 0; val = *eax; VATPIC_LOCK(vatpic); if (port & ICU_IMR_OFFSET) { switch (atpic->icw_num) { case 2: error = vatpic_icw2(vatpic, atpic, val); break; case 3: error = vatpic_icw3(vatpic, atpic, val); break; case 4: error = vatpic_icw4(vatpic, atpic, val); break; default: error = vatpic_ocw1(vatpic, atpic, val); break; } } else { if (val & (1 << 4)) error = vatpic_icw1(vatpic, atpic, val); if (atpic->ready) { if (val & (1 << 3)) error = vatpic_ocw3(vatpic, atpic, val); else error = vatpic_ocw2(vatpic, atpic, val); } } if (atpic->ready) vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); return (error); } int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[1]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; bool is_master; vatpic = vm_atpic(vm); is_master = (port == IO_ELCR1); if (bytes != 1) return (-1); VATPIC_LOCK(vatpic); if (in) { if (is_master) *eax = vatpic->elc[0]; else *eax = vatpic->elc[1]; } else { /* * For the master PIC the cascade channel (IRQ2), the * heartbeat timer (IRQ0), and the keyboard * controller (IRQ1) cannot be programmed for level * mode.
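* This is why the write below masks the master ELC register with 0xf8, keeping bits 0-2 (IRQ0-IRQ2) pinned to edge triggered mode.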
* * For the slave PIC the real time clock (IRQ8) and * the floating point error interrupt (IRQ13) cannot * be programmed for level mode. */ if (is_master) vatpic->elc[0] = (*eax & 0xf8); else vatpic->elc[1] = (*eax & 0xde); } VATPIC_UNLOCK(vatpic); return (0); } struct vatpic * vatpic_init(struct vm *vm) { struct vatpic *vatpic; vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); vatpic->vm = vm; mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN); return (vatpic); } void vatpic_cleanup(struct vatpic *vatpic) { free(vatpic, M_VATPIC); } Index: head/sys/amd64/vmm/io/vatpit.h =================================================================== --- head/sys/amd64/vmm/io/vatpit.h (revision 335029) +++ head/sys/amd64/vmm/io/vatpit.h (revision 335030) @@ -1,45 +1,47 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VATPIT_H_ #define _VATPIT_H_ #include #define NMISC_PORT 0x61 struct vatpit *vatpit_init(struct vm *vm); void vatpit_cleanup(struct vatpit *vatpit); int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); #endif /* _VATPIT_H_ */ Index: head/sys/amd64/vmm/io/vpmtmr.c =================================================================== --- head/sys/amd64/vmm/io/vpmtmr.c (revision 335029) +++ head/sys/amd64/vmm/io/vpmtmr.c (revision 335030) @@ -1,103 +1,105 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "vpmtmr.h" /* * The ACPI Power Management timer is a free-running 24- or 32-bit * timer with a frequency of 3.579545MHz * * This implementation will be 32-bits */ #define PMTMR_FREQ 3579545 /* 3.579545MHz */ struct vpmtmr { sbintime_t freq_sbt; sbintime_t baseuptime; uint32_t baseval; }; static MALLOC_DEFINE(M_VPMTMR, "vpmtmr", "bhyve virtual acpi timer"); struct vpmtmr * vpmtmr_init(struct vm *vm) { struct vpmtmr *vpmtmr; struct bintime bt; vpmtmr = malloc(sizeof(struct vpmtmr), M_VPMTMR, M_WAITOK | M_ZERO); vpmtmr->baseuptime = sbinuptime(); vpmtmr->baseval = 0; FREQ2BT(PMTMR_FREQ, &bt); vpmtmr->freq_sbt = bttosbt(bt); return (vpmtmr); } void vpmtmr_cleanup(struct vpmtmr *vpmtmr) { free(vpmtmr, M_VPMTMR); } int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val) { struct vpmtmr *vpmtmr; sbintime_t now, delta; if (!in || bytes != 4) return (-1); vpmtmr = vm_pmtmr(vm); /* * No locking needed because 'baseuptime' and 'baseval' are * written only during initialization. */ now = sbinuptime(); delta = now - vpmtmr->baseuptime; KASSERT(delta >= 0, ("vpmtmr_handler: uptime went backwards: " "%#lx to %#lx", vpmtmr->baseuptime, now)); *val = vpmtmr->baseval + delta / vpmtmr->freq_sbt; return (0); } Index: head/sys/amd64/vmm/io/vpmtmr.h =================================================================== --- head/sys/amd64/vmm/io/vpmtmr.h (revision 335029) +++ head/sys/amd64/vmm/io/vpmtmr.h (revision 335030) @@ -1,42 +1,44 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VPMTMR_H_ #define _VPMTMR_H_ #define IO_PMTMR 0x408 struct vpmtmr; struct vpmtmr *vpmtmr_init(struct vm *vm); void vpmtmr_cleanup(struct vpmtmr *pmtmr); int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); #endif Index: head/sys/amd64/vmm/io/vrtc.c =================================================================== --- head/sys/amd64/vmm/io/vrtc.c (revision 335029) +++ head/sys/amd64/vmm/io/vrtc.c (revision 335030) @@ -1,1019 +1,1021 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ktr.h" #include "vatpic.h" #include "vioapic.h" #include "vrtc.h" /* Register layout of the RTC */ struct rtcdev { uint8_t sec; uint8_t alarm_sec; uint8_t min; uint8_t alarm_min; uint8_t hour; uint8_t alarm_hour; uint8_t day_of_week; uint8_t day_of_month; uint8_t month; uint8_t year; uint8_t reg_a; uint8_t reg_b; uint8_t reg_c; uint8_t reg_d; uint8_t nvram[36]; uint8_t century; uint8_t nvram2[128 - 51]; } __packed; CTASSERT(sizeof(struct rtcdev) == 128); CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); struct vrtc { struct vm *vm; struct mtx mtx; struct callout callout; u_int addr; /* RTC register to read or write */ sbintime_t base_uptime; time_t base_rtctime; struct rtcdev rtcdev; }; #define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) #define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) #define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) /* * RTC time is considered "broken" if: * - RTC updates are halted by the guest * - RTC date/time fields have invalid values */ #define VRTC_BROKEN_TIME ((time_t)-1) #define RTC_IRQ 8 #define RTCSB_BIN 0x04 #define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) #define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) #define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) #define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) #define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) static void vrtc_callout_handler(void *arg); static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW, NULL, NULL); static int rtc_flag_broken_time = 1; SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); static __inline bool divider_enabled(int reg_a) { /* * The RTC is counting only when dividers are not held in reset. */ return ((reg_a & 0x70) == 0x20); } static __inline bool update_enabled(struct vrtc *vrtc) { /* * RTC date/time can be updated only if: * - divider is not held in reset * - guest has not disabled updates * - the date/time fields have valid contents */ if (!divider_enabled(vrtc->rtcdev.reg_a)) return (false); if (rtc_halted(vrtc)) return (false); if (vrtc->base_rtctime == VRTC_BROKEN_TIME) return (false); return (true); } static time_t vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) { sbintime_t now, delta; time_t t, secs; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); t = vrtc->base_rtctime; *basetime = vrtc->base_uptime; if (update_enabled(vrtc)) { now = sbinuptime(); delta = now - vrtc->base_uptime; KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " "%#lx to %#lx", vrtc->base_uptime, now)); secs = delta / SBT_1S; t += secs; *basetime += secs * SBT_1S; } return (t); } static __inline uint8_t rtcset(struct rtcdev *rtc, int val) { KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", __func__, val)); return ((rtc->reg_b & RTCSB_BIN) ? 
val : bin2bcd_data[val]); } static void secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) { struct clocktime ct; struct timespec ts; struct rtcdev *rtc; int hour; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); if (rtctime < 0) { KASSERT(rtctime == VRTC_BROKEN_TIME, ("%s: invalid vrtc time %#lx", __func__, rtctime)); return; } /* * If the RTC is halted then the guest has "ownership" of the * date/time fields. Don't update the RTC date/time fields in * this case (unless forced). */ if (rtc_halted(vrtc) && !force_update) return; ts.tv_sec = rtctime; ts.tv_nsec = 0; clock_ts_to_ct(&ts, &ct); KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", ct.sec)); KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", ct.min)); KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", ct.hour)); KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", ct.dow)); KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", ct.day)); KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", ct.mon)); KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", ct.year)); rtc = &vrtc->rtcdev; rtc->sec = rtcset(rtc, ct.sec); rtc->min = rtcset(rtc, ct.min); if (rtc->reg_b & RTCSB_24HR) { hour = ct.hour; } else { /* * Convert to the 12-hour format. */ switch (ct.hour) { case 0: /* 12 AM */ case 12: /* 12 PM */ hour = 12; break; default: /* * The remaining 'ct.hour' values are interpreted as: * [1 - 11] -> 1 - 11 AM * [13 - 23] -> 1 - 11 PM */ hour = ct.hour % 12; break; } } rtc->hour = rtcset(rtc, hour); if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) rtc->hour |= 0x80; /* set MSB to indicate PM */ rtc->day_of_week = rtcset(rtc, ct.dow + 1); rtc->day_of_month = rtcset(rtc, ct.day); rtc->month = rtcset(rtc, ct.mon); rtc->year = rtcset(rtc, ct.year % 100); rtc->century = rtcset(rtc, ct.year / 100); } static int rtcget(struct rtcdev *rtc, int val, int *retval) { uint8_t upper, lower; if (rtc->reg_b & RTCSB_BIN) { *retval = val; return (0); } lower = val & 0xf; upper = (val >> 4) & 0xf; if (lower > 9 || upper > 9) return (-1); *retval = upper * 10 + lower; return (0); } static time_t rtc_to_secs(struct vrtc *vrtc) { struct clocktime ct; struct timespec ts; struct rtcdev *rtc; struct vm *vm; int century, error, hour, pm, year; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); vm = vrtc->vm; rtc = &vrtc->rtcdev; bzero(&ct, sizeof(struct clocktime)); error = rtcget(rtc, rtc->sec, &ct.sec); if (error || ct.sec < 0 || ct.sec > 59) { VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); goto fail; } error = rtcget(rtc, rtc->min, &ct.min); if (error || ct.min < 0 || ct.min > 59) { VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); goto fail; } pm = 0; hour = rtc->hour; if ((rtc->reg_b & RTCSB_24HR) == 0) { if (hour & 0x80) { hour &= ~0x80; pm = 1; } } error = rtcget(rtc, hour, &ct.hour); if ((rtc->reg_b & RTCSB_24HR) == 0) { if (ct.hour >= 1 && ct.hour <= 12) { /* * Convert from 12-hour format to internal 24-hour * representation as follows: * * 12-hour format ct.hour * 12 AM 0 * 1 - 11 AM 1 - 11 * 12 PM 12 * 1 - 11 PM 13 - 23 */ if (ct.hour == 12) ct.hour = 0; if (pm) ct.hour += 12; } else { VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", rtc->hour, ct.hour); goto fail; } } if (error || ct.hour < 0 || ct.hour > 23) { VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); goto fail; } /* * Ignore 'rtc->dow' because some guests like Linux don't bother * setting it at all while others like 
OpenBSD/i386 set it incorrectly. * * clock_ct_to_ts() does not depend on 'ct.dow' anyway so ignore it. */ ct.dow = -1; error = rtcget(rtc, rtc->day_of_month, &ct.day); if (error || ct.day < 1 || ct.day > 31) { VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, ct.day); goto fail; } error = rtcget(rtc, rtc->month, &ct.mon); if (error || ct.mon < 1 || ct.mon > 12) { VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); goto fail; } error = rtcget(rtc, rtc->year, &year); if (error || year < 0 || year > 99) { VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); goto fail; } error = rtcget(rtc, rtc->century, &century); ct.year = century * 100 + year; if (error || ct.year < POSIX_BASE_YEAR) { VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, ct.year); goto fail; } error = clock_ct_to_ts(&ct, &ts); if (error || ts.tv_sec < 0) { VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", ct.year, ct.mon, ct.day); VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", ct.hour, ct.min, ct.sec); goto fail; } return (ts.tv_sec); /* success */ fail: /* * Stop updating the RTC if the date/time fields programmed by * the guest are invalid. */ VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); return (VRTC_BROKEN_TIME); } static int vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) { struct rtcdev *rtc; sbintime_t oldbase; time_t oldtime; uint8_t alarm_sec, alarm_min, alarm_hour; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; alarm_sec = rtc->alarm_sec; alarm_min = rtc->alarm_min; alarm_hour = rtc->alarm_hour; oldtime = vrtc->base_rtctime; VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", oldtime, newtime); oldbase = vrtc->base_uptime; VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", oldbase, newbase); vrtc->base_uptime = newbase; if (newtime == oldtime) return (0); /* * If 'newtime' indicates that RTC updates are disabled then just * record that and return. There is no need to do alarm interrupt * processing in this case. */ if (newtime == VRTC_BROKEN_TIME) { vrtc->base_rtctime = VRTC_BROKEN_TIME; return (0); } /* * Return an error if RTC updates are halted by the guest. */ if (rtc_halted(vrtc)) { VM_CTR0(vrtc->vm, "RTC update halted by guest"); return (EBUSY); } do { /* * If the alarm interrupt is enabled and 'oldtime' is valid * then visit all the seconds between 'oldtime' and 'newtime' * to check for the alarm condition. * * Otherwise move the RTC time forward directly to 'newtime'. */ if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) vrtc->base_rtctime++; else vrtc->base_rtctime = newtime; if (aintr_enabled(vrtc)) { /* * Update the RTC date/time fields before checking * if the alarm conditions are satisfied.
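* An alarm byte with its two most-significant bits set (0xC0 - 0xFF) is a "don't care" wildcard that matches any value, which is what the '>= 0xC0' comparisons below implement.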
*/ secs_to_rtc(vrtc->base_rtctime, vrtc, 0); if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && (alarm_min >= 0xC0 || alarm_min == rtc->min) && (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); } } } while (vrtc->base_rtctime != newtime); if (uintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); return (0); } static sbintime_t vrtc_freq(struct vrtc *vrtc) { int ratesel; static sbintime_t pf[16] = { 0, SBT_1S / 256, SBT_1S / 128, SBT_1S / 8192, SBT_1S / 4096, SBT_1S / 2048, SBT_1S / 1024, SBT_1S / 512, SBT_1S / 256, SBT_1S / 128, SBT_1S / 64, SBT_1S / 32, SBT_1S / 16, SBT_1S / 8, SBT_1S / 4, SBT_1S / 2, }; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); /* * If both periodic and alarm interrupts are enabled then use the * periodic frequency to drive the callout. The minimum periodic * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so * piggyback the alarm on top of it. The same argument applies to * the update interrupt. */ if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { ratesel = vrtc->rtcdev.reg_a & 0xf; return (pf[ratesel]); } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { return (SBT_1S); } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { return (SBT_1S); } else { return (0); } } static void vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) { KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); if (freqsbt == 0) { if (callout_active(&vrtc->callout)) { VM_CTR0(vrtc->vm, "RTC callout stopped"); callout_stop(&vrtc->callout); } return; } VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, vrtc, 0); } static void vrtc_callout_handler(void *arg) { struct vrtc *vrtc = arg; sbintime_t freqsbt, basetime; time_t rtctime; int error; VM_CTR0(vrtc->vm, "vrtc callout fired"); VRTC_LOCK(vrtc); if (callout_pending(&vrtc->callout)) /* callout was reset */ goto done; if (!callout_active(&vrtc->callout)) /* callout was stopped */ goto done; callout_deactivate(&vrtc->callout); KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, ("gratuitous vrtc callout")); if (pintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { rtctime = vrtc_curtime(vrtc, &basetime); error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("%s: vrtc_time_update error %d", __func__, error)); } freqsbt = vrtc_freq(vrtc); KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); vrtc_callout_reset(vrtc, freqsbt); done: VRTC_UNLOCK(vrtc); } static __inline void vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) { int active; active = callout_active(&vrtc->callout) ? 1 : 0; KASSERT((freq == 0 && !active) || (freq != 0 && active), ("vrtc callout %s with frequency %#lx", active ? 
"active" : "inactive", freq)); } static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; int oldirqf, newirqf; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; oldirqf = rtc->reg_c & RTCIR_INT; if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { newirqf = RTCIR_INT; } else { newirqf = 0; } oldval = rtc->reg_c; rtc->reg_c = newirqf | newval; changed = oldval ^ rtc->reg_c; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", oldval, rtc->reg_c); } if (!oldirqf && newirqf) { VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); vatpic_pulse_irq(vrtc->vm, RTC_IRQ); vioapic_pulse_irq(vrtc->vm, RTC_IRQ); } else if (oldirqf && !newirqf) { VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); } } static int vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; sbintime_t oldfreq, newfreq, basetime; time_t curtime, rtctime; int error; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; oldval = rtc->reg_b; oldfreq = vrtc_freq(vrtc); rtc->reg_b = newval; changed = oldval ^ newval; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", oldval, newval); } if (changed & RTCSB_HALT) { if ((newval & RTCSB_HALT) == 0) { rtctime = rtc_to_secs(vrtc); basetime = sbinuptime(); if (rtctime == VRTC_BROKEN_TIME) { if (rtc_flag_broken_time) return (-1); } } else { curtime = vrtc_curtime(vrtc, &basetime); KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " "between vrtc basetime (%#lx) and curtime (%#lx)", __func__, vrtc->base_rtctime, curtime)); /* * Force a refresh of the RTC date/time fields so * they reflect the time right before the guest set * the HALT bit. */ secs_to_rtc(curtime, vrtc, 1); /* * Updates are halted so mark 'base_rtctime' to denote * that the RTC date/time is in flux. */ rtctime = VRTC_BROKEN_TIME; rtc->reg_b &= ~RTCSB_UINTR; } error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("vrtc_time_update error %d", error)); } /* * Side effect of changes to the interrupt enable bits. */ if (changed & RTCSB_ALL_INTRS) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); /* * Change the callout frequency if it has changed. */ newfreq = vrtc_freq(vrtc); if (newfreq != oldfreq) vrtc_callout_reset(vrtc, newfreq); else vrtc_callout_check(vrtc, newfreq); /* * The side effect of bits that control the RTC date/time format * is handled lazily when those fields are actually read. */ return (0); } static void vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) { sbintime_t oldfreq, newfreq; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); newval &= ~RTCSA_TUP; oldval = vrtc->rtcdev.reg_a; oldfreq = vrtc_freq(vrtc); if (divider_enabled(oldval) && !divider_enabled(newval)) { VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", vrtc->base_rtctime, vrtc->base_uptime); } else if (!divider_enabled(oldval) && divider_enabled(newval)) { /* * If the dividers are coming out of reset then update * 'base_uptime' before this happens. This is done to * maintain the illusion that the RTC date/time was frozen * while the dividers were disabled. 
*/ vrtc->base_uptime = sbinuptime(); VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", vrtc->base_rtctime, vrtc->base_uptime); } else { /* NOTHING */ } vrtc->rtcdev.reg_a = newval; changed = oldval ^ newval; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", oldval, newval); } /* * Side effect of changes to rate select and divider enable bits. */ newfreq = vrtc_freq(vrtc); if (newfreq != oldfreq) vrtc_callout_reset(vrtc, newfreq); else vrtc_callout_check(vrtc, newfreq); } int vrtc_set_time(struct vm *vm, time_t secs) { struct vrtc *vrtc; int error; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); error = vrtc_time_update(vrtc, secs, sbinuptime()); VRTC_UNLOCK(vrtc); if (error) { VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, secs); } else { VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); } return (error); } time_t vrtc_get_time(struct vm *vm) { struct vrtc *vrtc; sbintime_t basetime; time_t t; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); t = vrtc_curtime(vrtc, &basetime); VRTC_UNLOCK(vrtc); return (t); } int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) { struct vrtc *vrtc; uint8_t *ptr; vrtc = vm_rtc(vm); /* * Don't allow writes to RTC control registers or the date/time fields. */ if (offset < offsetof(struct rtcdev, nvram[0]) || offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", offset); return (EINVAL); } VRTC_LOCK(vrtc); ptr = (uint8_t *)(&vrtc->rtcdev); ptr[offset] = value; VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); VRTC_UNLOCK(vrtc); return (0); } int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) { struct vrtc *vrtc; sbintime_t basetime; time_t curtime; uint8_t *ptr; /* * Allow all offsets in the RTC to be read. */ if (offset < 0 || offset >= sizeof(struct rtcdev)) return (EINVAL); vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); /* * Update RTC date/time fields if necessary. */ if (offset < 10 || offset == RTC_CENTURY) { curtime = vrtc_curtime(vrtc, &basetime); secs_to_rtc(curtime, vrtc, 0); } ptr = (uint8_t *)(&vrtc->rtcdev); *retval = ptr[offset]; VRTC_UNLOCK(vrtc); return (0); } int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val) { struct vrtc *vrtc; vrtc = vm_rtc(vm); if (bytes != 1) return (-1); if (in) { *val = 0xff; return (0); } VRTC_LOCK(vrtc); vrtc->addr = *val & 0x7f; VRTC_UNLOCK(vrtc); return (0); } int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val) { struct vrtc *vrtc; struct rtcdev *rtc; sbintime_t basetime; time_t curtime; int error, offset; vrtc = vm_rtc(vm); rtc = &vrtc->rtcdev; if (bytes != 1) return (-1); VRTC_LOCK(vrtc); offset = vrtc->addr; if (offset >= sizeof(struct rtcdev)) { VRTC_UNLOCK(vrtc); return (-1); } error = 0; curtime = vrtc_curtime(vrtc, &basetime); vrtc_time_update(vrtc, curtime, basetime); /* * Update RTC date/time fields if necessary. * * This is not just for reads of the RTC. The side-effect of writing * the century byte requires other RTC date/time fields (e.g. sec) * to be updated here. */ if (offset < 10 || offset == RTC_CENTURY) secs_to_rtc(curtime, vrtc, 0); if (in) { if (offset == 12) { /* * XXX * reg_c interrupt flags are updated only if the * corresponding interrupt enable bit in reg_b is set. 
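* On real hardware reading reg_c also clears the pending interrupt flags, which the vrtc_set_reg_c(vrtc, 0) call below emulates.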
*/ *val = vrtc->rtcdev.reg_c; vrtc_set_reg_c(vrtc, 0); } else { *val = *((uint8_t *)rtc + offset); } VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x", *val, offset); } else { switch (offset) { case 10: VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val); vrtc_set_reg_a(vrtc, *val); break; case 11: VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val); error = vrtc_set_reg_b(vrtc, *val); break; case 12: VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)", *val); break; case 13: VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)", *val); break; case 0: /* * High order bit of 'seconds' is readonly. */ *val &= 0x7f; /* FALLTHRU */ default: VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x", offset, *val); *((uint8_t *)rtc + offset) = *val; break; } /* * XXX some guests (e.g. OpenBSD) write the century byte * outside of RTCSB_HALT so re-calculate the RTC date/time. */ if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { curtime = rtc_to_secs(vrtc); error = vrtc_time_update(vrtc, curtime, sbinuptime()); KASSERT(!error, ("vrtc_time_update error %d", error)); if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) error = -1; } } VRTC_UNLOCK(vrtc); return (error); } void vrtc_reset(struct vrtc *vrtc) { struct rtcdev *rtc; VRTC_LOCK(vrtc); rtc = &vrtc->rtcdev; vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); vrtc_set_reg_c(vrtc, 0); KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); VRTC_UNLOCK(vrtc); } struct vrtc * vrtc_init(struct vm *vm) { struct vrtc *vrtc; struct rtcdev *rtc; time_t curtime; vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); vrtc->vm = vm; mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); callout_init(&vrtc->callout, 1); /* Allow dividers to keep time but disable everything else */ rtc = &vrtc->rtcdev; rtc->reg_a = 0x20; rtc->reg_b = RTCSB_24HR; rtc->reg_c = 0; rtc->reg_d = RTCSD_PWR; /* Reset the index register to a safe value. */ vrtc->addr = RTC_STATUSD; /* * Initialize RTC time to 00:00:00 Jan 1, 1970. */ curtime = 0; VRTC_LOCK(vrtc); vrtc->base_rtctime = VRTC_BROKEN_TIME; vrtc_time_update(vrtc, curtime, sbinuptime()); secs_to_rtc(curtime, vrtc, 0); VRTC_UNLOCK(vrtc); return (vrtc); } void vrtc_cleanup(struct vrtc *vrtc) { callout_drain(&vrtc->callout); free(vrtc, M_VRTC); } Index: head/sys/amd64/vmm/io/vrtc.h =================================================================== --- head/sys/amd64/vmm/io/vrtc.h (revision 335029) +++ head/sys/amd64/vmm/io/vrtc.h (revision 335030) @@ -1,50 +1,52 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VRTC_H_ #define _VRTC_H_ #include struct vrtc; struct vrtc *vrtc_init(struct vm *vm); void vrtc_cleanup(struct vrtc *vrtc); void vrtc_reset(struct vrtc *vrtc); time_t vrtc_get_time(struct vm *vm); int vrtc_set_time(struct vm *vm, time_t secs); int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); #endif Index: head/sys/amd64/vmm/vmm_ioport.c =================================================================== --- head/sys/amd64/vmm/vmm_ioport.c (revision 335029) +++ head/sys/amd64/vmm/vmm_ioport.c (revision 335030) @@ -1,176 +1,178 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include "vatpic.h" #include "vatpit.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_ioport.h" #include "vmm_ktr.h" #define MAX_IOPORTS 1280 ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { [TIMER_MODE] = vatpit_handler, [TIMER_CNTR0] = vatpit_handler, [TIMER_CNTR1] = vatpit_handler, [TIMER_CNTR2] = vatpit_handler, [NMISC_PORT] = vatpit_nmisc_handler, [IO_ICU1] = vatpic_master_handler, [IO_ICU1 + ICU_IMR_OFFSET] = vatpic_master_handler, [IO_ICU2] = vatpic_slave_handler, [IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler, [IO_ELCR1] = vatpic_elc_handler, [IO_ELCR2] = vatpic_elc_handler, [IO_PMTMR] = vpmtmr_handler, [IO_RTC] = vrtc_addr_handler, [IO_RTC + 1] = vrtc_data_handler, }; #ifdef KTR static const char * inout_instruction(struct vm_exit *vmexit) { int index; static const char *iodesc[] = { "outb", "outw", "outl", "inb", "inw", "inl", "outsb", "outsw", "outsd", "insb", "insw", "insd", }; switch (vmexit->u.inout.bytes) { case 1: index = 0; break; case 2: index = 1; break; default: index = 2; break; } if (vmexit->u.inout.in) index += 3; if (vmexit->u.inout.string) index += 6; KASSERT(index < nitems(iodesc), ("%s: invalid index %d", __func__, index)); return (iodesc[index]); } #endif /* KTR */ static int emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) { ioport_handler_func_t handler; uint32_t mask, val; int error; /* * If there is no handler for the I/O port then punt to userspace. */ if (vmexit->u.inout.port >= MAX_IOPORTS || (handler = ioport_handler[vmexit->u.inout.port]) == NULL) { *retu = true; return (0); } mask = vie_size2mask(vmexit->u.inout.bytes); if (!vmexit->u.inout.in) { val = vmexit->u.inout.eax & mask; } error = (*handler)(vm, vcpuid, vmexit->u.inout.in, vmexit->u.inout.port, vmexit->u.inout.bytes, &val); if (error) { /* * The value returned by this function is also the return value * of vm_run(). This needs to be a positive number otherwise it * can be interpreted as a "pseudo-error" like ERESTART. * * Enforce this by mapping all errors to EIO. */ return (EIO); } if (vmexit->u.inout.in) { vmexit->u.inout.eax &= ~mask; vmexit->u.inout.eax |= val & mask; error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, vmexit->u.inout.eax); KASSERT(error == 0, ("emulate_ioport: error %d setting guest " "rax register", error)); } *retu = false; return (0); } static int emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) { *retu = true; return (0); /* Return to userspace to finish emulation */ } int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) { int bytes, error; bytes = vmexit->u.inout.bytes; KASSERT(bytes == 1 || bytes == 2 || bytes == 4, ("vm_handle_inout: invalid operand size %d", bytes)); if (vmexit->u.inout.string) error = emulate_inout_str(vm, vcpuid, vmexit, retu); else error = emulate_inout_port(vm, vcpuid, vmexit, retu); VCPU_CTR4(vm, vcpuid, "%s%s 0x%04x: %s", vmexit->u.inout.rep ? "rep " : "", inout_instruction(vmexit), vmexit->u.inout.port, error ? "error" : (*retu ? "userspace" : "handled")); return (error); } Index: head/sys/amd64/vmm/vmm_ioport.h =================================================================== --- head/sys/amd64/vmm/vmm_ioport.h (revision 335029) +++ head/sys/amd64/vmm/vmm_ioport.h (revision 335030) @@ -1,37 +1,39 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_IOPORT_H_ #define _VMM_IOPORT_H_ typedef int (*ioport_handler_func_t)(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu); #endif /* _VMM_IOPORT_H_ */
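As an aside on the dispatch scheme in vmm_ioport.c above: emulate_inout_port() indexes a flat array of handler function pointers by port number and defers to userspace when no handler is registered. The minimal userspace sketch below illustrates the same pattern using the ioport_handler_func_t signature from vmm_ioport.h; the fake_pmtmr_handler, emulate_in() wrapper, and main() driver are illustrative inventions and not part of the FreeBSD sources (only the port number 0x408 mirrors IO_PMTMR).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_IOPORTS	1280

struct vm;			/* opaque stand-in for the real struct vm */

typedef int (*ioport_handler_func_t)(struct vm *vm, int vcpuid, bool in,
    int port, int bytes, uint32_t *val);

/* Illustrative handler: a 4-byte read returns a fixed value. */
static int
fake_pmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
    uint32_t *val)
{

	if (!in || bytes != 4)
		return (-1);
	*val = 0x12345678;
	return (0);
}

/* Sparse table: designated initializers register handlers by port number. */
static ioport_handler_func_t ioport_handler[MAX_IOPORTS] = {
	[0x408] = fake_pmtmr_handler,	/* IO_PMTMR in the real code */
};

static int
emulate_in(struct vm *vm, int vcpuid, int port, int bytes, uint32_t *val,
    bool *retu)
{
	ioport_handler_func_t handler;

	/* No handler registered for this port: punt to userspace. */
	if (port < 0 || port >= MAX_IOPORTS ||
	    (handler = ioport_handler[port]) == NULL) {
		*retu = true;
		return (0);
	}
	*retu = false;
	return ((*handler)(vm, vcpuid, true, port, bytes, val));
}

int
main(void)
{
	uint32_t val = 0;
	bool retu;

	if (emulate_in(NULL, 0, 0x408, 4, &val, &retu) == 0 && !retu)
		printf("in(0x408) -> %#x\n", val);
	if (emulate_in(NULL, 0, 0x80, 1, &val, &retu) == 0 && retu)
		printf("in(0x80) has no handler; deferred to userspace\n");
	return (0);
}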