diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -12095,7 +12095,7 @@
 			continue;
 		}
 		if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
-		    vm_phys_paddr_to_vm_page(pa) == NULL) {
+		    vm_phys_paddr_to_vm_page(pa, NULL) == NULL) {
 			/*
 			 * Page table pages for the large map may be
 			 * freed.  Validate the next-level address
@@ -12124,7 +12124,8 @@
 			continue;
 		}
 		if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
-		    vm_phys_paddr_to_vm_page(pa) == NULL) {
+		    vm_phys_paddr_to_vm_page(pa, NULL) ==
+			    NULL) {
 			/*
 			 * Page table pages for the large map
 			 * may be freed.  Validate the
diff --git a/sys/amd64/vmm/amd/amdsysdrv_iommu.c b/sys/amd64/vmm/amd/amdsysdrv_iommu.c
new file mode 100644
--- /dev/null
+++ b/sys/amd64/vmm/amd/amdsysdrv_iommu.c
@@ -0,0 +1,113 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2025 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "io/iommu.h" + +static int +amdsysdrv_init(void) +{ + + return (amdiommu_is_running()); +} + +static void +amdsysdrv_cleanup(void) +{ + + /* XXXKIB */ +} + +static void +amdsysdrv_enable(void) +{ + + /* XXXKIB */ +} + +static void +amdsysdrv_disable(void) +{ + + /* XXXKIB */ +} + +static bool +amdsysdrv_get_swcaps(enum iommu_swcaps cap) +{ + + switch (cap) { + case IOMMU_CAP_BULK: + return (true); + default: + return (false); + } +} + +const struct iommu_ops iommu_ops_amdsysdrv = { + .init = amdsysdrv_init, + .cleanup = amdsysdrv_cleanup, + .enable = amdsysdrv_enable, + .disable = amdsysdrv_disable, + .create_domain = geniommu_create_domain, + .destroy_domain = geniommu_destroy_domain, + .create_mapping = geniommu_create_mapping, + .create_mapping_bulk = geniommu_create_mapping_bulk, + .remove_mapping = geniommu_remove_mapping, + .add_device = geniommu_add_device, + .remove_device = geniommu_remove_device, + .invalidate_tlb = geniommu_invalidate_tlb, + .get_swcaps = amdsysdrv_get_swcaps, +}; diff --git a/sys/amd64/vmm/amd/amdvi_hw.c b/sys/amd64/vmm/amd/amdvi_hw.c --- a/sys/amd64/vmm/amd/amdvi_hw.c +++ b/sys/amd64/vmm/amd/amdvi_hw.c @@ -994,7 +994,7 @@ } static void * -amdvi_create_domain(vm_paddr_t maxaddr) +amdvi_create_domain(vm_paddr_t maxaddr, bool host_domain __unused) { struct amdvi_domain *dom; diff --git a/sys/amd64/vmm/intel/dmar_iommu.c b/sys/amd64/vmm/intel/dmar_iommu.c new file mode 100644 --- /dev/null +++ b/sys/amd64/vmm/intel/dmar_iommu.c @@ -0,0 +1,113 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2019 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "io/iommu.h" + +static int +dmar_init(void) +{ + + return (dmar_is_running()); +} + +static void +dmar_cleanup(void) +{ + + /* XXXKIB */ +} + +static void +dmar_enable(void) +{ + + /* XXXKIB */ +} + +static void +dmar_disable(void) +{ + + /* XXXKIB */ +} + +static bool +dmar_get_swcaps(enum iommu_swcaps cap) +{ + + switch (cap) { + case IOMMU_CAP_BULK: + return (true); + default: + return (false); + } +} + +const struct iommu_ops iommu_ops_dmar = { + .init = dmar_init, + .cleanup = dmar_cleanup, + .enable = dmar_enable, + .disable = dmar_disable, + .create_domain = geniommu_create_domain, + .destroy_domain = geniommu_destroy_domain, + .create_mapping = geniommu_create_mapping, + .create_mapping_bulk = geniommu_create_mapping_bulk, + .remove_mapping = geniommu_remove_mapping, + .add_device = geniommu_add_device, + .remove_device = geniommu_remove_device, + .invalidate_tlb = geniommu_invalidate_tlb, + .get_swcaps = dmar_get_swcaps, +}; diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c --- a/sys/amd64/vmm/intel/vtd.c +++ b/sys/amd64/vmm/intel/vtd.c @@ -637,7 +637,7 @@ } static void * -vtd_create_domain(vm_paddr_t maxaddr) +vtd_create_domain(vm_paddr_t maxaddr, bool host_domain __unused) { struct domain *dom; vm_paddr_t addr; diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h --- a/sys/amd64/vmm/io/iommu.h +++ b/sys/amd64/vmm/io/iommu.h @@ -29,19 +29,27 @@ #ifndef _IO_IOMMU_H_ #define _IO_IOMMU_H_ +enum iommu_swcaps { + IOMMU_CAP_NOP, + IOMMU_CAP_BULK, +}; + typedef int (*iommu_init_func_t)(void); typedef void (*iommu_cleanup_func_t)(void); typedef void (*iommu_enable_func_t)(void); typedef void (*iommu_disable_func_t)(void); -typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr, bool host_domain); typedef void (*iommu_destroy_domain_t)(void *domain); typedef int (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, uint64_t *res_len); +typedef int (*iommu_create_mapping_bulk_t)(void *domain, vm_paddr_t gpa, + struct vm_page **ma, uint64_t len); typedef int (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, uint64_t len, uint64_t *res_len); typedef int (*iommu_add_device_t)(void *domain, device_t dev, uint16_t rid); typedef int (*iommu_remove_device_t)(void *dom, device_t dev, uint16_t rid); typedef int (*iommu_invalidate_tlb_t)(void *dom); +typedef bool (*iommu_get_swcaps_t)(enum iommu_swcaps); struct iommu_ops { iommu_init_func_t init; /* module wide */ @@ -52,14 +60,40 @@ iommu_create_domain_t create_domain; /* domain-specific */ iommu_destroy_domain_t destroy_domain; iommu_create_mapping_t create_mapping; + iommu_create_mapping_bulk_t create_mapping_bulk; iommu_remove_mapping_t remove_mapping; iommu_add_device_t add_device; iommu_remove_device_t remove_device; iommu_invalidate_tlb_t invalidate_tlb; + iommu_get_swcaps_t get_swcaps; }; -extern const struct iommu_ops iommu_ops_intel; +struct geniommu_vmm_domain { + bool host_domain; + struct sx lock; + LIST_HEAD(, iommu_domain) iommu_domains; +}; + +void *geniommu_create_domain(vm_paddr_t maxaddr, bool host_domain); +void geniommu_destroy_domain(void *domain); +int geniommu_create_mapping_one(struct iommu_domain *iodom, vm_paddr_t gpa, + vm_paddr_t hpa, 
+    uint64_t len);
+int geniommu_create_mapping(void *dom1, vm_paddr_t gpa, vm_paddr_t hpa,
+    uint64_t len, uint64_t *res_len);
+int geniommu_create_mapping_bulk_one(struct iommu_domain *iodom, vm_paddr_t gpa,
+    struct vm_page **ma, uint64_t len);
+int geniommu_create_mapping_bulk(void *dom1, vm_paddr_t gpa, struct vm_page **ma,
+    uint64_t len);
+int geniommu_remove_mapping(void *dom1, vm_paddr_t gpa, uint64_t len,
+    uint64_t *res_len);
+int geniommu_add_device(void *dom1, device_t dev, uint16_t rid);
+int geniommu_remove_device(void *dom1, device_t dev, uint16_t rid);
+int geniommu_invalidate_tlb(void *dom);
+
+extern const struct iommu_ops iommu_ops_amdsysdrv;
 extern const struct iommu_ops iommu_ops_amd;
+extern const struct iommu_ops iommu_ops_dmar;
+extern const struct iommu_ops iommu_ops_intel;
 
 void iommu_cleanup(void);
 void *iommu_host_domain(void);
@@ -67,8 +101,11 @@
 void iommu_destroy_domain(void *dom);
 int iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa,
     size_t len);
+int iommu_create_mapping_bulk(void *dom, vm_paddr_t gpa,
+    struct vm_page **ma, size_t len);
 int iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len);
 int iommu_add_device(void *dom, device_t dev, uint16_t rid);
 int iommu_remove_device(void *dom, device_t dev, uint16_t rid);
 int iommu_invalidate_tlb(void *domain);
+bool iommu_get_swcaps(enum iommu_swcaps cap);
 #endif
diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c
--- a/sys/amd64/vmm/io/iommu.c
+++ b/sys/amd64/vmm/io/iommu.c
@@ -4,6 +4,12 @@
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
+ * Copyright (c) 2019, 2025 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Part of this software was developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -26,17 +32,30 @@
  * SUCH DAMAGE.
  */
-#include
+#include "opt_iommu.h"
+
+#include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
-#include
+#include
 #include
 #include
 #include
 #include
+#include
+
+#include
+#include
+#include
 
 #include "vmm_util.h"
 #include "vmm_mem.h"
@@ -77,11 +96,11 @@
 }
 
 static __inline void *
-IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr)
+IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr, bool host_domain)
 {
 
     if (ops != NULL && iommu_avail)
-        return ((*ops->create_domain)(maxaddr));
+        return ((*ops->create_domain)(maxaddr, host_domain));
     else
         return (NULL);
 }
@@ -104,6 +123,16 @@
     return (EOPNOTSUPP);
 }
 
+static __inline int
+IOMMU_CREATE_MAPPING_BULK(void *domain, vm_paddr_t gpa, struct vm_page **ma,
+    size_t len)
+{
+
+    if (ops != NULL && iommu_avail)
+        return ((*ops->create_mapping_bulk)(domain, gpa, ma, len));
+    return (EOPNOTSUPP);
+}
+
 static __inline uint64_t
 IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len,
     uint64_t *res_len)
@@ -157,6 +186,15 @@
         (*ops->disable)();
 }
 
+static __inline bool
+IOMMU_GET_SWCAPS(enum iommu_swcaps cap)
+{
+
+    if (ops != NULL && ops->get_swcaps != NULL && iommu_avail)
+        return (*ops->get_swcaps)(cap);
+    return (false);
+}
+
 static void
 iommu_pci_add(void *arg, device_t dev)
 {
@@ -183,16 +221,28 @@
     if (!iommu_enable)
         return;
 
-    if (vmm_is_intel())
-        ops = &iommu_ops_intel;
-    else if (vmm_is_svm())
-        ops = &iommu_ops_amd;
-    else
+    if (vmm_is_intel()) {
+#ifdef IOMMU
+        if (dmar_is_running() == 0)
+            ops = &iommu_ops_dmar;
+        else
+#endif
+            ops = &iommu_ops_intel;
+    } else if (vmm_is_svm()) {
+#ifdef IOMMU
+        if (amdiommu_is_running() == 0)
+            ops = &iommu_ops_amdsysdrv;
+        else
+#endif
+            ops = &iommu_ops_amd;
+    } else
         ops = NULL;
 
     error = IOMMU_INIT();
-    if (error)
+    if (error != 0) {
+        printf("iommu_init: error %d\n", error);
         return;
+    }
 
     iommu_avail = 1;
@@ -200,7 +250,7 @@
      * Create a domain for the devices owned by the host
      */
     maxaddr = vmm_mem_maxaddr();
-    host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
+    host_domain = IOMMU_CREATE_DOMAIN(maxaddr, true);
     if (host_domain == NULL) {
         printf("iommu_init: unable to create a host domain");
         IOMMU_CLEANUP();
@@ -289,7 +339,7 @@
         while (iommu_initted == 1)
             cpu_spinwait();
     }
-    return (IOMMU_CREATE_DOMAIN(maxaddr));
+    return (IOMMU_CREATE_DOMAIN(maxaddr, false));
 }
 
 void
@@ -317,6 +367,14 @@
     return (0);
 }
 
+int
+iommu_create_mapping_bulk(void *dom, vm_paddr_t gpa, struct vm_page **ma,
+    size_t len)
+{
+
+    return (IOMMU_CREATE_MAPPING_BULK(dom, gpa, ma, len));
+}
+
 int
 iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len)
 {
@@ -361,3 +419,227 @@
 
     return (IOMMU_INVALIDATE_TLB(domain));
 }
+
+bool
+iommu_get_swcaps(enum iommu_swcaps cap)
+{
+
+    return (IOMMU_GET_SWCAPS(cap));
+}
+
+static MALLOC_DEFINE(M_GENIOMMU_VMM, "geniommu_vmm",
+    "Gen iommu driver VMM memory");
+
+void *
+geniommu_create_domain(vm_paddr_t maxaddr, bool host_domain)
+{
+    struct geniommu_vmm_domain *vmm_dom;
+
+    vmm_dom = malloc(sizeof(struct geniommu_vmm_domain), M_GENIOMMU_VMM,
+        M_WAITOK | M_ZERO);
+    LIST_INIT(&vmm_dom->iommu_domains);
+    sx_init(&vmm_dom->lock, "vmmdom");
+    vmm_dom->host_domain = host_domain;
+    return (vmm_dom);
+}
+
+void
+geniommu_destroy_domain(void *domain)
+{
+    struct geniommu_vmm_domain *vmm_dom;
+
+    vmm_dom = domain;
+#if 0
+    LIST_FOREACH_SAFE() {
+    }
+#endif
+    sx_destroy(&vmm_dom->lock);
+    free(vmm_dom, M_GENIOMMU_VMM);
+}
+
+int
+geniommu_create_mapping_one(struct iommu_domain *iodom, vm_paddr_t gpa,
+    vm_paddr_t hpa, uint64_t len)
+{
+    struct iommu_map_entry *e;
+    vm_page_t fm, m, *ma;
+    vm_paddr_t pa;
+    iommu_gaddr_t glen, gtotal_len;
+    u_long cnt_after;
+    int error;
+
+    fm = NULL;
+    for (pa = hpa, gtotal_len = 0; gtotal_len < len; pa += glen,
+        gpa += glen, gtotal_len += glen) {
+        e = iommu_gas_alloc_entry(iodom, IOMMU_PGF_WAITOK);
+        e->start = gpa;
+        m = vm_page_phys_to_vm_page(pa, &cnt_after);
+        if (m == NULL) {
+            if (fm == NULL)
+                fm = vm_page_getfake(pa, VM_MEMATTR_DEFAULT);
+            else
+                vm_page_updatefake(fm, pa, VM_MEMATTR_DEFAULT);
+            ma = &fm;
+            glen = PAGE_SIZE;
+        } else {
+            ma = &m;
+            glen = MIN(len - gtotal_len, ptoa(cnt_after + 1));
+        }
+        e->end = gpa + glen;
+        error = iommu_gas_map_region(iodom, e,
+            IOMMU_MAP_ENTRY_READ | IOMMU_MAP_ENTRY_WRITE,
+            IOMMU_MF_CANWAIT | IOMMU_MF_CANSPLIT | IOMMU_MF_VMM, ma);
+        if (error != 0)
+            break;
+        glen = e->end - e->start;
+    }
+    if (fm != NULL)
+        vm_page_putfake(fm);
+    return (error);
+}
+
+int
+geniommu_create_mapping(void *dom1, vm_paddr_t gpa, vm_paddr_t hpa,
+    uint64_t len, uint64_t *res_len)
+{
+    struct geniommu_vmm_domain *vmm_dom;
+    struct iommu_domain *dom;
+    int error;
+
+    vmm_dom = dom1;
+    error = 0;
+    if (vmm_dom->host_domain) {
+        sx_xlock(&vmm_dom->lock);
+        LIST_FOREACH(dom, &vmm_dom->iommu_domains, vmm_dom_link) {
+            error = geniommu_create_mapping_one(dom, gpa,
+                hpa, len);
+            if (error != 0)
+                break;
+        }
+        sx_xunlock(&vmm_dom->lock);
+    }
+    *res_len = len;
+    return (error);
+}
+
+int
+geniommu_create_mapping_bulk_one(struct iommu_domain *iodom, vm_paddr_t gpa,
+    struct vm_page **ma, uint64_t len)
+{
+    struct iommu_map_entry *e;
+    iommu_gaddr_t gtotal_len, glen;
+    int error;
+
+    for (gtotal_len = 0; gtotal_len < len; gpa += glen,
+        gtotal_len += glen) {
+        e = iommu_gas_alloc_entry(iodom, IOMMU_PGF_WAITOK);
+        glen = len - gtotal_len;
+        e->start = gpa;
+        e->end = gpa + glen;
+        error = iommu_gas_map_region(iodom, e,
+            IOMMU_MAP_ENTRY_READ | IOMMU_MAP_ENTRY_WRITE,
+            IOMMU_MF_CANWAIT | IOMMU_MF_CANSPLIT | IOMMU_MF_VMM,
+            ma + atop(gtotal_len));
+        if (error != 0)
+            break;
+        glen = e->end - e->start;
+    }
+    return (error);
+}
+
+int
+geniommu_create_mapping_bulk(void *dom1, vm_paddr_t gpa, struct vm_page **ma,
+    uint64_t len)
+{
+    struct geniommu_vmm_domain *vmm_dom;
+    struct iommu_domain *dom;
+    int error;
+
+    vmm_dom = dom1;
+    error = 0;
+    if (!vmm_dom->host_domain) {
+        sx_xlock(&vmm_dom->lock);
+        LIST_FOREACH(dom, &vmm_dom->iommu_domains, vmm_dom_link) {
+            error = geniommu_create_mapping_bulk_one(dom, gpa,
+                ma, len);
+            if (error != 0)
+                break;
+        }
+        sx_xunlock(&vmm_dom->lock);
+    }
+    return (error);
+}
+
+int
+geniommu_remove_mapping(void *dom1, vm_paddr_t gpa, uint64_t len,
+    uint64_t *res_len)
+{
+    struct geniommu_vmm_domain *vmm_dom;
+    struct iommu_domain *dom;
+
+    vmm_dom = dom1;
+    if (!vmm_dom->host_domain) {
+        sx_xlock(&vmm_dom->lock);
+        LIST_FOREACH(dom, &vmm_dom->iommu_domains, vmm_dom_link)
+            iommu_gas_remove(dom, gpa, len);
+        sx_xunlock(&vmm_dom->lock);
+    }
+    *res_len = len;
+    return (0);
+}
+
+int
+geniommu_add_device(void *dom1, device_t dev, uint16_t rid)
+{
+    struct geniommu_vmm_domain *vmm_dom;
+    struct iommu_unit *iommu;
+    struct iommu_domain *iodom, *rdomain;
+    int error;
+
+    vmm_dom = dom1;
+    if (vmm_dom->host_domain)
+        return (0);
+    error = 0;
+    sx_xlock(&vmm_dom->lock);
+    iommu = iommu_find(dev, true);
+    if (iommu == NULL) {
+        error = ENOTTY;
+        goto done;
+    }
+    LIST_FOREACH(iodom, &vmm_dom->iommu_domains, vmm_dom_link) {
+        if (iodom->iommu == iommu)
+            break;
+    }
+    rdomain = get_x86_iommu()->vmm_domain_add_dev(iommu, iodom, dev, rid);
+    if (rdomain == NULL) {
+        error = EBUSY;
+        goto done;
+    }
+    if (iodom == NULL || iodom != rdomain) {
+        LIST_INSERT_HEAD(&vmm_dom->iommu_domains, rdomain,
+            vmm_dom_link);
+    }
+done:
+    sx_xunlock(&vmm_dom->lock);
+    return (error);
+}
+
+int
+geniommu_remove_device(void *dom1, device_t dev, uint16_t rid)
+{
+    struct geniommu_vmm_domain *vmm_dom;
+
+    vmm_dom = dom1;
+    if (vmm_dom->host_domain)
+        return (0);
+    /* XXXKIB */
+    return (0);
+}
+
+int
+geniommu_invalidate_tlb(void *dom)
+{
+
+    /* XXXKIB: do nothing, rely on map/unmap ? */
+    return (0);
+}
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -640,8 +640,12 @@
     return (0);
 }
 
+#define NMA  (1 << VM_LEVEL_0_ORDER)
+#define NMAP (NMA * sizeof(vm_page_t))
+_Static_assert(NMAP == PAGE_SIZE, "XXX");
+
 static int
-vm_iommu_map(struct vm *vm)
+vm_iommu_map_pg(struct vm *vm)
 {
     pmap_t pmap;
     vm_paddr_t gpa, hpa;
@@ -651,6 +655,7 @@
     sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED);
 
     pmap = vmspace_pmap(vm_vmspace(vm));
+    error = 0;
     for (i = 0; i < VM_MAX_MEMMAPS; i++) {
         if (!vm_memseg_sysmem(vm, i))
             continue;
@@ -682,22 +687,91 @@
             ("vm_iommu_map: vm %p gpa %jx hpa %jx not wired",
             vm, (uintmax_t)gpa, (uintmax_t)hpa));
 
-            iommu_create_mapping(vm->iommu, gpa, hpa, PAGE_SIZE);
+            error = iommu_create_mapping(vm->iommu, gpa, hpa,
+                PAGE_SIZE);
+            if (error != 0)
+                break;
         }
     }
-
-    error = iommu_invalidate_tlb(iommu_host_domain());
     return (error);
 }
 
 static int
-vm_iommu_unmap(struct vm *vm)
+vm_iommu_map_bulk(struct vm *vm)
+{
+    vm_paddr_t gpa, gpa_end, g;
+    struct vm_mem_map *mm;
+    vm_page_t *ma;
+    int error, i, j, n;
+
+    vm_assert_memseg_locked(vm);
+
+    error = 0;
+    ma = (void *)kmem_malloc(NMAP, M_WAITOK | M_ZERO);
+
+    for (i = 0; i < VM_MAX_MEMMAPS; i++) {
+        if (!vm_memseg_sysmem(vm, i))
+            continue;
+        mm = &vm_mem(vm)->mem_maps[i];
+        if (mm->len == 0)
+            continue;
+
+        KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
+            ("iommu map found invalid memmap %#lx/%#lx/%#x",
+            mm->gpa, mm->len, mm->flags));
+        if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
+            continue;
+        mm->flags |= VM_MEMMAP_F_IOMMU;
+
+        for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa = gpa_end) {
+            gpa_end = min(ptoa(NMA), mm->gpa + mm->len);
+            if ((gpa_end & (ptoa(NMA) - 1)) != 0)
+                gpa_end &= ~(ptoa(NMA) - 1);
+            n = atop(gpa_end - gpa);
+
+            for (j = 0, g = gpa; j < n; j++, g += PAGE_SIZE) {
+                ma[j] = PHYS_TO_VM_PAGE(pmap_extract(
+                    vmspace_pmap(vm->vmspace), g));
+                KASSERT(vm_page_wired(ma[j]),
+                    ("vm_iommu_map: vm %p gpa %jx page %p not wired",
+                    vm, (uintmax_t)g, ma[j]));
+            }
+            error = iommu_create_mapping_bulk(vm->iommu, gpa, ma,
+                gpa_end - gpa);
+            if (error != 0)
+                break;
+        }
+    }
+    kmem_free(ma, NMAP);
+    return (error);
+}
+
+
+static int
+vm_iommu_map(struct vm *vm)
+{
+    int error, error1;
+
+    vm_assert_memseg_xlocked(vm);
+
+    if (iommu_get_swcaps(IOMMU_CAP_BULK))
+        error = vm_iommu_map_bulk(vm);
+    else
+        error = vm_iommu_map_pg(vm);
+    error1 = iommu_invalidate_tlb(iommu_host_domain());
+    if (error == 0)
+        error = error1;
+    return (error);
+}
+
+static int
+vm_iommu_unmap_pg(struct vm *vm, bool bulk)
 {
     vm_paddr_t gpa;
     struct vm_mem_map *mm;
-    int error, i;
+    int i;
 
-    sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED);
+    vm_assert_memseg_xlocked(vm);
 
     for (i = 0; i < VM_MAX_MEMMAPS; i++) {
         if (!vm_memseg_sysmem(vm, i))
@@ -711,6 +785,10 @@
             ("iommu unmap found invalid memmap %#lx/%#lx/%#x",
             mm->gpa, mm->len, mm->flags));
 
+        if (bulk) {
+            iommu_remove_mapping(vm->iommu, mm->gpa, mm->len);
+            continue;
+        }
         for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) {
             KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(pmap_extract(
vmspace_pmap(vm_vmspace(vm)), gpa))), @@ -719,12 +797,26 @@ iommu_remove_mapping(vm->iommu, gpa, PAGE_SIZE); } } + return (0); +} + +static int +vm_iommu_unmap(struct vm *vm) +{ + int error, error1; + + vm_assert_memseg_xlocked(vm); + + error = vm_iommu_unmap_pg(vm, iommu_get_swcaps(IOMMU_CAP_BULK)); /* * Invalidate the cached translations associated with the domain * from which pages were removed. */ - error = iommu_invalidate_tlb(vm->iommu); + error1 = iommu_invalidate_tlb(vm->iommu); + if (error == 0) + error = error1; + return (error); } diff --git a/sys/dev/iommu/iommu.h b/sys/dev/iommu/iommu.h --- a/sys/dev/iommu/iommu.h +++ b/sys/dev/iommu/iommu.h @@ -122,6 +122,7 @@ vm_paddr_t msi_phys; /* (d) Arch-specific */ u_int flags; /* (u) */ LIST_HEAD(, iommu_ctx) contexts;/* (u) */ + LIST_ENTRY(iommu_domain) vmm_dom_link; /* Member in external vmm dom */ }; struct iommu_ctx { @@ -148,6 +149,8 @@ page table */ #define IOMMU_DOMAIN_RMRR 0x0020 /* Domain contains RMRR entry, cannot be turned off */ +#define IOMMU_DOMAIN_VMM 0x0040 /* Used by VMM */ +#define IOMMU_DOMAIN_BUSDMA 0x0080 /* Used for busdma */ #define IOMMU_LOCK(unit) mtx_lock(&(unit)->lock) #define IOMMU_UNLOCK(unit) mtx_unlock(&(unit)->lock) diff --git a/sys/dev/iommu/iommu_gas.h b/sys/dev/iommu/iommu_gas.h --- a/sys/dev/iommu/iommu_gas.h +++ b/sys/dev/iommu/iommu_gas.h @@ -35,6 +35,7 @@ #define IOMMU_MF_CANWAIT 0x0001 #define IOMMU_MF_CANSPLIT 0x0002 #define IOMMU_MF_RMRR 0x0004 +#define IOMMU_MF_VMM 0x0008 #define IOMMU_PGF_WAITOK 0x0001 #define IOMMU_PGF_ZERO 0x0002 @@ -47,6 +48,7 @@ dmamap_link */ #define IOMMU_MAP_ENTRY_MAP 0x0004 /* Busdma created, linked by dmamap_link */ +#define IOMMU_MAP_ENTRY_VMM 0x0008 /* VMM created */ #define IOMMU_MAP_ENTRY_UNMAPPED 0x0010 /* No backing pages */ #define IOMMU_MAP_ENTRY_REMOVING 0x0020 /* In process of removal by iommu_gas_remove() */ diff --git a/sys/dev/iommu/iommu_gas.c b/sys/dev/iommu/iommu_gas.c --- a/sys/dev/iommu/iommu_gas.c +++ b/sys/dev/iommu/iommu_gas.c @@ -583,7 +583,9 @@ } if ((flags & IOMMU_MF_RMRR) != 0) - entry->flags = IOMMU_MAP_ENTRY_RMRR; + entry->flags |= IOMMU_MAP_ENTRY_RMRR; + if ((flags & IOMMU_MF_VMM) != 0) + entry->flags |= IOMMU_MAP_ENTRY_VMM; #ifdef INVARIANTS struct iommu_map_entry *ip, *in; @@ -610,9 +612,10 @@ struct iommu_domain *domain; domain = entry->domain; - KASSERT((entry->flags & (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_RMRR | - IOMMU_MAP_ENTRY_MAP)) == IOMMU_MAP_ENTRY_MAP, - ("permanent entry %p %p", domain, entry)); + KASSERT((entry->flags & IOMMU_MAP_ENTRY_RMRR) == 0, + ("removing RMRR entry dom %p e %p (%#jx, %#jx) fl %#x", domain, + entry, + (uintmax_t)entry->start, (uintmax_t)entry->end, entry->flags)); IOMMU_DOMAIN_LOCK(domain); iommu_gas_rb_remove(domain, entry); @@ -852,6 +855,8 @@ iommu_gas_map_region(struct iommu_domain *domain, struct iommu_map_entry *entry, u_int eflags, u_int flags, vm_page_t *ma) { + struct iommu_map_entries_tailq gc; + struct iommu_map_entry *r1, *r2; iommu_gaddr_t start; int error; @@ -860,20 +865,27 @@ entry, entry->domain)); KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", domain, entry, entry->flags)); - KASSERT((flags & ~(IOMMU_MF_CANWAIT | IOMMU_MF_RMRR)) == 0, + KASSERT((flags & ~(IOMMU_MF_CANWAIT | IOMMU_MF_CANSPLIT | + IOMMU_MF_RMRR | IOMMU_MF_VMM)) == 0, ("invalid flags 0x%x", flags)); + if ((flags & IOMMU_MF_VMM) != 0) + iommu_gas_remove_init(domain, &gc, &r1, &r2); start = entry->start; IOMMU_DOMAIN_LOCK(domain); + if ((flags & IOMMU_MF_VMM) != 0) { + iommu_gas_remove_locked(domain, 
entry->start, entry->end, + &gc, &r1, &r2); + } error = iommu_gas_alloc_region(domain, entry, flags); if (error != 0) { IOMMU_DOMAIN_UNLOCK(domain); - return (error); + goto done; } entry->flags |= eflags; IOMMU_DOMAIN_UNLOCK(domain); if (entry->end == entry->start) - return (0); + goto done; /* * iommu_gas_alloc_region() might clipped the entry start and @@ -886,12 +898,14 @@ if (error == ENOMEM) { iommu_domain_unload_entry(entry, false, (flags & IOMMU_MF_CANWAIT) != 0); - return (error); + goto done; } KASSERT(error == 0, ("unexpected error %d from domain_map_buf", error)); - - return (0); +done: + if ((flags & IOMMU_MF_VMM) != 0) + iommu_gas_remove_cleanup(domain, &gc, &r1, &r2); + return (error); } static int diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -101,6 +101,7 @@ # intel-specific files .PATH: ${SRCTOP}/sys/amd64/vmm/intel SRCS+= ept.c \ + dmar_iommu.c \ vmcs.c \ vmx_msr.c \ vmx_support.S \ @@ -111,6 +112,7 @@ .PATH: ${SRCTOP}/sys/amd64/vmm/amd SRCS+= vmcb.c \ amdviiommu.c \ + amdsysdrv_iommu.c \ ivhd_if.c \ ivhd_if.h \ svm.c \ diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c --- a/sys/vm/device_pager.c +++ b/sys/vm/device_pager.c @@ -443,7 +443,7 @@ } /* If "paddr" is a real page, perform a sanity check on "memattr". */ - if ((m_paddr = vm_phys_paddr_to_vm_page(paddr)) != NULL && + if ((m_paddr = vm_phys_paddr_to_vm_page(paddr, NULL)) != NULL && (memattr1 = pmap_page_get_memattr(m_paddr)) != memattr) { /* * For the /dev/mem d_mmap routine to return the diff --git a/sys/vm/sg_pager.c b/sys/vm/sg_pager.c --- a/sys/vm/sg_pager.c +++ b/sys/vm/sg_pager.c @@ -180,7 +180,7 @@ KASSERT(paddr != 1, ("invalid SG page index")); /* If "paddr" is a real page, perform a sanity check on "memattr". */ - if ((m_paddr = vm_phys_paddr_to_vm_page(paddr)) != NULL && + if ((m_paddr = vm_phys_paddr_to_vm_page(paddr, NULL)) != NULL && pmap_page_get_memattr(m_paddr) != memattr) { memattr = pmap_page_get_memattr(m_paddr); printf( diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -458,8 +458,11 @@ * PHYS_TO_VM_PAGE() returns the vm_page_t object that represents a memory * page to which the given physical address belongs. The correct vm_page_t * object is returned for addresses that are not page-aligned. + * vm_page_phys_to_vm_page() is same as PHYS_TO_VM_PAGE() but also can + * return the count of pages after m in the same physical segment. */ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa); +vm_page_t vm_page_phys_to_vm_page(vm_paddr_t pa, u_long *cnt_after); /* * vm_page allocation arguments for the functions vm_page_alloc(), diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -334,7 +334,7 @@ vm_page_t m; bool found; - m = vm_phys_paddr_to_vm_page(pa); + m = vm_phys_paddr_to_vm_page(pa, NULL); if (m == NULL) return (true); /* page does not exist, no failure */ @@ -575,7 +575,7 @@ vm_offset_t mapped; int witness_size; #endif -#if defined(__i386__) && defined(VM_PHYSSEG_DENSE) +#if (defined(__i386__) || defined(__amd64__)) && defined(VM_PHYSSEG_DENSE) long ii; #endif int pool; @@ -772,7 +772,11 @@ * Initialize the page structures and add every available page to the * physical memory allocator's free lists. */ -#if defined(__i386__) && defined(VM_PHYSSEG_DENSE) +#if (defined(__i386__) || defined(__amd64__)) && defined(VM_PHYSSEG_DENSE) + /* + * i386 needs this for copyout(9) calling vm_fault_quick_hold_pages(). 
+	 * amd64 requires that for DMAR busdma and bhyve IOMMU.
+	 */
 	for (ii = 0; ii < vm_page_array_size; ii++) {
 		m = &vm_page_array[ii];
 		vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0,
@@ -1278,13 +1282,20 @@
 
 vm_page_t
 PHYS_TO_VM_PAGE(vm_paddr_t pa)
+{
+
+	return (vm_page_phys_to_vm_page(pa, NULL));
+}
+
+vm_page_t
+vm_page_phys_to_vm_page(vm_paddr_t pa, u_long *cnt_after)
 {
 	vm_page_t m;
 
 #ifdef VM_PHYSSEG_SPARSE
-	m = vm_phys_paddr_to_vm_page(pa);
+	m = vm_phys_paddr_to_vm_page(pa, cnt_after);
 	if (m == NULL)
-		m = vm_phys_fictitious_to_vm_page(pa);
+		m = vm_phys_fictitious_to_vm_page(pa, cnt_after);
 	return (m);
 #elif defined(VM_PHYSSEG_DENSE)
 	long pi;
@@ -1292,9 +1303,11 @@
 	pi = atop(pa);
 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
 		m = &vm_page_array[pi - first_page];
+		if (cnt_after != NULL)
+			*cnt_after = vm_page_array_size - (pi - first_page);
 		return (m);
 	}
-	return (vm_phys_fictitious_to_vm_page(pa));
+	return (vm_phys_fictitious_to_vm_page(pa, cnt_after));
 #else
 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 #endif
diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h
--- a/sys/vm/vm_phys.h
+++ b/sys/vm/vm_phys.h
@@ -68,13 +68,14 @@
 int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
     vm_memattr_t memattr);
 void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end);
-vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa);
+vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa, u_long *cnt_after);
 int vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
     u_long npages, vm_paddr_t low, vm_paddr_t high);
+vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa, u_long *cnt_after);
 void vm_phys_free_contig(vm_page_t m, int pool, u_long npages);
 void vm_phys_free_pages(vm_page_t m, int pool, int order);
 void vm_phys_init(void);
-vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
+vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa, u_long *cnt_after);
 vm_page_t vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa);
 void vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
     int *locality);
diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c
--- a/sys/vm/vm_phys.c
+++ b/sys/vm/vm_phys.c
@@ -1033,17 +1033,20 @@
  * Find the vm_page corresponding to the given physical address.
  */
 vm_page_t
-vm_phys_paddr_to_vm_page(vm_paddr_t pa)
+vm_phys_paddr_to_vm_page(vm_paddr_t pa, u_long *cnt_after)
 {
 	struct vm_phys_seg *seg;
 
-	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
+	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL) {
+		if (cnt_after != NULL)
+			*cnt_after = atop(seg->end - pa);
 		return (vm_phys_seg_paddr_to_vm_page(seg, pa));
+	}
 	return (NULL);
 }
 
 vm_page_t
-vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
+vm_phys_fictitious_to_vm_page(vm_paddr_t pa, u_long *cnt_after)
 {
 	struct vm_phys_fictitious_seg tmp, *seg;
 	vm_page_t m;
@@ -1061,6 +1064,8 @@
 	m = &seg->first_page[atop(pa - seg->start)];
 	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
 
+	if (cnt_after != NULL)
+		*cnt_after = atop(seg->end - pa);
 	return (m);
 }
 
@@ -1523,7 +1528,7 @@
 	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
*/ - m = vm_phys_paddr_to_vm_page(pa); + m = vm_phys_paddr_to_vm_page(pa, NULL); for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && order < VM_NFREEORDER - 1; ) { order++; @@ -1905,7 +1910,7 @@ vm_page_t m; int i; - if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) + if ((m = vm_phys_paddr_to_vm_page(pa, NULL)) != NULL) return ((m->flags & PG_NODUMP) == 0); for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { diff --git a/sys/x86/iommu/amd_ctx.c b/sys/x86/iommu/amd_ctx.c --- a/sys/x86/iommu/amd_ctx.c +++ b/sys/x86/iommu/amd_ctx.c @@ -501,11 +501,12 @@ } } -struct amdiommu_ctx * -amdiommu_get_ctx_for_dev(struct amdiommu_unit *unit, device_t dev, uint16_t rid, +static struct amdiommu_ctx * +amdiommu_get_ctx_for_dev_locked(struct amdiommu_unit *unit, + struct amdiommu_domain *domain, device_t dev, uint16_t rid, int dev_domain, bool id_mapped, bool rmrr_init, uint8_t dte, uint32_t edte) { - struct amdiommu_domain *domain, *domain1; + struct amdiommu_domain *domain1; struct amdiommu_ctx *ctx, *ctx1; int bus, slot, func; @@ -518,7 +519,7 @@ slot = PCI_RID2SLOT(rid); func = PCI_RID2FUNC(rid); } - AMDIOMMU_LOCK(unit); + AMDIOMMU_ASSERT_LOCKED(unit); KASSERT(!iommu_is_buswide_ctx(AMD2IOMMU(unit), bus) || (slot == 0 && func == 0), ("iommu%d pci%d:%d:%d get_ctx for buswide", AMD2IOMMU(unit)->unit, @@ -530,61 +531,172 @@ * higher chance to succeed if the sleep is allowed. */ AMDIOMMU_UNLOCK(unit); - domain1 = amdiommu_domain_alloc(unit, id_mapped); - if (domain1 == NULL) - return (NULL); - if (!id_mapped) { - /* - * XXXKIB IVMD seems to be less significant - * and less used on AMD than RMRR on Intel. - * Not implemented for now. - */ - } - ctx1 = amdiommu_ctx_alloc(domain1, rid); - amdiommu_ctx_init_irte(ctx1); - AMDIOMMU_LOCK(unit); - - /* - * Recheck the contexts, other thread might have - * already allocated needed one. - */ - ctx = amdiommu_find_ctx_locked(unit, rid); - if (ctx == NULL) { - domain = domain1; - ctx = ctx1; - amdiommu_ctx_link(ctx); - ctx->context.tag->owner = dev; - iommu_device_tag_init(CTX2IOCTX(ctx), dev); - - LIST_INSERT_HEAD(&unit->domains, domain, link); - dte_entry_init(ctx, false, dte, edte); - amdiommu_qi_invalidate_ctx_locked(ctx); - if (dev != NULL) { - device_printf(dev, - "amdiommu%d pci%d:%d:%d:%d rid %x domain %d " - "%s-mapped\n", - AMD2IOMMU(unit)->unit, unit->unit_dom, - bus, slot, func, rid, domain->domain, - id_mapped ? "id" : "re"); + if (domain == NULL) { + domain1 = amdiommu_domain_alloc(unit, id_mapped); + if (domain1 == NULL) + return (NULL); + if (!id_mapped) { + /* + * XXXKIB IVMD seems to be less significant + * and less used on AMD than RMRR on Intel. + * Not implemented for now. + */ } - } else { - amdiommu_domain_destroy(domain1); - /* Nothing needs to be done to destroy ctx1. */ - free(ctx1, M_AMDIOMMU_CTX); - domain = CTX2DOM(ctx); - ctx->context.refs++; /* tag referenced us */ + ctx1 = amdiommu_ctx_alloc(domain1, rid); + amdiommu_ctx_init_irte(ctx1); + AMDIOMMU_LOCK(unit); + + /* + * Recheck the contexts, other thread might have + * already allocated needed one. 
+ */ + ctx = amdiommu_find_ctx_locked(unit, rid); + if (ctx == NULL) { + domain = domain1; + ctx = ctx1; + amdiommu_ctx_link(ctx); + ctx->context.tag->owner = dev; + iommu_device_tag_init(CTX2IOCTX(ctx), dev); + + LIST_INSERT_HEAD(&unit->domains, domain, link); + dte_entry_init(ctx, false, dte, edte); + amdiommu_qi_invalidate_ctx_locked(ctx); + if (dev != NULL) { + device_printf(dev, + "amdiommu%d pci%d:%d:%d:%d rid %x domain %d " + "%s-mapped\n", + AMD2IOMMU(unit)->unit, + unit->unit_dom, + bus, slot, func, rid, + domain->domain, + id_mapped ? "id" : "re"); + } + } else { + amdiommu_domain_destroy(domain1); + /* Nothing needs to be done to destroy ctx1. */ + free(ctx1, M_AMDIOMMU_CTX); + domain = CTX2DOM(ctx); + ctx->context.refs++; /* tag referenced us */ + } + if (domain1 != NULL) + amdiommu_domain_destroy(domain1); } } else { + MPASS(domain == NULL); domain = CTX2DOM(ctx); if (ctx->context.tag->owner == NULL) ctx->context.tag->owner = dev; ctx->context.refs++; /* tag referenced us */ } - AMDIOMMU_UNLOCK(unit); - return (ctx); } +struct amdiommu_ctx * +amdiommu_get_ctx_for_dev(struct amdiommu_unit *unit, + struct amdiommu_domain *domain, device_t dev, uint16_t rid, + int dev_domain, bool id_mapped, bool rmrr_init, uint8_t dte, uint32_t edte) +{ + struct amdiommu_ctx *res; + + AMDIOMMU_LOCK(unit); + res = amdiommu_get_ctx_for_dev_locked(unit, domain, dev, rid, + dev_domain, id_mapped, rmrr_init, dte, edte); + AMDIOMMU_UNLOCK(unit); + return (res); +} + +static struct amdiommu_domain * +amdiommu_move_ctx_to_domain(struct amdiommu_domain *domain, + struct amdiommu_ctx *ctx) +{ +#if 0 + struct amdiommu_unit *unit; + struct amdiommu_domain *old_domain; + + /* XXXKIB */ + dmar_ctx_entry_t *ctxp; + struct sf_buf *sf; + + dmar = domain->dmar; + DMAR_ASSERT_LOCKED(dmar); + old_domain = CTX2DOM(ctx); + if (domain == old_domain) { + DMAR_UNLOCK(dmar); + return (0); + } + KASSERT(old_domain->iodom.iommu == domain->iodom.iommu, + ("domain %p %u moving between dmars %u %u", domain, + domain->domain, old_domain->dmar->iommu.unit, + domain->dmar->iommu.unit)); + + if ((old_domain->iodom.flags & IOMMU_DOMAIN_RMRR) != 0) + return (old_domain); + + TD_PREP_PINNED_ASSERT; + + ctxp = dmar_map_ctx_entry(ctx, &sf); + dmar_ctx_unlink(ctx); + ctx->context.domain = &domain->iodom; + dmar_ctx_link(ctx); + ctx_id_entry_init(ctx, ctxp, true, PCI_BUSMAX + 100); + iommu_unmap_pgtbl(sf); + (void)dmar_flush_for_ctx_entry(dmar, true); + /* If flush failed, rolling back would not work as well. */ + printf("dmar%d rid %x domain %d->%d %s-mapped\n", + dmar->iommu.unit, ctx->context.rid, old_domain->domain, + domain->domain, (domain->iodom.flags & IOMMU_DOMAIN_IDMAP) != 0 ? + "id" : "re"); + dmar_unref_domain_locked(dmar, old_domain); + TD_PINNED_ASSERT; +#endif + return (domain); +} + +struct iommu_domain * +amdiommu_vmm_domain_add_dev(struct iommu_unit *iommu, + struct iommu_domain *domain, device_t dev, uint16_t rid) +{ + struct amdiommu_unit *unit; + struct amdiommu_domain *ddomain, *rdomain; + struct amdiommu_ctx *ctx; + bool drain; + + unit = IOMMU2AMD(iommu); + ddomain = domain == NULL ? NULL : IODOM2DOM(domain); + MPASS(ddomain == NULL || ddomain->unit == unit); + rdomain = NULL; + drain = false; + AMDIOMMU_LOCK(unit); + ctx = amdiommu_find_ctx_locked(unit, rid); + if (ctx != NULL) { + rdomain = domain != NULL ? 
amdiommu_move_ctx_to_domain(ddomain, + ctx) : IODOM2DOM(ctx->context.domain); + } else { + ctx = amdiommu_get_ctx_for_dev_locked(unit, ddomain, dev, rid, + 0/*XXX dev_domain*/, false, true, 0/*XXXdte*/, 0/*XXXedte*/); + if (ctx != NULL) { + rdomain = IODOM2DOM(ctx->context.domain); + MPASS(domain == NULL || rdomain == ddomain); + } + } + if (rdomain != NULL) { + MPASS((rdomain->iodom.flags & (IOMMU_DOMAIN_BUSDMA | + IOMMU_DOMAIN_VMM)) != (IOMMU_DOMAIN_BUSDMA | + IOMMU_DOMAIN_VMM)); + if ((rdomain->iodom.flags & IOMMU_DOMAIN_BUSDMA) != 0) { + rdomain->iodom.flags &= ~IOMMU_DOMAIN_BUSDMA; + drain = true; + } + rdomain->iodom.flags |= IOMMU_DOMAIN_VMM; + } + AMDIOMMU_UNLOCK(unit); + if (drain) { + taskqueue_drain(iommu->delayed_taskqueue, + &rdomain->iodom.unload_task); + } + return (DOM2IODOM(rdomain)); +} + struct iommu_ctx * amdiommu_get_ctx(struct iommu_unit *iommu, device_t dev, uint16_t rid, bool id_mapped, bool rmrr_init) @@ -602,8 +714,8 @@ return (NULL); if (AMD2IOMMU(unit) != iommu) /* XXX complain loudly */ return (NULL); - ret = amdiommu_get_ctx_for_dev(unit, dev, rid1, pci_get_domain(dev), - id_mapped, rmrr_init, dte, edte); + ret = amdiommu_get_ctx_for_dev(unit, NULL, dev, rid1, + pci_get_domain(dev), id_mapped, rmrr_init, dte, edte); return (CTX2IOCTX(ret)); } diff --git a/sys/x86/iommu/amd_drv.c b/sys/x86/iommu/amd_drv.c --- a/sys/x86/iommu/amd_drv.c +++ b/sys/x86/iommu/amd_drv.c @@ -1094,6 +1094,7 @@ .domain_unload = amdiommu_domain_unload, .get_ctx = amdiommu_get_ctx, .free_ctx_locked = amdiommu_free_ctx_locked_method, + .vmm_domain_add_dev = amdiommu_vmm_domain_add_dev, .alloc_msi_intr = amdiommu_alloc_msi_intr, .map_msi_intr = amdiommu_map_msi_intr, .unmap_msi_intr = amdiommu_unmap_msi_intr, diff --git a/sys/x86/iommu/amd_intrmap.c b/sys/x86/iommu/amd_intrmap.c --- a/sys/x86/iommu/amd_intrmap.c +++ b/sys/x86/iommu/amd_intrmap.c @@ -255,9 +255,9 @@ error = amdiommu_find_unit(src, &unit, &rid, &dte, &edte, bootverbose); if (error == 0) { - ioctx = iommu_instantiate_ctx(AMD2IOMMU(unit), src, false); - if (ioctx != NULL) - ctx = IOCTX2CTX(ioctx); + iommu_get_requester(src, &rid); + ctx = amdiommu_get_ctx_for_dev(unit, NULL, src, + rid, 0, false /* XXXKIB */, false, dte, edte); } } if (ridp != NULL) diff --git a/sys/x86/iommu/amd_iommu.h b/sys/x86/iommu/amd_iommu.h --- a/sys/x86/iommu/amd_iommu.h +++ b/sys/x86/iommu/amd_iommu.h @@ -214,8 +214,9 @@ void amdiommu_domain_unload(struct iommu_domain *iodom, struct iommu_map_entries_tailq *entries, bool cansleep); struct amdiommu_ctx *amdiommu_get_ctx_for_dev(struct amdiommu_unit *unit, - device_t dev, uint16_t rid, int dev_domain, bool id_mapped, - bool rmrr_init, uint8_t dte, uint32_t edte); + struct amdiommu_domain *domain, device_t dev, uint16_t rid, + int dev_domain, bool id_mapped, bool rmrr_init, + uint8_t dte, uint32_t edte); struct iommu_ctx *amdiommu_get_ctx(struct iommu_unit *iommu, device_t dev, uint16_t rid, bool id_mapped, bool rmrr_init); struct amdiommu_ctx *amdiommu_find_ctx_locked(struct amdiommu_unit *unit, @@ -224,6 +225,8 @@ struct iommu_ctx *context); struct amdiommu_domain *amdiommu_find_domain(struct amdiommu_unit *unit, uint16_t rid); +struct iommu_domain *amdiommu_vmm_domain_add_dev(struct iommu_unit *iommu, + struct iommu_domain *domain, device_t dev, uint16_t rid); void amdiommu_qi_invalidate_ctx_locked(struct amdiommu_ctx *ctx); void amdiommu_qi_invalidate_ctx_locked_nowait(struct amdiommu_ctx *ctx); @@ -239,6 +242,4 @@ void amdiommu_domain_free_pgtbl(struct amdiommu_domain *domain); extern const struct 
iommu_domain_map_ops amdiommu_domain_map_ops; -int amdiommu_is_running(void); - #endif diff --git a/sys/x86/iommu/intel_ctx.c b/sys/x86/iommu/intel_ctx.c --- a/sys/x86/iommu/intel_ctx.c +++ b/sys/x86/iommu/intel_ctx.c @@ -498,11 +498,11 @@ } static struct dmar_ctx * -dmar_get_ctx_for_dev1(struct dmar_unit *dmar, device_t dev, uint16_t rid, - int dev_domain, int dev_busno, const void *dev_path, int dev_path_len, - bool id_mapped, bool rmrr_init) +dmar_get_ctx_for_dev1(struct dmar_unit *dmar, struct dmar_domain *domain, + device_t dev, uint16_t rid, int dev_domain, int dev_busno, + const void *dev_path, int dev_path_len, bool id_mapped, bool rmrr_init) { - struct dmar_domain *domain, *domain1; + struct dmar_domain *domain1; struct dmar_ctx *ctx, *ctx1; struct iommu_unit *unit __diagused; dmar_ctx_entry_t *ctxp; @@ -520,9 +520,10 @@ func = PCI_RID2FUNC(rid); } enable = false; + domain1 = NULL; + DMAR_ASSERT_LOCKED(dmar); TD_PREP_PINNED_ASSERT; unit = DMAR2IOMMU(dmar); - DMAR_LOCK(dmar); KASSERT(!iommu_is_buswide_ctx(unit, bus) || (slot == 0 && func == 0), ("iommu%d pci%d:%d:%d get_ctx for buswide", dmar->iommu.unit, bus, slot, func)); @@ -534,23 +535,27 @@ * higher chance to succeed if the sleep is allowed. */ DMAR_UNLOCK(dmar); - dmar_ensure_ctx_page(dmar, PCI_RID2BUS(rid)); - domain1 = dmar_domain_alloc(dmar, id_mapped); - if (domain1 == NULL) { - TD_PINNED_ASSERT; - return (NULL); - } - if (!id_mapped) { - error = domain_init_rmrr(domain1, dev, bus, - slot, func, dev_domain, dev_busno, dev_path, - dev_path_len); - if (error == 0 && dev != NULL) - error = dmar_reserve_pci_regions(domain1, dev); - if (error != 0) { - dmar_domain_destroy(domain1); + if (domain == NULL) { + dmar_ensure_ctx_page(dmar, PCI_RID2BUS(rid)); + domain1 = dmar_domain_alloc(dmar, id_mapped); + if (domain1 == NULL) { TD_PINNED_ASSERT; return (NULL); } + if (!id_mapped) { + error = domain_init_rmrr(domain1, dev, bus, + slot, func, dev_domain, dev_busno, dev_path, + dev_path_len); + if (error == 0 && dev != NULL) { + error = dmar_reserve_pci_regions( + domain1, dev); + } + if (error != 0) { + dmar_domain_destroy(domain1); + TD_PINNED_ASSERT; + return (NULL); + } + } } ctx1 = dmar_ctx_alloc(domain1, rid); ctxp = dmar_map_ctx_entry(ctx1, &sf); @@ -562,7 +567,15 @@ */ ctx = dmar_find_ctx_locked(dmar, rid); if (ctx == NULL) { - domain = domain1; + if (LIST_EMPTY(&dmar->domains)) { + MPASS(domain == NULL); + enable = true; + } + if (domain == NULL) { + domain = domain1; + domain1 = NULL; + LIST_INSERT_HEAD(&dmar->domains, domain, link); + } ctx = ctx1; dmar_ctx_link(ctx); ctx->context.tag->owner = dev; @@ -573,9 +586,6 @@ * DMAR unit. Enable the translation after * everything is set up. */ - if (LIST_EMPTY(&dmar->domains)) - enable = true; - LIST_INSERT_HEAD(&dmar->domains, domain, link); ctx_id_entry_init(ctx, ctxp, false, bus); if (dev != NULL) { device_printf(dev, @@ -587,14 +597,17 @@ } iommu_unmap_pgtbl(sf); } else { - iommu_unmap_pgtbl(sf); - dmar_domain_destroy(domain1); /* Nothing needs to be done to destroy ctx1. 
*/ free(ctx1, M_DMAR_CTX); domain = CTX2DOM(ctx); ctx->context.refs++; /* tag referenced us */ } + if (domain1 != NULL) { + iommu_unmap_pgtbl(sf); + dmar_domain_destroy(domain1); + } } else { + MPASS(domain == NULL); domain = CTX2DOM(ctx); if (ctx->context.tag->owner == NULL) ctx->context.tag->owner = dev; @@ -632,13 +645,13 @@ return (NULL); } } - DMAR_UNLOCK(dmar); TD_PINNED_ASSERT; return (ctx); } -struct dmar_ctx * -dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev, uint16_t rid, +static struct dmar_ctx * +dmar_get_ctx_for_dev_locked(struct dmar_unit *dmar, + struct dmar_domain *domain, device_t dev, uint16_t rid, bool id_mapped, bool rmrr_init) { int dev_domain, dev_path_len, dev_busno; @@ -647,8 +660,25 @@ dev_path_len = dmar_dev_depth(dev); ACPI_DMAR_PCI_PATH dev_path[dev_path_len]; dmar_dev_path(dev, &dev_busno, dev_path, dev_path_len); - return (dmar_get_ctx_for_dev1(dmar, dev, rid, dev_domain, dev_busno, - dev_path, dev_path_len, id_mapped, rmrr_init)); + return (dmar_get_ctx_for_dev1(dmar, domain, dev, rid, dev_domain, + dev_busno, dev_path, dev_path_len, id_mapped, rmrr_init)); +} + +struct dmar_ctx * +dmar_get_ctx_for_dev(struct dmar_unit *dmar, struct dmar_domain *domain, + device_t dev, uint16_t rid, bool id_mapped, bool rmrr_init) +{ + struct dmar_ctx *ctx; + + DMAR_LOCK(dmar); + ctx = dmar_get_ctx_for_dev_locked(dmar, domain, dev, rid, + id_mapped, rmrr_init); + if (ctx != NULL) { + MPASS((ctx->context.domain->flags & IOMMU_DOMAIN_VMM) == 0); + ctx->context.domain->flags |= IOMMU_DOMAIN_BUSDMA; + } + DMAR_UNLOCK(dmar); + return (ctx); } struct dmar_ctx * @@ -657,38 +687,51 @@ const void *dev_path, int dev_path_len, bool id_mapped, bool rmrr_init) { + struct dmar_ctx *ctx; - return (dmar_get_ctx_for_dev1(dmar, NULL, rid, dev_domain, dev_busno, - dev_path, dev_path_len, id_mapped, rmrr_init)); + DMAR_LOCK(dmar); + ctx = dmar_get_ctx_for_dev1(dmar, NULL, NULL, rid, dev_domain, + dev_busno, dev_path, dev_path_len, id_mapped, rmrr_init); + if (ctx != NULL) { + MPASS((ctx->context.domain->flags & IOMMU_DOMAIN_VMM) == 0); + ctx->context.domain->flags |= IOMMU_DOMAIN_BUSDMA; + } + DMAR_UNLOCK(dmar); + return (ctx); } -int +static struct dmar_domain * dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx) { struct dmar_unit *dmar; struct dmar_domain *old_domain; dmar_ctx_entry_t *ctxp; struct sf_buf *sf; - int error; dmar = domain->dmar; + DMAR_ASSERT_LOCKED(dmar); old_domain = CTX2DOM(ctx); - if (domain == old_domain) + if (domain == old_domain) { + DMAR_UNLOCK(dmar); return (0); + } KASSERT(old_domain->iodom.iommu == domain->iodom.iommu, ("domain %p %u moving between dmars %u %u", domain, - domain->domain, old_domain->iodom.iommu->unit, - domain->iodom.iommu->unit)); + domain->domain, old_domain->dmar->iommu.unit, + domain->dmar->iommu.unit)); + + if ((old_domain->iodom.flags & IOMMU_DOMAIN_RMRR) != 0) + return (old_domain); + TD_PREP_PINNED_ASSERT; ctxp = dmar_map_ctx_entry(ctx, &sf); - DMAR_LOCK(dmar); dmar_ctx_unlink(ctx); ctx->context.domain = &domain->iodom; dmar_ctx_link(ctx); ctx_id_entry_init(ctx, ctxp, true, PCI_BUSMAX + 100); iommu_unmap_pgtbl(sf); - error = dmar_flush_for_ctx_entry(dmar, true); + (void)dmar_flush_for_ctx_entry(dmar, true); /* If flush failed, rolling back would not work as well. 
*/ printf("dmar%d rid %x domain %d->%d %s-mapped\n", dmar->iommu.unit, ctx->context.rid, old_domain->domain, @@ -696,7 +739,59 @@ "id" : "re"); dmar_unref_domain_locked(dmar, old_domain); TD_PINNED_ASSERT; - return (error); + return (domain); +} + +/* + * Create a VMM domain for the given device. Keep on private domain + * if the device needs RMRR. Otherwise coalesce VMM domains to reduce + * number of maintained page tables. If this is the first domain on + * this dmar on this VM (domain == NULL), reuse already created busdma + * domain if possible. + */ +struct iommu_domain * +dmar_vmm_domain_add_dev(struct iommu_unit *iommu, struct iommu_domain *domain, + device_t dev, uint16_t rid) +{ + struct dmar_unit *dmar; + struct dmar_domain *ddomain, *rdomain; + struct dmar_ctx *ctx; + bool drain; + + dmar = IOMMU2DMAR(iommu); + ddomain = domain == NULL ? NULL : IODOM2DOM(domain); + MPASS(ddomain == NULL || ddomain->dmar == dmar); + rdomain = NULL; + drain = false; + DMAR_LOCK(dmar); + ctx = dmar_find_ctx_locked(dmar, rid); + if (ctx != NULL) { + rdomain = domain != NULL ? dmar_move_ctx_to_domain(ddomain, + ctx) : IODOM2DOM(ctx->context.domain); + } else { + ctx = dmar_get_ctx_for_dev_locked(dmar, ddomain, dev, rid, + false, true); + if (ctx != NULL) { + rdomain = IODOM2DOM(ctx->context.domain); + MPASS(domain == NULL || rdomain == ddomain); + } + } + if (rdomain != NULL) { + MPASS((rdomain->iodom.flags & (IOMMU_DOMAIN_BUSDMA | + IOMMU_DOMAIN_VMM)) != (IOMMU_DOMAIN_BUSDMA | + IOMMU_DOMAIN_VMM)); + if ((rdomain->iodom.flags & IOMMU_DOMAIN_BUSDMA) != 0) { + rdomain->iodom.flags &= ~IOMMU_DOMAIN_BUSDMA; + drain = true; + } + rdomain->iodom.flags |= IOMMU_DOMAIN_VMM; + } + DMAR_UNLOCK(dmar); + if (drain) { + taskqueue_drain(dmar->iommu.delayed_taskqueue, + &rdomain->iodom.unload_task); + } + return (DOM2IODOM(rdomain)); } static void @@ -915,7 +1010,7 @@ struct dmar_ctx *ret; dmar = IOMMU2DMAR(iommu); - ret = dmar_get_ctx_for_dev(dmar, dev, rid, id_mapped, rmrr_init); + ret = dmar_get_ctx_for_dev(dmar, NULL, dev, rid, id_mapped, rmrr_init); return (CTX2IOCTX(ret)); } diff --git a/sys/x86/iommu/intel_dmar.h b/sys/x86/iommu/intel_dmar.h --- a/sys/x86/iommu/intel_dmar.h +++ b/sys/x86/iommu/intel_dmar.h @@ -229,12 +229,12 @@ int dmar_dev_depth(device_t child); void dmar_dev_path(device_t child, int *busno, void *path1, int depth); -struct dmar_ctx *dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev, - uint16_t rid, bool id_mapped, bool rmrr_init); +struct dmar_ctx *dmar_get_ctx_for_dev(struct dmar_unit *dmar, + struct dmar_domain *domain, device_t dev, uint16_t rid, bool id_mapped, + bool rmrr_init); struct dmar_ctx *dmar_get_ctx_for_devpath(struct dmar_unit *dmar, uint16_t rid, int dev_domain, int dev_busno, const void *dev_path, int dev_path_len, bool id_mapped, bool rmrr_init); -int dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx); void dmar_free_ctx_locked_method(struct iommu_unit *dmar, struct iommu_ctx *ctx); struct dmar_ctx *dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid); @@ -244,6 +244,8 @@ bool cansleep); void dmar_domain_unload(struct iommu_domain *iodom, struct iommu_map_entries_tailq *entries, bool cansleep); +struct iommu_domain *dmar_vmm_domain_add_dev(struct iommu_unit *iommu, + struct iommu_domain *domain, device_t dev, uint16_t rid); void dmar_dev_parse_rmrr(struct dmar_domain *domain, int dev_domain, int dev_busno, const void *dev_path, int dev_path_len, @@ -263,8 +265,6 @@ bool activehi, int irq, u_int *cookie, uint32_t *hi, uint32_t *lo); 
 int dmar_unmap_ioapic_intr(u_int ioapic_id, u_int *cookie);
 
-int dmar_is_running(void);
-
 extern int haw;
 extern int dmar_rmrr_enable;
diff --git a/sys/x86/iommu/intel_drv.c b/sys/x86/iommu/intel_drv.c
--- a/sys/x86/iommu/intel_drv.c
+++ b/sys/x86/iommu/intel_drv.c
@@ -1329,6 +1329,7 @@
 	.get_ctx = dmar_get_ctx,
 	.free_ctx_locked = dmar_free_ctx_locked_method,
 	.find = dmar_find_method,
+	.vmm_domain_add_dev = dmar_vmm_domain_add_dev,
 	.alloc_msi_intr = dmar_alloc_msi_intr,
 	.map_msi_intr = dmar_map_msi_intr,
 	.unmap_msi_intr = dmar_unmap_msi_intr,
diff --git a/sys/x86/iommu/x86_iommu.h b/sys/x86/iommu/x86_iommu.h
--- a/sys/x86/iommu/x86_iommu.h
+++ b/sys/x86/iommu/x86_iommu.h
@@ -82,6 +82,8 @@
 	void (*free_ctx_locked)(struct iommu_unit *iommu,
 	    struct iommu_ctx *context);
 	struct iommu_unit *(*find)(device_t dev, bool verbose);
+	struct iommu_domain *(*vmm_domain_add_dev)(struct iommu_unit *iommu,
+	    struct iommu_domain *domain, device_t dev, uint16_t rid);
 	int (*alloc_msi_intr)(device_t src, u_int *cookies, u_int count);
 	int (*map_msi_intr)(device_t src, u_int cpu, u_int vector,
 	    u_int cookie, uint64_t *addr, uint32_t *data);
@@ -199,4 +201,7 @@
 void iommu_db_domain_print_contexts(struct iommu_domain *iodom);
 void iommu_db_domain_print_mappings(struct iommu_domain *iodom);
 
+int amdiommu_is_running(void);
+int dmar_is_running(void);
+
 #endif
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
--- a/usr.sbin/bhyve/pci_passthru.c
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -582,7 +582,7 @@
 	struct pci_devinst *pi;
 	struct pci_bar_io bar;
 	enum pcibar_type bartype;
-	uint64_t base, size;
+	uint64_t base, old_base, size;
 
 	pi = sc->psc_pi;
 
@@ -621,8 +621,14 @@
 			    "base %#lx or size %#lx not page aligned\n",
 			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
 			    sc->psc_sel.pc_func, i, base, size);
-				return (-1);
 			}
+			if ((base & PAGE_MASK) != 0) {
+				old_base = base;
+				base = trunc_page(base);
+				size += old_base - base;
+			}
+			if ((size & PAGE_MASK) != 0)
+				size = round_page(size);
 		}
 
 		/* Cache information about the "real" BAR */
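
The sketch below is illustrative only and is not part of the patch. It shows how a vmm-side caller is expected to drive the interface this change introduces: probe iommu_get_swcaps(IOMMU_CAP_BULK) and either hand a wired vm_page array to iommu_create_mapping_bulk() or fall back to per-page iommu_create_mapping(), following the same pattern as vm_iommu_map()/vm_iommu_map_bulk() above. The helper name and its arguments are hypothetical.

/*
 * Illustrative sketch, not part of the patch.  Assumes "ma" holds
 * atop(len) wired pages backing the guest-physical range starting at gpa,
 * and that hpa is the host-physical start of the same range.
 */
static int
example_map_guest_range(void *iommu_dom, vm_paddr_t gpa, vm_paddr_t hpa,
    vm_page_t *ma, size_t len)
{
	size_t off;
	int error;

	if (iommu_get_swcaps(IOMMU_CAP_BULK)) {
		/* One call covers the whole wired range. */
		return (iommu_create_mapping_bulk(iommu_dom, gpa, ma, len));
	}
	/* Legacy drivers (iommu_ops_intel/iommu_ops_amd): map page by page. */
	for (off = 0; off < len; off += PAGE_SIZE) {
		error = iommu_create_mapping(iommu_dom, gpa + off, hpa + off,
		    PAGE_SIZE);
		if (error != 0)
			return (error);
	}
	return (0);
}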