Index: head/sys/powerpc/powernv/opal_pci.c =================================================================== --- head/sys/powerpc/powernv/opal_pci.c (revision 366048) +++ head/sys/powerpc/powernv/opal_pci.c (revision 366049) @@ -1,724 +1,724 @@ /*- * Copyright (c) 2015-2016 Nathan Whitehorn * Copyright (c) 2017-2018 Semihalf * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pcib_if.h" #include "pic_if.h" #include "iommu_if.h" #include "opal.h" #define OPAL_PCI_TCE_MAX_ENTRIES (1024*1024UL) #define OPAL_PCI_TCE_DEFAULT_SEG_SIZE (16*1024*1024UL) #define OPAL_PCI_TCE_R (1UL << 0) #define OPAL_PCI_TCE_W (1UL << 1) #define PHB3_TCE_KILL_INVAL_ALL (1UL << 63) /* * Device interface. */ static int opalpci_probe(device_t); static int opalpci_attach(device_t); /* * pcib interface. */ static uint32_t opalpci_read_config(device_t, u_int, u_int, u_int, u_int, int); static void opalpci_write_config(device_t, u_int, u_int, u_int, u_int, u_int32_t, int); static int opalpci_alloc_msi(device_t dev, device_t child, int count, int maxcount, int *irqs); static int opalpci_release_msi(device_t dev, device_t child, int count, int *irqs); static int opalpci_alloc_msix(device_t dev, device_t child, int *irq); static int opalpci_release_msix(device_t dev, device_t child, int irq); static int opalpci_map_msi(device_t dev, device_t child, int irq, uint64_t *addr, uint32_t *data); static int opalpci_route_interrupt(device_t bus, device_t dev, int pin); /* * MSI PIC interface. 
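 * The PHB registers itself as a PIC only so that MSI EOIs can be chained
 * to OPAL via OPAL_PCI_MSI_EOI; everything else is passed through to
 * root_pic (see opalpic_pic_enable() and opalpic_pic_eoi() below).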
*/ static void opalpic_pic_enable(device_t dev, u_int irq, u_int vector, void **); static void opalpic_pic_eoi(device_t dev, u_int irq, void *); /* Bus interface */ static bus_dma_tag_t opalpci_get_dma_tag(device_t dev, device_t child); /* * Commands */ #define OPAL_M32_WINDOW_TYPE 1 #define OPAL_M64_WINDOW_TYPE 2 #define OPAL_IO_WINDOW_TYPE 3 #define OPAL_RESET_PHB_COMPLETE 1 #define OPAL_RESET_PCI_IODA_TABLE 6 #define OPAL_DISABLE_M64 0 #define OPAL_ENABLE_M64_SPLIT 1 #define OPAL_ENABLE_M64_NON_SPLIT 2 #define OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO 1 #define OPAL_EEH_ACTION_CLEAR_FREEZE_DMA 2 #define OPAL_EEH_ACTION_CLEAR_FREEZE_ALL 3 #define OPAL_EEH_STOPPED_NOT_FROZEN 0 /* * Constants */ #define OPAL_PCI_DEFAULT_PE 1 #define OPAL_PCI_BUS_SPACE_LOWADDR_32BIT 0x7FFFFFFFUL /* * Driver methods. */ static device_method_t opalpci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, opalpci_probe), DEVMETHOD(device_attach, opalpci_attach), /* pcib interface */ DEVMETHOD(pcib_read_config, opalpci_read_config), DEVMETHOD(pcib_write_config, opalpci_write_config), DEVMETHOD(pcib_alloc_msi, opalpci_alloc_msi), DEVMETHOD(pcib_release_msi, opalpci_release_msi), DEVMETHOD(pcib_alloc_msix, opalpci_alloc_msix), DEVMETHOD(pcib_release_msix, opalpci_release_msix), DEVMETHOD(pcib_map_msi, opalpci_map_msi), DEVMETHOD(pcib_route_interrupt, opalpci_route_interrupt), /* PIC interface for MSIs */ DEVMETHOD(pic_enable, opalpic_pic_enable), DEVMETHOD(pic_eoi, opalpic_pic_eoi), /* Bus interface */ DEVMETHOD(bus_get_dma_tag, opalpci_get_dma_tag), DEVMETHOD(bus_get_cpus, ofw_pcibus_get_cpus), DEVMETHOD(bus_get_domain, ofw_pcibus_get_domain), DEVMETHOD_END }; struct opalpci_softc { struct ofw_pci_softc ofw_sc; uint64_t phb_id; vmem_t *msi_vmem; int msi_base; /* Base XIVE number */ int base_msi_irq; /* Base IRQ assigned by FreeBSD to this PIC */ uint64_t *tce; /* TCE table for 1:1 mapping */ struct resource *r_reg; }; static devclass_t opalpci_devclass; DEFINE_CLASS_1(pcib, opalpci_driver, opalpci_methods, sizeof(struct opalpci_softc), ofw_pci_driver); EARLY_DRIVER_MODULE(opalpci, ofwbus, opalpci_driver, opalpci_devclass, 0, 0, BUS_PASS_BUS); static int opalpci_probe(device_t dev) { const char *type; if (opal_check() != 0) return (ENXIO); type = ofw_bus_get_type(dev); if (type == NULL || (strcmp(type, "pci") != 0 && strcmp(type, "pciex") != 0)) return (ENXIO); if (!OF_hasprop(ofw_bus_get_node(dev), "ibm,opal-phbid")) return (ENXIO); device_set_desc(dev, "OPAL Host-PCI bridge"); return (BUS_PROBE_GENERIC); } static void pci_phb3_tce_invalidate_entire(struct opalpci_softc *sc) { mb(); bus_write_8(sc->r_reg, 0x210, PHB3_TCE_KILL_INVAL_ALL); mb(); } /* Simple function to round up to a power of 2 */ static uint64_t round_pow2(uint64_t val) { return (1 << (flsl(val + (val - 1)) - 1)); } /* * Starting with skiboot 5.10 PCIe nodes have a new property, * "ibm,supported-tce-sizes", to denote the TCE sizes available. This allows us * to avoid hard-coding the maximum TCE size allowed, and instead provide a sane * default (however, the "sane" default, which works for all targets, is 64k, limiting us to 64GB if we have 1M entries).
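 *
 * For example (illustrative values): a node advertising the bit widths
 * { 12, 16, 24, 28 } offers 4KB, 64KB, 16MB and 256MB TCEs, and
 * max_tce_size() below returns the last cell, assumed to be the largest:
 * 1ULL << 28 = 256MB.  When the property is absent it falls back to
 * OPAL_PCI_TCE_DEFAULT_SEG_SIZE.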
*/ static uint64_t max_tce_size(device_t dev) { phandle_t node; cell_t sizes[64]; /* Property is a list of bit-widths, up to 64-bits */ int count; node = ofw_bus_get_node(dev); count = OF_getencprop(node, "ibm,supported-tce-sizes", sizes, sizeof(sizes)); if (count < (int) sizeof(cell_t)) return OPAL_PCI_TCE_DEFAULT_SEG_SIZE; count /= sizeof(cell_t); return (1ULL << sizes[count - 1]); } static int opalpci_attach(device_t dev) { struct opalpci_softc *sc; cell_t id[2], m64ranges[2], m64window[6], npe; phandle_t node; int i, err; uint64_t maxmem; uint64_t entries; uint64_t tce_size; uint64_t tce_tbl_size; int m64bar; int rid; sc = device_get_softc(dev); node = ofw_bus_get_node(dev); switch (OF_getproplen(node, "ibm,opal-phbid")) { case 8: OF_getencprop(node, "ibm,opal-phbid", id, 8); sc->phb_id = ((uint64_t)id[0] << 32) | id[1]; break; case 4: OF_getencprop(node, "ibm,opal-phbid", id, 4); sc->phb_id = id[0]; break; default: device_printf(dev, "PHB ID property had wrong length (%zd)\n", OF_getproplen(node, "ibm,opal-phbid")); return (ENXIO); } if (bootverbose) device_printf(dev, "OPAL ID %#lx\n", sc->phb_id); rid = 0; sc->r_reg = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE | RF_SHAREABLE); if (sc->r_reg == NULL) { device_printf(dev, "Failed to allocate PHB[%jd] registers\n", (uintmax_t)sc->phb_id); return (ENXIO); } #if 0 /* * Reset PCI IODA table */ err = opal_call(OPAL_PCI_RESET, sc->phb_id, OPAL_RESET_PCI_IODA_TABLE, 1); if (err != 0) { device_printf(dev, "IODA table reset failed: %d\n", err); return (ENXIO); } err = opal_call(OPAL_PCI_RESET, sc->phb_id, OPAL_RESET_PHB_COMPLETE, 1); if (err < 0) { device_printf(dev, "PHB reset failed: %d\n", err); return (ENXIO); } if (err > 0) { while ((err = opal_call(OPAL_PCI_POLL, sc->phb_id)) > 0) { DELAY(1000*(err + 1)); /* Returns expected delay in ms */ } } if (err < 0) { device_printf(dev, "WARNING: PHB IODA reset poll failed: %d\n", err); } err = opal_call(OPAL_PCI_RESET, sc->phb_id, OPAL_RESET_PHB_COMPLETE, 0); if (err < 0) { device_printf(dev, "PHB reset failed: %d\n", err); return (ENXIO); } if (err > 0) { while ((err = opal_call(OPAL_PCI_POLL, sc->phb_id)) > 0) { DELAY(1000*(err + 1)); /* Returns expected delay in ms */ } } #endif /* * Map all devices on the bus to partitionable endpoint one until * such time as we start wanting to do things like bhyve. */ err = opal_call(OPAL_PCI_SET_PE, sc->phb_id, OPAL_PCI_DEFAULT_PE, 0, OPAL_PCI_BUS_ANY, OPAL_IGNORE_RID_DEVICE_NUMBER, OPAL_IGNORE_RID_FUNC_NUMBER, OPAL_MAP_PE); if (err != 0) { device_printf(dev, "PE mapping failed: %d\n", err); return (ENXIO); } /* * Turn on MMIO, mapped to PE 1 */ if (OF_getencprop(node, "ibm,opal-num-pes", &npe, 4) != 4) npe = 1; for (i = 0; i < npe; i++) { err = opal_call(OPAL_PCI_MAP_PE_MMIO_WINDOW, sc->phb_id, OPAL_PCI_DEFAULT_PE, OPAL_M32_WINDOW_TYPE, 0, i); if (err != 0) device_printf(dev, "MMIO %d map failed: %d\n", i, err); } if (OF_getencprop(node, "ibm,opal-available-m64-ranges", m64ranges, sizeof(m64ranges)) == sizeof(m64ranges)) m64bar = m64ranges[0]; else m64bar = 0; /* XXX: multiple M64 windows? 
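 *	(Only the first range from "ibm,opal-available-m64-ranges" is used
 *	below; the single window described by "ibm,opal-m64-window" is
 *	mapped through that one M64 BAR.)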
*/ if (OF_getencprop(node, "ibm,opal-m64-window", m64window, sizeof(m64window)) == sizeof(m64window)) { opal_call(OPAL_PCI_PHB_MMIO_ENABLE, sc->phb_id, OPAL_M64_WINDOW_TYPE, m64bar, 0); opal_call(OPAL_PCI_SET_PHB_MEM_WINDOW, sc->phb_id, OPAL_M64_WINDOW_TYPE, m64bar /* index */, ((uint64_t)m64window[2] << 32) | m64window[3], 0, ((uint64_t)m64window[4] << 32) | m64window[5]); opal_call(OPAL_PCI_MAP_PE_MMIO_WINDOW, sc->phb_id, OPAL_PCI_DEFAULT_PE, OPAL_M64_WINDOW_TYPE, m64bar /* index */, 0); opal_call(OPAL_PCI_PHB_MMIO_ENABLE, sc->phb_id, OPAL_M64_WINDOW_TYPE, m64bar, OPAL_ENABLE_M64_NON_SPLIT); } /* * Enable IOMMU for PE1 - map everything 1:1 using * segments of max_tce_size size */ tce_size = max_tce_size(dev); maxmem = roundup2(powerpc_ptob(Maxmem), tce_size); entries = round_pow2(maxmem / tce_size); tce_tbl_size = MAX(entries * sizeof(uint64_t), 4096); if (entries > OPAL_PCI_TCE_MAX_ENTRIES) panic("POWERNV supports only %jdGB of memory space\n", (uintmax_t)((OPAL_PCI_TCE_MAX_ENTRIES * tce_size) >> 30)); if (bootverbose) device_printf(dev, "Mapping 0-%#jx for DMA\n", (uintmax_t)maxmem); sc->tce = contigmalloc(tce_tbl_size, M_DEVBUF, M_NOWAIT | M_ZERO, 0, BUS_SPACE_MAXADDR, tce_tbl_size, 0); if (sc->tce == NULL) panic("Failed to allocate TCE memory for PHB %jd\n", (uintmax_t)sc->phb_id); for (i = 0; i < entries; i++) - sc->tce[i] = (i * tce_size) | OPAL_PCI_TCE_R | OPAL_PCI_TCE_W; + sc->tce[i] = htobe64((i * tce_size) | OPAL_PCI_TCE_R | OPAL_PCI_TCE_W); /* Map TCE for every PE. It seems necessary for Power8 */ for (i = 0; i < npe; i++) { err = opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW, sc->phb_id, i, (i << 1), 1, pmap_kextract((uint64_t)&sc->tce[0]), tce_tbl_size, tce_size); if (err != 0) { device_printf(dev, "DMA IOMMU mapping failed: %d\n", err); return (ENXIO); } err = opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW_REAL, sc->phb_id, i, (i << 1) + 1, (1UL << 59), maxmem); if (err != 0) { device_printf(dev, "DMA 64b bypass mapping failed: %d\n", err); return (ENXIO); } } /* * Invalidate all previous TCE entries. */ if (ofw_bus_is_compatible(dev, "power8-pciex")) pci_phb3_tce_invalidate_entire(sc); else opal_call(OPAL_PCI_TCE_KILL, sc->phb_id, OPAL_PCI_TCE_KILL_ALL, OPAL_PCI_DEFAULT_PE, 0, 0, 0); /* * Get MSI properties */ sc->msi_vmem = NULL; if (OF_getproplen(node, "ibm,opal-msi-ranges") > 0) { cell_t msi_ranges[2]; OF_getencprop(node, "ibm,opal-msi-ranges", msi_ranges, sizeof(msi_ranges)); sc->msi_base = msi_ranges[0]; sc->msi_vmem = vmem_create("OPAL MSI", msi_ranges[0], msi_ranges[1], 1, 0, M_BESTFIT | M_WAITOK); sc->base_msi_irq = powerpc_register_pic(dev, OF_xref_from_node(node), msi_ranges[0] + msi_ranges[1], 0, FALSE); if (bootverbose) device_printf(dev, "Supports %d MSIs starting at %d\n", msi_ranges[1], msi_ranges[0]); } /* Create the parent DMA tag */ /* * Constrain it to POWER8 PHB (ioda2) for now. It seems to mess up on * POWER9 systems. */ if (ofw_bus_is_compatible(dev, "ibm,ioda2-phb")) { err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ OPAL_PCI_BUS_SPACE_LOWADDR_32BIT, /* lowaddr */ BUS_SPACE_MAXADDR_32BIT, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE, /* maxsize */ BUS_SPACE_UNRESTRICTED, /* nsegments */ BUS_SPACE_MAXSIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->ofw_sc.sc_dmat); if (err != 0) { device_printf(dev, "Failed to create DMA tag\n"); return (err); } } /* * General OFW PCI attach */ err = ofw_pci_init(dev); if (err != 0) return (err); /* * Unfreeze non-config-space PCI operations. 
Let this fail silently * if e.g. there is no current freeze. */ opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, sc->phb_id, OPAL_PCI_DEFAULT_PE, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); /* * OPAL stores 64-bit BARs in a special property rather than "ranges" */ if (OF_getencprop(node, "ibm,opal-m64-window", m64window, sizeof(m64window)) == sizeof(m64window)) { struct ofw_pci_range *rp; sc->ofw_sc.sc_nrange++; sc->ofw_sc.sc_range = realloc(sc->ofw_sc.sc_range, sc->ofw_sc.sc_nrange * sizeof(sc->ofw_sc.sc_range[0]), M_DEVBUF, M_WAITOK); rp = &sc->ofw_sc.sc_range[sc->ofw_sc.sc_nrange-1]; rp->pci_hi = OFW_PCI_PHYS_HI_SPACE_MEM64 | OFW_PCI_PHYS_HI_PREFETCHABLE; rp->pci = ((uint64_t)m64window[0] << 32) | m64window[1]; rp->host = ((uint64_t)m64window[2] << 32) | m64window[3]; rp->size = ((uint64_t)m64window[4] << 32) | m64window[5]; rman_manage_region(&sc->ofw_sc.sc_mem_rman, rp->pci, rp->pci + rp->size - 1); } return (ofw_pci_attach(dev)); } static uint32_t opalpci_read_config(device_t dev, u_int bus, u_int slot, u_int func, u_int reg, int width) { struct opalpci_softc *sc; uint64_t config_addr; uint8_t byte, eeh_state; uint16_t half; uint32_t word; int error; uint16_t err_type; sc = device_get_softc(dev); config_addr = (bus << 8) | ((slot & 0x1f) << 3) | (func & 0x7); switch (width) { case 1: error = opal_call(OPAL_PCI_CONFIG_READ_BYTE, sc->phb_id, config_addr, reg, vtophys(&byte)); word = byte; break; case 2: error = opal_call(OPAL_PCI_CONFIG_READ_HALF_WORD, sc->phb_id, config_addr, reg, vtophys(&half)); word = be16toh(half); break; case 4: error = opal_call(OPAL_PCI_CONFIG_READ_WORD, sc->phb_id, config_addr, reg, vtophys(&word)); word = be32toh(word); break; default: error = OPAL_SUCCESS; word = 0xffffffff; width = 4; } /* * Poking config state for non-existent devices can make * the host bridge hang up. Clear any errors. */ if (error != OPAL_SUCCESS || (word == ((1UL << (8 * width)) - 1))) { if (error != OPAL_HARDWARE) { opal_call(OPAL_PCI_EEH_FREEZE_STATUS, sc->phb_id, OPAL_PCI_DEFAULT_PE, vtophys(&eeh_state), vtophys(&err_type), NULL); err_type = be16toh(err_type); /* XXX unused */ if (eeh_state != OPAL_EEH_STOPPED_NOT_FROZEN) opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, sc->phb_id, OPAL_PCI_DEFAULT_PE, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); } if (error != OPAL_SUCCESS) word = 0xffffffff; } return (word); } static void opalpci_write_config(device_t dev, u_int bus, u_int slot, u_int func, u_int reg, uint32_t val, int width) { struct opalpci_softc *sc; uint64_t config_addr; int error = OPAL_SUCCESS; sc = device_get_softc(dev); config_addr = (bus << 8) | ((slot & 0x1f) << 3) | (func & 0x7); switch (width) { case 1: error = opal_call(OPAL_PCI_CONFIG_WRITE_BYTE, sc->phb_id, config_addr, reg, val); break; case 2: error = opal_call(OPAL_PCI_CONFIG_WRITE_HALF_WORD, sc->phb_id, config_addr, reg, val); break; case 4: error = opal_call(OPAL_PCI_CONFIG_WRITE_WORD, sc->phb_id, config_addr, reg, val); break; } if (error != OPAL_SUCCESS) { /* * Poking config state for non-existent devices can make * the host bridge hang up. Clear any errors.
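 * Unlike the read path, there is no need to query the freeze status
 * first; any error other than OPAL_HARDWARE just clears all freezes.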
*/ if (error != OPAL_HARDWARE) { opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, sc->phb_id, OPAL_PCI_DEFAULT_PE, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); } } } static int opalpci_route_interrupt(device_t bus, device_t dev, int pin) { return (pin); } static int opalpci_alloc_msi(device_t dev, device_t child, int count, int maxcount, int *irqs) { struct opalpci_softc *sc; vmem_addr_t start; phandle_t xref; int err, i; sc = device_get_softc(dev); if (sc->msi_vmem == NULL) return (ENODEV); err = vmem_xalloc(sc->msi_vmem, count, powerof2(count), 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_BESTFIT | M_WAITOK, &start); if (err) return (err); xref = OF_xref_from_node(ofw_bus_get_node(dev)); for (i = 0; i < count; i++) irqs[i] = MAP_IRQ(xref, start + i); return (0); } static int opalpci_release_msi(device_t dev, device_t child, int count, int *irqs) { struct opalpci_softc *sc; sc = device_get_softc(dev); if (sc->msi_vmem == NULL) return (ENODEV); vmem_xfree(sc->msi_vmem, irqs[0] - sc->base_msi_irq, count); return (0); } static int opalpci_alloc_msix(device_t dev, device_t child, int *irq) { return (opalpci_alloc_msi(dev, child, 1, 1, irq)); } static int opalpci_release_msix(device_t dev, device_t child, int irq) { return (opalpci_release_msi(dev, child, 1, &irq)); } static int opalpci_map_msi(device_t dev, device_t child, int irq, uint64_t *addr, uint32_t *data) { struct opalpci_softc *sc; struct pci_devinfo *dinfo; int err, xive; sc = device_get_softc(dev); if (sc->msi_vmem == NULL) return (ENODEV); xive = irq - sc->base_msi_irq - sc->msi_base; opal_call(OPAL_PCI_SET_XIVE_PE, sc->phb_id, OPAL_PCI_DEFAULT_PE, xive); dinfo = device_get_ivars(child); if (dinfo->cfg.msi.msi_alloc > 0 && (dinfo->cfg.msi.msi_ctrl & PCIM_MSICTRL_64BIT) == 0) { uint32_t msi32; err = opal_call(OPAL_GET_MSI_32, sc->phb_id, OPAL_PCI_DEFAULT_PE, xive, 1, vtophys(&msi32), vtophys(data)); *addr = be32toh(msi32); } else { err = opal_call(OPAL_GET_MSI_64, sc->phb_id, OPAL_PCI_DEFAULT_PE, xive, 1, vtophys(addr), vtophys(data)); *addr = be64toh(*addr); } *data = be32toh(*data); if (bootverbose && err != 0) device_printf(child, "OPAL MSI mapping error: %d\n", err); return ((err == 0) ? 0 : ENXIO); } static void opalpic_pic_enable(device_t dev, u_int irq, u_int vector, void **priv) { struct opalpci_softc *sc = device_get_softc(dev); PIC_ENABLE(root_pic, irq, vector, priv); opal_call(OPAL_PCI_MSI_EOI, sc->phb_id, irq, priv); } static void opalpic_pic_eoi(device_t dev, u_int irq, void *priv) { struct opalpci_softc *sc; sc = device_get_softc(dev); opal_call(OPAL_PCI_MSI_EOI, sc->phb_id, irq); PIC_EOI(root_pic, irq, priv); } static bus_dma_tag_t opalpci_get_dma_tag(device_t dev, device_t child) { struct opalpci_softc *sc; sc = device_get_softc(dev); return (sc->ofw_sc.sc_dmat); } Index: head/sys/powerpc/powernv/xive.c =================================================================== --- head/sys/powerpc/powernv/xive.c (revision 366048) +++ head/sys/powerpc/powernv/xive.c (revision 366049) @@ -1,767 +1,780 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright 2019 Justin Hibbits * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef POWERNV #include #endif #include "pic_if.h" #define XIVE_PRIORITY 7 /* Random non-zero number */ #define MAX_XIVE_IRQS (1<<24) /* 24-bit XIRR field */ /* Registers */ #define XIVE_TM_QW1_OS 0x010 /* Guest OS registers */ #define XIVE_TM_QW2_HV_POOL 0x020 /* Hypervisor pool registers */ #define XIVE_TM_QW3_HV 0x030 /* Hypervisor registers */ #define XIVE_TM_NSR 0x00 #define XIVE_TM_CPPR 0x01 #define XIVE_TM_IPB 0x02 #define XIVE_TM_LSMFB 0x03 #define XIVE_TM_ACK_CNT 0x04 #define XIVE_TM_INC 0x05 #define XIVE_TM_AGE 0x06 #define XIVE_TM_PIPR 0x07 #define TM_WORD0 0x0 #define TM_WORD2 0x8 #define TM_QW2W2_VP 0x80000000 #define XIVE_TM_SPC_ACK 0x800 #define TM_QW3NSR_HE_SHIFT 14 #define TM_QW3_NSR_HE_NONE 0 #define TM_QW3_NSR_HE_POOL 1 #define TM_QW3_NSR_HE_PHYS 2 #define TM_QW3_NSR_HE_LSI 3 #define XIVE_TM_SPC_PULL_POOL_CTX 0x828 #define XIVE_IRQ_LOAD_EOI 0x000 #define XIVE_IRQ_STORE_EOI 0x400 #define XIVE_IRQ_PQ_00 0xc00 #define XIVE_IRQ_PQ_01 0xd00 #define XIVE_IRQ_VAL_P 0x02 #define XIVE_IRQ_VAL_Q 0x01 struct xive_softc; struct xive_irq; extern void (*powernv_smp_ap_extra_init)(void); /* Private support */ static void xive_setup_cpu(void); static void xive_smp_cpu_startup(void); static void xive_init_irq(struct xive_irq *irqd, u_int irq); static struct xive_irq *xive_configure_irq(u_int irq); static int xive_provision_page(struct xive_softc *sc); /* Interfaces */ static int xive_probe(device_t); static int xive_attach(device_t); static int xics_probe(device_t); static int xics_attach(device_t); static void xive_bind(device_t, u_int, cpuset_t, void **); static void xive_dispatch(device_t, struct trapframe *); static void xive_enable(device_t, u_int, u_int, void **); static void xive_eoi(device_t, u_int, void *); static void xive_ipi(device_t, u_int); static void xive_mask(device_t, u_int, void *); static void xive_unmask(device_t, u_int, void *); static void xive_translate_code(device_t dev, u_int irq, int code, enum intr_trigger *trig, enum intr_polarity *pol); static device_method_t xive_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xive_probe), DEVMETHOD(device_attach, xive_attach), /* PIC interface */ DEVMETHOD(pic_bind, xive_bind), DEVMETHOD(pic_dispatch, xive_dispatch), DEVMETHOD(pic_enable, xive_enable), DEVMETHOD(pic_eoi, xive_eoi), DEVMETHOD(pic_ipi, xive_ipi), DEVMETHOD(pic_mask, xive_mask), DEVMETHOD(pic_unmask, 
xive_unmask), DEVMETHOD(pic_translate_code, xive_translate_code), DEVMETHOD_END }; static device_method_t xics_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xics_probe), DEVMETHOD(device_attach, xics_attach), DEVMETHOD_END }; struct xive_softc { struct mtx sc_mtx; struct resource *sc_mem; vm_size_t sc_prov_page_size; uint32_t sc_offset; }; struct xive_queue { uint32_t *q_page; uint32_t *q_eoi_page; uint32_t q_toggle; uint32_t q_size; uint32_t q_index; uint32_t q_mask; }; struct xive_irq { uint32_t girq; uint32_t lirq; uint64_t vp; uint64_t flags; #define OPAL_XIVE_IRQ_EOI_VIA_FW 0x00000020 #define OPAL_XIVE_IRQ_MASK_VIA_FW 0x00000010 #define OPAL_XIVE_IRQ_SHIFT_BUG 0x00000008 #define OPAL_XIVE_IRQ_LSI 0x00000004 #define OPAL_XIVE_IRQ_STORE_EOI 0x00000002 #define OPAL_XIVE_IRQ_TRIGGER_PAGE 0x00000001 uint8_t prio; vm_offset_t eoi_page; vm_offset_t trig_page; vm_size_t esb_size; int chip; }; struct xive_cpu { uint64_t vp; uint64_t flags; struct xive_irq ipi_data; struct xive_queue queue; /* We only use a single queue for now. */ uint64_t cam; uint32_t chip; }; static driver_t xive_driver = { "xive", xive_methods, sizeof(struct xive_softc) }; static driver_t xics_driver = { "xivevc", xics_methods, 0 }; static devclass_t xive_devclass; static devclass_t xics_devclass; EARLY_DRIVER_MODULE(xive, ofwbus, xive_driver, xive_devclass, 0, 0, BUS_PASS_INTERRUPT-1); EARLY_DRIVER_MODULE(xivevc, ofwbus, xics_driver, xics_devclass, 0, 0, BUS_PASS_INTERRUPT); MALLOC_DEFINE(M_XIVE, "xive", "XIVE Memory"); DPCPU_DEFINE_STATIC(struct xive_cpu, xive_cpu_data); static int xive_ipi_vector = -1; /* * XIVE Exploitation mode driver. * * The XIVE, present in the POWER9 CPU, can run in two modes: XICS emulation * mode, and "Exploitation mode". XICS emulation mode is compatible with the * POWER8 and earlier XICS interrupt controller, using OPAL calls to emulate * hypervisor calls and memory accesses. Exploitation mode gives us raw access * to the XIVE MMIO, improving performance significantly. * * The XIVE controller is a very bizarre interrupt controller. It uses queues * in memory to pass interrupts around, and maps itself into 512GB of physical * device address space, giving each interrupt in the system one or more pages * of address space. An IRQ is tied to a virtual processor, which could be a * physical CPU thread, or a guest CPU thread (LPAR running on a physical * thread). Thus, the controller can route interrupts directly to guest OSes * bypassing processing by the hypervisor, thereby improving performance of the * guest OS. * * An IRQ, in addition to being tied to a virtual processor, has one or two * page mappings: an EOI page, and an optional trigger page. The trigger page * could be the same as the EOI page. Level-sensitive interrupts (LSIs) don't * have a trigger page, as they're external interrupts controlled by physical * lines. MSIs and IPIs have trigger pages. An IPI is really just another IRQ * in the XIVE, which is triggered by software. * * An interesting behavior of the XIVE controller is that oftentimes the * contents of an address location don't actually matter, but the direction of * the action is the signifier (read vs write), and the address is significant. * Hence, masking and unmasking an interrupt is done by reading different * addresses in the EOI page, and triggering an interrupt consists of writing to * the trigger page. * * Additionally, the MMIO region mapped is CPU-sensitive, just like the * per-processor register space (private access) in OpenPIC. 
In order for a CPU * to receive interrupts it must itself configure its CPPR (Current Processor * Priority Register), it cannot be set by any other processor. This * necessitates the xive_smp_cpu_startup() function. * * Queues are pages of memory, sized powers-of-two, that are shared with the * XIVE. The XIVE writes into the queue with an alternating polarity bit, which * flips when the queue wraps. */ /* * Offset-based read/write interfaces. */ static uint16_t xive_read_2(struct xive_softc *sc, bus_size_t offset) { return (bus_read_2(sc->sc_mem, sc->sc_offset + offset)); } static void xive_write_1(struct xive_softc *sc, bus_size_t offset, uint8_t val) { bus_write_1(sc->sc_mem, sc->sc_offset + offset, val); } /* EOI and Trigger page access interfaces. */ static uint64_t xive_read_mmap8(vm_offset_t addr) { return (*(volatile uint64_t *)addr); } static void xive_write_mmap8(vm_offset_t addr, uint64_t val) { *(uint64_t *)(addr) = val; } /* Device interfaces. */ static int xive_probe(device_t dev) { if (!ofw_bus_is_compatible(dev, "ibm,opal-xive-pe")) return (ENXIO); device_set_desc(dev, "External Interrupt Virtualization Engine"); /* Make sure we always win against the xicp driver. */ return (BUS_PROBE_DEFAULT); } static int xics_probe(device_t dev) { if (!ofw_bus_is_compatible(dev, "ibm,opal-xive-vc")) return (ENXIO); device_set_desc(dev, "External Interrupt Virtualization Engine Root"); return (BUS_PROBE_DEFAULT); } static int xive_attach(device_t dev) { struct xive_softc *sc = device_get_softc(dev); struct xive_cpu *xive_cpud; phandle_t phandle = ofw_bus_get_node(dev); int64_t vp_block; int error; int rid; int i, order; uint64_t vp_id; int64_t ipi_irq; opal_call(OPAL_XIVE_RESET, OPAL_XIVE_XICS_MODE_EXP); error = OF_getencprop(phandle, "ibm,xive-provision-page-size", (pcell_t *)&sc->sc_prov_page_size, sizeof(sc->sc_prov_page_size)); rid = 1; /* Get the Hypervisor-level register set. */ sc->sc_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); sc->sc_offset = XIVE_TM_QW3_HV; mtx_init(&sc->sc_mtx, "XIVE", NULL, MTX_DEF); /* Workaround for qemu single-thread powernv */ if (mp_maxid == 0) order = 1; else order = fls(mp_maxid + (mp_maxid - 1)) - 1; do { vp_block = opal_call(OPAL_XIVE_ALLOCATE_VP_BLOCK, order); if (vp_block == OPAL_BUSY) DELAY(10); else if (vp_block == OPAL_XIVE_PROVISIONING) xive_provision_page(sc); else break; } while (1); if (vp_block < 0) { device_printf(dev, "Unable to allocate VP block. Opal error %d\n", (int)vp_block); bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->sc_mem); return (ENXIO); } /* * Set up the VPs. Try to do as much as we can in attach, to lessen * what's needed at AP spawn time. */ CPU_FOREACH(i) { vp_id = pcpu_find(i)->pc_hwref; xive_cpud = DPCPU_ID_PTR(i, xive_cpu_data); xive_cpud->vp = vp_id + vp_block; opal_call(OPAL_XIVE_GET_VP_INFO, xive_cpud->vp, NULL, vtophys(&xive_cpud->cam), NULL, vtophys(&xive_cpud->chip)); + xive_cpud->cam = be64toh(xive_cpud->cam); + xive_cpud->chip = be64toh(xive_cpud->chip); + /* Allocate the queue page and populate the queue state data. 
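 * With 4KB pages this yields 1024 32-bit event-queue entries, so q_mask
 * below works out to 0x3ff.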
*/ xive_cpud->queue.q_page = contigmalloc(PAGE_SIZE, M_XIVE, M_ZERO | M_WAITOK, 0, BUS_SPACE_MAXADDR, PAGE_SIZE, 0); xive_cpud->queue.q_size = 1 << PAGE_SHIFT; xive_cpud->queue.q_mask = ((xive_cpud->queue.q_size / sizeof(int)) - 1); xive_cpud->queue.q_toggle = 0; xive_cpud->queue.q_index = 0; do { error = opal_call(OPAL_XIVE_SET_VP_INFO, xive_cpud->vp, OPAL_XIVE_VP_ENABLED, 0); } while (error == OPAL_BUSY); error = opal_call(OPAL_XIVE_SET_QUEUE_INFO, vp_id, XIVE_PRIORITY, vtophys(xive_cpud->queue.q_page), PAGE_SHIFT, OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED); do { ipi_irq = opal_call(OPAL_XIVE_ALLOCATE_IRQ, xive_cpud->chip); } while (ipi_irq == OPAL_BUSY); if (ipi_irq < 0) device_printf(root_pic, "Failed allocating IPI. OPAL error %d\n", (int)ipi_irq); else { xive_init_irq(&xive_cpud->ipi_data, ipi_irq); xive_cpud->ipi_data.vp = vp_id; xive_cpud->ipi_data.lirq = MAX_XIVE_IRQS; opal_call(OPAL_XIVE_SET_IRQ_CONFIG, ipi_irq, xive_cpud->ipi_data.vp, XIVE_PRIORITY, MAX_XIVE_IRQS); } } powerpc_register_pic(dev, OF_xref_from_node(phandle), MAX_XIVE_IRQS, 1 /* Number of IPIs */, FALSE); root_pic = dev; xive_setup_cpu(); powernv_smp_ap_extra_init = xive_smp_cpu_startup; return (0); } static int xics_attach(device_t dev) { phandle_t phandle = ofw_bus_get_node(dev); /* The XIVE (root PIC) will handle all our interrupts */ powerpc_register_pic(root_pic, OF_xref_from_node(phandle), MAX_XIVE_IRQS, 1 /* Number of IPIs */, FALSE); return (0); } /* * PIC I/F methods. */ static void xive_bind(device_t dev, u_int irq, cpuset_t cpumask, void **priv) { struct xive_irq *irqd; int cpu; int ncpus, i, error; if (*priv == NULL) *priv = xive_configure_irq(irq); irqd = *priv; /* * This doesn't appear to actually support affinity groups, so pick a * random CPU. */ ncpus = 0; CPU_FOREACH(cpu) if (CPU_ISSET(cpu, &cpumask)) ncpus++; i = mftb() % ncpus; ncpus = 0; CPU_FOREACH(cpu) { if (!CPU_ISSET(cpu, &cpumask)) continue; if (ncpus == i) break; ncpus++; } opal_call(OPAL_XIVE_SYNC, OPAL_XIVE_SYNC_QUEUE, irq); irqd->vp = pcpu_find(cpu)->pc_hwref; error = opal_call(OPAL_XIVE_SET_IRQ_CONFIG, irq, irqd->vp, XIVE_PRIORITY, irqd->lirq); if (error < 0) panic("Cannot bind interrupt %d to CPU %d", irq, cpu); xive_eoi(dev, irq, irqd); } /* Read the next entry in the queue page and update the index. */ static int xive_read_eq(struct xive_queue *q) { uint32_t i = be32toh(q->q_page[q->q_index]); /* Check validity, using current queue polarity. */ if ((i >> 31) == q->q_toggle) return (0); q->q_index = (q->q_index + 1) & q->q_mask; if (q->q_index == 0) q->q_toggle ^= 1; return (i & 0x7fffffff); } static void xive_dispatch(device_t dev, struct trapframe *tf) { struct xive_softc *sc; struct xive_cpu *xive_cpud; uint32_t vector; uint16_t ack; uint8_t cppr, he; sc = device_get_softc(dev); xive_cpud = DPCPU_PTR(xive_cpu_data); for (;;) { ack = xive_read_2(sc, XIVE_TM_SPC_ACK); cppr = (ack & 0xff); he = ack >> TM_QW3NSR_HE_SHIFT; if (he == TM_QW3_NSR_HE_NONE) break; else if (__predict_false(he != TM_QW3_NSR_HE_PHYS)) { /* * We don't support TM_QW3_NSR_HE_POOL or * TM_QW3_NSR_HE_LSI interrupts. 
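 * If one shows up anyway, log it and jump to "end", which rewrites the
 * CPPR so further interrupts can be accepted.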
*/ device_printf(dev, "Unexpected interrupt he type: %d\n", he); goto end; } xive_write_1(sc, XIVE_TM_CPPR, cppr); for (;;) { vector = xive_read_eq(&xive_cpud->queue); if (vector == 0) break; if (vector == MAX_XIVE_IRQS) vector = xive_ipi_vector; powerpc_dispatch_intr(vector, tf); } } end: xive_write_1(sc, XIVE_TM_CPPR, 0xff); } static void xive_enable(device_t dev, u_int irq, u_int vector, void **priv) { struct xive_irq *irqd; cell_t status, cpu; if (irq == MAX_XIVE_IRQS) { if (xive_ipi_vector == -1) xive_ipi_vector = vector; return; } if (*priv == NULL) *priv = xive_configure_irq(irq); irqd = *priv; /* Bind to this CPU to start */ cpu = PCPU_GET(hwref); irqd->lirq = vector; for (;;) { status = opal_call(OPAL_XIVE_SET_IRQ_CONFIG, irq, cpu, XIVE_PRIORITY, vector); if (status != OPAL_BUSY) break; DELAY(10); } if (status != 0) panic("OPAL_SET_XIVE IRQ %d -> cpu %d failed: %d", irq, cpu, status); xive_unmask(dev, irq, *priv); } static void xive_eoi(device_t dev, u_int irq, void *priv) { struct xive_irq *rirq; struct xive_cpu *cpud; uint8_t eoi_val; if (irq == MAX_XIVE_IRQS) { cpud = DPCPU_PTR(xive_cpu_data); rirq = &cpud->ipi_data; } else rirq = priv; if (rirq->flags & OPAL_XIVE_IRQ_EOI_VIA_FW) opal_call(OPAL_INT_EOI, irq); else if (rirq->flags & OPAL_XIVE_IRQ_STORE_EOI) xive_write_mmap8(rirq->eoi_page + XIVE_IRQ_STORE_EOI, 0); else if (rirq->flags & OPAL_XIVE_IRQ_LSI) xive_read_mmap8(rirq->eoi_page + XIVE_IRQ_LOAD_EOI); else { eoi_val = xive_read_mmap8(rirq->eoi_page + XIVE_IRQ_PQ_00); if ((eoi_val & XIVE_IRQ_VAL_Q) && rirq->trig_page != 0) xive_write_mmap8(rirq->trig_page, 0); } } static void xive_ipi(device_t dev, u_int cpu) { struct xive_cpu *xive_cpud; xive_cpud = DPCPU_ID_PTR(cpu, xive_cpu_data); if (xive_cpud->ipi_data.trig_page == 0) return; xive_write_mmap8(xive_cpud->ipi_data.trig_page, 0); } static void xive_mask(device_t dev, u_int irq, void *priv) { struct xive_irq *rirq; /* Never mask IPIs */ if (irq == MAX_XIVE_IRQS) return; rirq = priv; if (!(rirq->flags & OPAL_XIVE_IRQ_LSI)) return; xive_read_mmap8(rirq->eoi_page + XIVE_IRQ_PQ_01); } static void xive_unmask(device_t dev, u_int irq, void *priv) { struct xive_irq *rirq; rirq = priv; xive_read_mmap8(rirq->eoi_page + XIVE_IRQ_PQ_00); } static void xive_translate_code(device_t dev, u_int irq, int code, enum intr_trigger *trig, enum intr_polarity *pol) { switch (code) { case 0: /* L to H edge */ *trig = INTR_TRIGGER_EDGE; *pol = INTR_POLARITY_HIGH; break; case 1: /* Active L level */ *trig = INTR_TRIGGER_LEVEL; *pol = INTR_POLARITY_LOW; break; default: *trig = INTR_TRIGGER_CONFORM; *pol = INTR_POLARITY_CONFORM; } } /* Private functions. */ /* * Setup the current CPU. Called by the BSP at driver attachment, and by each * AP at wakeup (via xive_smp_cpu_startup()). */ static void xive_setup_cpu(void) { struct xive_softc *sc; struct xive_cpu *cpup; uint32_t val; cpup = DPCPU_PTR(xive_cpu_data); sc = device_get_softc(root_pic); val = bus_read_4(sc->sc_mem, XIVE_TM_QW2_HV_POOL + TM_WORD2); if (val & TM_QW2W2_VP) bus_read_8(sc->sc_mem, XIVE_TM_SPC_PULL_POOL_CTX); bus_write_4(sc->sc_mem, XIVE_TM_QW2_HV_POOL + TM_WORD0, 0xff); bus_write_4(sc->sc_mem, XIVE_TM_QW2_HV_POOL + TM_WORD2, TM_QW2W2_VP | cpup->cam); xive_unmask(root_pic, cpup->ipi_data.girq, &cpup->ipi_data); xive_write_1(sc, XIVE_TM_CPPR, 0xff); } /* Populate an IRQ structure, mapping the EOI and trigger pages. 
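 * OPAL fills these fields in big-endian byte order, so each one is
 * byte-swapped to host order right after the call returns (a no-op on
 * big-endian kernels).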
*/ static void xive_init_irq(struct xive_irq *irqd, u_int irq) { uint64_t eoi_phys, trig_phys; uint32_t esb_shift; opal_call(OPAL_XIVE_GET_IRQ_INFO, irq, vtophys(&irqd->flags), vtophys(&eoi_phys), vtophys(&trig_phys), vtophys(&esb_shift), vtophys(&irqd->chip)); + irqd->flags = be64toh(irqd->flags); + eoi_phys = be64toh(eoi_phys); + trig_phys = be64toh(trig_phys); + esb_shift = be32toh(esb_shift); + irqd->chip = be32toh(irqd->chip); + irqd->girq = irq; irqd->esb_size = 1 << esb_shift; irqd->eoi_page = (vm_offset_t)pmap_mapdev(eoi_phys, irqd->esb_size); if (eoi_phys == trig_phys) irqd->trig_page = irqd->eoi_page; else if (trig_phys != 0) irqd->trig_page = (vm_offset_t)pmap_mapdev(trig_phys, irqd->esb_size); else irqd->trig_page = 0; opal_call(OPAL_XIVE_GET_IRQ_CONFIG, irq, vtophys(&irqd->vp), vtophys(&irqd->prio), vtophys(&irqd->lirq)); + + irqd->vp = be64toh(irqd->vp); + irqd->prio = be64toh(irqd->prio); + irqd->lirq = be32toh(irqd->lirq); } /* Allocate an IRQ struct before populating it. */ static struct xive_irq * xive_configure_irq(u_int irq) { struct xive_irq *irqd; irqd = malloc(sizeof(struct xive_irq), M_XIVE, M_WAITOK); xive_init_irq(irqd, irq); return (irqd); } /* * Part of the OPAL API. OPAL_XIVE_ALLOCATE_VP_BLOCK might require more pages, * provisioned through this call. */ static int xive_provision_page(struct xive_softc *sc) { void *prov_page; int error; do { prov_page = contigmalloc(sc->sc_prov_page_size, M_XIVE, 0, 0, BUS_SPACE_MAXADDR, sc->sc_prov_page_size, sc->sc_prov_page_size); error = opal_call(OPAL_XIVE_DONATE_PAGE, -1, vtophys(prov_page)); } while (error == OPAL_XIVE_PROVISIONING); return (0); } /* The XIVE_TM_CPPR register must be set by each thread */ static void xive_smp_cpu_startup(void) { xive_setup_cpu(); }
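The byte-swapping convention this revision applies is uniform across both files: OPAL fills out-parameters through vtophys() pointers in big-endian byte order, and the caller converts each field to host order immediately after the call. A minimal, standalone sketch of that convention (the struct and field names here are hypothetical, not part of the diff):

#include <sys/endian.h>	/* be32toh()/be64toh()/htobe*() on FreeBSD */
#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical stand-in for a block of OPAL out-parameters; firmware
 * would fill it through physical-address pointers in big-endian order.
 */
struct opal_out {
	uint64_t vp;
	uint32_t lirq;
};

/* Swap every field to host order once, right after the call returns. */
static void
opal_out_to_host(struct opal_out *o)
{
	o->vp = be64toh(o->vp);
	o->lirq = be32toh(o->lirq);
}

int
main(void)
{
	/* Simulate firmware writing big-endian values. */
	struct opal_out o = { htobe64(0x1234beefULL), htobe32(42) };

	opal_out_to_host(&o);
	/* Prints vp=0x1234beef lirq=42 on both BE and LE hosts. */
	printf("vp=%#jx lirq=%u\n", (uintmax_t)o.vp, o.lirq);
	return (0);
}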