Index: head/sys/powerpc/aim/mmu_oea64.c =================================================================== --- head/sys/powerpc/aim/mmu_oea64.c (revision 346173) +++ head/sys/powerpc/aim/mmu_oea64.c (revision 346174) @@ -1,2892 +1,2895 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008-2015 Nathan Whitehorn * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_oea64.h" #include "mmu_if.h" #include "moea64_if.h" void moea64_release_vsid(uint64_t vsid); uintptr_t moea64_get_unique_vsid(void); #define DISABLE_TRANS(msr) msr = mfmsr(); mtmsr(msr & ~PSL_DR) #define ENABLE_TRANS(msr) mtmsr(msr) #define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) #define VSID_HASH_MASK 0x0000007fffffffffULL /* * Locking semantics: * * There are two locks of interest: the page locks and the pmap locks, which * protect their individual PVO lists and are locked in that order. The contents * of all PVO entries are protected by the locks of their respective pmaps. * The pmap of any PVO is guaranteed not to change so long as the PVO is linked * into any list. 
 *
 */

#define PV_LOCK_COUNT	PA_LOCK_COUNT*3
static struct mtx_padalign pv_lock[PV_LOCK_COUNT];

#define PV_LOCKPTR(pa)	((struct mtx *)(&pv_lock[pa_index(pa) % PV_LOCK_COUNT]))
#define PV_LOCK(pa)		mtx_lock(PV_LOCKPTR(pa))
#define PV_UNLOCK(pa)		mtx_unlock(PV_LOCKPTR(pa))
#define PV_LOCKASSERT(pa)	mtx_assert(PV_LOCKPTR(pa), MA_OWNED)
#define PV_PAGE_LOCK(m)		PV_LOCK(VM_PAGE_TO_PHYS(m))
#define PV_PAGE_UNLOCK(m)	PV_UNLOCK(VM_PAGE_TO_PHYS(m))
#define PV_PAGE_LOCKASSERT(m)	PV_LOCKASSERT(VM_PAGE_TO_PHYS(m))

struct ofw_map {
	cell_t	om_va;
	cell_t	om_len;
	uint64_t om_pa;
	cell_t	om_mode;
};

extern unsigned char _etext[];
extern unsigned char _end[];

extern void *slbtrap, *slbtrapend;

/*
 * Map of physical memory regions.
 */
static struct	mem_region *regions;
static struct	mem_region *pregions;
+static struct	numa_mem_region *numa_pregions;
static u_int	phys_avail_count;
-static int	regions_sz, pregions_sz;
+static int	regions_sz, pregions_sz, numapregions_sz;

extern void bs_remap_earlyboot(void);

/*
 * Lock for the SLB tables.
 */
struct mtx	moea64_slb_mutex;

/*
 * PTEG data.
 */
u_long		moea64_pteg_count;
u_long		moea64_pteg_mask;

/*
 * PVO data.
 */

uma_zone_t	moea64_pvo_zone; /* zone for pvo entries */

static struct	pvo_entry *moea64_bpvo_pool;
static int	moea64_bpvo_pool_index = 0;
static int	moea64_bpvo_pool_size = 327680;
TUNABLE_INT("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size);
SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD,
    &moea64_bpvo_pool_index, 0, "");

#define	VSID_NBPW	(sizeof(u_int32_t) * 8)
#ifdef __powerpc64__
#define	NVSIDS		(NPMAPS * 16)
#define	VSID_HASHMASK	0xffffffffUL
#else
#define NVSIDS		NPMAPS
#define VSID_HASHMASK	0xfffffUL
#endif
static u_int	moea64_vsid_bitmap[NVSIDS / VSID_NBPW];

static boolean_t moea64_initialized = FALSE;

/*
 * Statistics.
 */
u_int	moea64_pte_valid = 0;
u_int	moea64_pte_overflow = 0;
u_int	moea64_pvo_entries = 0;
u_int	moea64_pvo_enter_calls = 0;
u_int	moea64_pvo_remove_calls = 0;
SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD,
    &moea64_pte_valid, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD,
    &moea64_pte_overflow, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD,
    &moea64_pvo_entries, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD,
    &moea64_pvo_enter_calls, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD,
    &moea64_pvo_remove_calls, 0, "");

vm_offset_t	moea64_scratchpage_va[2];
struct pvo_entry *moea64_scratchpage_pvo[2];
struct mtx	moea64_scratchpage_mtx;

uint64_t 	moea64_large_page_mask = 0;
uint64_t	moea64_large_page_size = 0;
int		moea64_large_page_shift = 0;

/*
 * PVO calls.
 */
static int	moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo,
		    struct pvo_head *pvo_head);
static void	moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo);
static void	moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo);
static struct	pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t);

/*
 * Utility routines.
*/ static boolean_t moea64_query_bit(mmu_t, vm_page_t, uint64_t); static u_int moea64_clear_bit(mmu_t, vm_page_t, uint64_t); static void moea64_kremove(mmu_t, vm_offset_t); static void moea64_syncicache(mmu_t, pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz); static void moea64_pmap_init_qpages(void); /* * Kernel MMU interface */ void moea64_clear_modify(mmu_t, vm_page_t); void moea64_copy_page(mmu_t, vm_page_t, vm_page_t); void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); int moea64_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); void moea64_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void moea64_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t moea64_extract(mmu_t, pmap_t, vm_offset_t); vm_page_t moea64_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); void moea64_init(mmu_t); boolean_t moea64_is_modified(mmu_t, vm_page_t); boolean_t moea64_is_prefaultable(mmu_t, pmap_t, vm_offset_t); boolean_t moea64_is_referenced(mmu_t, vm_page_t); int moea64_ts_referenced(mmu_t, vm_page_t); vm_offset_t moea64_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); boolean_t moea64_page_exists_quick(mmu_t, pmap_t, vm_page_t); void moea64_page_init(mmu_t, vm_page_t); int moea64_page_wired_mappings(mmu_t, vm_page_t); void moea64_pinit(mmu_t, pmap_t); void moea64_pinit0(mmu_t, pmap_t); void moea64_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void moea64_qenter(mmu_t, vm_offset_t, vm_page_t *, int); void moea64_qremove(mmu_t, vm_offset_t, int); void moea64_release(mmu_t, pmap_t); void moea64_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_remove_pages(mmu_t, pmap_t); void moea64_remove_all(mmu_t, vm_page_t); void moea64_remove_write(mmu_t, vm_page_t); void moea64_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_zero_page(mmu_t, vm_page_t); void moea64_zero_page_area(mmu_t, vm_page_t, int, int); void moea64_activate(mmu_t, struct thread *); void moea64_deactivate(mmu_t, struct thread *); void *moea64_mapdev(mmu_t, vm_paddr_t, vm_size_t); void *moea64_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t); void moea64_unmapdev(mmu_t, vm_offset_t, vm_size_t); vm_paddr_t moea64_kextract(mmu_t, vm_offset_t); void moea64_page_set_memattr(mmu_t, vm_page_t m, vm_memattr_t ma); void moea64_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t ma); void moea64_kenter(mmu_t, vm_offset_t, vm_paddr_t); boolean_t moea64_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); static void moea64_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t); void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va); void moea64_scan_init(mmu_t mmu); vm_offset_t moea64_quick_enter_page(mmu_t mmu, vm_page_t m); void moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr); static int moea64_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); static int moea64_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); static mmu_method_t moea64_methods[] = { MMUMETHOD(mmu_clear_modify, moea64_clear_modify), MMUMETHOD(mmu_copy_page, moea64_copy_page), MMUMETHOD(mmu_copy_pages, moea64_copy_pages), MMUMETHOD(mmu_enter, moea64_enter), MMUMETHOD(mmu_enter_object, moea64_enter_object), MMUMETHOD(mmu_enter_quick, moea64_enter_quick), MMUMETHOD(mmu_extract, moea64_extract), MMUMETHOD(mmu_extract_and_hold, moea64_extract_and_hold), 
MMUMETHOD(mmu_init, moea64_init), MMUMETHOD(mmu_is_modified, moea64_is_modified), MMUMETHOD(mmu_is_prefaultable, moea64_is_prefaultable), MMUMETHOD(mmu_is_referenced, moea64_is_referenced), MMUMETHOD(mmu_ts_referenced, moea64_ts_referenced), MMUMETHOD(mmu_map, moea64_map), MMUMETHOD(mmu_page_exists_quick,moea64_page_exists_quick), MMUMETHOD(mmu_page_init, moea64_page_init), MMUMETHOD(mmu_page_wired_mappings,moea64_page_wired_mappings), MMUMETHOD(mmu_pinit, moea64_pinit), MMUMETHOD(mmu_pinit0, moea64_pinit0), MMUMETHOD(mmu_protect, moea64_protect), MMUMETHOD(mmu_qenter, moea64_qenter), MMUMETHOD(mmu_qremove, moea64_qremove), MMUMETHOD(mmu_release, moea64_release), MMUMETHOD(mmu_remove, moea64_remove), MMUMETHOD(mmu_remove_pages, moea64_remove_pages), MMUMETHOD(mmu_remove_all, moea64_remove_all), MMUMETHOD(mmu_remove_write, moea64_remove_write), MMUMETHOD(mmu_sync_icache, moea64_sync_icache), MMUMETHOD(mmu_unwire, moea64_unwire), MMUMETHOD(mmu_zero_page, moea64_zero_page), MMUMETHOD(mmu_zero_page_area, moea64_zero_page_area), MMUMETHOD(mmu_activate, moea64_activate), MMUMETHOD(mmu_deactivate, moea64_deactivate), MMUMETHOD(mmu_page_set_memattr, moea64_page_set_memattr), MMUMETHOD(mmu_quick_enter_page, moea64_quick_enter_page), MMUMETHOD(mmu_quick_remove_page, moea64_quick_remove_page), /* Internal interfaces */ MMUMETHOD(mmu_mapdev, moea64_mapdev), MMUMETHOD(mmu_mapdev_attr, moea64_mapdev_attr), MMUMETHOD(mmu_unmapdev, moea64_unmapdev), MMUMETHOD(mmu_kextract, moea64_kextract), MMUMETHOD(mmu_kenter, moea64_kenter), MMUMETHOD(mmu_kenter_attr, moea64_kenter_attr), MMUMETHOD(mmu_dev_direct_mapped,moea64_dev_direct_mapped), MMUMETHOD(mmu_scan_init, moea64_scan_init), MMUMETHOD(mmu_dumpsys_map, moea64_dumpsys_map), MMUMETHOD(mmu_map_user_ptr, moea64_map_user_ptr), MMUMETHOD(mmu_decode_kernel_ptr, moea64_decode_kernel_ptr), { 0, 0 } }; MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods, 0); static struct pvo_head * vm_page_to_pvoh(vm_page_t m) { mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED); return (&m->md.mdpg_pvoh); } static struct pvo_entry * alloc_pvo_entry(int bootstrap) { struct pvo_entry *pvo; if (!moea64_initialized || bootstrap) { if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) { panic("moea64_enter: bpvo pool exhausted, %d, %d, %zd", moea64_bpvo_pool_index, moea64_bpvo_pool_size, moea64_bpvo_pool_size * sizeof(struct pvo_entry)); } pvo = &moea64_bpvo_pool[ atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)]; bzero(pvo, sizeof(*pvo)); pvo->pvo_vaddr = PVO_BOOTSTRAP; } else { pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT); bzero(pvo, sizeof(*pvo)); } return (pvo); } static void init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va) { uint64_t vsid; uint64_t hash; int shift; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pvo->pvo_pmap = pmap; va &= ~ADDR_POFF; pvo->pvo_vaddr |= va; vsid = va_to_vsid(pmap, va); pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) | (vsid << 16); shift = (pvo->pvo_vaddr & PVO_LARGE) ? 
moea64_large_page_shift : ADDR_PIDX_SHFT; hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; } static void free_pvo_entry(struct pvo_entry *pvo) { if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) uma_zfree(moea64_pvo_zone, pvo); } void moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte) { lpte->pte_hi = (pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) & LPTE_AVPN_MASK; lpte->pte_hi |= LPTE_VALID; if (pvo->pvo_vaddr & PVO_LARGE) lpte->pte_hi |= LPTE_BIG; if (pvo->pvo_vaddr & PVO_WIRED) lpte->pte_hi |= LPTE_WIRED; if (pvo->pvo_vaddr & PVO_HID) lpte->pte_hi |= LPTE_HID; lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) lpte->pte_lo |= LPTE_BW; else lpte->pte_lo |= LPTE_BR; if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE)) lpte->pte_lo |= LPTE_NOEXEC; } static __inline uint64_t moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { uint64_t pte_lo; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (LPTE_I | LPTE_G); case VM_MEMATTR_CACHEABLE: return (LPTE_M); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (LPTE_I); case VM_MEMATTR_WRITE_THROUGH: return (LPTE_W | LPTE_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ pte_lo = LPTE_I | LPTE_G; for (i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) { pte_lo &= ~(LPTE_I | LPTE_G); pte_lo |= LPTE_M; break; } } return pte_lo; } /* * Quick sort callout for comparing memory regions. */ static int om_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b) { const struct ofw_map *mapa; const struct ofw_map *mapb; mapa = a; mapb = b; if (mapa->om_pa < mapb->om_pa) return (-1); else if (mapa->om_pa > mapb->om_pa) return (1); else return (0); } static void moea64_add_ofw_mappings(mmu_t mmup, phandle_t mmu, size_t sz) { struct ofw_map translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */ pcell_t acells, trans_cells[sz/sizeof(cell_t)]; struct pvo_entry *pvo; register_t msr; vm_offset_t off; vm_paddr_t pa_base; int i, j; bzero(translations, sz); OF_getencprop(OF_finddevice("/"), "#address-cells", &acells, sizeof(acells)); if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1) panic("moea64_bootstrap: can't get ofw translations"); CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations"); sz /= sizeof(cell_t); for (i = 0, j = 0; i < sz; j++) { translations[j].om_va = trans_cells[i++]; translations[j].om_len = trans_cells[i++]; translations[j].om_pa = trans_cells[i++]; if (acells == 2) { translations[j].om_pa <<= 32; translations[j].om_pa |= trans_cells[i++]; } translations[j].om_mode = trans_cells[i++]; } KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)", i, sz)); sz = j; qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { pa_base = translations[i].om_pa; #ifndef __powerpc64__ if ((translations[i].om_pa >> 32) != 0) panic("OFW translations above 32-bit boundary!"); #endif if (pa_base % PAGE_SIZE) panic("OFW translation not page-aligned (phys)!"); if (translations[i].om_va % PAGE_SIZE) panic("OFW translation not page-aligned (virt)!"); CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x", pa_base, translations[i].om_va, translations[i].om_len); /* Now enter the pages for this mapping */ DISABLE_TRANS(msr); for (off = 0; off < 
translations[i].om_len; off += PAGE_SIZE) { /* If this address is direct-mapped, skip remapping */ if (hw_direct_map && translations[i].om_va == PHYS_TO_DMAP(pa_base) && moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT) == LPTE_M) continue; PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, translations[i].om_va + off); PMAP_UNLOCK(kernel_pmap); if (pvo != NULL) continue; moea64_kenter(mmup, translations[i].om_va + off, pa_base + off); } ENABLE_TRANS(msr); } } #ifdef __powerpc64__ static void moea64_probe_large_page(void) { uint16_t pvr = mfpvr() >> 16; switch (pvr) { case IBM970: case IBM970FX: case IBM970MP: powerpc_sync(); isync(); mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG); powerpc_sync(); isync(); /* FALLTHROUGH */ default: if (moea64_large_page_size == 0) { moea64_large_page_size = 0x1000000; /* 16 MB */ moea64_large_page_shift = 24; } } moea64_large_page_mask = moea64_large_page_size - 1; } static void moea64_bootstrap_slb_prefault(vm_offset_t va, int large) { struct slb *cache; struct slb entry; uint64_t esid, slbe; uint64_t i; cache = PCPU_GET(aim.slb); esid = va >> ADDR_SR_SHFT; slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; for (i = 0; i < 64; i++) { if (cache[i].slbe == (slbe | i)) return; } entry.slbe = slbe; entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT; if (large) entry.slbv |= SLBV_L; slb_insert_kernel(entry.slbe, entry.slbv); } #endif static void moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { struct pvo_entry *pvo; register_t msr; vm_paddr_t pa; vm_offset_t size, off; uint64_t pte_lo; int i; if (moea64_large_page_size == 0) hw_direct_map = 0; DISABLE_TRANS(msr); if (hw_direct_map) { PMAP_LOCK(kernel_pmap); for (i = 0; i < pregions_sz; i++) { for (pa = pregions[i].mr_start; pa < pregions[i].mr_start + pregions[i].mr_size; pa += moea64_large_page_size) { pte_lo = LPTE_M; pvo = alloc_pvo_entry(1 /* bootstrap */); pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE; init_pvo_entry(pvo, kernel_pmap, PHYS_TO_DMAP(pa)); /* * Set memory access as guarded if prefetch within * the page could exit the available physmem area. */ if (pa & moea64_large_page_mask) { pa &= moea64_large_page_mask; pte_lo |= LPTE_G; } if (pa + moea64_large_page_size > pregions[i].mr_start + pregions[i].mr_size) pte_lo |= LPTE_G; pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = pa | pte_lo; moea64_pvo_enter(mmup, pvo, NULL); } } PMAP_UNLOCK(kernel_pmap); } /* * Make sure the kernel and BPVO pool stay mapped on systems either * without a direct map or on which the kernel is not already executing * out of the direct-mapped region. */ if (!hw_direct_map || kernelstart < DMAP_BASE_ADDRESS) { for (pa = kernelstart & ~PAGE_MASK; pa < kernelend; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); } if (!hw_direct_map) { size = moea64_bpvo_pool_size*sizeof(struct pvo_entry); off = (vm_offset_t)(moea64_bpvo_pool); for (pa = off; pa < off + size; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); } ENABLE_TRANS(msr); /* * Allow user to override unmapped_buf_allowed for testing. * XXXKIB Only direct map implementation was tested. */ if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed)) unmapped_buf_allowed = hw_direct_map; } /* Quick sort callout for comparing physical addresses. 
*/ static int pa_cmp(const void *a, const void *b) { const vm_paddr_t *pa = a, *pb = b; if (*pa < *pb) return (-1); else if (*pa > *pb) return (1); else return (0); } void moea64_early_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { int i, j; vm_size_t physsz, hwphyssz; vm_paddr_t kernelphysstart, kernelphysend; int rm_pavail; #ifndef __powerpc64__ /* We don't have a direct map since there is no BAT */ hw_direct_map = 0; /* Make sure battable is zero, since we have no BAT */ for (i = 0; i < 16; i++) { battable[i].batu = 0; battable[i].batl = 0; } #else moea64_probe_large_page(); /* Use a direct map if we have large page support */ if (moea64_large_page_size > 0) hw_direct_map = 1; else hw_direct_map = 0; /* Install trap handlers for SLBs */ bcopy(&slbtrap, (void *)EXC_DSE,(size_t)&slbtrapend - (size_t)&slbtrap); bcopy(&slbtrap, (void *)EXC_ISE,(size_t)&slbtrapend - (size_t)&slbtrap); __syncicache((void *)EXC_DSE, 0x80); __syncicache((void *)EXC_ISE, 0x80); #endif kernelphysstart = kernelstart & ~DMAP_BASE_ADDRESS; kernelphysend = kernelend & ~DMAP_BASE_ADDRESS; /* Get physical memory regions from firmware */ mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "moea64_bootstrap: physical memory"); if (sizeof(phys_avail)/sizeof(phys_avail[0]) < regions_sz) panic("moea64_bootstrap: phys_avail too small"); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++, j += 2) { CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)", regions[i].mr_start, regions[i].mr_start + regions[i].mr_size, regions[i].mr_size); if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; phys_avail_count++; physsz += regions[i].mr_size; } /* Check for overlap with the kernel and exception vectors */ rm_pavail = 0; for (j = 0; j < 2*phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (phys_avail[j] >= kernelphysstart && phys_avail[j+1] <= kernelphysend) { phys_avail[j] = phys_avail[j+1] = ~0; rm_pavail++; continue; } if (kernelphysstart >= phys_avail[j] && kernelphysstart < phys_avail[j+1]) { if (kernelphysend < phys_avail[j+1]) { phys_avail[2*phys_avail_count] = (kernelphysend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2*phys_avail_count + 1] = phys_avail[j+1]; phys_avail_count++; } phys_avail[j+1] = kernelphysstart & ~PAGE_MASK; } if (kernelphysend >= phys_avail[j] && kernelphysend < phys_avail[j+1]) { if (kernelphysstart > phys_avail[j]) { phys_avail[2*phys_avail_count] = phys_avail[j]; phys_avail[2*phys_avail_count + 1] = kernelphysstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kernelphysend & ~PAGE_MASK) + PAGE_SIZE; } } /* Remove physical available regions marked for removal (~0) */ if (rm_pavail) { qsort(phys_avail, 2*phys_avail_count, sizeof(phys_avail[0]), pa_cmp); phys_avail_count -= rm_pavail; for (i = 2*phys_avail_count; i < 2*(phys_avail_count + rm_pavail); i+=2) phys_avail[i] = phys_avail[i+1] = 0; } physmem = btoc(physsz); #ifdef PTEGCOUNT moea64_pteg_count = PTEGCOUNT; #else moea64_pteg_count = 0x1000; while (moea64_pteg_count < physmem) moea64_pteg_count <<= 1; moea64_pteg_count >>= 1; #endif /* PTEGCOUNT */ } void moea64_mid_bootstrap(mmu_t mmup, vm_offset_t 
kernelstart, vm_offset_t kernelend) { int i; /* * Set PTEG mask */ moea64_pteg_mask = moea64_pteg_count - 1; /* * Initialize SLB table lock and page locks */ mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF); for (i = 0; i < PV_LOCK_COUNT; i++) mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF); /* * Initialise the bootstrap pvo pool. */ moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc( moea64_bpvo_pool_size*sizeof(struct pvo_entry), 0); moea64_bpvo_pool_index = 0; /* Place at address usable through the direct map */ if (hw_direct_map) moea64_bpvo_pool = (struct pvo_entry *) PHYS_TO_DMAP((uintptr_t)moea64_bpvo_pool); /* * Make sure kernel vsid is allocated as well as VSID 0. */ #ifndef __powerpc64__ moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW] |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); moea64_vsid_bitmap[0] |= 1; #endif /* * Initialize the kernel pmap (which is statically allocated). */ #ifdef __powerpc64__ for (i = 0; i < 64; i++) { pcpup->pc_aim.slb[i].slbv = 0; pcpup->pc_aim.slb[i].slbe = 0; } #else for (i = 0; i < 16; i++) kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; #endif kernel_pmap->pmap_phys = kernel_pmap; CPU_FILL(&kernel_pmap->pm_active); RB_INIT(&kernel_pmap->pmap_pvo); PMAP_LOCK_INIT(kernel_pmap); /* * Now map in all the other buffers we allocated earlier */ moea64_setup_direct_map(mmup, kernelstart, kernelend); } void moea64_late_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { ihandle_t mmui; phandle_t chosen; phandle_t mmu; ssize_t sz; int i; vm_offset_t pa, va; void *dpcpu; /* * Set up the Open Firmware pmap and add its mappings if not in real * mode. */ chosen = OF_finddevice("/chosen"); if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) { mmu = OF_instance_to_package(mmui); if (mmu == -1 || (sz = OF_getproplen(mmu, "translations")) == -1) sz = 0; if (sz > 6144 /* tmpstksz - 2 KB headroom */) panic("moea64_bootstrap: too many ofw translations"); if (sz > 0) moea64_add_ofw_mappings(mmup, mmu, sz); } /* * Calculate the last available physical address. */ Maxmem = 0; for (i = 0; phys_avail[i + 2] != 0; i += 2) Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); /* * Initialize MMU. */ MMU_CPU_BOOTSTRAP(mmup,0); mtmsr(mfmsr() | PSL_DR | PSL_IR); pmap_bootstrapped++; /* * Set the start and end of kva. */ virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; /* * Map the entire KVA range into the SLB. We must not fault there. */ #ifdef __powerpc64__ for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH) moea64_bootstrap_slb_prefault(va, 0); #endif /* * Remap any early IO mappings (console framebuffer, etc.) */ bs_remap_earlyboot(); /* * Figure out how far we can extend virtual_end into segment 16 * without running into existing mappings. Segment 16 is guaranteed * to contain neither RAM nor devices (at least on Apple hardware), * but will generally contain some OFW mappings we should not * step on. */ #ifndef __powerpc64__ /* KVA is in high memory on PPC64 */ PMAP_LOCK(kernel_pmap); while (virtual_end < VM_MAX_KERNEL_ADDRESS && moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL) virtual_end += PAGE_SIZE; PMAP_UNLOCK(kernel_pmap); #endif /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. 
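 *
 * The KSTACK_GUARD_PAGES guard pages themselves stay unmapped: the stack
 * VA begins that far above virtual_avail, and only the kstack_pages pages
 * above that point are entered below.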
 */
	pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE);
	va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
	virtual_avail = va + kstack_pages * PAGE_SIZE;
	CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va);
	thread0.td_kstack = va;
	thread0.td_kstack_pages = kstack_pages;
	for (i = 0; i < kstack_pages; i++) {
		moea64_kenter(mmup, va, pa);
		pa += PAGE_SIZE;
		va += PAGE_SIZE;
	}

	/*
	 * Allocate virtual address space for the message buffer.
	 */
	pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE);
	msgbufp = (struct msgbuf *)virtual_avail;
	va = virtual_avail;
	virtual_avail += round_page(msgbufsize);
	while (va < virtual_avail) {
		moea64_kenter(mmup, va, pa);
		pa += PAGE_SIZE;
		va += PAGE_SIZE;
	}

	/*
	 * Allocate virtual address space for the dynamic percpu area.
	 */
	pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE);
	dpcpu = (void *)virtual_avail;
	va = virtual_avail;
	virtual_avail += DPCPU_SIZE;
	while (va < virtual_avail) {
		moea64_kenter(mmup, va, pa);
		pa += PAGE_SIZE;
		va += PAGE_SIZE;
	}
	dpcpu_init(dpcpu, curcpu);

	/*
	 * Allocate some things for page zeroing. We put this directly
	 * in the page table and use MOEA64_PTE_REPLACE to avoid any
	 * of the PVO book-keeping or other parts of the VM system
	 * from even knowing that this hack exists.
	 */

	if (!hw_direct_map) {
		mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL,
		    MTX_DEF);
		for (i = 0; i < 2; i++) {
			moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE;
			virtual_end -= PAGE_SIZE;
			moea64_kenter(mmup, moea64_scratchpage_va[i], 0);
			PMAP_LOCK(kernel_pmap);
			moea64_scratchpage_pvo[i] = moea64_pvo_find_va(
			    kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]);
			PMAP_UNLOCK(kernel_pmap);
		}
	}
+
+	numa_mem_regions(&numa_pregions, &numapregions_sz);
}

static void
moea64_pmap_init_qpages(void)
{
	struct pcpu *pc;
	int i;

	if (hw_direct_map)
		return;

	CPU_FOREACH(i) {
		pc = pcpu_find(i);
		pc->pc_qmap_addr = kva_alloc(PAGE_SIZE);
		if (pc->pc_qmap_addr == 0)
			panic("pmap_init_qpages: unable to allocate KVA");
		PMAP_LOCK(kernel_pmap);
		pc->pc_aim.qmap_pvo = moea64_pvo_find_va(kernel_pmap,
		    pc->pc_qmap_addr);
		PMAP_UNLOCK(kernel_pmap);
		mtx_init(&pc->pc_aim.qmap_lock, "qmap lock", NULL, MTX_DEF);
	}
}

SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL);

/*
 * Activate a user pmap. This mostly involves setting some non-CPU
 * state.
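 *
 * Concretely, the body below sets this pmap's bit in pm_active and
 * installs the user segment mapping (an SLB entry on powerpc64, a
 * segment register otherwise).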
*/ void moea64_activate(mmu_t mmu, struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_SET(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(aim.userslb, pm->pm_slb); __asm __volatile("slbmte %0, %1; isync" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); #else PCPU_SET(curpmap, pm->pmap_phys); mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid); #endif } void moea64_deactivate(mmu_t mmu, struct thread *td) { pmap_t pm; __asm __volatile("isync; slbie %0" :: "r"(USER_ADDR)); pm = &td->td_proc->p_vmspace->vm_pmap; CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(aim.userslb, NULL); #else PCPU_SET(curpmap, NULL); #endif } void moea64_unwire(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry key, *pvo; vm_page_t m; int64_t refchg; key.pvo_vaddr = sva; PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); pvo->pvo_vaddr &= ~PVO_WIRED; refchg = MOEA64_PTE_REPLACE(mmu, pvo, 0 /* No invalidation */); if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { if (refchg < 0) refchg = LPTE_CHG; m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } pm->pm_stats.wired_count--; } PMAP_UNLOCK(pm); } /* * This goes through and sets the physical address of our * special scratch PTE to the PA we want to zero or copy. Because * of locking issues (this can get called in pvo_enter() by * the UMA allocator), we can't use most other utility functions here */ static __inline void moea64_set_scratchpage_pa(mmu_t mmup, int which, vm_paddr_t pa) { KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!")); mtx_assert(&moea64_scratchpage_mtx, MA_OWNED); moea64_scratchpage_pvo[which]->pvo_pte.pa = moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa; MOEA64_PTE_REPLACE(mmup, moea64_scratchpage_pvo[which], MOEA64_PTE_INVALIDATE); isync(); } void moea64_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) { vm_offset_t dst; vm_offset_t src; dst = VM_PAGE_TO_PHYS(mdst); src = VM_PAGE_TO_PHYS(msrc); if (hw_direct_map) { bcopy((void *)PHYS_TO_DMAP(src), (void *)PHYS_TO_DMAP(dst), PAGE_SIZE); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, src); moea64_set_scratchpage_pa(mmu, 1, dst); bcopy((void *)moea64_scratchpage_va[0], (void *)moea64_scratchpage_va[1], PAGE_SIZE); mtx_unlock(&moea64_scratchpage_mtx); } } static inline void moea64_copy_pages_dmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } static inline void moea64_copy_pages_nodmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { 
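	/*
	 * Without a direct map, bounce each chunk through the two
	 * scratch-page windows set up at bootstrap.
	 */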
void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; mtx_lock(&moea64_scratchpage_mtx); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); moea64_set_scratchpage_pa(mmu, 0, VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); moea64_set_scratchpage_pa(mmu, 1, VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } mtx_unlock(&moea64_scratchpage_mtx); } void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { if (hw_direct_map) { moea64_copy_pages_dmap(mmu, ma, a_offset, mb, b_offset, xfersize); } else { moea64_copy_pages_nodmap(mmu, ma, a_offset, mb, b_offset, xfersize); } } void moea64_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); if (size + off > PAGE_SIZE) panic("moea64_zero_page: size + off > PAGE_SIZE"); if (hw_direct_map) { bzero((caddr_t)(uintptr_t)PHYS_TO_DMAP(pa) + off, size); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); bzero((caddr_t)moea64_scratchpage_va[0] + off, size); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Zero a page of physical memory by temporarily mapping it */ void moea64_zero_page(mmu_t mmu, vm_page_t m) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); vm_offset_t va, off; if (!hw_direct_map) { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); va = moea64_scratchpage_va[0]; } else { va = PHYS_TO_DMAP(pa); } for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); if (!hw_direct_map) mtx_unlock(&moea64_scratchpage_mtx); } vm_offset_t moea64_quick_enter_page(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; vm_paddr_t pa = VM_PAGE_TO_PHYS(m); if (hw_direct_map) return (PHYS_TO_DMAP(pa)); /* * MOEA64_PTE_REPLACE does some locking, so we can't just grab * a critical section and access the PCPU data like on i386. * Instead, pin the thread and grab the PCPU lock to prevent * a preempting thread from using the same PCPU data. */ sched_pin(); mtx_assert(PCPU_PTR(aim.qmap_lock), MA_NOTOWNED); pvo = PCPU_GET(aim.qmap_pvo); mtx_lock(PCPU_PTR(aim.qmap_lock)); pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) | (uint64_t)pa; MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_INVALIDATE); isync(); return (PCPU_GET(qmap_addr)); } void moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr) { if (hw_direct_map) return; mtx_assert(PCPU_PTR(aim.qmap_lock), MA_OWNED); KASSERT(PCPU_GET(qmap_addr) == addr, ("moea64_quick_remove_page: invalid address")); mtx_unlock(PCPU_PTR(aim.qmap_lock)); sched_unpin(); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. 
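 *
 * For example, a caller establishing a wired, writable mapping might do
 * the following (an illustrative sketch only; "mmu", "pmap", "va" and
 * "m" are assumed to come from the surrounding VM code):
 *
 *	error = moea64_enter(mmu, pmap, va, m,
 *	    VM_PROT_READ | VM_PROT_WRITE, PMAP_ENTER_WIRED, 0);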
*/ int moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct pvo_entry *pvo, *oldpvo; struct pvo_head *pvo_head; uint64_t pte_lo; int error; if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pvo = alloc_pvo_entry(0); pvo->pvo_pmap = NULL; /* to be filled in later */ pvo->pvo_pte.prot = prot; pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | pte_lo; if ((flags & PMAP_ENTER_WIRED) != 0) pvo->pvo_vaddr |= PVO_WIRED; if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) { pvo_head = NULL; } else { pvo_head = &m->md.mdpg_pvoh; pvo->pvo_vaddr |= PVO_MANAGED; } for (;;) { PV_PAGE_LOCK(m); PMAP_LOCK(pmap); if (pvo->pvo_pmap == NULL) init_pvo_entry(pvo, pmap, va); if (prot & VM_PROT_WRITE) if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); oldpvo = moea64_pvo_find_va(pmap, va); if (oldpvo != NULL) { if (oldpvo->pvo_vaddr == pvo->pvo_vaddr && oldpvo->pvo_pte.pa == pvo->pvo_pte.pa && oldpvo->pvo_pte.prot == prot) { /* Identical mapping already exists */ error = 0; /* If not in page table, reinsert it */ if (MOEA64_PTE_SYNCH(mmu, oldpvo) < 0) { moea64_pte_overflow--; MOEA64_PTE_INSERT(mmu, oldpvo); } /* Then just clean up and go home */ PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); free_pvo_entry(pvo); break; } /* Otherwise, need to kill it first */ KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old " "mapping does not match new mapping")); moea64_pvo_remove_from_pmap(mmu, oldpvo); } error = moea64_pvo_enter(mmu, pvo, pvo_head); PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); /* Free any dead pages */ if (oldpvo != NULL) { PV_LOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); moea64_pvo_remove_from_page(mmu, oldpvo); PV_UNLOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); free_pvo_entry(oldpvo); } if (error != ENOMEM) break; if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); VM_OBJECT_ASSERT_UNLOCKED(m->object); vm_wait(NULL); } /* * Flush the page from the instruction cache if this page is * mapped executable and cacheable. */ if (pmap != kernel_pmap && !(m->aflags & PGA_EXECUTABLE) && (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(mmu, pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); } return (KERN_SUCCESS); } static void moea64_syncicache(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz) { /* * This is much trickier than on older systems because * we can't sync the icache on physical addresses directly * without a direct map. Instead we check a couple of cases * where the memory is already mapped in and, failing that, * use the same trick we use for page zeroing to create * a temporary mapping for this physical address. */ if (!pmap_bootstrapped) { /* * If PMAP is not bootstrapped, we are likely to be * in real mode. */ __syncicache((void *)(uintptr_t)pa, sz); } else if (pmap == kernel_pmap) { __syncicache((void *)va, sz); } else if (hw_direct_map) { __syncicache((void *)(uintptr_t)PHYS_TO_DMAP(pa), sz); } else { /* Use the scratch page to set up a temp mapping */ mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 1, pa & ~ADDR_POFF); __syncicache((void *)(moea64_scratchpage_va[1] + (va & ADDR_POFF)), sz); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. 
This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void moea64_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { moea64_enter(mmu, pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); m = TAILQ_NEXT(m, listq); } } void moea64_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { moea64_enter(mmu, pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); } vm_paddr_t moea64_extract(mmu_t mmu, pmap_t pm, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; PMAP_LOCK(pm); pvo = moea64_pvo_find_va(pm, va); if (pvo == NULL) pa = 0; else pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t moea64_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct pvo_entry *pvo; vm_page_t m; vm_paddr_t pa; m = NULL; pa = 0; PMAP_LOCK(pmap); retry: pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) { if (vm_page_pa_tryrelock(pmap, pvo->pvo_pte.pa & LPTE_RPGN, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); vm_page_hold(m); } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } static mmu_t installed_mmu; static void * moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { struct pvo_entry *pvo; vm_offset_t va; vm_page_t m; int needed_lock; /* * This entire routine is a horrible hack to avoid bothering kmem * for new KVA addresses. Because this can get called from inside * kmem allocation routines, calling kmem for a new address here * can lead to multiply locking non-recursive mutexes. 
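	 * The work-around below maps each freshly allocated page at a KVA
	 * equal to its physical address, so no new KVA ever has to be
	 * carved out.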
*/ *flags = UMA_SLAB_PRIV; needed_lock = !PMAP_LOCKED(kernel_pmap); m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); va = VM_PAGE_TO_PHYS(m); pvo = alloc_pvo_entry(1 /* bootstrap */); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE; pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M; if (needed_lock) PMAP_LOCK(kernel_pmap); init_pvo_entry(pvo, kernel_pmap, va); pvo->pvo_vaddr |= PVO_WIRED; moea64_pvo_enter(installed_mmu, pvo, NULL); if (needed_lock) PMAP_UNLOCK(kernel_pmap); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero((void *)va, PAGE_SIZE); return (void *)va; } extern int elf32_nxstack; void moea64_init(mmu_t mmu) { CTR0(KTR_PMAP, "moea64_init"); moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); if (!hw_direct_map) { installed_mmu = mmu; uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc); } #ifdef COMPAT_FREEBSD32 elf32_nxstack = 1; #endif moea64_initialized = TRUE; } boolean_t moea64_is_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_referenced: page %p is not managed", m)); return (moea64_query_bit(mmu, m, LPTE_REF)); } boolean_t moea64_is_modified(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have LPTE_CHG set. */ VM_OBJECT_ASSERT_LOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (moea64_query_bit(mmu, m, LPTE_CHG)); } boolean_t moea64_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va) { struct pvo_entry *pvo; boolean_t rv = TRUE; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL) rv = FALSE; PMAP_UNLOCK(pmap); return (rv); } void moea64_clear_modify(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("moea64_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have LPTE_CHG * set. If the object containing the page is locked and the page is * not exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; moea64_clear_bit(mmu, m, LPTE_CHG); } /* * Clear the write and modified bits in each of the given page's mappings. */ void moea64_remove_write(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int64_t refchg, ret; pmap_t pmap; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. 
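 *
 * Otherwise, the loop below strips VM_PROT_WRITE from every live PVO and
 * folds the accumulated REF/CHG bits back into the page.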
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; powerpc_sync(); PV_PAGE_LOCK(m); refchg = 0; LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { pvo->pvo_pte.prot &= ~VM_PROT_WRITE; ret = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_PROT_UPDATE); if (ret < 0) ret = LPTE_CHG; refchg |= ret; if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG) vm_page_dirty(m); vm_page_aflag_clear(m, PGA_WRITEABLE); PV_PAGE_UNLOCK(m); } /* * moea64_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int moea64_ts_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_ts_referenced: page %p is not managed", m)); return (moea64_clear_bit(mmu, m, LPTE_REF)); } /* * Modify the WIMG settings of all mappings for a page. */ void moea64_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) { struct pvo_entry *pvo; int64_t refchg; pmap_t pmap; uint64_t lo; if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; } lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) { pvo->pvo_pte.pa &= ~LPTE_WIMG; pvo->pvo_pte.pa |= lo; refchg = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_INVALIDATE); if (refchg < 0) refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ? LPTE_CHG : 0; if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } m->md.mdpg_cache_attrs = ma; PV_PAGE_UNLOCK(m); } /* * Map a wired page into kernel virtual address space. */ void moea64_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { int error; struct pvo_entry *pvo, *oldpvo; pvo = alloc_pvo_entry(0); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma); pvo->pvo_vaddr |= PVO_WIRED; PMAP_LOCK(kernel_pmap); oldpvo = moea64_pvo_find_va(kernel_pmap, va); if (oldpvo != NULL) moea64_pvo_remove_from_pmap(mmu, oldpvo); init_pvo_entry(pvo, kernel_pmap, va); error = moea64_pvo_enter(mmu, pvo, NULL); PMAP_UNLOCK(kernel_pmap); /* Free any dead pages */ if (oldpvo != NULL) { PV_LOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); moea64_pvo_remove_from_page(mmu, oldpvo); PV_UNLOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); free_pvo_entry(oldpvo); } if (error != 0 && error != ENOENT) panic("moea64_kenter: failed to enter va %#zx pa %#jx: %d", va, (uintmax_t)pa, error); } void moea64_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) { moea64_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT); } /* * Extract the physical page address associated with the given kernel virtual * address. 
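 *
 * For a direct-mapped address the translation is pure arithmetic: since
 * va == PHYS_TO_DMAP(pa) == (DMAP_BASE_ADDRESS | pa), the physical
 * address can be recovered as (va & ~DMAP_BASE_ADDRESS), which is the
 * shortcut taken below.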
*/ vm_paddr_t moea64_kextract(mmu_t mmu, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; /* * Shortcut the direct-mapped case when applicable. We never put * anything but 1:1 (or 62-bit aliased) mappings below * VM_MIN_KERNEL_ADDRESS. */ if (va < VM_MIN_KERNEL_ADDRESS) return (va & ~DMAP_BASE_ADDRESS); PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, va); KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR, va)); pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(kernel_pmap); return (pa); } /* * Remove a wired page from kernel virtual address space. */ void moea64_kremove(mmu_t mmu, vm_offset_t va) { moea64_remove(mmu, kernel_pmap, va, va + PAGE_SIZE); } /* * Provide a kernel pointer corresponding to a given userland pointer. * The returned pointer is valid until the next time this function is * called in this thread. This is used internally in copyin/copyout. */ static int moea64_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) { size_t l; #ifdef __powerpc64__ struct slb *slb; #endif register_t slbv; *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK); l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr); if (l > ulen) l = ulen; if (klen) *klen = l; else if (l != ulen) return (EFAULT); #ifdef __powerpc64__ /* Try lockless look-up first */ slb = user_va_to_slb_entry(pm, (vm_offset_t)uaddr); if (slb == NULL) { /* If it isn't there, we need to pre-fault the VSID */ PMAP_LOCK(pm); slbv = va_to_vsid(pm, (vm_offset_t)uaddr) << SLBV_VSID_SHIFT; PMAP_UNLOCK(pm); } else { slbv = slb->slbv; } /* Mark segment no-execute */ slbv |= SLBV_N; #else slbv = va_to_vsid(pm, (vm_offset_t)uaddr); /* Mark segment no-execute */ slbv |= SR_N; #endif /* If we have already set this VSID, we can just return */ if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == slbv) return (0); __asm __volatile("isync"); curthread->td_pcb->pcb_cpu.aim.usr_segm = (uintptr_t)uaddr >> ADDR_SR_SHFT; curthread->td_pcb->pcb_cpu.aim.usr_vsid = slbv; #ifdef __powerpc64__ __asm __volatile ("slbie %0; slbmte %1, %2; isync" :: "r"(USER_ADDR), "r"(slbv), "r"(USER_SLB_SLBE)); #else __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(slbv)); #endif return (0); } /* * Figure out where a given kernel pointer (usually in a fault) points * to from the VM's perspective, potentially remapping into userland's * address space. */ static int moea64_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr) { vm_offset_t user_sr; if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) { user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm; addr &= ADDR_PIDX | ADDR_POFF; addr |= user_sr << ADDR_SR_SHFT; *decoded_addr = addr; *is_user = 1; } else { *decoded_addr = addr; *is_user = 0; } return (0); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. Other architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped region. */ vm_offset_t moea64_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva, va; if (hw_direct_map) { /* * Check if every page in the region is covered by the direct * map. 
The direct map covers all of physical memory. Use * moea64_calc_wimg() as a shortcut to see if the page is in * physical memory as a way to see if the direct map covers it. */ for (va = pa_start; va < pa_end; va += PAGE_SIZE) if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M) break; if (va == pa_end) return (PHYS_TO_DMAP(pa_start)); } sva = *virt; va = sva; /* XXX respect prot argument */ for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) moea64_kenter(mmu, va, pa_start); *virt = va; return (sva); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t moea64_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) { int loops; struct pvo_entry *pvo; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } PV_PAGE_UNLOCK(m); return (rv); } void moea64_page_init(mmu_t mmu __unused, vm_page_t m) { m->md.mdpg_attrs = 0; m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; LIST_INIT(&m->md.mdpg_pvoh); } /* * Return the number of managed mappings to the given physical page * that are wired. */ int moea64_page_wired_mappings(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED) count++; PV_PAGE_UNLOCK(m); return (count); } static uintptr_t moea64_vsidcontext; uintptr_t moea64_get_unique_vsid(void) { u_int entropy; register_t hash; uint32_t mask; int i; entropy = 0; __asm __volatile("mftb %0" : "=r"(entropy)); mtx_lock(&moea64_slb_mutex); for (i = 0; i < NVSIDS; i += VSID_NBPW) { u_int n; /* * Create a new value by mutiplying by a prime and adding in * entropy from the timebase register. This is to make the * VSID more random so that the PT hash function collides * less often. (Note that the prime casues gcc to do shifts * instead of a multiply.) */ moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy; hash = moea64_vsidcontext & (NVSIDS - 1); if (hash == 0) /* 0 is special, avoid it */ continue; n = hash >> 5; mask = 1 << (hash & (VSID_NBPW - 1)); hash = (moea64_vsidcontext & VSID_HASHMASK); if (moea64_vsid_bitmap[n] & mask) { /* collision? */ /* anything free in this bucket? 
*/ if (moea64_vsid_bitmap[n] == 0xffffffff) { entropy = (moea64_vsidcontext >> 20); continue; } i = ffs(~moea64_vsid_bitmap[n]) - 1; mask = 1 << i; hash &= rounddown2(VSID_HASHMASK, VSID_NBPW); hash |= i; } if (hash == VSID_VRMA) /* also special, avoid this too */ continue; KASSERT(!(moea64_vsid_bitmap[n] & mask), ("Allocating in-use VSID %#zx\n", hash)); moea64_vsid_bitmap[n] |= mask; mtx_unlock(&moea64_slb_mutex); return (hash); } mtx_unlock(&moea64_slb_mutex); panic("%s: out of segments",__func__); } #ifdef __powerpc64__ void moea64_pinit(mmu_t mmu, pmap_t pmap) { RB_INIT(&pmap->pmap_pvo); pmap->pm_slb_tree_root = slb_alloc_tree(); pmap->pm_slb = slb_alloc_user_cache(); pmap->pm_slb_len = 0; } #else void moea64_pinit(mmu_t mmu, pmap_t pmap) { int i; uint32_t hash; RB_INIT(&pmap->pmap_pvo); if (pmap_bootstrapped) pmap->pmap_phys = (pmap_t)moea64_kextract(mmu, (vm_offset_t)pmap); else pmap->pmap_phys = pmap; /* * Allocate some segment registers for this pmap. */ hash = moea64_get_unique_vsid(); for (i = 0; i < 16; i++) pmap->pm_sr[i] = VSID_MAKE(i, hash); KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0")); } #endif /* * Initialize the pmap associated with process 0. */ void moea64_pinit0(mmu_t mmu, pmap_t pm) { PMAP_LOCK_INIT(pm); moea64_pinit(mmu, pm); bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Set the physical protection on the specified range of this map as requested. */ static void moea64_pvo_protect(mmu_t mmu, pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) { struct vm_page *pg; vm_prot_t oldprot; int32_t refchg; PMAP_LOCK_ASSERT(pm, MA_OWNED); /* * Change the protection of the page. */ oldprot = pvo->pvo_pte.prot; pvo->pvo_pte.prot = prot; pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); /* * If the PVO is in the page table, update mapping */ refchg = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_PROT_UPDATE); if (refchg < 0) refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0; if (pm != kernel_pmap && pg != NULL && !(pg->aflags & PGA_EXECUTABLE) && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); moea64_syncicache(mmu, pm, PVO_VADDR(pvo), pvo->pvo_pte.pa & LPTE_RPGN, PAGE_SIZE); } /* * Update vm about the REF/CHG bits if the page is managed and we have * removed write access. */ if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) && (oldprot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } void moea64_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct pvo_entry *pvo, *tpvo, key; CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva, eva, prot); KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, ("moea64_protect: non current pmap")); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { moea64_remove(mmu, pm, sva, eva); return; } PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); moea64_pvo_protect(mmu, pm, pvo, prot); } PMAP_UNLOCK(pm); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. 
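 *
 * A typical pairing looks like this (an illustrative sketch; "va" and
 * "pages" are assumed to be supplied by the caller):
 *
 *	moea64_qenter(mmu, va, pages, npages);
 *	...access the pages through va...
 *	moea64_qremove(mmu, va, npages);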
*/ void moea64_qenter(mmu_t mmu, vm_offset_t va, vm_page_t *m, int count) { while (count-- > 0) { moea64_kenter(mmu, va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by moea64_qenter. */ void moea64_qremove(mmu_t mmu, vm_offset_t va, int count) { while (count-- > 0) { moea64_kremove(mmu, va); va += PAGE_SIZE; } } void moea64_release_vsid(uint64_t vsid) { int idx, mask; mtx_lock(&moea64_slb_mutex); idx = vsid & (NVSIDS-1); mask = 1 << (idx % VSID_NBPW); idx /= VSID_NBPW; KASSERT(moea64_vsid_bitmap[idx] & mask, ("Freeing unallocated VSID %#jx", vsid)); moea64_vsid_bitmap[idx] &= ~mask; mtx_unlock(&moea64_slb_mutex); } void moea64_release(mmu_t mmu, pmap_t pmap) { /* * Free segment registers' VSIDs */ #ifdef __powerpc64__ slb_free_tree(pmap); slb_free_user_cache(pmap->pm_slb); #else KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0")); moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0])); #endif } /* * Remove all pages mapped by the specified pmap */ void moea64_remove_pages(mmu_t mmu, pmap_t pm) { struct pvo_entry *pvo, *tpvo; struct pvo_tree tofree; RB_INIT(&tofree); PMAP_LOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) { if (pvo->pvo_vaddr & PVO_WIRED) continue; /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(mmu, pvo); RB_INSERT(pvo_tree, &tofree, pvo); } PMAP_UNLOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &tofree, tpvo) { PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN); moea64_pvo_remove_from_page(mmu, pvo); PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN); RB_REMOVE(pvo_tree, &tofree, pvo); free_pvo_entry(pvo); } } /* * Remove the given range of addresses from the specified map. */ void moea64_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry *pvo, *tpvo, key; struct pvo_tree tofree; /* * Perform an unsynchronized read. This is, however, safe. */ if (pm->pm_stats.resident_count == 0) return; key.pvo_vaddr = sva; RB_INIT(&tofree); PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(mmu, pvo); RB_INSERT(pvo_tree, &tofree, pvo); } PMAP_UNLOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &tofree, tpvo) { PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN); moea64_pvo_remove_from_page(mmu, pvo); PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN); RB_REMOVE(pvo_tree, &tofree, pvo); free_pvo_entry(pvo); } } /* * Remove physical page from all pmaps in which it resides. * moea64_pvo_remove_from_pmap() and moea64_pvo_remove_from_page() * will reflect changes in the PTEs back to the vm_page.
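 * Note the lock order: moea64_remove_all() below takes the page lock
 * (PV_PAGE_LOCK) first and then each owning pmap's lock, the same
 * page-then-pmap order used throughout this file.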
*/ void moea64_remove_all(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo, *next_pvo; struct pvo_head freequeue; int wasdead; pmap_t pmap; LIST_INIT(&freequeue); PV_PAGE_LOCK(m); LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); wasdead = (pvo->pvo_vaddr & PVO_DEAD); if (!wasdead) moea64_pvo_remove_from_pmap(mmu, pvo); moea64_pvo_remove_from_page(mmu, pvo); if (!wasdead) LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); PMAP_UNLOCK(pmap); } KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings")); KASSERT(!(m->aflags & PGA_WRITEABLE), ("Page still writable")); PV_PAGE_UNLOCK(m); /* Clean up UMA allocations */ LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo) free_pvo_entry(pvo); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from moea64_bootstrap before avail start and end are * calculated. */ vm_offset_t moea64_bootstrap_alloc(vm_size_t size, vm_size_t align) { vm_offset_t s, e; int i, j; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (align != 0) s = roundup2(phys_avail[i], align); else s = phys_avail[i]; e = s + size; if (s < phys_avail[i] || e > phys_avail[i + 1]) continue; if (s + size > platform_real_maxaddr()) continue; if (s == phys_avail[i]) { phys_avail[i] += size; } else if (e == phys_avail[i + 1]) { phys_avail[i + 1] -= size; } else { for (j = phys_avail_count * 2; j > i; j -= 2) { phys_avail[j] = phys_avail[j - 2]; phys_avail[j + 1] = phys_avail[j - 1]; } phys_avail[i + 3] = phys_avail[i + 1]; phys_avail[i + 1] = s; phys_avail[i + 2] = e; phys_avail_count++; } return (s); } panic("moea64_bootstrap_alloc: could not allocate memory"); } static int moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, struct pvo_head *pvo_head) { int first = 0, err; PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); KASSERT(moea64_pvo_find_va(pvo->pvo_pmap, PVO_VADDR(pvo)) == NULL, ("Existing mapping for VA %#jx", (uintmax_t)PVO_VADDR(pvo))); moea64_pvo_enter_calls++; /* * Add to pmap list */ RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Remember if the list was empty; if so, this PVO will be the first * item. */ if (pvo_head != NULL) { if (LIST_FIRST(pvo_head) == NULL) first = 1; LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); } if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count++; pvo->pvo_pmap->pm_stats.resident_count++; /* * Insert it into the hardware page table */ err = MOEA64_PTE_INSERT(mmu, pvo); if (err != 0) { panic("moea64_pvo_enter: overflow"); } moea64_pvo_entries++; if (pvo->pvo_pmap == kernel_pmap) isync(); #ifdef __powerpc64__ /* * Make sure all our bootstrap mappings are in the SLB as soon * as virtual memory is switched on. */ if (!pmap_bootstrapped) moea64_bootstrap_slb_prefault(PVO_VADDR(pvo), pvo->pvo_vaddr & PVO_LARGE); #endif return (first ? ENOENT : 0); } static void moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo) { struct vm_page *pg; int32_t refchg; KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap")); PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO")); /* * If there is an active pte entry, we need to deactivate it */ refchg = MOEA64_PTE_UNSET(mmu, pvo); if (refchg < 0) { /* * If it was evicted from the page table, be pessimistic and * dirty the page. */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) refchg = LPTE_CHG; else refchg = 0; } /* * Update our statistics.
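 * (These mirror the increments in moea64_pvo_enter(): every PVO counts
 * toward resident_count, and PVO_WIRED ones also toward wired_count.)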
*/ pvo->pvo_pmap->pm_stats.resident_count--; if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count--; /* * Remove this PVO from the pmap list. */ RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Mark this for the next sweep */ pvo->pvo_vaddr |= PVO_DEAD; /* Send RC bits to VM */ if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); if (pg != NULL) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } } static void moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo) { struct vm_page *pg; KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page")); /* Use NULL pmaps as a sentinel for races in page deletion */ if (pvo->pvo_pmap == NULL) return; pvo->pvo_pmap = NULL; /* * Update vm about page writeability/executability if managed */ PV_LOCKASSERT(pvo->pvo_pte.pa & LPTE_RPGN); if (pvo->pvo_vaddr & PVO_MANAGED) { pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); if (pg != NULL) { LIST_REMOVE(pvo, pvo_vlink); if (LIST_EMPTY(vm_page_to_pvoh(pg))) vm_page_aflag_clear(pg, PGA_WRITEABLE | PGA_EXECUTABLE); } } moea64_pvo_entries--; moea64_pvo_remove_calls++; } static struct pvo_entry * moea64_pvo_find_va(pmap_t pm, vm_offset_t va) { struct pvo_entry key; PMAP_LOCK_ASSERT(pm, MA_OWNED); key.pvo_vaddr = va & ~ADDR_POFF; return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key)); } static boolean_t moea64_query_bit(mmu_t mmu, vm_page_t m, uint64_t ptebit) { struct pvo_entry *pvo; int64_t ret; boolean_t rv; /* * See if this bit is stored in the page already. */ if (m->md.mdpg_attrs & ptebit) return (TRUE); /* * Examine each PTE. Sync so that any pending REF/CHG bits are * flushed to the PTEs. */ rv = FALSE; powerpc_sync(); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { ret = 0; /* * See if this pvo has a valid PTE. If so, fetch the * REF/CHG bits from the valid PTE. If the appropriate * ptebit is set, return success. */ PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = MOEA64_PTE_SYNCH(mmu, pvo); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0) { atomic_set_32(&m->md.mdpg_attrs, ret & (LPTE_CHG | LPTE_REF)); if (ret & ptebit) { rv = TRUE; break; } } } PV_PAGE_UNLOCK(m); return (rv); } static u_int moea64_clear_bit(mmu_t mmu, vm_page_t m, u_int64_t ptebit) { u_int count; struct pvo_entry *pvo; int64_t ret; /* * Sync so that any pending REF/CHG bits are flushed to the PTEs (so * we can reset the right ones). */ powerpc_sync(); /* * For each pvo entry, clear the pte's ptebit.
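 *
 * (Hedged usage example, illustrative only: a pmap_clear_modify()-style
 * caller would pass LPTE_CHG and a reference-clearing caller LPTE_REF,
 * e.g.
 *
 *	count = moea64_clear_bit(mmu, m, LPTE_CHG);
 *
 * the return value counts the mappings whose PTE actually had the bit
 * set.)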
*/ count = 0; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { ret = 0; PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = MOEA64_PTE_CLEAR(mmu, pvo, ptebit); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0 && (ret & ptebit)) count++; } atomic_clear_32(&m->md.mdpg_attrs, ptebit); PV_PAGE_UNLOCK(m); return (count); } boolean_t moea64_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { struct pvo_entry *pvo, key; vm_offset_t ppa; int error = 0; if (hw_direct_map && mem_valid(pa, size) == 0) return (0); PMAP_LOCK(kernel_pmap); ppa = pa & ~ADDR_POFF; key.pvo_vaddr = DMAP_BASE_ADDRESS + ppa; for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key); ppa < pa + size; ppa += PAGE_SIZE, pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) { if (pvo == NULL || (pvo->pvo_pte.pa & LPTE_RPGN) != ppa) { error = EFAULT; break; } } PMAP_UNLOCK(kernel_pmap); return (error); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * moea64_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, ppa, offset; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("moea64_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { moea64_kenter_attr(mmu, tmpva, ppa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } return ((void *)(va + offset)); } void * moea64_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { return moea64_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT); } void moea64_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) { vm_offset_t base, offset; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); kva_free(base, size); } void moea64_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz) { struct pvo_entry *pvo; vm_offset_t lim; vm_paddr_t pa; vm_size_t len; PMAP_LOCK(pm); while (sz > 0) { lim = round_page(va+1); len = MIN(lim - va, sz); pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) { pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va & ADDR_POFF); moea64_syncicache(mmu, pm, va, pa, len); } va += len; sz -= len; } PMAP_UNLOCK(pm); } void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va) { *va = (void *)(uintptr_t)pa; } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void moea64_scan_init(mmu_t mmu) { struct pvo_entry *pvo; vm_offset_t va; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). */ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); for (i = 0; i < pregions_sz; i++) { dump_map[i].pa_start = pregions[i].mr_start; dump_map[i].pa_size = pregions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = (vm_paddr_t)(uintptr_t)msgbufp->msg_ptr; dump_map[1].pa_size = round_page(msgbufp->msg_size); /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. 
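 * (It occupies [kmi.buffer_sva, kmi.buffer_eva); the scan below hops
 * over that window rather than probing its pages.)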
*/ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } Index: head/sys/powerpc/conf/GENERIC64 =================================================================== --- head/sys/powerpc/conf/GENERIC64 (revision 346173) +++ head/sys/powerpc/conf/GENERIC64 (revision 346174) @@ -1,254 +1,255 @@ # # GENERIC -- Generic kernel configuration file for FreeBSD/powerpc # # For more information on this file, please read the handbook section on # Kernel Configuration Files: # # https://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (https://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. # # $FreeBSD$ cpu AIM ident GENERIC machine powerpc powerpc64 makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols makeoptions WITH_CTF=1 # Platform support options POWERMAC #NewWorld Apple PowerMacs options PS3 #Sony Playstation 3 options MAMBO #IBM Mambo Full System Simulator options PSERIES #PAPR-compliant systems (e.g. IBM p) options POWERNV #Non-virtualized OpenPOWER systems options FDT #Flattened Device Tree options SCHED_ULE #ULE scheduler +options NUMA #Non-Uniform Memory Architecture support options PREEMPTION #Enable kernel thread preemption options VIMAGE # Subsystem virtualization, e.g. VNET options INET #InterNETworking options INET6 #IPv6 communications protocols options IPSEC # IP (v4/v6) security options IPSEC_SUPPORT # Allow kldload of ipsec and tcpmd5 options TCP_OFFLOAD # TCP offload options TCP_BLACKBOX # Enhanced TCP event logging options TCP_HHOOK # hhook(9) framework for TCP options TCP_RFC7413 # TCP Fast Open options SCTP #Stream Control Transmission Protocol options FFS #Berkeley Fast Filesystem options SOFTUPDATES #Enable FFS soft updates support options UFS_ACL #Support for access control lists options UFS_DIRHASH #Improve performance on big directories options UFS_GJOURNAL #Enable gjournal-based UFS journaling options QUOTA #Enable disk quotas for UFS options MD_ROOT #MD is a potential root device options NFSCL #Network Filesystem Client options NFSD #Network Filesystem Server options NFSLOCKD #Network Lock Manager options NFS_ROOT #NFS usable as root device options MSDOSFS #MSDOS Filesystem options CD9660 #ISO 9660 Filesystem options PROCFS #Process filesystem (requires PSEUDOFS) options PSEUDOFS #Pseudo-filesystem framework options GEOM_PART_APM #Apple Partition Maps. options GEOM_PART_GPT #GUID Partition Tables. 
options GEOM_LABEL #Provides labelization options COMPAT_FREEBSD32 #Compatible with FreeBSD/powerpc binaries options COMPAT_FREEBSD5 #Compatible with FreeBSD5 options COMPAT_FREEBSD6 #Compatible with FreeBSD6 options COMPAT_FREEBSD7 #Compatible with FreeBSD7 options COMPAT_FREEBSD9 # Compatible with FreeBSD9 options COMPAT_FREEBSD10 # Compatible with FreeBSD10 options COMPAT_FREEBSD11 # Compatible with FreeBSD11 options SCSI_DELAY=5000 #Delay (in ms) before probing SCSI options KTRACE #ktrace(1) syscall trace support options STACK #stack(9) support options SYSVSHM #SYSV-style shared memory options SYSVMSG #SYSV-style message queues options SYSVSEM #SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING #Posix P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options CAPABILITY_MODE # Capsicum capability mode options CAPABILITIES # Capsicum capabilities options MAC # TrustedBSD MAC Framework options KDTRACE_HOOKS # Kernel DTrace hooks options DDB_CTF # Kernel ELF linker loads CTF data options INCLUDE_CONFIG_FILE # Include this file in kernel options RACCT # Resource accounting framework options RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. # For full debugger support use (turn off in stable branch): options DDB #Support DDB #options DEADLKRES #Enable the deadlock resolver options INVARIANTS #Enable calls of extra sanity checking options INVARIANT_SUPPORT #Extra sanity checks of internal structures, required by INVARIANTS options WITNESS #Enable checks to detect deadlocks and cycles options WITNESS_SKIPSPIN #Don't run witness on spinlocks for speed options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones options VERBOSE_SYSINIT=0 # Support debug.verbose_sysinit, off by default # Kernel dump features. 
options EKCD # Support for encrypted kernel dumps options GZIO # gzip-compressed kernel and user dumps options ZSTDIO # zstd-compressed kernel and user dumps options NETDUMP # netdump(4) client support # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel # CPU frequency control device cpufreq # Standard busses device pci options PCI_HP # PCI-Express native HotPlug device agp # ATA controllers device ahci # AHCI-compatible SATA controllers device ata # Legacy ATA/SATA controllers device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA # NVM Express (NVMe) support device nvme # base NVMe driver options NVME_USE_NVD=0 # prefer the cam(4) based nda(4) driver device nvd # expose NVMe namespaces as disks, depends on nvme # SCSI Controllers device ahc # AHA2940 and onboard AIC7xxx devices options AHC_ALLOW_MEMIO # Attempt to use memory mapped I/O device isp # Qlogic family device ispfw # Firmware module for Qlogic host adapters device mpt # LSI-Logic MPT-Fusion device mps # LSI-Logic MPT-Fusion 2 device sym # NCR/Symbios/LSI Logic 53C8XX/53C1010/53C1510D # ATA/SCSI peripherals device scbus # SCSI bus (required for ATA/SCSI) device ch # SCSI media changers device da # Direct Access (disks) device sa # Sequential Access (tape etc) device cd # CD device pass # Passthrough device (direct ATA/SCSI access) device ses # Enclosure Service (SES and SAF-TE) # vt is the default console driver, resembling an SCO console device vt # Core console driver device kbdmux # Serial (COM) ports device scc device uart device uart_z8530 device iflib # Ethernet hardware device em # Intel PRO/1000 Gigabit Ethernet Family device ix # Intel PRO/10GbE PCIE PF Ethernet Family device ixv # Intel PRO/10GbE PCIE VF Ethernet Family device glc # Sony Playstation 3 Ethernet device llan # IBM pSeries Virtual Ethernet device cxgbe # Chelsio 10/25G NIC # PCI Ethernet NICs that use the common MII bus controller code. device miibus # MII bus support device bge # Broadcom BCM570xx Gigabit Ethernet device gem # Sun GEM/Sun ERI/Apple GMAC device dc # DEC/Intel 21143 and various workalikes device fxp # Intel EtherExpress PRO/100B (82557, 82558) device re # RealTek 8139C+/8169/8169S/8110S device rl # RealTek 8129/8139 # Pseudo devices. device crypto # core crypto support device loop # Network loopback device random # Entropy device device ether # Ethernet support device vlan # 802.1Q VLAN support device tun # Packet tunnel. device md # Memory "disks" device ofwd # Open Firmware disks device gif # IPv6 and IPv4 tunneling device firmware # firmware assist module # The `bpf' device enables the Berkeley Packet Filter. # Be aware of the administrative consequences of enabling this! # Note that 'bpf' is required for DHCP. 
device bpf #Berkeley packet filter # USB support options USB_DEBUG # enable debug msgs device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device ehci # EHCI PCI->USB interface device xhci # XHCI PCI->USB interface device usb # USB Bus (required) device uhid # "Human Interface Devices" device ukbd # Keyboard options KBD_INSTALL_CDEV # install a CDEV entry in /dev device umass # Disks/Mass storage - Requires scbus and da0 device ums # Mouse # USB Ethernet device aue # ADMtek USB Ethernet device axe # ASIX Electronics USB Ethernet device cdce # Generic USB over Ethernet device cue # CATC USB Ethernet device kue # Kawasaki LSI USB Ethernet # Wireless NIC cards options IEEE80211_SUPPORT_MESH # FireWire support device firewire # FireWire bus code device sbp # SCSI over FireWire (Requires scbus and da) device fwe # Ethernet over FireWire (non-standard!) # Misc device iicbus # I2C bus code device iic device kiic # Keywest I2C device ad7417 # PowerMac7,2 temperature sensor device ds1631 # PowerMac11,2 temperature sensor device ds1775 # PowerMac7,2 temperature sensor device fcu # Apple Fan Control Unit device max6690 # PowerMac7,2 temperature sensor device powermac_nvram # Open Firmware configuration NVRAM device smu # Apple System Management Unit device atibl # ATI-based backlight driver for PowerBooks/iBooks device nvbl # nVidia-based backlight driver for PowerBooks/iBooks # ADB support device adb device pmu # Sound support device sound # Generic sound driver (required) device snd_ai2s # Apple I2S audio device snd_uaudio # USB Audio # Netmap provides direct access to TX/RX rings on supported NICs device netmap # netmap(4) support # evdev interface options EVDEV_SUPPORT # evdev support in legacy drivers device evdev # input event device support device uinput # install /dev/uinput cdev Index: head/sys/powerpc/include/intr_machdep.h =================================================================== --- head/sys/powerpc/include/intr_machdep.h (revision 346173) +++ head/sys/powerpc/include/intr_machdep.h (revision 346174) @@ -1,66 +1,66 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2002 Benno Rice. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _MACHINE_INTR_MACHDEP_H_ #define _MACHINE_INTR_MACHDEP_H_ #define INTR_VECTORS 256 #define MAX_PICS 32 #define MAP_IRQ(node, pin) powerpc_get_irq(node, pin) /* * Default base address for MSI messages on PowerPC */ #define MSI_INTEL_ADDR_BASE 0xfee00000 extern device_t root_pic; struct trapframe; driver_filter_t powerpc_ipi_handler; void intrcnt_add(const char *name, u_long **countp); u_int powerpc_register_pic(device_t, uint32_t, u_int, u_int, u_int); u_int powerpc_get_irq(uint32_t, u_int); void powerpc_dispatch_intr(u_int, struct trapframe *); int powerpc_enable_intr(void); int powerpc_setup_intr(const char *, u_int, driver_filter_t, driver_intr_t, - void *, enum intr_type, void **); + void *, enum intr_type, void **, int); int powerpc_teardown_intr(void *); int powerpc_bind_intr(u_int irq, u_char cpu); int powerpc_config_intr(int, enum intr_trigger, enum intr_polarity); int powerpc_fw_config_intr(int irq, int sense_code); void powerpc_intr_mask(u_int irq); void powerpc_intr_unmask(u_int irq); #endif /* _MACHINE_INTR_MACHDEP_H_ */ Index: head/sys/powerpc/include/ofw_machdep.h =================================================================== --- head/sys/powerpc/include/ofw_machdep.h (revision 346173) +++ head/sys/powerpc/include/ofw_machdep.h (revision 346174) @@ -1,53 +1,57 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001 by Thomas Moestl . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_OFW_MACHDEP_H_ #define _MACHINE_OFW_MACHDEP_H_ #include #include #include #include #include #include typedef uint32_t cell_t; void OF_getetheraddr(device_t dev, u_char *addr); void OF_initial_setup(void *fdt_ptr, void *junk, int (*openfirm)(void *)); boolean_t OF_bootstrap(void); void OF_reboot(void); void ofw_mem_regions(struct mem_region *, int *, struct mem_region *, int *); +void ofw_numa_mem_regions(struct numa_mem_region *, int *); void ofw_quiesce(void); /* Must be called before VM is up! 
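 * (i.e. while pmap_bootstrapped is still false; ofw_quiesce() asserts
 * exactly this.)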
*/ void ofw_save_trap_vec(char *); +int ofw_pcibus_get_domain(device_t dev, device_t child, int *domain); +int ofw_pcibus_get_cpus(device_t dev, device_t child, enum cpu_sets op, + size_t setsize, cpuset_t *cpuset); #endif /* _MACHINE_OFW_MACHDEP_H_ */ Index: head/sys/powerpc/include/param.h =================================================================== --- head/sys/powerpc/include/param.h (revision 346173) +++ head/sys/powerpc/include/param.h (revision 346174) @@ -1,138 +1,138 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2001 David E. O'Brien * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)param.h 5.8 (Berkeley) 6/28/91 * $FreeBSD$ */ #ifndef _POWERPC_INCLUDE_PARAM_H_ #define _POWERPC_INCLUDE_PARAM_H_ /* * Machine dependent constants for PowerPC */ #include /* Needed to display interrupts on OFW PCI */ #define __PCI_REROUTE_INTERRUPT #ifndef MACHINE #define MACHINE "powerpc" #endif #ifndef MACHINE_ARCH #ifdef __powerpc64__ #define MACHINE_ARCH "powerpc64" #else #ifdef __SPE__ #define MACHINE_ARCH "powerpcspe" #else #define MACHINE_ARCH "powerpc" #endif #endif #endif #define MID_MACHINE MID_POWERPC #ifdef __powerpc64__ #ifndef MACHINE_ARCH32 #define MACHINE_ARCH32 "powerpc" #endif #endif #if defined(SMP) || defined(KLD_MODULE) #ifndef MAXCPU #define MAXCPU 256 #endif #else #define MAXCPU 1 #endif /* SMP || KLD_MODULE */ #ifndef MAXMEMDOM -#define MAXMEMDOM 1 +#define MAXMEMDOM 8 #endif #define ALIGNBYTES _ALIGNBYTES #define ALIGN(p) _ALIGN(p) /* * ALIGNED_POINTER is a boolean macro that checks whether an address * is valid to fetch data elements of type t from on this architecture. * This does not reflect the optimal alignment, just the possibility * (within reasonable limits). 
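 *
 * Usage sketch (illustrative, not part of this header; `off' and `val'
 * are assumed locals):
 *
 *	char buf[8];
 *	uint32_t *p = (uint32_t *)&buf[off];
 *	if (ALIGNED_POINTER(p, uint32_t))
 *		val = *p;	(safe: the low two bits of p are clear)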
*/ #define ALIGNED_POINTER(p, t) ((((uintptr_t)(p)) & (sizeof (t) - 1)) == 0) /* * CACHE_LINE_SIZE is the compile-time maximum cache line size for an * architecture. It should be used with appropriate caution. */ #define CACHE_LINE_SHIFT 7 #define CACHE_LINE_SIZE (1 << CACHE_LINE_SHIFT) #define PAGE_SHIFT 12 #define PAGE_SIZE (1L << PAGE_SHIFT) /* Page size */ #define PAGE_MASK (PAGE_SIZE - 1) #define NPTEPG (PAGE_SIZE/(sizeof (pt_entry_t))) #define MAXPAGESIZES 1 /* maximum number of supported page sizes */ #ifndef KSTACK_PAGES #ifdef __powerpc64__ #define KSTACK_PAGES 8 /* includes pcb */ #else #define KSTACK_PAGES 4 /* includes pcb */ #endif #endif #define KSTACK_GUARD_PAGES 1 /* pages of kstack guard; 0 disables */ #define USPACE (kstack_pages * PAGE_SIZE) /* total size of pcb */ /* * Mach derived conversion macros */ #define trunc_page(x) ((x) & ~(PAGE_MASK)) #define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK) #define atop(x) ((x) >> PAGE_SHIFT) #define ptoa(x) ((x) << PAGE_SHIFT) #define powerpc_btop(x) ((x) >> PAGE_SHIFT) #define powerpc_ptob(x) ((x) << PAGE_SHIFT) #define pgtok(x) ((x) * (PAGE_SIZE / 1024UL)) #define btoc(x) ((vm_offset_t)(((x)+PAGE_MASK)>>PAGE_SHIFT)) #endif /* !_POWERPC_INCLUDE_PARAM_H_ */ Index: head/sys/powerpc/include/platform.h =================================================================== --- head/sys/powerpc/include/platform.h (revision 346173) +++ head/sys/powerpc/include/platform.h (revision 346174) @@ -1,68 +1,75 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1996 Wolfgang Solfrank. * Copyright (C) 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $NetBSD: powerpc.h,v 1.3 2000/06/01 00:49:59 matt Exp $ * $FreeBSD$ */ #ifndef _MACHINE_PLATFORM_H_ #define _MACHINE_PLATFORM_H_ #include #include struct mem_region { uint64_t mr_start; uint64_t mr_size; }; +struct numa_mem_region { + uint64_t mr_start; + uint64_t mr_size; + uint64_t mr_domain; +}; + /* Documentation for these functions is in platform_if.m */ void mem_regions(struct mem_region **, int *, struct mem_region **, int *); +void numa_mem_regions(struct numa_mem_region **, int *); vm_offset_t platform_real_maxaddr(void); u_long platform_timebase_freq(struct cpuref *); int platform_smp_first_cpu(struct cpuref *); int platform_smp_next_cpu(struct cpuref *); int platform_smp_get_bsp(struct cpuref *); int platform_smp_start_cpu(struct pcpu *); void platform_smp_timebase_sync(u_long tb, int ap); void platform_smp_ap_init(void); void platform_smp_probe_threads(void); const char *installed_platform(void); void platform_probe_and_attach(void); void platform_sleep(void); #endif /* _MACHINE_PLATFORM_H_ */ Index: head/sys/powerpc/include/smp.h =================================================================== --- head/sys/powerpc/include/smp.h (revision 346173) +++ head/sys/powerpc/include/smp.h (revision 346174) @@ -1,67 +1,68 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _MACHINE_SMP_H_ #define _MACHINE_SMP_H_ #ifdef _KERNEL #define IPI_AST 0 #define IPI_PREEMPT 1 #define IPI_RENDEZVOUS 2 #define IPI_STOP 3 #define IPI_STOP_HARD 3 #define IPI_HARDCLOCK 4 #ifndef LOCORE #include #include void ipi_all_but_self(int ipi); void ipi_cpu(int cpu, u_int ipi); void ipi_selected(cpuset_t cpus, int ipi); struct cpuref { uintptr_t cr_hwref; u_int cr_cpuid; + u_int cr_domain; }; void pmap_cpu_bootstrap(int); void cpudep_ap_early_bootstrap(void); uintptr_t cpudep_ap_bootstrap(void); void cpudep_ap_setup(void); void machdep_ap_bootstrap(void); extern struct pcb stoppcbs[]; #endif /* !LOCORE */ #endif /* _KERNEL */ #endif /* !_MACHINE_SMP_H */ Index: head/sys/powerpc/ofw/ofw_machdep.c =================================================================== --- head/sys/powerpc/ofw/ofw_machdep.c (revision 346173) +++ head/sys/powerpc/ofw/ofw_machdep.c (revision 346174) @@ -1,778 +1,872 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1996 Wolfgang Solfrank. * Copyright (C) 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 * * $NetBSD: ofw_machdep.c,v 1.5 2000/05/23 13:25:43 tsubai Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #ifdef POWERNV #include #endif static void *fdt; int ofw_real_mode; #ifdef AIM extern register_t ofmsr[5]; extern void *openfirmware_entry; char save_trap_init[0x2f00]; /* EXC_LAST */ char save_trap_of[0x2f00]; /* EXC_LAST */ int ofwcall(void *); static int openfirmware(void *args); __inline void ofw_save_trap_vec(char *save_trap_vec) { if (!ofw_real_mode || !hw_direct_map) return; bcopy((void *)PHYS_TO_DMAP(EXC_RST), save_trap_vec, EXC_LAST - EXC_RST); } static __inline void ofw_restore_trap_vec(char *restore_trap_vec) { if (!ofw_real_mode || !hw_direct_map) return; bcopy(restore_trap_vec, (void *)PHYS_TO_DMAP(EXC_RST), EXC_LAST - EXC_RST); __syncicache((void *)PHYS_TO_DMAP(EXC_RSVD), EXC_LAST - EXC_RSVD); } /* * Saved SPRG0-3 from OpenFirmware. Will be restored prior to the callback. */ register_t ofw_sprg0_save; static __inline void ofw_sprg_prepare(void) { if (ofw_real_mode) return; /* * Assume that interrupts are disabled at this point, or * SPRG1-3 could be trashed */ #ifdef __powerpc64__ __asm __volatile("mtsprg1 %0\n\t" "mtsprg2 %1\n\t" "mtsprg3 %2\n\t" : : "r"(ofmsr[2]), "r"(ofmsr[3]), "r"(ofmsr[4])); #else __asm __volatile("mfsprg0 %0\n\t" "mtsprg0 %1\n\t" "mtsprg1 %2\n\t" "mtsprg2 %3\n\t" "mtsprg3 %4\n\t" : "=&r"(ofw_sprg0_save) : "r"(ofmsr[1]), "r"(ofmsr[2]), "r"(ofmsr[3]), "r"(ofmsr[4])); #endif } static __inline void ofw_sprg_restore(void) { if (ofw_real_mode) return; /* * Note that SPRG1-3 contents are irrelevant. They are scratch * registers used in the early portion of trap handling when * interrupts are disabled. * * PCPU data cannot be used until this routine is called! */ #ifndef __powerpc64__ __asm __volatile("mtsprg0 %0" :: "r"(ofw_sprg0_save)); #endif } #endif static int parse_ofw_memory(phandle_t node, const char *prop, struct mem_region *output) { cell_t address_cells, size_cells; cell_t OFmem[4 * PHYS_AVAIL_SZ]; int sz, i, j; phandle_t phandle; sz = 0; /* * Get #address-cells from root node, defaulting to 1 if it cannot * be found. */ phandle = OF_finddevice("/"); if (OF_getencprop(phandle, "#address-cells", &address_cells, sizeof(address_cells)) < (ssize_t)sizeof(address_cells)) address_cells = 1; if (OF_getencprop(phandle, "#size-cells", &size_cells, sizeof(size_cells)) < (ssize_t)sizeof(size_cells)) size_cells = 1; /* * Get memory. */ if (node == -1 || (sz = OF_getencprop(node, prop, OFmem, sizeof(OFmem))) <= 0) panic("Physical memory map not found"); i = 0; j = 0; while (i < sz/sizeof(cell_t)) { output[j].mr_start = OFmem[i++]; if (address_cells == 2) { output[j].mr_start <<= 32; output[j].mr_start += OFmem[i++]; } output[j].mr_size = OFmem[i++]; if (size_cells == 2) { output[j].mr_size <<= 32; output[j].mr_size += OFmem[i++]; } if (output[j].mr_start > BUS_SPACE_MAXADDR) continue; /* * Constrain memory to that which we can access. * 32-bit AIM can only reference 32 bits of address currently, * but Book-E can access 36 bits.
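 * (Worked example, illustrative: with BUS_SPACE_MAXADDR == 0xffffffff,
 * a region starting at 0xf0000000 with size 0x20000000 is clamped below
 * to size 0xffffffff - 0xf0000000 + 1 = 0x10000000.)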
*/ if (((uint64_t)output[j].mr_start + (uint64_t)output[j].mr_size - 1) > BUS_SPACE_MAXADDR) { output[j].mr_size = BUS_SPACE_MAXADDR - output[j].mr_start + 1; } j++; } - sz = j*sizeof(output[0]); - return (sz); + return (j); } +static int +parse_numa_ofw_memory(phandle_t node, const char *prop, + struct numa_mem_region *output) +{ + cell_t address_cells, size_cells; + cell_t OFmem[4 * PHYS_AVAIL_SZ]; + int sz, i, j; + phandle_t phandle; + + sz = 0; + + /* + * Get #address-cells from root node, defaulting to 1 if it cannot + * be found. + */ + phandle = OF_finddevice("/"); + if (OF_getencprop(phandle, "#address-cells", &address_cells, + sizeof(address_cells)) < (ssize_t)sizeof(address_cells)) + address_cells = 1; + if (OF_getencprop(phandle, "#size-cells", &size_cells, + sizeof(size_cells)) < (ssize_t)sizeof(size_cells)) + size_cells = 1; + + /* + * Get memory. + */ + if (node == -1 || (sz = OF_getencprop(node, prop, + OFmem, sizeof(OFmem))) <= 0) + panic("Physical memory map not found"); + + i = 0; + j = 0; + while (i < sz/sizeof(cell_t)) { + output[j].mr_start = OFmem[i++]; + if (address_cells == 2) { + output[j].mr_start <<= 32; + output[j].mr_start += OFmem[i++]; + } + output[j].mr_size = OFmem[i++]; + if (size_cells == 2) { + output[j].mr_size <<= 32; + output[j].mr_size += OFmem[i++]; + } + j++; + } + + return (j); +} + #ifdef FDT static int excise_reserved_regions(struct mem_region *avail, int asz, struct mem_region *exclude, int esz) { int i, j, k; for (i = 0; i < asz; i++) { for (j = 0; j < esz; j++) { /* * Case 1: Exclusion region encloses complete * available entry. Drop it and move on. */ if (exclude[j].mr_start <= avail[i].mr_start && exclude[j].mr_start + exclude[j].mr_size >= avail[i].mr_start + avail[i].mr_size) { for (k = i+1; k < asz; k++) avail[k-1] = avail[k]; asz--; i--; /* Repeat some entries */ continue; } /* * Case 2: Exclusion region starts in available entry. * Trim it to where the entry begins and append * a new available entry with the region after * the excluded region, if any. */ if (exclude[j].mr_start >= avail[i].mr_start && exclude[j].mr_start < avail[i].mr_start + avail[i].mr_size) { if (exclude[j].mr_start + exclude[j].mr_size < avail[i].mr_start + avail[i].mr_size) { avail[asz].mr_start = exclude[j].mr_start + exclude[j].mr_size; avail[asz].mr_size = avail[i].mr_start + avail[i].mr_size - avail[asz].mr_start; asz++; } avail[i].mr_size = exclude[j].mr_start - avail[i].mr_start; } /* * Case 3: Exclusion region ends in available entry. * Move start point to where the exclusion zone ends. * The case of a contained exclusion zone has already * been caught in case 2. 
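 * (Worked example, illustrative: avail = [0x1000, 0x5000) with
 * exclude = [0x800, 0x2000) overlaps only at the front, so the start
 * moves up and avail becomes [0x2000, 0x5000), shrinking mr_size from
 * 0x4000 to 0x3000.)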
*/ if (exclude[j].mr_start + exclude[j].mr_size >= avail[i].mr_start && exclude[j].mr_start + exclude[j].mr_size < avail[i].mr_start + avail[i].mr_size) { avail[i].mr_size += avail[i].mr_start; avail[i].mr_start = exclude[j].mr_start + exclude[j].mr_size; avail[i].mr_size -= avail[i].mr_start; } } } return (asz); } static int excise_initrd_region(struct mem_region *avail, int asz) { phandle_t chosen; uint64_t start, end; ssize_t size; struct mem_region initrdmap[1]; pcell_t cell[2]; chosen = OF_finddevice("/chosen"); size = OF_getencprop(chosen, "linux,initrd-start", cell, sizeof(cell)); if (size < 0) return (asz); else if (size == 4) start = cell[0]; else if (size == 8) start = (uint64_t)cell[0] << 32 | cell[1]; else { /* Invalid value length */ printf("WARNING: linux,initrd-start must be either 4 or 8 bytes long\n"); return (asz); } size = OF_getencprop(chosen, "linux,initrd-end", cell, sizeof(cell)); if (size < 0) return (asz); else if (size == 4) end = cell[0]; else if (size == 8) end = (uint64_t)cell[0] << 32 | cell[1]; else { /* Invalid value length */ printf("WARNING: linux,initrd-end must be either 4 or 8 bytes long\n"); return (asz); } if (end <= start) return (asz); initrdmap[0].mr_start = start; initrdmap[0].mr_size = end - start; asz = excise_reserved_regions(avail, asz, initrdmap, 1); return (asz); } #ifdef POWERNV static int excise_msi_region(struct mem_region *avail, int asz) { uint64_t start, end; struct mem_region initrdmap[1]; /* * This range of physical addresses is used to implement optimized * 32 bit MSI interrupts on POWER9. Exclude it to avoid accidentally * using it for DMA, as this will cause an immediate PHB fence. * While we could theoretically turn off this behavior in the ETU, * doing so would break 32-bit MSI, so just reserve the range in * the physical map instead. * See section 4.4.2.8 of the PHB4 specification. */ start = 0x00000000ffff0000ul; end = 0x00000000fffffffful; initrdmap[0].mr_start = start; initrdmap[0].mr_size = end - start; asz = excise_reserved_regions(avail, asz, initrdmap, 1); return (asz); } #endif static int excise_fdt_reserved(struct mem_region *avail, int asz) { struct mem_region fdtmap[32]; ssize_t fdtmapsize; phandle_t chosen; int j, fdtentries; chosen = OF_finddevice("/chosen"); fdtmapsize = OF_getprop(chosen, "fdtmemreserv", fdtmap, sizeof(fdtmap)); for (j = 0; j < fdtmapsize/sizeof(fdtmap[0]); j++) { fdtmap[j].mr_start = be64toh(fdtmap[j].mr_start) & ~PAGE_MASK; fdtmap[j].mr_size = round_page(be64toh(fdtmap[j].mr_size)); } KASSERT(j*sizeof(fdtmap[0]) < sizeof(fdtmap), ("Exceeded number of FDT reservations")); /* Add a virtual entry for the FDT itself */ if (fdt != NULL) { fdtmap[j].mr_start = (vm_offset_t)fdt & ~PAGE_MASK; fdtmap[j].mr_size = round_page(fdt_totalsize(fdt)); fdtmapsize += sizeof(fdtmap[0]); } fdtentries = fdtmapsize/sizeof(fdtmap[0]); asz = excise_reserved_regions(avail, asz, fdtmap, fdtentries); return (asz); } #endif /* * This is called during powerpc_init, before the system is really initialized. * It shall provide the total and the available regions of RAM. * The available regions need not take the kernel into account. */ void +ofw_numa_mem_regions(struct numa_mem_region *memp, int *memsz) +{ + phandle_t phandle; + int res, count, msz; + char name[31]; + cell_t associativity[5]; + struct numa_mem_region *curmemp; + + msz = 0; + /* + * Get memory from all the /memory nodes. 
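+ *
+ * (Hedged sketch of the decoding below; the device-tree values are an
+ * assumption for illustration. A node such as
+ *
+ *	memory@100000000 {
+ *		reg = <0x1 0x00000000 0x1 0x00000000>;
+ *		ibm,associativity = <0x4 0x0 0x0 0x1 0x0>;
+ *	};
+ *
+ * parses to mr_start = 4 GB and mr_size = 4 GB, and with the domain in
+ * the fourth cell, mr_domain = associativity[3] - 1 = 0.)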
+ */ + for (phandle = OF_child(OF_peer(0)); phandle != 0; + phandle = OF_peer(phandle)) { + if (OF_getprop(phandle, "name", name, sizeof(name)) <= 0) + continue; + if (strncmp(name, "memory@", strlen("memory@")) != 0) + continue; + + count = parse_numa_ofw_memory(phandle, "reg", &memp[msz]); + if (count == 0) + continue; + curmemp = &memp[msz]; + res = OF_getproplen(phandle, "ibm,associativity"); + if (res <= 0) + continue; + MPASS(count == 1); + OF_getencprop(phandle, "ibm,associativity", + associativity, res); + curmemp->mr_domain = associativity[3] - 1; + if (bootverbose) + printf("%s %#jx-%#jx domain(%ju)\n", + name, (uintmax_t)curmemp->mr_start, + (uintmax_t)curmemp->mr_start + curmemp->mr_size, + (uintmax_t)curmemp->mr_domain); + msz += count; + } + *memsz = msz; +} +/* + * This is called during powerpc_init, before the system is really initialized. + * It shall provide the total and the available regions of RAM. + * The available regions need not take the kernel into account. + */ +void ofw_mem_regions(struct mem_region *memp, int *memsz, struct mem_region *availp, int *availsz) { phandle_t phandle; int asz, msz; int res; char name[31]; asz = msz = 0; /* * Get memory from all the /memory nodes. */ for (phandle = OF_child(OF_peer(0)); phandle != 0; phandle = OF_peer(phandle)) { if (OF_getprop(phandle, "name", name, sizeof(name)) <= 0) continue; if (strncmp(name, "memory", sizeof(name)) != 0 && strncmp(name, "memory@", strlen("memory@")) != 0) continue; res = parse_ofw_memory(phandle, "reg", &memp[msz]); - msz += res/sizeof(struct mem_region); + msz += res; /* * On POWER9 Systems we might have both linux,usable-memory and * reg properties. 'reg' denotes all available memory, but we * must use 'linux,usable-memory', a subset, as some memory * regions are reserved for NVLink. */ if (OF_getproplen(phandle, "linux,usable-memory") >= 0) res = parse_ofw_memory(phandle, "linux,usable-memory", &availp[asz]); else if (OF_getproplen(phandle, "available") >= 0) res = parse_ofw_memory(phandle, "available", &availp[asz]); else res = parse_ofw_memory(phandle, "reg", &availp[asz]); - asz += res/sizeof(struct mem_region); + asz += res; } #ifdef FDT phandle = OF_finddevice("/chosen"); if (OF_hasprop(phandle, "fdtmemreserv")) asz = excise_fdt_reserved(availp, asz); /* If the kernel is being loaded through kexec, initrd region is listed * in /chosen but the region is not marked as reserved, so, we might exclude * it here. 
*/ if (OF_hasprop(phandle, "linux,initrd-start")) asz = excise_initrd_region(availp, asz); #endif #ifdef POWERNV if (opal_check() == 0) asz = excise_msi_region(availp, asz); #endif *memsz = msz; *availsz = asz; } void OF_initial_setup(void *fdt_ptr, void *junk, int (*openfirm)(void *)) { #ifdef AIM ofmsr[0] = mfmsr(); #ifdef __powerpc64__ ofmsr[0] &= ~PSL_SF; #else __asm __volatile("mfsprg0 %0" : "=&r"(ofmsr[1])); #endif __asm __volatile("mfsprg1 %0" : "=&r"(ofmsr[2])); __asm __volatile("mfsprg2 %0" : "=&r"(ofmsr[3])); __asm __volatile("mfsprg3 %0" : "=&r"(ofmsr[4])); openfirmware_entry = openfirm; if (ofmsr[0] & PSL_DR) ofw_real_mode = 0; else ofw_real_mode = 1; ofw_save_trap_vec(save_trap_init); #else ofw_real_mode = 1; #endif fdt = fdt_ptr; } boolean_t OF_bootstrap() { boolean_t status = FALSE; int err = 0; #ifdef AIM if (openfirmware_entry != NULL) { if (ofw_real_mode) { status = OF_install(OFW_STD_REAL, 0); } else { #ifdef __powerpc64__ status = OF_install(OFW_STD_32BIT, 0); #else status = OF_install(OFW_STD_DIRECT, 0); #endif } if (status != TRUE) return status; err = OF_init(openfirmware); } else #endif if (fdt != NULL) { #ifdef FDT #ifdef AIM bus_space_tag_t fdt_bt; vm_offset_t tmp_fdt_ptr; vm_size_t fdt_size; uintptr_t fdt_va; #endif status = OF_install(OFW_FDT, 0); if (status != TRUE) return status; #ifdef AIM /* AIM-only for now -- Book-E does this remapping in early init */ /* Get the FDT size for mapping if we can */ tmp_fdt_ptr = pmap_early_io_map((vm_paddr_t)fdt, PAGE_SIZE); if (fdt_check_header((void *)tmp_fdt_ptr) != 0) { pmap_early_io_unmap(tmp_fdt_ptr, PAGE_SIZE); return FALSE; } fdt_size = fdt_totalsize((void *)tmp_fdt_ptr); pmap_early_io_unmap(tmp_fdt_ptr, PAGE_SIZE); /* * Map this for real. Use bus_space_map() to take advantage * of its auto-remapping function once the kernel is loaded. * This is a dirty hack, but it is what we have. */ #ifdef _LITTLE_ENDIAN fdt_bt = &bs_le_tag; #else fdt_bt = &bs_be_tag; #endif bus_space_map(fdt_bt, (vm_paddr_t)fdt, fdt_size, 0, &fdt_va); err = OF_init((void *)fdt_va); #else err = OF_init(fdt); #endif #endif } #ifdef FDT_DTB_STATIC /* * Check for a statically included blob already in the kernel and * needing no mapping. */ else { status = OF_install(OFW_FDT, 0); if (status != TRUE) return status; err = OF_init(&fdt_static_dtb); } #endif if (err != 0) { OF_install(NULL, 0); status = FALSE; } return (status); } #ifdef AIM void ofw_quiesce(void) { struct { cell_t name; cell_t nargs; cell_t nreturns; } args; KASSERT(!pmap_bootstrapped, ("Cannot call ofw_quiesce after VM is up")); args.name = (cell_t)(uintptr_t)"quiesce"; args.nargs = 0; args.nreturns = 0; openfirmware(&args); } static int openfirmware_core(void *args) { int result; register_t oldmsr; if (openfirmware_entry == NULL) return (-1); /* * Turn off exceptions - we really don't want to end up * anywhere unexpected with PCPU set to something strange * or the stack pointer wrong.
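 * The bracket below runs, in order: disable interrupts, swap in the
 * firmware's SPRGs, save our trap vectors and restore the ones saved at
 * firmware entry, call ofwcall(), then undo each step in reverse.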
*/ oldmsr = intr_disable(); ofw_sprg_prepare(); /* Save trap vectors */ ofw_save_trap_vec(save_trap_of); /* Restore initially saved trap vectors */ ofw_restore_trap_vec(save_trap_init); #ifndef __powerpc64__ /* * Clear battable[] translations */ if (!(cpu_features & PPC_FEATURE_64)) __asm __volatile("mtdbatu 2, %0\n" "mtdbatu 3, %0" : : "r" (0)); isync(); #endif result = ofwcall(args); /* Restore trap vectors */ ofw_restore_trap_vec(save_trap_of); ofw_sprg_restore(); intr_restore(oldmsr); return (result); } #ifdef SMP struct ofw_rv_args { void *args; int retval; volatile int in_progress; }; static void ofw_rendezvous_dispatch(void *xargs) { struct ofw_rv_args *rv_args = xargs; /* NOTE: Interrupts are disabled here */ if (PCPU_GET(cpuid) == 0) { /* * Execute all OF calls on CPU 0 */ rv_args->retval = openfirmware_core(rv_args->args); rv_args->in_progress = 0; } else { /* * Spin with interrupts off on other CPUs while OF has * control of the machine. */ while (rv_args->in_progress) cpu_spinwait(); } } #endif static int openfirmware(void *args) { int result; #ifdef SMP struct ofw_rv_args rv_args; #endif if (openfirmware_entry == NULL) return (-1); #ifdef SMP if (cold) { result = openfirmware_core(args); } else { rv_args.args = args; rv_args.in_progress = 1; smp_rendezvous(smp_no_rendezvous_barrier, ofw_rendezvous_dispatch, smp_no_rendezvous_barrier, &rv_args); result = rv_args.retval; } #else result = openfirmware_core(args); #endif return (result); } void OF_reboot() { struct { cell_t name; cell_t nargs; cell_t nreturns; cell_t arg; } args; args.name = (cell_t)(uintptr_t)"interpret"; args.nargs = 1; args.nreturns = 0; args.arg = (cell_t)(uintptr_t)"reset-all"; openfirmware_core(&args); /* Don't do rendezvous! */ for (;;); /* just in case */ } #endif /* AIM */ void OF_getetheraddr(device_t dev, u_char *addr) { phandle_t node; node = ofw_bus_get_node(dev); OF_getprop(node, "local-mac-address", addr, ETHER_ADDR_LEN); } /* * Return a bus handle and bus tag that correspond to the register * numbered regno for the device referenced by the package handle * dev. This function is intended to be used by console drivers in * early boot only. It works by mapping the address of the device's * register in the address space of its parent and recursively walking * the device tree upward this way. */ int OF_decode_addr(phandle_t dev, int regno, bus_space_tag_t *tag, bus_space_handle_t *handle, bus_size_t *sz) { bus_addr_t addr; bus_size_t size; pcell_t pci_hi; int flags, res; res = ofw_reg_to_paddr(dev, regno, &addr, &size, &pci_hi); if (res < 0) return (res); if (pci_hi == OFW_PADDR_NOT_PCI) { *tag = &bs_be_tag; flags = 0; } else { *tag = &bs_le_tag; flags = (pci_hi & OFW_PCI_PHYS_HI_PREFETCHABLE) ? BUS_SPACE_MAP_PREFETCHABLE: 0; } if (sz != NULL) *sz = size; return (bus_space_map(*tag, addr, size, flags, handle)); } Index: head/sys/powerpc/ofw/ofw_pcibus.c =================================================================== --- head/sys/powerpc/ofw/ofw_pcibus.c (revision 346173) +++ head/sys/powerpc/ofw/ofw_pcibus.c (revision 346174) @@ -1,384 +1,460 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997, Stefan Esser * Copyright (c) 2000, Michael Smith * Copyright (c) 2000, BSDi * Copyright (c) 2003, Thomas Moestl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include "ofw_pcibus.h" #include "pcib_if.h" #include "pci_if.h" typedef uint32_t ofw_pci_intr_t; /* Methods */ static device_probe_t ofw_pcibus_probe; static device_attach_t ofw_pcibus_attach; static pci_alloc_devinfo_t ofw_pcibus_alloc_devinfo; static pci_assign_interrupt_t ofw_pcibus_assign_interrupt; static ofw_bus_get_devinfo_t ofw_pcibus_get_devinfo; static bus_child_deleted_t ofw_pcibus_child_deleted; static int ofw_pcibus_child_pnpinfo_str_method(device_t cbdev, device_t child, char *buf, size_t buflen); static void ofw_pcibus_enum_devtree(device_t dev, u_int domain, u_int busno); static void ofw_pcibus_enum_bus(device_t dev, u_int domain, u_int busno); static device_method_t ofw_pcibus_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ofw_pcibus_probe), DEVMETHOD(device_attach, ofw_pcibus_attach), /* Bus interface */ DEVMETHOD(bus_child_deleted, ofw_pcibus_child_deleted), DEVMETHOD(bus_child_pnpinfo_str, ofw_pcibus_child_pnpinfo_str_method), DEVMETHOD(bus_rescan, bus_null_rescan), + DEVMETHOD(bus_get_cpus, ofw_pcibus_get_cpus), + DEVMETHOD(bus_get_domain, ofw_pcibus_get_domain), /* PCI interface */ DEVMETHOD(pci_alloc_devinfo, ofw_pcibus_alloc_devinfo), DEVMETHOD(pci_assign_interrupt, ofw_pcibus_assign_interrupt), /* ofw_bus interface */ DEVMETHOD(ofw_bus_get_devinfo, ofw_pcibus_get_devinfo), DEVMETHOD(ofw_bus_get_compat, ofw_bus_gen_get_compat), DEVMETHOD(ofw_bus_get_model, ofw_bus_gen_get_model), DEVMETHOD(ofw_bus_get_name, ofw_bus_gen_get_name), DEVMETHOD(ofw_bus_get_node, ofw_bus_gen_get_node), DEVMETHOD(ofw_bus_get_type, ofw_bus_gen_get_type), DEVMETHOD_END }; static devclass_t pci_devclass; DEFINE_CLASS_1(pci, ofw_pcibus_driver, ofw_pcibus_methods, sizeof(struct pci_softc), pci_driver); EARLY_DRIVER_MODULE(ofw_pcibus, pcib, ofw_pcibus_driver, pci_devclass, 0, 0, BUS_PASS_BUS); MODULE_VERSION(ofw_pcibus, 1); MODULE_DEPEND(ofw_pcibus, pci, 1, 1, 1); static int ofw_devices_only = 0; TUNABLE_INT("hw.pci.ofw_devices_only", &ofw_devices_only); static int ofw_pcibus_probe(device_t dev) { if (ofw_bus_get_node(dev) == -1) return (ENXIO); device_set_desc(dev, "OFW PCI bus"); return (BUS_PROBE_DEFAULT); } static int ofw_pcibus_attach(device_t dev) { u_int busno, domain; int error; error = pci_attach_common(dev); if (error) return 
(error); domain = pcib_get_domain(dev); busno = pcib_get_bus(dev); /* * Attach those children represented in the device tree. */ ofw_pcibus_enum_devtree(dev, domain, busno); /* * We now attach any laggard devices. FDT, for instance, allows * the device tree to enumerate only some PCI devices. Apple's * OF device tree on some Grackle-based hardware can also miss * functions on multi-function cards. */ if (!ofw_devices_only) ofw_pcibus_enum_bus(dev, domain, busno); return (bus_generic_attach(dev)); } struct pci_devinfo * ofw_pcibus_alloc_devinfo(device_t dev) { struct ofw_pcibus_devinfo *dinfo; dinfo = malloc(sizeof(*dinfo), M_DEVBUF, M_WAITOK | M_ZERO); return (&dinfo->opd_dinfo); } static void ofw_pcibus_enum_devtree(device_t dev, u_int domain, u_int busno) { device_t pcib; struct ofw_pci_register pcir; struct ofw_pcibus_devinfo *dinfo; phandle_t node, child; u_int func, slot; int intline; pcib = device_get_parent(dev); node = ofw_bus_get_node(dev); for (child = OF_child(node); child != 0; child = OF_peer(child)) { if (OF_getencprop(child, "reg", (pcell_t *)&pcir, sizeof(pcir)) == -1) continue; slot = OFW_PCI_PHYS_HI_DEVICE(pcir.phys_hi); func = OFW_PCI_PHYS_HI_FUNCTION(pcir.phys_hi); /* Some OFW device trees contain dupes. */ if (pci_find_dbsf(domain, busno, slot, func) != NULL) continue; /* * The preset in the intline register is usually bogus. Reset * it such that the PCI code will reroute the interrupt if * needed. */ intline = PCI_INVALID_IRQ; if (OF_getproplen(child, "interrupts") > 0) intline = 0; PCIB_WRITE_CONFIG(pcib, busno, slot, func, PCIR_INTLINE, intline, 1); /* * Now set up the PCI and OFW bus layer devinfo and add it * to the PCI bus. */ dinfo = (struct ofw_pcibus_devinfo *)pci_read_device(pcib, dev, domain, busno, slot, func); if (dinfo == NULL) continue; if (ofw_bus_gen_setup_devinfo(&dinfo->opd_obdinfo, child) != 0) { pci_freecfg((struct pci_devinfo *)dinfo); continue; } dinfo->opd_dma_tag = NULL; pci_add_child(dev, (struct pci_devinfo *)dinfo); /* * Some devices don't have an intpin set, but do have * interrupts. These are fully specified, and set in the * interrupts property, so add that value to the device's * resource list. */ if (dinfo->opd_dinfo.cfg.intpin == 0) ofw_bus_intr_to_rl(dev, child, &dinfo->opd_dinfo.resources, NULL); } } /* * The following is an almost exact clone of pci_add_children(), with the * addition that it (a) will not add children that have already been added, * and (b) will set up the OFW devinfo to point to invalid values. This is * to handle non-enumerated PCI children as exist in FDT and on the second * function of the Rage 128 in my Blue & White G3. 
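 *
 * Children added by this path carry obd_node = -1 and NULL
 * name/compat/type/model strings; an ofw_bus_get_node() result of -1
 * is how later consumers (e.g. ofw_pcibus_assign_interrupt() below)
 * recognize a device without a firmware node and fall back to
 * standard PCI behavior.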
*/ static void ofw_pcibus_enum_bus(device_t dev, u_int domain, u_int busno) { device_t pcib; struct ofw_pcibus_devinfo *dinfo; int maxslots; int s, f, pcifunchigh; uint8_t hdrtype; pcib = device_get_parent(dev); maxslots = PCIB_MAXSLOTS(pcib); for (s = 0; s <= maxslots; s++) { pcifunchigh = 0; f = 0; DELAY(1); hdrtype = PCIB_READ_CONFIG(pcib, busno, s, f, PCIR_HDRTYPE, 1); if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE) continue; if (hdrtype & PCIM_MFDEV) pcifunchigh = PCI_FUNCMAX; for (f = 0; f <= pcifunchigh; f++) { /* Filter devices we have already added */ if (pci_find_dbsf(domain, busno, s, f) != NULL) continue; dinfo = (struct ofw_pcibus_devinfo *)pci_read_device( pcib, dev, domain, busno, s, f); if (dinfo == NULL) continue; dinfo->opd_dma_tag = NULL; dinfo->opd_obdinfo.obd_node = -1; dinfo->opd_obdinfo.obd_name = NULL; dinfo->opd_obdinfo.obd_compat = NULL; dinfo->opd_obdinfo.obd_type = NULL; dinfo->opd_obdinfo.obd_model = NULL; /* * For non OFW-devices, don't believe 0 * for an interrupt. */ if (dinfo->opd_dinfo.cfg.intline == 0) { dinfo->opd_dinfo.cfg.intline = PCI_INVALID_IRQ; PCIB_WRITE_CONFIG(pcib, busno, s, f, PCIR_INTLINE, PCI_INVALID_IRQ, 1); } pci_add_child(dev, (struct pci_devinfo *)dinfo); } } } static void ofw_pcibus_child_deleted(device_t dev, device_t child) { struct ofw_pcibus_devinfo *dinfo; dinfo = device_get_ivars(dev); ofw_bus_gen_destroy_devinfo(&dinfo->opd_obdinfo); pci_child_deleted(dev, child); } static int ofw_pcibus_child_pnpinfo_str_method(device_t cbdev, device_t child, char *buf, size_t buflen) { pci_child_pnpinfo_str_method(cbdev, child, buf, buflen); if (ofw_bus_get_node(child) != -1) { strlcat(buf, " ", buflen); /* Separate info */ ofw_bus_gen_child_pnpinfo_str(cbdev, child, buf, buflen); } return (0); } static int ofw_pcibus_assign_interrupt(device_t dev, device_t child) { ofw_pci_intr_t intr[2]; phandle_t node, iparent; int isz, icells; node = ofw_bus_get_node(child); if (node == -1) { /* Non-firmware enumerated child, use standard routing */ intr[0] = pci_get_intpin(child); return (PCIB_ROUTE_INTERRUPT(device_get_parent(dev), child, intr[0])); } /* * Try to determine the node's interrupt parent so we know which * PIC to use. */ iparent = -1; if (OF_getencprop(node, "interrupt-parent", &iparent, sizeof(iparent)) < 0) iparent = -1; icells = 1; if (iparent != -1) OF_getencprop(OF_node_from_xref(iparent), "#interrupt-cells", &icells, sizeof(icells)); /* * Any AAPL,interrupts property gets priority and is * fully specified (i.e. does not need routing) */ isz = OF_getencprop(node, "AAPL,interrupts", intr, sizeof(intr)); if (isz == sizeof(intr[0])*icells) return ((iparent == -1) ? intr[0] : ofw_bus_map_intr(dev, iparent, icells, intr)); isz = OF_getencprop(node, "interrupts", intr, sizeof(intr)); if (isz == sizeof(intr[0])*icells) { if (iparent != -1) intr[0] = ofw_bus_map_intr(dev, iparent, icells, intr); } else { /* No property: our best guess is the intpin. */ intr[0] = pci_get_intpin(child); } /* * If we got intr from a property, it may or may not be an intpin. * For on-board devices, it frequently is not, and is completely out * of the valid intpin range. For PCI slots, it hopefully is, * otherwise we will have trouble interfacing with non-OFW buses * such as cardbus. * Since we cannot tell which it is without violating layering, we * will always use the route_interrupt method, and treat exceptions * on the level they become apparent. 
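 *
 * Worked example (cell values illustrative only): a child whose
 * interrupt parent has "#interrupt-cells" = <2> and which carries
 * "interrupts" = <17 1> gives isz == 2 * sizeof(cell_t), so the
 * whole intr[] = { 17, 1 } specifier is handed to ofw_bus_map_intr()
 * and the resulting IRQ is then fed through PCIB_ROUTE_INTERRUPT()
 * exactly like a plain intpin would be.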
*/ return (PCIB_ROUTE_INTERRUPT(device_get_parent(dev), child, intr[0])); } static const struct ofw_bus_devinfo * ofw_pcibus_get_devinfo(device_t bus, device_t dev) { struct ofw_pcibus_devinfo *dinfo; dinfo = device_get_ivars(dev); return (&dinfo->opd_obdinfo); } +static int +ofw_pcibus_parse_associativity(device_t dev, int *domain) +{ + phandle_t node; + cell_t associativity[5]; + int res; + + if ((node = ofw_bus_get_node(dev)) == -1) { + device_printf(dev, "no ofw node found\n"); + return (ENXIO); + } + res = OF_getproplen(node, "ibm,associativity"); + if (res <= 0) + return (ENXIO); + OF_getencprop(node, "ibm,associativity", + associativity, res); + + *domain = associativity[3] - 1; + if (bootverbose) + device_printf(dev, "domain(%d)\n", *domain); + return (0); +} + +int +ofw_pcibus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, + cpuset_t *cpuset) +{ + int d, error; + + error = ofw_pcibus_parse_associativity(child, &d); + if (error) + return (bus_generic_get_cpus(dev, child, op, setsize, cpuset)); + + switch (op) { + case LOCAL_CPUS: + if (setsize != sizeof(cpuset_t)) + return (EINVAL); + *cpuset = cpuset_domain[d]; + return (0); + case INTR_CPUS: + error = bus_generic_get_cpus(dev, child, op, setsize, cpuset); + if (error != 0) + return (error); + if (setsize != sizeof(cpuset_t)) + return (EINVAL); + CPU_AND(cpuset, &cpuset_domain[d]); + return (0); + default: + return (bus_generic_get_cpus(dev, child, op, setsize, cpuset)); + } + return (0); +} + +/* + * Fetch the NUMA domain for the given device 'dev'. + * + * If a device has a _PXM method, map that to a NUMA domain. + * Otherwise, pass the request up to the parent. + * If there's no matching domain or the domain cannot be + * determined, return ENOENT. + */ +int +ofw_pcibus_get_domain(device_t dev, device_t child, int *domain) +{ + int d, error; + + error = ofw_pcibus_parse_associativity(child, &d); + /* No ofw node; go up a level */ + if (error) + return (bus_generic_get_domain(dev, child, domain)); + *domain = d; + return (0); +} Index: head/sys/powerpc/powernv/opal_pci.c =================================================================== --- head/sys/powerpc/powernv/opal_pci.c (revision 346173) +++ head/sys/powerpc/powernv/opal_pci.c (revision 346174) @@ -1,703 +1,705 @@ /*- * Copyright (c) 2015-2016 Nathan Whitehorn * Copyright (c) 2017-2018 Semihalf * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pcib_if.h" #include "pic_if.h" #include "iommu_if.h" #include "opal.h" #define OPAL_PCI_TCE_MAX_ENTRIES (1024*1024UL) #define OPAL_PCI_TCE_DEFAULT_SEG_SIZE (16*1024*1024UL) #define OPAL_PCI_TCE_R (1UL << 0) #define OPAL_PCI_TCE_W (1UL << 1) #define PHB3_TCE_KILL_INVAL_ALL (1UL << 63) /* * Device interface. */ static int opalpci_probe(device_t); static int opalpci_attach(device_t); /* * pcib interface. */ static uint32_t opalpci_read_config(device_t, u_int, u_int, u_int, u_int, int); static void opalpci_write_config(device_t, u_int, u_int, u_int, u_int, u_int32_t, int); static int opalpci_alloc_msi(device_t dev, device_t child, int count, int maxcount, int *irqs); static int opalpci_release_msi(device_t dev, device_t child, int count, int *irqs); static int opalpci_alloc_msix(device_t dev, device_t child, int *irq); static int opalpci_release_msix(device_t dev, device_t child, int irq); static int opalpci_map_msi(device_t dev, device_t child, int irq, uint64_t *addr, uint32_t *data); static int opalpci_route_interrupt(device_t bus, device_t dev, int pin); /* * MSI PIC interface. */ static void opalpic_pic_enable(device_t dev, u_int irq, u_int vector, void **); static void opalpic_pic_eoi(device_t dev, u_int irq, void *); /* Bus interface */ static bus_dma_tag_t opalpci_get_dma_tag(device_t dev, device_t child); /* * Commands */ #define OPAL_M32_WINDOW_TYPE 1 #define OPAL_M64_WINDOW_TYPE 2 #define OPAL_IO_WINDOW_TYPE 3 #define OPAL_RESET_PHB_COMPLETE 1 #define OPAL_RESET_PCI_IODA_TABLE 6 #define OPAL_DISABLE_M64 0 #define OPAL_ENABLE_M64_SPLIT 1 #define OPAL_ENABLE_M64_NON_SPLIT 2 #define OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO 1 #define OPAL_EEH_ACTION_CLEAR_FREEZE_DMA 2 #define OPAL_EEH_ACTION_CLEAR_FREEZE_ALL 3 /* * Constants */ #define OPAL_PCI_DEFAULT_PE 1 #define OPAL_PCI_BUS_SPACE_LOWADDR_32BIT 0x7FFFFFFFUL /* * Driver methods. 
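 *
 * NUMA note: bus_get_cpus() and bus_get_domain() are delegated to
 * the generic ofw_pcibus accessors, which derive the domain from the
 * "ibm,associativity" property of the child's OFW node (assuming
 * here that PowerNV firmware supplies that property under the PHB;
 * when it is absent, the accessors fall back to the bus_generic_*()
 * versions).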
*/ static device_method_t opalpci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, opalpci_probe), DEVMETHOD(device_attach, opalpci_attach), /* pcib interface */ DEVMETHOD(pcib_read_config, opalpci_read_config), DEVMETHOD(pcib_write_config, opalpci_write_config), DEVMETHOD(pcib_alloc_msi, opalpci_alloc_msi), DEVMETHOD(pcib_release_msi, opalpci_release_msi), DEVMETHOD(pcib_alloc_msix, opalpci_alloc_msix), DEVMETHOD(pcib_release_msix, opalpci_release_msix), DEVMETHOD(pcib_map_msi, opalpci_map_msi), DEVMETHOD(pcib_route_interrupt, opalpci_route_interrupt), /* PIC interface for MSIs */ DEVMETHOD(pic_enable, opalpic_pic_enable), DEVMETHOD(pic_eoi, opalpic_pic_eoi), /* Bus interface */ DEVMETHOD(bus_get_dma_tag, opalpci_get_dma_tag), + DEVMETHOD(bus_get_cpus, ofw_pcibus_get_cpus), + DEVMETHOD(bus_get_domain, ofw_pcibus_get_domain), DEVMETHOD_END }; struct opalpci_softc { struct ofw_pci_softc ofw_sc; uint64_t phb_id; vmem_t *msi_vmem; int msi_base; /* Base XIVE number */ int base_msi_irq; /* Base IRQ assigned by FreeBSD to this PIC */ uint64_t *tce; /* TCE table for 1:1 mapping */ struct resource *r_reg; }; static devclass_t opalpci_devclass; DEFINE_CLASS_1(pcib, opalpci_driver, opalpci_methods, sizeof(struct opalpci_softc), ofw_pci_driver); EARLY_DRIVER_MODULE(opalpci, ofwbus, opalpci_driver, opalpci_devclass, 0, 0, BUS_PASS_BUS); static int opalpci_probe(device_t dev) { const char *type; if (opal_check() != 0) return (ENXIO); type = ofw_bus_get_type(dev); if (type == NULL || (strcmp(type, "pci") != 0 && strcmp(type, "pciex") != 0)) return (ENXIO); if (!OF_hasprop(ofw_bus_get_node(dev), "ibm,opal-phbid")) return (ENXIO); device_set_desc(dev, "OPAL Host-PCI bridge"); return (BUS_PROBE_GENERIC); } static void pci_phb3_tce_invalidate_entire(struct opalpci_softc *sc) { mb(); bus_write_8(sc->r_reg, 0x210, PHB3_TCE_KILL_INVAL_ALL); mb(); } /* Simple function to round to a power of 2 */ static uint64_t round_pow2(uint64_t val) { return (1 << (flsl(val + (val - 1)) - 1)); } /* * Starting with skiboot 5.10 PCIe nodes have a new property, * "ibm,supported-tce-sizes", to denote the TCE sizes available. This allows us * to avoid hard-coding the maximum TCE size allowed, and instead provide a sane * default (however, the "sane" default, which works for all targets, is 64k, * limiting us to 64GB if we have 1M entries).
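 *
 * Worked out: 1M entries * 64kB/entry = 2^20 * 2^16 = 2^36 bytes =
 * 64GB, while the 16MB OPAL_PCI_TCE_DEFAULT_SEG_SIZE fallback used
 * by max_tce_size() below raises the same 1M-entry ceiling to 16TB.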
*/ static uint64_t max_tce_size(device_t dev) { phandle_t node; cell_t sizes[64]; /* Property is a list of bit-widths, up to 64-bits */ int count; node = ofw_bus_get_node(dev); count = OF_getencprop(node, "ibm,supported-tce-sizes", sizes, sizeof(sizes)); if (count < (int) sizeof(cell_t)) return OPAL_PCI_TCE_DEFAULT_SEG_SIZE; count /= sizeof(cell_t); return (1ULL << sizes[count - 1]); } static int opalpci_attach(device_t dev) { struct opalpci_softc *sc; cell_t id[2], m64ranges[2], m64window[6], npe; phandle_t node; int i, err; uint64_t maxmem; uint64_t entries; uint64_t tce_size; uint64_t tce_tbl_size; int m64bar; int rid; sc = device_get_softc(dev); node = ofw_bus_get_node(dev); switch (OF_getproplen(node, "ibm,opal-phbid")) { case 8: OF_getencprop(node, "ibm,opal-phbid", id, 8); sc->phb_id = ((uint64_t)id[0] << 32) | id[1]; break; case 4: OF_getencprop(node, "ibm,opal-phbid", id, 4); sc->phb_id = id[0]; break; default: device_printf(dev, "PHB ID property had wrong length (%zd)\n", OF_getproplen(node, "ibm,opal-phbid")); return (ENXIO); } if (bootverbose) device_printf(dev, "OPAL ID %#lx\n", sc->phb_id); rid = 0; sc->r_reg = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE | RF_SHAREABLE); if (sc->r_reg == NULL) { device_printf(dev, "Failed to allocate PHB[%jd] registers\n", (uintmax_t)sc->phb_id); return (ENXIO); } #if 0 /* * Reset PCI IODA table */ err = opal_call(OPAL_PCI_RESET, sc->phb_id, OPAL_RESET_PCI_IODA_TABLE, 1); if (err != 0) { device_printf(dev, "IODA table reset failed: %d\n", err); return (ENXIO); } err = opal_call(OPAL_PCI_RESET, sc->phb_id, OPAL_RESET_PHB_COMPLETE, 1); if (err < 0) { device_printf(dev, "PHB reset failed: %d\n", err); return (ENXIO); } if (err > 0) { while ((err = opal_call(OPAL_PCI_POLL, sc->phb_id)) > 0) { DELAY(1000*(err + 1)); /* Returns expected delay in ms */ } } if (err < 0) { device_printf(dev, "WARNING: PHB IODA reset poll failed: %d\n", err); } err = opal_call(OPAL_PCI_RESET, sc->phb_id, OPAL_RESET_PHB_COMPLETE, 0); if (err < 0) { device_printf(dev, "PHB reset failed: %d\n", err); return (ENXIO); } if (err > 0) { while ((err = opal_call(OPAL_PCI_POLL, sc->phb_id)) > 0) { DELAY(1000*(err + 1)); /* Returns expected delay in ms */ } } #endif /* * Map all devices on the bus to partitionable endpoint one until * such time as we start wanting to do things like bhyve. */ err = opal_call(OPAL_PCI_SET_PE, sc->phb_id, OPAL_PCI_DEFAULT_PE, 0, OPAL_PCI_BUS_ANY, OPAL_IGNORE_RID_DEVICE_NUMBER, OPAL_IGNORE_RID_FUNC_NUMBER, OPAL_MAP_PE); if (err != 0) { device_printf(dev, "PE mapping failed: %d\n", err); return (ENXIO); } /* * Turn on MMIO, mapped to PE 1 */ if (OF_getencprop(node, "ibm,opal-num-pes", &npe, 4) != 4) npe = 1; for (i = 0; i < npe; i++) { err = opal_call(OPAL_PCI_MAP_PE_MMIO_WINDOW, sc->phb_id, OPAL_PCI_DEFAULT_PE, OPAL_M32_WINDOW_TYPE, 0, i); if (err != 0) device_printf(dev, "MMIO %d map failed: %d\n", i, err); } if (OF_getencprop(node, "ibm,opal-available-m64-ranges", m64ranges, sizeof(m64ranges)) == sizeof(m64ranges)) m64bar = m64ranges[0]; else m64bar = 0; /* XXX: multiple M64 windows? 
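 * Only the first range advertised in "ibm,opal-available-m64-ranges"
 * (or BAR 0 if the property is missing) is programmed below;
 * supporting several ranges would take one PHB_MMIO_ENABLE /
 * SET_PHB_MEM_WINDOW / MAP_PE_MMIO_WINDOW sequence per M64 BAR.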
*/ if (OF_getencprop(node, "ibm,opal-m64-window", m64window, sizeof(m64window)) == sizeof(m64window)) { opal_call(OPAL_PCI_PHB_MMIO_ENABLE, sc->phb_id, OPAL_M64_WINDOW_TYPE, m64bar, 0); opal_call(OPAL_PCI_SET_PHB_MEM_WINDOW, sc->phb_id, OPAL_M64_WINDOW_TYPE, m64bar /* index */, ((uint64_t)m64window[2] << 32) | m64window[3], 0, ((uint64_t)m64window[4] << 32) | m64window[5]); opal_call(OPAL_PCI_MAP_PE_MMIO_WINDOW, sc->phb_id, OPAL_PCI_DEFAULT_PE, OPAL_M64_WINDOW_TYPE, m64bar /* index */, 0); opal_call(OPAL_PCI_PHB_MMIO_ENABLE, sc->phb_id, OPAL_M64_WINDOW_TYPE, m64bar, OPAL_ENABLE_M64_NON_SPLIT); } /* * Enable IOMMU for PE1 - map everything 1:1 using * segments of max_tce_size size */ tce_size = max_tce_size(dev); maxmem = roundup2(powerpc_ptob(Maxmem), tce_size); entries = round_pow2(maxmem / tce_size); - tce_tbl_size = max(entries * sizeof(uint64_t), 4096); + tce_tbl_size = MAX(entries * sizeof(uint64_t), 4096); if (entries > OPAL_PCI_TCE_MAX_ENTRIES) panic("POWERNV supports only %jdGB of memory space\n", (uintmax_t)((OPAL_PCI_TCE_MAX_ENTRIES * tce_size) >> 30)); if (bootverbose) device_printf(dev, "Mapping 0-%#jx for DMA\n", (uintmax_t)maxmem); sc->tce = contigmalloc(tce_tbl_size, M_DEVBUF, M_NOWAIT | M_ZERO, 0, BUS_SPACE_MAXADDR, tce_tbl_size, 0); if (sc->tce == NULL) panic("Failed to allocate TCE memory for PHB %jd\n", (uintmax_t)sc->phb_id); for (i = 0; i < entries; i++) sc->tce[i] = (i * tce_size) | OPAL_PCI_TCE_R | OPAL_PCI_TCE_W; /* Map TCE for every PE. It seems necessary for Power8 */ for (i = 0; i < npe; i++) { err = opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW, sc->phb_id, i, (i << 1), 1, pmap_kextract((uint64_t)&sc->tce[0]), tce_tbl_size, tce_size); if (err != 0) { device_printf(dev, "DMA IOMMU mapping failed: %d\n", err); return (ENXIO); } err = opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW_REAL, sc->phb_id, i, (i << 1) + 1, (1UL << 59), maxmem); if (err != 0) { device_printf(dev, "DMA 64b bypass mapping failed: %d\n", err); return (ENXIO); } } /* * Invalidate all previous TCE entries. * * TODO: add support for other PHBs than PHB3 */ pci_phb3_tce_invalidate_entire(sc); /* * Get MSI properties */ sc->msi_vmem = NULL; if (OF_getproplen(node, "ibm,opal-msi-ranges") > 0) { cell_t msi_ranges[2]; OF_getencprop(node, "ibm,opal-msi-ranges", msi_ranges, sizeof(msi_ranges)); sc->msi_base = msi_ranges[0]; sc->msi_vmem = vmem_create("OPAL MSI", msi_ranges[0], msi_ranges[1], 1, 16, M_BESTFIT | M_WAITOK); sc->base_msi_irq = powerpc_register_pic(dev, OF_xref_from_node(node), msi_ranges[0] + msi_ranges[1], 0, FALSE); if (bootverbose) device_printf(dev, "Supports %d MSIs starting at %d\n", msi_ranges[1], msi_ranges[0]); } /* Create the parent DMA tag */ /* * Constrain it to POWER8 PHB (ioda2) for now. It seems to mess up on * POWER9 systems. */ if (ofw_bus_is_compatible(dev, "ibm,ioda2-phb")) { err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ OPAL_PCI_BUS_SPACE_LOWADDR_32BIT, /* lowaddr */ BUS_SPACE_MAXADDR_32BIT, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE, /* maxsize */ BUS_SPACE_UNRESTRICTED, /* nsegments */ BUS_SPACE_MAXSIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->ofw_sc.sc_dmat); if (err != 0) { device_printf(dev, "Failed to create DMA tag\n"); return (err); } } /* * General OFW PCI attach */ err = ofw_pci_init(dev); if (err != 0) return (err); /* * Unfreeze non-config-space PCI operations. Let this fail silently * if e.g. there is no current freeze. 
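 * (Config cycles aimed at absent devices can freeze the PE again at
 * any time, which is why this same EEH_FREEZE_CLEAR call is repeated
 * after every access in opalpci_read_config() and
 * opalpci_write_config() below.)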
opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, sc->phb_id, OPAL_PCI_DEFAULT_PE, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); /* * OPAL stores 64-bit BARs in a special property rather than "ranges" */ if (OF_getencprop(node, "ibm,opal-m64-window", m64window, sizeof(m64window)) == sizeof(m64window)) { struct ofw_pci_range *rp; sc->ofw_sc.sc_nrange++; sc->ofw_sc.sc_range = realloc(sc->ofw_sc.sc_range, sc->ofw_sc.sc_nrange * sizeof(sc->ofw_sc.sc_range[0]), M_DEVBUF, M_WAITOK); rp = &sc->ofw_sc.sc_range[sc->ofw_sc.sc_nrange-1]; rp->pci_hi = OFW_PCI_PHYS_HI_SPACE_MEM64 | OFW_PCI_PHYS_HI_PREFETCHABLE; rp->pci = ((uint64_t)m64window[0] << 32) | m64window[1]; rp->host = ((uint64_t)m64window[2] << 32) | m64window[3]; rp->size = ((uint64_t)m64window[4] << 32) | m64window[5]; rman_manage_region(&sc->ofw_sc.sc_mem_rman, rp->pci, rp->pci + rp->size - 1); } return (ofw_pci_attach(dev)); } static uint32_t opalpci_read_config(device_t dev, u_int bus, u_int slot, u_int func, u_int reg, int width) { struct opalpci_softc *sc; uint64_t config_addr; uint8_t byte; uint16_t half; uint32_t word; int error; sc = device_get_softc(dev); config_addr = (bus << 8) | ((slot & 0x1f) << 3) | (func & 0x7); switch (width) { case 1: error = opal_call(OPAL_PCI_CONFIG_READ_BYTE, sc->phb_id, config_addr, reg, vtophys(&byte)); word = byte; break; case 2: error = opal_call(OPAL_PCI_CONFIG_READ_HALF_WORD, sc->phb_id, config_addr, reg, vtophys(&half)); word = half; break; case 4: error = opal_call(OPAL_PCI_CONFIG_READ_WORD, sc->phb_id, config_addr, reg, vtophys(&word)); break; default: error = OPAL_SUCCESS; word = 0xffffffff; } /* * Poking config state for non-existent devices can make * the host bridge hang up. Clear any errors. * * XXX: Make this conditional on the existence of a freeze */ opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, sc->phb_id, OPAL_PCI_DEFAULT_PE, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); if (error != OPAL_SUCCESS) word = 0xffffffff; return (word); } static void opalpci_write_config(device_t dev, u_int bus, u_int slot, u_int func, u_int reg, uint32_t val, int width) { struct opalpci_softc *sc; uint64_t config_addr; int error = OPAL_SUCCESS; sc = device_get_softc(dev); config_addr = (bus << 8) | ((slot & 0x1f) << 3) | (func & 0x7); switch (width) { case 1: error = opal_call(OPAL_PCI_CONFIG_WRITE_BYTE, sc->phb_id, config_addr, reg, val); break; case 2: error = opal_call(OPAL_PCI_CONFIG_WRITE_HALF_WORD, sc->phb_id, config_addr, reg, val); break; case 4: error = opal_call(OPAL_PCI_CONFIG_WRITE_WORD, sc->phb_id, config_addr, reg, val); break; } if (error != OPAL_SUCCESS) { /* * Poking config state for non-existent devices can make * the host bridge hang up. Clear any errors.
*/ opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, sc->phb_id, OPAL_PCI_DEFAULT_PE, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); } } static int opalpci_route_interrupt(device_t bus, device_t dev, int pin) { return (pin); } static int opalpci_alloc_msi(device_t dev, device_t child, int count, int maxcount, int *irqs) { struct opalpci_softc *sc; vmem_addr_t start; phandle_t xref; int err, i; sc = device_get_softc(dev); if (sc->msi_vmem == NULL) return (ENODEV); err = vmem_xalloc(sc->msi_vmem, count, powerof2(count), 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_BESTFIT | M_WAITOK, &start); if (err) return (err); xref = OF_xref_from_node(ofw_bus_get_node(dev)); for (i = 0; i < count; i++) irqs[i] = MAP_IRQ(xref, start + i); return (0); } static int opalpci_release_msi(device_t dev, device_t child, int count, int *irqs) { struct opalpci_softc *sc; sc = device_get_softc(dev); if (sc->msi_vmem == NULL) return (ENODEV); vmem_xfree(sc->msi_vmem, irqs[0] - sc->base_msi_irq, count); return (0); } static int opalpci_alloc_msix(device_t dev, device_t child, int *irq) { return (opalpci_alloc_msi(dev, child, 1, 1, irq)); } static int opalpci_release_msix(device_t dev, device_t child, int irq) { return (opalpci_release_msi(dev, child, 1, &irq)); } static int opalpci_map_msi(device_t dev, device_t child, int irq, uint64_t *addr, uint32_t *data) { struct opalpci_softc *sc; struct pci_devinfo *dinfo; int err, xive; sc = device_get_softc(dev); if (sc->msi_vmem == NULL) return (ENODEV); xive = irq - sc->base_msi_irq - sc->msi_base; opal_call(OPAL_PCI_SET_XIVE_PE, sc->phb_id, OPAL_PCI_DEFAULT_PE, xive); dinfo = device_get_ivars(child); if (dinfo->cfg.msi.msi_alloc > 0 && (dinfo->cfg.msi.msi_ctrl & PCIM_MSICTRL_64BIT) == 0) { uint32_t msi32; err = opal_call(OPAL_GET_MSI_32, sc->phb_id, OPAL_PCI_DEFAULT_PE, xive, 1, vtophys(&msi32), vtophys(data)); *addr = be32toh(msi32); } else { err = opal_call(OPAL_GET_MSI_64, sc->phb_id, OPAL_PCI_DEFAULT_PE, xive, 1, vtophys(addr), vtophys(data)); *addr = be64toh(*addr); } *data = be32toh(*data); if (bootverbose && err != 0) device_printf(child, "OPAL MSI mapping error: %d\n", err); return ((err == 0) ? 0 : ENXIO); } static void opalpic_pic_enable(device_t dev, u_int irq, u_int vector, void **priv) { struct opalpci_softc *sc = device_get_softc(dev); PIC_ENABLE(root_pic, irq, vector, priv); opal_call(OPAL_PCI_MSI_EOI, sc->phb_id, irq, priv); } static void opalpic_pic_eoi(device_t dev, u_int irq, void *priv) { struct opalpci_softc *sc; sc = device_get_softc(dev); opal_call(OPAL_PCI_MSI_EOI, sc->phb_id, irq); PIC_EOI(root_pic, irq, priv); } static bus_dma_tag_t opalpci_get_dma_tag(device_t dev, device_t child) { struct opalpci_softc *sc; sc = device_get_softc(dev); return (sc->ofw_sc.sc_dmat); } Index: head/sys/powerpc/powernv/platform_powernv.c =================================================================== --- head/sys/powerpc/powernv/platform_powernv.c (revision 346173) +++ head/sys/powerpc/powernv/platform_powernv.c (revision 346174) @@ -1,484 +1,496 @@ /*- * Copyright (c) 2015 Nathan Whitehorn * Copyright (c) 2017-2018 Semihalf * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "platform_if.h" #include "opal.h" #ifdef SMP extern void *ap_pcpu; #endif void (*powernv_smp_ap_extra_init)(void); static int powernv_probe(platform_t); static int powernv_attach(platform_t); void powernv_mem_regions(platform_t, struct mem_region *phys, int *physsz, struct mem_region *avail, int *availsz); +static void powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz); static u_long powernv_timebase_freq(platform_t, struct cpuref *cpuref); static int powernv_smp_first_cpu(platform_t, struct cpuref *cpuref); static int powernv_smp_next_cpu(platform_t, struct cpuref *cpuref); static int powernv_smp_get_bsp(platform_t, struct cpuref *cpuref); static void powernv_smp_ap_init(platform_t); #ifdef SMP static int powernv_smp_start_cpu(platform_t, struct pcpu *cpu); static void powernv_smp_probe_threads(platform_t); static struct cpu_group *powernv_smp_topo(platform_t plat); #endif static void powernv_reset(platform_t); static void powernv_cpu_idle(sbintime_t sbt); static int powernv_cpuref_init(void); static platform_method_t powernv_methods[] = { PLATFORMMETHOD(platform_probe, powernv_probe), PLATFORMMETHOD(platform_attach, powernv_attach), PLATFORMMETHOD(platform_mem_regions, powernv_mem_regions), + PLATFORMMETHOD(platform_numa_mem_regions, powernv_numa_mem_regions), PLATFORMMETHOD(platform_timebase_freq, powernv_timebase_freq), PLATFORMMETHOD(platform_smp_ap_init, powernv_smp_ap_init), PLATFORMMETHOD(platform_smp_first_cpu, powernv_smp_first_cpu), PLATFORMMETHOD(platform_smp_next_cpu, powernv_smp_next_cpu), PLATFORMMETHOD(platform_smp_get_bsp, powernv_smp_get_bsp), #ifdef SMP PLATFORMMETHOD(platform_smp_start_cpu, powernv_smp_start_cpu), PLATFORMMETHOD(platform_smp_probe_threads, powernv_smp_probe_threads), PLATFORMMETHOD(platform_smp_topo, powernv_smp_topo), #endif PLATFORMMETHOD(platform_reset, powernv_reset), { 0, 0 } }; static platform_def_t powernv_platform = { "powernv", powernv_methods, 0 }; static struct cpuref platform_cpuref[MAXCPU]; static int platform_cpuref_cnt; static int platform_cpuref_valid; PLATFORM_DEF(powernv_platform); static uint64_t powernv_boot_pir; static int powernv_probe(platform_t plat) { if (opal_check() == 0) return (BUS_PROBE_SPECIFIC); return (ENXIO); } static int powernv_attach(platform_t plat) { uint32_t nptlp, shift = 0, slb_encoding = 0; int32_t lp_size, lp_encoding; char buf[255]; 
pcell_t prop; phandle_t cpu; int res, len, idx; register_t msr; /* Ping OPAL again just to make sure */ opal_check(); #if BYTE_ORDER == LITTLE_ENDIAN opal_call(OPAL_REINIT_CPUS, 2 /* Little endian */); #else opal_call(OPAL_REINIT_CPUS, 1 /* Big endian */); #endif if (cpu_idle_hook == NULL) cpu_idle_hook = powernv_cpu_idle; powernv_boot_pir = mfspr(SPR_PIR); /* LPID must not be altered when PSL_DR or PSL_IR is set */ msr = mfmsr(); mtmsr(msr & ~(PSL_DR | PSL_IR)); /* Direct interrupts to SRR instead of HSRR and reset LPCR otherwise */ mtspr(SPR_LPID, 0); isync(); if (cpu_features2 & PPC_FEATURE2_ARCH_3_00) lpcr |= LPCR_HVICE; mtspr(SPR_LPCR, lpcr); isync(); mtmsr(msr); powernv_cpuref_init(); /* Set SLB count from device tree */ cpu = OF_peer(0); cpu = OF_child(cpu); while (cpu != 0) { res = OF_getprop(cpu, "name", buf, sizeof(buf)); if (res > 0 && strcmp(buf, "cpus") == 0) break; cpu = OF_peer(cpu); } if (cpu == 0) goto out; cpu = OF_child(cpu); while (cpu != 0) { res = OF_getprop(cpu, "device_type", buf, sizeof(buf)); if (res > 0 && strcmp(buf, "cpu") == 0) break; cpu = OF_peer(cpu); } if (cpu == 0) goto out; res = OF_getencprop(cpu, "ibm,slb-size", &prop, sizeof(prop)); if (res > 0) n_slbs = prop; /* * Scan the large page size property for PAPR compatible machines. * See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties' * for the encoding of the property. */ len = OF_getproplen(cpu, "ibm,segment-page-sizes"); if (len > 0) { /* * We have to use a variable length array on the stack * since we have very limited stack space. */ pcell_t arr[len/sizeof(cell_t)]; res = OF_getencprop(cpu, "ibm,segment-page-sizes", arr, sizeof(arr)); len /= 4; idx = 0; while (len > 0) { shift = arr[idx]; slb_encoding = arr[idx + 1]; nptlp = arr[idx + 2]; idx += 3; len -= 3; while (len > 0 && nptlp) { lp_size = arr[idx]; lp_encoding = arr[idx+1]; if (slb_encoding == SLBV_L && lp_encoding == 0) break; idx += 2; len -= 2; nptlp--; } if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0) break; } if (len == 0) panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) " "not supported by this system."); moea64_large_page_shift = shift; moea64_large_page_size = 1ULL << lp_size; } out: return (0); } void powernv_mem_regions(platform_t plat, struct mem_region *phys, int *physsz, struct mem_region *avail, int *availsz) { ofw_mem_regions(phys, physsz, avail, availsz); } +static void +powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz) +{ + + ofw_numa_mem_regions(phys, physsz); +} + static u_long powernv_timebase_freq(platform_t plat, struct cpuref *cpuref) { char buf[8]; phandle_t cpu, dev, root; int res; int32_t ticks = -1; root = OF_peer(0); dev = OF_child(root); while (dev != 0) { res = OF_getprop(dev, "name", buf, sizeof(buf)); if (res > 0 && strcmp(buf, "cpus") == 0) break; dev = OF_peer(dev); } for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) { res = OF_getprop(cpu, "device_type", buf, sizeof(buf)); if (res > 0 && strcmp(buf, "cpu") == 0) break; } if (cpu == 0) return (512000000); OF_getencprop(cpu, "timebase-frequency", &ticks, sizeof(ticks)); if (ticks <= 0) panic("Unable to determine timebase frequency!"); return (ticks); } static int powernv_cpuref_init(void) { phandle_t cpu, dev; char buf[32]; int a, res, tmp_cpuref_cnt; static struct cpuref tmp_cpuref[MAXCPU]; cell_t interrupt_servers[32]; uint64_t bsp; if (platform_cpuref_valid) return (0); dev = OF_peer(0); dev = OF_child(dev); while (dev != 0) { res = OF_getprop(dev, "name", buf, sizeof(buf)); if (res > 0 && 
strcmp(buf, "cpus") == 0) break; dev = OF_peer(dev); } bsp = 0; tmp_cpuref_cnt = 0; for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) { res = OF_getprop(cpu, "device_type", buf, sizeof(buf)); if (res > 0 && strcmp(buf, "cpu") == 0) { res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s"); if (res > 0) { - - OF_getencprop(cpu, "ibm,ppc-interrupt-server#s", interrupt_servers, res); for (a = 0; a < res/sizeof(cell_t); a++) { tmp_cpuref[tmp_cpuref_cnt].cr_hwref = interrupt_servers[a]; tmp_cpuref[tmp_cpuref_cnt].cr_cpuid = tmp_cpuref_cnt; - + tmp_cpuref[tmp_cpuref_cnt].cr_domain = interrupt_servers[a] >> 11; if (interrupt_servers[a] == (uint32_t)powernv_boot_pir) bsp = tmp_cpuref_cnt; tmp_cpuref_cnt++; } } } } /* Map IDs, so BSP has CPUID 0 regardless of hwref */ for (a = bsp; a < tmp_cpuref_cnt; a++) { platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref; platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt; + platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain; platform_cpuref_cnt++; } for (a = 0; a < bsp; a++) { platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref; platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt; + platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain; platform_cpuref_cnt++; } platform_cpuref_valid = 1; return (0); } static int powernv_smp_first_cpu(platform_t plat, struct cpuref *cpuref) { if (platform_cpuref_valid == 0) return (EINVAL); cpuref->cr_cpuid = 0; cpuref->cr_hwref = platform_cpuref[0].cr_hwref; + cpuref->cr_domain = platform_cpuref[0].cr_domain; return (0); } static int powernv_smp_next_cpu(platform_t plat, struct cpuref *cpuref) { int id; if (platform_cpuref_valid == 0) return (EINVAL); id = cpuref->cr_cpuid + 1; if (id >= platform_cpuref_cnt) return (ENOENT); cpuref->cr_cpuid = platform_cpuref[id].cr_cpuid; cpuref->cr_hwref = platform_cpuref[id].cr_hwref; + cpuref->cr_domain = platform_cpuref[id].cr_domain; return (0); } static int powernv_smp_get_bsp(platform_t plat, struct cpuref *cpuref) { cpuref->cr_cpuid = platform_cpuref[0].cr_cpuid; cpuref->cr_hwref = platform_cpuref[0].cr_hwref; + cpuref->cr_domain = platform_cpuref[0].cr_domain; return (0); } #ifdef SMP static int powernv_smp_start_cpu(platform_t plat, struct pcpu *pc) { int result; ap_pcpu = pc; powerpc_sync(); result = opal_call(OPAL_START_CPU, pc->pc_hwref, EXC_RST); if (result != OPAL_SUCCESS) { printf("OPAL error (%d): unable to start AP %d\n", result, (int)pc->pc_hwref); return (ENXIO); } return (0); } static void powernv_smp_probe_threads(platform_t plat) { char buf[8]; phandle_t cpu, dev, root; int res, nthreads; root = OF_peer(0); dev = OF_child(root); while (dev != 0) { res = OF_getprop(dev, "name", buf, sizeof(buf)); if (res > 0 && strcmp(buf, "cpus") == 0) break; dev = OF_peer(dev); } nthreads = 1; for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) { res = OF_getprop(cpu, "device_type", buf, sizeof(buf)); if (res <= 0 || strcmp(buf, "cpu") != 0) continue; res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s"); if (res >= 0) nthreads = res / sizeof(cell_t); else nthreads = 1; break; } smp_threads_per_core = nthreads; if (mp_ncpus % nthreads == 0) mp_ncores = mp_ncpus / nthreads; } static struct cpu_group * powernv_smp_topo(platform_t plat) { if (mp_ncpus % smp_threads_per_core != 0) { printf("WARNING: Irregular SMP topology. 
Performance may be " "suboptimal (%d threads, %d on first core)\n", mp_ncpus, smp_threads_per_core); return (smp_topo_none()); } /* Don't do anything fancier for non-threaded SMP */ if (smp_threads_per_core == 1) return (smp_topo_none()); return (smp_topo_1level(CG_SHARE_L1, smp_threads_per_core, CG_FLAG_SMT)); } #endif static void powernv_reset(platform_t platform) { opal_call(OPAL_CEC_REBOOT); } static void powernv_smp_ap_init(platform_t platform) { if (powernv_smp_ap_extra_init != NULL) powernv_smp_ap_extra_init(); } static void powernv_cpu_idle(sbintime_t sbt) { } Index: head/sys/powerpc/powerpc/intr_machdep.c =================================================================== --- head/sys/powerpc/powerpc/intr_machdep.c (revision 346173) +++ head/sys/powerpc/powerpc/intr_machdep.c (revision 346174) @@ -1,687 +1,696 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 2002 Benno Rice. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)isa.c 7.2 (Berkeley) 5/13/91 * form: src/sys/i386/isa/intr_machdep.c,v 1.57 2001/07/20 * * $FreeBSD$ */ #include "opt_isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pic_if.h" #define MAX_STRAY_LOG 5 static MALLOC_DEFINE(M_INTR, "intr", "interrupt handler data"); struct powerpc_intr { struct intr_event *event; long *cntp; void *priv; /* PIC-private data */ - u_int irq; device_t pic; + u_int irq; u_int intline; u_int vector; u_int cntindex; - cpuset_t cpu; - enum intr_trigger trig; - enum intr_polarity pol; int fwcode; int ipi; + int pi_domain; + enum intr_trigger trig; + enum intr_polarity pol; + cpuset_t pi_cpuset; }; struct pic { device_t dev; uint32_t node; u_int irqs; u_int ipis; int base; }; static u_int intrcnt_index = 0; static struct mtx intr_table_lock; static struct powerpc_intr **powerpc_intrs; static struct pic piclist[MAX_PICS]; static u_int nvectors; /* Allocated vectors */ static u_int npics; /* PICs registered */ #ifdef DEV_ISA static u_int nirqs = 16; /* Allocated IRQS (ISA pre-allocated). */ #else static u_int nirqs = 0; /* Allocated IRQs. */ #endif static u_int stray_count; u_long *intrcnt; char *intrnames; size_t sintrcnt = sizeof(intrcnt); size_t sintrnames = sizeof(intrnames); int nintrcnt; /* * Just to start */ #ifdef __powerpc64__ u_int num_io_irqs = 768; #else u_int num_io_irqs = 256; #endif device_t root_pic; #ifdef SMP static void *ipi_cookie; #endif static void intrcnt_setname(const char *name, int index) { snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s", MAXCOMLEN, name); } static void intr_init(void *dummy __unused) { mtx_init(&intr_table_lock, "intr sources lock", NULL, MTX_DEF); } SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL); static void intr_init_sources(void *arg __unused) { powerpc_intrs = mallocarray(num_io_irqs, sizeof(*powerpc_intrs), M_INTR, M_WAITOK | M_ZERO); nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2; #ifdef COUNT_IPIS if (mp_ncpus > 1) nintrcnt += 8 * mp_ncpus; #endif intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK | M_ZERO); intrnames = mallocarray(nintrcnt, MAXCOMLEN + 1, M_INTR, M_WAITOK | M_ZERO); sintrcnt = nintrcnt * sizeof(u_long); sintrnames = nintrcnt * (MAXCOMLEN + 1); intrcnt_setname("???", 0); intrcnt_index = 1; } /* * This needs to happen before SI_SUB_CPU */ SYSINIT(intr_init_sources, SI_SUB_KLD, SI_ORDER_ANY, intr_init_sources, NULL); #ifdef SMP static void smp_intr_init(void *dummy __unused) { struct powerpc_intr *i; int vector; for (vector = 0; vector < nvectors; vector++) { i = powerpc_intrs[vector]; if (i != NULL && i->event != NULL && i->pic == root_pic) - PIC_BIND(i->pic, i->intline, i->cpu, &i->priv); + PIC_BIND(i->pic, i->intline, i->pi_cpuset, &i->priv); } } SYSINIT(smp_intr_init, SI_SUB_SMP, SI_ORDER_ANY, smp_intr_init, NULL); #endif void intrcnt_add(const char *name, u_long **countp) { 
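	/*
	 * Hand out the next free statistics slot: named counters share
	 * the intrcnt[]/intrnames[] arrays with the per-IRQ counters
	 * that intr_lookup() allocates.
	 */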
int idx; idx = atomic_fetchadd_int(&intrcnt_index, 1); KASSERT(idx < nintrcnt, ("intrcnt_add: Interrupt counter index %d/%d" "reached nintrcnt : %d", intrcnt_index, idx, nintrcnt)); *countp = &intrcnt[idx]; intrcnt_setname(name, idx); } extern void kdb_backtrace(void); static struct powerpc_intr * intr_lookup(u_int irq) { char intrname[16]; struct powerpc_intr *i, *iscan; int vector; mtx_lock(&intr_table_lock); for (vector = 0; vector < nvectors; vector++) { i = powerpc_intrs[vector]; if (i != NULL && i->irq == irq) { mtx_unlock(&intr_table_lock); return (i); } } i = malloc(sizeof(*i), M_INTR, M_NOWAIT); if (i == NULL) { mtx_unlock(&intr_table_lock); return (NULL); } i->event = NULL; i->cntp = NULL; i->priv = NULL; i->trig = INTR_TRIGGER_CONFORM; i->pol = INTR_POLARITY_CONFORM; i->irq = irq; i->pic = NULL; i->vector = -1; i->fwcode = 0; i->ipi = 0; #ifdef SMP - i->cpu = all_cpus; + i->pi_cpuset = all_cpus; #else - CPU_SETOF(0, &i->cpu); + CPU_SETOF(0, &i->pi_cpuset); #endif for (vector = 0; vector < num_io_irqs && vector <= nvectors; vector++) { iscan = powerpc_intrs[vector]; if (iscan != NULL && iscan->irq == irq) break; if (iscan == NULL && i->vector == -1) i->vector = vector; iscan = NULL; } if (iscan == NULL && i->vector != -1) { powerpc_intrs[i->vector] = i; i->cntindex = atomic_fetchadd_int(&intrcnt_index, 1); i->cntp = &intrcnt[i->cntindex]; sprintf(intrname, "irq%u:", i->irq); intrcnt_setname(intrname, i->cntindex); nvectors++; } mtx_unlock(&intr_table_lock); if (iscan != NULL || i->vector == -1) { free(i, M_INTR); i = iscan; } return (i); } static int powerpc_map_irq(struct powerpc_intr *i) { struct pic *p; u_int cnt; int idx; for (idx = 0; idx < npics; idx++) { p = &piclist[idx]; cnt = p->irqs + p->ipis; if (i->irq >= p->base && i->irq < p->base + cnt) break; } if (idx == npics) return (EINVAL); i->intline = i->irq - p->base; i->pic = p->dev; /* Try a best guess if that failed */ if (i->pic == NULL) i->pic = root_pic; return (0); } static void powerpc_intr_eoi(void *arg) { struct powerpc_intr *i = arg; PIC_EOI(i->pic, i->intline, i->priv); } static void powerpc_intr_pre_ithread(void *arg) { struct powerpc_intr *i = arg; PIC_MASK(i->pic, i->intline, i->priv); PIC_EOI(i->pic, i->intline, i->priv); } static void powerpc_intr_post_ithread(void *arg) { struct powerpc_intr *i = arg; PIC_UNMASK(i->pic, i->intline, i->priv); } static int powerpc_assign_intr_cpu(void *arg, int cpu) { #ifdef SMP struct powerpc_intr *i = arg; if (cpu == NOCPU) - i->cpu = all_cpus; + i->pi_cpuset = all_cpus; else - CPU_SETOF(cpu, &i->cpu); + CPU_SETOF(cpu, &i->pi_cpuset); if (!cold && i->pic != NULL && i->pic == root_pic) - PIC_BIND(i->pic, i->intline, i->cpu, &i->priv); + PIC_BIND(i->pic, i->intline, i->pi_cpuset, &i->priv); return (0); #else return (EOPNOTSUPP); #endif } u_int powerpc_register_pic(device_t dev, uint32_t node, u_int irqs, u_int ipis, u_int atpic) { struct pic *p; u_int irq; int idx; mtx_lock(&intr_table_lock); /* XXX see powerpc_get_irq(). */ for (idx = 0; idx < npics; idx++) { p = &piclist[idx]; if (p->node != node) continue; if (node != 0 || p->dev == dev) break; } p = &piclist[idx]; p->dev = dev; p->node = node; p->irqs = irqs; p->ipis = ipis; if (idx == npics) { #ifdef DEV_ISA p->base = (atpic) ? 
0 : nirqs; #else p->base = nirqs; #endif irq = p->base + irqs + ipis; nirqs = MAX(nirqs, irq); npics++; } KASSERT(npics < MAX_PICS, ("Number of PICs exceeds maximum (%d)", MAX_PICS)); mtx_unlock(&intr_table_lock); return (p->base); } u_int powerpc_get_irq(uint32_t node, u_int pin) { int idx; if (node == 0) return (pin); mtx_lock(&intr_table_lock); for (idx = 0; idx < npics; idx++) { if (piclist[idx].node == node) { mtx_unlock(&intr_table_lock); return (piclist[idx].base + pin); } } /* * XXX we should never encounter an unregistered PIC, but that * can only be done when we properly support bus enumeration * using multiple passes. Until then, fake an entry and give it * some adhoc maximum number of IRQs and IPIs. */ piclist[idx].dev = NULL; piclist[idx].node = node; piclist[idx].irqs = 124; piclist[idx].ipis = 4; piclist[idx].base = nirqs; nirqs += (1 << 25); npics++; KASSERT(npics < MAX_PICS, ("Number of PICs exceeds maximum (%d)", MAX_PICS)); mtx_unlock(&intr_table_lock); return (piclist[idx].base + pin); } int powerpc_enable_intr(void) { struct powerpc_intr *i; int error, vector; #ifdef SMP int n; #endif if (npics == 0) panic("no PIC detected\n"); if (root_pic == NULL) root_pic = piclist[0].dev; #ifdef SMP /* Install an IPI handler. */ if (mp_ncpus > 1) { for (n = 0; n < npics; n++) { if (piclist[n].dev != root_pic) continue; KASSERT(piclist[n].ipis != 0, ("%s: SMP root PIC does not supply any IPIs", __func__)); error = powerpc_setup_intr("IPI", MAP_IRQ(piclist[n].node, piclist[n].irqs), powerpc_ipi_handler, NULL, NULL, - INTR_TYPE_MISC | INTR_EXCL, &ipi_cookie); + INTR_TYPE_MISC | INTR_EXCL, &ipi_cookie, + 0 /* domain XXX */); if (error) { printf("unable to setup IPI handler\n"); return (error); } /* * Some subterfuge: disable late EOI and mark this * as an IPI to the dispatch layer. 
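 *
 * Rationale: powerpc_dispatch_intr() EOIs an IPI before running the
 * filter, and IPI messages accumulate as bits in pc_ipimask rather
 * than being queued, so a late EOI could drop an IPI posted while
 * the previous one was still being serviced.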
*/ i = intr_lookup(MAP_IRQ(piclist[n].node, piclist[n].irqs)); i->event->ie_post_filter = NULL; i->ipi = 1; } } #endif for (vector = 0; vector < nvectors; vector++) { i = powerpc_intrs[vector]; if (i == NULL) continue; error = powerpc_map_irq(i); if (error) continue; if (i->trig == INTR_TRIGGER_INVALID) PIC_TRANSLATE_CODE(i->pic, i->intline, i->fwcode, &i->trig, &i->pol); if (i->trig != INTR_TRIGGER_CONFORM || i->pol != INTR_POLARITY_CONFORM) PIC_CONFIG(i->pic, i->intline, i->trig, i->pol); if (i->event != NULL) PIC_ENABLE(i->pic, i->intline, vector, &i->priv); } return (0); } int powerpc_setup_intr(const char *name, u_int irq, driver_filter_t filter, - driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep) + driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep, + int domain) { struct powerpc_intr *i; int error, enable = 0; i = intr_lookup(irq); if (i == NULL) return (ENOMEM); if (i->event == NULL) { error = intr_event_create(&i->event, (void *)i, 0, irq, powerpc_intr_pre_ithread, powerpc_intr_post_ithread, powerpc_intr_eoi, powerpc_assign_intr_cpu, "irq%u:", irq); if (error) return (error); enable = 1; } error = intr_event_add_handler(i->event, name, filter, handler, arg, intr_priority(flags), flags, cookiep); - + if (error) + return (error); + i->pi_domain = domain; + if (strcmp(name, "IPI") != 0) { + CPU_ZERO(&i->pi_cpuset); + CPU_COPY(&cpuset_domain[domain], &i->pi_cpuset); + } mtx_lock(&intr_table_lock); intrcnt_setname(i->event->ie_fullname, i->cntindex); mtx_unlock(&intr_table_lock); if (!cold) { error = powerpc_map_irq(i); if (!error) { if (i->trig == INTR_TRIGGER_INVALID) PIC_TRANSLATE_CODE(i->pic, i->intline, i->fwcode, &i->trig, &i->pol); if (i->trig != INTR_TRIGGER_CONFORM || i->pol != INTR_POLARITY_CONFORM) PIC_CONFIG(i->pic, i->intline, i->trig, i->pol); if (i->pic == root_pic) - PIC_BIND(i->pic, i->intline, i->cpu, &i->priv); + PIC_BIND(i->pic, i->intline, i->pi_cpuset, &i->priv); if (enable) PIC_ENABLE(i->pic, i->intline, i->vector, &i->priv); } } return (error); } int powerpc_teardown_intr(void *cookie) { return (intr_event_remove_handler(cookie)); } #ifdef SMP int powerpc_bind_intr(u_int irq, u_char cpu) { struct powerpc_intr *i; i = intr_lookup(irq); if (i == NULL) return (ENOMEM); return (intr_event_bind(i->event, cpu)); } #endif int powerpc_fw_config_intr(int irq, int sense_code) { struct powerpc_intr *i; i = intr_lookup(irq); if (i == NULL) return (ENOMEM); i->trig = INTR_TRIGGER_INVALID; i->pol = INTR_POLARITY_CONFORM; i->fwcode = sense_code; if (!cold && i->pic != NULL) { PIC_TRANSLATE_CODE(i->pic, i->intline, i->fwcode, &i->trig, &i->pol); PIC_CONFIG(i->pic, i->intline, i->trig, i->pol); } return (0); } int powerpc_config_intr(int irq, enum intr_trigger trig, enum intr_polarity pol) { struct powerpc_intr *i; i = intr_lookup(irq); if (i == NULL) return (ENOMEM); i->trig = trig; i->pol = pol; if (!cold && i->pic != NULL) PIC_CONFIG(i->pic, i->intline, trig, pol); return (0); } void powerpc_dispatch_intr(u_int vector, struct trapframe *tf) { struct powerpc_intr *i; struct intr_event *ie; i = powerpc_intrs[vector]; if (i == NULL) goto stray; (*i->cntp)++; ie = i->event; KASSERT(ie != NULL, ("%s: interrupt without an event", __func__)); /* * IPIs are magical and need to be EOI'ed before filtering. * This prevents races in IPI handling. */ if (i->ipi) PIC_EOI(i->pic, i->intline, i->priv); if (intr_event_handle(ie, tf) != 0) { goto stray; } return; stray: stray_count++; if (stray_count <= MAX_STRAY_LOG) { printf("stray irq %d\n", i ? 
i->irq : -1); if (stray_count >= MAX_STRAY_LOG) { printf("got %d stray interrupts, not logging anymore\n", MAX_STRAY_LOG); } } if (i != NULL) PIC_MASK(i->pic, i->intline, i->priv); } void powerpc_intr_mask(u_int irq) { struct powerpc_intr *i; i = intr_lookup(irq); if (i == NULL || i->pic == NULL) return; PIC_MASK(i->pic, i->intline, i->priv); } void powerpc_intr_unmask(u_int irq) { struct powerpc_intr *i; i = intr_lookup(irq); if (i == NULL || i->pic == NULL) return; PIC_UNMASK(i->pic, i->intline, i->priv); } Index: head/sys/powerpc/powerpc/mp_machdep.c =================================================================== --- head/sys/powerpc/powerpc/mp_machdep.c (revision 346173) +++ head/sys/powerpc/powerpc/mp_machdep.c (revision 346174) @@ -1,386 +1,395 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pic_if.h" extern struct pcpu __pcpu[MAXCPU]; volatile static int ap_awake; volatile static u_int ap_letgo; volatile static u_quad_t ap_timebase; static u_int ipi_msg_cnt[32]; static struct mtx ap_boot_mtx; struct pcb stoppcbs[MAXCPU]; void machdep_ap_bootstrap(void) { PCPU_SET(awake, 1); __asm __volatile("msync; isync"); while (ap_letgo == 0) nop_prio_vlow(); nop_prio_medium(); /* * Set timebase as soon as possible to meet an implicit rendezvous * from cpu_mp_unleash(), which sets ap_letgo and then immediately * sets timebase. * * Note that this is intrinsically racy and is only relevant on * platforms that do not support better mechanisms. */ platform_smp_timebase_sync(ap_timebase, 1); /* Give platform code a chance to do anything else necessary */ platform_smp_ap_init(); /* Initialize decrementer */ decr_ap_init(); /* Serialize console output and AP count increment */ mtx_lock_spin(&ap_boot_mtx); ap_awake++; if (bootverbose) printf("SMP: AP CPU #%d launched\n", PCPU_GET(cpuid)); else printf("%s%d%s", ap_awake == 2 ? "Launching APs: " : "", PCPU_GET(cpuid), ap_awake == mp_ncpus ?
"\n" : " "); mtx_unlock_spin(&ap_boot_mtx); while(smp_started == 0) ; /* Start per-CPU event timers. */ cpu_initclocks_ap(); /* Announce ourselves awake, and enter the scheduler */ sched_throw(NULL); } void cpu_mp_setmaxid(void) { struct cpuref cpuref; int error; mp_ncpus = 0; mp_maxid = 0; error = platform_smp_first_cpu(&cpuref); while (!error) { mp_ncpus++; mp_maxid = max(cpuref.cr_cpuid, mp_maxid); error = platform_smp_next_cpu(&cpuref); } /* Sanity. */ if (mp_ncpus == 0) mp_ncpus = 1; } int cpu_mp_probe(void) { /* * We're not going to enable SMP if there's only 1 processor. */ return (mp_ncpus > 1); } void cpu_mp_start(void) { struct cpuref bsp, cpu; struct pcpu *pc; int error; error = platform_smp_get_bsp(&bsp); KASSERT(error == 0, ("Don't know BSP")); error = platform_smp_first_cpu(&cpu); while (!error) { if (cpu.cr_cpuid >= MAXCPU) { printf("SMP: cpu%d: skipped -- ID out of range\n", cpu.cr_cpuid); goto next; } if (CPU_ISSET(cpu.cr_cpuid, &all_cpus)) { printf("SMP: cpu%d: skipped - duplicate ID\n", cpu.cr_cpuid); goto next; } if (cpu.cr_cpuid != bsp.cr_cpuid) { void *dpcpu; pc = &__pcpu[cpu.cr_cpuid]; dpcpu = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | M_ZERO); pcpu_init(pc, cpu.cr_cpuid, sizeof(*pc)); dpcpu_init(dpcpu, cpu.cr_cpuid); } else { pc = pcpup; pc->pc_cpuid = bsp.cr_cpuid; pc->pc_bsp = 1; } pc->pc_hwref = cpu.cr_hwref; + + if (vm_ndomains > 1) + pc->pc_domain = cpu.cr_domain; + else + pc->pc_domain = 0; + + CPU_SET(pc->pc_cpuid, &cpuset_domain[pc->pc_domain]); + KASSERT(pc->pc_domain < MAXMEMDOM, ("bad domain value %d\n", + pc->pc_domain)); CPU_SET(pc->pc_cpuid, &all_cpus); next: error = platform_smp_next_cpu(&cpu); } #ifdef SMP platform_smp_probe_threads(); #endif } void cpu_mp_announce(void) { struct pcpu *pc; int i; if (!bootverbose) return; CPU_FOREACH(i) { pc = pcpu_find(i); if (pc == NULL) continue; - printf("cpu%d: dev=%x", i, (int)pc->pc_hwref); + printf("cpu%d: dev=%x domain=%d ", i, (int)pc->pc_hwref, pc->pc_domain); if (pc->pc_bsp) printf(" (BSP)"); printf("\n"); } } static void cpu_mp_unleash(void *dummy) { struct pcpu *pc; int cpus, timeout; int ret; if (mp_ncpus <= 1) return; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); cpus = 0; smp_cpus = 0; #ifdef BOOKE tlb1_ap_prep(); #endif STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { cpus++; if (!pc->pc_bsp) { if (bootverbose) printf("Waking up CPU %d (dev=%x)\n", pc->pc_cpuid, (int)pc->pc_hwref); ret = platform_smp_start_cpu(pc); if (ret == 0) { timeout = 2000; /* wait 2sec for the AP */ while (!pc->pc_awake && --timeout > 0) DELAY(1000); } } else { pc->pc_awake = 1; } if (pc->pc_awake) { if (bootverbose) printf("Adding CPU %d, hwref=%jx, awake=%x\n", pc->pc_cpuid, (uintmax_t)pc->pc_hwref, pc->pc_awake); smp_cpus++; } else CPU_SET(pc->pc_cpuid, &stopped_cpus); } ap_awake = 1; /* Provide our current DEC and TB values for APs */ ap_timebase = mftb() + 10; __asm __volatile("msync; isync"); /* Let APs continue */ atomic_store_rel_int(&ap_letgo, 1); platform_smp_timebase_sync(ap_timebase, 0); while (ap_awake < smp_cpus) ; if (smp_cpus != cpus || cpus != mp_ncpus) { printf("SMP: %d CPUs found; %d CPUs usable; %d CPUs woken\n", mp_ncpus, cpus, smp_cpus); } if (smp_cpus > 1) atomic_store_rel_int(&smp_started, 1); /* Let the APs get into the scheduler */ DELAY(10000); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, cpu_mp_unleash, NULL); int powerpc_ipi_handler(void *arg) { u_int cpuid; uint32_t ipimask; int msg; CTR2(KTR_SMP, "%s: MSR 0x%08x", __func__, mfmsr()); ipimask = atomic_readandclear_32(&(pcpup->pc_ipimask)); if 
(ipimask == 0) return (FILTER_STRAY); while ((msg = ffs(ipimask) - 1) != -1) { ipimask &= ~(1u << msg); ipi_msg_cnt[msg]++; switch (msg) { case IPI_AST: CTR1(KTR_SMP, "%s: IPI_AST", __func__); break; case IPI_PREEMPT: CTR1(KTR_SMP, "%s: IPI_PREEMPT", __func__); sched_preempt(curthread); break; case IPI_RENDEZVOUS: CTR1(KTR_SMP, "%s: IPI_RENDEZVOUS", __func__); smp_rendezvous_action(); break; case IPI_STOP: /* * IPI_STOP_HARD is mapped to IPI_STOP so it is not * necessary to add such case in the switch. */ CTR1(KTR_SMP, "%s: IPI_STOP or IPI_STOP_HARD (stop)", __func__); cpuid = PCPU_GET(cpuid); savectx(&stoppcbs[cpuid]); savectx(PCPU_GET(curpcb)); CPU_SET_ATOMIC(cpuid, &stopped_cpus); while (!CPU_ISSET(cpuid, &started_cpus)) cpu_spinwait(); CPU_CLR_ATOMIC(cpuid, &stopped_cpus); CPU_CLR_ATOMIC(cpuid, &started_cpus); CTR1(KTR_SMP, "%s: IPI_STOP (restart)", __func__); break; case IPI_HARDCLOCK: CTR1(KTR_SMP, "%s: IPI_HARDCLOCK", __func__); hardclockintr(); break; } } return (FILTER_HANDLED); } static void ipi_send(struct pcpu *pc, int ipi) { CTR4(KTR_SMP, "%s: pc=%p, targetcpu=%d, IPI=%d", __func__, pc, pc->pc_cpuid, ipi); atomic_set_32(&pc->pc_ipimask, (1 << ipi)); powerpc_sync(); PIC_IPI(root_pic, pc->pc_cpuid); CTR1(KTR_SMP, "%s: sent", __func__); } /* Send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, int ipi) { struct pcpu *pc; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (CPU_ISSET(pc->pc_cpuid, &cpus)) ipi_send(pc, ipi); } } /* Send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { ipi_send(cpuid_to_pcpu[cpu], ipi); } /* Send an IPI to all CPUs EXCEPT myself. */ void ipi_all_but_self(int ipi) { struct pcpu *pc; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc != pcpup) ipi_send(pc, ipi); } } Index: head/sys/powerpc/powerpc/nexus.c =================================================================== --- head/sys/powerpc/powerpc/nexus.c (revision 346173) +++ head/sys/powerpc/powerpc/nexus.c (revision 346174) @@ -1,236 +1,266 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * Copyright 2001 by Thomas Moestl . * Copyright 2006 by Marius Strobl . * All rights reserved. * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. 
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: FreeBSD: src/sys/i386/i386/nexus.c,v 1.43 2001/02/09 */ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include /* * The nexus handles root-level resource allocation requests and interrupt * mapping. All direct subdevices of nexus are attached by DEVICE_IDENTIFY(). */ static device_probe_t nexus_probe; static device_attach_t nexus_attach; static bus_setup_intr_t nexus_setup_intr; static bus_teardown_intr_t nexus_teardown_intr; static bus_activate_resource_t nexus_activate_resource; static bus_deactivate_resource_t nexus_deactivate_resource; static bus_space_tag_t nexus_get_bus_tag(device_t, device_t); +static int nexus_get_cpus(device_t, device_t, enum cpu_sets, size_t, + cpuset_t *); #ifdef SMP static bus_bind_intr_t nexus_bind_intr; #endif static bus_config_intr_t nexus_config_intr; static ofw_bus_map_intr_t nexus_ofw_map_intr; static device_method_t nexus_methods[] = { /* Device interface */ DEVMETHOD(device_probe, nexus_probe), DEVMETHOD(device_attach, nexus_attach), /* Bus interface */ DEVMETHOD(bus_add_child, bus_generic_add_child), DEVMETHOD(bus_activate_resource, nexus_activate_resource), DEVMETHOD(bus_deactivate_resource, nexus_deactivate_resource), DEVMETHOD(bus_setup_intr, nexus_setup_intr), DEVMETHOD(bus_teardown_intr, nexus_teardown_intr), #ifdef SMP DEVMETHOD(bus_bind_intr, nexus_bind_intr), #endif DEVMETHOD(bus_config_intr, nexus_config_intr), DEVMETHOD(bus_get_bus_tag, nexus_get_bus_tag), + DEVMETHOD(bus_get_cpus, nexus_get_cpus), /* ofw_bus interface */ DEVMETHOD(ofw_bus_map_intr, nexus_ofw_map_intr), DEVMETHOD_END }; static devclass_t nexus_devclass; DEFINE_CLASS_0(nexus, nexus_driver, nexus_methods, 1); EARLY_DRIVER_MODULE(nexus, root, nexus_driver, nexus_devclass, 0, 0, BUS_PASS_BUS); MODULE_VERSION(nexus, 1); static int nexus_probe(device_t dev) { device_quiet(dev); /* suppress attach message for neatness */ return (BUS_PROBE_DEFAULT); } static int nexus_attach(device_t dev) { bus_generic_probe(dev); bus_generic_attach(dev); return (0); } static int nexus_setup_intr(device_t bus __unused, device_t child, struct resource *r, int flags, driver_filter_t *filt, driver_intr_t *intr, void *arg, void **cookiep) { - int error; + int error, domain; if (r == NULL) panic("%s: NULL interrupt resource!", __func__); + if (cookiep != NULL) + *cookiep = NULL; if ((rman_get_flags(r) & RF_SHAREABLE) == 0) flags |= INTR_EXCL; /* We depend here on rman_activate_resource() being idempotent. 
 */
	error = rman_activate_resource(r);
	if (error)
		return (error);

+	if (bus_get_domain(child, &domain) != 0) {
+		if (bootverbose)
+			device_printf(child, "no domain found\n");
+		domain = 0;
+	}
	error = powerpc_setup_intr(device_get_nameunit(child),
-	    rman_get_start(r), filt, intr, arg, flags, cookiep);
+	    rman_get_start(r), filt, intr, arg, flags, cookiep, domain);
	return (error);
}

static int
nexus_teardown_intr(device_t bus __unused, device_t child __unused,
    struct resource *r, void *ih)
{

	if (r == NULL)
		return (EINVAL);

	return (powerpc_teardown_intr(ih));
}

static bus_space_tag_t
nexus_get_bus_tag(device_t bus __unused, device_t child __unused)
{

	return(&bs_be_tag);
+}
+
+static int
+nexus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize,
+    cpuset_t *cpuset)
+{
+
+	switch (op) {
+#ifdef SMP
+	case INTR_CPUS:
+		if (setsize != sizeof(cpuset_t))
+			return (EINVAL);
+		*cpuset = all_cpus;
+		return (0);
+#endif
+	default:
+		return (bus_generic_get_cpus(dev, child, op, setsize, cpuset));
+	}
}

#ifdef SMP
static int
nexus_bind_intr(device_t bus __unused, device_t child __unused,
    struct resource *r, int cpu)
{

	return (powerpc_bind_intr(rman_get_start(r), cpu));
}
#endif

static int
nexus_config_intr(device_t dev, int irq, enum intr_trigger trig,
    enum intr_polarity pol)
{

	return (powerpc_config_intr(irq, trig, pol));
}

static int
nexus_ofw_map_intr(device_t dev, device_t child, phandle_t iparent, int icells,
    pcell_t *irq)
{
	u_int intr = MAP_IRQ(iparent, irq[0]);

	if (icells > 1)
		powerpc_fw_config_intr(intr, irq[1]);

	return (intr);
}

static int
nexus_activate_resource(device_t bus __unused, device_t child __unused,
    int type, int rid __unused, struct resource *r)
{

	if (type == SYS_RES_MEMORY) {
		vm_paddr_t start;
		void *p;

		start = (vm_paddr_t) rman_get_start(r);
		if (bootverbose)
			printf("nexus mapdev: start %jx, len %jd\n",
			    (uintmax_t)start, rman_get_size(r));

		p = pmap_mapdev(start, (vm_size_t) rman_get_size(r));
		if (p == NULL)
			return (ENOMEM);
		rman_set_virtual(r, p);
		rman_set_bustag(r, &bs_be_tag);
		rman_set_bushandle(r, (u_long)p);
	}

	return (rman_activate_resource(r));
}

static int
nexus_deactivate_resource(device_t bus __unused, device_t child __unused,
    int type __unused, int rid __unused, struct resource *r)
{

	/*
	 * If this is a memory resource, unmap it.
	 */
	if ((type == SYS_RES_MEMORY) || (type == SYS_RES_IOPORT)) {
		bus_size_t psize;

		psize = rman_get_size(r);
		pmap_unmapdev((vm_offset_t)rman_get_virtual(r), psize);
	}

	return (rman_deactivate_resource(r));
}

Index: head/sys/powerpc/powerpc/platform.c
===================================================================
--- head/sys/powerpc/powerpc/platform.c	(revision 346173)
+++ head/sys/powerpc/powerpc/platform.c	(revision 346174)
@@ -1,345 +1,399 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2005 Peter Grehan
 * Copyright (c) 2009 Nathan Whitehorn
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); /* * Dispatch platform calls to the appropriate platform implementation * through a previously registered kernel object. */ #include #include #include #include #include #include #include #include #include #include #include +#include #include +#include #include #include #include #include #include +#include #include "platform_if.h" static platform_def_t *plat_def_impl; static platform_t plat_obj; static struct kobj_ops plat_kernel_kops; static struct platform_kobj plat_kernel_obj; static char plat_name[64] = ""; SYSCTL_STRING(_hw, OID_AUTO, platform, CTLFLAG_RD | CTLFLAG_TUN, plat_name, 0, "Platform currently in use"); +static struct mem_affinity mem_info[VM_PHYSSEG_MAX + 1]; +static int vm_locality_table[MAXMEMDOM * MAXMEMDOM]; static struct mem_region pregions[PHYS_AVAIL_SZ]; +static struct numa_mem_region numa_pregions[PHYS_AVAIL_SZ]; static struct mem_region aregions[PHYS_AVAIL_SZ]; -static int npregions, naregions; +static int nnumapregions, npregions, naregions; /* * Memory region utilities: determine if two regions overlap, * and merge two overlapping regions into one */ static int memr_overlap(struct mem_region *r1, struct mem_region *r2) { if ((r1->mr_start + r1->mr_size) < r2->mr_start || (r2->mr_start + r2->mr_size) < r1->mr_start) return (FALSE); return (TRUE); } static void memr_merge(struct mem_region *from, struct mem_region *to) { vm_offset_t end; end = uqmax(to->mr_start + to->mr_size, from->mr_start + from->mr_size); to->mr_start = uqmin(from->mr_start, to->mr_start); to->mr_size = end - to->mr_start; } /* * Quick sort callout for comparing memory regions. 
 */
static int
mr_cmp(const void *a, const void *b)
{
	const struct mem_region *regiona, *regionb;

	regiona = a;
	regionb = b;
	if (regiona->mr_start < regionb->mr_start)
		return (-1);
	else if (regiona->mr_start > regionb->mr_start)
		return (1);
	else
		return (0);
}

void
+numa_mem_regions(struct numa_mem_region **phys, int *physsz)
+{
+	struct mem_affinity *mi;
+	int i, j, maxdom, ndomain, offset;
+
+	nnumapregions = 0;
+	PLATFORM_NUMA_MEM_REGIONS(plat_obj, numa_pregions, &nnumapregions);
+
+	if (physsz != NULL)
+		*physsz = nnumapregions;
+	if (phys != NULL)
+		*phys = numa_pregions;
+	if (physsz == NULL || phys == NULL) {
+		printf("%s: phys or physsz not set\n", __func__);
+		return;
+	}
+	maxdom = 0;
+	for (i = 0; i < nnumapregions; i++)
+		if (numa_pregions[i].mr_domain > maxdom)
+			maxdom = numa_pregions[i].mr_domain;
+
+	mi = mem_info;
+	for (i = 0; i < nnumapregions; i++, mi++) {
+		mi->start = numa_pregions[i].mr_start;
+		mi->end = numa_pregions[i].mr_start + numa_pregions[i].mr_size;
+		mi->domain = numa_pregions[i].mr_domain;
+	}
+	offset = 0;
+	vm_locality_table[offset] = 10;
+	ndomain = maxdom + 1;
+	if (ndomain > 1) {
+		for (i = 0; i < ndomain; i++) {
+			for (j = 0; j < ndomain; j++) {
+				/*
+				 * No real distance data is available;
+				 * use SLIT-style defaults (10 local,
+				 * 21 remote).
+				 */
+				if (i == j)
+					vm_locality_table[offset] = 10;
+				else
+					vm_locality_table[offset] = 21;
+				offset++;
+			}
+		}
+	}
+	vm_phys_register_domains(ndomain, mem_info, vm_locality_table);
+}
+
+void
mem_regions(struct mem_region **phys, int *physsz, struct mem_region **avail,
    int *availsz)
{
	int i, j, still_merging;

	if (npregions == 0) {
		PLATFORM_MEM_REGIONS(plat_obj, pregions, &npregions,
		    aregions, &naregions);
		qsort(pregions, npregions, sizeof(*pregions), mr_cmp);
		qsort(aregions, naregions, sizeof(*aregions), mr_cmp);

		/* Remove overlapping available regions */
		do {
			still_merging = FALSE;
			for (i = 0; i < naregions; i++) {
				if (aregions[i].mr_size == 0)
					continue;
				for (j = i+1; j < naregions; j++) {
					if (aregions[j].mr_size == 0)
						continue;
					if (!memr_overlap(&aregions[j],
					    &aregions[i]))
						continue;

					memr_merge(&aregions[j], &aregions[i]);
					/* mark inactive */
					aregions[j].mr_size = 0;
					still_merging = TRUE;
				}
			}
		} while (still_merging == TRUE);

		/* Collapse zero-length available regions */
		for (i = 0; i < naregions; i++) {
			if (aregions[i].mr_size == 0) {
				memcpy(&aregions[i], &aregions[i+1],
				    (naregions - i - 1)*sizeof(*aregions));
				naregions--;
				i--;
			}
		}
	}

	if (phys != NULL)
		*phys = pregions;
	if (avail != NULL)
		*avail = aregions;
	if (physsz != NULL)
		*physsz = npregions;
	if (availsz != NULL)
		*availsz = naregions;
}

int
mem_valid(vm_offset_t addr, int len)
{
	int i;

	if (npregions == 0) {
		struct mem_region *p, *a;
		int na, np;
		mem_regions(&p, &np, &a, &na);
	}

	for (i = 0; i < npregions; i++)
		if ((addr >= pregions[i].mr_start)
		    && (addr + len <= pregions[i].mr_start +
		    pregions[i].mr_size))
			return (0);

	return (EFAULT);
}

vm_offset_t
platform_real_maxaddr(void)
{
	return (PLATFORM_REAL_MAXADDR(plat_obj));
}

const char *
installed_platform()
{
	return (plat_def_impl->name);
}

u_long
platform_timebase_freq(struct cpuref *cpu)
{
	return (PLATFORM_TIMEBASE_FREQ(plat_obj, cpu));
}

/*
 * Put the current CPU, as last step in suspend, to sleep
 */
void
platform_sleep()
{
	PLATFORM_SLEEP(plat_obj);
}

int
platform_smp_first_cpu(struct cpuref *cpu)
{
	return (PLATFORM_SMP_FIRST_CPU(plat_obj, cpu));
}

int
platform_smp_next_cpu(struct cpuref *cpu)
{
	return (PLATFORM_SMP_NEXT_CPU(plat_obj, cpu));
}

int
platform_smp_get_bsp(struct cpuref *cpu)
{
	return (PLATFORM_SMP_GET_BSP(plat_obj, cpu));
}

int
platform_smp_start_cpu(struct pcpu
*cpu) { return (PLATFORM_SMP_START_CPU(plat_obj, cpu)); } void platform_smp_ap_init() { PLATFORM_SMP_AP_INIT(plat_obj); } void platform_smp_probe_threads(void) { PLATFORM_SMP_PROBE_THREADS(plat_obj); } #ifdef SMP struct cpu_group * cpu_topo(void) { - return (PLATFORM_SMP_TOPO(plat_obj)); + return (PLATFORM_SMP_TOPO(plat_obj)); } #endif /* * Reset back to firmware. */ void cpu_reset() { PLATFORM_RESET(plat_obj); } void platform_smp_timebase_sync(u_long tb, int ap) { PLATFORM_SMP_TIMEBASE_SYNC(plat_obj, tb, ap); } /* * Platform install routines. Highest priority wins, using the same * algorithm as bus attachment. */ SET_DECLARE(platform_set, platform_def_t); void platform_probe_and_attach() { platform_def_t **platpp, *platp; int prio, best_prio; plat_obj = &plat_kernel_obj; best_prio = 0; /* * Try to locate the best platform kobj */ SET_FOREACH(platpp, platform_set) { platp = *platpp; /* * Take care of compiling the selected class, and * then statically initialise the MMU object */ kobj_class_compile_static(platp, &plat_kernel_kops); kobj_init_static((kobj_t)plat_obj, platp); prio = PLATFORM_PROBE(plat_obj); /* Check for errors */ if (prio > 0) continue; /* * Check if this module was specifically requested through * the loader tunable we provide. */ if (strcmp(platp->name,plat_name) == 0) { plat_def_impl = platp; break; } /* Otherwise, see if it is better than our current best */ if (plat_def_impl == NULL || prio > best_prio) { best_prio = prio; plat_def_impl = platp; } /* * We can't free the KOBJ, since it is static. Reset the ops * member of this class so that we can come back later. */ platp->ops = NULL; } if (plat_def_impl == NULL) panic("No platform module found!"); /* * Recompile to make sure we ended with the * correct one, and then attach. */ kobj_class_compile_static(plat_def_impl, &plat_kernel_kops); kobj_init_static((kobj_t)plat_obj, plat_def_impl); strlcpy(plat_name,plat_def_impl->name,sizeof(plat_name)); PLATFORM_ATTACH(plat_obj); } Index: head/sys/powerpc/powerpc/platform_if.m =================================================================== --- head/sys/powerpc/powerpc/platform_if.m (revision 346173) +++ head/sys/powerpc/powerpc/platform_if.m (revision 346174) @@ -1,241 +1,257 @@ #- # Copyright (c) 2009 Nathan Whitehorn # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. 
#
# $FreeBSD$
#

#include
#include
#include
#include
#include
#include
#include
#include
#include

/**
 * @defgroup PLATFORM platform - KObj methods for PowerPC platform
 * implementations
 * @brief A set of methods required by all platform implementations.
 * These are used to bring up secondary CPUs, supply the physical memory
 * map, etc.
 *@{
 */

INTERFACE platform;

#
# Default implementations
#
CODE {
	static void platform_null_attach(platform_t plat)
	{
		return;
	}
	static int platform_null_smp_first_cpu(platform_t plat,
	    struct cpuref *cpuref)
	{
		cpuref->cr_hwref = -1;
		cpuref->cr_cpuid = 0;
		return (0);
	}
	static int platform_null_smp_next_cpu(platform_t plat,
	    struct cpuref *_cpuref)
	{
		return (ENOENT);
	}
	static struct cpu_group *platform_null_smp_topo(platform_t plat)
	{
#ifdef SMP
		return (smp_topo_none());
#else
		return (NULL);
#endif
	}
	static vm_offset_t platform_null_real_maxaddr(platform_t plat)
	{
		return (VM_MAX_ADDRESS);
	}
	static void platform_null_smp_ap_init(platform_t plat)
	{
		return;
	}
	static void platform_null_smp_probe_threads(void)
	{
		return;
	}
};

/**
 * @brief Probe for whether we are on this platform, returning the standard
 * newbus probe codes. If we have Open Firmware or a flattened device tree,
 * it is guaranteed to be available at this point.
 */
METHOD int probe {
	platform_t	_plat;
};

/**
 * @brief Attach this platform module. This happens before the MMU is online,
 * so the platform module can install its own high-priority MMU module at
 * this point.
 */
METHOD int attach {
	platform_t	_plat;
} DEFAULT platform_null_attach;

/**
 * @brief Return the system's physical memory map.
 *
 * It shall provide the total and the available regions of RAM.
 * The available regions need not take the kernel into account.
 *
 * @param _memp		Array of physical memory chunks
 * @param _memsz	Number of physical memory chunks
 * @param _availp	Array of available physical memory chunks
 * @param _availsz	Number of available physical memory chunks
 */
METHOD void mem_regions {
	platform_t	_plat;
	struct mem_region *_memp;
	int		*_memsz;
	struct mem_region *_availp;
	int		*_availsz;
};
+
+/**
+ * @brief Return the system's NUMA-aware physical memory map.
+ *
+ * It shall provide the total RAM with the corresponding domains.
+ *
+ * @param _memp		Array of physical memory chunks
+ * @param _memsz	Number of physical memory chunks
+ */
+
+METHOD void numa_mem_regions {
+	platform_t	_plat;
+	struct numa_mem_region *_memp;
+	int		*_memsz;
+};
+
/**
 * @brief Return the maximum address accessible in real mode
 * (for use with hypervisors)
 */
METHOD vm_offset_t real_maxaddr {
	platform_t	_plat;
} DEFAULT platform_null_real_maxaddr;

/**
 * @brief Get the CPU's timebase frequency, in ticks per second.
 *
 * @param _cpu	CPU whose timebase to query
 */
METHOD u_long timebase_freq {
	platform_t	_plat;
	struct cpuref	*_cpu;
};

# SMP bits

/**
 * @brief Fill the first CPU's cpuref
 *
 * @param _cpuref	CPU
 */
METHOD int smp_first_cpu {
	platform_t	_plat;
	struct cpuref	*_cpuref;
} DEFAULT platform_null_smp_first_cpu;

/**
 * @brief Fill the next CPU's cpuref
 *
 * @param _cpuref	CPU
 */
METHOD int smp_next_cpu {
	platform_t	_plat;
	struct cpuref	*_cpuref;
} DEFAULT platform_null_smp_next_cpu;

/**
 * @brief Find the boot processor
 *
 * @param _cpuref	CPU
 */
METHOD int smp_get_bsp {
	platform_t	_plat;
	struct cpuref	*_cpuref;
} DEFAULT platform_null_smp_first_cpu;

/**
 * @brief Start a CPU
 *
 * @param _cpu	CPU
 */
METHOD int smp_start_cpu {
	platform_t	_plat;
	struct pcpu	*_cpu;
};

/**
 * @brief Perform early (pre-scheduler) initialization on the current AP
 */
METHOD void smp_ap_init {
	platform_t	_plat;
} DEFAULT platform_null_smp_ap_init;

/**
 * @brief Probe mp_ncores and smp_threads_per_core for early MI code
 */
METHOD void smp_probe_threads {
	platform_t	_plat;
} DEFAULT platform_null_smp_probe_threads;

/**
 * @brief Return SMP topology
 */
METHOD cpu_group_t smp_topo {
	platform_t	_plat;
} DEFAULT platform_null_smp_topo;

/**
 * @brief Reset system
 */
METHOD void reset {
	platform_t	_plat;
};

/**
 * @brief Suspend the CPU
 */
METHOD void sleep {
	platform_t	_plat;
};

/**
 * @brief Attempt to synchronize timebase of current CPU with others.
 * Entered (approximately) simultaneously on all CPUs, including the BSP.
 * Passed the timebase value on the BSP as of shortly before the call.
 */
METHOD void smp_timebase_sync {
	platform_t	_plat;
	u_long		_tb;
	int		_ap;
};
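
The numa_mem_regions METHOD above is the whole interface declaration; the kobj tooling expands each METHOD entry into a typedef and an inline dispatch wrapper, which is what platform.c's numa_mem_regions() invokes as PLATFORM_NUMA_MEM_REGIONS(). A simplified sketch of what the generated platform_if.h should look like (the real makeobjops.awk output also emits a method descriptor and goes through the kobj method cache):

    typedef void platform_numa_mem_regions_t(platform_t _plat,
        struct numa_mem_region *_memp, int *_memsz);

    static __inline void
    PLATFORM_NUMA_MEM_REGIONS(platform_t _plat, struct numa_mem_region *_memp,
        int *_memsz)
    {
        kobjop_t _m;

        /* Resolve the platform module's implementation, then call it. */
        KOBJOPLOOKUP(((kobj_t)_plat)->ops, platform_numa_mem_regions);
        ((platform_numa_mem_regions_t *)_m)(_plat, _memp, _memsz);
    }

No DEFAULT is declared, so a platform that does not implement the method falls through to kobj's error stub; since numa_mem_regions() zeroes nnumapregions before dispatching, such platforms end up reporting zero NUMA regions.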
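
On the implementation side, a platform module fills in the array the wrapper hands down. A minimal sketch, assuming a fixed two-domain memory layout; every myplat_* name is invented for illustration, a real module would derive regions and domains from firmware data, and the method table is abbreviated:

    static void
    myplat_numa_mem_regions(platform_t plat, struct numa_mem_region *regions,
        int *nregions)
    {

        /* Made-up layout: domain 0 owns the first 4 GB, domain 1 the next. */
        regions[0].mr_start = 0;
        regions[0].mr_size = 4UL << 30;
        regions[0].mr_domain = 0;
        regions[1].mr_start = 4UL << 30;
        regions[1].mr_size = 4UL << 30;
        regions[1].mr_domain = 1;
        *nregions = 2;
    }

    static platform_method_t myplat_methods[] = {
        /* probe, attach, mem_regions, etc. omitted for brevity */
        PLATFORMMETHOD(platform_numa_mem_regions, myplat_numa_mem_regions),
        PLATFORMMETHOD_END
    };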
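
numa_mem_regions() in platform.c then folds those regions into mem_info[] and hands vm_phys_register_domains() a flat ndomain-by-ndomain distance table, stored row-major with ACPI-SLIT-style values: 10 on the diagonal (local), 21 elsewhere (remote), the commit itself noting that better numbers are not known. A standalone userland model of just the table construction:

    #include <stdio.h>

    #define MAXDOM  8   /* stand-in for the kernel's MAXMEMDOM */

    int
    main(void)
    {
        int table[MAXDOM * MAXDOM];
        int ndomain = 2, i, j, offset = 0;

        /* Row-major: table[i * ndomain + j] is the distance from i to j. */
        for (i = 0; i < ndomain; i++)
            for (j = 0; j < ndomain; j++)
                table[offset++] = (i == j) ? 10 : 21;

        for (i = 0; i < ndomain; i++) {
            for (j = 0; j < ndomain; j++)
                printf("%3d", table[i * ndomain + j]);
            printf("\n");
        }
        return (0);
    }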
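
On the CPU side, cpu_mp_start() now records each CPU's domain (cpuref.cr_domain when vm_ndomains > 1, otherwise 0) and accumulates per-domain CPU sets in cpuset_domain[]. A userland model of that bookkeeping, with plain bitmasks standing in for cpuset_t and an invented CPU-to-domain assignment:

    #include <stdio.h>
    #include <stdint.h>

    #define NCPU    8
    #define NDOMAIN 2

    int
    main(void)
    {
        int cr_domain[NCPU] = { 0, 0, 0, 0, 1, 1, 1, 1 }; /* invented */
        uint32_t cpuset_domain[NDOMAIN] = { 0, 0 };
        int vm_ndomains = NDOMAIN, cpu, dom;

        for (cpu = 0; cpu < NCPU; cpu++) {
            dom = (vm_ndomains > 1) ? cr_domain[cpu] : 0;
            cpuset_domain[dom] |= 1u << cpu;    /* CPU_SET() analogue */
        }
        for (dom = 0; dom < NDOMAIN; dom++)
            printf("domain %d: cpu mask 0x%02x\n", dom, cpuset_domain[dom]);
        return (0);
    }

These are the sets that powerpc_setup_intr() copies into i->pi_cpuset and passes to PIC_BIND(), replacing the old single-CPU binding, so a device's interrupt is steered to the CPUs of its own memory domain.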
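
Finally, the bus_get_cpus glue lets MI driver code ask which CPUs may service its interrupts; the powerpc nexus now answers INTR_CPUS with all_cpus and defers other set types to bus_generic_get_cpus(). A hedged consumer sketch, where mydriver_attach is invented but bus_get_cpus() and CPU_COUNT() are existing kernel interfaces:

    static int
    mydriver_attach(device_t dev)
    {
        cpuset_t cpus;
        int error;

        error = bus_get_cpus(dev, INTR_CPUS, sizeof(cpus), &cpus);
        if (error != 0)
            return (error);
        device_printf(dev, "%d CPUs available as interrupt targets\n",
            CPU_COUNT(&cpus));
        return (0);
    }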