Index: stable/10/sys/amd64/amd64/uma_machdep.c =================================================================== --- stable/10/sys/amd64/amd64/uma_machdep.c (revision 287944) +++ stable/10/sys/amd64/amd64/uma_machdep.c (revision 287945) @@ -1,84 +1,84 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include void * -uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { vm_page_t m; vm_paddr_t pa; void *va; int pflags; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc(NULL, 0, pflags); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else VM_WAIT; } else break; } pa = m->phys_addr; if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)PHYS_TO_DMAP(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) pagezero(va); return (va); } void -uma_small_free(void *mem, int size, u_int8_t flags) +uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = DMAP_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); m->wire_count--; vm_page_free(m); atomic_subtract_int(&cnt.v_wire_count, 1); } Index: stable/10/sys/i386/i386/pmap.c =================================================================== --- stable/10/sys/i386/i386/pmap.c (revision 287944) +++ stable/10/sys/i386/i386/pmap.c (revision 287945) @@ -1,5682 +1,5683 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_apic.h" #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #else #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #include #endif #include #include #include #include #include #ifdef SMP #include #endif #ifdef XBOX #include #endif #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; extern u_int32_t KERNend; extern u_int32_t KPTphys; #if defined(PAE) || defined(PAE_TABLES) pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pat_works = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, "Is page attribute table fully functional?"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ struct sysmaps { struct mtx lock; pt_entry_t *CMAP1; pt_entry_t *CMAP2; caddr_t CADDR1; caddr_t CADDR2; }; static struct sysmaps sysmaps_pcpu[MAXCPU]; pt_entry_t *CMAP3; static pd_entry_t *KPTD; caddr_t ptvmmap = 0; caddr_t CADDR3; struct msgbuf *msgbufp = 0; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = 0, *PMAP2; static pt_entry_t *PADDR1 = 0, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_flush_page(vm_page_t m); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #if defined(PAE) || defined(PAE_TABLES) -static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); +static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, + int wait); #endif static void pmap_set_pg(void); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * If you get an error here, then you set KVA_PAGES wrong! See the * description of KVA_PAGES in sys/i386/include/pmap.h. It must be * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. */ CTASSERT(KERNBASE % (1 << 24) == 0); /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct sysmaps *sysmaps; int i; /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Initialize the first available kernel virtual address. However, * using "firstaddr" may waste a few pages of the kernel virtual * address space, because locore may not have mapped every physical * page that it allocated. Preferably, locore would provide a first * unused virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). Otherwise, * pmap_update_pde_kernel() could access allpmaps while it is * being changed. */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the idle process page zeroing. */ for (i = 0; i < MAXCPU; i++) { sysmaps = &sysmaps_pcpu[i]; mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) } SYSMAP(caddr_t, CMAP3, CADDR3, 1) /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) /* * KPTmap is used by pmap_kextract(). * * KPTmap is first initialized by locore. However, that initial * KPTmap can only support NKPT page table pages. Here, a larger * KPTmap is created that can support KVA_PAGES page table pages. */ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; /* * Adjust the start of the KPTD and KPTmap so that the implementation * of pmap_kextract() and pmap_growkernel() can be made simpler. */ KPTD -= KPTDI; KPTmap -= i386_btop(KPTDI << PDRSHIFT); /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), * respectively. */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; /* * Leave in place an identity mapping (virt == phys) for the low 1 MB * physical memory region that is used by the ACPI wakeup code. This * mapping must not have PG_G set. */ #ifdef XBOX /* FIXME: This is gross, but needed for the XBOX. Since we are in such * an early stadium, we cannot yet neatly map video memory ... :-( * Better fixes are very welcome! */ if (!arch_i386_is_xbox) #endif for (i = 1; i < NKPT; i++) PTD[i] = 0; /* Initialize the PAT MSR if present. */ pmap_init_pat(); /* Turn on PG_G on kernel page(s) */ pmap_set_pg(); } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) { for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; pat_works = 0; return; } /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ if (cpu_vendor_id == CPU_VENDOR_INTEL && !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) pat_works = 0; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. */ static void pmap_set_pg(void) { pt_entry_t *pte; vm_offset_t va, endva; if (pgeflag == 0) return; endva = KERNBASE + KERNend; if (pseflag) { va = KERNBASE + KERNLOAD; while (va < endva) { pdir_pde(PTD, va) |= pgeflag; invltlb(); /* Play it safe, invltlb() every time */ va += NBPDR; } } else { va = (vm_offset_t)btext; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; invltlb(); /* Play it safe, invltlb() every time */ va += PAGE_SIZE; } } } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } #if defined(PAE) || defined(PAE_TABLES) static void * -pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif /* * Abuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < NKPT; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings supported and enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pseflag == 0) pg_ps_enabled = 0; else if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Calculate the size of the pv head table for superpages. * Handle the possibility that "vm_phys_segs[...].end" is zero. */ pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) / NBPDR + 1; /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #if defined(PAE) || defined(PAE_TABLES) pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif pmap_initialized = 1; if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } } SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2/4MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2/4MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2/4MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2/4MB page promotions"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } /* * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; pmap_t pmap; boolean_t PTD_updated; PTD_updated = FALSE; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) PTD_updated = TRUE; pde = pmap_pde(pmap, va); pde_store(pde, newpde); } mtx_unlock_spin(&allpmaps_lock); KASSERT(PTD_updated, ("pmap_kenter_pde: current page table is not in allpmaps")); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { u_long cr4; if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* * Although preemption at this point could be detrimental to * performance, it would not lead to an error. PG_G is simply * ignored if CR4.PGE is clear. Moreover, in case this block * is re-entered, the load_cr4() either above or below will * modify CR4.PGE flushing the TLB. */ load_cr4(cr4 | CR4_PGE); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { invlpg(va); smp_invlpg(va); } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invlpg(va); CPU_AND(&other_cpus, &pmap->pm_active); if (!CPU_EMPTY(&other_cpus)) smp_masked_invlpg(other_cpus, va); } sched_unpin(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t other_cpus; vm_offset_t addr; u_int cpuid; sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); smp_invlpg_range(sva, eva); } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); CPU_AND(&other_cpus, &pmap->pm_active); if (!CPU_EMPTY(&other_cpus)) smp_masked_invlpg_range(other_cpus, sva, eva); } sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { cpuset_t other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { invltlb(); smp_invltlb(); } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invltlb(); CPU_AND(&other_cpus, &pmap->pm_active); if (!CPU_EMPTY(&other_cpus)) smp_masked_invltlb(other_cpus); } sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_kernel(void *arg) { struct pde_action *act = arg; pd_entry_t *pde; pmap_t pmap; if (act->store == PCPU_GET(cpuid)) { /* * Elsewhere, this operation requires allpmaps_lock for * synchronization. Here, it does not because it is being * performed in the context of an all_cpus rendezvous. */ LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, act->va); pde_store(pde, act->newpde); } } } static void pmap_update_pde_user(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pde_store(act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; else active = pmap->pm_active; if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendevous_barrier, pmap == kernel_pmap ? pmap_update_pde_kernel : pmap_update_pde_user, pmap_update_pde_teardown, &act); } else { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); } #endif /* !SMP */ #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) { if (force) { sva &= ~(vm_offset_t)cpu_clflush_line_size; } else { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ else if ((cpu_feature & CPUID_CLFSH) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC * range. The local APIC is always uncached, so we * don't need to flush for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Otherwise, do per-cache line flush. Use the mfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); mfence(); } else { /* * No targeted cache flush methods are supported by CPU, * or the supplied range is bigger than 2MB. * Globally invalidate cache. */ pmap_invalidate_cache(); } } void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { int i; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || (cpu_feature & CPUID_CLFSH) == 0) { pmap_invalidate_cache(); } else { for (i = 0; i < count; i++) pmap_flush_page(pages[i]); } } /* * Are we current address space or kernel? N.B. We return FALSE when * a pmap's page table is in use because a kernel thread is borrowing * it. The borrowed page table can change spontaneously, making any * dependence on its continued use subject to a race condition. */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (NULL); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } /* * NB: The sequence of updating a page table followed by accesses to the * corresponding pages is subject to the situation described in the "AMD64 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG * right after modifying the PTE bits is crucial. */ static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); else { pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); pmap_pte_release(pte); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte, *ptep; vm_page_t m; vm_paddr_t pa; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { ptep = pmap_pte(pmap, va); pte = *ptep; pmap_pte_release(ptep); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t superpage_offset; pd_entry_t newpde; va = *virt; /* * Does the physical address range's size and alignment permit at * least one superpage mapping to be created? */ superpage_offset = start & PDRMASK; if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of superpage mappings. */ if ((va & PDRMASK) < superpage_offset) va = (va & ~PDRMASK) + superpage_offset; else if ((va & PDRMASK) > superpage_offset) va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; } sva = va; while (start < end) { if ((start & PDRMASK) == 0 && end - start >= NBPDR && pseflag) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); newpde = start | PG_PS | pgeflag | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; } else { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pgeflag | PG_RW | PG_V); } pte++; } if (__predict_false((oldpte & PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Looks for a page table page mapping the specified virtual address in the * specified pmap's collection of idle page table pages. Returns NULL if there * is no page table page corresponding to the specified virtual address. */ static __inline vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT)); } /* * Removes the specified page table page from the specified pmap's collection * of idle page table pages. The specified page table page must be a member of * the pmap's collection. */ static __inline void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); vm_radix_remove(&pmap->pm_root, mpte->pindex); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&cnt.v_wire_count, 1); /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) { pd_entry_t ptepde; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, mpte, free)); } /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); /* * Since the page table directory is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; vm_paddr_t pa; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); if (pmap->pm_pdir == NULL) return (0); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root.rt_root = 0; } KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; else { ptdpg[i++] = m; } } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) if ((ptdpg[i]->flags & PG_ZERO) == 0) pagezero(pmap->pm_pdir + (i * NPDEPG)); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); /* Copy the kernel page table directory entries. */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); mtx_unlock_spin(&allpmaps_lock); /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt[i] = pa | PG_V; #endif } CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); VM_WAIT; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { u_int ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ #ifdef SMP /* * Deal with a SMP shootdown of other users of the pmap that we are * trying to dispose of. This can be a bit hairy. */ static cpuset_t *lazymask; static u_int lazyptd; static volatile u_int lazywait; void pmap_lazyfix_action(void); void pmap_lazyfix_action(void) { #ifdef COUNT_IPIS (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++; #endif if (rcr3() == lazyptd) load_cr3(curpcb->pcb_cr3); CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask); atomic_store_rel_int(&lazywait, 1); } static void pmap_lazyfix_self(u_int cpuid) { if (rcr3() == lazyptd) load_cr3(curpcb->pcb_cr3); CPU_CLR_ATOMIC(cpuid, lazymask); } static void pmap_lazyfix(pmap_t pmap) { cpuset_t mymask, mask; u_int cpuid, spins; int lsb; mask = pmap->pm_active; while (!CPU_EMPTY(&mask)) { spins = 50000000; /* Find least significant set bit. */ lsb = CPU_FFS(&mask); MPASS(lsb != 0); lsb--; CPU_SETOF(lsb, &mask); mtx_lock_spin(&smp_ipi_mtx); #if defined(PAE) || defined(PAE_TABLES) lazyptd = vtophys(pmap->pm_pdpt); #else lazyptd = vtophys(pmap->pm_pdir); #endif cpuid = PCPU_GET(cpuid); /* Use a cpuset just for having an easy check. */ CPU_SETOF(cpuid, &mymask); if (!CPU_CMP(&mask, &mymask)) { lazymask = &pmap->pm_active; pmap_lazyfix_self(cpuid); } else { atomic_store_rel_int((u_int *)&lazymask, (u_int)&pmap->pm_active); atomic_store_rel_int(&lazywait, 0); ipi_selected(mask, IPI_LAZYPMAP); while (lazywait == 0) { ia32_pause(); if (--spins == 0) break; } } mtx_unlock_spin(&smp_ipi_mtx); if (spins == 0) printf("pmap_lazyfix: spun for 50000000\n"); mask = pmap->pm_active; } } #else /* SMP */ /* * Cleaning up on uniprocessor is easy. For various reasons, we're * unlikely to have to even execute this code, including the fact * that the cleanup is deferred until the parent does a wait(2), which * means that another userland process has run. */ static void pmap_lazyfix(pmap_t pmap) { u_int cr3; cr3 = vtophys(pmap->pm_pdir); if (cr3 == rcr3()) { load_cr3(curpcb->pcb_cr3); CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active); } } #endif /* SMP */ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); pmap_lazyfix(pmap); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); for (i = 0; i < NPGPTD; i++) ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & PG_FRAME); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = ptdpg[i]; #if defined(PAE) || defined(PAE_TABLES) KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free_zero(m); } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, NBPDR); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfl(inuse); pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pte(pmap, va); tpte = *pte; if ((tpte & PG_W) == 0) tpte = pte_load_clear(pte); pmap_pte_release(pte); if ((tpte & PG_W) != 0) continue; KASSERT(tpte != 0, ("pmap_pv_reclaim: pmap %p va %x zero pte", pmap, va)); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&cnt.v_wire_count, 1); } pmap_free_zero_pages(&free); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, 0); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2- or 4MB page mapping. If demotion fails, the * 2- or 4MB page mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != NULL) pmap_remove_pt_page(pmap, mpte); else { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2- or 4MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free); pmap_invalidate_page(pmap, trunc_4mpage(va)); pmap_free_zero_pages(&free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; } mptepa = VM_PAGE_TO_PHYS(mpte); /* * If the page mapping is in the kernel's address space, then the * KPTmap can provide access to the page table page. Otherwise, * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ if (va >= KERNBASE) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the page table page is new, initialize it. */ if (mpte->wire_count == 1) { mpte->wire_count = NPTEPG; pmap_fill_ptp(firstpte, newpte); } KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * Removes a 2- or 4MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_lookup_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); pmap_remove_pt_page(pmap, mpte); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* * Initialize the page table page. */ pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pmap_kenter_pde(va, newpde); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpde & PG_G) pmap_invalidate_page(kernel_pmap, sva); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_lookup_pt_page(pmap, sva); if (mpte != NULL) { pmap_remove_pt_page(pmap, mpte); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); atomic_subtract_int(&cnt.v_wire_count, 1); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, struct spglist *free) { pt_entry_t oldpte; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); KASSERT(oldpte != 0, ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt_entry_t *pte; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { u_int pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, &free)) break; } } out: sched_unpin(); if (anyvalid) pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", pmap, pv->pv_va)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if (oldpde & PG_MANAGED) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (newpde != oldpde) { if (!pde_cmpset(pde, oldpde, newpde)) goto retry; if (oldpde & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; boolean_t anychanged, pv_lists_locked; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #if defined(PAE) || defined(PAE_TABLES) if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; u_int pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = TRUE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* * The large page mapping was * destroyed. */ continue; } } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; #endif if (pbits != obits) { #if defined(PAE) || defined(PAE_TABLES) if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page (PTP) to a single 2- or 4MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_offset_t oldpteva; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2- or 4MB page. */ firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, PG_PS | newpde); else pde_store(pde, PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pd_entry_t *pde; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t invlva, wired; va = trunc_page(va); mpte = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) panic("pmap_enter: attempted pmap_enter on 4MB page"); pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); pa |= PG_MANAGED; } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; if ((newpte & PG_MANAGED) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; #endif if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { newpte |= PG_A; if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_aflag_set(om, PGA_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m)) invlva = TRUE; #if defined(PAE) || defined(PAE_TABLES) if ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0) invlva = TRUE; #endif } if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) invlva = TRUE; } if ((origpte & PG_MANAGED) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); } /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t *pde, newpde; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (*pde != 0) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) { newpde |= PG_MANAGED; /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; /* * Increment counters. */ pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pg_ps_enabled && pmap_enter_pde(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; struct spglist free; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; #endif /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) != 0) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; vm_paddr_t pa, ptepa; vm_page_t p; int pat_mode; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if (pseflag && (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2/4MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and * "size" is a multiple of 2/4M, adding the PAT setting to * "pa" will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_pde(pmap, addr); if (*pde == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } /* Else continue on if the PDE is already valid. */ addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t *pde; pt_entry_t *pte; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * Regardless of whether a pde (or pte) is 32 * or 64 bits in size, PG_W is among the least * significant 32 bits. */ atomic_clear_int((u_int *)pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. * * PG_W is among the least significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; u_int ptepindex; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) pdnxt = end_addr; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; if (dst_pmap->pm_pdir[ptepindex] == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & PG_PS_FRAME))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dstmpte == NULL) goto out; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { #if defined(CPU_ENABLE_SSE) if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else #endif i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_zero_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(sysmaps->CADDR2); pagezero(sysmaps->CADDR2); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_zero_page_area: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(sysmaps->CADDR2); if (off == 0 && size == PAGE_SIZE) pagezero(sysmaps->CADDR2); else bzero((char *)sysmaps->CADDR2 + off, size); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { if (*CMAP3) panic("pmap_zero_page_idle: CMAP3 busy"); sched_pin(); *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(CADDR3); pagezero(CADDR3); *CMAP3 = 0; sched_unpin(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP1) panic("pmap_copy_page: CMAP1 busy"); if (*sysmaps->CMAP2) panic("pmap_copy_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | pmap_cache_bits(src->md.pat_mode, 0); invlcaddr(sysmaps->CADDR1); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | pmap_cache_bits(dst->md.pat_mode, 0); invlcaddr(sysmaps->CADDR2); bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); *sysmaps->CMAP1 = 0; *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { struct sysmaps *sysmaps; vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*sysmaps->CMAP2 != 0) panic("pmap_copy_pages: CMAP2 busy"); sched_pin(); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | pmap_cache_bits(a_pg->md.pat_mode, 0); invlcaddr(sysmaps->CADDR1); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); invlcaddr(sysmaps->CADDR2); a_cp = sysmaps->CADDR1 + a_pg_offset; b_cp = sysmaps->CADDR2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } *sysmaps->CMAP1 = 0; *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, pc->pc_pmap)); allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { pte = vtopte(pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_lookup_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap_remove_pt_page(pmap, mpte); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); atomic_subtract_int(&cnt.v_wire_count, 1); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } sched_unpin(); pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 2mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (*pde != 0 && (*pde & PG_PS) == 0) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 4mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } #define PMAP_TS_REFERENCED_MAX 5 /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by either 1024 * or 512 4KB pages, it should not be cleared every * time it is tested. Apply a simple "hash" function * on the physical page number, the virtual superpage * number, and the pmap address to select one 4KB page * out of the 1024 or 512 on which testing the * reference bit will result in clearing that bit. * This function is designed to avoid the selection of * the same 4KB page for every 2- or 4MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { atomic_clear_int((u_int *)pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t pdnxt; vm_page_t m; boolean_t anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pte_quick(pmap, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, NULL); anychanged = TRUE; } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) continue; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_int((u_int *)pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_int((u_int *)pte, PG_A); else continue; if ((*pte & PG_G) != 0) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde(pmap, pde, va)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pte_quick(pmap, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ while (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~PG_PTE_CACHE; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~PG_PDE_CACHE; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = pa & PG_FRAME; if (pa < KERNLOAD && pa + size <= KERNLOAD) va = KERNBASE + pa; else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + size, FALSE); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; /* * If "m" is a normal page, flush it from the cache. * See pmap_invalidate_cache_range(). * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m)) return; /* * If page is not mapped by sf buffer, but CPU does not * support self snoop, map the page transient and do * invalidation. In the worst case, whole cache is flushed by * pmap_invalidate_cache_range(). */ if ((cpu_feature & CPUID_SS) == 0) pmap_flush_page(m); } static void pmap_flush_page(vm_page_t m) { struct sysmaps *sysmaps; vm_offset_t sva, eva; if ((cpu_feature & CPUID_CLFSH) != 0) { sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_flush_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(sysmaps->CADDR2); sva = (vm_offset_t)sysmaps->CADDR2; eva = sva + PAGE_SIZE; /* * Use mfence despite the ordering implied by * mtx_{un,}lock() because clflush is not guaranteed * to be ordered by any other instruction. */ mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); mfence(); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } else pmap_invalidate_cache(); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde; boolean_t changed; base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses above the recursive map. */ if (base < VM_MIN_KERNEL_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(mode, 1); cache_bits_pte = pmap_cache_bits(mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down * 2/4MB pages into 4KB pages if required. */ PMAP_LOCK(kernel_pmap); for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2/4MB page already has * the required memory type, then we need not * demote this page. Just increment tmpva to * the next 2/4MB page frame. */ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_4mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2/4MB * page frame and there is at least 2/4MB left * within the range, then we need not break * down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { PMAP_UNLOCK(kernel_pmap); return (ENOMEM); } } pte = vtopte(tmpva); if (*pte == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } tmpva += PAGE_SIZE; } PMAP_UNLOCK(kernel_pmap); /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); changed = TRUE; } tmpva = trunc_4mpage(tmpva) + NBPDR; } else { pte = vtopte(tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); changed = TRUE; } tmpva += PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva, FALSE); } return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t *ptep, pte; vm_paddr_t pa; int val; PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (*pdep != 0) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { ptep = pmap_pte(pmap, addr); pte = *ptep; pmap_pte_release(ptep); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif #if defined(PAE) || defined(PAE_TABLES) cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte); } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa & PG_FRAME); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return (npte); } #endif #if defined(DEBUG) static void pads(pmap_t pm); void pmap_pvdump(vm_paddr_t pa); /* print address space of pmap*/ static void pads(pmap_t pm) { int i, j; vm_paddr_t va; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < NPDEPTD; i++) if (pm->pm_pdir[i]) for (j = 0; j < NPTEPG; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *ptep); }; } void pmap_pvdump(vm_paddr_t pa) { pv_entry_t pv; pmap_t pmap; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); pads(pmap); } printf(" "); } #endif Index: stable/10/sys/ia64/ia64/uma_machdep.c =================================================================== --- stable/10/sys/ia64/ia64/uma_machdep.c (revision 287944) +++ stable/10/sys/ia64/ia64/uma_machdep.c (revision 287945) @@ -1,77 +1,77 @@ /*- * Copyright (c) 2003 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include void * -uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { void *va; vm_page_t m; int pflags; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc(NULL, 0, pflags | VM_ALLOC_NOOBJ); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); VM_WAIT; } else break; } va = (void *)IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m)); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } void -uma_small_free(void *mem, int size, u_int8_t flags) +uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; m = PHYS_TO_VM_PAGE(IA64_RR_MASK((u_int64_t)mem)); m->wire_count--; vm_page_free(m); atomic_subtract_int(&cnt.v_wire_count, 1); } Index: stable/10/sys/kern/kern_mbuf.c =================================================================== --- stable/10/sys/kern/kern_mbuf.c (revision 287944) +++ stable/10/sys/kern/kern_mbuf.c (revision 287945) @@ -1,694 +1,694 @@ /*- * Copyright (c) 2004, 2005, * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA * Zones. * * Mbuf Clusters (2K, contiguous) are allocated from the Cluster * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the * administrator so desires. * * Mbufs are allocated from a UMA Master Zone called the Mbuf * Zone. * * Additionally, FreeBSD provides a Packet Zone, which it * configures as a Secondary Zone to the Mbuf Master Zone, * thus sharing backend Slab kegs with the Mbuf Master Zone. * * Thus common-case allocations and locking are simplified: * * m_clget() m_getcl() * | | * | .------------>[(Packet Cache)] m_get(), m_gethdr() * | | [ Packet ] | * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ] * | \________ | * [ Cluster Keg ] \ / * | [ Mbuf Keg ] * [ Cluster Slabs ] | * | [ Mbuf Slabs ] * \____________(VM)_________________/ * * * Whenever an object is allocated with uma_zalloc() out of * one of the Zones its _ctor_ function is executed. The same * for any deallocation through uma_zfree() the _dtor_ function * is executed. * * Caches are per-CPU and are filled from the Master Zone. * * Whenever an object is allocated from the underlying global * memory pool it gets pre-initialized with the _zinit_ functions. * When the Keg's are overfull objects get decomissioned with * _zfini_ functions and free'd back to the global memory pool. * */ int nmbufs; /* limits number of mbufs */ int nmbclusters; /* limits number of mbuf clusters */ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN, &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types"); /* * tunable_mbinit() has to be run before any mbuf allocations are done. */ static void tunable_mbinit(void *dummy) { quad_t realmem; /* * The default limit for all mbuf related memory is 1/2 of all * available kernel memory (physical or kmem). * At most it can be 3/4 of available kernel memory. */ realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); maxmbufmem = realmem / 2; TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); if (maxmbufmem > realmem / 4 * 3) maxmbufmem = realmem / 4 * 3; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); if (nmbclusters == 0) nmbclusters = maxmbufmem / MCLBYTES / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); if (nmbjumbop == 0) nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); if (nmbjumbo9 == 0) nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); if (nmbjumbo16 == 0) nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; /* * We need at least as many mbufs as we have clusters of * the various types added together. */ TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) nmbufs = lmax(maxmbufmem / MSIZE / 5, nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); } SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); static int sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) { int error, newnmbclusters; newnmbclusters = nmbclusters; error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { if (newnmbclusters > nmbclusters && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbclusters = newnmbclusters; nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); EVENTHANDLER_INVOKE(nmbclusters_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW, &nmbclusters, 0, sysctl_nmbclusters, "IU", "Maximum number of mbuf clusters allowed"); static int sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbop; newnmbjumbop = nmbjumbop; error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { if (newnmbjumbop > nmbjumbop && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbop = newnmbjumbop; nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbop, 0, sysctl_nmbjumbop, "IU", "Maximum number of mbuf page size jumbo clusters allowed"); static int sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo9; newnmbjumbo9 = nmbjumbo9; error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { if (newnmbjumbo9 > nmbjumbo9 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo9 = newnmbjumbo9; nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", "Maximum number of mbuf 9k jumbo clusters allowed"); static int sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo16; newnmbjumbo16 = nmbjumbo16; error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { if (newnmbjumbo16 > nmbjumbo16 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo16 = newnmbjumbo16; nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU", "Maximum number of mbuf 16k jumbo clusters allowed"); static int sysctl_nmbufs(SYSCTL_HANDLER_ARGS) { int error, newnmbufs; newnmbufs = nmbufs; error = sysctl_handle_int(oidp, &newnmbufs, 0, req); if (error == 0 && req->newptr && newnmbufs != nmbufs) { if (newnmbufs > nmbufs) { nmbufs = newnmbufs; nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); EVENTHANDLER_INVOKE(nmbufs_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW, &nmbufs, 0, sysctl_nmbufs, "IU", "Maximum number of mbufs allowed"); /* * Zones from which we allocate. */ uma_zone_t zone_mbuf; uma_zone_t zone_clust; uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; uma_zone_t zone_ext_refcnt; /* * Local prototypes. */ static int mb_ctor_mbuf(void *, int, void *, int); static int mb_ctor_clust(void *, int, void *, int); static int mb_ctor_pack(void *, int, void *, int); static void mb_dtor_mbuf(void *, int, void *); static void mb_dtor_clust(void *, int, void *); static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(void *); -static void *mbuf_jumbo_alloc(uma_zone_t, int, uint8_t *, int); +static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); /* * Initialize FreeBSD Network buffer allocation. */ static void mbuf_init(void *dummy) { /* * Configure UMA zones for Mbufs, Clusters, and Packets. */ zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif MSIZE - 1, UMA_ZONE_MAXBUCKET); if (nmbufs > 0) nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, mb_ctor_clust, mb_dtor_clust, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); if (nmbclusters > 0) nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); /* Make jumbo frame zone too. Page size, 9k and 16k. */ zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, mb_ctor_clust, mb_dtor_clust, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); if (nmbjumbop > 0) nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, mb_ctor_clust, mb_dtor_clust, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc); if (nmbjumbo9 > 0) nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, mb_ctor_clust, mb_dtor_clust, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc); if (nmbjumbo16 > 0) nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); /* uma_prealloc() goes here... */ /* * Hook event handler for low-memory situation, used to * drain protocols and push data back to the caches (UMA * later pushes it back to VM). */ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); /* * UMA backend page allocator for the jumbo frame zones. * * Allocates kernel virtual memory that is backed by contiguous physical * pages. */ static void * -mbuf_jumbo_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait) +mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); } /* * Constructor for Mbuf master zone. * * The 'arg' pointer points to a mb_args structure which * contains call-specific information required to support the * mbuf allocation API. See mbuf.h. */ static int mb_ctor_mbuf(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error; int flags; short type; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif args = (struct mb_args *)arg; type = args->type; /* * The mbuf is initialized later. The caller has the * responsibility to set up any MAC labels too. */ if (type == MT_NOINIT) return (0); m = (struct mbuf *)mem; flags = args->flags; error = m_init(m, NULL, size, how, type, flags); return (error); } /* * The Mbuf master zone destructor. */ static void mb_dtor_mbuf(void *mem, int size, void *arg) { struct mbuf *m; unsigned long flags; m = (struct mbuf *)mem; flags = (unsigned long)arg; if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) m_tag_delete_chain(m, NULL); KASSERT((m->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__)); KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); #ifdef INVARIANTS trash_dtor(mem, size, arg); #endif } /* * The Mbuf Packet zone destructor. */ static void mb_dtor_pack(void *mem, int size, void *arg) { struct mbuf *m; m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); /* Make sure we've got a clean cluster back. */ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); KASSERT(*m->m_ext.ref_cnt == 1, ("%s: ref_cnt != 1", __func__)); #ifdef INVARIANTS trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); #endif /* * If there are processes blocked on zone_clust, waiting for pages * to be freed up, * cause them to be woken up by draining the * packet zone. We are exposed to a race here * (in the check for * the UMA_ZFLAG_FULL) where we might miss the flag set, but that * is deliberate. We don't want to acquire the zone lock for every * mbuf free. */ if (uma_zone_exhausted_nolock(zone_clust)) zone_drain(zone_pack); } /* * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. * * Here the 'arg' pointer points to the Mbuf which we * are configuring cluster storage for. If 'arg' is * empty we allocate just the cluster without setting * the mbuf to it. See mbuf.h. */ static int mb_ctor_clust(void *mem, int size, void *arg, int how) { struct mbuf *m; u_int *refcnt; int type; uma_zone_t zone; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif switch (size) { case MCLBYTES: type = EXT_CLUSTER; zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: type = EXT_JUMBOP; zone = zone_jumbop; break; #endif case MJUM9BYTES: type = EXT_JUMBO9; zone = zone_jumbo9; break; case MJUM16BYTES: type = EXT_JUMBO16; zone = zone_jumbo16; break; default: panic("unknown cluster size"); break; } m = (struct mbuf *)arg; refcnt = uma_find_refcnt(zone, mem); *refcnt = 1; if (m != NULL) { m->m_ext.ext_buf = (caddr_t)mem; m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT; m->m_ext.ext_free = NULL; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = type; m->m_ext.ext_flags = 0; m->m_ext.ref_cnt = refcnt; } return (0); } /* * The Mbuf Cluster zone destructor. */ static void mb_dtor_clust(void *mem, int size, void *arg) { #ifdef INVARIANTS uma_zone_t zone; zone = m_getzone(size); KASSERT(*(uma_find_refcnt(zone, mem)) <= 1, ("%s: refcnt incorrect %u", __func__, *(uma_find_refcnt(zone, mem))) ); trash_dtor(mem, size, arg); #endif } /* * The Packet secondary zone's init routine, executed on the * object's transition from mbuf keg slab to zone cache. */ static int mb_zinit_pack(void *mem, int size, int how) { struct mbuf *m; m = (struct mbuf *)mem; /* m is virgin. */ if (uma_zalloc_arg(zone_clust, m, how) == NULL || m->m_ext.ext_buf == NULL) return (ENOMEM); m->m_ext.ext_type = EXT_PACKET; /* Override. */ #ifdef INVARIANTS trash_init(m->m_ext.ext_buf, MCLBYTES, how); #endif return (0); } /* * The Packet secondary zone's fini routine, executed on the * object's transition from zone cache to keg slab. */ static void mb_zfini_pack(void *mem, int size) { struct mbuf *m; m = (struct mbuf *)mem; #ifdef INVARIANTS trash_fini(m->m_ext.ext_buf, MCLBYTES); #endif uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); #ifdef INVARIANTS trash_dtor(mem, size, NULL); #endif } /* * The "packet" keg constructor. */ static int mb_ctor_pack(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error, flags; short type; m = (struct mbuf *)mem; args = (struct mb_args *)arg; flags = args->flags; type = args->type; #ifdef INVARIANTS trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); #endif error = m_init(m, NULL, size, how, type, flags); /* m_ext is already initialized. */ m->m_data = m->m_ext.ext_buf; m->m_flags = (flags | M_EXT); return (error); } int m_pkthdr_init(struct mbuf *m, int how) { #ifdef MAC int error; #endif m->m_data = m->m_pktdat; m->m_pkthdr.rcvif = NULL; SLIST_INIT(&m->m_pkthdr.tags); m->m_pkthdr.len = 0; m->m_pkthdr.flowid = 0; m->m_pkthdr.csum_flags = 0; m->m_pkthdr.fibnum = 0; m->m_pkthdr.cosqos = 0; m->m_pkthdr.rsstype = 0; m->m_pkthdr.l2hlen = 0; m->m_pkthdr.l3hlen = 0; m->m_pkthdr.l4hlen = 0; m->m_pkthdr.l5hlen = 0; m->m_pkthdr.PH_per.sixtyfour[0] = 0; m->m_pkthdr.PH_loc.sixtyfour[0] = 0; #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); if (error) return (error); #endif return (0); } /* * This is the protocol drain routine. * * No locks should be held when this is called. The drain routines have to * presently acquire some locks which raises the possibility of lock order * reversal. */ static void mb_reclaim(void *junk) { struct domain *dp; struct protosw *pr; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, "mb_reclaim()"); for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) (*pr->pr_drain)(); } Index: stable/10/sys/kern/subr_busdma_bufalloc.c =================================================================== --- stable/10/sys/kern/subr_busdma_bufalloc.c (revision 287944) +++ stable/10/sys/kern/subr_busdma_bufalloc.c (revision 287945) @@ -1,174 +1,174 @@ /*- * Copyright (c) 2012 Ian Lepore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Buffer allocation support routines for bus_dmamem_alloc implementations. */ #include #include #include #include #include #include #include #include #include /* * We manage buffer zones up to a page in size. Buffers larger than a page can * be managed by one of the kernel's page-oriented memory allocation routines as * efficiently as what we can do here. Also, a page is the largest size for * which we can g'tee contiguity when using uma, and contiguity is one of the * requirements we have to fulfill. */ #define MIN_ZONE_BUFSIZE 32 #define MAX_ZONE_BUFSIZE PAGE_SIZE /* * The static array of 12 bufzones is big enough to handle all the zones for the * smallest supported allocation size of 32 through the largest supported page * size of 64K. If you up the biggest page size number, up the array size too. * Basically the size of the array needs to be log2(maxsize)-log2(minsize)+1, * but I don't know of an easy way to express that as a compile-time constant. */ #if PAGE_SIZE > 65536 #error Unsupported page size #endif struct busdma_bufalloc { bus_size_t min_size; size_t num_zones; struct busdma_bufzone buf_zones[12]; }; busdma_bufalloc_t busdma_bufalloc_create(const char *name, bus_size_t minimum_alignment, uma_alloc alloc_func, uma_free free_func, u_int32_t zcreate_flags) { struct busdma_bufalloc *ba; struct busdma_bufzone *bz; int i; bus_size_t cursize; ba = malloc(sizeof(struct busdma_bufalloc), M_DEVBUF, M_ZERO | M_WAITOK); ba->min_size = MAX(MIN_ZONE_BUFSIZE, minimum_alignment); /* * Each uma zone is created with an alignment of size-1, meaning that * the alignment is equal to the size (I.E., 64 byte buffers are aligned * to 64 byte boundaries, etc). This allows for a fast efficient test * when deciding whether a pool buffer meets the constraints of a given * tag used for allocation: the buffer is usable if tag->alignment <= * bufzone->size. */ for (i = 0, bz = ba->buf_zones, cursize = ba->min_size; i < nitems(ba->buf_zones) && cursize <= MAX_ZONE_BUFSIZE; ++i, ++bz, cursize <<= 1) { snprintf(bz->name, sizeof(bz->name), "dma %.10s %lu", name, cursize); bz->size = cursize; bz->umazone = uma_zcreate(bz->name, bz->size, NULL, NULL, NULL, NULL, bz->size - 1, zcreate_flags); if (bz->umazone == NULL) { busdma_bufalloc_destroy(ba); return (NULL); } if (alloc_func != NULL) uma_zone_set_allocf(bz->umazone, alloc_func); if (free_func != NULL) uma_zone_set_freef(bz->umazone, free_func); ++ba->num_zones; } return (ba); } void busdma_bufalloc_destroy(busdma_bufalloc_t ba) { struct busdma_bufzone *bz; int i; if (ba == NULL) return; for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) { uma_zdestroy(bz->umazone); } free(ba, M_DEVBUF); } struct busdma_bufzone * busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size) { struct busdma_bufzone *bz; int i; if (size > MAX_ZONE_BUFSIZE) return (NULL); for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) { if (bz->size >= size) return (bz); } panic("Didn't find a buffer zone of the right size"); } void * -busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, u_int8_t *pflag, - int wait) +busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, + uint8_t *pflag, int wait) { #ifdef VM_MEMATTR_UNCACHEABLE /* Inform UMA that this allocator uses kernel_arena/object. */ *pflag = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_attr(kernel_arena, size, wait, 0, BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE)); #else panic("VM_MEMATTR_UNCACHEABLE unavailable"); #endif /* VM_MEMATTR_UNCACHEABLE */ } void -busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag) +busdma_bufalloc_free_uncacheable(void *item, vm_size_t size, uint8_t pflag) { kmem_free(kernel_arena, (vm_offset_t)item, size); } Index: stable/10/sys/kern/subr_vmem.c =================================================================== --- stable/10/sys/kern/subr_vmem.c (revision 287944) +++ stable/10/sys/kern/subr_vmem.c (revision 287945) @@ -1,1513 +1,1513 @@ /*- * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi, * Copyright (c) 2013 EMC Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * From: * $NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $ * $NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $ */ /* * reference: * - Magazines and Vmem: Extending the Slab Allocator * to Many CPUs and Arbitrary Resources * http://www.usenix.org/event/usenix01/bonwick.html */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #define VMEM_OPTORDER 5 #define VMEM_OPTVALUE (1 << VMEM_OPTORDER) #define VMEM_MAXORDER \ (VMEM_OPTVALUE - 1 + sizeof(vmem_size_t) * NBBY - VMEM_OPTORDER) #define VMEM_HASHSIZE_MIN 16 #define VMEM_HASHSIZE_MAX 131072 #define VMEM_QCACHE_IDX_MAX 16 #define VMEM_FITMASK (M_BESTFIT | M_FIRSTFIT) #define VMEM_FLAGS \ (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM | M_BESTFIT | M_FIRSTFIT) #define BT_FLAGS (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM) #define QC_NAME_MAX 16 /* * Data structures private to vmem. */ MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures"); typedef struct vmem_btag bt_t; TAILQ_HEAD(vmem_seglist, vmem_btag); LIST_HEAD(vmem_freelist, vmem_btag); LIST_HEAD(vmem_hashlist, vmem_btag); struct qcache { uma_zone_t qc_cache; vmem_t *qc_vmem; vmem_size_t qc_size; char qc_name[QC_NAME_MAX]; }; typedef struct qcache qcache_t; #define QC_POOL_TO_QCACHE(pool) ((qcache_t *)(pool->pr_qcache)) #define VMEM_NAME_MAX 16 /* vmem arena */ struct vmem { struct mtx_padalign vm_lock; struct cv vm_cv; char vm_name[VMEM_NAME_MAX+1]; LIST_ENTRY(vmem) vm_alllist; struct vmem_hashlist vm_hash0[VMEM_HASHSIZE_MIN]; struct vmem_freelist vm_freelist[VMEM_MAXORDER]; struct vmem_seglist vm_seglist; struct vmem_hashlist *vm_hashlist; vmem_size_t vm_hashsize; /* Constant after init */ vmem_size_t vm_qcache_max; vmem_size_t vm_quantum_mask; vmem_size_t vm_import_quantum; int vm_quantum_shift; /* Written on alloc/free */ LIST_HEAD(, vmem_btag) vm_freetags; int vm_nfreetags; int vm_nbusytag; vmem_size_t vm_inuse; vmem_size_t vm_size; /* Used on import. */ vmem_import_t *vm_importfn; vmem_release_t *vm_releasefn; void *vm_arg; /* Space exhaustion callback. */ vmem_reclaim_t *vm_reclaimfn; /* quantum cache */ qcache_t vm_qcache[VMEM_QCACHE_IDX_MAX]; }; /* boundary tag */ struct vmem_btag { TAILQ_ENTRY(vmem_btag) bt_seglist; union { LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */ LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */ } bt_u; #define bt_hashlist bt_u.u_hashlist #define bt_freelist bt_u.u_freelist vmem_addr_t bt_start; vmem_size_t bt_size; int bt_type; }; #define BT_TYPE_SPAN 1 /* Allocated from importfn */ #define BT_TYPE_SPAN_STATIC 2 /* vmem_add() or create. */ #define BT_TYPE_FREE 3 /* Available space. */ #define BT_TYPE_BUSY 4 /* Used space. */ #define BT_ISSPAN_P(bt) ((bt)->bt_type <= BT_TYPE_SPAN_STATIC) #define BT_END(bt) ((bt)->bt_start + (bt)->bt_size - 1) #if defined(DIAGNOSTIC) static int enable_vmem_check = 1; SYSCTL_INT(_debug, OID_AUTO, vmem_check, CTLFLAG_RWTUN, &enable_vmem_check, 0, "Enable vmem check"); static void vmem_check(vmem_t *); #endif static struct callout vmem_periodic_ch; static int vmem_periodic_interval; static struct task vmem_periodic_wk; static struct mtx_padalign vmem_list_lock; static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list); /* ---- misc */ #define VMEM_CONDVAR_INIT(vm, wchan) cv_init(&vm->vm_cv, wchan) #define VMEM_CONDVAR_DESTROY(vm) cv_destroy(&vm->vm_cv) #define VMEM_CONDVAR_WAIT(vm) cv_wait(&vm->vm_cv, &vm->vm_lock) #define VMEM_CONDVAR_BROADCAST(vm) cv_broadcast(&vm->vm_cv) #define VMEM_LOCK(vm) mtx_lock(&vm->vm_lock) #define VMEM_TRYLOCK(vm) mtx_trylock(&vm->vm_lock) #define VMEM_UNLOCK(vm) mtx_unlock(&vm->vm_lock) #define VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF) #define VMEM_LOCK_DESTROY(vm) mtx_destroy(&vm->vm_lock) #define VMEM_ASSERT_LOCKED(vm) mtx_assert(&vm->vm_lock, MA_OWNED); #define VMEM_ALIGNUP(addr, align) (-(-(addr) & -(align))) #define VMEM_CROSS_P(addr1, addr2, boundary) \ ((((addr1) ^ (addr2)) & -(boundary)) != 0) #define ORDER2SIZE(order) ((order) < VMEM_OPTVALUE ? ((order) + 1) : \ (vmem_size_t)1 << ((order) - (VMEM_OPTVALUE - VMEM_OPTORDER - 1))) #define SIZE2ORDER(size) ((size) <= VMEM_OPTVALUE ? ((size) - 1) : \ (flsl(size) + (VMEM_OPTVALUE - VMEM_OPTORDER - 2))) /* * Maximum number of boundary tags that may be required to satisfy an * allocation. Two may be required to import. Another two may be * required to clip edges. */ #define BT_MAXALLOC 4 /* * Max free limits the number of locally cached boundary tags. We * just want to avoid hitting the zone allocator for every call. */ #define BT_MAXFREE (BT_MAXALLOC * 8) /* Allocator for boundary tags. */ static uma_zone_t vmem_bt_zone; /* boot time arena storage. */ static struct vmem kernel_arena_storage; static struct vmem kmem_arena_storage; static struct vmem buffer_arena_storage; static struct vmem transient_arena_storage; vmem_t *kernel_arena = &kernel_arena_storage; vmem_t *kmem_arena = &kmem_arena_storage; vmem_t *buffer_arena = &buffer_arena_storage; vmem_t *transient_arena = &transient_arena_storage; #ifdef DEBUG_MEMGUARD static struct vmem memguard_arena_storage; vmem_t *memguard_arena = &memguard_arena_storage; #endif /* * Fill the vmem's boundary tag cache. We guarantee that boundary tag * allocation will not fail once bt_fill() passes. To do so we cache * at least the maximum possible tag allocations in the arena. */ static int bt_fill(vmem_t *vm, int flags) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); /* * Only allow the kmem arena to dip into reserve tags. It is the * vmem where new tags come from. */ flags &= BT_FLAGS; if (vm != kmem_arena) flags &= ~M_USE_RESERVE; /* * Loop until we meet the reserve. To minimize the lock shuffle * and prevent simultaneous fills we first try a NOWAIT regardless * of the caller's flags. Specify M_NOVM so we don't recurse while * holding a vmem lock. */ while (vm->vm_nfreetags < BT_MAXALLOC) { bt = uma_zalloc(vmem_bt_zone, (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM); if (bt == NULL) { VMEM_UNLOCK(vm); bt = uma_zalloc(vmem_bt_zone, flags); VMEM_LOCK(vm); if (bt == NULL && (flags & M_NOWAIT) != 0) break; } LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } if (vm->vm_nfreetags < BT_MAXALLOC) return ENOMEM; return 0; } /* * Pop a tag off of the freetag stack. */ static bt_t * bt_alloc(vmem_t *vm) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); bt = LIST_FIRST(&vm->vm_freetags); MPASS(bt != NULL); LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; return bt; } /* * Trim the per-vmem free list. Returns with the lock released to * avoid allocator recursions. */ static void bt_freetrim(vmem_t *vm, int freelimit) { LIST_HEAD(, vmem_btag) freetags; bt_t *bt; LIST_INIT(&freetags); VMEM_ASSERT_LOCKED(vm); while (vm->vm_nfreetags > freelimit) { bt = LIST_FIRST(&vm->vm_freetags); LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; LIST_INSERT_HEAD(&freetags, bt, bt_freelist); } VMEM_UNLOCK(vm); while ((bt = LIST_FIRST(&freetags)) != NULL) { LIST_REMOVE(bt, bt_freelist); uma_zfree(vmem_bt_zone, bt); } } static inline void bt_free(vmem_t *vm, bt_t *bt) { VMEM_ASSERT_LOCKED(vm); MPASS(LIST_FIRST(&vm->vm_freetags) != bt); LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } /* * freelist[0] ... [1, 1] * freelist[1] ... [2, 2] * : * freelist[29] ... [30, 30] * freelist[30] ... [31, 31] * freelist[31] ... [32, 63] * freelist[33] ... [64, 127] * : * freelist[n] ... [(1 << (n - 26)), (1 << (n - 25)) - 1] * : */ static struct vmem_freelist * bt_freehead_tofree(vmem_t *vm, vmem_size_t size) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; const int idx = SIZE2ORDER(qsize); MPASS(size != 0 && qsize != 0); MPASS((size & vm->vm_quantum_mask) == 0); MPASS(idx >= 0); MPASS(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* * bt_freehead_toalloc: return the freelist for the given size and allocation * strategy. * * For M_FIRSTFIT, return the list in which any blocks are large enough * for the requested size. otherwise, return the list which can have blocks * large enough for the requested size. */ static struct vmem_freelist * bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; int idx = SIZE2ORDER(qsize); MPASS(size != 0 && qsize != 0); MPASS((size & vm->vm_quantum_mask) == 0); if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize) { idx++; /* check too large request? */ } MPASS(idx >= 0); MPASS(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* ---- boundary tag hash */ static struct vmem_hashlist * bt_hashhead(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; unsigned int hash; hash = hash32_buf(&addr, sizeof(addr), 0); list = &vm->vm_hashlist[hash % vm->vm_hashsize]; return list; } static bt_t * bt_lookupbusy(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; bt_t *bt; VMEM_ASSERT_LOCKED(vm); list = bt_hashhead(vm, addr); LIST_FOREACH(bt, list, bt_hashlist) { if (bt->bt_start == addr) { break; } } return bt; } static void bt_rembusy(vmem_t *vm, bt_t *bt) { VMEM_ASSERT_LOCKED(vm); MPASS(vm->vm_nbusytag > 0); vm->vm_inuse -= bt->bt_size; vm->vm_nbusytag--; LIST_REMOVE(bt, bt_hashlist); } static void bt_insbusy(vmem_t *vm, bt_t *bt) { struct vmem_hashlist *list; VMEM_ASSERT_LOCKED(vm); MPASS(bt->bt_type == BT_TYPE_BUSY); list = bt_hashhead(vm, bt->bt_start); LIST_INSERT_HEAD(list, bt, bt_hashlist); vm->vm_nbusytag++; vm->vm_inuse += bt->bt_size; } /* ---- boundary tag list */ static void bt_remseg(vmem_t *vm, bt_t *bt) { TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist); bt_free(vm, bt); } static void bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev) { TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist); } static void bt_insseg_tail(vmem_t *vm, bt_t *bt) { TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist); } static void bt_remfree(vmem_t *vm, bt_t *bt) { MPASS(bt->bt_type == BT_TYPE_FREE); LIST_REMOVE(bt, bt_freelist); } static void bt_insfree(vmem_t *vm, bt_t *bt) { struct vmem_freelist *list; list = bt_freehead_tofree(vm, bt->bt_size); LIST_INSERT_HEAD(list, bt, bt_freelist); } /* ---- vmem internal functions */ /* * Import from the arena into the quantum cache in UMA. */ static int qc_import(void *arg, void **store, int cnt, int flags) { qcache_t *qc; vmem_addr_t addr; int i; qc = arg; flags |= M_BESTFIT; for (i = 0; i < cnt; i++) { if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, &addr) != 0) break; store[i] = (void *)addr; /* Only guarantee one allocation. */ flags &= ~M_WAITOK; flags |= M_NOWAIT; } return i; } /* * Release memory from the UMA cache to the arena. */ static void qc_release(void *arg, void **store, int cnt) { qcache_t *qc; int i; qc = arg; for (i = 0; i < cnt; i++) vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size); } static void qc_init(vmem_t *vm, vmem_size_t qcache_max) { qcache_t *qc; vmem_size_t size; int qcache_idx_max; int i; MPASS((qcache_max & vm->vm_quantum_mask) == 0); qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift, VMEM_QCACHE_IDX_MAX); vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) { qc = &vm->vm_qcache[i]; size = (i + 1) << vm->vm_quantum_shift; snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu", vm->vm_name, size); qc->qc_vmem = vm; qc->qc_size = size; qc->qc_cache = uma_zcache_create(qc->qc_name, size, NULL, NULL, NULL, NULL, qc_import, qc_release, qc, UMA_ZONE_VM); MPASS(qc->qc_cache); } } static void qc_destroy(vmem_t *vm) { int qcache_idx_max; int i; qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) uma_zdestroy(vm->vm_qcache[i].qc_cache); } static void qc_drain(vmem_t *vm) { int qcache_idx_max; int i; qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) zone_drain(vm->vm_qcache[i].qc_cache); } #ifndef UMA_MD_SMALL_ALLOC static struct mtx_padalign vmem_bt_lock; /* * vmem_bt_alloc: Allocate a new page of boundary tags. * * On architectures with uma_small_alloc there is no recursion; no address * space need be allocated to allocate boundary tags. For the others, we * must handle recursion. Boundary tags are necessary to allocate new * boundary tags. * * UMA guarantees that enough tags are held in reserve to allocate a new * page of kva. We dip into this reserve by specifying M_USE_RESERVE only * when allocating the page to hold new boundary tags. In this way the * reserve is automatically filled by the allocation that uses the reserve. * * We still have to guarantee that the new tags are allocated atomically since * many threads may try concurrently. The bt_lock provides this guarantee. * We convert WAITOK allocations to NOWAIT and then handle the blocking here * on failure. It's ok to return NULL for a WAITOK allocation as UMA will * loop again after checking to see if we lost the race to allocate. * * There is a small race between vmem_bt_alloc() returning the page and the * zone lock being acquired to add the page to the zone. For WAITOK * allocations we just pause briefly. NOWAIT may experience a transient * failure. To alleviate this we permit a small number of simultaneous * fills to proceed concurrently so NOWAIT is less likely to fail unless * we are really out of KVA. */ static void * -vmem_bt_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait) +vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) { vmem_addr_t addr; *pflag = UMA_SLAB_KMEM; /* * Single thread boundary tag allocation so that the address space * and memory are added in one atomic operation. */ mtx_lock(&vmem_bt_lock); if (vmem_xalloc(kmem_arena, bytes, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT, &addr) == 0) { if (kmem_back(kmem_object, addr, bytes, M_NOWAIT | M_USE_RESERVE) == 0) { mtx_unlock(&vmem_bt_lock); return ((void *)addr); } vmem_xfree(kmem_arena, addr, bytes); mtx_unlock(&vmem_bt_lock); /* * Out of memory, not address space. This may not even be * possible due to M_USE_RESERVE page allocation. */ if (wait & M_WAITOK) VM_WAIT; return (NULL); } mtx_unlock(&vmem_bt_lock); /* * We're either out of address space or lost a fill race. */ if (wait & M_WAITOK) pause("btalloc", 1); return (NULL); } #endif void vmem_startup(void) { mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF); vmem_bt_zone = uma_zcreate("vmem btag", sizeof(struct vmem_btag), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); #ifndef UMA_MD_SMALL_ALLOC mtx_init(&vmem_bt_lock, "btag lock", NULL, MTX_DEF); uma_prealloc(vmem_bt_zone, BT_MAXALLOC); /* * Reserve enough tags to allocate new tags. We allow multiple * CPUs to attempt to allocate new tags concurrently to limit * false restarts in UMA. */ uma_zone_reserve(vmem_bt_zone, BT_MAXALLOC * (mp_ncpus + 1) / 2); uma_zone_set_allocf(vmem_bt_zone, vmem_bt_alloc); #endif } /* ---- rehash */ static int vmem_rehash(vmem_t *vm, vmem_size_t newhashsize) { bt_t *bt; int i; struct vmem_hashlist *newhashlist; struct vmem_hashlist *oldhashlist; vmem_size_t oldhashsize; MPASS(newhashsize > 0); newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize, M_VMEM, M_NOWAIT); if (newhashlist == NULL) return ENOMEM; for (i = 0; i < newhashsize; i++) { LIST_INIT(&newhashlist[i]); } VMEM_LOCK(vm); oldhashlist = vm->vm_hashlist; oldhashsize = vm->vm_hashsize; vm->vm_hashlist = newhashlist; vm->vm_hashsize = newhashsize; if (oldhashlist == NULL) { VMEM_UNLOCK(vm); return 0; } for (i = 0; i < oldhashsize; i++) { while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) { bt_rembusy(vm, bt); bt_insbusy(vm, bt); } } VMEM_UNLOCK(vm); if (oldhashlist != vm->vm_hash0) { free(oldhashlist, M_VMEM); } return 0; } static void vmem_periodic_kick(void *dummy) { taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk); } static void vmem_periodic(void *unused, int pending) { vmem_t *vm; vmem_size_t desired; vmem_size_t current; mtx_lock(&vmem_list_lock); LIST_FOREACH(vm, &vmem_list, vm_alllist) { #ifdef DIAGNOSTIC /* Convenient time to verify vmem state. */ if (enable_vmem_check == 1) { VMEM_LOCK(vm); vmem_check(vm); VMEM_UNLOCK(vm); } #endif desired = 1 << flsl(vm->vm_nbusytag); desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN), VMEM_HASHSIZE_MAX); current = vm->vm_hashsize; /* Grow in powers of two. Shrink less aggressively. */ if (desired >= current * 2 || desired * 4 <= current) vmem_rehash(vm, desired); /* * Periodically wake up threads waiting for resources, * so they could ask for reclamation again. */ VMEM_CONDVAR_BROADCAST(vm); } mtx_unlock(&vmem_list_lock); callout_reset(&vmem_periodic_ch, vmem_periodic_interval, vmem_periodic_kick, NULL); } static void vmem_start_callout(void *unused) { TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL); vmem_periodic_interval = hz * 10; callout_init(&vmem_periodic_ch, CALLOUT_MPSAFE); callout_reset(&vmem_periodic_ch, vmem_periodic_interval, vmem_periodic_kick, NULL); } SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL); static void vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int type) { bt_t *btspan; bt_t *btfree; MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC); MPASS((size & vm->vm_quantum_mask) == 0); btspan = bt_alloc(vm); btspan->bt_type = type; btspan->bt_start = addr; btspan->bt_size = size; bt_insseg_tail(vm, btspan); btfree = bt_alloc(vm); btfree->bt_type = BT_TYPE_FREE; btfree->bt_start = addr; btfree->bt_size = size; bt_insseg(vm, btfree, btspan); bt_insfree(vm, btfree); vm->vm_size += size; } static void vmem_destroy1(vmem_t *vm) { bt_t *bt; /* * Drain per-cpu quantum caches. */ qc_destroy(vm); /* * The vmem should now only contain empty segments. */ VMEM_LOCK(vm); MPASS(vm->vm_nbusytag == 0); while ((bt = TAILQ_FIRST(&vm->vm_seglist)) != NULL) bt_remseg(vm, bt); if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0) free(vm->vm_hashlist, M_VMEM); bt_freetrim(vm, 0); VMEM_CONDVAR_DESTROY(vm); VMEM_LOCK_DESTROY(vm); free(vm, M_VMEM); } static int vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags) { vmem_addr_t addr; int error; if (vm->vm_importfn == NULL) return EINVAL; /* * To make sure we get a span that meets the alignment we double it * and add the size to the tail. This slightly overestimates. */ if (align != vm->vm_quantum_mask + 1) size = (align * 2) + size; size = roundup(size, vm->vm_import_quantum); /* * Hide MAXALLOC tags so we're guaranteed to be able to add this * span and the tag we want to allocate from it. */ MPASS(vm->vm_nfreetags >= BT_MAXALLOC); vm->vm_nfreetags -= BT_MAXALLOC; VMEM_UNLOCK(vm); error = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr); VMEM_LOCK(vm); vm->vm_nfreetags += BT_MAXALLOC; if (error) return ENOMEM; vmem_add1(vm, addr, size, BT_TYPE_SPAN); return 0; } /* * vmem_fit: check if a bt can satisfy the given restrictions. * * it's a caller's responsibility to ensure the region is big enough * before calling us. */ static int vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align, vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr, vmem_addr_t maxaddr, vmem_addr_t *addrp) { vmem_addr_t start; vmem_addr_t end; MPASS(size > 0); MPASS(bt->bt_size >= size); /* caller's responsibility */ /* * XXX assumption: vmem_addr_t and vmem_size_t are * unsigned integer of the same size. */ start = bt->bt_start; if (start < minaddr) { start = minaddr; } end = BT_END(bt); if (end > maxaddr) end = maxaddr; if (start > end) return (ENOMEM); start = VMEM_ALIGNUP(start - phase, align) + phase; if (start < bt->bt_start) start += align; if (VMEM_CROSS_P(start, start + size - 1, nocross)) { MPASS(align < nocross); start = VMEM_ALIGNUP(start - phase, nocross) + phase; } if (start <= end && end - start >= size - 1) { MPASS((start & (align - 1)) == phase); MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross)); MPASS(minaddr <= start); MPASS(maxaddr == 0 || start + size - 1 <= maxaddr); MPASS(bt->bt_start <= start); MPASS(BT_END(bt) - start >= size - 1); *addrp = start; return (0); } return (ENOMEM); } /* * vmem_clip: Trim the boundary tag edges to the requested start and size. */ static void vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size) { bt_t *btnew; bt_t *btprev; VMEM_ASSERT_LOCKED(vm); MPASS(bt->bt_type == BT_TYPE_FREE); MPASS(bt->bt_size >= size); bt_remfree(vm, bt); if (bt->bt_start != start) { btprev = bt_alloc(vm); btprev->bt_type = BT_TYPE_FREE; btprev->bt_start = bt->bt_start; btprev->bt_size = start - bt->bt_start; bt->bt_start = start; bt->bt_size -= btprev->bt_size; bt_insfree(vm, btprev); bt_insseg(vm, btprev, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); } MPASS(bt->bt_start == start); if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) { /* split */ btnew = bt_alloc(vm); btnew->bt_type = BT_TYPE_BUSY; btnew->bt_start = bt->bt_start; btnew->bt_size = size; bt->bt_start = bt->bt_start + size; bt->bt_size -= size; bt_insfree(vm, bt); bt_insseg(vm, btnew, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); bt_insbusy(vm, btnew); bt = btnew; } else { bt->bt_type = BT_TYPE_BUSY; bt_insbusy(vm, bt); } MPASS(bt->bt_size >= size); bt->bt_type = BT_TYPE_BUSY; } /* ---- vmem API */ void vmem_set_import(vmem_t *vm, vmem_import_t *importfn, vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum) { VMEM_LOCK(vm); vm->vm_importfn = importfn; vm->vm_releasefn = releasefn; vm->vm_arg = arg; vm->vm_import_quantum = import_quantum; VMEM_UNLOCK(vm); } void vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn) { VMEM_LOCK(vm); vm->vm_reclaimfn = reclaimfn; VMEM_UNLOCK(vm); } /* * vmem_init: Initializes vmem arena. */ vmem_t * vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags) { int i; MPASS(quantum > 0); MPASS((quantum & (quantum - 1)) == 0); bzero(vm, sizeof(*vm)); VMEM_CONDVAR_INIT(vm, name); VMEM_LOCK_INIT(vm, name); vm->vm_nfreetags = 0; LIST_INIT(&vm->vm_freetags); strlcpy(vm->vm_name, name, sizeof(vm->vm_name)); vm->vm_quantum_mask = quantum - 1; vm->vm_quantum_shift = flsl(quantum) - 1; vm->vm_nbusytag = 0; vm->vm_size = 0; vm->vm_inuse = 0; qc_init(vm, qcache_max); TAILQ_INIT(&vm->vm_seglist); for (i = 0; i < VMEM_MAXORDER; i++) { LIST_INIT(&vm->vm_freelist[i]); } memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0)); vm->vm_hashsize = VMEM_HASHSIZE_MIN; vm->vm_hashlist = vm->vm_hash0; if (size != 0) { if (vmem_add(vm, base, size, flags) != 0) { vmem_destroy1(vm); return NULL; } } mtx_lock(&vmem_list_lock); LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist); mtx_unlock(&vmem_list_lock); return vm; } /* * vmem_create: create an arena. */ vmem_t * vmem_create(const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags) { vmem_t *vm; vm = malloc(sizeof(*vm), M_VMEM, flags & (M_WAITOK|M_NOWAIT)); if (vm == NULL) return (NULL); if (vmem_init(vm, name, base, size, quantum, qcache_max, flags) == NULL) { free(vm, M_VMEM); return (NULL); } return (vm); } void vmem_destroy(vmem_t *vm) { mtx_lock(&vmem_list_lock); LIST_REMOVE(vm, vm_alllist); mtx_unlock(&vmem_list_lock); vmem_destroy1(vm); } vmem_size_t vmem_roundup_size(vmem_t *vm, vmem_size_t size) { return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask; } /* * vmem_alloc: allocate resource from the arena. */ int vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp) { const int strat __unused = flags & VMEM_FITMASK; qcache_t *qc; flags &= VMEM_FLAGS; MPASS(size > 0); MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT); if ((flags & M_NOWAIT) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc"); if (size <= vm->vm_qcache_max) { qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift]; *addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache, flags); if (*addrp == 0) return (ENOMEM); return (0); } return vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, addrp); } int vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align, const vmem_size_t phase, const vmem_size_t nocross, const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags, vmem_addr_t *addrp) { const vmem_size_t size = vmem_roundup_size(vm, size0); struct vmem_freelist *list; struct vmem_freelist *first; struct vmem_freelist *end; vmem_size_t avail; bt_t *bt; int error; int strat; flags &= VMEM_FLAGS; strat = flags & VMEM_FITMASK; MPASS(size0 > 0); MPASS(size > 0); MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT); MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK)); if ((flags & M_NOWAIT) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc"); MPASS((align & vm->vm_quantum_mask) == 0); MPASS((align & (align - 1)) == 0); MPASS((phase & vm->vm_quantum_mask) == 0); MPASS((nocross & vm->vm_quantum_mask) == 0); MPASS((nocross & (nocross - 1)) == 0); MPASS((align == 0 && phase == 0) || phase < align); MPASS(nocross == 0 || nocross >= size); MPASS(minaddr <= maxaddr); MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross)); if (align == 0) align = vm->vm_quantum_mask + 1; *addrp = 0; end = &vm->vm_freelist[VMEM_MAXORDER]; /* * choose a free block from which we allocate. */ first = bt_freehead_toalloc(vm, size, strat); VMEM_LOCK(vm); for (;;) { /* * Make sure we have enough tags to complete the * operation. */ if (vm->vm_nfreetags < BT_MAXALLOC && bt_fill(vm, flags) != 0) { error = ENOMEM; break; } /* * Scan freelists looking for a tag that satisfies the * allocation. If we're doing BESTFIT we may encounter * sizes below the request. If we're doing FIRSTFIT we * inspect only the first element from each list. */ for (list = first; list < end; list++) { LIST_FOREACH(bt, list, bt_freelist) { if (bt->bt_size >= size) { error = vmem_fit(bt, size, align, phase, nocross, minaddr, maxaddr, addrp); if (error == 0) { vmem_clip(vm, bt, *addrp, size); goto out; } } /* FIRST skips to the next list. */ if (strat == M_FIRSTFIT) break; } } /* * Retry if the fast algorithm failed. */ if (strat == M_FIRSTFIT) { strat = M_BESTFIT; first = bt_freehead_toalloc(vm, size, strat); continue; } /* * XXX it is possible to fail to meet restrictions with the * imported region. It is up to the user to specify the * import quantum such that it can satisfy any allocation. */ if (vmem_import(vm, size, align, flags) == 0) continue; /* * Try to free some space from the quantum cache or reclaim * functions if available. */ if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) { avail = vm->vm_size - vm->vm_inuse; VMEM_UNLOCK(vm); if (vm->vm_qcache_max != 0) qc_drain(vm); if (vm->vm_reclaimfn != NULL) vm->vm_reclaimfn(vm, flags); VMEM_LOCK(vm); /* If we were successful retry even NOWAIT. */ if (vm->vm_size - vm->vm_inuse > avail) continue; } if ((flags & M_NOWAIT) != 0) { error = ENOMEM; break; } VMEM_CONDVAR_WAIT(vm); } out: VMEM_UNLOCK(vm); if (error != 0 && (flags & M_NOWAIT) == 0) panic("failed to allocate waiting allocation\n"); return (error); } /* * vmem_free: free the resource to the arena. */ void vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { qcache_t *qc; MPASS(size > 0); if (size <= vm->vm_qcache_max) { qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift]; uma_zfree(qc->qc_cache, (void *)addr); } else vmem_xfree(vm, addr, size); } void vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { bt_t *bt; bt_t *t; MPASS(size > 0); VMEM_LOCK(vm); bt = bt_lookupbusy(vm, addr); MPASS(bt != NULL); MPASS(bt->bt_start == addr); MPASS(bt->bt_size == vmem_roundup_size(vm, size) || bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask); MPASS(bt->bt_type == BT_TYPE_BUSY); bt_rembusy(vm, bt); bt->bt_type = BT_TYPE_FREE; /* coalesce */ t = TAILQ_NEXT(bt, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { MPASS(BT_END(bt) < t->bt_start); /* YYY */ bt->bt_size += t->bt_size; bt_remfree(vm, t); bt_remseg(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { MPASS(BT_END(t) < bt->bt_start); /* YYY */ bt->bt_size += t->bt_size; bt->bt_start = t->bt_start; bt_remfree(vm, t); bt_remseg(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); MPASS(t != NULL); MPASS(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY); if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN && t->bt_size == bt->bt_size) { vmem_addr_t spanaddr; vmem_size_t spansize; MPASS(t->bt_start == bt->bt_start); spanaddr = bt->bt_start; spansize = bt->bt_size; bt_remseg(vm, bt); bt_remseg(vm, t); vm->vm_size -= spansize; VMEM_CONDVAR_BROADCAST(vm); bt_freetrim(vm, BT_MAXFREE); (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize); } else { bt_insfree(vm, bt); VMEM_CONDVAR_BROADCAST(vm); bt_freetrim(vm, BT_MAXFREE); } } /* * vmem_add: * */ int vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags) { int error; error = 0; flags &= VMEM_FLAGS; VMEM_LOCK(vm); if (vm->vm_nfreetags >= BT_MAXALLOC || bt_fill(vm, flags) == 0) vmem_add1(vm, addr, size, BT_TYPE_SPAN_STATIC); else error = ENOMEM; VMEM_UNLOCK(vm); return (error); } /* * vmem_size: information about arenas size */ vmem_size_t vmem_size(vmem_t *vm, int typemask) { int i; switch (typemask) { case VMEM_ALLOC: return vm->vm_inuse; case VMEM_FREE: return vm->vm_size - vm->vm_inuse; case VMEM_FREE|VMEM_ALLOC: return vm->vm_size; case VMEM_MAXFREE: VMEM_LOCK(vm); for (i = VMEM_MAXORDER - 1; i >= 0; i--) { if (LIST_EMPTY(&vm->vm_freelist[i])) continue; VMEM_UNLOCK(vm); return ((vmem_size_t)ORDER2SIZE(i) << vm->vm_quantum_shift); } VMEM_UNLOCK(vm); return (0); default: panic("vmem_size"); } } /* ---- debug */ #if defined(DDB) || defined(DIAGNOSTIC) static void bt_dump(const bt_t *, int (*)(const char *, ...) __printflike(1, 2)); static const char * bt_type_string(int type) { switch (type) { case BT_TYPE_BUSY: return "busy"; case BT_TYPE_FREE: return "free"; case BT_TYPE_SPAN: return "span"; case BT_TYPE_SPAN_STATIC: return "static span"; default: break; } return "BOGUS"; } static void bt_dump(const bt_t *bt, int (*pr)(const char *, ...)) { (*pr)("\t%p: %jx %jx, %d(%s)\n", bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size, bt->bt_type, bt_type_string(bt->bt_type)); } static void vmem_dump(const vmem_t *vm , int (*pr)(const char *, ...) __printflike(1, 2)) { const bt_t *bt; int i; (*pr)("vmem %p '%s'\n", vm, vm->vm_name); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { bt_dump(bt, pr); } for (i = 0; i < VMEM_MAXORDER; i++) { const struct vmem_freelist *fl = &vm->vm_freelist[i]; if (LIST_EMPTY(fl)) { continue; } (*pr)("freelist[%d]\n", i); LIST_FOREACH(bt, fl, bt_freelist) { bt_dump(bt, pr); } } } #endif /* defined(DDB) || defined(DIAGNOSTIC) */ #if defined(DDB) static bt_t * vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr) { bt_t *bt; TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (BT_ISSPAN_P(bt)) { continue; } if (bt->bt_start <= addr && addr <= BT_END(bt)) { return bt; } } return NULL; } void vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...)) { vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { bt_t *bt; bt = vmem_whatis_lookup(vm, addr); if (bt == NULL) { continue; } (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n", (void *)addr, (void *)bt->bt_start, (vmem_size_t)(addr - bt->bt_start), vm->vm_name, (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free"); } } void vmem_printall(const char *modif, int (*pr)(const char *, ...)) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { vmem_dump(vm, pr); } } void vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...)) { const vmem_t *vm = (const void *)addr; vmem_dump(vm, pr); } #endif /* defined(DDB) */ #define vmem_printf printf #if defined(DIAGNOSTIC) static bool vmem_check_sanity(vmem_t *vm) { const bt_t *bt, *bt2; MPASS(vm != NULL); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (bt->bt_start > BT_END(bt)) { printf("corrupted tag\n"); bt_dump(bt, vmem_printf); return false; } } TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) { if (bt == bt2) { continue; } if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) { continue; } if (bt->bt_start <= BT_END(bt2) && bt2->bt_start <= BT_END(bt)) { printf("overwrapped tags\n"); bt_dump(bt, vmem_printf); bt_dump(bt2, vmem_printf); return false; } } } return true; } static void vmem_check(vmem_t *vm) { if (!vmem_check_sanity(vm)) { panic("insanity vmem %p", vm); } } #endif /* defined(DIAGNOSTIC) */ Index: stable/10/sys/mips/mips/uma_machdep.c =================================================================== --- stable/10/sys/mips/mips/uma_machdep.c (revision 287944) +++ stable/10/sys/mips/mips/uma_machdep.c (revision 287945) @@ -1,83 +1,83 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include void * -uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { vm_paddr_t pa; vm_page_t m; int pflags; void *va; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else pmap_grow_direct_page_cache(); } else break; } pa = VM_PAGE_TO_PHYS(m); va = (void *)MIPS_PHYS_TO_DIRECT(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } void -uma_small_free(void *mem, int size, u_int8_t flags) +uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = MIPS_DIRECT_TO_PHYS((vm_offset_t)mem); m = PHYS_TO_VM_PAGE(pa); m->wire_count--; vm_page_free(m); atomic_subtract_int(&cnt.v_wire_count, 1); } Index: stable/10/sys/powerpc/aim/mmu_oea64.c =================================================================== --- stable/10/sys/powerpc/aim/mmu_oea64.c (revision 287944) +++ stable/10/sys/powerpc/aim/mmu_oea64.c (revision 287945) @@ -1,2718 +1,2719 @@ /*- * Copyright (c) 2001 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Matt Thomas of Allegro Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $ */ /*- * Copyright (C) 2001 Benno Rice. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_compat.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_oea64.h" #include "mmu_if.h" #include "moea64_if.h" void moea64_release_vsid(uint64_t vsid); uintptr_t moea64_get_unique_vsid(void); #define DISABLE_TRANS(msr) msr = mfmsr(); mtmsr(msr & ~PSL_DR) #define ENABLE_TRANS(msr) mtmsr(msr) #define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) #define VSID_HASH_MASK 0x0000007fffffffffULL /* * Locking semantics: * -- Read lock: if no modifications are being made to either the PVO lists * or page table or if any modifications being made result in internal * changes (e.g. wiring, protection) such that the existence of the PVOs * is unchanged and they remain associated with the same pmap (in which * case the changes should be protected by the pmap lock) * -- Write lock: required if PTEs/PVOs are being inserted or removed. */ #define LOCK_TABLE_RD() rw_rlock(&moea64_table_lock) #define UNLOCK_TABLE_RD() rw_runlock(&moea64_table_lock) #define LOCK_TABLE_WR() rw_wlock(&moea64_table_lock) #define UNLOCK_TABLE_WR() rw_wunlock(&moea64_table_lock) struct ofw_map { cell_t om_va; cell_t om_len; uint64_t om_pa; cell_t om_mode; }; extern unsigned char _etext[]; extern unsigned char _end[]; extern int dumpsys_minidump; /* * Map of physical memory regions. */ static struct mem_region *regions; static struct mem_region *pregions; static u_int phys_avail_count; static int regions_sz, pregions_sz; extern void bs_remap_earlyboot(void); /* * Lock for the pteg and pvo tables. */ struct rwlock moea64_table_lock; struct mtx moea64_slb_mutex; /* * PTEG data. */ u_int moea64_pteg_count; u_int moea64_pteg_mask; /* * PVO data. */ struct pvo_head *moea64_pvo_table; /* pvo entries by pteg index */ uma_zone_t moea64_upvo_zone; /* zone for pvo entries for unmanaged pages */ uma_zone_t moea64_mpvo_zone; /* zone for pvo entries for managed pages */ #define BPVO_POOL_SIZE 327680 static struct pvo_entry *moea64_bpvo_pool; static int moea64_bpvo_pool_index = 0; #define VSID_NBPW (sizeof(u_int32_t) * 8) #ifdef __powerpc64__ #define NVSIDS (NPMAPS * 16) #define VSID_HASHMASK 0xffffffffUL #else #define NVSIDS NPMAPS #define VSID_HASHMASK 0xfffffUL #endif static u_int moea64_vsid_bitmap[NVSIDS / VSID_NBPW]; static boolean_t moea64_initialized = FALSE; /* * Statistics. */ u_int moea64_pte_valid = 0; u_int moea64_pte_overflow = 0; u_int moea64_pvo_entries = 0; u_int moea64_pvo_enter_calls = 0; u_int moea64_pvo_remove_calls = 0; SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD, &moea64_pte_valid, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD, &moea64_pte_overflow, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD, &moea64_pvo_entries, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD, &moea64_pvo_enter_calls, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD, &moea64_pvo_remove_calls, 0, ""); vm_offset_t moea64_scratchpage_va[2]; struct pvo_entry *moea64_scratchpage_pvo[2]; uintptr_t moea64_scratchpage_pte[2]; struct mtx moea64_scratchpage_mtx; uint64_t moea64_large_page_mask = 0; uint64_t moea64_large_page_size = 0; int moea64_large_page_shift = 0; /* * PVO calls. */ static int moea64_pvo_enter(mmu_t, pmap_t, uma_zone_t, struct pvo_head *, vm_offset_t, vm_offset_t, uint64_t, int, int8_t); static void moea64_pvo_remove(mmu_t, struct pvo_entry *); static struct pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t); /* * Utility routines. */ static boolean_t moea64_query_bit(mmu_t, vm_page_t, u_int64_t); static u_int moea64_clear_bit(mmu_t, vm_page_t, u_int64_t); static void moea64_kremove(mmu_t, vm_offset_t); static void moea64_syncicache(mmu_t, pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_size_t sz); /* * Kernel MMU interface */ void moea64_clear_modify(mmu_t, vm_page_t); void moea64_copy_page(mmu_t, vm_page_t, vm_page_t); void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); int moea64_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); void moea64_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void moea64_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t moea64_extract(mmu_t, pmap_t, vm_offset_t); vm_page_t moea64_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); void moea64_init(mmu_t); boolean_t moea64_is_modified(mmu_t, vm_page_t); boolean_t moea64_is_prefaultable(mmu_t, pmap_t, vm_offset_t); boolean_t moea64_is_referenced(mmu_t, vm_page_t); int moea64_ts_referenced(mmu_t, vm_page_t); vm_offset_t moea64_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); boolean_t moea64_page_exists_quick(mmu_t, pmap_t, vm_page_t); int moea64_page_wired_mappings(mmu_t, vm_page_t); void moea64_pinit(mmu_t, pmap_t); void moea64_pinit0(mmu_t, pmap_t); void moea64_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void moea64_qenter(mmu_t, vm_offset_t, vm_page_t *, int); void moea64_qremove(mmu_t, vm_offset_t, int); void moea64_release(mmu_t, pmap_t); void moea64_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_remove_pages(mmu_t, pmap_t); void moea64_remove_all(mmu_t, vm_page_t); void moea64_remove_write(mmu_t, vm_page_t); void moea64_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_zero_page(mmu_t, vm_page_t); void moea64_zero_page_area(mmu_t, vm_page_t, int, int); void moea64_zero_page_idle(mmu_t, vm_page_t); void moea64_activate(mmu_t, struct thread *); void moea64_deactivate(mmu_t, struct thread *); void *moea64_mapdev(mmu_t, vm_paddr_t, vm_size_t); void *moea64_mapdev_attr(mmu_t, vm_offset_t, vm_size_t, vm_memattr_t); void moea64_unmapdev(mmu_t, vm_offset_t, vm_size_t); vm_paddr_t moea64_kextract(mmu_t, vm_offset_t); void moea64_page_set_memattr(mmu_t, vm_page_t m, vm_memattr_t ma); void moea64_kenter_attr(mmu_t, vm_offset_t, vm_offset_t, vm_memattr_t ma); void moea64_kenter(mmu_t, vm_offset_t, vm_paddr_t); boolean_t moea64_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); static void moea64_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t); vm_offset_t moea64_dumpsys_map(mmu_t mmu, struct pmap_md *md, vm_size_t ofs, vm_size_t *sz); struct pmap_md * moea64_scan_md(mmu_t mmu, struct pmap_md *prev); static mmu_method_t moea64_methods[] = { MMUMETHOD(mmu_clear_modify, moea64_clear_modify), MMUMETHOD(mmu_copy_page, moea64_copy_page), MMUMETHOD(mmu_copy_pages, moea64_copy_pages), MMUMETHOD(mmu_enter, moea64_enter), MMUMETHOD(mmu_enter_object, moea64_enter_object), MMUMETHOD(mmu_enter_quick, moea64_enter_quick), MMUMETHOD(mmu_extract, moea64_extract), MMUMETHOD(mmu_extract_and_hold, moea64_extract_and_hold), MMUMETHOD(mmu_init, moea64_init), MMUMETHOD(mmu_is_modified, moea64_is_modified), MMUMETHOD(mmu_is_prefaultable, moea64_is_prefaultable), MMUMETHOD(mmu_is_referenced, moea64_is_referenced), MMUMETHOD(mmu_ts_referenced, moea64_ts_referenced), MMUMETHOD(mmu_map, moea64_map), MMUMETHOD(mmu_page_exists_quick,moea64_page_exists_quick), MMUMETHOD(mmu_page_wired_mappings,moea64_page_wired_mappings), MMUMETHOD(mmu_pinit, moea64_pinit), MMUMETHOD(mmu_pinit0, moea64_pinit0), MMUMETHOD(mmu_protect, moea64_protect), MMUMETHOD(mmu_qenter, moea64_qenter), MMUMETHOD(mmu_qremove, moea64_qremove), MMUMETHOD(mmu_release, moea64_release), MMUMETHOD(mmu_remove, moea64_remove), MMUMETHOD(mmu_remove_pages, moea64_remove_pages), MMUMETHOD(mmu_remove_all, moea64_remove_all), MMUMETHOD(mmu_remove_write, moea64_remove_write), MMUMETHOD(mmu_sync_icache, moea64_sync_icache), MMUMETHOD(mmu_unwire, moea64_unwire), MMUMETHOD(mmu_zero_page, moea64_zero_page), MMUMETHOD(mmu_zero_page_area, moea64_zero_page_area), MMUMETHOD(mmu_zero_page_idle, moea64_zero_page_idle), MMUMETHOD(mmu_activate, moea64_activate), MMUMETHOD(mmu_deactivate, moea64_deactivate), MMUMETHOD(mmu_page_set_memattr, moea64_page_set_memattr), /* Internal interfaces */ MMUMETHOD(mmu_mapdev, moea64_mapdev), MMUMETHOD(mmu_mapdev_attr, moea64_mapdev_attr), MMUMETHOD(mmu_unmapdev, moea64_unmapdev), MMUMETHOD(mmu_kextract, moea64_kextract), MMUMETHOD(mmu_kenter, moea64_kenter), MMUMETHOD(mmu_kenter_attr, moea64_kenter_attr), MMUMETHOD(mmu_dev_direct_mapped,moea64_dev_direct_mapped), MMUMETHOD(mmu_scan_md, moea64_scan_md), MMUMETHOD(mmu_dumpsys_map, moea64_dumpsys_map), { 0, 0 } }; MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods, 0); static __inline u_int va_to_pteg(uint64_t vsid, vm_offset_t addr, int large) { uint64_t hash; int shift; shift = large ? moea64_large_page_shift : ADDR_PIDX_SHFT; hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >> shift); return (hash & moea64_pteg_mask); } static __inline struct pvo_head * vm_page_to_pvoh(vm_page_t m) { return (&m->md.mdpg_pvoh); } static __inline void moea64_pte_create(struct lpte *pt, uint64_t vsid, vm_offset_t va, uint64_t pte_lo, int flags) { /* * Construct a PTE. Default to IMB initially. Valid bit only gets * set when the real pte is set in memory. * * Note: Don't set the valid bit for correct operation of tlb update. */ pt->pte_hi = (vsid << LPTE_VSID_SHIFT) | (((uint64_t)(va & ADDR_PIDX) >> ADDR_API_SHFT64) & LPTE_API); if (flags & PVO_LARGE) pt->pte_hi |= LPTE_BIG; pt->pte_lo = pte_lo; } static __inline uint64_t moea64_calc_wimg(vm_offset_t pa, vm_memattr_t ma) { uint64_t pte_lo; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (LPTE_I | LPTE_G); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (LPTE_I); case VM_MEMATTR_WRITE_THROUGH: return (LPTE_W | LPTE_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ pte_lo = LPTE_I | LPTE_G; for (i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) { pte_lo &= ~(LPTE_I | LPTE_G); pte_lo |= LPTE_M; break; } } return pte_lo; } /* * Quick sort callout for comparing memory regions. */ static int om_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b) { const struct ofw_map *mapa; const struct ofw_map *mapb; mapa = a; mapb = b; if (mapa->om_pa < mapb->om_pa) return (-1); else if (mapa->om_pa > mapb->om_pa) return (1); else return (0); } static void moea64_add_ofw_mappings(mmu_t mmup, phandle_t mmu, size_t sz) { struct ofw_map translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */ pcell_t acells, trans_cells[sz/sizeof(cell_t)]; register_t msr; vm_offset_t off; vm_paddr_t pa_base; int i, j; bzero(translations, sz); OF_getprop(OF_finddevice("/"), "#address-cells", &acells, sizeof(acells)); if (OF_getprop(mmu, "translations", trans_cells, sz) == -1) panic("moea64_bootstrap: can't get ofw translations"); CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations"); sz /= sizeof(cell_t); for (i = 0, j = 0; i < sz; j++) { translations[j].om_va = trans_cells[i++]; translations[j].om_len = trans_cells[i++]; translations[j].om_pa = trans_cells[i++]; if (acells == 2) { translations[j].om_pa <<= 32; translations[j].om_pa |= trans_cells[i++]; } translations[j].om_mode = trans_cells[i++]; } KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)", i, sz)); sz = j; qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { pa_base = translations[i].om_pa; #ifndef __powerpc64__ if ((translations[i].om_pa >> 32) != 0) panic("OFW translations above 32-bit boundary!"); #endif if (pa_base % PAGE_SIZE) panic("OFW translation not page-aligned (phys)!"); if (translations[i].om_va % PAGE_SIZE) panic("OFW translation not page-aligned (virt)!"); CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x", pa_base, translations[i].om_va, translations[i].om_len); /* Now enter the pages for this mapping */ DISABLE_TRANS(msr); for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) { if (moea64_pvo_find_va(kernel_pmap, translations[i].om_va + off) != NULL) continue; moea64_kenter(mmup, translations[i].om_va + off, pa_base + off); } ENABLE_TRANS(msr); } } #ifdef __powerpc64__ static void moea64_probe_large_page(void) { uint16_t pvr = mfpvr() >> 16; switch (pvr) { case IBM970: case IBM970FX: case IBM970MP: powerpc_sync(); isync(); mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG); powerpc_sync(); isync(); /* FALLTHROUGH */ default: moea64_large_page_size = 0x1000000; /* 16 MB */ moea64_large_page_shift = 24; } moea64_large_page_mask = moea64_large_page_size - 1; } static void moea64_bootstrap_slb_prefault(vm_offset_t va, int large) { struct slb *cache; struct slb entry; uint64_t esid, slbe; uint64_t i; cache = PCPU_GET(slb); esid = va >> ADDR_SR_SHFT; slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; for (i = 0; i < 64; i++) { if (cache[i].slbe == (slbe | i)) return; } entry.slbe = slbe; entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT; if (large) entry.slbv |= SLBV_L; slb_insert_kernel(entry.slbe, entry.slbv); } #endif static void moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { register_t msr; vm_paddr_t pa; vm_offset_t size, off; uint64_t pte_lo; int i; if (moea64_large_page_size == 0) hw_direct_map = 0; DISABLE_TRANS(msr); if (hw_direct_map) { LOCK_TABLE_WR(); PMAP_LOCK(kernel_pmap); for (i = 0; i < pregions_sz; i++) { for (pa = pregions[i].mr_start; pa < pregions[i].mr_start + pregions[i].mr_size; pa += moea64_large_page_size) { pte_lo = LPTE_M; /* * Set memory access as guarded if prefetch within * the page could exit the available physmem area. */ if (pa & moea64_large_page_mask) { pa &= moea64_large_page_mask; pte_lo |= LPTE_G; } if (pa + moea64_large_page_size > pregions[i].mr_start + pregions[i].mr_size) pte_lo |= LPTE_G; moea64_pvo_enter(mmup, kernel_pmap, moea64_upvo_zone, NULL, pa, pa, pte_lo, PVO_WIRED | PVO_LARGE, 0); } } PMAP_UNLOCK(kernel_pmap); UNLOCK_TABLE_WR(); } else { size = sizeof(struct pvo_head) * moea64_pteg_count; off = (vm_offset_t)(moea64_pvo_table); for (pa = off; pa < off + size; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); size = BPVO_POOL_SIZE*sizeof(struct pvo_entry); off = (vm_offset_t)(moea64_bpvo_pool); for (pa = off; pa < off + size; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); /* * Map certain important things, like ourselves. * * NOTE: We do not map the exception vector space. That code is * used only in real mode, and leaving it unmapped allows us to * catch NULL pointer deferences, instead of making NULL a valid * address. */ for (pa = kernelstart & ~PAGE_MASK; pa < kernelend; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); } ENABLE_TRANS(msr); /* * Allow user to override unmapped_buf_allowed for testing. * XXXKIB Only direct map implementation was tested. */ if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed)) unmapped_buf_allowed = hw_direct_map; } void moea64_early_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { int i, j; vm_size_t physsz, hwphyssz; #ifndef __powerpc64__ /* We don't have a direct map since there is no BAT */ hw_direct_map = 0; /* Make sure battable is zero, since we have no BAT */ for (i = 0; i < 16; i++) { battable[i].batu = 0; battable[i].batl = 0; } #else moea64_probe_large_page(); /* Use a direct map if we have large page support */ if (moea64_large_page_size > 0) hw_direct_map = 1; else hw_direct_map = 0; #endif /* Get physical memory regions from firmware */ mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "moea64_bootstrap: physical memory"); if (sizeof(phys_avail)/sizeof(phys_avail[0]) < regions_sz) panic("moea64_bootstrap: phys_avail too small"); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++, j += 2) { CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)", regions[i].mr_start, regions[i].mr_start + regions[i].mr_size, regions[i].mr_size); if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; phys_avail_count++; physsz += regions[i].mr_size; } /* Check for overlap with the kernel and exception vectors */ for (j = 0; j < 2*phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (kernelstart >= phys_avail[j] && kernelstart < phys_avail[j+1]) { if (kernelend < phys_avail[j+1]) { phys_avail[2*phys_avail_count] = (kernelend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2*phys_avail_count + 1] = phys_avail[j+1]; phys_avail_count++; } phys_avail[j+1] = kernelstart & ~PAGE_MASK; } if (kernelend >= phys_avail[j] && kernelend < phys_avail[j+1]) { if (kernelstart > phys_avail[j]) { phys_avail[2*phys_avail_count] = phys_avail[j]; phys_avail[2*phys_avail_count + 1] = kernelstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kernelend & ~PAGE_MASK) + PAGE_SIZE; } } physmem = btoc(physsz); #ifdef PTEGCOUNT moea64_pteg_count = PTEGCOUNT; #else moea64_pteg_count = 0x1000; while (moea64_pteg_count < physmem) moea64_pteg_count <<= 1; moea64_pteg_count >>= 1; #endif /* PTEGCOUNT */ } void moea64_mid_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { vm_size_t size; register_t msr; int i; /* * Set PTEG mask */ moea64_pteg_mask = moea64_pteg_count - 1; /* * Allocate pv/overflow lists. */ size = sizeof(struct pvo_head) * moea64_pteg_count; moea64_pvo_table = (struct pvo_head *)moea64_bootstrap_alloc(size, PAGE_SIZE); CTR1(KTR_PMAP, "moea64_bootstrap: PVO table at %p", moea64_pvo_table); DISABLE_TRANS(msr); for (i = 0; i < moea64_pteg_count; i++) LIST_INIT(&moea64_pvo_table[i]); ENABLE_TRANS(msr); /* * Initialize the lock that synchronizes access to the pteg and pvo * tables. */ rw_init_flags(&moea64_table_lock, "pmap tables", RW_RECURSE); mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF); /* * Initialise the unmanaged pvo pool. */ moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc( BPVO_POOL_SIZE*sizeof(struct pvo_entry), 0); moea64_bpvo_pool_index = 0; /* * Make sure kernel vsid is allocated as well as VSID 0. */ #ifndef __powerpc64__ moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW] |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); moea64_vsid_bitmap[0] |= 1; #endif /* * Initialize the kernel pmap (which is statically allocated). */ #ifdef __powerpc64__ for (i = 0; i < 64; i++) { pcpup->pc_slb[i].slbv = 0; pcpup->pc_slb[i].slbe = 0; } #else for (i = 0; i < 16; i++) kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; #endif kernel_pmap->pmap_phys = kernel_pmap; CPU_FILL(&kernel_pmap->pm_active); RB_INIT(&kernel_pmap->pmap_pvo); PMAP_LOCK_INIT(kernel_pmap); /* * Now map in all the other buffers we allocated earlier */ moea64_setup_direct_map(mmup, kernelstart, kernelend); } void moea64_late_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { ihandle_t mmui; phandle_t chosen; phandle_t mmu; size_t sz; int i; vm_offset_t pa, va; void *dpcpu; /* * Set up the Open Firmware pmap and add its mappings if not in real * mode. */ chosen = OF_finddevice("/chosen"); if (chosen != -1 && OF_getprop(chosen, "mmu", &mmui, 4) != -1) { mmu = OF_instance_to_package(mmui); if (mmu == -1 || (sz = OF_getproplen(mmu, "translations")) == -1) sz = 0; if (sz > 6144 /* tmpstksz - 2 KB headroom */) panic("moea64_bootstrap: too many ofw translations"); if (sz > 0) moea64_add_ofw_mappings(mmup, mmu, sz); } /* * Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) ; Maxmem = powerpc_btop(phys_avail[i + 1]); /* * Initialize MMU and remap early physical mappings */ MMU_CPU_BOOTSTRAP(mmup,0); mtmsr(mfmsr() | PSL_DR | PSL_IR); pmap_bootstrapped++; bs_remap_earlyboot(); /* * Set the start and end of kva. */ virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; /* * Map the entire KVA range into the SLB. We must not fault there. */ #ifdef __powerpc64__ for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH) moea64_bootstrap_slb_prefault(va, 0); #endif /* * Figure out how far we can extend virtual_end into segment 16 * without running into existing mappings. Segment 16 is guaranteed * to contain neither RAM nor devices (at least on Apple hardware), * but will generally contain some OFW mappings we should not * step on. */ #ifndef __powerpc64__ /* KVA is in high memory on PPC64 */ PMAP_LOCK(kernel_pmap); while (virtual_end < VM_MAX_KERNEL_ADDRESS && moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL) virtual_end += PAGE_SIZE; PMAP_UNLOCK(kernel_pmap); #endif /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. */ pa = moea64_bootstrap_alloc(KSTACK_PAGES * PAGE_SIZE, PAGE_SIZE); va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; virtual_avail = va + KSTACK_PAGES * PAGE_SIZE; CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); thread0.td_kstack = va; thread0.td_kstack_pages = KSTACK_PAGES; for (i = 0; i < KSTACK_PAGES; i++) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the message buffer. */ pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE); msgbufp = (struct msgbuf *)virtual_avail; va = virtual_avail; virtual_avail += round_page(msgbufsize); while (va < virtual_avail) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the dynamic percpu area. */ pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE); dpcpu = (void *)virtual_avail; va = virtual_avail; virtual_avail += DPCPU_SIZE; while (va < virtual_avail) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } dpcpu_init(dpcpu, 0); /* * Allocate some things for page zeroing. We put this directly * in the page table, marked with LPTE_LOCKED, to avoid any * of the PVO book-keeping or other parts of the VM system * from even knowing that this hack exists. */ if (!hw_direct_map) { mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL, MTX_DEF); for (i = 0; i < 2; i++) { moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE; virtual_end -= PAGE_SIZE; moea64_kenter(mmup, moea64_scratchpage_va[i], 0); moea64_scratchpage_pvo[i] = moea64_pvo_find_va( kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]); LOCK_TABLE_RD(); moea64_scratchpage_pte[i] = MOEA64_PVO_TO_PTE( mmup, moea64_scratchpage_pvo[i]); moea64_scratchpage_pvo[i]->pvo_pte.lpte.pte_hi |= LPTE_LOCKED; MOEA64_PTE_CHANGE(mmup, moea64_scratchpage_pte[i], &moea64_scratchpage_pvo[i]->pvo_pte.lpte, moea64_scratchpage_pvo[i]->pvo_vpn); UNLOCK_TABLE_RD(); } } } /* * Activate a user pmap. The pmap must be activated before its address * space can be accessed in any way. */ void moea64_activate(mmu_t mmu, struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_SET(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(userslb, pm->pm_slb); #else PCPU_SET(curpmap, pm->pmap_phys); #endif } void moea64_deactivate(mmu_t mmu, struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(userslb, NULL); #else PCPU_SET(curpmap, NULL); #endif } void moea64_unwire(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry key, *pvo; uintptr_t pt; LOCK_TABLE_RD(); PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { pt = MOEA64_PVO_TO_PTE(mmu, pvo); if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); pvo->pvo_vaddr &= ~PVO_WIRED; if ((pvo->pvo_pte.lpte.pte_hi & LPTE_WIRED) == 0) panic("moea64_unwire: pte %p is missing LPTE_WIRED", &pvo->pvo_pte.lpte); pvo->pvo_pte.lpte.pte_hi &= ~LPTE_WIRED; if (pt != -1) { /* * The PTE's wired attribute is not a hardware * feature, so there is no need to invalidate any TLB * entries. */ MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte, pvo->pvo_vpn); } pm->pm_stats.wired_count--; } UNLOCK_TABLE_RD(); PMAP_UNLOCK(pm); } /* * This goes through and sets the physical address of our * special scratch PTE to the PA we want to zero or copy. Because * of locking issues (this can get called in pvo_enter() by * the UMA allocator), we can't use most other utility functions here */ static __inline void moea64_set_scratchpage_pa(mmu_t mmup, int which, vm_offset_t pa) { KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!")); mtx_assert(&moea64_scratchpage_mtx, MA_OWNED); moea64_scratchpage_pvo[which]->pvo_pte.lpte.pte_lo &= ~(LPTE_WIMG | LPTE_RPGN); moea64_scratchpage_pvo[which]->pvo_pte.lpte.pte_lo |= moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa; MOEA64_PTE_CHANGE(mmup, moea64_scratchpage_pte[which], &moea64_scratchpage_pvo[which]->pvo_pte.lpte, moea64_scratchpage_pvo[which]->pvo_vpn); isync(); } void moea64_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) { vm_offset_t dst; vm_offset_t src; dst = VM_PAGE_TO_PHYS(mdst); src = VM_PAGE_TO_PHYS(msrc); if (hw_direct_map) { bcopy((void *)src, (void *)dst, PAGE_SIZE); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, src); moea64_set_scratchpage_pa(mmu, 1, dst); bcopy((void *)moea64_scratchpage_va[0], (void *)moea64_scratchpage_va[1], PAGE_SIZE); mtx_unlock(&moea64_scratchpage_mtx); } } static inline void moea64_copy_pages_dmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_cp = (char *)VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]) + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_cp = (char *)VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]) + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } static inline void moea64_copy_pages_nodmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; mtx_lock(&moea64_scratchpage_mtx); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); moea64_set_scratchpage_pa(mmu, 0, VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); moea64_set_scratchpage_pa(mmu, 1, VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } mtx_unlock(&moea64_scratchpage_mtx); } void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { if (hw_direct_map) { moea64_copy_pages_dmap(mmu, ma, a_offset, mb, b_offset, xfersize); } else { moea64_copy_pages_nodmap(mmu, ma, a_offset, mb, b_offset, xfersize); } } void moea64_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) { vm_offset_t pa = VM_PAGE_TO_PHYS(m); if (size + off > PAGE_SIZE) panic("moea64_zero_page: size + off > PAGE_SIZE"); if (hw_direct_map) { bzero((caddr_t)pa + off, size); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); bzero((caddr_t)moea64_scratchpage_va[0] + off, size); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Zero a page of physical memory by temporarily mapping it */ void moea64_zero_page(mmu_t mmu, vm_page_t m) { vm_offset_t pa = VM_PAGE_TO_PHYS(m); vm_offset_t va, off; if (!hw_direct_map) { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); va = moea64_scratchpage_va[0]; } else { va = pa; } for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); if (!hw_direct_map) mtx_unlock(&moea64_scratchpage_mtx); } void moea64_zero_page_idle(mmu_t mmu, vm_page_t m) { moea64_zero_page(mmu, m); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ int moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct pvo_head *pvo_head; uma_zone_t zone; vm_page_t pg; uint64_t pte_lo; u_int pvo_flags; int error; if (!moea64_initialized) { pvo_head = NULL; pg = NULL; zone = moea64_upvo_zone; pvo_flags = 0; } else { pvo_head = vm_page_to_pvoh(m); pg = m; zone = moea64_mpvo_zone; pvo_flags = PVO_MANAGED; } if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); /* XXX change the pvo head for fake pages */ if ((m->oflags & VPO_UNMANAGED) != 0) { pvo_flags &= ~PVO_MANAGED; pvo_head = NULL; zone = moea64_upvo_zone; } pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); if (prot & VM_PROT_WRITE) { pte_lo |= LPTE_BW; if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); } else pte_lo |= LPTE_BR; if ((prot & VM_PROT_EXECUTE) == 0) pte_lo |= LPTE_NOEXEC; if ((flags & PMAP_ENTER_WIRED) != 0) pvo_flags |= PVO_WIRED; for (;;) { LOCK_TABLE_WR(); PMAP_LOCK(pmap); error = moea64_pvo_enter(mmu, pmap, zone, pvo_head, va, VM_PAGE_TO_PHYS(m), pte_lo, pvo_flags, psind); PMAP_UNLOCK(pmap); UNLOCK_TABLE_WR(); if (error != ENOMEM) break; if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); VM_OBJECT_ASSERT_UNLOCKED(m->object); VM_WAIT; } /* * Flush the page from the instruction cache if this page is * mapped executable and cacheable. */ if (pmap != kernel_pmap && !(m->aflags & PGA_EXECUTABLE) && (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(mmu, pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); } return (KERN_SUCCESS); } static void moea64_syncicache(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_size_t sz) { /* * This is much trickier than on older systems because * we can't sync the icache on physical addresses directly * without a direct map. Instead we check a couple of cases * where the memory is already mapped in and, failing that, * use the same trick we use for page zeroing to create * a temporary mapping for this physical address. */ if (!pmap_bootstrapped) { /* * If PMAP is not bootstrapped, we are likely to be * in real mode. */ __syncicache((void *)pa, sz); } else if (pmap == kernel_pmap) { __syncicache((void *)va, sz); } else if (hw_direct_map) { __syncicache((void *)pa, sz); } else { /* Use the scratch page to set up a temp mapping */ mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 1, pa & ~ADDR_POFF); __syncicache((void *)(moea64_scratchpage_va[1] + (va & ADDR_POFF)), sz); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void moea64_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { moea64_enter(mmu, pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); m = TAILQ_NEXT(m, listq); } } void moea64_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { moea64_enter(mmu, pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); } vm_paddr_t moea64_extract(mmu_t mmu, pmap_t pm, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; PMAP_LOCK(pm); pvo = moea64_pvo_find_va(pm, va); if (pvo == NULL) pa = 0; else pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t moea64_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct pvo_entry *pvo; vm_page_t m; vm_paddr_t pa; m = NULL; pa = 0; PMAP_LOCK(pmap); retry: pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL && (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) && ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) == LPTE_RW || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN); vm_page_hold(m); } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } static mmu_t installed_mmu; static void * -moea64_uma_page_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, + int wait) { /* * This entire routine is a horrible hack to avoid bothering kmem * for new KVA addresses. Because this can get called from inside * kmem allocation routines, calling kmem for a new address here * can lead to multiply locking non-recursive mutexes. */ vm_offset_t va; vm_page_t m; int pflags, needed_lock; *flags = UMA_SLAB_PRIV; needed_lock = !PMAP_LOCKED(kernel_pmap); pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc(NULL, 0, pflags | VM_ALLOC_NOOBJ); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); VM_WAIT; } else break; } va = VM_PAGE_TO_PHYS(m); LOCK_TABLE_WR(); if (needed_lock) PMAP_LOCK(kernel_pmap); moea64_pvo_enter(installed_mmu, kernel_pmap, moea64_upvo_zone, NULL, va, VM_PAGE_TO_PHYS(m), LPTE_M, PVO_WIRED | PVO_BOOTSTRAP, 0); if (needed_lock) PMAP_UNLOCK(kernel_pmap); UNLOCK_TABLE_WR(); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero((void *)va, PAGE_SIZE); return (void *)va; } extern int elf32_nxstack; void moea64_init(mmu_t mmu) { CTR0(KTR_PMAP, "moea64_init"); moea64_upvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); moea64_mpvo_zone = uma_zcreate("MPVO entry", sizeof(struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); if (!hw_direct_map) { installed_mmu = mmu; uma_zone_set_allocf(moea64_upvo_zone,moea64_uma_page_alloc); uma_zone_set_allocf(moea64_mpvo_zone,moea64_uma_page_alloc); } #ifdef COMPAT_FREEBSD32 elf32_nxstack = 1; #endif moea64_initialized = TRUE; } boolean_t moea64_is_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_referenced: page %p is not managed", m)); return (moea64_query_bit(mmu, m, PTE_REF)); } boolean_t moea64_is_modified(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have LPTE_CHG set. */ VM_OBJECT_ASSERT_LOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (moea64_query_bit(mmu, m, LPTE_CHG)); } boolean_t moea64_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va) { struct pvo_entry *pvo; boolean_t rv; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); rv = pvo == NULL || (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) == 0; PMAP_UNLOCK(pmap); return (rv); } void moea64_clear_modify(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("moea64_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have LPTE_CHG * set. If the object containing the page is locked and the page is * not exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; moea64_clear_bit(mmu, m, LPTE_CHG); } /* * Clear the write and modified bits in each of the given page's mappings. */ void moea64_remove_write(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; uintptr_t pt; pmap_t pmap; uint64_t lo = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; powerpc_sync(); LOCK_TABLE_RD(); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) != LPTE_BR) { pt = MOEA64_PVO_TO_PTE(mmu, pvo); pvo->pvo_pte.lpte.pte_lo &= ~LPTE_PP; pvo->pvo_pte.lpte.pte_lo |= LPTE_BR; if (pt != -1) { MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte); lo |= pvo->pvo_pte.lpte.pte_lo; pvo->pvo_pte.lpte.pte_lo &= ~LPTE_CHG; MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte, pvo->pvo_vpn); if (pvo->pvo_pmap == kernel_pmap) isync(); } } if ((lo & LPTE_CHG) != 0) vm_page_dirty(m); PMAP_UNLOCK(pmap); } UNLOCK_TABLE_RD(); vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * moea64_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int moea64_ts_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_ts_referenced: page %p is not managed", m)); return (moea64_clear_bit(mmu, m, LPTE_REF)); } /* * Modify the WIMG settings of all mappings for a page. */ void moea64_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) { struct pvo_entry *pvo; struct pvo_head *pvo_head; uintptr_t pt; pmap_t pmap; uint64_t lo; if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; } pvo_head = vm_page_to_pvoh(m); lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma); LOCK_TABLE_RD(); LIST_FOREACH(pvo, pvo_head, pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); pt = MOEA64_PVO_TO_PTE(mmu, pvo); pvo->pvo_pte.lpte.pte_lo &= ~LPTE_WIMG; pvo->pvo_pte.lpte.pte_lo |= lo; if (pt != -1) { MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte, pvo->pvo_vpn); if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } UNLOCK_TABLE_RD(); m->md.mdpg_cache_attrs = ma; } /* * Map a wired page into kernel virtual address space. */ void moea64_kenter_attr(mmu_t mmu, vm_offset_t va, vm_offset_t pa, vm_memattr_t ma) { uint64_t pte_lo; int error; pte_lo = moea64_calc_wimg(pa, ma); LOCK_TABLE_WR(); PMAP_LOCK(kernel_pmap); error = moea64_pvo_enter(mmu, kernel_pmap, moea64_upvo_zone, NULL, va, pa, pte_lo, PVO_WIRED, 0); PMAP_UNLOCK(kernel_pmap); UNLOCK_TABLE_WR(); if (error != 0 && error != ENOENT) panic("moea64_kenter: failed to enter va %#zx pa %#zx: %d", va, pa, error); } void moea64_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) { moea64_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t moea64_kextract(mmu_t mmu, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; /* * Shortcut the direct-mapped case when applicable. We never put * anything but 1:1 mappings below VM_MIN_KERNEL_ADDRESS. */ if (va < VM_MIN_KERNEL_ADDRESS) return (va); PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, va); KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR, va)); pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(kernel_pmap); return (pa); } /* * Remove a wired page from kernel virtual address space. */ void moea64_kremove(mmu_t mmu, vm_offset_t va) { moea64_remove(mmu, kernel_pmap, va, va + PAGE_SIZE); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. We cannot and therefore do not; *virt is updated with the * first usable address after the mapped region. */ vm_offset_t moea64_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva, va; sva = *virt; va = sva; for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) moea64_kenter(mmu, va, pa_start); *virt = va; return (sva); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t moea64_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) { int loops; struct pvo_entry *pvo; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; LOCK_TABLE_RD(); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { if (pvo->pvo_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } UNLOCK_TABLE_RD(); return (rv); } /* * Return the number of managed mappings to the given physical page * that are wired. */ int moea64_page_wired_mappings(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); LOCK_TABLE_RD(); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) if ((pvo->pvo_vaddr & PVO_WIRED) != 0) count++; UNLOCK_TABLE_RD(); return (count); } static uintptr_t moea64_vsidcontext; uintptr_t moea64_get_unique_vsid(void) { u_int entropy; register_t hash; uint32_t mask; int i; entropy = 0; __asm __volatile("mftb %0" : "=r"(entropy)); mtx_lock(&moea64_slb_mutex); for (i = 0; i < NVSIDS; i += VSID_NBPW) { u_int n; /* * Create a new value by mutiplying by a prime and adding in * entropy from the timebase register. This is to make the * VSID more random so that the PT hash function collides * less often. (Note that the prime casues gcc to do shifts * instead of a multiply.) */ moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy; hash = moea64_vsidcontext & (NVSIDS - 1); if (hash == 0) /* 0 is special, avoid it */ continue; n = hash >> 5; mask = 1 << (hash & (VSID_NBPW - 1)); hash = (moea64_vsidcontext & VSID_HASHMASK); if (moea64_vsid_bitmap[n] & mask) { /* collision? */ /* anything free in this bucket? */ if (moea64_vsid_bitmap[n] == 0xffffffff) { entropy = (moea64_vsidcontext >> 20); continue; } i = ffs(~moea64_vsid_bitmap[n]) - 1; mask = 1 << i; hash &= VSID_HASHMASK & ~(VSID_NBPW - 1); hash |= i; } KASSERT(!(moea64_vsid_bitmap[n] & mask), ("Allocating in-use VSID %#zx\n", hash)); moea64_vsid_bitmap[n] |= mask; mtx_unlock(&moea64_slb_mutex); return (hash); } mtx_unlock(&moea64_slb_mutex); panic("%s: out of segments",__func__); } #ifdef __powerpc64__ void moea64_pinit(mmu_t mmu, pmap_t pmap) { RB_INIT(&pmap->pmap_pvo); pmap->pm_slb_tree_root = slb_alloc_tree(); pmap->pm_slb = slb_alloc_user_cache(); pmap->pm_slb_len = 0; } #else void moea64_pinit(mmu_t mmu, pmap_t pmap) { int i; uint32_t hash; RB_INIT(&pmap->pmap_pvo); if (pmap_bootstrapped) pmap->pmap_phys = (pmap_t)moea64_kextract(mmu, (vm_offset_t)pmap); else pmap->pmap_phys = pmap; /* * Allocate some segment registers for this pmap. */ hash = moea64_get_unique_vsid(); for (i = 0; i < 16; i++) pmap->pm_sr[i] = VSID_MAKE(i, hash); KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0")); } #endif /* * Initialize the pmap associated with process 0. */ void moea64_pinit0(mmu_t mmu, pmap_t pm) { PMAP_LOCK_INIT(pm); moea64_pinit(mmu, pm); bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Set the physical protection on the specified range of this map as requested. */ static void moea64_pvo_protect(mmu_t mmu, pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) { uintptr_t pt; struct vm_page *pg; uint64_t oldlo; PMAP_LOCK_ASSERT(pm, MA_OWNED); /* * Grab the PTE pointer before we diddle with the cached PTE * copy. */ pt = MOEA64_PVO_TO_PTE(mmu, pvo); /* * Change the protection of the page. */ oldlo = pvo->pvo_pte.lpte.pte_lo; pvo->pvo_pte.lpte.pte_lo &= ~LPTE_PP; pvo->pvo_pte.lpte.pte_lo &= ~LPTE_NOEXEC; if ((prot & VM_PROT_EXECUTE) == 0) pvo->pvo_pte.lpte.pte_lo |= LPTE_NOEXEC; if (prot & VM_PROT_WRITE) pvo->pvo_pte.lpte.pte_lo |= LPTE_BW; else pvo->pvo_pte.lpte.pte_lo |= LPTE_BR; pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN); /* * If the PVO is in the page table, update that pte as well. */ if (pt != -1) MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte, pvo->pvo_vpn); if (pm != kernel_pmap && pg != NULL && !(pg->aflags & PGA_EXECUTABLE) && (pvo->pvo_pte.lpte.pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); moea64_syncicache(mmu, pm, PVO_VADDR(pvo), pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN, PAGE_SIZE); } /* * Update vm about the REF/CHG bits if the page is managed and we have * removed write access. */ if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED && (oldlo & LPTE_PP) != LPTE_BR && !(prot & VM_PROT_WRITE)) { if (pg != NULL) { if (pvo->pvo_pte.lpte.pte_lo & LPTE_CHG) vm_page_dirty(pg); if (pvo->pvo_pte.lpte.pte_lo & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } } void moea64_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct pvo_entry *pvo, *tpvo, key; CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva, eva, prot); KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, ("moea64_protect: non current pmap")); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { moea64_remove(mmu, pm, sva, eva); return; } LOCK_TABLE_RD(); PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); moea64_pvo_protect(mmu, pm, pvo, prot); } UNLOCK_TABLE_RD(); PMAP_UNLOCK(pm); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void moea64_qenter(mmu_t mmu, vm_offset_t va, vm_page_t *m, int count) { while (count-- > 0) { moea64_kenter(mmu, va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by moea64_qenter. */ void moea64_qremove(mmu_t mmu, vm_offset_t va, int count) { while (count-- > 0) { moea64_kremove(mmu, va); va += PAGE_SIZE; } } void moea64_release_vsid(uint64_t vsid) { int idx, mask; mtx_lock(&moea64_slb_mutex); idx = vsid & (NVSIDS-1); mask = 1 << (idx % VSID_NBPW); idx /= VSID_NBPW; KASSERT(moea64_vsid_bitmap[idx] & mask, ("Freeing unallocated VSID %#jx", vsid)); moea64_vsid_bitmap[idx] &= ~mask; mtx_unlock(&moea64_slb_mutex); } void moea64_release(mmu_t mmu, pmap_t pmap) { /* * Free segment registers' VSIDs */ #ifdef __powerpc64__ slb_free_tree(pmap); slb_free_user_cache(pmap->pm_slb); #else KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0")); moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0])); #endif } /* * Remove all pages mapped by the specified pmap */ void moea64_remove_pages(mmu_t mmu, pmap_t pm) { struct pvo_entry *pvo, *tpvo; LOCK_TABLE_WR(); PMAP_LOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) { if (!(pvo->pvo_vaddr & PVO_WIRED)) moea64_pvo_remove(mmu, pvo); } UNLOCK_TABLE_WR(); PMAP_UNLOCK(pm); } /* * Remove the given range of addresses from the specified map. */ void moea64_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry *pvo, *tpvo, key; /* * Perform an unsynchronized read. This is, however, safe. */ if (pm->pm_stats.resident_count == 0) return; LOCK_TABLE_WR(); PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); moea64_pvo_remove(mmu, pvo); } UNLOCK_TABLE_WR(); PMAP_UNLOCK(pm); } /* * Remove physical page from all pmaps in which it resides. moea64_pvo_remove() * will reflect changes in pte's back to the vm_page. */ void moea64_remove_all(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo, *next_pvo; pmap_t pmap; LOCK_TABLE_WR(); LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); moea64_pvo_remove(mmu, pvo); PMAP_UNLOCK(pmap); } UNLOCK_TABLE_WR(); if ((m->aflags & PGA_WRITEABLE) && moea64_is_modified(mmu, m)) vm_page_dirty(m); vm_page_aflag_clear(m, PGA_WRITEABLE); vm_page_aflag_clear(m, PGA_EXECUTABLE); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from moea64_bootstrap before avail start and end are * calculated. */ vm_offset_t moea64_bootstrap_alloc(vm_size_t size, u_int align) { vm_offset_t s, e; int i, j; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (align != 0) s = (phys_avail[i] + align - 1) & ~(align - 1); else s = phys_avail[i]; e = s + size; if (s < phys_avail[i] || e > phys_avail[i + 1]) continue; if (s + size > platform_real_maxaddr()) continue; if (s == phys_avail[i]) { phys_avail[i] += size; } else if (e == phys_avail[i + 1]) { phys_avail[i + 1] -= size; } else { for (j = phys_avail_count * 2; j > i; j -= 2) { phys_avail[j] = phys_avail[j - 2]; phys_avail[j + 1] = phys_avail[j - 1]; } phys_avail[i + 3] = phys_avail[i + 1]; phys_avail[i + 1] = s; phys_avail[i + 2] = e; phys_avail_count++; } return (s); } panic("moea64_bootstrap_alloc: could not allocate memory"); } static int moea64_pvo_enter(mmu_t mmu, pmap_t pm, uma_zone_t zone, struct pvo_head *pvo_head, vm_offset_t va, vm_offset_t pa, uint64_t pte_lo, int flags, int8_t psind __unused) { struct pvo_entry *pvo; uintptr_t pt; uint64_t vsid; int first; u_int ptegidx; int i; int bootstrap; /* * One nasty thing that can happen here is that the UMA calls to * allocate new PVOs need to map more memory, which calls pvo_enter(), * which calls UMA... * * We break the loop by detecting recursion and allocating out of * the bootstrap pool. */ first = 0; bootstrap = (flags & PVO_BOOTSTRAP); if (!moea64_initialized) bootstrap = 1; PMAP_LOCK_ASSERT(pm, MA_OWNED); rw_assert(&moea64_table_lock, RA_WLOCKED); /* * Compute the PTE Group index. */ va &= ~ADDR_POFF; vsid = va_to_vsid(pm, va); ptegidx = va_to_pteg(vsid, va, flags & PVO_LARGE); /* * Remove any existing mapping for this page. Reuse the pvo entry if * there is a mapping. */ moea64_pvo_enter_calls++; LIST_FOREACH(pvo, &moea64_pvo_table[ptegidx], pvo_olink) { if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) { if ((pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) == pa && (pvo->pvo_pte.lpte.pte_lo & (LPTE_NOEXEC | LPTE_PP)) == (pte_lo & (LPTE_NOEXEC | LPTE_PP))) { /* * The physical page and protection are not * changing. Instead, this may be a request * to change the mapping's wired attribute. */ pt = -1; if ((flags & PVO_WIRED) != 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) { pt = MOEA64_PVO_TO_PTE(mmu, pvo); pvo->pvo_vaddr |= PVO_WIRED; pvo->pvo_pte.lpte.pte_hi |= LPTE_WIRED; pm->pm_stats.wired_count++; } else if ((flags & PVO_WIRED) == 0 && (pvo->pvo_vaddr & PVO_WIRED) != 0) { pt = MOEA64_PVO_TO_PTE(mmu, pvo); pvo->pvo_vaddr &= ~PVO_WIRED; pvo->pvo_pte.lpte.pte_hi &= ~LPTE_WIRED; pm->pm_stats.wired_count--; } if (!(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID)) { KASSERT(pt == -1, ("moea64_pvo_enter: valid pt")); /* Re-insert if spilled */ i = MOEA64_PTE_INSERT(mmu, ptegidx, &pvo->pvo_pte.lpte); if (i >= 0) PVO_PTEGIDX_SET(pvo, i); moea64_pte_overflow--; } else if (pt != -1) { /* * The PTE's wired attribute is not a * hardware feature, so there is no * need to invalidate any TLB entries. */ MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte, pvo->pvo_vpn); } return (0); } moea64_pvo_remove(mmu, pvo); break; } } /* * If we aren't overwriting a mapping, try to allocate. */ if (bootstrap) { if (moea64_bpvo_pool_index >= BPVO_POOL_SIZE) { panic("moea64_enter: bpvo pool exhausted, %d, %d, %zd", moea64_bpvo_pool_index, BPVO_POOL_SIZE, BPVO_POOL_SIZE * sizeof(struct pvo_entry)); } pvo = &moea64_bpvo_pool[moea64_bpvo_pool_index]; moea64_bpvo_pool_index++; bootstrap = 1; } else { pvo = uma_zalloc(zone, M_NOWAIT); } if (pvo == NULL) return (ENOMEM); moea64_pvo_entries++; pvo->pvo_vaddr = va; pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) | (vsid << 16); pvo->pvo_pmap = pm; LIST_INSERT_HEAD(&moea64_pvo_table[ptegidx], pvo, pvo_olink); pvo->pvo_vaddr &= ~ADDR_POFF; if (flags & PVO_WIRED) pvo->pvo_vaddr |= PVO_WIRED; if (pvo_head != NULL) pvo->pvo_vaddr |= PVO_MANAGED; if (bootstrap) pvo->pvo_vaddr |= PVO_BOOTSTRAP; if (flags & PVO_LARGE) pvo->pvo_vaddr |= PVO_LARGE; moea64_pte_create(&pvo->pvo_pte.lpte, vsid, va, (uint64_t)(pa) | pte_lo, flags); /* * Add to pmap list */ RB_INSERT(pvo_tree, &pm->pmap_pvo, pvo); /* * Remember if the list was empty and therefore will be the first * item. */ if (pvo_head != NULL) { if (LIST_FIRST(pvo_head) == NULL) first = 1; LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); } if (pvo->pvo_vaddr & PVO_WIRED) { pvo->pvo_pte.lpte.pte_hi |= LPTE_WIRED; pm->pm_stats.wired_count++; } pm->pm_stats.resident_count++; /* * We hope this succeeds but it isn't required. */ i = MOEA64_PTE_INSERT(mmu, ptegidx, &pvo->pvo_pte.lpte); if (i >= 0) { PVO_PTEGIDX_SET(pvo, i); } else { panic("moea64_pvo_enter: overflow"); moea64_pte_overflow++; } if (pm == kernel_pmap) isync(); #ifdef __powerpc64__ /* * Make sure all our bootstrap mappings are in the SLB as soon * as virtual memory is switched on. */ if (!pmap_bootstrapped) moea64_bootstrap_slb_prefault(va, flags & PVO_LARGE); #endif return (first ? ENOENT : 0); } static void moea64_pvo_remove(mmu_t mmu, struct pvo_entry *pvo) { struct vm_page *pg; uintptr_t pt; PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); rw_assert(&moea64_table_lock, RA_WLOCKED); /* * If there is an active pte entry, we need to deactivate it (and * save the ref & cfg bits). */ pt = MOEA64_PVO_TO_PTE(mmu, pvo); if (pt != -1) { MOEA64_PTE_UNSET(mmu, pt, &pvo->pvo_pte.lpte, pvo->pvo_vpn); PVO_PTEGIDX_CLR(pvo); } else { moea64_pte_overflow--; } /* * Update our statistics. */ pvo->pvo_pmap->pm_stats.resident_count--; if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count--; /* * Remove this PVO from the pmap list. */ RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Remove this from the overflow list and return it to the pool * if we aren't going to reuse it. */ LIST_REMOVE(pvo, pvo_olink); /* * Update vm about the REF/CHG bits if the page is managed. */ pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN); if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED && pg != NULL) { LIST_REMOVE(pvo, pvo_vlink); if ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) != LPTE_BR) { if (pvo->pvo_pte.lpte.pte_lo & LPTE_CHG) vm_page_dirty(pg); if (pvo->pvo_pte.lpte.pte_lo & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); if (LIST_EMPTY(vm_page_to_pvoh(pg))) vm_page_aflag_clear(pg, PGA_WRITEABLE); } if (LIST_EMPTY(vm_page_to_pvoh(pg))) vm_page_aflag_clear(pg, PGA_EXECUTABLE); } moea64_pvo_entries--; moea64_pvo_remove_calls++; if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) uma_zfree((pvo->pvo_vaddr & PVO_MANAGED) ? moea64_mpvo_zone : moea64_upvo_zone, pvo); } static struct pvo_entry * moea64_pvo_find_va(pmap_t pm, vm_offset_t va) { struct pvo_entry key; key.pvo_vaddr = va & ~ADDR_POFF; return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key)); } static boolean_t moea64_query_bit(mmu_t mmu, vm_page_t m, u_int64_t ptebit) { struct pvo_entry *pvo; uintptr_t pt; LOCK_TABLE_RD(); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { /* * See if we saved the bit off. If so, return success. */ if (pvo->pvo_pte.lpte.pte_lo & ptebit) { UNLOCK_TABLE_RD(); return (TRUE); } } /* * No luck, now go through the hard part of looking at the PTEs * themselves. Sync so that any pending REF/CHG bits are flushed to * the PTEs. */ powerpc_sync(); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { /* * See if this pvo has a valid PTE. if so, fetch the * REF/CHG bits from the valid PTE. If the appropriate * ptebit is set, return success. */ PMAP_LOCK(pvo->pvo_pmap); pt = MOEA64_PVO_TO_PTE(mmu, pvo); if (pt != -1) { MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte); if (pvo->pvo_pte.lpte.pte_lo & ptebit) { PMAP_UNLOCK(pvo->pvo_pmap); UNLOCK_TABLE_RD(); return (TRUE); } } PMAP_UNLOCK(pvo->pvo_pmap); } UNLOCK_TABLE_RD(); return (FALSE); } static u_int moea64_clear_bit(mmu_t mmu, vm_page_t m, u_int64_t ptebit) { u_int count; struct pvo_entry *pvo; uintptr_t pt; /* * Sync so that any pending REF/CHG bits are flushed to the PTEs (so * we can reset the right ones). note that since the pvo entries and * list heads are accessed via BAT0 and are never placed in the page * table, we don't have to worry about further accesses setting the * REF/CHG bits. */ powerpc_sync(); /* * For each pvo entry, clear the pvo's ptebit. If this pvo has a * valid pte clear the ptebit from the valid pte. */ count = 0; LOCK_TABLE_RD(); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { PMAP_LOCK(pvo->pvo_pmap); pt = MOEA64_PVO_TO_PTE(mmu, pvo); if (pt != -1) { MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte); if (pvo->pvo_pte.lpte.pte_lo & ptebit) { count++; MOEA64_PTE_CLEAR(mmu, pt, &pvo->pvo_pte.lpte, pvo->pvo_vpn, ptebit); } } pvo->pvo_pte.lpte.pte_lo &= ~ptebit; PMAP_UNLOCK(pvo->pvo_pmap); } UNLOCK_TABLE_RD(); return (count); } boolean_t moea64_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { struct pvo_entry *pvo, key; vm_offset_t ppa; int error = 0; PMAP_LOCK(kernel_pmap); key.pvo_vaddr = ppa = pa & ~ADDR_POFF; for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key); ppa < pa + size; ppa += PAGE_SIZE, pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) { if (pvo == NULL || (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) != ppa) { error = EFAULT; break; } } PMAP_UNLOCK(kernel_pmap); return (error); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * moea64_mapdev_attr(mmu_t mmu, vm_offset_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, ppa, offset; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("moea64_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { moea64_kenter_attr(mmu, tmpva, ppa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } return ((void *)(va + offset)); } void * moea64_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { return moea64_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT); } void moea64_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) { vm_offset_t base, offset; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); kva_free(base, size); } void moea64_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz) { struct pvo_entry *pvo; vm_offset_t lim; vm_paddr_t pa; vm_size_t len; PMAP_LOCK(pm); while (sz > 0) { lim = round_page(va); len = MIN(lim - va, sz); pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_pte.lpte.pte_lo & LPTE_I)) { pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | (va & ADDR_POFF); moea64_syncicache(mmu, pm, va, pa, len); } va += len; sz -= len; } PMAP_UNLOCK(pm); } vm_offset_t moea64_dumpsys_map(mmu_t mmu, struct pmap_md *md, vm_size_t ofs, vm_size_t *sz) { if (md->md_vaddr == ~0UL) return (md->md_paddr + ofs); else return (md->md_vaddr + ofs); } struct pmap_md * moea64_scan_md(mmu_t mmu, struct pmap_md *prev) { static struct pmap_md md; struct pvo_entry *pvo; vm_offset_t va; if (dumpsys_minidump) { md.md_paddr = ~0UL; /* Minidumps use virtual addresses. */ if (prev == NULL) { /* 1st: kernel .data and .bss. */ md.md_index = 1; md.md_vaddr = trunc_page((uintptr_t)_etext); md.md_size = round_page((uintptr_t)_end) - md.md_vaddr; return (&md); } switch (prev->md_index) { case 1: /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ md.md_index = 2; md.md_vaddr = (vm_offset_t)msgbufp->msg_ptr; md.md_size = round_page(msgbufp->msg_size); break; case 2: /* 3rd: kernel VM. */ va = prev->md_vaddr + prev->md_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo != NULL && (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID)) break; va += PAGE_SIZE; } if (va < virtual_end) { md.md_vaddr = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo == NULL || !(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID)) break; va += PAGE_SIZE; } md.md_size = va - md.md_vaddr; break; } md.md_index = 3; /* FALLTHROUGH */ default: return (NULL); } } else { /* minidumps */ if (prev == NULL) { /* first physical chunk. */ md.md_paddr = pregions[0].mr_start; md.md_size = pregions[0].mr_size; md.md_vaddr = ~0UL; md.md_index = 1; } else if (md.md_index < pregions_sz) { md.md_paddr = pregions[md.md_index].mr_start; md.md_size = pregions[md.md_index].mr_size; md.md_vaddr = ~0UL; md.md_index++; } else { /* There's no next physical chunk. */ return (NULL); } } return (&md); } Index: stable/10/sys/powerpc/aim/slb.c =================================================================== --- stable/10/sys/powerpc/aim/slb.c (revision 287944) +++ stable/10/sys/powerpc/aim/slb.c (revision 287945) @@ -1,537 +1,537 @@ /*- * Copyright (c) 2010 Nathan Whitehorn * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include uintptr_t moea64_get_unique_vsid(void); void moea64_release_vsid(uint64_t vsid); static void slb_zone_init(void *); static uma_zone_t slbt_zone; static uma_zone_t slb_cache_zone; int n_slbs = 64; SYSINIT(slb_zone_init, SI_SUB_KMEM, SI_ORDER_ANY, slb_zone_init, NULL); struct slbtnode { uint16_t ua_alloc; uint8_t ua_level; /* Only 36 bits needed for full 64-bit address space. */ uint64_t ua_base; union { struct slbtnode *ua_child[16]; struct slb slb_entries[16]; } u; }; /* * For a full 64-bit address space, there are 36 bits in play in an * esid, so 8 levels, with the leaf being at level 0. * * |3333|3322|2222|2222|1111|1111|11 | | | esid * |5432|1098|7654|3210|9876|5432|1098|7654|3210| bits * +----+----+----+----+----+----+----+----+----+-------- * | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | level */ #define UAD_ROOT_LEVEL 8 #define UAD_LEAF_LEVEL 0 static inline int esid2idx(uint64_t esid, int level) { int shift; shift = level * 4; return ((esid >> shift) & 0xF); } /* * The ua_base field should have 0 bits after the first 4*(level+1) * bits; i.e. only */ #define uad_baseok(ua) \ (esid2base(ua->ua_base, ua->ua_level) == ua->ua_base) static inline uint64_t esid2base(uint64_t esid, int level) { uint64_t mask; int shift; shift = (level + 1) * 4; mask = ~((1ULL << shift) - 1); return (esid & mask); } /* * Allocate a new leaf node for the specified esid/vmhandle from the * parent node. */ static struct slb * make_new_leaf(uint64_t esid, uint64_t slbv, struct slbtnode *parent) { struct slbtnode *child; struct slb *retval; int idx; idx = esid2idx(esid, parent->ua_level); KASSERT(parent->u.ua_child[idx] == NULL, ("Child already exists!")); /* unlock and M_WAITOK and loop? */ child = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO); KASSERT(child != NULL, ("unhandled NULL case")); child->ua_level = UAD_LEAF_LEVEL; child->ua_base = esid2base(esid, child->ua_level); idx = esid2idx(esid, child->ua_level); child->u.slb_entries[idx].slbv = slbv; child->u.slb_entries[idx].slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; setbit(&child->ua_alloc, idx); retval = &child->u.slb_entries[idx]; /* * The above stores must be visible before the next one, so * that a lockless searcher always sees a valid path through * the tree. */ mb(); idx = esid2idx(esid, parent->ua_level); parent->u.ua_child[idx] = child; setbit(&parent->ua_alloc, idx); return (retval); } /* * Allocate a new intermediate node to fit between the parent and * esid. */ static struct slbtnode* make_intermediate(uint64_t esid, struct slbtnode *parent) { struct slbtnode *child, *inter; int idx, level; idx = esid2idx(esid, parent->ua_level); child = parent->u.ua_child[idx]; KASSERT(esid2base(esid, child->ua_level) != child->ua_base, ("No need for an intermediate node?")); /* * Find the level where the existing child and our new esid * meet. It must be lower than parent->ua_level or we would * have chosen a different index in parent. */ level = child->ua_level + 1; while (esid2base(esid, level) != esid2base(child->ua_base, level)) level++; KASSERT(level < parent->ua_level, ("Found splitting level %d for %09jx and %09jx, " "but it's the same as %p's", level, esid, child->ua_base, parent)); /* unlock and M_WAITOK and loop? */ inter = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO); KASSERT(inter != NULL, ("unhandled NULL case")); /* Set up intermediate node to point to child ... */ inter->ua_level = level; inter->ua_base = esid2base(esid, inter->ua_level); idx = esid2idx(child->ua_base, inter->ua_level); inter->u.ua_child[idx] = child; setbit(&inter->ua_alloc, idx); mb(); /* Set up parent to point to intermediate node ... */ idx = esid2idx(inter->ua_base, parent->ua_level); parent->u.ua_child[idx] = inter; setbit(&parent->ua_alloc, idx); return (inter); } uint64_t kernel_va_to_slbv(vm_offset_t va) { uint64_t slbv; /* Set kernel VSID to deterministic value */ slbv = (KERNEL_VSID((uintptr_t)va >> ADDR_SR_SHFT)) << SLBV_VSID_SHIFT; /* Figure out if this is a large-page mapping */ if (hw_direct_map && va < VM_MIN_KERNEL_ADDRESS) { /* * XXX: If we have set up a direct map, assumes * all physical memory is mapped with large pages. */ if (mem_valid(va, 0) == 0) slbv |= SLBV_L; } return (slbv); } struct slb * user_va_to_slb_entry(pmap_t pm, vm_offset_t va) { uint64_t esid = va >> ADDR_SR_SHFT; struct slbtnode *ua; int idx; ua = pm->pm_slb_tree_root; for (;;) { KASSERT(uad_baseok(ua), ("uad base %016jx level %d bad!", ua->ua_base, ua->ua_level)); idx = esid2idx(esid, ua->ua_level); /* * This code is specific to ppc64 where a load is * atomic, so no need for atomic_load macro. */ if (ua->ua_level == UAD_LEAF_LEVEL) return ((ua->u.slb_entries[idx].slbe & SLBE_VALID) ? &ua->u.slb_entries[idx] : NULL); ua = ua->u.ua_child[idx]; if (ua == NULL || esid2base(esid, ua->ua_level) != ua->ua_base) return (NULL); } return (NULL); } uint64_t va_to_vsid(pmap_t pm, vm_offset_t va) { struct slb *entry; /* Shortcut kernel case */ if (pm == kernel_pmap) return (KERNEL_VSID((uintptr_t)va >> ADDR_SR_SHFT)); /* * If there is no vsid for this VA, we need to add a new entry * to the PMAP's segment table. */ entry = user_va_to_slb_entry(pm, va); if (entry == NULL) return (allocate_user_vsid(pm, (uintptr_t)va >> ADDR_SR_SHFT, 0)); return ((entry->slbv & SLBV_VSID_MASK) >> SLBV_VSID_SHIFT); } uint64_t allocate_user_vsid(pmap_t pm, uint64_t esid, int large) { uint64_t vsid, slbv; struct slbtnode *ua, *next, *inter; struct slb *slb; int idx; KASSERT(pm != kernel_pmap, ("Attempting to allocate a kernel VSID")); PMAP_LOCK_ASSERT(pm, MA_OWNED); vsid = moea64_get_unique_vsid(); slbv = vsid << SLBV_VSID_SHIFT; if (large) slbv |= SLBV_L; ua = pm->pm_slb_tree_root; /* Descend to the correct leaf or NULL pointer. */ for (;;) { KASSERT(uad_baseok(ua), ("uad base %09jx level %d bad!", ua->ua_base, ua->ua_level)); idx = esid2idx(esid, ua->ua_level); if (ua->ua_level == UAD_LEAF_LEVEL) { ua->u.slb_entries[idx].slbv = slbv; eieio(); ua->u.slb_entries[idx].slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; setbit(&ua->ua_alloc, idx); slb = &ua->u.slb_entries[idx]; break; } next = ua->u.ua_child[idx]; if (next == NULL) { slb = make_new_leaf(esid, slbv, ua); break; } /* * Check if the next item down has an okay ua_base. * If not, we need to allocate an intermediate node. */ if (esid2base(esid, next->ua_level) != next->ua_base) { inter = make_intermediate(esid, ua); slb = make_new_leaf(esid, slbv, inter); break; } ua = next; } /* * Someone probably wants this soon, and it may be a wired * SLB mapping, so pre-spill this entry. */ eieio(); slb_insert_user(pm, slb); return (vsid); } void free_vsid(pmap_t pm, uint64_t esid, int large) { struct slbtnode *ua; int idx; PMAP_LOCK_ASSERT(pm, MA_OWNED); ua = pm->pm_slb_tree_root; /* Descend to the correct leaf. */ for (;;) { KASSERT(uad_baseok(ua), ("uad base %09jx level %d bad!", ua->ua_base, ua->ua_level)); idx = esid2idx(esid, ua->ua_level); if (ua->ua_level == UAD_LEAF_LEVEL) { ua->u.slb_entries[idx].slbv = 0; eieio(); ua->u.slb_entries[idx].slbe = 0; clrbit(&ua->ua_alloc, idx); return; } ua = ua->u.ua_child[idx]; if (ua == NULL || esid2base(esid, ua->ua_level) != ua->ua_base) { /* Perhaps just return instead of assert? */ KASSERT(0, ("Asked to remove an entry that was never inserted!")); return; } } } static void free_slb_tree_node(struct slbtnode *ua) { int idx; for (idx = 0; idx < 16; idx++) { if (ua->ua_level != UAD_LEAF_LEVEL) { if (ua->u.ua_child[idx] != NULL) free_slb_tree_node(ua->u.ua_child[idx]); } else { if (ua->u.slb_entries[idx].slbv != 0) moea64_release_vsid(ua->u.slb_entries[idx].slbv >> SLBV_VSID_SHIFT); } } uma_zfree(slbt_zone, ua); } void slb_free_tree(pmap_t pm) { free_slb_tree_node(pm->pm_slb_tree_root); } struct slbtnode * slb_alloc_tree(void) { struct slbtnode *root; root = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO); root->ua_level = UAD_ROOT_LEVEL; return (root); } /* Lock entries mapping kernel text and stacks */ void slb_insert_kernel(uint64_t slbe, uint64_t slbv) { struct slb *slbcache; int i; /* We don't want to be preempted while modifying the kernel map */ critical_enter(); slbcache = PCPU_GET(slb); /* Check for an unused slot, abusing the user slot as a full flag */ if (slbcache[USER_SLB_SLOT].slbe == 0) { for (i = 0; i < n_slbs; i++) { if (i == USER_SLB_SLOT) continue; if (!(slbcache[i].slbe & SLBE_VALID)) goto fillkernslb; } if (i == n_slbs) slbcache[USER_SLB_SLOT].slbe = 1; } i = mftb() % n_slbs; if (i == USER_SLB_SLOT) i = (i+1) % n_slbs; fillkernslb: KASSERT(i != USER_SLB_SLOT, ("Filling user SLB slot with a kernel mapping")); slbcache[i].slbv = slbv; slbcache[i].slbe = slbe | (uint64_t)i; /* If it is for this CPU, put it in the SLB right away */ if (pmap_bootstrapped) { /* slbie not required */ __asm __volatile ("slbmte %0, %1" :: "r"(slbcache[i].slbv), "r"(slbcache[i].slbe)); } critical_exit(); } void slb_insert_user(pmap_t pm, struct slb *slb) { int i; PMAP_LOCK_ASSERT(pm, MA_OWNED); if (pm->pm_slb_len < n_slbs) { i = pm->pm_slb_len; pm->pm_slb_len++; } else { i = mftb() % n_slbs; } /* Note that this replacement is atomic with respect to trap_subr */ pm->pm_slb[i] = slb; } static void * -slb_uma_real_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { static vm_offset_t realmax = 0; void *va; vm_page_t m; int pflags; if (realmax == 0) realmax = platform_real_maxaddr(); *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc_contig(NULL, 0, pflags, 1, 0, realmax, PAGE_SIZE, PAGE_SIZE, VM_MEMATTR_DEFAULT); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); VM_WAIT; } else break; } va = (void *) VM_PAGE_TO_PHYS(m); if (!hw_direct_map) pmap_kenter((vm_offset_t)va, VM_PAGE_TO_PHYS(m)); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } static void slb_zone_init(void *dummy) { slbt_zone = uma_zcreate("SLB tree node", sizeof(struct slbtnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); slb_cache_zone = uma_zcreate("SLB cache", (n_slbs + 1)*sizeof(struct slb *), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); if (platform_real_maxaddr() != VM_MAX_ADDRESS) { uma_zone_set_allocf(slb_cache_zone, slb_uma_real_alloc); uma_zone_set_allocf(slbt_zone, slb_uma_real_alloc); } } struct slb ** slb_alloc_user_cache(void) { return (uma_zalloc(slb_cache_zone, M_ZERO)); } void slb_free_user_cache(struct slb **slb) { uma_zfree(slb_cache_zone, slb); } Index: stable/10/sys/powerpc/aim/uma_machdep.c =================================================================== --- stable/10/sys/powerpc/aim/uma_machdep.c (revision 287944) +++ stable/10/sys/powerpc/aim/uma_machdep.c (revision 287945) @@ -1,98 +1,98 @@ /*- * Copyright (c) 2003 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int hw_uma_mdpages; SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw_uma_mdpages, 0, "UMA MD pages in use"); void * -uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { void *va; vm_page_t m; int pflags; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc(NULL, 0, pflags | VM_ALLOC_NOOBJ); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); VM_WAIT; } else break; } va = (void *) VM_PAGE_TO_PHYS(m); if (!hw_direct_map) pmap_kenter((vm_offset_t)va, VM_PAGE_TO_PHYS(m)); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); atomic_add_int(&hw_uma_mdpages, 1); return (va); } void -uma_small_free(void *mem, int size, u_int8_t flags) +uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; if (!hw_direct_map) pmap_remove(kernel_pmap,(vm_offset_t)mem, (vm_offset_t)mem + PAGE_SIZE); m = PHYS_TO_VM_PAGE((vm_offset_t)mem); m->wire_count--; vm_page_free(m); atomic_subtract_int(&cnt.v_wire_count, 1); atomic_subtract_int(&hw_uma_mdpages, 1); } Index: stable/10/sys/sparc64/sparc64/vm_machdep.c =================================================================== --- stable/10/sys/sparc64/sparc64/vm_machdep.c (revision 287944) +++ stable/10/sys/sparc64/sparc64/vm_machdep.c (revision 287945) @@ -1,552 +1,552 @@ /*- * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * Copyright (c) 1994 John Dyson * Copyright (c) 2001 Jake Burkholder. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ * from: FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.167 2001/07/12 */ #include __FBSDID("$FreeBSD$"); #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef NSFBUFS #define NSFBUFS (512 + maxusers * 16) #endif static int nsfbufs; static int nsfbufspeak; static int nsfbufsused; SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, "Maximum number of sendfile(2) sf_bufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, "Number of sendfile(2) sf_bufs at peak usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, "Number of sendfile(2) sf_bufs in use"); static void sf_buf_init(void *arg); SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL); /* * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the * sf_freelist head with the sf_lock mutex. */ static struct { SLIST_HEAD(, sf_buf) sf_head; struct mtx sf_lock; } sf_freelist; static u_int sf_buf_alloc_want; PMAP_STATS_VAR(uma_nsmall_alloc); PMAP_STATS_VAR(uma_nsmall_alloc_oc); PMAP_STATS_VAR(uma_nsmall_free); void cpu_exit(struct thread *td) { struct proc *p; p = td->td_proc; p->p_md.md_sigtramp = NULL; if (p->p_md.md_utrap != NULL) { utrap_free(p->p_md.md_utrap); p->p_md.md_utrap = NULL; } } void cpu_thread_exit(struct thread *td) { } void cpu_thread_clean(struct thread *td) { } void cpu_thread_alloc(struct thread *td) { struct pcb *pcb; pcb = (struct pcb *)((td->td_kstack + td->td_kstack_pages * PAGE_SIZE - sizeof(struct pcb)) & ~0x3fUL); pcb->pcb_nsaved = 0; td->td_frame = (struct trapframe *)pcb - 1; td->td_pcb = pcb; } void cpu_thread_free(struct thread *td) { } void cpu_thread_swapin(struct thread *td) { } void cpu_thread_swapout(struct thread *td) { } void cpu_set_syscall_retval(struct thread *td, int error) { switch (error) { case 0: td->td_frame->tf_out[0] = td->td_retval[0]; td->td_frame->tf_out[1] = td->td_retval[1]; td->td_frame->tf_tstate &= ~TSTATE_XCC_C; break; case ERESTART: /* * Undo the tpc advancement we have done on syscall * enter, we want to reexecute the system call. */ td->td_frame->tf_tpc = td->td_pcb->pcb_tpc; td->td_frame->tf_tnpc -= 4; break; case EJUSTRETURN: break; default: if (td->td_proc->p_sysent->sv_errsize) { if (error >= td->td_proc->p_sysent->sv_errsize) error = -1; /* XXX */ else error = td->td_proc->p_sysent->sv_errtbl[error]; } td->td_frame->tf_out[0] = error; td->td_frame->tf_tstate |= TSTATE_XCC_C; break; } } void cpu_set_upcall(struct thread *td, struct thread *td0) { struct trapframe *tf; struct frame *fr; struct pcb *pcb; bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe)); pcb = td->td_pcb; tf = td->td_frame; fr = (struct frame *)tf - 1; fr->fr_local[0] = (u_long)fork_return; fr->fr_local[1] = (u_long)td; fr->fr_local[2] = (u_long)tf; pcb->pcb_pc = (u_long)fork_trampoline - 8; pcb->pcb_sp = (u_long)fr - SPOFF; /* Setup to release the spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_pil = 0; } void cpu_set_upcall_kse(struct thread *td, void (*entry)(void *), void *arg, stack_t *stack) { struct trapframe *tf; uint64_t sp; if (td == curthread) flushw(); tf = td->td_frame; sp = (uint64_t)stack->ss_sp + stack->ss_size; tf->tf_out[0] = (uint64_t)arg; tf->tf_out[6] = sp - SPOFF - sizeof(struct frame); tf->tf_tpc = (uint64_t)entry; tf->tf_tnpc = tf->tf_tpc + 4; td->td_retval[0] = tf->tf_out[0]; td->td_retval[1] = tf->tf_out[1]; } int cpu_set_user_tls(struct thread *td, void *tls_base) { if (td == curthread) flushw(); td->td_frame->tf_global[7] = (uint64_t)tls_base; return (0); } /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the pcb, set up the stack so that the child * ready to run and return to user mode. */ void cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags) { struct trapframe *tf; struct frame *fp; struct pcb *pcb1; struct pcb *pcb2; vm_offset_t sp; int error; int i; KASSERT(td1 == curthread || td1 == &thread0, ("cpu_fork: p1 not curproc and not proc0")); if ((flags & RFPROC) == 0) return; p2->p_md.md_sigtramp = td1->td_proc->p_md.md_sigtramp; p2->p_md.md_utrap = utrap_hold(td1->td_proc->p_md.md_utrap); /* The pcb must be aligned on a 64-byte boundary. */ pcb1 = td1->td_pcb; pcb2 = (struct pcb *)((td2->td_kstack + td2->td_kstack_pages * PAGE_SIZE - sizeof(struct pcb)) & ~0x3fUL); td2->td_pcb = pcb2; /* * Ensure that p1's pcb is up to date. */ critical_enter(); if ((td1->td_frame->tf_fprs & FPRS_FEF) != 0) savefpctx(pcb1->pcb_ufp); critical_exit(); /* Make sure the copied windows are spilled. */ flushw(); /* Copy the pcb (this will copy the windows saved in the pcb, too). */ bcopy(pcb1, pcb2, sizeof(*pcb1)); /* * If we're creating a new user process and we're sharing the address * space, the parent's top most frame must be saved in the pcb. The * child will pop the frame when it returns to user mode, and may * overwrite it with its own data causing much suffering for the * parent. We check if its already in the pcb, and if not copy it * in. Its unlikely that the copyin will fail, but if so there's not * much we can do. The parent will likely crash soon anyway in that * case. */ if ((flags & RFMEM) != 0 && td1 != &thread0) { sp = td1->td_frame->tf_sp; for (i = 0; i < pcb1->pcb_nsaved; i++) { if (pcb1->pcb_rwsp[i] == sp) break; } if (i == pcb1->pcb_nsaved) { error = copyin((caddr_t)sp + SPOFF, &pcb1->pcb_rw[i], sizeof(struct rwindow)); if (error == 0) { pcb1->pcb_rwsp[i] = sp; pcb1->pcb_nsaved++; } } } /* * Create a new fresh stack for the new process. * Copy the trap frame for the return to user mode as if from a * syscall. This copies most of the user mode register values. */ tf = (struct trapframe *)pcb2 - 1; bcopy(td1->td_frame, tf, sizeof(*tf)); tf->tf_out[0] = 0; /* Child returns zero */ tf->tf_out[1] = 0; tf->tf_tstate &= ~TSTATE_XCC_C; /* success */ tf->tf_fprs = 0; td2->td_frame = tf; fp = (struct frame *)tf - 1; fp->fr_local[0] = (u_long)fork_return; fp->fr_local[1] = (u_long)td2; fp->fr_local[2] = (u_long)tf; /* Terminate stack traces at this frame. */ fp->fr_pc = fp->fr_fp = 0; pcb2->pcb_sp = (u_long)fp - SPOFF; pcb2->pcb_pc = (u_long)fork_trampoline - 8; /* Setup to release the spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_pil = 0; /* * Now, cpu_switch() can schedule the new process. */ } void cpu_reset(void) { static char bspec[64] = ""; phandle_t chosen; static struct { cell_t name; cell_t nargs; cell_t nreturns; cell_t bootspec; } args = { (cell_t)"boot", 1, 0, (cell_t)bspec }; if ((chosen = OF_finddevice("/chosen")) != -1) { if (OF_getprop(chosen, "bootpath", bspec, sizeof(bspec)) == -1) bspec[0] = '\0'; bspec[sizeof(bspec) - 1] = '\0'; } cpu_shutdown(&args); } /* * Intercept the return address from a freshly forked process that has NOT * been scheduled yet. * * This is needed to make kernel threads stay in kernel mode. */ void cpu_set_fork_handler(struct thread *td, void (*func)(void *), void *arg) { struct frame *fp; struct pcb *pcb; pcb = td->td_pcb; fp = (struct frame *)(pcb->pcb_sp + SPOFF); fp->fr_local[0] = (u_long)func; fp->fr_local[1] = (u_long)arg; } int is_physical_memory(vm_paddr_t addr) { struct ofw_mem_region *mr; for (mr = sparc64_memreg; mr < sparc64_memreg + sparc64_nmemreg; mr++) if (addr >= mr->mr_start && addr < mr->mr_start + mr->mr_size) return (1); return (0); } /* * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) */ static void sf_buf_init(void *arg) { struct sf_buf *sf_bufs; vm_offset_t sf_base; int i; nsfbufs = NSFBUFS; TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF); SLIST_INIT(&sf_freelist.sf_head); sf_base = kva_alloc(nsfbufs * PAGE_SIZE); sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT | M_ZERO); for (i = 0; i < nsfbufs; i++) { sf_bufs[i].kva = sf_base + i * PAGE_SIZE; SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list); } sf_buf_alloc_want = 0; } /* * Get an sf_buf from the freelist. Will block if none are available. */ struct sf_buf * sf_buf_alloc(struct vm_page *m, int flags) { struct sf_buf *sf; int error; mtx_lock(&sf_freelist.sf_lock); while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) { if (flags & SFB_NOWAIT) break; sf_buf_alloc_want++; SFSTAT_INC(sf_allocwait); error = msleep(&sf_freelist, &sf_freelist.sf_lock, (flags & SFB_CATCH) ? PCATCH | PVM : PVM, "sfbufa", 0); sf_buf_alloc_want--; /* * If we got a signal, don't risk going back to sleep. */ if (error) break; } if (sf != NULL) { SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list); sf->m = m; nsfbufsused++; nsfbufspeak = imax(nsfbufspeak, nsfbufsused); pmap_qenter(sf->kva, &sf->m, 1); } mtx_unlock(&sf_freelist.sf_lock); return (sf); } /* * Release resources back to the system. */ void sf_buf_free(struct sf_buf *sf) { pmap_qremove(sf->kva, 1); mtx_lock(&sf_freelist.sf_lock); SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list); nsfbufsused--; if (sf_buf_alloc_want > 0) wakeup(&sf_freelist); mtx_unlock(&sf_freelist.sf_lock); } void swi_vm(void *v) { /* Nothing to do here - busdma bounce buffers are not implemented. */ } void * -uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { vm_paddr_t pa; vm_page_t m; int pflags; void *va; PMAP_STATS_INC(uma_nsmall_alloc); *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc(NULL, 0, pflags | VM_ALLOC_NOOBJ); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else VM_WAIT; } else break; } pa = VM_PAGE_TO_PHYS(m); if (dcache_color_ignore == 0 && m->md.color != DCACHE_COLOR(pa)) { KASSERT(m->md.colors[0] == 0 && m->md.colors[1] == 0, ("uma_small_alloc: free page %p still has mappings!", m)); PMAP_STATS_INC(uma_nsmall_alloc_oc); m->md.color = DCACHE_COLOR(pa); dcache_page_inval(pa); } va = (void *)TLB_PHYS_TO_DIRECT(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) cpu_block_zero(va, PAGE_SIZE); return (va); } void -uma_small_free(void *mem, int size, u_int8_t flags) +uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; PMAP_STATS_INC(uma_nsmall_free); m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS((vm_offset_t)mem)); m->wire_count--; vm_page_free(m); atomic_subtract_int(&cnt.v_wire_count, 1); } Index: stable/10/sys/sys/busdma_bufalloc.h =================================================================== --- stable/10/sys/sys/busdma_bufalloc.h (revision 287944) +++ stable/10/sys/sys/busdma_bufalloc.h (revision 287945) @@ -1,118 +1,119 @@ /*- * Copyright (c) 2012 Ian Lepore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ */ /* * A buffer pool manager, for use by a platform's busdma implementation. */ #ifndef _MACHINE_BUSDMA_BUFALLOC_H_ #define _MACHINE_BUSDMA_BUFALLOC_H_ #include #include /* * Information about a buffer zone, returned by busdma_bufalloc_findzone(). */ struct busdma_bufzone { bus_size_t size; uma_zone_t umazone; char name[24]; }; /* * Opaque handle type returned by busdma_bufalloc_create(). */ struct busdma_bufalloc; typedef struct busdma_bufalloc *busdma_bufalloc_t; /* * Create an allocator that manages a pool of DMA buffers. * * The allocator manages a collection of uma(9) zones of buffers in power-of-two * sized increments ranging from minimum_alignment to the platform's PAGE_SIZE. * The buffers within each zone are aligned on boundaries corresponding to the * buffer size, and thus by implication each buffer is contiguous within a page * and does not cross a power of two boundary larger than the buffer size. * These rules are intended to make it easy for a busdma implementation to * check whether a tag's constraints allow use of a buffer from the allocator. * * minimum_alignment is also the minimum buffer allocation size. For platforms * with software-assisted cache coherency, this is typically the data cache line * size (and MUST not be smaller than the cache line size). * * name appears in zone stats as 'dma name nnnnn' where 'dma' is fixed and * 'nnnnn' is the size of buffers in that zone. * * If if the alloc/free function pointers are NULL, the regular uma internal * allocators are used (I.E., you get "plain old kernel memory"). On a platform * with an exclusion zone that applies to all DMA operations, a custom allocator * could be used to ensure no buffer memory is ever allocated from that zone, * allowing the bus_dmamem_alloc() implementation to make the assumption that * buffers provided by the allocation could never lead to the need for a bounce. */ busdma_bufalloc_t busdma_bufalloc_create(const char *name, bus_size_t minimum_alignment, uma_alloc uma_alloc_func, uma_free uma_free_func, u_int32_t uma_zcreate_flags); /* * Destroy an allocator created by busdma_bufalloc_create(). * Safe to call with a NULL pointer. */ void busdma_bufalloc_destroy(busdma_bufalloc_t ba); /* * Return a pointer to the busdma_bufzone that should be used to allocate or * free a buffer of the given size. Returns NULL if the size is larger than the * largest zone handled by the allocator. */ struct busdma_bufzone * busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size); /* * These built-in allocation routines are available for managing a pools of * uncacheable memory on platforms that support VM_MEMATTR_UNCACHEABLE. * * Allocation is done using kmem_alloc_attr() with these parameters: * lowaddr = 0 * highaddr = BUS_SPACE_MAXADDR * memattr = VM_MEMATTR_UNCACHEABLE. * * If your platform has no exclusion region (lowaddr/highaddr), and its pmap * routines support pmap_page_set_memattr() and the VM_MEMATTR_UNCACHEABLE flag * you can probably use these when you need uncacheable buffers. */ -void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, - u_int8_t *pflag, int wait); -void busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag); +void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, + uint8_t *pflag, int wait); +void busdma_bufalloc_free_uncacheable(void *item, vm_size_t size, + uint8_t pflag); #endif /* _MACHINE_BUSDMA_BUFALLOC_H_ */ Index: stable/10/sys/vm/uma.h =================================================================== --- stable/10/sys/vm/uma.h (revision 287944) +++ stable/10/sys/vm/uma.h (revision 287945) @@ -1,695 +1,696 @@ /*- * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ /* * uma.h - External definitions for the Universal Memory Allocator * */ #ifndef _VM_UMA_H_ #define _VM_UMA_H_ #include /* For NULL */ #include /* For M_* */ /* User visible parameters */ #define UMA_SMALLEST_UNIT (PAGE_SIZE / 256) /* Smallest item allocated */ /* Types and type defs */ struct uma_zone; /* Opaque type used as a handle to the zone */ typedef struct uma_zone * uma_zone_t; void zone_drain(uma_zone_t); /* * Item constructor * * Arguments: * item A pointer to the memory which has been allocated. * arg The arg field passed to uma_zalloc_arg * size The size of the allocated item * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The constructor is called just before the memory is returned * to the user. It may block if necessary. */ typedef int (*uma_ctor)(void *mem, int size, void *arg, int flags); /* * Item destructor * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being destructed. * arg Argument passed through uma_zfree_arg * * Returns: * Nothing * * Discussion: * The destructor may perform operations that differ from those performed * by the initializer, but it must leave the object in the same state. * This IS type stable storage. This is called after EVERY zfree call. */ typedef void (*uma_dtor)(void *mem, int size, void *arg); /* * Item initializer * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being initialized. * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The initializer is called when the memory is cached in the uma zone. * The initializer and the destructor should leave the object in the same * state. */ typedef int (*uma_init)(void *mem, int size, int flags); /* * Item discard function * * Arguments: * item A pointer to memory which has been 'freed' but has not left the * zone's cache. * size The size of the item being discarded. * * Returns: * Nothing * * Discussion: * This routine is called when memory leaves a zone and is returned to the * system for other uses. It is the counter-part to the init function. */ typedef void (*uma_fini)(void *mem, int size); /* * Import new memory into a cache zone. */ typedef int (*uma_import)(void *arg, void **store, int count, int flags); /* * Free memory from a cache zone. */ typedef void (*uma_release)(void *arg, void **store, int count); /* * What's the difference between initializing and constructing? * * The item is initialized when it is cached, and this is the state that the * object should be in when returned to the allocator. The purpose of this is * to remove some code which would otherwise be called on each allocation by * utilizing a known, stable state. This differs from the constructor which * will be called on EVERY allocation. * * For example, in the initializer you may want to initialize embedded locks, * NULL list pointers, set up initial states, magic numbers, etc. This way if * the object is held in the allocator and re-used it won't be necessary to * re-initialize it. * * The constructor may be used to lock a data structure, link it on to lists, * bump reference counts or total counts of outstanding structures, etc. * */ /* Function proto types */ /* * Create a new uma zone * * Arguments: * name The text name of the zone for debugging and stats. This memory * should not be freed until the zone has been deallocated. * size The size of the object that is being created. * ctor The constructor that is called when the object is allocated. * dtor The destructor that is called when the object is freed. * init An initializer that sets up the initial state of the memory. * fini A discard function that undoes initialization done by init. * ctor/dtor/init/fini may all be null, see notes above. * align A bitmask that corresponds to the requested alignment * eg 4 would be 0x3 * flags A set of parameters that control the behavior of the zone. * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, uint32_t flags); /* * Create a secondary uma zone * * Arguments: * name The text name of the zone for debugging and stats. This memory * should not be freed until the zone has been deallocated. * ctor The constructor that is called when the object is allocated. * dtor The destructor that is called when the object is freed. * zinit An initializer that sets up the initial state of the memory * as the object passes from the Keg's slab to the Zone's cache. * zfini A discard function that undoes initialization done by init * as the object passes from the Zone's cache to the Keg's slab. * * ctor/dtor/zinit/zfini may all be null, see notes above. * Note that the zinit and zfini specified here are NOT * exactly the same as the init/fini specified to uma_zcreate() * when creating a master zone. These zinit/zfini are called * on the TRANSITION from keg to zone (and vice-versa). Once * these are set, the primary zone may alter its init/fini * (which are called when the object passes from VM to keg) * using uma_zone_set_init/fini()) as well as its own * zinit/zfini (unset by default for master zone) with * uma_zone_set_zinit/zfini() (note subtle 'z' prefix). * * master A reference to this zone's Master Zone (Primary Zone), * which contains the backing Keg for the Secondary Zone * being added. * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t master); /* * Add a second master to a secondary zone. This provides multiple data * backends for objects with the same size. Both masters must have * compatible allocation flags. Presently, UMA_ZONE_MALLOC type zones are * the only supported. * * Returns: * Error on failure, 0 on success. */ int uma_zsecond_add(uma_zone_t zone, uma_zone_t master); /* * Create cache-only zones. * * This allows uma's per-cpu cache facilities to handle arbitrary * pointers. Consumers must specify the import and release functions to * fill and destroy caches. UMA does not allocate any memory for these * zones. The 'arg' parameter is passed to import/release and is caller * specific. */ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease, void *arg, int flags); /* * Definitions for uma_zcreate flags * * These flags share space with UMA_ZFLAGs in uma_int.h. Be careful not to * overlap when adding new features. 0xf0000000 is in use by uma_int.h. */ #define UMA_ZONE_PAGEABLE 0x0001 /* Return items not fully backed by physical memory XXX Not yet */ #define UMA_ZONE_ZINIT 0x0002 /* Initialize with zeros */ #define UMA_ZONE_STATIC 0x0004 /* Statically sized zone */ #define UMA_ZONE_OFFPAGE 0x0008 /* Force the slab structure allocation off of the real memory */ #define UMA_ZONE_MALLOC 0x0010 /* For use by malloc(9) only! */ #define UMA_ZONE_NOFREE 0x0020 /* Do not free slabs of this type! */ #define UMA_ZONE_MTXCLASS 0x0040 /* Create a new lock class */ #define UMA_ZONE_VM 0x0080 /* * Used for internal vm datastructures * only. */ #define UMA_ZONE_HASH 0x0100 /* * Use a hash table instead of caching * information in the vm_page. */ #define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */ #define UMA_ZONE_REFCNT 0x0400 /* Allocate refcnts in slabs */ #define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets */ #define UMA_ZONE_CACHESPREAD 0x1000 /* * Spread memory start locations across * all possible cache lines. May * require many virtually contiguous * backend pages and can fail early. */ #define UMA_ZONE_VTOSLAB 0x2000 /* Zone uses vtoslab for lookup. */ #define UMA_ZONE_NODUMP 0x4000 /* * Zone's pages will not be included in * mini-dumps. */ #define UMA_ZONE_PCPU 0x8000 /* * Allocates mp_ncpus slabs sized to * sizeof(struct pcpu). */ /* * These flags are shared between the keg and zone. In zones wishing to add * new kegs these flags must be compatible. Some are determined based on * physical parameters of the request and may not be provided by the consumer. */ #define UMA_ZONE_INHERIT \ (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \ UMA_ZONE_HASH | UMA_ZONE_REFCNT | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU) /* Definitions for align */ #define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */ #define UMA_ALIGN_LONG (sizeof(long) - 1) /* "" long */ #define UMA_ALIGN_INT (sizeof(int) - 1) /* "" int */ #define UMA_ALIGN_SHORT (sizeof(short) - 1) /* "" short */ #define UMA_ALIGN_CHAR (sizeof(char) - 1) /* "" char */ #define UMA_ALIGN_CACHE (0 - 1) /* Cache line size align */ /* * Destroys an empty uma zone. If the zone is not empty uma complains loudly. * * Arguments: * zone The zone we want to destroy. * */ void uma_zdestroy(uma_zone_t zone); /* * Allocates an item out of a zone * * Arguments: * zone The zone we are allocating from * arg This data is passed to the ctor function * flags See sys/malloc.h for available flags. * * Returns: * A non-null pointer to an initialized element from the zone is * guaranteed if the wait flag is M_WAITOK. Otherwise a null pointer * may be returned if the zone is empty or the ctor failed. */ void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags); /* * Allocates an item out of a zone without supplying an argument * * This is just a wrapper for uma_zalloc_arg for convenience. * */ static __inline void *uma_zalloc(uma_zone_t zone, int flags); static __inline void * uma_zalloc(uma_zone_t zone, int flags) { return uma_zalloc_arg(zone, NULL, flags); } /* * Frees an item back into the specified zone. * * Arguments: * zone The zone the item was originally allocated out of. * item The memory to be freed. * arg Argument passed to the destructor * * Returns: * Nothing. */ void uma_zfree_arg(uma_zone_t zone, void *item, void *arg); /* * Frees an item back to a zone without supplying an argument * * This is just a wrapper for uma_zfree_arg for convenience. * */ static __inline void uma_zfree(uma_zone_t zone, void *item); static __inline void uma_zfree(uma_zone_t zone, void *item) { uma_zfree_arg(zone, item, NULL); } /* * XXX The rest of the prototypes in this header are h0h0 magic for the VM. * If you think you need to use it for a normal zone you're probably incorrect. */ /* * Backend page supplier routines * * Arguments: * zone The zone that is requesting pages. * size The number of bytes being requested. * pflag Flags for these memory pages, see below. * wait Indicates our willingness to block. * * Returns: * A pointer to the allocated memory or NULL on failure. */ -typedef void *(*uma_alloc)(uma_zone_t zone, int size, uint8_t *pflag, int wait); +typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag, + int wait); /* * Backend page free routines * * Arguments: * item A pointer to the previously allocated pages. * size The original size of the allocation. * pflag The flags for the slab. See UMA_SLAB_* below. * * Returns: * None */ -typedef void (*uma_free)(void *item, int size, uint8_t pflag); +typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag); /* * Sets up the uma allocator. (Called by vm_mem_init) * * Arguments: * bootmem A pointer to memory used to bootstrap the system. * * Returns: * Nothing * * Discussion: * This memory is used for zones which allocate things before the * backend page supplier can give us pages. It should be * UMA_SLAB_SIZE * boot_pages bytes. (see uma_int.h) * */ void uma_startup(void *bootmem, int boot_pages); /* * Finishes starting up the allocator. This should * be called when kva is ready for normal allocs. * * Arguments: * None * * Returns: * Nothing * * Discussion: * uma_startup2 is called by kmeminit() to enable us of uma for malloc. */ void uma_startup2(void); /* * Reclaims unused memory for all zones * * Arguments: * None * Returns: * None * * This should only be called by the page out daemon. */ void uma_reclaim(void); /* * Sets the alignment mask to be used for all zones requesting cache * alignment. Should be called by MD boot code prior to starting VM/UMA. * * Arguments: * align The alignment mask * * Returns: * Nothing */ void uma_set_align(int align); /* * Set a reserved number of items to hold for M_USE_RESERVE allocations. All * other requests must allocate new backing pages. */ void uma_zone_reserve(uma_zone_t zone, int nitems); /* * Reserves the maximum KVA space required by the zone and configures the zone * to use a VM_ALLOC_NOOBJ-based backend allocator. * * Arguments: * zone The zone to update. * nitems The upper limit on the number of items that can be allocated. * * Returns: * 0 if KVA space can not be allocated * 1 if successful * * Discussion: * When the machine supports a direct map and the zone's items are smaller * than a page, the zone will use the direct map instead of allocating KVA * space. */ int uma_zone_reserve_kva(uma_zone_t zone, int nitems); /* * Sets a high limit on the number of items allowed in a zone * * Arguments: * zone The zone to limit * nitems The requested upper limit on the number of items allowed * * Returns: * int The effective value of nitems after rounding up based on page size */ int uma_zone_set_max(uma_zone_t zone, int nitems); /* * Obtains the effective limit on the number of items in a zone * * Arguments: * zone The zone to obtain the effective limit from * * Return: * 0 No limit * int The effective limit of the zone */ int uma_zone_get_max(uma_zone_t zone); /* * Sets a warning to be printed when limit is reached * * Arguments: * zone The zone we will warn about * warning Warning content * * Returns: * Nothing */ void uma_zone_set_warning(uma_zone_t zone, const char *warning); /* * Obtains the approximate current number of items allocated from a zone * * Arguments: * zone The zone to obtain the current allocation count from * * Return: * int The approximate current number of items allocated from the zone */ int uma_zone_get_cur(uma_zone_t zone); /* * The following two routines (uma_zone_set_init/fini) * are used to set the backend init/fini pair which acts on an * object as it becomes allocated and is placed in a slab within * the specified zone's backing keg. These should probably not * be changed once allocations have already begun, but only be set * immediately upon zone creation. */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit); void uma_zone_set_fini(uma_zone_t zone, uma_fini fini); /* * The following two routines (uma_zone_set_zinit/zfini) are * used to set the zinit/zfini pair which acts on an object as * it passes from the backing Keg's slab cache to the * specified Zone's bucket cache. These should probably not * be changed once allocations have already begun, but only be set * immediately upon zone creation. */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit); void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini); /* * Replaces the standard backend allocator for this zone. * * Arguments: * zone The zone whose backend allocator is being changed. * allocf A pointer to the allocation function * * Returns: * Nothing * * Discussion: * This could be used to implement pageable allocation, or perhaps * even DMA allocators if used in conjunction with the OFFPAGE * zone flag. */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf); /* * Used for freeing memory provided by the allocf above * * Arguments: * zone The zone that intends to use this free routine. * freef The page freeing routine. * * Returns: * Nothing */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef); /* * These flags are setable in the allocf and visible in the freef. */ #define UMA_SLAB_BOOT 0x01 /* Slab alloced from boot pages */ #define UMA_SLAB_KMEM 0x02 /* Slab alloced from kmem_map */ #define UMA_SLAB_KERNEL 0x04 /* Slab alloced from kernel_map */ #define UMA_SLAB_PRIV 0x08 /* Slab alloced from priv allocator */ #define UMA_SLAB_OFFP 0x10 /* Slab is managed separately */ #define UMA_SLAB_MALLOC 0x20 /* Slab is a large malloc slab */ /* 0x40 and 0x80 are available */ /* * Used to pre-fill a zone with some number of items * * Arguments: * zone The zone to fill * itemcnt The number of items to reserve * * Returns: * Nothing * * NOTE: This is blocking and should only be done at startup */ void uma_prealloc(uma_zone_t zone, int itemcnt); /* * Used to lookup the reference counter allocated for an item * from a UMA_ZONE_REFCNT zone. For UMA_ZONE_REFCNT zones, * reference counters are allocated for items and stored in * the underlying slab header. * * Arguments: * zone The UMA_ZONE_REFCNT zone to which the item belongs. * item The address of the item for which we want a refcnt. * * Returns: * A pointer to a uint32_t reference counter. */ uint32_t *uma_find_refcnt(uma_zone_t zone, void *item); /* * Used to determine if a fixed-size zone is exhausted. * * Arguments: * zone The zone to check * * Returns: * Non-zero if zone is exhausted. */ int uma_zone_exhausted(uma_zone_t zone); int uma_zone_exhausted_nolock(uma_zone_t zone); /* * Common UMA_ZONE_PCPU zones. */ extern uma_zone_t pcpu_zone_64; extern uma_zone_t pcpu_zone_ptr; /* * Exported statistics structures to be used by user space monitoring tools. * Statistics stream consists of a uma_stream_header, followed by a series of * alternative uma_type_header and uma_type_stat structures. */ #define UMA_STREAM_VERSION 0x00000001 struct uma_stream_header { uint32_t ush_version; /* Stream format version. */ uint32_t ush_maxcpus; /* Value of MAXCPU for stream. */ uint32_t ush_count; /* Number of records. */ uint32_t _ush_pad; /* Pad/reserved field. */ }; #define UTH_MAX_NAME 32 #define UTH_ZONE_SECONDARY 0x00000001 struct uma_type_header { /* * Static per-zone data, some extracted from the supporting keg. */ char uth_name[UTH_MAX_NAME]; uint32_t uth_align; /* Keg: alignment. */ uint32_t uth_size; /* Keg: requested size of item. */ uint32_t uth_rsize; /* Keg: real size of item. */ uint32_t uth_maxpages; /* Keg: maximum number of pages. */ uint32_t uth_limit; /* Keg: max items to allocate. */ /* * Current dynamic zone/keg-derived statistics. */ uint32_t uth_pages; /* Keg: pages allocated. */ uint32_t uth_keg_free; /* Keg: items free. */ uint32_t uth_zone_free; /* Zone: items free. */ uint32_t uth_bucketsize; /* Zone: desired bucket size. */ uint32_t uth_zone_flags; /* Zone: flags. */ uint64_t uth_allocs; /* Zone: number of allocations. */ uint64_t uth_frees; /* Zone: number of frees. */ uint64_t uth_fails; /* Zone: number of alloc failures. */ uint64_t uth_sleeps; /* Zone: number of alloc sleeps. */ uint64_t _uth_reserved1[2]; /* Reserved. */ }; struct uma_percpu_stat { uint64_t ups_allocs; /* Cache: number of allocations. */ uint64_t ups_frees; /* Cache: number of frees. */ uint64_t ups_cache_free; /* Cache: free items in cache. */ uint64_t _ups_reserved[5]; /* Reserved. */ }; void uma_reclaim_wakeup(void); void uma_reclaim_worker(void *); #endif /* _VM_UMA_H_ */ Index: stable/10/sys/vm/uma_core.c =================================================================== --- stable/10/sys/vm/uma_core.c (revision 287944) +++ stable/10/sys/vm/uma_core.c (revision 287945) @@ -1,3627 +1,3627 @@ /*- * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * Copyright (c) 2004-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * uma_core.c Implementation of the Universal Memory allocator * * This allocator is intended to replace the multitude of similar object caches * in the standard FreeBSD kernel. The intent is to be flexible as well as * effecient. A primary design goal is to return unused memory to the rest of * the system. This will make the system as a whole more flexible due to the * ability to move memory to subsystems which most need it instead of leaving * pools of reserved memory unused. * * The basic ideas stem from similar slab/zone based allocators whose algorithms * are well known. * */ /* * TODO: * - Improve memory usage for large allocations * - Investigate cache size adjustments */ #include __FBSDID("$FreeBSD$"); /* I should really use ktr.. */ /* #define UMA_DEBUG 1 #define UMA_DEBUG_ALLOC 1 #define UMA_DEBUG_ALLOC_1 1 */ #include "opt_ddb.h" #include "opt_param.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG_MEMGUARD #include #endif /* * This is the zone and keg from which all zones are spawned. The idea is that * even the zone & keg heads are allocated from the allocator, so we use the * bss section to bootstrap us. */ static struct uma_keg masterkeg; static struct uma_zone masterzone_k; static struct uma_zone masterzone_z; static uma_zone_t kegs = &masterzone_k; static uma_zone_t zones = &masterzone_z; /* This is the zone from which all of uma_slab_t's are allocated. */ static uma_zone_t slabzone; static uma_zone_t slabrefzone; /* With refcounters (for UMA_ZONE_REFCNT) */ /* * The initial hash tables come out of this zone so they can be allocated * prior to malloc coming up. */ static uma_zone_t hashzone; /* The boot-time adjusted value for cache line alignment. */ int uma_align_cache = 64 - 1; static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); /* * Are we allowed to allocate buckets? */ static int bucketdisable = 1; /* Linked list of all kegs in the system */ static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs); /* Linked list of all cache-only zones in the system */ static LIST_HEAD(,uma_zone) uma_cachezones = LIST_HEAD_INITIALIZER(uma_cachezones); /* This RW lock protects the keg list */ static struct rwlock_padalign uma_rwlock; /* Linked list of boot time pages */ static LIST_HEAD(,uma_slab) uma_boot_pages = LIST_HEAD_INITIALIZER(uma_boot_pages); /* This mutex protects the boot time pages list */ static struct mtx_padalign uma_boot_pages_mtx; static struct sx uma_drain_lock; /* Is the VM done starting up? */ static int booted = 0; #define UMA_STARTUP 1 #define UMA_STARTUP2 2 /* * Only mbuf clusters use ref zones. Just provide enough references * to support the one user. New code should not use the ref facility. */ static const u_int uma_max_ipers_ref = PAGE_SIZE / MCLBYTES; /* * This is the handle used to schedule events that need to happen * outside of the allocation fast path. */ static struct callout uma_callout; #define UMA_TIMEOUT 20 /* Seconds for callout interval. */ /* * This structure is passed as the zone ctor arg so that I don't have to create * a special allocation function just for zones. */ struct uma_zctor_args { const char *name; size_t size; uma_ctor ctor; uma_dtor dtor; uma_init uminit; uma_fini fini; uma_import import; uma_release release; void *arg; uma_keg_t keg; int align; uint32_t flags; }; struct uma_kctor_args { uma_zone_t zone; size_t size; uma_init uminit; uma_fini fini; int align; uint32_t flags; }; struct uma_bucket_zone { uma_zone_t ubz_zone; char *ubz_name; int ubz_entries; /* Number of items it can hold. */ int ubz_maxsize; /* Maximum allocation size per-item. */ }; /* * Compute the actual number of bucket entries to pack them in power * of two sizes for more efficient space utilization. */ #define BUCKET_SIZE(n) \ (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *)) #define BUCKET_MAX BUCKET_SIZE(256) struct uma_bucket_zone bucket_zones[] = { { NULL, "4 Bucket", BUCKET_SIZE(4), 4096 }, { NULL, "6 Bucket", BUCKET_SIZE(6), 3072 }, { NULL, "8 Bucket", BUCKET_SIZE(8), 2048 }, { NULL, "12 Bucket", BUCKET_SIZE(12), 1536 }, { NULL, "16 Bucket", BUCKET_SIZE(16), 1024 }, { NULL, "32 Bucket", BUCKET_SIZE(32), 512 }, { NULL, "64 Bucket", BUCKET_SIZE(64), 256 }, { NULL, "128 Bucket", BUCKET_SIZE(128), 128 }, { NULL, "256 Bucket", BUCKET_SIZE(256), 64 }, { NULL, NULL, 0} }; /* * Flags and enumerations to be passed to internal functions. */ enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI }; /* Prototypes.. */ -static void *noobj_alloc(uma_zone_t, int, uint8_t *, int); -static void *page_alloc(uma_zone_t, int, uint8_t *, int); -static void *startup_alloc(uma_zone_t, int, uint8_t *, int); -static void page_free(void *, int, uint8_t); +static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void page_free(void *, vm_size_t, uint8_t); static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_drain(uma_zone_t zone); static int keg_ctor(void *, int, void *, int); static void keg_dtor(void *, int, void *); static int zone_ctor(void *, int, void *, int); static void zone_dtor(void *, int, void *); static int zero_init(void *, int, int); static void keg_small_init(uma_keg_t keg); static void keg_large_init(uma_keg_t keg); static void zone_foreach(void (*zfunc)(uma_zone_t)); static void zone_timeout(uma_zone_t zone); static int hash_alloc(struct uma_hash *); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_startup3(void); static void *zone_alloc_item(uma_zone_t, void *, int); static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); static void bucket_zone_drain(void); static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags); static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags); static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags); static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item); static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags); static int zone_import(uma_zone_t zone, void **bucket, int max, int flags); static void zone_release(uma_zone_t zone, void **bucket, int cnt); static void uma_zero_item(void *item, uma_zone_t zone); void uma_print_zone(uma_zone_t); void uma_print_stats(void); static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT, 0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones"); SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats"); static int zone_warnings = 1; TUNABLE_INT("vm.zone_warnings", &zone_warnings); SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0, "Warn when UMA zones becomes full"); /* * This routine checks to see whether or not it's safe to enable buckets. */ static void bucket_enable(void) { bucketdisable = vm_page_count_min(); } /* * Initialize bucket_zones, the array of zones of buckets of various sizes. * * For each zone, calculate the memory required for each bucket, consisting * of the header and an array of pointers. */ static void bucket_init(void) { struct uma_bucket_zone *ubz; int size; int i; for (i = 0, ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) { size = roundup(sizeof(struct uma_bucket), sizeof(void *)); size += sizeof(void *) * ubz->ubz_entries; ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET); } } /* * Given a desired number of entries for a bucket, return the zone from which * to allocate the bucket. */ static struct uma_bucket_zone * bucket_zone_lookup(int entries) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_entries >= entries) return (ubz); ubz--; return (ubz); } static int bucket_select(int size) { struct uma_bucket_zone *ubz; ubz = &bucket_zones[0]; if (size > ubz->ubz_maxsize) return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1); for (; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_maxsize < size) break; ubz--; return (ubz->ubz_entries); } static uma_bucket_t bucket_alloc(uma_zone_t zone, void *udata, int flags) { struct uma_bucket_zone *ubz; uma_bucket_t bucket; /* * This is to stop us from allocating per cpu buckets while we're * running out of vm.boot_pages. Otherwise, we would exhaust the * boot pages. This also prevents us from allocating buckets in * low memory situations. */ if (bucketdisable) return (NULL); /* * To limit bucket recursion we store the original zone flags * in a cookie passed via zalloc_arg/zfree_arg. This allows the * NOVM flag to persist even through deep recursions. We also * store ZFLAG_BUCKET once we have recursed attempting to allocate * a bucket for a bucket zone so we do not allow infinite bucket * recursion. This cookie will even persist to frees of unused * buckets via the allocation path or bucket allocations in the * free path. */ if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; else { if ((uintptr_t)udata & UMA_ZFLAG_BUCKET) return (NULL); udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET); } if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY) flags |= M_NOVM; ubz = bucket_zone_lookup(zone->uz_count); if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0) ubz++; bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags); if (bucket) { #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); #endif bucket->ub_cnt = 0; bucket->ub_entries = ubz->ubz_entries; } return (bucket); } static void bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata) { struct uma_bucket_zone *ubz; KASSERT(bucket->ub_cnt == 0, ("bucket_free: Freeing a non free bucket.")); if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; ubz = bucket_zone_lookup(bucket->ub_entries); uma_zfree_arg(ubz->ubz_zone, bucket, udata); } static void bucket_zone_drain(void) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) zone_drain(ubz->ubz_zone); } static void zone_log_warning(uma_zone_t zone) { static const struct timeval warninterval = { 300, 0 }; if (!zone_warnings || zone->uz_warning == NULL) return; if (ratecheck(&zone->uz_ratecheck, &warninterval)) printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); } static void zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t)) { uma_klink_t klink; LIST_FOREACH(klink, &zone->uz_kegs, kl_link) kegfn(klink->kl_keg); } /* * Routine called by timeout which is used to fire off some time interval * based calculations. (stats, hash size, etc.) * * Arguments: * arg Unused * * Returns: * Nothing */ static void uma_timeout(void *unused) { bucket_enable(); zone_foreach(zone_timeout); /* Reschedule this event */ callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); } /* * Routine to perform timeout driven calculations. This expands the * hashes and does per cpu statistics aggregation. * * Returns nothing. */ static void keg_timeout(uma_keg_t keg) { KEG_LOCK(keg); /* * Expand the keg hash table. * * This is done if the number of slabs is larger than the hash size. * What I'm trying to do here is completely reduce collisions. This * may be a little aggressive. Should I allow for two collisions max? */ if (keg->uk_flags & UMA_ZONE_HASH && keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; /* * This is so involved because allocating and freeing * while the keg lock is held will lead to deadlock. * I have to do everything in stages and check for * races. */ newhash = keg->uk_hash; KEG_UNLOCK(keg); ret = hash_alloc(&newhash); KEG_LOCK(keg); if (ret) { if (hash_expand(&keg->uk_hash, &newhash)) { oldhash = keg->uk_hash; keg->uk_hash = newhash; } else oldhash = newhash; KEG_UNLOCK(keg); hash_free(&oldhash); return; } } KEG_UNLOCK(keg); } static void zone_timeout(uma_zone_t zone) { zone_foreach_keg(zone, &keg_timeout); } /* * Allocate and zero fill the next sized hash table from the appropriate * backing store. * * Arguments: * hash A new hash structure with the old hash size in uh_hashsize * * Returns: * 1 on sucess and 0 on failure. */ static int hash_alloc(struct uma_hash *hash) { int oldsize; int alloc; oldsize = hash->uh_hashsize; /* We're just going to go to a power of two greater */ if (oldsize) { hash->uh_hashsize = oldsize * 2; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; hash->uh_slab_hash = (struct slabhead *)malloc(alloc, M_UMAHASH, M_NOWAIT); } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { bzero(hash->uh_slab_hash, alloc); hash->uh_hashmask = hash->uh_hashsize - 1; return (1); } return (0); } /* * Expands the hash table for HASH zones. This is done from zone_timeout * to reduce collisions. This must not be done in the regular allocation * path, otherwise, we can recurse on the vm while allocating pages. * * Arguments: * oldhash The hash you want to expand * newhash The hash structure for the new table * * Returns: * Nothing * * Discussion: */ static int hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_slab_t slab; int hval; int i; if (!newhash->uh_slab_hash) return (0); if (oldhash->uh_hashsize >= newhash->uh_hashsize) return (0); /* * I need to investigate hash algorithms for resizing without a * full rehash. */ for (i = 0; i < oldhash->uh_hashsize; i++) while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) { slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]); SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink); hval = UMA_HASH(newhash, slab->us_data); SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, us_hlink); } return (1); } /* * Free the hash bucket to the appropriate backing store. * * Arguments: * slab_hash The hash bucket we're freeing * hashsize The number of entries in that hash bucket * * Returns: * Nothing */ static void hash_free(struct uma_hash *hash) { if (hash->uh_slab_hash == NULL) return; if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE); else free(hash->uh_slab_hash, M_UMAHASH); } /* * Frees all outstanding items in a bucket * * Arguments: * zone The zone to free to, must be unlocked. * bucket The free/alloc bucket with items, cpu queue must be locked. * * Returns: * Nothing */ static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { int i; if (bucket == NULL) return; if (zone->uz_fini) for (i = 0; i < bucket->ub_cnt; i++) zone->uz_fini(bucket->ub_bucket[i], zone->uz_size); zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt); bucket->ub_cnt = 0; } /* * Drains the per cpu caches for a zone. * * NOTE: This may only be called while the zone is being turn down, and not * during normal operation. This is necessary in order that we do not have * to migrate CPUs to drain the per-CPU caches. * * Arguments: * zone The zone to drain, must be unlocked. * * Returns: * Nothing */ static void cache_drain(uma_zone_t zone) { uma_cache_t cache; int cpu; /* * XXX: It is safe to not lock the per-CPU caches, because we're * tearing down the zone anyway. I.e., there will be no further use * of the caches at this point. * * XXX: It would good to be able to assert that the zone is being * torn down to prevent improper use of cache_drain(). * * XXX: We lock the zone before passing into bucket_cache_drain() as * it is used elsewhere. Should the tear-down path be made special * there in some form? */ CPU_FOREACH(cpu) { cache = &zone->uz_cpu[cpu]; bucket_drain(zone, cache->uc_allocbucket); bucket_drain(zone, cache->uc_freebucket); if (cache->uc_allocbucket != NULL) bucket_free(zone, cache->uc_allocbucket, NULL); if (cache->uc_freebucket != NULL) bucket_free(zone, cache->uc_freebucket, NULL); cache->uc_allocbucket = cache->uc_freebucket = NULL; } ZONE_LOCK(zone); bucket_cache_drain(zone); ZONE_UNLOCK(zone); } static void cache_shrink(uma_zone_t zone) { if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; ZONE_LOCK(zone); zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2; ZONE_UNLOCK(zone); } static void cache_drain_safe_cpu(uma_zone_t zone) { uma_cache_t cache; uma_bucket_t b1, b2; if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; b1 = b2 = NULL; ZONE_LOCK(zone); critical_enter(); cache = &zone->uz_cpu[curcpu]; if (cache->uc_allocbucket) { if (cache->uc_allocbucket->ub_cnt != 0) LIST_INSERT_HEAD(&zone->uz_buckets, cache->uc_allocbucket, ub_link); else b1 = cache->uc_allocbucket; cache->uc_allocbucket = NULL; } if (cache->uc_freebucket) { if (cache->uc_freebucket->ub_cnt != 0) LIST_INSERT_HEAD(&zone->uz_buckets, cache->uc_freebucket, ub_link); else b2 = cache->uc_freebucket; cache->uc_freebucket = NULL; } critical_exit(); ZONE_UNLOCK(zone); if (b1) bucket_free(zone, b1, NULL); if (b2) bucket_free(zone, b2, NULL); } /* * Safely drain per-CPU caches of a zone(s) to alloc bucket. * This is an expensive call because it needs to bind to all CPUs * one by one and enter a critical section on each of them in order * to safely access their cache buckets. * Zone lock must not be held on call this function. */ static void cache_drain_safe(uma_zone_t zone) { int cpu; /* * Polite bucket sizes shrinking was not enouth, shrink aggressively. */ if (zone) cache_shrink(zone); else zone_foreach(cache_shrink); CPU_FOREACH(cpu) { thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); if (zone) cache_drain_safe_cpu(zone); else zone_foreach(cache_drain_safe_cpu); } thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } /* * Drain the cached buckets from a zone. Expects a locked zone on entry. */ static void bucket_cache_drain(uma_zone_t zone) { uma_bucket_t bucket; /* * Drain the bucket queues and free the buckets, we just keep two per * cpu (alloc/free). */ while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) { LIST_REMOVE(bucket, ub_link); ZONE_UNLOCK(zone); bucket_drain(zone, bucket); bucket_free(zone, bucket, NULL); ZONE_LOCK(zone); } /* * Shrink further bucket sizes. Price of single zone lock collision * is probably lower then price of global cache drain. */ if (zone->uz_count > zone->uz_count_min) zone->uz_count--; } static void keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start) { uint8_t *mem; int i; uint8_t flags; mem = slab->us_data; flags = slab->us_flags; i = start; if (keg->uk_fini != NULL) { for (i--; i > -1; i--) keg->uk_fini(slab->us_data + (keg->uk_rsize * i), keg->uk_size); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); #ifdef UMA_DEBUG printf("%s: Returning %d bytes.\n", keg->uk_name, PAGE_SIZE * keg->uk_ppera); #endif keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags); } /* * Frees pages from a keg back to the system. This is done on demand from * the pageout daemon. * * Returns nothing. */ static void keg_drain(uma_keg_t keg) { struct slabhead freeslabs = { 0 }; uma_slab_t slab; uma_slab_t n; /* * We don't want to take pages from statically allocated kegs at this * time */ if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL) return; #ifdef UMA_DEBUG printf("%s free items: %u\n", keg->uk_name, keg->uk_free); #endif KEG_LOCK(keg); if (keg->uk_free == 0) goto finished; slab = LIST_FIRST(&keg->uk_free_slab); while (slab) { n = LIST_NEXT(slab, us_link); /* We have no where to free these to */ if (slab->us_flags & UMA_SLAB_BOOT) { slab = n; continue; } LIST_REMOVE(slab, us_link); keg->uk_pages -= keg->uk_ppera; keg->uk_free -= keg->uk_ipers; if (keg->uk_flags & UMA_ZONE_HASH) UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data); SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); slab = n; } finished: KEG_UNLOCK(keg); while ((slab = SLIST_FIRST(&freeslabs)) != NULL) { SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink); keg_free_slab(keg, slab, keg->uk_ipers); } } static void zone_drain_wait(uma_zone_t zone, int waitok) { /* * Set draining to interlock with zone_dtor() so we can release our * locks as we go. Only dtor() should do a WAITOK call since it * is the only call that knows the structure will still be available * when it wakes up. */ ZONE_LOCK(zone); while (zone->uz_flags & UMA_ZFLAG_DRAINING) { if (waitok == M_NOWAIT) goto out; msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1); } zone->uz_flags |= UMA_ZFLAG_DRAINING; bucket_cache_drain(zone); ZONE_UNLOCK(zone); /* * The DRAINING flag protects us from being freed while * we're running. Normally the uma_rwlock would protect us but we * must be able to release and acquire the right lock for each keg. */ zone_foreach_keg(zone, &keg_drain); ZONE_LOCK(zone); zone->uz_flags &= ~UMA_ZFLAG_DRAINING; wakeup(zone); out: ZONE_UNLOCK(zone); } void zone_drain(uma_zone_t zone) { zone_drain_wait(zone, M_NOWAIT); } /* * Allocate a new slab for a keg. This does not insert the slab onto a list. * * Arguments: * wait Shall we wait? * * Returns: * The slab that was allocated or NULL if there is no memory and the * caller specified M_NOWAIT. */ static uma_slab_t keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) { uma_slabrefcnt_t slabref; uma_alloc allocf; uma_slab_t slab; uint8_t *mem; uint8_t flags; int i; mtx_assert(&keg->uk_lock, MA_OWNED); slab = NULL; mem = NULL; #ifdef UMA_DEBUG printf("alloc_slab: Allocating a new slab for %s\n", keg->uk_name); #endif allocf = keg->uk_allocf; KEG_UNLOCK(keg); if (keg->uk_flags & UMA_ZONE_OFFPAGE) { slab = zone_alloc_item(keg->uk_slabzone, NULL, wait); if (slab == NULL) goto out; } /* * This reproduces the old vm_zone behavior of zero filling pages the * first time they are added to a zone. * * Malloced items are zeroed in uma_zalloc. */ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) wait |= M_ZERO; else wait &= ~M_ZERO; if (keg->uk_flags & UMA_ZONE_NODUMP) wait |= M_NODUMP; /* zone is passed for legacy reasons. */ mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait); if (mem == NULL) { if (keg->uk_flags & UMA_ZONE_OFFPAGE) zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); slab = NULL; goto out; } /* Point the slab into the allocated memory */ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) slab = (uma_slab_t )(mem + keg->uk_pgoff); if (keg->uk_flags & UMA_ZONE_VTOSLAB) for (i = 0; i < keg->uk_ppera; i++) vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab); slab->us_keg = keg; slab->us_data = mem; slab->us_freecount = keg->uk_ipers; slab->us_flags = flags; BIT_FILL(SLAB_SETSIZE, &slab->us_free); #ifdef INVARIANTS BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree); #endif if (keg->uk_flags & UMA_ZONE_REFCNT) { slabref = (uma_slabrefcnt_t)slab; for (i = 0; i < keg->uk_ipers; i++) slabref->us_refcnt[i] = 0; } if (keg->uk_init != NULL) { for (i = 0; i < keg->uk_ipers; i++) if (keg->uk_init(slab->us_data + (keg->uk_rsize * i), keg->uk_size, wait) != 0) break; if (i != keg->uk_ipers) { keg_free_slab(keg, slab, i); slab = NULL; goto out; } } out: KEG_LOCK(keg); if (slab != NULL) { if (keg->uk_flags & UMA_ZONE_HASH) UMA_HASH_INSERT(&keg->uk_hash, slab, mem); keg->uk_pages += keg->uk_ppera; keg->uk_free += keg->uk_ipers; } return (slab); } /* * This function is intended to be used early on in place of page_alloc() so * that we may use the boot time page cache to satisfy allocations before * the VM is ready. */ static void * -startup_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait) +startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) { uma_keg_t keg; uma_slab_t tmps; int pages, check_pages; keg = zone_first_keg(zone); pages = howmany(bytes, PAGE_SIZE); check_pages = pages - 1; KASSERT(pages > 0, ("startup_alloc can't reserve 0 pages\n")); /* * Check our small startup cache to see if it has pages remaining. */ mtx_lock(&uma_boot_pages_mtx); /* First check if we have enough room. */ tmps = LIST_FIRST(&uma_boot_pages); while (tmps != NULL && check_pages-- > 0) tmps = LIST_NEXT(tmps, us_link); if (tmps != NULL) { /* * It's ok to lose tmps references. The last one will * have tmps->us_data pointing to the start address of * "pages" contiguous pages of memory. */ while (pages-- > 0) { tmps = LIST_FIRST(&uma_boot_pages); LIST_REMOVE(tmps, us_link); } mtx_unlock(&uma_boot_pages_mtx); *pflag = tmps->us_flags; return (tmps->us_data); } mtx_unlock(&uma_boot_pages_mtx); if (booted < UMA_STARTUP2) panic("UMA: Increase vm.boot_pages"); /* * Now that we've booted reset these users to their real allocator. */ #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = (keg->uk_ppera > 1) ? page_alloc : uma_small_alloc; #else keg->uk_allocf = page_alloc; #endif return keg->uk_allocf(zone, bytes, pflag, wait); } /* * Allocates a number of pages from the system * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * -page_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait) +page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KMEM; p = (void *) kmem_malloc(kmem_arena, bytes, wait); return (p); } /* * Allocates a number of pages from within an object * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * -noobj_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait) +noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { TAILQ_HEAD(, vm_page) alloctail; u_long npages; vm_offset_t retkva, zkva; vm_page_t p, p_next; uma_keg_t keg; TAILQ_INIT(&alloctail); keg = zone_first_keg(zone); npages = howmany(bytes, PAGE_SIZE); while (npages > 0) { p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (p != NULL) { /* * Since the page does not belong to an object, its * listq is unused. */ TAILQ_INSERT_TAIL(&alloctail, p, listq); npages--; continue; } if (wait & M_WAITOK) { VM_WAIT; continue; } /* * Page allocation failed, free intermediate pages and * exit. */ TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { vm_page_unwire(p, 0); vm_page_free(p); } return (NULL); } *flags = UMA_SLAB_PRIV; zkva = keg->uk_kva + atomic_fetchadd_long(&keg->uk_offset, round_page(bytes)); retkva = zkva; TAILQ_FOREACH(p, &alloctail, listq) { pmap_qenter(zkva, &p, 1); zkva += PAGE_SIZE; } return ((void *)retkva); } /* * Frees a number of pages to the system * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void -page_free(void *mem, int size, uint8_t flags) +page_free(void *mem, vm_size_t size, uint8_t flags) { struct vmem *vmem; if (flags & UMA_SLAB_KMEM) vmem = kmem_arena; else if (flags & UMA_SLAB_KERNEL) vmem = kernel_arena; else panic("UMA: page_free used with invalid flags %d", flags); kmem_free(vmem, (vm_offset_t)mem, size); } /* * Zero fill initializer * * Arguments/Returns follow uma_init specifications */ static int zero_init(void *mem, int size, int flags) { bzero(mem, size); return (0); } /* * Finish creating a small uma keg. This calculates ipers, and the keg size. * * Arguments * keg The zone we should initialize * * Returns * Nothing */ static void keg_small_init(uma_keg_t keg) { u_int rsize; u_int memused; u_int wastedspace; u_int shsize; if (keg->uk_flags & UMA_ZONE_PCPU) { u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU; keg->uk_slabsize = sizeof(struct pcpu); keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu), PAGE_SIZE); } else { keg->uk_slabsize = UMA_SLAB_SIZE; keg->uk_ppera = 1; } /* * Calculate the size of each allocation (rsize) according to * alignment. If the requested size is smaller than we have * allocation bits for we round it up. */ rsize = keg->uk_size; if (rsize < keg->uk_slabsize / SLAB_SETSIZE) rsize = keg->uk_slabsize / SLAB_SETSIZE; if (rsize & keg->uk_align) rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1); keg->uk_rsize = rsize; KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || keg->uk_rsize < sizeof(struct pcpu), ("%s: size %u too large", __func__, keg->uk_rsize)); if (keg->uk_flags & UMA_ZONE_REFCNT) rsize += sizeof(uint32_t); if (keg->uk_flags & UMA_ZONE_OFFPAGE) shsize = 0; else shsize = sizeof(struct uma_slab); keg->uk_ipers = (keg->uk_slabsize - shsize) / rsize; KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE, ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers)); memused = keg->uk_ipers * rsize + shsize; wastedspace = keg->uk_slabsize - memused; /* * We can't do OFFPAGE if we're internal or if we've been * asked to not go to the VM for buckets. If we do this we * may end up going to the VM for slabs which we do not * want to do if we're UMA_ZFLAG_CACHEONLY as a result * of UMA_ZONE_VM, which clearly forbids it. */ if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) || (keg->uk_flags & UMA_ZFLAG_CACHEONLY)) return; /* * See if using an OFFPAGE slab will limit our waste. Only do * this if it permits more items per-slab. * * XXX We could try growing slabsize to limit max waste as well. * Historically this was not done because the VM could not * efficiently handle contiguous allocations. */ if ((wastedspace >= keg->uk_slabsize / UMA_MAX_WASTE) && (keg->uk_ipers < (keg->uk_slabsize / keg->uk_rsize))) { keg->uk_ipers = keg->uk_slabsize / keg->uk_rsize; KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE, ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers)); #ifdef UMA_DEBUG printf("UMA decided we need offpage slab headers for " "keg: %s, calculated wastedspace = %d, " "maximum wasted space allowed = %d, " "calculated ipers = %d, " "new wasted space = %d\n", keg->uk_name, wastedspace, keg->uk_slabsize / UMA_MAX_WASTE, keg->uk_ipers, keg->uk_slabsize - keg->uk_ipers * keg->uk_rsize); #endif keg->uk_flags |= UMA_ZONE_OFFPAGE; } if ((keg->uk_flags & UMA_ZONE_OFFPAGE) && (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0) keg->uk_flags |= UMA_ZONE_HASH; } /* * Finish creating a large (> UMA_SLAB_SIZE) uma kegs. Just give in and do * OFFPAGE for now. When I can allow for more dynamic slab sizes this will be * more complicated. * * Arguments * keg The keg we should initialize * * Returns * Nothing */ static void keg_large_init(uma_keg_t keg) { u_int shsize; KASSERT(keg != NULL, ("Keg is null in keg_large_init")); KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0, ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg")); KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0, ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__)); keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE); keg->uk_slabsize = keg->uk_ppera * PAGE_SIZE; keg->uk_ipers = 1; keg->uk_rsize = keg->uk_size; /* We can't do OFFPAGE if we're internal, bail out here. */ if (keg->uk_flags & UMA_ZFLAG_INTERNAL) return; /* Check whether we have enough space to not do OFFPAGE. */ if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) { shsize = sizeof(struct uma_slab); if (keg->uk_flags & UMA_ZONE_REFCNT) shsize += keg->uk_ipers * sizeof(uint32_t); if (shsize & UMA_ALIGN_PTR) shsize = (shsize & ~UMA_ALIGN_PTR) + (UMA_ALIGN_PTR + 1); if ((PAGE_SIZE * keg->uk_ppera) - keg->uk_rsize < shsize) keg->uk_flags |= UMA_ZONE_OFFPAGE; } if ((keg->uk_flags & UMA_ZONE_OFFPAGE) && (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0) keg->uk_flags |= UMA_ZONE_HASH; } static void keg_cachespread_init(uma_keg_t keg) { int alignsize; int trailer; int pages; int rsize; KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0, ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__)); alignsize = keg->uk_align + 1; rsize = keg->uk_size; /* * We want one item to start on every align boundary in a page. To * do this we will span pages. We will also extend the item by the * size of align if it is an even multiple of align. Otherwise, it * would fall on the same boundary every time. */ if (rsize & keg->uk_align) rsize = (rsize & ~keg->uk_align) + alignsize; if ((rsize & alignsize) == 0) rsize += alignsize; trailer = rsize - keg->uk_size; pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE; pages = MIN(pages, (128 * 1024) / PAGE_SIZE); keg->uk_rsize = rsize; keg->uk_ppera = pages; keg->uk_slabsize = UMA_SLAB_SIZE; keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize; keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB; KASSERT(keg->uk_ipers <= SLAB_SETSIZE, ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__, keg->uk_ipers)); } /* * Keg header ctor. This initializes all fields, locks, etc. And inserts * the keg onto the global keg list. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_kctor_args */ static int keg_ctor(void *mem, int size, void *udata, int flags) { struct uma_kctor_args *arg = udata; uma_keg_t keg = mem; uma_zone_t zone; bzero(keg, size); keg->uk_size = arg->size; keg->uk_init = arg->uminit; keg->uk_fini = arg->fini; keg->uk_align = arg->align; keg->uk_free = 0; keg->uk_reserve = 0; keg->uk_pages = 0; keg->uk_flags = arg->flags; keg->uk_allocf = page_alloc; keg->uk_freef = page_free; keg->uk_slabzone = NULL; /* * The master zone is passed to us at keg-creation time. */ zone = arg->zone; keg->uk_name = zone->uz_name; if (arg->flags & UMA_ZONE_VM) keg->uk_flags |= UMA_ZFLAG_CACHEONLY; if (arg->flags & UMA_ZONE_ZINIT) keg->uk_init = zero_init; if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC) keg->uk_flags |= UMA_ZONE_VTOSLAB; if (arg->flags & UMA_ZONE_PCPU) #ifdef SMP keg->uk_flags |= UMA_ZONE_OFFPAGE; #else keg->uk_flags &= ~UMA_ZONE_PCPU; #endif if (keg->uk_flags & UMA_ZONE_CACHESPREAD) { keg_cachespread_init(keg); } else if (keg->uk_flags & UMA_ZONE_REFCNT) { if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - sizeof(uint32_t))) keg_large_init(keg); else keg_small_init(keg); } else { if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab))) keg_large_init(keg); else keg_small_init(keg); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) { if (keg->uk_flags & UMA_ZONE_REFCNT) { if (keg->uk_ipers > uma_max_ipers_ref) panic("Too many ref items per zone: %d > %d\n", keg->uk_ipers, uma_max_ipers_ref); keg->uk_slabzone = slabrefzone; } else keg->uk_slabzone = slabzone; } /* * If we haven't booted yet we need allocations to go through the * startup cache until the vm is ready. */ if (keg->uk_ppera == 1) { #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = uma_small_alloc; keg->uk_freef = uma_small_free; if (booted < UMA_STARTUP) keg->uk_allocf = startup_alloc; #else if (booted < UMA_STARTUP2) keg->uk_allocf = startup_alloc; #endif } else if (booted < UMA_STARTUP2 && (keg->uk_flags & UMA_ZFLAG_INTERNAL)) keg->uk_allocf = startup_alloc; /* * Initialize keg's lock */ KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS)); /* * If we're putting the slab header in the actual page we need to * figure out where in each page it goes. This calculates a right * justified offset into the memory on an ALIGN_PTR boundary. */ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) { u_int totsize; /* Size of the slab struct and free list */ totsize = sizeof(struct uma_slab); /* Size of the reference counts. */ if (keg->uk_flags & UMA_ZONE_REFCNT) totsize += keg->uk_ipers * sizeof(uint32_t); if (totsize & UMA_ALIGN_PTR) totsize = (totsize & ~UMA_ALIGN_PTR) + (UMA_ALIGN_PTR + 1); keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize; /* * The only way the following is possible is if with our * UMA_ALIGN_PTR adjustments we are now bigger than * UMA_SLAB_SIZE. I haven't checked whether this is * mathematically possible for all cases, so we make * sure here anyway. */ totsize = keg->uk_pgoff + sizeof(struct uma_slab); if (keg->uk_flags & UMA_ZONE_REFCNT) totsize += keg->uk_ipers * sizeof(uint32_t); if (totsize > PAGE_SIZE * keg->uk_ppera) { printf("zone %s ipers %d rsize %d size %d\n", zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size); panic("UMA slab won't fit."); } } if (keg->uk_flags & UMA_ZONE_HASH) hash_alloc(&keg->uk_hash); #ifdef UMA_DEBUG printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n", zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags, keg->uk_ipers, keg->uk_ppera, (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free); #endif LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); rw_wunlock(&uma_rwlock); return (0); } /* * Zone header ctor. This initializes all fields, locks, etc. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_zctor_args */ static int zone_ctor(void *mem, int size, void *udata, int flags) { struct uma_zctor_args *arg = udata; uma_zone_t zone = mem; uma_zone_t z; uma_keg_t keg; bzero(zone, size); zone->uz_name = arg->name; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; zone->uz_slab = zone_fetch_slab; zone->uz_init = NULL; zone->uz_fini = NULL; zone->uz_allocs = 0; zone->uz_frees = 0; zone->uz_fails = 0; zone->uz_sleeps = 0; zone->uz_count = 0; zone->uz_count_min = 0; zone->uz_flags = 0; zone->uz_warning = NULL; timevalclear(&zone->uz_ratecheck); keg = arg->keg; ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS)); /* * This is a pure cache zone, no kegs. */ if (arg->import) { if (arg->flags & UMA_ZONE_VM) arg->flags |= UMA_ZFLAG_CACHEONLY; zone->uz_flags = arg->flags; zone->uz_size = arg->size; zone->uz_import = arg->import; zone->uz_release = arg->release; zone->uz_arg = arg->arg; zone->uz_lockptr = &zone->uz_lock; rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); rw_wunlock(&uma_rwlock); goto out; } /* * Use the regular zone/keg/slab allocator. */ zone->uz_import = (uma_import)zone_import; zone->uz_release = (uma_release)zone_release; zone->uz_arg = zone; if (arg->flags & UMA_ZONE_SECONDARY) { KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); zone->uz_init = arg->uminit; zone->uz_fini = arg->fini; zone->uz_lockptr = &keg->uk_lock; zone->uz_flags |= UMA_ZONE_SECONDARY; rw_wlock(&uma_rwlock); ZONE_LOCK(zone); LIST_FOREACH(z, &keg->uk_zones, uz_link) { if (LIST_NEXT(z, uz_link) == NULL) { LIST_INSERT_AFTER(z, zone, uz_link); break; } } ZONE_UNLOCK(zone); rw_wunlock(&uma_rwlock); } else if (keg == NULL) { if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini, arg->align, arg->flags)) == NULL) return (ENOMEM); } else { struct uma_kctor_args karg; int error; /* We should only be here from uma_startup() */ karg.size = arg->size; karg.uminit = arg->uminit; karg.fini = arg->fini; karg.align = arg->align; karg.flags = arg->flags; karg.zone = zone; error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, flags); if (error) return (error); } /* * Link in the first keg. */ zone->uz_klink.kl_keg = keg; LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link); zone->uz_lockptr = &keg->uk_lock; zone->uz_size = keg->uk_size; zone->uz_flags |= (keg->uk_flags & (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); /* * Some internal zones don't have room allocated for the per cpu * caches. If we're internal, bail out here. */ if (keg->uk_flags & UMA_ZFLAG_INTERNAL) { KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0, ("Secondary zone requested UMA_ZFLAG_INTERNAL")); return (0); } out: if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0) zone->uz_count = bucket_select(zone->uz_size); else zone->uz_count = BUCKET_MAX; zone->uz_count_min = zone->uz_count; return (0); } /* * Keg header dtor. This frees all data, destroys locks, frees the hash * table and removes the keg from the global list. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void keg_dtor(void *arg, int size, void *udata) { uma_keg_t keg; keg = (uma_keg_t)arg; KEG_LOCK(keg); if (keg->uk_free != 0) { printf("Freed UMA keg (%s) was not empty (%d items). " " Lost %d pages of memory.\n", keg->uk_name ? keg->uk_name : "", keg->uk_free, keg->uk_pages); } KEG_UNLOCK(keg); hash_free(&keg->uk_hash); KEG_LOCK_FINI(keg); } /* * Zone header dtor. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void zone_dtor(void *arg, int size, void *udata) { uma_klink_t klink; uma_zone_t zone; uma_keg_t keg; zone = (uma_zone_t)arg; keg = zone_first_keg(zone); if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone); rw_wlock(&uma_rwlock); LIST_REMOVE(zone, uz_link); rw_wunlock(&uma_rwlock); /* * XXX there are some races here where * the zone can be drained but zone lock * released and then refilled before we * remove it... we dont care for now */ zone_drain_wait(zone, M_WAITOK); /* * Unlink all of our kegs. */ while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) { klink->kl_keg = NULL; LIST_REMOVE(klink, kl_link); if (klink == &zone->uz_klink) continue; free(klink, M_TEMP); } /* * We only destroy kegs from non secondary zones. */ if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0) { rw_wlock(&uma_rwlock); LIST_REMOVE(keg, uk_link); rw_wunlock(&uma_rwlock); zone_free_item(kegs, keg, NULL, SKIP_NONE); } ZONE_LOCK_FINI(zone); } /* * Traverses every zone in the system and calls a callback * * Arguments: * zfunc A pointer to a function which accepts a zone * as an argument. * * Returns: * Nothing */ static void zone_foreach(void (*zfunc)(uma_zone_t)) { uma_keg_t keg; uma_zone_t zone; rw_rlock(&uma_rwlock); LIST_FOREACH(keg, &uma_kegs, uk_link) { LIST_FOREACH(zone, &keg->uk_zones, uz_link) zfunc(zone); } rw_runlock(&uma_rwlock); } /* Public functions */ /* See uma.h */ void uma_startup(void *bootmem, int boot_pages) { struct uma_zctor_args args; uma_slab_t slab; u_int slabsize; int i; #ifdef UMA_DEBUG printf("Creating uma keg headers zone and keg.\n"); #endif rw_init(&uma_rwlock, "UMA lock"); /* "manually" create the initial zone */ memset(&args, 0, sizeof(args)); args.name = "UMA Kegs"; args.size = sizeof(struct uma_keg); args.ctor = keg_ctor; args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = &masterkeg; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK); #ifdef UMA_DEBUG printf("Filling boot free list.\n"); #endif for (i = 0; i < boot_pages; i++) { slab = (uma_slab_t)((uint8_t *)bootmem + (i * UMA_SLAB_SIZE)); slab->us_data = (uint8_t *)slab; slab->us_flags = UMA_SLAB_BOOT; LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link); } mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF); #ifdef UMA_DEBUG printf("Creating uma zone headers zone and keg.\n"); #endif args.name = "UMA Zones"; args.size = sizeof(struct uma_zone) + (sizeof(struct uma_cache) * (mp_maxid + 1)); args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = NULL; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK); #ifdef UMA_DEBUG printf("Initializing pcpu cache locks.\n"); #endif #ifdef UMA_DEBUG printf("Creating slab and hash zones.\n"); #endif /* Now make a zone for slab headers */ slabzone = uma_zcreate("UMA Slabs", sizeof(struct uma_slab), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); /* * We also create a zone for the bigger slabs with reference * counts in them, to accomodate UMA_ZONE_REFCNT zones. */ slabsize = sizeof(struct uma_slab_refcnt); slabsize += uma_max_ipers_ref * sizeof(uint32_t); slabrefzone = uma_zcreate("UMA RCntSlabs", slabsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); bucket_init(); booted = UMA_STARTUP; #ifdef UMA_DEBUG printf("UMA startup complete.\n"); #endif } /* see uma.h */ void uma_startup2(void) { booted = UMA_STARTUP2; bucket_enable(); sx_init(&uma_drain_lock, "umadrain"); #ifdef UMA_DEBUG printf("UMA startup2 complete.\n"); #endif } /* * Initialize our callout handle * */ static void uma_startup3(void) { #ifdef UMA_DEBUG printf("Starting callout.\n"); #endif callout_init(&uma_callout, CALLOUT_MPSAFE); callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); #ifdef UMA_DEBUG printf("UMA startup3 complete.\n"); #endif } static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_kctor_args args; args.size = size; args.uminit = uminit; args.fini = fini; args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align; args.flags = flags; args.zone = zone; return (zone_alloc_item(kegs, &args, M_WAITOK)); } /* See uma.h */ void uma_set_align(int align) { if (align != UMA_ALIGN_CACHE) uma_align_cache = align; } /* See uma.h */ uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_zctor_args args; uma_zone_t res; bool locked; /* This stuff is essential for the zone ctor */ memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = uminit; args.fini = fini; args.align = align; args.flags = flags; args.keg = NULL; if (booted < UMA_STARTUP2) { locked = false; } else { sx_slock(&uma_drain_lock); locked = true; } res = zone_alloc_item(zones, &args, M_WAITOK); if (locked) sx_sunlock(&uma_drain_lock); return (res); } /* See uma.h */ uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t master) { struct uma_zctor_args args; uma_keg_t keg; uma_zone_t res; bool locked; keg = zone_first_keg(master); memset(&args, 0, sizeof(args)); args.name = name; args.size = keg->uk_size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.align = keg->uk_align; args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; args.keg = keg; if (booted < UMA_STARTUP2) { locked = false; } else { sx_slock(&uma_drain_lock); locked = true; } /* XXX Attaches only one keg of potentially many. */ res = zone_alloc_item(zones, &args, M_WAITOK); if (locked) sx_sunlock(&uma_drain_lock); return (res); } /* See uma.h */ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease, void *arg, int flags) { struct uma_zctor_args args; memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.import = zimport; args.release = zrelease; args.arg = arg; args.align = 0; args.flags = flags; return (zone_alloc_item(zones, &args, M_WAITOK)); } static void zone_lock_pair(uma_zone_t a, uma_zone_t b) { if (a < b) { ZONE_LOCK(a); mtx_lock_flags(b->uz_lockptr, MTX_DUPOK); } else { ZONE_LOCK(b); mtx_lock_flags(a->uz_lockptr, MTX_DUPOK); } } static void zone_unlock_pair(uma_zone_t a, uma_zone_t b) { ZONE_UNLOCK(a); ZONE_UNLOCK(b); } int uma_zsecond_add(uma_zone_t zone, uma_zone_t master) { uma_klink_t klink; uma_klink_t kl; int error; error = 0; klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO); zone_lock_pair(zone, master); /* * zone must use vtoslab() to resolve objects and must already be * a secondary. */ if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) { error = EINVAL; goto out; } /* * The new master must also use vtoslab(). */ if ((zone->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) { error = EINVAL; goto out; } /* * Both must either be refcnt, or not be refcnt. */ if ((zone->uz_flags & UMA_ZONE_REFCNT) != (master->uz_flags & UMA_ZONE_REFCNT)) { error = EINVAL; goto out; } /* * The underlying object must be the same size. rsize * may be different. */ if (master->uz_size != zone->uz_size) { error = E2BIG; goto out; } /* * Put it at the end of the list. */ klink->kl_keg = zone_first_keg(master); LIST_FOREACH(kl, &zone->uz_kegs, kl_link) { if (LIST_NEXT(kl, kl_link) == NULL) { LIST_INSERT_AFTER(kl, klink, kl_link); break; } } klink = NULL; zone->uz_flags |= UMA_ZFLAG_MULTI; zone->uz_slab = zone_fetch_slab_multi; out: zone_unlock_pair(zone, master); if (klink != NULL) free(klink, M_TEMP); return (error); } /* See uma.h */ void uma_zdestroy(uma_zone_t zone) { sx_slock(&uma_drain_lock); zone_free_item(zones, zone, NULL, SKIP_NONE); sx_sunlock(&uma_drain_lock); } /* See uma.h */ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { void *item; uma_cache_t cache; uma_bucket_t bucket; int lockfail; int cpu; /* This is the fast path allocation */ #ifdef UMA_DEBUG_ALLOC_1 printf("Allocating one item from %s(%p)\n", zone->uz_name, zone); #endif CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread, zone->uz_name, flags); if (flags & M_WAITOK) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_arg: zone \"%s\"", zone->uz_name); } #ifdef DEBUG_MEMGUARD if (memguard_cmp_zone(zone)) { item = memguard_alloc(zone->uz_size, flags); if (item != NULL) { /* * Avoid conflict with the use-after-free * protecting infrastructure from INVARIANTS. */ if (zone->uz_init != NULL && zone->uz_init != mtrash_init && zone->uz_init(item, zone->uz_size, flags) != 0) return (NULL); if (zone->uz_ctor != NULL && zone->uz_ctor != mtrash_ctor && zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { zone->uz_fini(item, zone->uz_size); return (NULL); } return (item); } /* This is unfortunate but should not be fatal. */ } #endif /* * If possible, allocate from the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to allocate from * the current cache; when we re-acquire the critical section, we * must detect and handle migration if it has occurred. */ critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; zalloc_start: bucket = cache->uc_allocbucket; if (bucket != NULL && bucket->ub_cnt > 0) { bucket->ub_cnt--; item = bucket->ub_bucket[bucket->ub_cnt]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt] = NULL; #endif KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); cache->uc_allocs++; critical_exit(); if (zone->uz_ctor != NULL && zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { atomic_add_long(&zone->uz_fails, 1); zone_free_item(zone, item, udata, SKIP_DTOR); return (NULL); } #ifdef INVARIANTS uma_dbg_alloc(zone, NULL, item); #endif if (flags & M_ZERO) uma_zero_item(item, zone); return (item); } /* * We have run out of items in our alloc bucket. * See if we can switch with our free bucket. */ bucket = cache->uc_freebucket; if (bucket != NULL && bucket->ub_cnt > 0) { #ifdef UMA_DEBUG_ALLOC printf("uma_zalloc: Swapping empty with alloc.\n"); #endif cache->uc_freebucket = cache->uc_allocbucket; cache->uc_allocbucket = bucket; goto zalloc_start; } /* * Discard any empty allocation bucket while we hold no locks. */ bucket = cache->uc_allocbucket; cache->uc_allocbucket = NULL; critical_exit(); if (bucket != NULL) bucket_free(zone, bucket, udata); /* Short-circuit for zones without buckets and low memory. */ if (zone->uz_count == 0 || bucketdisable) goto zalloc_item; /* * Attempt to retrieve the item from the per-CPU cache has failed, so * we must go back to the zone. This requires the zone lock, so we * must drop the critical section, then re-acquire it when we go back * to the cache. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ lockfail = 0; if (ZONE_TRYLOCK(zone) == 0) { /* Record contention to size the buckets. */ ZONE_LOCK(zone); lockfail = 1; } critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; /* * Since we have locked the zone we may as well send back our stats. */ atomic_add_long(&zone->uz_allocs, cache->uc_allocs); atomic_add_long(&zone->uz_frees, cache->uc_frees); cache->uc_allocs = 0; cache->uc_frees = 0; /* See if we lost the race to fill the cache. */ if (cache->uc_allocbucket != NULL) { ZONE_UNLOCK(zone); goto zalloc_start; } /* * Check the zone's cache of buckets. */ if ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) { KASSERT(bucket->ub_cnt != 0, ("uma_zalloc_arg: Returning an empty bucket.")); LIST_REMOVE(bucket, ub_link); cache->uc_allocbucket = bucket; ZONE_UNLOCK(zone); goto zalloc_start; } /* We are no longer associated with this CPU. */ critical_exit(); /* * We bump the uz count when the cache size is insufficient to * handle the working set. */ if (lockfail && zone->uz_count < BUCKET_MAX) zone->uz_count++; ZONE_UNLOCK(zone); /* * Now lets just fill a bucket and put it on the free list. If that * works we'll restart the allocation from the begining and it * will use the just filled bucket. */ bucket = zone_alloc_bucket(zone, udata, flags); if (bucket != NULL) { ZONE_LOCK(zone); critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; /* * See if we lost the race or were migrated. Cache the * initialized bucket to make this less likely or claim * the memory directly. */ if (cache->uc_allocbucket == NULL) cache->uc_allocbucket = bucket; else LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link); ZONE_UNLOCK(zone); goto zalloc_start; } /* * We may not be able to get a bucket so return an actual item. */ #ifdef UMA_DEBUG printf("uma_zalloc_arg: Bucketzone returned NULL\n"); #endif zalloc_item: item = zone_alloc_item(zone, udata, flags); return (item); } static uma_slab_t keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags) { uma_slab_t slab; int reserve; mtx_assert(&keg->uk_lock, MA_OWNED); slab = NULL; reserve = 0; if ((flags & M_USE_RESERVE) == 0) reserve = keg->uk_reserve; for (;;) { /* * Find a slab with some space. Prefer slabs that are partially * used over those that are totally full. This helps to reduce * fragmentation. */ if (keg->uk_free > reserve) { if (!LIST_EMPTY(&keg->uk_part_slab)) { slab = LIST_FIRST(&keg->uk_part_slab); } else { slab = LIST_FIRST(&keg->uk_free_slab); LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); } MPASS(slab->us_keg == keg); return (slab); } /* * M_NOVM means don't ask at all! */ if (flags & M_NOVM) break; if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) { keg->uk_flags |= UMA_ZFLAG_FULL; /* * If this is not a multi-zone, set the FULL bit. * Otherwise slab_multi() takes care of it. */ if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) { zone->uz_flags |= UMA_ZFLAG_FULL; zone_log_warning(zone); } if (flags & M_NOWAIT) break; zone->uz_sleeps++; msleep(keg, &keg->uk_lock, PVM, "keglimit", 0); continue; } slab = keg_alloc_slab(keg, zone, flags); /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove * at least one item. */ if (slab) { MPASS(slab->us_keg == keg); LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); return (slab); } /* * We might not have been able to get a slab but another cpu * could have while we were unlocked. Check again before we * fail. */ flags |= M_NOVM; } return (slab); } static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags) { uma_slab_t slab; if (keg == NULL) { keg = zone_first_keg(zone); KEG_LOCK(keg); } for (;;) { slab = keg_fetch_slab(keg, zone, flags); if (slab) return (slab); if (flags & (M_NOWAIT | M_NOVM)) break; } KEG_UNLOCK(keg); return (NULL); } /* * uma_zone_fetch_slab_multi: Fetches a slab from one available keg. Returns * with the keg locked. On NULL no lock is held. * * The last pointer is used to seed the search. It is not required. */ static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags) { uma_klink_t klink; uma_slab_t slab; uma_keg_t keg; int flags; int empty; int full; /* * Don't wait on the first pass. This will skip limit tests * as well. We don't want to block if we can find a provider * without blocking. */ flags = (rflags & ~M_WAITOK) | M_NOWAIT; /* * Use the last slab allocated as a hint for where to start * the search. */ if (last != NULL) { slab = keg_fetch_slab(last, zone, flags); if (slab) return (slab); KEG_UNLOCK(last); } /* * Loop until we have a slab incase of transient failures * while M_WAITOK is specified. I'm not sure this is 100% * required but we've done it for so long now. */ for (;;) { empty = 0; full = 0; /* * Search the available kegs for slabs. Be careful to hold the * correct lock while calling into the keg layer. */ LIST_FOREACH(klink, &zone->uz_kegs, kl_link) { keg = klink->kl_keg; KEG_LOCK(keg); if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) { slab = keg_fetch_slab(keg, zone, flags); if (slab) return (slab); } if (keg->uk_flags & UMA_ZFLAG_FULL) full++; else empty++; KEG_UNLOCK(keg); } if (rflags & (M_NOWAIT | M_NOVM)) break; flags = rflags; /* * All kegs are full. XXX We can't atomically check all kegs * and sleep so just sleep for a short period and retry. */ if (full && !empty) { ZONE_LOCK(zone); zone->uz_flags |= UMA_ZFLAG_FULL; zone->uz_sleeps++; zone_log_warning(zone); msleep(zone, zone->uz_lockptr, PVM, "zonelimit", hz/100); zone->uz_flags &= ~UMA_ZFLAG_FULL; ZONE_UNLOCK(zone); continue; } } return (NULL); } static void * slab_alloc_item(uma_keg_t keg, uma_slab_t slab) { void *item; uint8_t freei; MPASS(keg == slab->us_keg); mtx_assert(&keg->uk_lock, MA_OWNED); freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1; BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free); item = slab->us_data + (keg->uk_rsize * freei); slab->us_freecount--; keg->uk_free--; /* Move this slab to the full list */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link); } return (item); } static int zone_import(uma_zone_t zone, void **bucket, int max, int flags) { uma_slab_t slab; uma_keg_t keg; int i; slab = NULL; keg = NULL; /* Try to keep the buckets totally full */ for (i = 0; i < max; ) { if ((slab = zone->uz_slab(zone, keg, flags)) == NULL) break; keg = slab->us_keg; while (slab->us_freecount && i < max) { bucket[i++] = slab_alloc_item(keg, slab); if (keg->uk_free <= keg->uk_reserve) break; } /* Don't grab more than one slab at a time. */ flags &= ~M_WAITOK; flags |= M_NOWAIT; } if (slab != NULL) KEG_UNLOCK(keg); return i; } static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *udata, int flags) { uma_bucket_t bucket; int max; /* Don't wait for buckets, preserve caller's NOVM setting. */ bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); if (bucket == NULL) return (NULL); max = MIN(bucket->ub_entries, zone->uz_count); bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, max, flags); /* * Initialize the memory if necessary. */ if (bucket->ub_cnt != 0 && zone->uz_init != NULL) { int i; for (i = 0; i < bucket->ub_cnt; i++) if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size, flags) != 0) break; /* * If we couldn't initialize the whole bucket, put the * rest back onto the freelist. */ if (i != bucket->ub_cnt) { zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i], bucket->ub_cnt - i); #ifdef INVARIANTS bzero(&bucket->ub_bucket[i], sizeof(void *) * (bucket->ub_cnt - i)); #endif bucket->ub_cnt = i; } } if (bucket->ub_cnt == 0) { bucket_free(zone, bucket, udata); atomic_add_long(&zone->uz_fails, 1); return (NULL); } return (bucket); } /* * Allocates a single item from a zone. * * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns * NULL if there is no memory and M_NOWAIT is set * An item if successful */ static void * zone_alloc_item(uma_zone_t zone, void *udata, int flags) { void *item; item = NULL; #ifdef UMA_DEBUG_ALLOC printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone); #endif if (zone->uz_import(zone->uz_arg, &item, 1, flags) != 1) goto fail; atomic_add_long(&zone->uz_allocs, 1); /* * We have to call both the zone's init (not the keg's init) * and the zone's ctor. This is because the item is going from * a keg slab directly to the user, and the user is expecting it * to be both zone-init'd as well as zone-ctor'd. */ if (zone->uz_init != NULL) { if (zone->uz_init(item, zone->uz_size, flags) != 0) { zone_free_item(zone, item, udata, SKIP_FINI); goto fail; } } if (zone->uz_ctor != NULL) { if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { zone_free_item(zone, item, udata, SKIP_DTOR); goto fail; } } #ifdef INVARIANTS uma_dbg_alloc(zone, NULL, item); #endif if (flags & M_ZERO) uma_zero_item(item, zone); return (item); fail: atomic_add_long(&zone->uz_fails, 1); return (NULL); } /* See uma.h */ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_cache_t cache; uma_bucket_t bucket; int lockfail; int cpu; #ifdef UMA_DEBUG_ALLOC_1 printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone); #endif CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread, zone->uz_name); /* uma_zfree(..., NULL) does nothing, to match free(9). */ if (item == NULL) return; #ifdef DEBUG_MEMGUARD if (is_memguard_addr(item)) { if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor) zone->uz_dtor(item, zone->uz_size, udata); if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini) zone->uz_fini(item, zone->uz_size); memguard_free(item); return; } #endif #ifdef INVARIANTS if (zone->uz_flags & UMA_ZONE_MALLOC) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); #endif if (zone->uz_dtor != NULL) zone->uz_dtor(item, zone->uz_size, udata); /* * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. */ if (zone->uz_flags & UMA_ZFLAG_FULL) goto zfree_item; /* * If possible, free to the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to free to the * current cache; when we re-acquire the critical section, we must * detect and handle migration if it has occurred. */ zfree_restart: critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; zfree_start: /* * Try to free into the allocbucket first to give LIFO ordering * for cache-hot datastructures. Spill over into the freebucket * if necessary. Alloc will swap them if one runs dry. */ bucket = cache->uc_allocbucket; if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries) bucket = cache->uc_freebucket; if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) { KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL, ("uma_zfree: Freeing to non free bucket index.")); bucket->ub_bucket[bucket->ub_cnt] = item; bucket->ub_cnt++; cache->uc_frees++; critical_exit(); return; } /* * We must go back the zone, which requires acquiring the zone lock, * which in turn means we must release and re-acquire the critical * section. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ critical_exit(); if (zone->uz_count == 0 || bucketdisable) goto zfree_item; lockfail = 0; if (ZONE_TRYLOCK(zone) == 0) { /* Record contention to size the buckets. */ ZONE_LOCK(zone); lockfail = 1; } critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; /* * Since we have locked the zone we may as well send back our stats. */ atomic_add_long(&zone->uz_allocs, cache->uc_allocs); atomic_add_long(&zone->uz_frees, cache->uc_frees); cache->uc_allocs = 0; cache->uc_frees = 0; bucket = cache->uc_freebucket; if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) { ZONE_UNLOCK(zone); goto zfree_start; } cache->uc_freebucket = NULL; /* Can we throw this on the zone full list? */ if (bucket != NULL) { #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Putting old bucket on the free list.\n"); #endif /* ub_cnt is pointing to the last free item */ KASSERT(bucket->ub_cnt != 0, ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n")); LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link); } /* We are no longer associated with this CPU. */ critical_exit(); /* * We bump the uz count when the cache size is insufficient to * handle the working set. */ if (lockfail && zone->uz_count < BUCKET_MAX) zone->uz_count++; ZONE_UNLOCK(zone); #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Allocating new free bucket.\n"); #endif bucket = bucket_alloc(zone, udata, M_NOWAIT); if (bucket) { critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; if (cache->uc_freebucket == NULL) { cache->uc_freebucket = bucket; goto zfree_start; } /* * We lost the race, start over. We have to drop our * critical section to free the bucket. */ critical_exit(); bucket_free(zone, bucket, udata); goto zfree_restart; } /* * If nothing else caught this, we'll just do an internal free. */ zfree_item: zone_free_item(zone, item, udata, SKIP_DTOR); return; } static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item) { uint8_t freei; mtx_assert(&keg->uk_lock, MA_OWNED); MPASS(keg == slab->us_keg); /* Do we need to remove from any lists? */ if (slab->us_freecount+1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); } /* Slab management. */ freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize; BIT_SET(SLAB_SETSIZE, freei, &slab->us_free); slab->us_freecount++; /* Keg statistics. */ keg->uk_free++; } static void zone_release(uma_zone_t zone, void **bucket, int cnt) { void *item; uma_slab_t slab; uma_keg_t keg; uint8_t *mem; int clearfull; int i; clearfull = 0; keg = zone_first_keg(zone); KEG_LOCK(keg); for (i = 0; i < cnt; i++) { item = bucket[i]; if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) { mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); if (zone->uz_flags & UMA_ZONE_HASH) { slab = hash_sfind(&keg->uk_hash, mem); } else { mem += keg->uk_pgoff; slab = (uma_slab_t)mem; } } else { slab = vtoslab((vm_offset_t)item); if (slab->us_keg != keg) { KEG_UNLOCK(keg); keg = slab->us_keg; KEG_LOCK(keg); } } slab_free_item(keg, slab, item); if (keg->uk_flags & UMA_ZFLAG_FULL) { if (keg->uk_pages < keg->uk_maxpages) { keg->uk_flags &= ~UMA_ZFLAG_FULL; clearfull = 1; } /* * We can handle one more allocation. Since we're * clearing ZFLAG_FULL, wake up all procs blocked * on pages. This should be uncommon, so keeping this * simple for now (rather than adding count of blocked * threads etc). */ wakeup(keg); } } KEG_UNLOCK(keg); if (clearfull) { ZONE_LOCK(zone); zone->uz_flags &= ~UMA_ZFLAG_FULL; wakeup(zone); ZONE_UNLOCK(zone); } } /* * Frees a single item to any zone. * * Arguments: * zone The zone to free to * item The item we're freeing * udata User supplied data for the dtor * skip Skip dtors and finis */ static void zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) { #ifdef INVARIANTS if (skip == SKIP_NONE) { if (zone->uz_flags & UMA_ZONE_MALLOC) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); } #endif if (skip < SKIP_DTOR && zone->uz_dtor) zone->uz_dtor(item, zone->uz_size, udata); if (skip < SKIP_FINI && zone->uz_fini) zone->uz_fini(item, zone->uz_size); atomic_add_long(&zone->uz_frees, 1); zone->uz_release(zone->uz_arg, &item, 1); } /* See uma.h */ int uma_zone_set_max(uma_zone_t zone, int nitems) { uma_keg_t keg; keg = zone_first_keg(zone); if (keg == NULL) return (0); KEG_LOCK(keg); keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera; if (keg->uk_maxpages * keg->uk_ipers < nitems) keg->uk_maxpages += keg->uk_ppera; nitems = keg->uk_maxpages * keg->uk_ipers; KEG_UNLOCK(keg); return (nitems); } /* See uma.h */ int uma_zone_get_max(uma_zone_t zone) { int nitems; uma_keg_t keg; keg = zone_first_keg(zone); if (keg == NULL) return (0); KEG_LOCK(keg); nitems = keg->uk_maxpages * keg->uk_ipers; KEG_UNLOCK(keg); return (nitems); } /* See uma.h */ void uma_zone_set_warning(uma_zone_t zone, const char *warning) { ZONE_LOCK(zone); zone->uz_warning = warning; ZONE_UNLOCK(zone); } /* See uma.h */ int uma_zone_get_cur(uma_zone_t zone) { int64_t nitems; u_int i; ZONE_LOCK(zone); nitems = zone->uz_allocs - zone->uz_frees; CPU_FOREACH(i) { /* * See the comment in sysctl_vm_zone_stats() regarding the * safety of accessing the per-cpu caches. With the zone lock * held, it is safe, but can potentially result in stale data. */ nitems += zone->uz_cpu[i].uc_allocs - zone->uz_cpu[i].uc_frees; } ZONE_UNLOCK(zone); return (nitems < 0 ? 0 : nitems); } /* See uma.h */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit) { uma_keg_t keg; keg = zone_first_keg(zone); KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type")); KEG_LOCK(keg); KASSERT(keg->uk_pages == 0, ("uma_zone_set_init on non-empty keg")); keg->uk_init = uminit; KEG_UNLOCK(keg); } /* See uma.h */ void uma_zone_set_fini(uma_zone_t zone, uma_fini fini) { uma_keg_t keg; keg = zone_first_keg(zone); KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type")); KEG_LOCK(keg); KASSERT(keg->uk_pages == 0, ("uma_zone_set_fini on non-empty keg")); keg->uk_fini = fini; KEG_UNLOCK(keg); } /* See uma.h */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) { ZONE_LOCK(zone); KASSERT(zone_first_keg(zone)->uk_pages == 0, ("uma_zone_set_zinit on non-empty keg")); zone->uz_init = zinit; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) { ZONE_LOCK(zone); KASSERT(zone_first_keg(zone)->uk_pages == 0, ("uma_zone_set_zfini on non-empty keg")); zone->uz_fini = zfini; ZONE_UNLOCK(zone); } /* See uma.h */ /* XXX uk_freef is not actually used with the zone locked */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef) { uma_keg_t keg; keg = zone_first_keg(zone); KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type")); KEG_LOCK(keg); keg->uk_freef = freef; KEG_UNLOCK(keg); } /* See uma.h */ /* XXX uk_allocf is not actually used with the zone locked */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { uma_keg_t keg; keg = zone_first_keg(zone); KEG_LOCK(keg); keg->uk_allocf = allocf; KEG_UNLOCK(keg); } /* See uma.h */ void uma_zone_reserve(uma_zone_t zone, int items) { uma_keg_t keg; keg = zone_first_keg(zone); if (keg == NULL) return; KEG_LOCK(keg); keg->uk_reserve = items; KEG_UNLOCK(keg); return; } /* See uma.h */ int uma_zone_reserve_kva(uma_zone_t zone, int count) { uma_keg_t keg; vm_offset_t kva; int pages; keg = zone_first_keg(zone); if (keg == NULL) return (0); pages = count / keg->uk_ipers; if (pages * keg->uk_ipers < count) pages++; #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera > 1) { #else if (1) { #endif kva = kva_alloc(pages * UMA_SLAB_SIZE); if (kva == 0) return (0); } else kva = 0; KEG_LOCK(keg); keg->uk_kva = kva; keg->uk_offset = 0; keg->uk_maxpages = pages; #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc; #else keg->uk_allocf = noobj_alloc; #endif keg->uk_flags |= UMA_ZONE_NOFREE; KEG_UNLOCK(keg); return (1); } /* See uma.h */ void uma_prealloc(uma_zone_t zone, int items) { int slabs; uma_slab_t slab; uma_keg_t keg; keg = zone_first_keg(zone); if (keg == NULL) return; KEG_LOCK(keg); slabs = items / keg->uk_ipers; if (slabs * keg->uk_ipers < items) slabs++; while (slabs > 0) { slab = keg_alloc_slab(keg, zone, M_WAITOK); if (slab == NULL) break; MPASS(slab->us_keg == keg); LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); slabs--; } KEG_UNLOCK(keg); } /* See uma.h */ uint32_t * uma_find_refcnt(uma_zone_t zone, void *item) { uma_slabrefcnt_t slabref; uma_slab_t slab; uma_keg_t keg; uint32_t *refcnt; int idx; slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK)); slabref = (uma_slabrefcnt_t)slab; keg = slab->us_keg; KASSERT(keg->uk_flags & UMA_ZONE_REFCNT, ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT")); idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize; refcnt = &slabref->us_refcnt[idx]; return refcnt; } /* See uma.h */ static void uma_reclaim_locked(bool kmem_danger) { #ifdef UMA_DEBUG printf("UMA: vm asked us to release pages!\n"); #endif sx_assert(&uma_drain_lock, SA_XLOCKED); bucket_enable(); zone_foreach(zone_drain); if (vm_page_count_min() || kmem_danger) { cache_drain_safe(NULL); zone_foreach(zone_drain); } /* * Some slabs may have been freed but this zone will be visited early * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ zone_drain(slabzone); zone_drain(slabrefzone); bucket_zone_drain(); } void uma_reclaim(void) { sx_xlock(&uma_drain_lock); uma_reclaim_locked(false); sx_xunlock(&uma_drain_lock); } static int uma_reclaim_needed; void uma_reclaim_wakeup(void) { uma_reclaim_needed = 1; wakeup(&uma_reclaim_needed); } void uma_reclaim_worker(void *arg __unused) { sx_xlock(&uma_drain_lock); for (;;) { sx_sleep(&uma_reclaim_needed, &uma_drain_lock, PVM, "umarcl", 0); if (uma_reclaim_needed) { uma_reclaim_needed = 0; uma_reclaim_locked(true); } } } /* See uma.h */ int uma_zone_exhausted(uma_zone_t zone) { int full; ZONE_LOCK(zone); full = (zone->uz_flags & UMA_ZFLAG_FULL); ZONE_UNLOCK(zone); return (full); } int uma_zone_exhausted_nolock(uma_zone_t zone) { return (zone->uz_flags & UMA_ZFLAG_FULL); } void * -uma_large_malloc(int size, int wait) +uma_large_malloc(vm_size_t size, int wait) { void *mem; uma_slab_t slab; uint8_t flags; slab = zone_alloc_item(slabzone, NULL, wait); if (slab == NULL) return (NULL); mem = page_alloc(NULL, size, &flags, wait); if (mem) { vsetslab((vm_offset_t)mem, slab); slab->us_data = mem; slab->us_flags = flags | UMA_SLAB_MALLOC; slab->us_size = size; } else { zone_free_item(slabzone, slab, NULL, SKIP_NONE); } return (mem); } void uma_large_free(uma_slab_t slab) { page_free(slab->us_data, slab->us_size, slab->us_flags); zone_free_item(slabzone, slab, NULL, SKIP_NONE); } static void uma_zero_item(void *item, uma_zone_t zone) { if (zone->uz_flags & UMA_ZONE_PCPU) { for (int i = 0; i < mp_ncpus; i++) bzero(zpcpu_get_cpu(item, i), zone->uz_size); } else bzero(item, zone->uz_size); } void uma_print_stats(void) { zone_foreach(uma_print_zone); } static void slab_print(uma_slab_t slab) { printf("slab: keg %p, data %p, freecount %d\n", slab->us_keg, slab->us_data, slab->us_freecount); } static void cache_print(uma_cache_t cache) { printf("alloc: %p(%d), free: %p(%d)\n", cache->uc_allocbucket, cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0, cache->uc_freebucket, cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0); } static void uma_print_keg(uma_keg_t keg) { uma_slab_t slab; printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d " "out %d free %d limit %d\n", keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags, keg->uk_ipers, keg->uk_ppera, (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers); printf("Part slabs:\n"); LIST_FOREACH(slab, &keg->uk_part_slab, us_link) slab_print(slab); printf("Free slabs:\n"); LIST_FOREACH(slab, &keg->uk_free_slab, us_link) slab_print(slab); printf("Full slabs:\n"); LIST_FOREACH(slab, &keg->uk_full_slab, us_link) slab_print(slab); } void uma_print_zone(uma_zone_t zone) { uma_cache_t cache; uma_klink_t kl; int i; printf("zone: %s(%p) size %d flags %#x\n", zone->uz_name, zone, zone->uz_size, zone->uz_flags); LIST_FOREACH(kl, &zone->uz_kegs, kl_link) uma_print_keg(kl->kl_keg); CPU_FOREACH(i) { cache = &zone->uz_cpu[i]; printf("CPU %d Cache:\n", i); cache_print(cache); } } #ifdef DDB /* * Generate statistics across both the zone and its per-cpu cache's. Return * desired statistics if the pointer is non-NULL for that statistic. * * Note: does not update the zone statistics, as it can't safely clear the * per-CPU cache statistic. * * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't * safe from off-CPU; we should modify the caches to track this information * directly so that we don't have to. */ static void uma_zone_sumstat(uma_zone_t z, int *cachefreep, uint64_t *allocsp, uint64_t *freesp, uint64_t *sleepsp) { uma_cache_t cache; uint64_t allocs, frees, sleeps; int cachefree, cpu; allocs = frees = sleeps = 0; cachefree = 0; CPU_FOREACH(cpu) { cache = &z->uz_cpu[cpu]; if (cache->uc_allocbucket != NULL) cachefree += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) cachefree += cache->uc_freebucket->ub_cnt; allocs += cache->uc_allocs; frees += cache->uc_frees; } allocs += z->uz_allocs; frees += z->uz_frees; sleeps += z->uz_sleeps; if (cachefreep != NULL) *cachefreep = cachefree; if (allocsp != NULL) *allocsp = allocs; if (freesp != NULL) *freesp = frees; if (sleepsp != NULL) *sleepsp = sleeps; } #endif /* DDB */ static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) { uma_keg_t kz; uma_zone_t z; int count; count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } rw_runlock(&uma_rwlock); return (sysctl_handle_int(oidp, &count, 0, req)); } static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) { struct uma_stream_header ush; struct uma_type_header uth; struct uma_percpu_stat ups; uma_bucket_t bucket; struct sbuf sbuf; uma_cache_t cache; uma_klink_t kl; uma_keg_t kz; uma_zone_t z; uma_keg_t k; int count, error, i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } /* * Insert stream header. */ bzero(&ush, sizeof(ush)); ush.ush_version = UMA_STREAM_VERSION; ush.ush_maxcpus = (mp_maxid + 1); ush.ush_count = count; (void)sbuf_bcat(&sbuf, &ush, sizeof(ush)); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { bzero(&uth, sizeof(uth)); ZONE_LOCK(z); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_align = kz->uk_align; uth.uth_size = kz->uk_size; uth.uth_rsize = kz->uk_rsize; LIST_FOREACH(kl, &z->uz_kegs, kl_link) { k = kl->kl_keg; uth.uth_maxpages += k->uk_maxpages; uth.uth_pages += k->uk_pages; uth.uth_keg_free += k->uk_free; uth.uth_limit = (k->uk_maxpages / k->uk_ppera) * k->uk_ipers; } /* * A zone is secondary is it is not the first entry * on the keg's zone list. */ if ((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z)) uth.uth_zone_flags = UTH_ZONE_SECONDARY; LIST_FOREACH(bucket, &z->uz_buckets, ub_link) uth.uth_zone_free += bucket->ub_cnt; uth.uth_allocs = z->uz_allocs; uth.uth_frees = z->uz_frees; uth.uth_fails = z->uz_fails; uth.uth_sleeps = z->uz_sleeps; (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); /* * While it is not normally safe to access the cache * bucket pointers while not on the CPU that owns the * cache, we only allow the pointers to be exchanged * without the zone lock held, not invalidated, so * accept the possible race associated with bucket * exchange during monitoring. */ for (i = 0; i < (mp_maxid + 1); i++) { bzero(&ups, sizeof(ups)); if (kz->uk_flags & UMA_ZFLAG_INTERNAL) goto skip; if (CPU_ABSENT(i)) goto skip; cache = &z->uz_cpu[i]; if (cache->uc_allocbucket != NULL) ups.ups_cache_free += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) ups.ups_cache_free += cache->uc_freebucket->ub_cnt; ups.ups_allocs = cache->uc_allocs; ups.ups_frees = cache->uc_frees; skip: (void)sbuf_bcat(&sbuf, &ups, sizeof(ups)); } ZONE_UNLOCK(z); } } rw_runlock(&uma_rwlock); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = *(uma_zone_t *)arg1; int error, max, old; old = max = uma_zone_get_max(zone); error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); if (max < old) return (EINVAL); uma_zone_set_max(zone, max); return (0); } int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = *(uma_zone_t *)arg1; int cur; cur = uma_zone_get_cur(zone); return (sysctl_handle_int(oidp, &cur, 0, req)); } #ifdef DDB DB_SHOW_COMMAND(uma, db_show_uma) { uint64_t allocs, frees, sleeps; uma_bucket_t bucket; uma_keg_t kz; uma_zone_t z; int cachefree; db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used", "Free", "Requests", "Sleeps", "Bucket"); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { allocs = z->uz_allocs; frees = z->uz_frees; sleeps = z->uz_sleeps; cachefree = 0; } else uma_zone_sumstat(z, &cachefree, &allocs, &frees, &sleeps); if (!((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z))) cachefree += kz->uk_free; LIST_FOREACH(bucket, &z->uz_buckets, ub_link) cachefree += bucket->ub_cnt; db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n", z->uz_name, (uintmax_t)kz->uk_size, (intmax_t)(allocs - frees), cachefree, (uintmax_t)allocs, sleeps, z->uz_count); if (db_pager_quit) return; } } } DB_SHOW_COMMAND(umacache, db_show_umacache) { uint64_t allocs, frees; uma_bucket_t bucket; uma_zone_t z; int cachefree; db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free", "Requests", "Bucket"); LIST_FOREACH(z, &uma_cachezones, uz_link) { uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL); LIST_FOREACH(bucket, &z->uz_buckets, ub_link) cachefree += bucket->ub_cnt; db_printf("%18s %8ju %8jd %8d %12ju %8u\n", z->uz_name, (uintmax_t)z->uz_size, (intmax_t)(allocs - frees), cachefree, (uintmax_t)allocs, z->uz_count); if (db_pager_quit) return; } } #endif Index: stable/10/sys/vm/uma_int.h =================================================================== --- stable/10/sys/vm/uma_int.h (revision 287944) +++ stable/10/sys/vm/uma_int.h (revision 287945) @@ -1,431 +1,432 @@ /*- * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ /* * This file includes definitions, structures, prototypes, and inlines that * should not be used outside of the actual implementation of UMA. */ /* * Here's a quick description of the relationship between the objects: * * Kegs contain lists of slabs which are stored in either the full bin, empty * bin, or partially allocated bin, to reduce fragmentation. They also contain * the user supplied value for size, which is adjusted for alignment purposes * and rsize is the result of that. The Keg also stores information for * managing a hash of page addresses that maps pages to uma_slab_t structures * for pages that don't have embedded uma_slab_t's. * * The uma_slab_t may be embedded in a UMA_SLAB_SIZE chunk of memory or it may * be allocated off the page from a special slab zone. The free list within a * slab is managed with a bitmask. For item sizes that would yield more than * 10% memory waste we potentially allocate a separate uma_slab_t if this will * improve the number of items per slab that will fit. * * The only really gross cases, with regards to memory waste, are for those * items that are just over half the page size. You can get nearly 50% waste, * so you fall back to the memory footprint of the power of two allocator. I * have looked at memory allocation sizes on many of the machines available to * me, and there does not seem to be an abundance of allocations at this range * so at this time it may not make sense to optimize for it. This can, of * course, be solved with dynamic slab sizes. * * Kegs may serve multiple Zones but by far most of the time they only serve * one. When a Zone is created, a Keg is allocated and setup for it. While * the backing Keg stores slabs, the Zone caches Buckets of items allocated * from the slabs. Each Zone is equipped with an init/fini and ctor/dtor * pair, as well as with its own set of small per-CPU caches, layered above * the Zone's general Bucket cache. * * The PCPU caches are protected by critical sections, and may be accessed * safely only from their associated CPU, while the Zones backed by the same * Keg all share a common Keg lock (to coalesce contention on the backing * slabs). The backing Keg typically only serves one Zone but in the case of * multiple Zones, one of the Zones is considered the Master Zone and all * Zone-related stats from the Keg are done in the Master Zone. For an * example of a Multi-Zone setup, refer to the Mbuf allocation code. */ /* * This is the representation for normal (Non OFFPAGE slab) * * i == item * s == slab pointer * * <---------------- Page (UMA_SLAB_SIZE) ------------------> * ___________________________________________________________ * | _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ___________ | * ||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i| |slab header|| * ||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_| |___________|| * |___________________________________________________________| * * * This is an OFFPAGE slab. These can be larger than UMA_SLAB_SIZE. * * ___________________________________________________________ * | _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ | * ||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i| | * ||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_| | * |___________________________________________________________| * ___________ ^ * |slab header| | * |___________|---* * */ #ifndef VM_UMA_INT_H #define VM_UMA_INT_H #define UMA_SLAB_SIZE PAGE_SIZE /* How big are our slabs? */ #define UMA_SLAB_MASK (PAGE_SIZE - 1) /* Mask to get back to the page */ #define UMA_SLAB_SHIFT PAGE_SHIFT /* Number of bits PAGE_MASK */ #define UMA_BOOT_PAGES 64 /* Pages allocated for startup */ /* Max waste percentage before going to off page slab management */ #define UMA_MAX_WASTE 10 /* * I doubt there will be many cases where this is exceeded. This is the initial * size of the hash table for uma_slabs that are managed off page. This hash * does expand by powers of two. Currently it doesn't get smaller. */ #define UMA_HASH_SIZE_INIT 32 /* * I should investigate other hashing algorithms. This should yield a low * number of collisions if the pages are relatively contiguous. */ #define UMA_HASH(h, s) ((((uintptr_t)s) >> UMA_SLAB_SHIFT) & (h)->uh_hashmask) #define UMA_HASH_INSERT(h, s, mem) \ SLIST_INSERT_HEAD(&(h)->uh_slab_hash[UMA_HASH((h), \ (mem))], (s), us_hlink) #define UMA_HASH_REMOVE(h, s, mem) \ SLIST_REMOVE(&(h)->uh_slab_hash[UMA_HASH((h), \ (mem))], (s), uma_slab, us_hlink) /* Hash table for freed address -> slab translation */ SLIST_HEAD(slabhead, uma_slab); struct uma_hash { struct slabhead *uh_slab_hash; /* Hash table for slabs */ int uh_hashsize; /* Current size of the hash table */ int uh_hashmask; /* Mask used during hashing */ }; /* * align field or structure to cache line */ #if defined(__amd64__) #define UMA_ALIGN __aligned(CACHE_LINE_SIZE) #else #define UMA_ALIGN #endif /* * Structures for per cpu queues. */ struct uma_bucket { LIST_ENTRY(uma_bucket) ub_link; /* Link into the zone */ int16_t ub_cnt; /* Count of free items. */ int16_t ub_entries; /* Max items. */ void *ub_bucket[]; /* actual allocation storage */ }; typedef struct uma_bucket * uma_bucket_t; struct uma_cache { uma_bucket_t uc_freebucket; /* Bucket we're freeing to */ uma_bucket_t uc_allocbucket; /* Bucket to allocate from */ uint64_t uc_allocs; /* Count of allocations */ uint64_t uc_frees; /* Count of frees */ } UMA_ALIGN; typedef struct uma_cache * uma_cache_t; /* * Keg management structure * * TODO: Optimize for cache line size * */ struct uma_keg { struct mtx_padalign uk_lock; /* Lock for the keg */ struct uma_hash uk_hash; LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */ LIST_HEAD(,uma_slab) uk_part_slab; /* partially allocated slabs */ LIST_HEAD(,uma_slab) uk_free_slab; /* empty slab list */ LIST_HEAD(,uma_slab) uk_full_slab; /* full slabs */ uint32_t uk_align; /* Alignment mask */ uint32_t uk_pages; /* Total page count */ uint32_t uk_free; /* Count of items free in slabs */ uint32_t uk_reserve; /* Number of reserved items. */ uint32_t uk_size; /* Requested size of each item */ uint32_t uk_rsize; /* Real size of each item */ uint32_t uk_maxpages; /* Maximum number of pages to alloc */ uma_init uk_init; /* Keg's init routine */ uma_fini uk_fini; /* Keg's fini routine */ uma_alloc uk_allocf; /* Allocation function */ uma_free uk_freef; /* Free routine */ u_long uk_offset; /* Next free offset from base KVA */ vm_offset_t uk_kva; /* Zone base KVA */ uma_zone_t uk_slabzone; /* Slab zone backing us, if OFFPAGE */ uint16_t uk_slabsize; /* Slab size for this keg */ uint16_t uk_pgoff; /* Offset to uma_slab struct */ uint16_t uk_ppera; /* pages per allocation from backend */ uint16_t uk_ipers; /* Items per slab */ uint32_t uk_flags; /* Internal flags */ /* Least used fields go to the last cache line. */ const char *uk_name; /* Name of creating zone. */ LIST_ENTRY(uma_keg) uk_link; /* List of all kegs */ }; typedef struct uma_keg * uma_keg_t; /* * Free bits per-slab. */ #define SLAB_SETSIZE (PAGE_SIZE / UMA_SMALLEST_UNIT) BITSET_DEFINE(slabbits, SLAB_SETSIZE); /* * The slab structure manages a single contiguous allocation from backing * store and subdivides it into individually allocatable items. */ struct uma_slab { uma_keg_t us_keg; /* Keg we live in */ union { LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */ unsigned long _us_size; /* Size of allocation */ } us_type; SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */ uint8_t *us_data; /* First item */ struct slabbits us_free; /* Free bitmask. */ #ifdef INVARIANTS struct slabbits us_debugfree; /* Debug bitmask. */ #endif uint16_t us_freecount; /* How many are free? */ uint8_t us_flags; /* Page flags see uma.h */ uint8_t us_pad; /* Pad to 32bits, unused. */ }; #define us_link us_type._us_link #define us_size us_type._us_size /* * The slab structure for UMA_ZONE_REFCNT zones for whose items we * maintain reference counters in the slab for. */ struct uma_slab_refcnt { struct uma_slab us_head; /* slab header data */ uint32_t us_refcnt[0]; /* Actually larger. */ }; typedef struct uma_slab * uma_slab_t; typedef struct uma_slab_refcnt * uma_slabrefcnt_t; typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int); struct uma_klink { LIST_ENTRY(uma_klink) kl_link; uma_keg_t kl_keg; }; typedef struct uma_klink *uma_klink_t; /* * Zone management structure * * TODO: Optimize for cache line size * */ struct uma_zone { struct mtx_padalign uz_lock; /* Lock for the zone */ struct mtx_padalign *uz_lockptr; const char *uz_name; /* Text name of the zone */ LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */ LIST_HEAD(,uma_bucket) uz_buckets; /* full buckets */ LIST_HEAD(,uma_klink) uz_kegs; /* List of kegs. */ struct uma_klink uz_klink; /* klink for first keg. */ uma_slaballoc uz_slab; /* Allocate a slab from the backend. */ uma_ctor uz_ctor; /* Constructor for each allocation */ uma_dtor uz_dtor; /* Destructor */ uma_init uz_init; /* Initializer for each item */ uma_fini uz_fini; /* Finalizer for each item. */ uma_import uz_import; /* Import new memory to cache. */ uma_release uz_release; /* Release memory from cache. */ void *uz_arg; /* Import/release argument. */ uint32_t uz_flags; /* Flags inherited from kegs */ uint32_t uz_size; /* Size inherited from kegs */ volatile u_long uz_allocs UMA_ALIGN; /* Total number of allocations */ volatile u_long uz_fails; /* Total number of alloc failures */ volatile u_long uz_frees; /* Total number of frees */ uint64_t uz_sleeps; /* Total number of alloc sleeps */ uint16_t uz_count; /* Amount of items in full bucket */ uint16_t uz_count_min; /* Minimal amount of items there */ /* The next three fields are used to print a rate-limited warnings. */ const char *uz_warning; /* Warning to print on failure */ struct timeval uz_ratecheck; /* Warnings rate-limiting */ /* * This HAS to be the last item because we adjust the zone size * based on NCPU and then allocate the space for the zones. */ struct uma_cache uz_cpu[1]; /* Per cpu caches */ }; /* * These flags must not overlap with the UMA_ZONE flags specified in uma.h. */ #define UMA_ZFLAG_MULTI 0x04000000 /* Multiple kegs in the zone. */ #define UMA_ZFLAG_DRAINING 0x08000000 /* Running zone_drain. */ #define UMA_ZFLAG_BUCKET 0x10000000 /* Bucket zone. */ #define UMA_ZFLAG_INTERNAL 0x20000000 /* No offpage no PCPU. */ #define UMA_ZFLAG_FULL 0x40000000 /* Reached uz_maxpages */ #define UMA_ZFLAG_CACHEONLY 0x80000000 /* Don't ask VM for buckets. */ #define UMA_ZFLAG_INHERIT \ (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY | UMA_ZFLAG_BUCKET) static inline uma_keg_t zone_first_keg(uma_zone_t zone) { uma_klink_t klink; klink = LIST_FIRST(&zone->uz_kegs); return (klink != NULL) ? klink->kl_keg : NULL; } #undef UMA_ALIGN #ifdef _KERNEL /* Internal prototypes */ static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data); -void *uma_large_malloc(int size, int wait); +void *uma_large_malloc(vm_size_t size, int wait); void uma_large_free(uma_slab_t slab); /* Lock Macros */ #define KEG_LOCK_INIT(k, lc) \ do { \ if ((lc)) \ mtx_init(&(k)->uk_lock, (k)->uk_name, \ (k)->uk_name, MTX_DEF | MTX_DUPOK); \ else \ mtx_init(&(k)->uk_lock, (k)->uk_name, \ "UMA zone", MTX_DEF | MTX_DUPOK); \ } while (0) #define KEG_LOCK_FINI(k) mtx_destroy(&(k)->uk_lock) #define KEG_LOCK(k) mtx_lock(&(k)->uk_lock) #define KEG_UNLOCK(k) mtx_unlock(&(k)->uk_lock) #define ZONE_LOCK_INIT(z, lc) \ do { \ if ((lc)) \ mtx_init(&(z)->uz_lock, (z)->uz_name, \ (z)->uz_name, MTX_DEF | MTX_DUPOK); \ else \ mtx_init(&(z)->uz_lock, (z)->uz_name, \ "UMA zone", MTX_DEF | MTX_DUPOK); \ } while (0) #define ZONE_LOCK(z) mtx_lock((z)->uz_lockptr) #define ZONE_TRYLOCK(z) mtx_trylock((z)->uz_lockptr) #define ZONE_UNLOCK(z) mtx_unlock((z)->uz_lockptr) #define ZONE_LOCK_FINI(z) mtx_destroy(&(z)->uz_lock) /* * Find a slab within a hash table. This is used for OFFPAGE zones to lookup * the slab structure. * * Arguments: * hash The hash table to search. * data The base page of the item. * * Returns: * A pointer to a slab if successful, else NULL. */ static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data) { uma_slab_t slab; int hval; hval = UMA_HASH(hash, data); SLIST_FOREACH(slab, &hash->uh_slab_hash[hval], us_hlink) { if ((uint8_t *)slab->us_data == data) return (slab); } return (NULL); } static __inline uma_slab_t vtoslab(vm_offset_t va) { vm_page_t p; p = PHYS_TO_VM_PAGE(pmap_kextract(va)); return ((uma_slab_t)p->plinks.s.pv); } static __inline void vsetslab(vm_offset_t va, uma_slab_t slab) { vm_page_t p; p = PHYS_TO_VM_PAGE(pmap_kextract(va)); p->plinks.s.pv = slab; } /* * The following two functions may be defined by architecture specific code * if they can provide more effecient allocation functions. This is useful * for using direct mapped addresses. */ -void *uma_small_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait); -void uma_small_free(void *mem, int size, uint8_t flags); +void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, + int wait); +void uma_small_free(void *mem, vm_size_t size, uint8_t flags); #endif /* _KERNEL */ #endif /* VM_UMA_INT_H */ Index: stable/10 =================================================================== --- stable/10 (revision 287944) +++ stable/10 (revision 287945) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r280957