Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 177920) +++ head/sys/i386/i386/pmap.c (revision 177921) @@ -1,4692 +1,4694 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2008 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_smp.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef XBOX #include #endif #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if !defined(DIAGNOSTIC) #define PMAP_INLINE __gnu89_inline #else #define PMAP_INLINE #endif #define PV_STATS #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt; vm_offset_t kernel_vm_end; extern u_int32_t KERNend; #ifdef PAE pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pg_ps_enabled; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RD, &pg_ps_enabled, 0, "Are large page mappings enabled?"); /* * Data for the pv entry allocation mechanism */ static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ struct sysmaps { struct mtx lock; pt_entry_t *CMAP1; pt_entry_t *CMAP2; caddr_t CADDR1; caddr_t CADDR2; }; static struct sysmaps sysmaps_pcpu[MAXCPU]; pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP3; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR3; struct msgbuf *msgbufp = 0; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = 0, *PMAP2; static pt_entry_t *PADDR1 = 0, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, vm_page_t *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, vm_page_t *free); static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, vm_page_t *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags); static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); #ifdef PAE static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); #endif CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * If you get an error here, then you set KVA_PAGES wrong! See the * description of KVA_PAGES in sys/i386/include/pmap.h. It must be * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. */ CTASSERT(KERNBASE % (1 << 24) == 0); /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) newaddr = (addr + PDRMASK) & ~PDRMASK; #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct sysmaps *sysmaps; int i; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #ifdef PAE kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif kernel_pmap->pm_root = NULL; kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); LIST_INIT(&allpmaps); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the idle process page zeroing. */ for (i = 0; i < MAXCPU; i++) { sysmaps = &sysmaps_pcpu[i]; mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) } SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1) *CMAP3 = 0; /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1); mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; *CMAP1 = 0; /* * Leave in place an identity mapping (virt == phys) for the low 1 MB * physical memory region that is used by the ACPI wakeup code. This * mapping must not have PG_G set. */ #ifdef XBOX /* FIXME: This is gross, but needed for the XBOX. Since we are in such * an early stadium, we cannot yet neatly map video memory ... :-( * Better fixes are very welcome! */ if (!arch_i386_is_xbox) #endif for (i = 1; i < NKPT; i++) PTD[i] = 0; /* Initialize the PAT MSR if present. */ pmap_init_pat(); /* Turn on PG_G on kernel page(s) */ pmap_set_pg(); } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { uint64_t pat_msr; /* Bail if this CPU doesn't implement PAT. */ if (!(cpu_feature & CPUID_PAT)) return; #ifdef PAT_WORKS /* * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. * Program 4 and 5 as WP and WC. * Leave 6 and 7 as UC and UC-. */ pat_msr = rdmsr(MSR_PAT); pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | PAT_VALUE(5, PAT_WRITE_COMBINING); #else /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. Thus, just replace PAT Index 2 with WC instead * of UC-. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ pat_msr = rdmsr(MSR_PAT); pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); #endif wrmsr(MSR_PAT, pat_msr); } /* * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. */ void pmap_set_pg(void) { pd_entry_t pdir; pt_entry_t *pte; vm_offset_t va, endva; int i; if (pgeflag == 0) return; i = KERNLOAD/NBPDR; endva = KERNBASE + KERNend; if (pseflag) { va = KERNBASE + KERNLOAD; while (va < endva) { pdir = kernel_pmap->pm_pdir[KPTDI+i]; pdir |= pgeflag; kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir; invltlb(); /* Play it safe, invltlb() every time */ i++; va += NBPDR; } } else { va = (vm_offset_t)btext; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; invltlb(); /* Play it safe, invltlb() every time */ va += PAGE_SIZE; } } } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); } #ifdef PAE static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt"); static void * pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { - *flags = UMA_SLAB_PRIV; + + /* Inform UMA that this allocator uses kernel_map/object. */ + *flags = UMA_SLAB_KERNEL; return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL, 1, 0)); } #endif /* * ABuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) return (va); /* Out of memory */ pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(PTD[i + KPTDI] & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = PTD[i + KPTDI] & PG_FRAME; } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); /* * Calculate the size of the pv head table for superpages. */ for (i = 0; phys_avail[i + 1]; i += 2); pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR; /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_alloc(kernel_map, s); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #ifdef PAE pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif } SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2/4MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2/4MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2/4MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2/4MB page promotions"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int pmap_cache_bits(int mode, boolean_t is_pde) { int pat_flag, pat_index, cache_bits; /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* If we don't support PAT, map extended modes to older ones. */ if (!(cpu_feature & CPUID_PAT)) { switch (mode) { case PAT_UNCACHEABLE: case PAT_WRITE_THROUGH: case PAT_WRITE_BACK: break; case PAT_UNCACHED: case PAT_WRITE_COMBINING: case PAT_WRITE_PROTECTED: mode = PAT_UNCACHEABLE; break; } } /* Map the caching mode to a PAT index. */ switch (mode) { #ifdef PAT_WORKS case PAT_UNCACHEABLE: pat_index = 3; break; case PAT_WRITE_THROUGH: pat_index = 1; break; case PAT_WRITE_BACK: pat_index = 0; break; case PAT_UNCACHED: pat_index = 2; break; case PAT_WRITE_COMBINING: pat_index = 5; break; case PAT_WRITE_PROTECTED: pat_index = 4; break; #else case PAT_UNCACHED: case PAT_UNCACHEABLE: case PAT_WRITE_PROTECTED: pat_index = 3; break; case PAT_WRITE_THROUGH: pat_index = 1; break; case PAT_WRITE_BACK: pat_index = 0; break; case PAT_WRITE_COMBINING: pat_index = 2; break; #endif default: panic("Unknown caching mode %d\n", mode); } /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_index & 0x4) cache_bits |= pat_flag; if (pat_index & 0x2) cache_bits |= PG_NC_PCD; if (pat_index & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { u_int cpumask; u_int other_cpus; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invlpg(va); smp_invlpg(va); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invlpg(va); if (pmap->pm_active & other_cpus) smp_masked_invlpg(pmap->pm_active & other_cpus, va); } sched_unpin(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { u_int cpumask; u_int other_cpus; vm_offset_t addr; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); smp_invlpg_range(sva, eva); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap->pm_active & other_cpus) smp_masked_invlpg_range(pmap->pm_active & other_cpus, sva, eva); } sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { u_int cpumask; u_int other_cpus; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invltlb(); smp_invltlb(); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invltlb(); if (pmap->pm_active & other_cpus) smp_masked_invltlb(pmap->pm_active & other_cpus); } sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || pmap->pm_active) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } #endif /* !SMP */ /* * Are we current address space or kernel? N.B. We return FALSE when * a pmap's page table is in use because a kernel thread is borrowing * it. The borrowed page table can change spontaneously, making any * dependence on its continued use subject to a race condition. */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, vm_page_queue_mtx * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) { rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); PMAP_UNLOCK(pmap); return rtval; } pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); pmap_pte_release(pte); } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte; vm_page_t m; m = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { sched_pin(); pte = *pmap_pte_quick(pmap, va); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } sched_unpin(); } } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } PMAP_INLINE void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, *pte; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { oldpte |= *pte; pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag | PG_RW | PG_V); pte++; ma++; } if ((oldpte & PG_V) != 0) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(vm_page_t free) { vm_page_t m; while (free != NULL) { m = free; free = m->right; /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; m->right = *free; *free = m; } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { vm_page_t root; PMAP_LOCK_ASSERT(pmap, MA_OWNED); root = pmap->pm_root; if (root == NULL) { mpte->left = NULL; mpte->right = NULL; } else { root = vm_page_splay(mpte->pindex, root); if (mpte->pindex < root->pindex) { mpte->left = root->left; mpte->right = root; root->left = NULL; } else if (mpte->pindex == root->pindex) panic("pmap_insert_pt_page: pindex already inserted"); else { mpte->right = root->right; mpte->left = root; root->right = NULL; } } pmap->pm_root = mpte; } /* * Looks for a page table page mapping the specified virtual address in the * specified pmap's collection of idle page table pages. Returns NULL if there * is no page table page corresponding to the specified virtual address. */ static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) { vm_page_t mpte; vm_pindex_t pindex = va >> PDRSHIFT; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) { mpte = vm_page_splay(pindex, mpte); if ((pmap->pm_root = mpte)->pindex != pindex) mpte = NULL; } return (mpte); } /* * Removes the specified page table page from the specified pmap's collection * of idle page table pages. The specified page table page must be a member of * the pmap's collection. */ static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) { vm_page_t root; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (mpte != pmap->pm_root) vm_page_splay(mpte->pindex, pmap->pm_root); if (mpte->left == NULL) root = mpte->right; else { root = vm_page_splay(mpte->pindex, mpte->left); root->right = mpte->right; } pmap->pm_root = root; } /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static __inline int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) { --m->wire_count; if (m->wire_count == 0) return _pmap_unwire_pte_hold(pmap, m, free); else return 0; } static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&cnt.v_wire_count, 1); /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); return 1; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free) { pd_entry_t ptepde; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return 0; ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return pmap_unwire_pte_hold(pmap, mpte, free); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #ifdef PAE pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_root = NULL; pmap->pm_active = 0; PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; vm_paddr_t pa; static int color; int i; PMAP_LOCK_INIT(pmap); /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, NBPTD); if (pmap->pm_pdir == NULL) { PMAP_LOCK_DESTROY(pmap); return (0); } #ifdef PAE pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root = NULL; } KASSERT(pmap->pm_root == NULL, ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, color++, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; else { ptdpg[i++] = m; } } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) { if ((ptdpg[i]->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE); } mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* Wire in kernel global address entries. */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #ifdef PAE pmap->pm_pdpt[i] = pa | PG_V; #endif } pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags) { vm_paddr_t ptepa; vm_page_t m; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (flags & M_WAITOK) { PMAP_UNLOCK(pmap); vm_page_unlock_queues(); VM_WAIT; vm_page_lock_queues(); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) { unsigned ptepindex; pd_entry_t ptepa; vm_page_t m; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & M_WAITOK)) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ #ifdef SMP /* * Deal with a SMP shootdown of other users of the pmap that we are * trying to dispose of. This can be a bit hairy. */ static u_int *lazymask; static u_int lazyptd; static volatile u_int lazywait; void pmap_lazyfix_action(void); void pmap_lazyfix_action(void) { u_int mymask = PCPU_GET(cpumask); #ifdef COUNT_IPIS (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++; #endif if (rcr3() == lazyptd) load_cr3(PCPU_GET(curpcb)->pcb_cr3); atomic_clear_int(lazymask, mymask); atomic_store_rel_int(&lazywait, 1); } static void pmap_lazyfix_self(u_int mymask) { if (rcr3() == lazyptd) load_cr3(PCPU_GET(curpcb)->pcb_cr3); atomic_clear_int(lazymask, mymask); } static void pmap_lazyfix(pmap_t pmap) { u_int mymask; u_int mask; u_int spins; while ((mask = pmap->pm_active) != 0) { spins = 50000000; mask = mask & -mask; /* Find least significant set bit */ mtx_lock_spin(&smp_ipi_mtx); #ifdef PAE lazyptd = vtophys(pmap->pm_pdpt); #else lazyptd = vtophys(pmap->pm_pdir); #endif mymask = PCPU_GET(cpumask); if (mask == mymask) { lazymask = &pmap->pm_active; pmap_lazyfix_self(mymask); } else { atomic_store_rel_int((u_int *)&lazymask, (u_int)&pmap->pm_active); atomic_store_rel_int(&lazywait, 0); ipi_selected(mask, IPI_LAZYPMAP); while (lazywait == 0) { ia32_pause(); if (--spins == 0) break; } } mtx_unlock_spin(&smp_ipi_mtx); if (spins == 0) printf("pmap_lazyfix: spun for 50000000\n"); } } #else /* SMP */ /* * Cleaning up on uniprocessor is easy. For various reasons, we're * unlikely to have to even execute this code, including the fact * that the cleanup is deferred until the parent does a wait(2), which * means that another userland process has run. */ static void pmap_lazyfix(pmap_t pmap) { u_int cr3; cr3 = vtophys(pmap->pm_pdir); if (cr3 == rcr3()) { load_cr3(PCPU_GET(curpcb)->pcb_cr3); pmap->pm_active &= ~(PCPU_GET(cpumask)); } } #endif /* SMP */ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(pmap->pm_root == NULL, ("pmap_release: pmap has reserved page table page(s)")); pmap_lazyfix(pmap); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); for (i = 0; i < NPGPTD; i++) ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & PG_FRAME); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = ptdpg[i]; #ifdef PAE KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free_zero(m); } PMAP_LOCK_DESTROY(pmap); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct pmap *pmap; vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; pt_entry_t *pde; mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } addr = roundup2(addr, PAGE_SIZE * NPTEPG); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(PTD, kernel_vm_end) = newpdir; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, kernel_vm_end); pde_store(pde, newpdir); } mtx_unlock_spin(&allpmaps_lock); kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static uint32_t pc_freemask[11] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); static int pmap_collect_inactive, pmap_collect_active; SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, "Current number times pmap_collect called on inactive queue"); SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, "Current number times pmap_collect called on active queue"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. This is normally called to * unmap inactive pages, and if necessary, active pages. */ static void pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) { struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t next_pv, pv; vm_offset_t va; vm_page_t m, free; sched_pin(); TAILQ_FOREACH(m, &vpq->pl, pageq) { if (m->hold_count || m->busy) continue; TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, va); KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, va); tpte = pte_load_clear(pte); KASSERT((tpte & PG_W) == 0, ("pmap_collect: wired pte %#jx", (uintmax_t)tpte)); if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); free = NULL; pmap_unuse_pt(pmap, va, &free); pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); } free_pv_entry(pmap, pv); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } } sched_unpin(); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { vm_page_t m; struct pv_chunk *pc; int idx, field, bit; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; /* move to head of list */ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) return; PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, 0); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, int try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; static vm_pindex_t colour; struct vpgqueues *pq; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); pq = NULL; retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the page * queues lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, colour, (pq == &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } /* * Reclaim pv entries: At first, destroy mappings to * inactive pages. After that, if a pv chunk entry * is still needed, destroy mappings to active pages. */ if (pq == NULL) { PV_STAT(pmap_collect_inactive++); pq = &vm_page_queues[PQ_INACTIVE]; } else if (pq == &vm_page_queues[PQ_INACTIVE]) { PV_STAT(pmap_collect_active++); pq = &vm_page_queues[PQ_ACTIVE]; } else panic("get_pv_entry: increase vm.pmap.shpgperproc"); pmap_collect(pmap, pq); goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); colour++; pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; mtx_assert(&vm_page_queue_mtx, MA_OWNED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list)) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); return (TRUE); } else return (FALSE); } /* * Tries to demote a 2- or 4MB page mapping. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pmap_t allpmaps_entry; pt_entry_t *firstpte, newpte, *pte; vm_paddr_t mptepa; vm_page_t free, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_lookup_pt_page(pmap, va); if (mpte != NULL) pmap_remove_pt_page(pmap, mpte); else { KASSERT((*pde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); free = NULL; pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free); pmap_invalidate_page(pmap, trunc_4mpage(va)); pmap_free_zero_pages(free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } mptepa = VM_PAGE_TO_PHYS(mpte); /* * Temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } oldpde = *pde; newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & (PG_A | PG_V)) == (PG_A | PG_V), ("pmap_demote_pde: oldpde is missing PG_A and/or PG_V")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); KASSERT((oldpde & PG_PS) != 0, ("pmap_demote_pde: oldpde is missing PG_PS")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the mapping has changed attributes, update the page table * entries. */ KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (pmap == kernel_pmap) { /* * A harmless race exists between this loop and the bcopy() * in pmap_pinit() that initializes the kernel segment of * the new page table. Specifically, that bcopy() may copy * the new PDE from the PTD, which is first in allpmaps, to * the new page table before this loop updates that new * page table. */ mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(allpmaps_entry, &allpmaps, pm_list) { pde = pmap_pde(allpmaps_entry, va); KASSERT(*pde == newpde || (*pde & PG_PTE_PROMOTE) == (oldpde & PG_PTE_PROMOTE), ("pmap_demote_pde: pde was %#jx, expected %#jx", (uintmax_t)*pde, (uintmax_t)oldpde)); pde_store(pde, newpde); } mtx_unlock_spin(&allpmaps_lock); } else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, vm_page_t *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpde & PG_G) pmap_invalidate_page(kernel_pmap, sva); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_flag_set(m, PG_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); } } if (pmap == kernel_pmap) { if (!pmap_demote_pde(pmap, pdq, sva)) panic("pmap_remove_pde: failed demotion"); } else { mpte = pmap_lookup_pt_page(pmap, sva); if (mpte != NULL) { pmap_remove_pt_page(pmap, mpte); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); atomic_subtract_int(&cnt.v_wire_count, 1); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free) { pt_entry_t oldpte; vm_page_t m; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free) { pt_entry_t *pte; mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; vm_page_t free = NULL; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; vm_page_lock_queues(); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, &free)) break; } } out: sched_unpin(); if (anyvalid) pmap_invalidate_all(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); pmap_free_zero_pages(free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; vm_page_t free; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_remove_all: page %p is fictitious", m)); mtx_assert(&vm_page_queue_mtx, MA_OWNED); sched_pin(); pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); free = NULL; pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); sched_unpin(); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if (oldpde & PG_MANAGED) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_FRAME); va < eva; va += PAGE_SIZE, m++) { /* * In contrast to the analogous operation on a 4KB page * mapping, the mapping's PG_A flag is not cleared and * the page's PG_REFERENCED flag is not set. The * reason is that pmap_demote_pde() expects that a 2/4MB * page mapping with a stored page table page has PG_A * set. */ if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); } } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (newpde != oldpde) { if (!pde_cmpset(pde, oldpde, newpde)) goto retry; if (oldpde & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; int anychanged; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #ifdef PAE if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif anychanged = 0; vm_page_lock_queues(); sched_pin(); PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; unsigned pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = 1; continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if (pbits & PG_MANAGED) { m = NULL; if (pbits & PG_A) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } } if ((prot & VM_PROT_WRITE) == 0) pbits &= ~(PG_RW | PG_M); #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; #endif if (pbits != obits) { #ifdef PAE if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = 1; } } } sched_unpin(); if (anychanged) pmap_invalidate_all(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page to a single 2- or 4MB page mapping. For * promotion to occur, two conditions must be met: (1) the 4KB page mappings * must map aligned, contiguous physical memory and (2) the 4KB page mappings * must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pmap_t allpmaps_entry; pt_entry_t *firstpte, oldpte, *pte; vm_offset_t oldpteva; vm_paddr_t pa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); firstpte = vtopte(trunc_4mpage(va)); KASSERT((*firstpte & PG_V) != 0, ("pmap_promote_pde: firstpte is missing PG_V")); if ((*firstpte & PG_A) == 0) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa = *firstpte & PG_PS_FRAME; newpde = *firstpte; if ((newpde & (PG_M | PG_RW)) == PG_RW) newpde &= ~PG_RW; /* * Check all the ptes before promotion */ for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { retry: oldpte = *pte; if ((oldpte & PG_FRAME) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto retry; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa += PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); pmap_insert_pt_page(pmap, mpte); /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (pmap == kernel_pmap) { mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(allpmaps_entry, &allpmaps, pm_list) { pde = pmap_pde(allpmaps_entry, va); pde_store(pde, PG_PS | newpde); } mtx_unlock_spin(&allpmaps_lock); } else pde_store(pde, PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t opa; pt_entry_t origpte, newpte; vm_page_t mpte, om; boolean_t invlva; va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va)); mpte = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); sched_pin(); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, M_WAITOK); } pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) panic("pmap_enter: attempted pmap_enter on 4MB page"); pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pmap_remove_entry(pmap, om, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); pmap_insert_entry(pmap, va, m); pa |= PG_MANAGED; } /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; vm_page_flag_set(m, PG_WRITEABLE); } #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; #endif if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { newpte |= PG_A; if ((access & VM_PROT_WRITE) != 0) newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_flag_set(om, PG_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m)) invlva = TRUE; #ifdef PAE if ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0) invlva = TRUE; #endif } if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) invlva = TRUE; } if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); } /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); sched_unpin(); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t *pde, newpde; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (*pde != 0) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } newpde = VM_PAGE_TO_PHYS(m) | PG_PS | PG_V; if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { newpde |= PG_MANAGED; /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, m)) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } } #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; /* * Increment counters. */ pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); psize = atop(end - start); mpte = NULL; m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 && pmap_enter_pde(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { PMAP_LOCK(pmap); (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; vm_page_t free; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { unsigned ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, M_NOWAIT); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { free = NULL; if (pmap_unwire_pte_hold(pmap, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; #endif /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return mpte; } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { vm_page_t p; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); if (pseflag && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { int i; vm_page_t m[1]; unsigned int ptepindex; int npdes; pd_entry_t ptepa; PMAP_LOCK(pmap); if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) goto out; PMAP_UNLOCK(pmap); retry: p = vm_page_lookup(object, pindex); if (p != NULL) { if (vm_page_sleep_if_busy(p, FALSE, "init4p")) goto retry; } else { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); return; } p = vm_page_lookup(object, pindex); vm_page_lock_queues(); vm_page_wakeup(p); vm_page_unlock_queues(); } ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; p->valid = VM_PAGE_BITS_ALL; PMAP_LOCK(pmap); pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i = 0; i < npdes; i++) { pde_store(&pmap->pm_pdir[ptepindex], ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } pmap_invalidate_all(pmap); out: PMAP_UNLOCK(pmap); } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { pd_entry_t *pde; pt_entry_t *pte; boolean_t are_queues_locked; are_queues_locked = FALSE; retry: PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) { if (!wired != ((*pde & PG_W) == 0)) { if (!are_queues_locked) { are_queues_locked = TRUE; if (!mtx_trylock(&vm_page_queue_mtx)) { PMAP_UNLOCK(pmap); vm_page_lock_queues(); goto retry; } } if (!pmap_demote_pde(pmap, pde, va)) panic("pmap_change_wiring: demotion failed"); } else goto out; } pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); pmap_pte_release(pte); out: if (are_queues_locked) vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_page_t free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; vm_page_lock_queues(); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; unsigned ptepindex; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) pdnxt = end_addr; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME)))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, M_NOWAIT); if (dstmpte == NULL) break; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { free = NULL; if (pmap_unwire_pte_hold( dst_pmap, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(free); } } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } sched_unpin(); vm_page_unlock_queues(); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { #if defined(CPU_ENABLE_SSE) if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else #endif i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_zero_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(sysmaps->CADDR2); pagezero(sysmaps->CADDR2); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_zero_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(sysmaps->CADDR2); if (off == 0 && size == PAGE_SIZE) pagezero(sysmaps->CADDR2); else bzero((char *)sysmaps->CADDR2 + off, size); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { if (*CMAP3) panic("pmap_zero_page: CMAP3 busy"); sched_pin(); *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(CADDR3); pagezero(CADDR3); *CMAP3 = 0; sched_unpin(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP1) panic("pmap_copy_page: CMAP1 busy"); if (*sysmaps->CMAP2) panic("pmap_copy_page: CMAP2 busy"); sched_pin(); invlpg((u_int)sysmaps->CADDR1); invlpg((u_int)sysmaps->CADDR2); *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A; *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M; bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); *sysmaps->CMAP1 = 0; *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; if (m->flags & PG_FICTITIOUS) return FALSE; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (PV_PMAP(pv) == pmap) { return TRUE; } loops++; if (loops >= 16) break; } if (loops < 16) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { if (PV_PMAP(pv) == pmap) return (TRUE); loops++; if (loops >= 16) break; } } return (FALSE); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; int count; count = 0; if ((m->flags & PG_FICTITIOUS) != 0) return (count); mtx_assert(&vm_page_queue_mtx, MA_OWNED); sched_pin(); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct md_page *pvh; if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return (FALSE); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (TAILQ_EMPTY(&m->md.pv_list)) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); return (!TAILQ_EMPTY(&pvh->pv_list)); } else return (TRUE); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t free = NULL; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } vm_page_lock_queues(); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = (~(pc->pc_map[field])) & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { pte = vtopte(pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_flag_clear(mt, PG_WRITEABLE); } mpte = pmap_lookup_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap_remove_pt_page(pmap, mpte); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); atomic_subtract_int(&cnt.v_wire_count, 1); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, 0); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } } sched_unpin(); pmap_invalidate_all(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); pmap_free_zero_pages(free); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { if (m->flags & PG_FICTITIOUS) return (FALSE); if (pmap_is_modified_pvh(&m->md)) return (TRUE); return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 2mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; mtx_assert(&vm_page_queue_mtx, MA_OWNED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (*pde != 0 && (*pde & PG_PS) == 0) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0 || (m->flags & PG_WRITEABLE) == 0) return; sched_pin(); pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); sched_unpin(); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf, pvn; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va; int rtval = 0; if (m->flags & PG_FICTITIOUS) return (rtval); sched_pin(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_A) != 0) { if (pmap_demote_pde(pmap, pde, va)) { if ((oldpde & PG_W) == 0) { /* * Remove the mapping to a single page * so that a subsequent access may * repromote. Since the underlying * page table page is fully populated, * this removal never frees a page * table page. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_FRAME); pmap_remove_page(pmap, va, NULL); rtval++; if (rtval > 4) { PMAP_UNLOCK(pmap); return (rtval); } } } } PMAP_UNLOCK(pmap); } if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:" " found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; if (rtval > 4) pvn = NULL; } PMAP_UNLOCK(pmap); } while ((pv = pvn) != NULL && pv != pvf); } sched_unpin(); return (rtval); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte; vm_offset_t va; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) return; sched_pin(); pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde(pmap, pde, va)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_FRAME); pte = pmap_pte_quick(pmap, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ while (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) return; sched_pin(); pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_A) != 0) { if (pmap_demote_pde(pmap, pde, va)) { /* * Remove the mapping to a single page so * that a subsequent access may repromote. * Since the underlying page table page is * fully populated, this removal never frees * a page table page. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_FRAME); pmap_remove_page(pmap, va, NULL); } } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_A is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); } /* * Miscellaneous support routines follow */ /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { vm_offset_t va, tmpva, offset; offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); pa = pa & PG_FRAME; if (pa < KERNLOAD && pa + size <= KERNLOAD) va = KERNBASE + pa; else va = kmem_alloc_nofault(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0; ) { pmap_kenter_attr(tmpva, pa, mode); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, va, tmpva); pmap_invalidate_cache(); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { vm_offset_t base, offset, tmpva; if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) return; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) pmap_kremove(tmpva); pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } int pmap_change_attr(va, size, mode) vm_offset_t va; vm_size_t size; int mode; { vm_offset_t base, offset, tmpva; pt_entry_t *pte; u_int opte, npte; pd_entry_t *pde; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); /* Only supported on kernel virtual addresses. */ if (base <= VM_MAXUSER_ADDRESS) return (EINVAL); /* 4MB pages and pages that aren't mapped aren't supported. */ for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) return (EINVAL); if (*pde == 0) return (EINVAL); pte = vtopte(va); if (*pte == 0) return (EINVAL); } /* * Ok, all the pages exist and are 4k, so run through them updating * their cache mode. */ for (tmpva = base; size > 0; ) { pte = vtopte(tmpva); /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT); npte |= pmap_cache_bits(mode, 0); } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); tmpva += PAGE_SIZE; size -= PAGE_SIZE; } /* * Flush CPU caches to make sure any data isn't cached that shouldn't * be, etc. */ pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache(); return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pdep; pt_entry_t *ptep, pte; vm_paddr_t pa; vm_page_t m; int val = 0; PMAP_LOCK(pmap); pdep = pmap_pde(pmap, addr); if (*pdep != 0) { if (*pdep & PG_PS) { pte = *pdep; val = MINCORE_SUPER; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; } else { ptep = pmap_pte(pmap, addr); pte = *ptep; pmap_pte_release(ptep); pa = pte & PG_FRAME; } } else { pte = 0; pa = 0; } PMAP_UNLOCK(pmap); if (pte != 0) { val |= MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone else */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone else */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); #if defined(SMP) atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else oldpmap->pm_active &= ~1; pmap->pm_active |= 1; #endif #ifdef PAE cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + PDRMASK) & ~PDRMASK; return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return npte; } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa & PG_FRAME); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return npte; } #endif #if defined(DEBUG) static void pads(pmap_t pm); void pmap_pvdump(vm_offset_t pa); /* print address space of pmap*/ static void pads(pmap_t pm) { int i, j; vm_paddr_t va; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < NPDEPTD; i++) if (pm->pm_pdir[i]) for (j = 0; j < NPTEPG; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *ptep); }; } void pmap_pvdump(vm_paddr_t pa) { pv_entry_t pv; pmap_t pmap; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); pads(pmap); } printf(" "); } #endif Index: head/sys/kern/kern_mbuf.c =================================================================== --- head/sys/kern/kern_mbuf.c (revision 177920) +++ head/sys/kern/kern_mbuf.c (revision 177921) @@ -1,674 +1,675 @@ /*- * Copyright (c) 2004, 2005, * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA * Zones. * * Mbuf Clusters (2K, contiguous) are allocated from the Cluster * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the * administrator so desires. * * Mbufs are allocated from a UMA Master Zone called the Mbuf * Zone. * * Additionally, FreeBSD provides a Packet Zone, which it * configures as a Secondary Zone to the Mbuf Master Zone, * thus sharing backend Slab kegs with the Mbuf Master Zone. * * Thus common-case allocations and locking are simplified: * * m_clget() m_getcl() * | | * | .------------>[(Packet Cache)] m_get(), m_gethdr() * | | [ Packet ] | * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ] * | \________ | * [ Cluster Keg ] \ / * | [ Mbuf Keg ] * [ Cluster Slabs ] | * | [ Mbuf Slabs ] * \____________(VM)_________________/ * * * Whenever an object is allocated with uma_zalloc() out of * one of the Zones its _ctor_ function is executed. The same * for any deallocation through uma_zfree() the _dtor_ function * is executed. * * Caches are per-CPU and are filled from the Master Zone. * * Whenever an object is allocated from the underlying global * memory pool it gets pre-initialized with the _zinit_ functions. * When the Keg's are overfull objects get decomissioned with * _zfini_ functions and free'd back to the global memory pool. * */ int nmbclusters; /* limits number of mbuf clusters */ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ struct mbstat mbstat; static void tunable_mbinit(void *dummy) { /* This has to be done before VM init. */ nmbclusters = 1024 + maxusers * 64; nmbjumbop = nmbclusters / 2; nmbjumbo9 = nmbjumbop / 2; nmbjumbo16 = nmbjumbo9 / 2; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); } SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); /* XXX: These should be tuneables. Can't change UMA limits on the fly. */ static int sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) { int error, newnmbclusters; newnmbclusters = nmbclusters; error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); if (error == 0 && req->newptr) { if (newnmbclusters > nmbclusters) { nmbclusters = newnmbclusters; uma_zone_set_max(zone_clust, nmbclusters); EVENTHANDLER_INVOKE(nmbclusters_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW, &nmbclusters, 0, sysctl_nmbclusters, "IU", "Maximum number of mbuf clusters allowed"); static int sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbop; newnmbjumbop = nmbjumbop; error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); if (error == 0 && req->newptr) { if (newnmbjumbop> nmbjumbop) { nmbjumbop = newnmbjumbop; uma_zone_set_max(zone_jumbop, nmbjumbop); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbop, 0, sysctl_nmbjumbop, "IU", "Maximum number of mbuf page size jumbo clusters allowed"); static int sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo9; newnmbjumbo9 = nmbjumbo9; error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); if (error == 0 && req->newptr) { if (newnmbjumbo9> nmbjumbo9) { nmbjumbo9 = newnmbjumbo9; uma_zone_set_max(zone_jumbo9, nmbjumbo9); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", "Maximum number of mbuf 9k jumbo clusters allowed"); static int sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo16; newnmbjumbo16 = nmbjumbo16; error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); if (error == 0 && req->newptr) { if (newnmbjumbo16> nmbjumbo16) { nmbjumbo16 = newnmbjumbo16; uma_zone_set_max(zone_jumbo16, nmbjumbo16); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU", "Maximum number of mbuf 16k jumbo clusters allowed"); SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, "Mbuf general information and statistics"); /* * Zones from which we allocate. */ uma_zone_t zone_mbuf; uma_zone_t zone_clust; uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; uma_zone_t zone_ext_refcnt; /* * Local prototypes. */ static int mb_ctor_mbuf(void *, int, void *, int); static int mb_ctor_clust(void *, int, void *, int); static int mb_ctor_pack(void *, int, void *, int); static void mb_dtor_mbuf(void *, int, void *); static void mb_dtor_clust(void *, int, void *); static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(void *); static void mbuf_init(void *); static void *mbuf_jumbo_alloc(uma_zone_t, int, u_int8_t *, int); static void mbuf_jumbo_free(void *, int, u_int8_t); static MALLOC_DEFINE(M_JUMBOFRAME, "jumboframes", "mbuf jumbo frame buffers"); /* Ensure that MSIZE doesn't break dtom() - it must be a power of 2 */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); /* * Initialize FreeBSD Network buffer allocation. */ SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); static void mbuf_init(void *dummy) { /* * Configure UMA zones for Mbufs, Clusters, and Packets. */ zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif MSIZE - 1, UMA_ZONE_MAXBUCKET); zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, mb_ctor_clust, mb_dtor_clust, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); if (nmbclusters > 0) uma_zone_set_max(zone_clust, nmbclusters); zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); /* Make jumbo frame zone too. Page size, 9k and 16k. */ zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, mb_ctor_clust, mb_dtor_clust, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); if (nmbjumbop > 0) uma_zone_set_max(zone_jumbop, nmbjumbop); zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, mb_ctor_clust, mb_dtor_clust, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); if (nmbjumbo9 > 0) uma_zone_set_max(zone_jumbo9, nmbjumbo9); uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc); uma_zone_set_freef(zone_jumbo9, mbuf_jumbo_free); zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, mb_ctor_clust, mb_dtor_clust, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); if (nmbjumbo16 > 0) uma_zone_set_max(zone_jumbo16, nmbjumbo16); uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc); uma_zone_set_freef(zone_jumbo16, mbuf_jumbo_free); zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); /* uma_prealloc() goes here... */ /* * Hook event handler for low-memory situation, used to * drain protocols and push data back to the caches (UMA * later pushes it back to VM). */ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, EVENTHANDLER_PRI_FIRST); /* * [Re]set counters and local statistics knobs. * XXX Some of these should go and be replaced, but UMA stat * gathering needs to be revised. */ mbstat.m_mbufs = 0; mbstat.m_mclusts = 0; mbstat.m_drain = 0; mbstat.m_msize = MSIZE; mbstat.m_mclbytes = MCLBYTES; mbstat.m_minclsize = MINCLSIZE; mbstat.m_mlen = MLEN; mbstat.m_mhlen = MHLEN; mbstat.m_numtypes = MT_NTYPES; mbstat.m_mcfail = mbstat.m_mpfail = 0; mbstat.sf_iocnt = 0; mbstat.sf_allocwait = mbstat.sf_allocfail = 0; } /* * UMA backend page allocator for the jumbo frame zones. * * Allocates kernel virtual memory that is backed by contiguous physical * pages. */ static void * mbuf_jumbo_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { - *flags = UMA_SLAB_PRIV; + /* Inform UMA that this allocator uses kernel_map/object. */ + *flags = UMA_SLAB_KERNEL; return (contigmalloc(bytes, M_JUMBOFRAME, wait, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0)); } /* * UMA backend page deallocator for the jumbo frame zones. */ static void mbuf_jumbo_free(void *mem, int size, u_int8_t flags) { contigfree(mem, size, M_JUMBOFRAME); } /* * Constructor for Mbuf master zone. * * The 'arg' pointer points to a mb_args structure which * contains call-specific information required to support the * mbuf allocation API. See mbuf.h. */ static int mb_ctor_mbuf(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; #ifdef MAC int error; #endif int flags; short type; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif m = (struct mbuf *)mem; args = (struct mb_args *)arg; flags = args->flags; type = args->type; /* * The mbuf is initialized later. The caller has the * responsibility to set up any MAC labels too. */ if (type == MT_NOINIT) return (0); m->m_next = NULL; m->m_nextpkt = NULL; m->m_len = 0; m->m_flags = flags; m->m_type = type; if (flags & M_PKTHDR) { m->m_data = m->m_pktdat; m->m_pkthdr.rcvif = NULL; m->m_pkthdr.len = 0; m->m_pkthdr.header = NULL; m->m_pkthdr.csum_flags = 0; m->m_pkthdr.csum_data = 0; m->m_pkthdr.tso_segsz = 0; m->m_pkthdr.ether_vtag = 0; SLIST_INIT(&m->m_pkthdr.tags); #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); if (error) return (error); #endif } else m->m_data = m->m_dat; return (0); } /* * The Mbuf master zone destructor. */ static void mb_dtor_mbuf(void *mem, int size, void *arg) { struct mbuf *m; unsigned long flags; m = (struct mbuf *)mem; flags = (unsigned long)arg; if ((flags & MB_NOTAGS) == 0 && (m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); KASSERT((m->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__)); KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); #ifdef INVARIANTS trash_dtor(mem, size, arg); #endif } /* * The Mbuf Packet zone destructor. */ static void mb_dtor_pack(void *mem, int size, void *arg) { struct mbuf *m; m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); /* Make sure we've got a clean cluster back. */ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); KASSERT(*m->m_ext.ref_cnt == 1, ("%s: ref_cnt != 1", __func__)); #ifdef INVARIANTS trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); #endif /* * If there are processes blocked on zone_clust, waiting for pages * to be freed up, * cause them to be woken up by draining the * packet zone. We are exposed to a race here * (in the check for * the UMA_ZFLAG_FULL) where we might miss the flag set, but that * is deliberate. We don't want to acquire the zone lock for every * mbuf free. */ if (uma_zone_exhausted_nolock(zone_clust)) zone_drain(zone_pack); } /* * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. * * Here the 'arg' pointer points to the Mbuf which we * are configuring cluster storage for. If 'arg' is * empty we allocate just the cluster without setting * the mbuf to it. See mbuf.h. */ static int mb_ctor_clust(void *mem, int size, void *arg, int how) { struct mbuf *m; u_int *refcnt; int type; uma_zone_t zone; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif switch (size) { case MCLBYTES: type = EXT_CLUSTER; zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: type = EXT_JUMBOP; zone = zone_jumbop; break; #endif case MJUM9BYTES: type = EXT_JUMBO9; zone = zone_jumbo9; break; case MJUM16BYTES: type = EXT_JUMBO16; zone = zone_jumbo16; break; default: panic("unknown cluster size"); break; } m = (struct mbuf *)arg; refcnt = uma_find_refcnt(zone, mem); *refcnt = 1; if (m != NULL) { m->m_ext.ext_buf = (caddr_t)mem; m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT; m->m_ext.ext_free = NULL; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = type; m->m_ext.ref_cnt = refcnt; } return (0); } /* * The Mbuf Cluster zone destructor. */ static void mb_dtor_clust(void *mem, int size, void *arg) { #ifdef INVARIANTS uma_zone_t zone; zone = m_getzone(size); KASSERT(*(uma_find_refcnt(zone, mem)) <= 1, ("%s: refcnt incorrect %u", __func__, *(uma_find_refcnt(zone, mem))) ); trash_dtor(mem, size, arg); #endif } /* * The Packet secondary zone's init routine, executed on the * object's transition from mbuf keg slab to zone cache. */ static int mb_zinit_pack(void *mem, int size, int how) { struct mbuf *m; m = (struct mbuf *)mem; /* m is virgin. */ if (uma_zalloc_arg(zone_clust, m, how) == NULL || m->m_ext.ext_buf == NULL) return (ENOMEM); m->m_ext.ext_type = EXT_PACKET; /* Override. */ #ifdef INVARIANTS trash_init(m->m_ext.ext_buf, MCLBYTES, how); #endif return (0); } /* * The Packet secondary zone's fini routine, executed on the * object's transition from zone cache to keg slab. */ static void mb_zfini_pack(void *mem, int size) { struct mbuf *m; m = (struct mbuf *)mem; #ifdef INVARIANTS trash_fini(m->m_ext.ext_buf, MCLBYTES); #endif uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); #ifdef INVARIANTS trash_dtor(mem, size, NULL); #endif } /* * The "packet" keg constructor. */ static int mb_ctor_pack(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; #ifdef MAC int error; #endif int flags; short type; m = (struct mbuf *)mem; args = (struct mb_args *)arg; flags = args->flags; type = args->type; #ifdef INVARIANTS trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); #endif m->m_next = NULL; m->m_nextpkt = NULL; m->m_data = m->m_ext.ext_buf; m->m_len = 0; m->m_flags = (flags | M_EXT); m->m_type = type; if (flags & M_PKTHDR) { m->m_pkthdr.rcvif = NULL; m->m_pkthdr.len = 0; m->m_pkthdr.header = NULL; m->m_pkthdr.csum_flags = 0; m->m_pkthdr.csum_data = 0; m->m_pkthdr.tso_segsz = 0; m->m_pkthdr.ether_vtag = 0; SLIST_INIT(&m->m_pkthdr.tags); #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); if (error) return (error); #endif } /* m_ext is already initialized. */ return (0); } /* * This is the protocol drain routine. * * No locks should be held when this is called. The drain routines have to * presently acquire some locks which raises the possibility of lock order * reversal. */ static void mb_reclaim(void *junk) { struct domain *dp; struct protosw *pr; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, "mb_reclaim()"); for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) (*pr->pr_drain)(); } Index: head/sys/vm/uma.h =================================================================== --- head/sys/vm/uma.h (revision 177920) +++ head/sys/vm/uma.h (revision 177921) @@ -1,585 +1,586 @@ /*- * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ /* * uma.h - External definitions for the Universal Memory Allocator * */ #ifndef VM_UMA_H #define VM_UMA_H #include /* For NULL */ #include /* For M_* */ /* User visable parameters */ #define UMA_SMALLEST_UNIT (PAGE_SIZE / 256) /* Smallest item allocated */ /* Types and type defs */ struct uma_zone; /* Opaque type used as a handle to the zone */ typedef struct uma_zone * uma_zone_t; void zone_drain(uma_zone_t); /* * Item constructor * * Arguments: * item A pointer to the memory which has been allocated. * arg The arg field passed to uma_zalloc_arg * size The size of the allocated item * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The constructor is called just before the memory is returned * to the user. It may block if necessary. */ typedef int (*uma_ctor)(void *mem, int size, void *arg, int flags); /* * Item destructor * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being destructed. * arg Argument passed through uma_zfree_arg * * Returns: * Nothing * * Discussion: * The destructor may perform operations that differ from those performed * by the initializer, but it must leave the object in the same state. * This IS type stable storage. This is called after EVERY zfree call. */ typedef void (*uma_dtor)(void *mem, int size, void *arg); /* * Item initializer * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being initialized. * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The initializer is called when the memory is cached in the uma zone. * this should be the same state that the destructor leaves the object in. */ typedef int (*uma_init)(void *mem, int size, int flags); /* * Item discard function * * Arguments: * item A pointer to memory which has been 'freed' but has not left the * zone's cache. * size The size of the item being discarded. * * Returns: * Nothing * * Discussion: * This routine is called when memory leaves a zone and is returned to the * system for other uses. It is the counter part to the init function. */ typedef void (*uma_fini)(void *mem, int size); /* * What's the difference between initializing and constructing? * * The item is initialized when it is cached, and this is the state that the * object should be in when returned to the allocator. The purpose of this is * to remove some code which would otherwise be called on each allocation by * utilizing a known, stable state. This differs from the constructor which * will be called on EVERY allocation. * * For example, in the initializer you may want to initialize embeded locks, * NULL list pointers, set up initial states, magic numbers, etc. This way if * the object is held in the allocator and re-used it won't be necessary to * re-initialize it. * * The constructor may be used to lock a data structure, link it on to lists, * bump reference counts or total counts of outstanding structures, etc. * */ /* Function proto types */ /* * Create a new uma zone * * Arguments: * name The text name of the zone for debugging and stats, this memory * should not be freed until the zone has been deallocated. * size The size of the object that is being created. * ctor The constructor that is called when the object is allocated * dtor The destructor that is called when the object is freed. * init An initializer that sets up the initial state of the memory. * fini A discard function that undoes initialization done by init. * ctor/dtor/init/fini may all be null, see notes above. * align A bitmask that corisponds to the requested alignment * eg 4 would be 0x3 * flags A set of parameters that control the behavior of the zone * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, u_int32_t flags); /* * Create a secondary uma zone * * Arguments: * name The text name of the zone for debugging and stats, this memory * should not be freed until the zone has been deallocated. * ctor The constructor that is called when the object is allocated * dtor The destructor that is called when the object is freed. * zinit An initializer that sets up the initial state of the memory * as the object passes from the Keg's slab to the Zone's cache. * zfini A discard function that undoes initialization done by init * as the object passes from the Zone's cache to the Keg's slab. * * ctor/dtor/zinit/zfini may all be null, see notes above. * Note that the zinit and zfini specified here are NOT * exactly the same as the init/fini specified to uma_zcreate() * when creating a master zone. These zinit/zfini are called * on the TRANSITION from keg to zone (and vice-versa). Once * these are set, the primary zone may alter its init/fini * (which are called when the object passes from VM to keg) * using uma_zone_set_init/fini()) as well as its own * zinit/zfini (unset by default for master zone) with * uma_zone_set_zinit/zfini() (note subtle 'z' prefix). * * master A reference to this zone's Master Zone (Primary Zone), * which contains the backing Keg for the Secondary Zone * being added. * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t master); /* * Definitions for uma_zcreate flags * * These flags share space with UMA_ZFLAGs in uma_int.h. Be careful not to * overlap when adding new features. 0xf0000000 is in use by uma_int.h. */ #define UMA_ZONE_PAGEABLE 0x0001 /* Return items not fully backed by physical memory XXX Not yet */ #define UMA_ZONE_ZINIT 0x0002 /* Initialize with zeros */ #define UMA_ZONE_STATIC 0x0004 /* Staticly sized zone */ #define UMA_ZONE_OFFPAGE 0x0008 /* Force the slab structure allocation off of the real memory */ #define UMA_ZONE_MALLOC 0x0010 /* For use by malloc(9) only! */ #define UMA_ZONE_NOFREE 0x0020 /* Do not free slabs of this type! */ #define UMA_ZONE_MTXCLASS 0x0040 /* Create a new lock class */ #define UMA_ZONE_VM 0x0080 /* * Used for internal vm datastructures * only. */ #define UMA_ZONE_HASH 0x0100 /* * Use a hash table instead of caching * information in the vm_page. */ #define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */ #define UMA_ZONE_REFCNT 0x0400 /* Allocate refcnts in slabs */ #define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets */ /* Definitions for align */ #define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */ #define UMA_ALIGN_LONG (sizeof(long) - 1) /* "" long */ #define UMA_ALIGN_INT (sizeof(int) - 1) /* "" int */ #define UMA_ALIGN_SHORT (sizeof(short) - 1) /* "" short */ #define UMA_ALIGN_CHAR (sizeof(char) - 1) /* "" char */ #define UMA_ALIGN_CACHE (0 - 1) /* Cache line size align */ /* * Destroys an empty uma zone. If the zone is not empty uma complains loudly. * * Arguments: * zone The zone we want to destroy. * */ void uma_zdestroy(uma_zone_t zone); /* * Allocates an item out of a zone * * Arguments: * zone The zone we are allocating from * arg This data is passed to the ctor function * flags See sys/malloc.h for available flags. * * Returns: * A non null pointer to an initialized element from the zone is * garanteed if the wait flag is M_WAITOK, otherwise a null pointer may be * returned if the zone is empty or the ctor failed. */ void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags); /* * Allocates an item out of a zone without supplying an argument * * This is just a wrapper for uma_zalloc_arg for convenience. * */ static __inline void *uma_zalloc(uma_zone_t zone, int flags); static __inline void * uma_zalloc(uma_zone_t zone, int flags) { return uma_zalloc_arg(zone, NULL, flags); } /* * Frees an item back into the specified zone. * * Arguments: * zone The zone the item was originally allocated out of. * item The memory to be freed. * arg Argument passed to the destructor * * Returns: * Nothing. */ void uma_zfree_arg(uma_zone_t zone, void *item, void *arg); /* * Frees an item back to a zone without supplying an argument * * This is just a wrapper for uma_zfree_arg for convenience. * */ static __inline void uma_zfree(uma_zone_t zone, void *item); static __inline void uma_zfree(uma_zone_t zone, void *item) { uma_zfree_arg(zone, item, NULL); } /* * XXX The rest of the prototypes in this header are h0h0 magic for the VM. * If you think you need to use it for a normal zone you're probably incorrect. */ /* * Backend page supplier routines * * Arguments: * zone The zone that is requesting pages * size The number of bytes being requested * pflag Flags for these memory pages, see below. * wait Indicates our willingness to block. * * Returns: * A pointer to the alloced memory or NULL on failure. */ typedef void *(*uma_alloc)(uma_zone_t zone, int size, u_int8_t *pflag, int wait); /* * Backend page free routines * * Arguments: * item A pointer to the previously allocated pages * size The original size of the allocation * pflag The flags for the slab. See UMA_SLAB_* below * * Returns: * None */ typedef void (*uma_free)(void *item, int size, u_int8_t pflag); /* * Sets up the uma allocator. (Called by vm_mem_init) * * Arguments: * bootmem A pointer to memory used to bootstrap the system. * * Returns: * Nothing * * Discussion: * This memory is used for zones which allocate things before the * backend page supplier can give us pages. It should be * UMA_SLAB_SIZE * boot_pages bytes. (see uma_int.h) * */ void uma_startup(void *bootmem, int boot_pages); /* * Finishes starting up the allocator. This should * be called when kva is ready for normal allocs. * * Arguments: * None * * Returns: * Nothing * * Discussion: * uma_startup2 is called by kmeminit() to enable us of uma for malloc. */ void uma_startup2(void); /* * Reclaims unused memory for all zones * * Arguments: * None * Returns: * None * * This should only be called by the page out daemon. */ void uma_reclaim(void); /* * Sets the alignment mask to be used for all zones requesting cache * alignment. Should be called by MD boot code prior to starting VM/UMA. * * Arguments: * align The alignment mask * * Returns: * Nothing */ void uma_set_align(int align); /* * Switches the backing object of a zone * * Arguments: * zone The zone to update * obj The obj to use for future allocations * size The size of the object to allocate * * Returns: * 0 if kva space can not be allocated * 1 if successful * * Discussion: * A NULL object can be used and uma will allocate one for you. Setting * the size will limit the amount of memory allocated to this zone. * */ struct vm_object; int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int size); /* * Sets a high limit on the number of items allowed in a zone * * Arguments: * zone The zone to limit * * Returns: * Nothing */ void uma_zone_set_max(uma_zone_t zone, int nitems); /* * The following two routines (uma_zone_set_init/fini) * are used to set the backend init/fini pair which acts on an * object as it becomes allocated and is placed in a slab within * the specified zone's backing keg. These should probably not * be changed once allocations have already begun and only * immediately upon zone creation. */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit); void uma_zone_set_fini(uma_zone_t zone, uma_fini fini); /* * The following two routines (uma_zone_set_zinit/zfini) are * used to set the zinit/zfini pair which acts on an object as * it passes from the backing Keg's slab cache to the * specified Zone's bucket cache. These should probably not * be changed once allocations have already begun and * only immediately upon zone creation. */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit); void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini); /* * Replaces the standard page_alloc or obj_alloc functions for this zone * * Arguments: * zone The zone whos back end allocator is being changed. * allocf A pointer to the allocation function * * Returns: * Nothing * * Discussion: * This could be used to implement pageable allocation, or perhaps * even DMA allocators if used in conjunction with the OFFPAGE * zone flag. */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf); /* * Used for freeing memory provided by the allocf above * * Arguments: * zone The zone that intends to use this free routine. * freef The page freeing routine. * * Returns: * Nothing */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef); /* * These flags are setable in the allocf and visable in the freef. */ #define UMA_SLAB_BOOT 0x01 /* Slab alloced from boot pages */ #define UMA_SLAB_KMEM 0x02 /* Slab alloced from kmem_map */ +#define UMA_SLAB_KERNEL 0x04 /* Slab alloced from kernel_map */ #define UMA_SLAB_PRIV 0x08 /* Slab alloced from priv allocator */ #define UMA_SLAB_OFFP 0x10 /* Slab is managed separately */ #define UMA_SLAB_MALLOC 0x20 /* Slab is a large malloc slab */ /* 0x40 and 0x80 are available */ /* * Used to pre-fill a zone with some number of items * * Arguments: * zone The zone to fill * itemcnt The number of items to reserve * * Returns: * Nothing * * NOTE: This is blocking and should only be done at startup */ void uma_prealloc(uma_zone_t zone, int itemcnt); /* * Used to lookup the reference counter allocated for an item * from a UMA_ZONE_REFCNT zone. For UMA_ZONE_REFCNT zones, * reference counters are allocated for items and stored in * the underlying slab header. * * Arguments: * zone The UMA_ZONE_REFCNT zone to which the item belongs. * item The address of the item for which we want a refcnt. * * Returns: * A pointer to a u_int32_t reference counter. */ u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item); /* * Used to determine if a fixed-size zone is exhausted. * * Arguments: * zone The zone to check * * Returns: * Non-zero if zone is exhausted. */ int uma_zone_exhausted(uma_zone_t zone); int uma_zone_exhausted_nolock(uma_zone_t zone); /* * Exported statistics structures to be used by user space monitoring tools. * Statistics stream consusts of a uma_stream_header, followed by a series of * alternative uma_type_header and uma_type_stat structures. Statistics * structures */ #define UMA_STREAM_VERSION 0x00000001 struct uma_stream_header { u_int32_t ush_version; /* Stream format version. */ u_int32_t ush_maxcpus; /* Value of MAXCPU for stream. */ u_int32_t ush_count; /* Number of records. */ u_int32_t _ush_pad; /* Pad/reserved field. */ }; #define UTH_MAX_NAME 32 #define UTH_ZONE_SECONDARY 0x00000001 struct uma_type_header { /* * Static per-zone data, some extracted from the supporting keg. */ char uth_name[UTH_MAX_NAME]; u_int32_t uth_align; /* Keg: alignment. */ u_int32_t uth_size; /* Keg: requested size of item. */ u_int32_t uth_rsize; /* Keg: real size of item. */ u_int32_t uth_maxpages; /* Keg: maximum number of pages. */ u_int32_t uth_limit; /* Keg: max items to allocate. */ /* * Current dynamic zone/keg-derived statistics. */ u_int32_t uth_pages; /* Keg: pages allocated. */ u_int32_t uth_keg_free; /* Keg: items free. */ u_int32_t uth_zone_free; /* Zone: items free. */ u_int32_t uth_bucketsize; /* Zone: desired bucket size. */ u_int32_t uth_zone_flags; /* Zone: flags. */ u_int64_t uth_allocs; /* Zone: number of allocations. */ u_int64_t uth_frees; /* Zone: number of frees. */ u_int64_t uth_fails; /* Zone: number of alloc failures. */ u_int64_t _uth_reserved1[3]; /* Reserved. */ }; struct uma_percpu_stat { u_int64_t ups_allocs; /* Cache: number of alloctions. */ u_int64_t ups_frees; /* Cache: number of frees. */ u_int64_t ups_cache_free; /* Cache: free items in cache. */ u_int64_t _ups_reserved[5]; /* Reserved. */ }; #endif Index: head/sys/vm/uma_core.c =================================================================== --- head/sys/vm/uma_core.c (revision 177920) +++ head/sys/vm/uma_core.c (revision 177921) @@ -1,3013 +1,3017 @@ /*- * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * Copyright (c) 2004-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * uma_core.c Implementation of the Universal Memory allocator * * This allocator is intended to replace the multitude of similar object caches * in the standard FreeBSD kernel. The intent is to be flexible as well as * effecient. A primary design goal is to return unused memory to the rest of * the system. This will make the system as a whole more flexible due to the * ability to move memory to subsystems which most need it instead of leaving * pools of reserved memory unused. * * The basic ideas stem from similar slab/zone based allocators whose algorithms * are well known. * */ /* * TODO: * - Improve memory usage for large allocations * - Investigate cache size adjustments */ #include __FBSDID("$FreeBSD$"); /* I should really use ktr.. */ /* #define UMA_DEBUG 1 #define UMA_DEBUG_ALLOC 1 #define UMA_DEBUG_ALLOC_1 1 */ #include "opt_ddb.h" #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This is the zone and keg from which all zones are spawned. The idea is that * even the zone & keg heads are allocated from the allocator, so we use the * bss section to bootstrap us. */ static struct uma_keg masterkeg; static struct uma_zone masterzone_k; static struct uma_zone masterzone_z; static uma_zone_t kegs = &masterzone_k; static uma_zone_t zones = &masterzone_z; /* This is the zone from which all of uma_slab_t's are allocated. */ static uma_zone_t slabzone; static uma_zone_t slabrefzone; /* With refcounters (for UMA_ZONE_REFCNT) */ /* * The initial hash tables come out of this zone so they can be allocated * prior to malloc coming up. */ static uma_zone_t hashzone; /* The boot-time adjusted value for cache line alignment. */ static int uma_align_cache = 16 - 1; static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); /* * Are we allowed to allocate buckets? */ static int bucketdisable = 1; /* Linked list of all kegs in the system */ static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(&uma_kegs); /* This mutex protects the keg list */ static struct mtx uma_mtx; /* Linked list of boot time pages */ static LIST_HEAD(,uma_slab) uma_boot_pages = LIST_HEAD_INITIALIZER(&uma_boot_pages); /* This mutex protects the boot time pages list */ static struct mtx uma_boot_pages_mtx; /* Is the VM done starting up? */ static int booted = 0; /* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */ static u_int uma_max_ipers; static u_int uma_max_ipers_ref; /* * This is the handle used to schedule events that need to happen * outside of the allocation fast path. */ static struct callout uma_callout; #define UMA_TIMEOUT 20 /* Seconds for callout interval. */ /* * This structure is passed as the zone ctor arg so that I don't have to create * a special allocation function just for zones. */ struct uma_zctor_args { char *name; size_t size; uma_ctor ctor; uma_dtor dtor; uma_init uminit; uma_fini fini; uma_keg_t keg; int align; u_int32_t flags; }; struct uma_kctor_args { uma_zone_t zone; size_t size; uma_init uminit; uma_fini fini; int align; u_int32_t flags; }; struct uma_bucket_zone { uma_zone_t ubz_zone; char *ubz_name; int ubz_entries; }; #define BUCKET_MAX 128 struct uma_bucket_zone bucket_zones[] = { { NULL, "16 Bucket", 16 }, { NULL, "32 Bucket", 32 }, { NULL, "64 Bucket", 64 }, { NULL, "128 Bucket", 128 }, { NULL, NULL, 0} }; #define BUCKET_SHIFT 4 #define BUCKET_ZONES ((BUCKET_MAX >> BUCKET_SHIFT) + 1) /* * bucket_size[] maps requested bucket sizes to zones that allocate a bucket * of approximately the right size. */ static uint8_t bucket_size[BUCKET_ZONES]; /* * Flags and enumerations to be passed to internal functions. */ enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI }; #define ZFREE_STATFAIL 0x00000001 /* Update zone failure statistic. */ #define ZFREE_STATFREE 0x00000002 /* Update zone free statistic. */ /* Prototypes.. */ static void *obj_alloc(uma_zone_t, int, u_int8_t *, int); static void *page_alloc(uma_zone_t, int, u_int8_t *, int); static void *startup_alloc(uma_zone_t, int, u_int8_t *, int); static void page_free(void *, int, u_int8_t); static uma_slab_t slab_zalloc(uma_zone_t, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_drain(uma_zone_t zone); static int keg_ctor(void *, int, void *, int); static void keg_dtor(void *, int, void *); static int zone_ctor(void *, int, void *, int); static void zone_dtor(void *, int, void *); static int zero_init(void *, int, int); static void zone_small_init(uma_zone_t zone); static void zone_large_init(uma_zone_t zone); static void zone_foreach(void (*zfunc)(uma_zone_t)); static void zone_timeout(uma_zone_t zone); static int hash_alloc(struct uma_hash *); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_startup3(void); static void *uma_zalloc_internal(uma_zone_t, void *, int); static void uma_zfree_internal(uma_zone_t, void *, void *, enum zfreeskip, int); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(int, int); static void bucket_free(uma_bucket_t); static void bucket_zone_drain(void); static int uma_zalloc_bucket(uma_zone_t zone, int flags); static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags); static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab); static uma_zone_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, u_int32_t flags); void uma_print_zone(uma_zone_t); void uma_print_stats(void); static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); #ifdef WITNESS static int nosleepwithlocks = 1; #else static int nosleepwithlocks = 0; #endif SYSCTL_INT(_debug, OID_AUTO, nosleepwithlocks, CTLFLAG_RW, &nosleepwithlocks, 0, "Convert M_WAITOK to M_NOWAIT to avoid lock-held-across-sleep paths"); SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT, 0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones"); SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats"); /* * This routine checks to see whether or not it's safe to enable buckets. */ static void bucket_enable(void) { if (cnt.v_free_count < cnt.v_free_min) bucketdisable = 1; else bucketdisable = 0; } /* * Initialize bucket_zones, the array of zones of buckets of various sizes. * * For each zone, calculate the memory required for each bucket, consisting * of the header and an array of pointers. Initialize bucket_size[] to point * the range of appropriate bucket sizes at the zone. */ static void bucket_init(void) { struct uma_bucket_zone *ubz; int i; int j; for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) { int size; ubz = &bucket_zones[j]; size = roundup(sizeof(struct uma_bucket), sizeof(void *)); size += sizeof(void *) * ubz->ubz_entries; ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT)) bucket_size[i >> BUCKET_SHIFT] = j; } } /* * Given a desired number of entries for a bucket, return the zone from which * to allocate the bucket. */ static struct uma_bucket_zone * bucket_zone_lookup(int entries) { int idx; idx = howmany(entries, 1 << BUCKET_SHIFT); return (&bucket_zones[bucket_size[idx]]); } static uma_bucket_t bucket_alloc(int entries, int bflags) { struct uma_bucket_zone *ubz; uma_bucket_t bucket; /* * This is to stop us from allocating per cpu buckets while we're * running out of vm.boot_pages. Otherwise, we would exhaust the * boot pages. This also prevents us from allocating buckets in * low memory situations. */ if (bucketdisable) return (NULL); ubz = bucket_zone_lookup(entries); bucket = uma_zalloc_internal(ubz->ubz_zone, NULL, bflags); if (bucket) { #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); #endif bucket->ub_cnt = 0; bucket->ub_entries = ubz->ubz_entries; } return (bucket); } static void bucket_free(uma_bucket_t bucket) { struct uma_bucket_zone *ubz; ubz = bucket_zone_lookup(bucket->ub_entries); uma_zfree_internal(ubz->ubz_zone, bucket, NULL, SKIP_NONE, ZFREE_STATFREE); } static void bucket_zone_drain(void) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) zone_drain(ubz->ubz_zone); } /* * Routine called by timeout which is used to fire off some time interval * based calculations. (stats, hash size, etc.) * * Arguments: * arg Unused * * Returns: * Nothing */ static void uma_timeout(void *unused) { bucket_enable(); zone_foreach(zone_timeout); /* Reschedule this event */ callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); } /* * Routine to perform timeout driven calculations. This expands the * hashes and does per cpu statistics aggregation. * * Arguments: * zone The zone to operate on * * Returns: * Nothing */ static void zone_timeout(uma_zone_t zone) { uma_keg_t keg; u_int64_t alloc; keg = zone->uz_keg; alloc = 0; /* * Expand the zone hash table. * * This is done if the number of slabs is larger than the hash size. * What I'm trying to do here is completely reduce collisions. This * may be a little aggressive. Should I allow for two collisions max? */ ZONE_LOCK(zone); if (keg->uk_flags & UMA_ZONE_HASH && keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; /* * This is so involved because allocating and freeing * while the zone lock is held will lead to deadlock. * I have to do everything in stages and check for * races. */ newhash = keg->uk_hash; ZONE_UNLOCK(zone); ret = hash_alloc(&newhash); ZONE_LOCK(zone); if (ret) { if (hash_expand(&keg->uk_hash, &newhash)) { oldhash = keg->uk_hash; keg->uk_hash = newhash; } else oldhash = newhash; ZONE_UNLOCK(zone); hash_free(&oldhash); ZONE_LOCK(zone); } } ZONE_UNLOCK(zone); } /* * Allocate and zero fill the next sized hash table from the appropriate * backing store. * * Arguments: * hash A new hash structure with the old hash size in uh_hashsize * * Returns: * 1 on sucess and 0 on failure. */ static int hash_alloc(struct uma_hash *hash) { int oldsize; int alloc; oldsize = hash->uh_hashsize; /* We're just going to go to a power of two greater */ if (oldsize) { hash->uh_hashsize = oldsize * 2; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; hash->uh_slab_hash = (struct slabhead *)malloc(alloc, M_UMAHASH, M_NOWAIT); } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = uma_zalloc_internal(hashzone, NULL, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { bzero(hash->uh_slab_hash, alloc); hash->uh_hashmask = hash->uh_hashsize - 1; return (1); } return (0); } /* * Expands the hash table for HASH zones. This is done from zone_timeout * to reduce collisions. This must not be done in the regular allocation * path, otherwise, we can recurse on the vm while allocating pages. * * Arguments: * oldhash The hash you want to expand * newhash The hash structure for the new table * * Returns: * Nothing * * Discussion: */ static int hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_slab_t slab; int hval; int i; if (!newhash->uh_slab_hash) return (0); if (oldhash->uh_hashsize >= newhash->uh_hashsize) return (0); /* * I need to investigate hash algorithms for resizing without a * full rehash. */ for (i = 0; i < oldhash->uh_hashsize; i++) while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) { slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]); SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink); hval = UMA_HASH(newhash, slab->us_data); SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, us_hlink); } return (1); } /* * Free the hash bucket to the appropriate backing store. * * Arguments: * slab_hash The hash bucket we're freeing * hashsize The number of entries in that hash bucket * * Returns: * Nothing */ static void hash_free(struct uma_hash *hash) { if (hash->uh_slab_hash == NULL) return; if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) uma_zfree_internal(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE); else free(hash->uh_slab_hash, M_UMAHASH); } /* * Frees all outstanding items in a bucket * * Arguments: * zone The zone to free to, must be unlocked. * bucket The free/alloc bucket with items, cpu queue must be locked. * * Returns: * Nothing */ static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { uma_slab_t slab; int mzone; void *item; if (bucket == NULL) return; slab = NULL; mzone = 0; /* We have to lookup the slab again for malloc.. */ if (zone->uz_keg->uk_flags & UMA_ZONE_MALLOC) mzone = 1; while (bucket->ub_cnt > 0) { bucket->ub_cnt--; item = bucket->ub_bucket[bucket->ub_cnt]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt] = NULL; KASSERT(item != NULL, ("bucket_drain: botched ptr, item is NULL")); #endif /* * This is extremely inefficient. The slab pointer was passed * to uma_zfree_arg, but we lost it because the buckets don't * hold them. This will go away when free() gets a size passed * to it. */ if (mzone) slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK)); uma_zfree_internal(zone, item, slab, SKIP_DTOR, 0); } } /* * Drains the per cpu caches for a zone. * * NOTE: This may only be called while the zone is being turn down, and not * during normal operation. This is necessary in order that we do not have * to migrate CPUs to drain the per-CPU caches. * * Arguments: * zone The zone to drain, must be unlocked. * * Returns: * Nothing */ static void cache_drain(uma_zone_t zone) { uma_cache_t cache; int cpu; /* * XXX: It is safe to not lock the per-CPU caches, because we're * tearing down the zone anyway. I.e., there will be no further use * of the caches at this point. * * XXX: It would good to be able to assert that the zone is being * torn down to prevent improper use of cache_drain(). * * XXX: We lock the zone before passing into bucket_cache_drain() as * it is used elsewhere. Should the tear-down path be made special * there in some form? */ for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; cache = &zone->uz_cpu[cpu]; bucket_drain(zone, cache->uc_allocbucket); bucket_drain(zone, cache->uc_freebucket); if (cache->uc_allocbucket != NULL) bucket_free(cache->uc_allocbucket); if (cache->uc_freebucket != NULL) bucket_free(cache->uc_freebucket); cache->uc_allocbucket = cache->uc_freebucket = NULL; } ZONE_LOCK(zone); bucket_cache_drain(zone); ZONE_UNLOCK(zone); } /* * Drain the cached buckets from a zone. Expects a locked zone on entry. */ static void bucket_cache_drain(uma_zone_t zone) { uma_bucket_t bucket; /* * Drain the bucket queues and free the buckets, we just keep two per * cpu (alloc/free). */ while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); ZONE_UNLOCK(zone); bucket_drain(zone, bucket); bucket_free(bucket); ZONE_LOCK(zone); } /* Now we do the free queue.. */ while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); bucket_free(bucket); } } /* * Frees pages from a zone back to the system. This is done on demand from * the pageout daemon. * * Arguments: * zone The zone to free pages from * all Should we drain all items? * * Returns: * Nothing. */ void zone_drain(uma_zone_t zone) { struct slabhead freeslabs = { 0 }; uma_keg_t keg; uma_slab_t slab; uma_slab_t n; u_int8_t flags; u_int8_t *mem; int i; keg = zone->uz_keg; /* * We don't want to take pages from statically allocated zones at this * time */ if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL) return; ZONE_LOCK(zone); #ifdef UMA_DEBUG printf("%s free items: %u\n", zone->uz_name, keg->uk_free); #endif bucket_cache_drain(zone); if (keg->uk_free == 0) goto finished; slab = LIST_FIRST(&keg->uk_free_slab); while (slab) { n = LIST_NEXT(slab, us_link); /* We have no where to free these to */ if (slab->us_flags & UMA_SLAB_BOOT) { slab = n; continue; } LIST_REMOVE(slab, us_link); keg->uk_pages -= keg->uk_ppera; keg->uk_free -= keg->uk_ipers; if (keg->uk_flags & UMA_ZONE_HASH) UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data); SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); slab = n; } finished: ZONE_UNLOCK(zone); while ((slab = SLIST_FIRST(&freeslabs)) != NULL) { SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink); if (keg->uk_fini) for (i = 0; i < keg->uk_ipers; i++) keg->uk_fini( slab->us_data + (keg->uk_rsize * i), keg->uk_size); flags = slab->us_flags; mem = slab->us_data; if ((keg->uk_flags & UMA_ZONE_MALLOC) || (keg->uk_flags & UMA_ZONE_REFCNT)) { vm_object_t obj; if (flags & UMA_SLAB_KMEM) obj = kmem_object; + else if (flags & UMA_SLAB_KERNEL) + obj = kernel_object; else obj = NULL; for (i = 0; i < keg->uk_ppera; i++) vsetobj((vm_offset_t)mem + (i * PAGE_SIZE), obj); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) uma_zfree_internal(keg->uk_slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); #ifdef UMA_DEBUG printf("%s: Returning %d bytes.\n", zone->uz_name, UMA_SLAB_SIZE * keg->uk_ppera); #endif keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags); } } /* * Allocate a new slab for a zone. This does not insert the slab onto a list. * * Arguments: * zone The zone to allocate slabs for * wait Shall we wait? * * Returns: * The slab that was allocated or NULL if there is no memory and the * caller specified M_NOWAIT. */ static uma_slab_t slab_zalloc(uma_zone_t zone, int wait) { uma_slabrefcnt_t slabref; uma_slab_t slab; uma_keg_t keg; u_int8_t *mem; u_int8_t flags; int i; slab = NULL; keg = zone->uz_keg; #ifdef UMA_DEBUG printf("slab_zalloc: Allocating a new slab for %s\n", zone->uz_name); #endif ZONE_UNLOCK(zone); if (keg->uk_flags & UMA_ZONE_OFFPAGE) { slab = uma_zalloc_internal(keg->uk_slabzone, NULL, wait); if (slab == NULL) { ZONE_LOCK(zone); return NULL; } } /* * This reproduces the old vm_zone behavior of zero filling pages the * first time they are added to a zone. * * Malloced items are zeroed in uma_zalloc. */ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) wait |= M_ZERO; else wait &= ~M_ZERO; mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE, &flags, wait); if (mem == NULL) { if (keg->uk_flags & UMA_ZONE_OFFPAGE) uma_zfree_internal(keg->uk_slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); ZONE_LOCK(zone); return (NULL); } /* Point the slab into the allocated memory */ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) slab = (uma_slab_t )(mem + keg->uk_pgoff); if ((keg->uk_flags & UMA_ZONE_MALLOC) || (keg->uk_flags & UMA_ZONE_REFCNT)) for (i = 0; i < keg->uk_ppera; i++) vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab); slab->us_keg = keg; slab->us_data = mem; slab->us_freecount = keg->uk_ipers; slab->us_firstfree = 0; slab->us_flags = flags; if (keg->uk_flags & UMA_ZONE_REFCNT) { slabref = (uma_slabrefcnt_t)slab; for (i = 0; i < keg->uk_ipers; i++) { slabref->us_freelist[i].us_refcnt = 0; slabref->us_freelist[i].us_item = i+1; } } else { for (i = 0; i < keg->uk_ipers; i++) slab->us_freelist[i].us_item = i+1; } if (keg->uk_init != NULL) { for (i = 0; i < keg->uk_ipers; i++) if (keg->uk_init(slab->us_data + (keg->uk_rsize * i), keg->uk_size, wait) != 0) break; if (i != keg->uk_ipers) { if (keg->uk_fini != NULL) { for (i--; i > -1; i--) keg->uk_fini(slab->us_data + (keg->uk_rsize * i), keg->uk_size); } if ((keg->uk_flags & UMA_ZONE_MALLOC) || (keg->uk_flags & UMA_ZONE_REFCNT)) { vm_object_t obj; if (flags & UMA_SLAB_KMEM) obj = kmem_object; + else if (flags & UMA_SLAB_KERNEL) + obj = kernel_object; else obj = NULL; for (i = 0; i < keg->uk_ppera; i++) vsetobj((vm_offset_t)mem + (i * PAGE_SIZE), obj); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) uma_zfree_internal(keg->uk_slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags); ZONE_LOCK(zone); return (NULL); } } ZONE_LOCK(zone); if (keg->uk_flags & UMA_ZONE_HASH) UMA_HASH_INSERT(&keg->uk_hash, slab, mem); keg->uk_pages += keg->uk_ppera; keg->uk_free += keg->uk_ipers; return (slab); } /* * This function is intended to be used early on in place of page_alloc() so * that we may use the boot time page cache to satisfy allocations before * the VM is ready. */ static void * startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait) { uma_keg_t keg; uma_slab_t tmps; keg = zone->uz_keg; /* * Check our small startup cache to see if it has pages remaining. */ mtx_lock(&uma_boot_pages_mtx); if ((tmps = LIST_FIRST(&uma_boot_pages)) != NULL) { LIST_REMOVE(tmps, us_link); mtx_unlock(&uma_boot_pages_mtx); *pflag = tmps->us_flags; return (tmps->us_data); } mtx_unlock(&uma_boot_pages_mtx); if (booted == 0) panic("UMA: Increase vm.boot_pages"); /* * Now that we've booted reset these users to their real allocator. */ #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = uma_small_alloc; #else keg->uk_allocf = page_alloc; #endif return keg->uk_allocf(zone, bytes, pflag, wait); } /* * Allocates a number of pages from the system * * Arguments: * zone Unused * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KMEM; p = (void *) kmem_malloc(kmem_map, bytes, wait); return (p); } /* * Allocates a number of pages from within an object * * Arguments: * zone Unused * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { vm_object_t object; vm_offset_t retkva, zkva; vm_page_t p; int pages, startpages; object = zone->uz_keg->uk_obj; retkva = 0; /* * This looks a little weird since we're getting one page at a time. */ VM_OBJECT_LOCK(object); p = TAILQ_LAST(&object->memq, pglist); pages = p != NULL ? p->pindex + 1 : 0; startpages = pages; zkva = zone->uz_keg->uk_kva + pages * PAGE_SIZE; for (; bytes > 0; bytes -= PAGE_SIZE) { p = vm_page_alloc(object, pages, VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (p == NULL) { if (pages != startpages) pmap_qremove(retkva, pages - startpages); while (pages != startpages) { pages--; p = TAILQ_LAST(&object->memq, pglist); vm_page_lock_queues(); vm_page_unwire(p, 0); vm_page_free(p); vm_page_unlock_queues(); } retkva = 0; goto done; } pmap_qenter(zkva, &p, 1); if (retkva == 0) retkva = zkva; zkva += PAGE_SIZE; pages += 1; } done: VM_OBJECT_UNLOCK(object); *flags = UMA_SLAB_PRIV; return ((void *)retkva); } /* * Frees a number of pages to the system * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void page_free(void *mem, int size, u_int8_t flags) { vm_map_t map; if (flags & UMA_SLAB_KMEM) map = kmem_map; else panic("UMA: page_free used with invalid flags %d\n", flags); kmem_free(map, (vm_offset_t)mem, size); } /* * Zero fill initializer * * Arguments/Returns follow uma_init specifications */ static int zero_init(void *mem, int size, int flags) { bzero(mem, size); return (0); } /* * Finish creating a small uma zone. This calculates ipers, and the zone size. * * Arguments * zone The zone we should initialize * * Returns * Nothing */ static void zone_small_init(uma_zone_t zone) { uma_keg_t keg; u_int rsize; u_int memused; u_int wastedspace; u_int shsize; keg = zone->uz_keg; KASSERT(keg != NULL, ("Keg is null in zone_small_init")); rsize = keg->uk_size; if (rsize < UMA_SMALLEST_UNIT) rsize = UMA_SMALLEST_UNIT; if (rsize & keg->uk_align) rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1); keg->uk_rsize = rsize; keg->uk_ppera = 1; if (keg->uk_flags & UMA_ZONE_REFCNT) { rsize += UMA_FRITMREF_SZ; /* linkage & refcnt */ shsize = sizeof(struct uma_slab_refcnt); } else { rsize += UMA_FRITM_SZ; /* Account for linkage */ shsize = sizeof(struct uma_slab); } keg->uk_ipers = (UMA_SLAB_SIZE - shsize) / rsize; KASSERT(keg->uk_ipers != 0, ("zone_small_init: ipers is 0")); memused = keg->uk_ipers * rsize + shsize; wastedspace = UMA_SLAB_SIZE - memused; /* * We can't do OFFPAGE if we're internal or if we've been * asked to not go to the VM for buckets. If we do this we * may end up going to the VM (kmem_map) for slabs which we * do not want to do if we're UMA_ZFLAG_CACHEONLY as a * result of UMA_ZONE_VM, which clearly forbids it. */ if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) || (keg->uk_flags & UMA_ZFLAG_CACHEONLY)) return; if ((wastedspace >= UMA_MAX_WASTE) && (keg->uk_ipers < (UMA_SLAB_SIZE / keg->uk_rsize))) { keg->uk_ipers = UMA_SLAB_SIZE / keg->uk_rsize; KASSERT(keg->uk_ipers <= 255, ("zone_small_init: keg->uk_ipers too high!")); #ifdef UMA_DEBUG printf("UMA decided we need offpage slab headers for " "zone: %s, calculated wastedspace = %d, " "maximum wasted space allowed = %d, " "calculated ipers = %d, " "new wasted space = %d\n", zone->uz_name, wastedspace, UMA_MAX_WASTE, keg->uk_ipers, UMA_SLAB_SIZE - keg->uk_ipers * keg->uk_rsize); #endif keg->uk_flags |= UMA_ZONE_OFFPAGE; if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) keg->uk_flags |= UMA_ZONE_HASH; } } /* * Finish creating a large (> UMA_SLAB_SIZE) uma zone. Just give in and do * OFFPAGE for now. When I can allow for more dynamic slab sizes this will be * more complicated. * * Arguments * zone The zone we should initialize * * Returns * Nothing */ static void zone_large_init(uma_zone_t zone) { uma_keg_t keg; int pages; keg = zone->uz_keg; KASSERT(keg != NULL, ("Keg is null in zone_large_init")); KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0, ("zone_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY zone")); pages = keg->uk_size / UMA_SLAB_SIZE; /* Account for remainder */ if ((pages * UMA_SLAB_SIZE) < keg->uk_size) pages++; keg->uk_ppera = pages; keg->uk_ipers = 1; keg->uk_flags |= UMA_ZONE_OFFPAGE; if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) keg->uk_flags |= UMA_ZONE_HASH; keg->uk_rsize = keg->uk_size; } /* * Keg header ctor. This initializes all fields, locks, etc. And inserts * the keg onto the global keg list. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_kctor_args */ static int keg_ctor(void *mem, int size, void *udata, int flags) { struct uma_kctor_args *arg = udata; uma_keg_t keg = mem; uma_zone_t zone; bzero(keg, size); keg->uk_size = arg->size; keg->uk_init = arg->uminit; keg->uk_fini = arg->fini; keg->uk_align = arg->align; keg->uk_free = 0; keg->uk_pages = 0; keg->uk_flags = arg->flags; keg->uk_allocf = page_alloc; keg->uk_freef = page_free; keg->uk_recurse = 0; keg->uk_slabzone = NULL; /* * The master zone is passed to us at keg-creation time. */ zone = arg->zone; zone->uz_keg = keg; if (arg->flags & UMA_ZONE_VM) keg->uk_flags |= UMA_ZFLAG_CACHEONLY; if (arg->flags & UMA_ZONE_ZINIT) keg->uk_init = zero_init; /* * The +UMA_FRITM_SZ added to uk_size is to account for the * linkage that is added to the size in zone_small_init(). If * we don't account for this here then we may end up in * zone_small_init() with a calculated 'ipers' of 0. */ if (keg->uk_flags & UMA_ZONE_REFCNT) { if ((keg->uk_size+UMA_FRITMREF_SZ) > (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt))) zone_large_init(zone); else zone_small_init(zone); } else { if ((keg->uk_size+UMA_FRITM_SZ) > (UMA_SLAB_SIZE - sizeof(struct uma_slab))) zone_large_init(zone); else zone_small_init(zone); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) { if (keg->uk_flags & UMA_ZONE_REFCNT) keg->uk_slabzone = slabrefzone; else keg->uk_slabzone = slabzone; } /* * If we haven't booted yet we need allocations to go through the * startup cache until the vm is ready. */ if (keg->uk_ppera == 1) { #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = uma_small_alloc; keg->uk_freef = uma_small_free; #endif if (booted == 0) keg->uk_allocf = startup_alloc; } /* * Initialize keg's lock (shared among zones) through * Master zone */ zone->uz_lock = &keg->uk_lock; if (arg->flags & UMA_ZONE_MTXCLASS) ZONE_LOCK_INIT(zone, 1); else ZONE_LOCK_INIT(zone, 0); /* * If we're putting the slab header in the actual page we need to * figure out where in each page it goes. This calculates a right * justified offset into the memory on an ALIGN_PTR boundary. */ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) { u_int totsize; /* Size of the slab struct and free list */ if (keg->uk_flags & UMA_ZONE_REFCNT) totsize = sizeof(struct uma_slab_refcnt) + keg->uk_ipers * UMA_FRITMREF_SZ; else totsize = sizeof(struct uma_slab) + keg->uk_ipers * UMA_FRITM_SZ; if (totsize & UMA_ALIGN_PTR) totsize = (totsize & ~UMA_ALIGN_PTR) + (UMA_ALIGN_PTR + 1); keg->uk_pgoff = UMA_SLAB_SIZE - totsize; if (keg->uk_flags & UMA_ZONE_REFCNT) totsize = keg->uk_pgoff + sizeof(struct uma_slab_refcnt) + keg->uk_ipers * UMA_FRITMREF_SZ; else totsize = keg->uk_pgoff + sizeof(struct uma_slab) + keg->uk_ipers * UMA_FRITM_SZ; /* * The only way the following is possible is if with our * UMA_ALIGN_PTR adjustments we are now bigger than * UMA_SLAB_SIZE. I haven't checked whether this is * mathematically possible for all cases, so we make * sure here anyway. */ if (totsize > UMA_SLAB_SIZE) { printf("zone %s ipers %d rsize %d size %d\n", zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size); panic("UMA slab won't fit.\n"); } } if (keg->uk_flags & UMA_ZONE_HASH) hash_alloc(&keg->uk_hash); #ifdef UMA_DEBUG printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n", zone->uz_name, zone, keg->uk_size, keg->uk_ipers, keg->uk_ppera, keg->uk_pgoff); #endif LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); mtx_lock(&uma_mtx); LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); mtx_unlock(&uma_mtx); return (0); } /* * Zone header ctor. This initializes all fields, locks, etc. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_zctor_args */ static int zone_ctor(void *mem, int size, void *udata, int flags) { struct uma_zctor_args *arg = udata; uma_zone_t zone = mem; uma_zone_t z; uma_keg_t keg; bzero(zone, size); zone->uz_name = arg->name; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; zone->uz_init = NULL; zone->uz_fini = NULL; zone->uz_allocs = 0; zone->uz_frees = 0; zone->uz_fails = 0; zone->uz_fills = zone->uz_count = 0; if (arg->flags & UMA_ZONE_SECONDARY) { KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); keg = arg->keg; zone->uz_keg = keg; zone->uz_init = arg->uminit; zone->uz_fini = arg->fini; zone->uz_lock = &keg->uk_lock; mtx_lock(&uma_mtx); ZONE_LOCK(zone); keg->uk_flags |= UMA_ZONE_SECONDARY; LIST_FOREACH(z, &keg->uk_zones, uz_link) { if (LIST_NEXT(z, uz_link) == NULL) { LIST_INSERT_AFTER(z, zone, uz_link); break; } } ZONE_UNLOCK(zone); mtx_unlock(&uma_mtx); } else if (arg->keg == NULL) { if (uma_kcreate(zone, arg->size, arg->uminit, arg->fini, arg->align, arg->flags) == NULL) return (ENOMEM); } else { struct uma_kctor_args karg; int error; /* We should only be here from uma_startup() */ karg.size = arg->size; karg.uminit = arg->uminit; karg.fini = arg->fini; karg.align = arg->align; karg.flags = arg->flags; karg.zone = zone; error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, flags); if (error) return (error); } keg = zone->uz_keg; zone->uz_lock = &keg->uk_lock; /* * Some internal zones don't have room allocated for the per cpu * caches. If we're internal, bail out here. */ if (keg->uk_flags & UMA_ZFLAG_INTERNAL) { KASSERT((keg->uk_flags & UMA_ZONE_SECONDARY) == 0, ("Secondary zone requested UMA_ZFLAG_INTERNAL")); return (0); } if (keg->uk_flags & UMA_ZONE_MAXBUCKET) zone->uz_count = BUCKET_MAX; else if (keg->uk_ipers <= BUCKET_MAX) zone->uz_count = keg->uk_ipers; else zone->uz_count = BUCKET_MAX; return (0); } /* * Keg header dtor. This frees all data, destroys locks, frees the hash * table and removes the keg from the global list. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void keg_dtor(void *arg, int size, void *udata) { uma_keg_t keg; keg = (uma_keg_t)arg; mtx_lock(&keg->uk_lock); if (keg->uk_free != 0) { printf("Freed UMA keg was not empty (%d items). " " Lost %d pages of memory.\n", keg->uk_free, keg->uk_pages); } mtx_unlock(&keg->uk_lock); if (keg->uk_flags & UMA_ZONE_HASH) hash_free(&keg->uk_hash); mtx_destroy(&keg->uk_lock); } /* * Zone header dtor. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void zone_dtor(void *arg, int size, void *udata) { uma_zone_t zone; uma_keg_t keg; zone = (uma_zone_t)arg; keg = zone->uz_keg; if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone); mtx_lock(&uma_mtx); zone_drain(zone); if (keg->uk_flags & UMA_ZONE_SECONDARY) { LIST_REMOVE(zone, uz_link); /* * XXX there are some races here where * the zone can be drained but zone lock * released and then refilled before we * remove it... we dont care for now */ ZONE_LOCK(zone); if (LIST_EMPTY(&keg->uk_zones)) keg->uk_flags &= ~UMA_ZONE_SECONDARY; ZONE_UNLOCK(zone); mtx_unlock(&uma_mtx); } else { LIST_REMOVE(keg, uk_link); LIST_REMOVE(zone, uz_link); mtx_unlock(&uma_mtx); uma_zfree_internal(kegs, keg, NULL, SKIP_NONE, ZFREE_STATFREE); } zone->uz_keg = NULL; } /* * Traverses every zone in the system and calls a callback * * Arguments: * zfunc A pointer to a function which accepts a zone * as an argument. * * Returns: * Nothing */ static void zone_foreach(void (*zfunc)(uma_zone_t)) { uma_keg_t keg; uma_zone_t zone; mtx_lock(&uma_mtx); LIST_FOREACH(keg, &uma_kegs, uk_link) { LIST_FOREACH(zone, &keg->uk_zones, uz_link) zfunc(zone); } mtx_unlock(&uma_mtx); } /* Public functions */ /* See uma.h */ void uma_startup(void *bootmem, int boot_pages) { struct uma_zctor_args args; uma_slab_t slab; u_int slabsize; u_int objsize, totsize, wsize; int i; #ifdef UMA_DEBUG printf("Creating uma keg headers zone and keg.\n"); #endif mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF); /* * Figure out the maximum number of items-per-slab we'll have if * we're using the OFFPAGE slab header to track free items, given * all possible object sizes and the maximum desired wastage * (UMA_MAX_WASTE). * * We iterate until we find an object size for * which the calculated wastage in zone_small_init() will be * enough to warrant OFFPAGE. Since wastedspace versus objsize * is an overall increasing see-saw function, we find the smallest * objsize such that the wastage is always acceptable for objects * with that objsize or smaller. Since a smaller objsize always * generates a larger possible uma_max_ipers, we use this computed * objsize to calculate the largest ipers possible. Since the * ipers calculated for OFFPAGE slab headers is always larger than * the ipers initially calculated in zone_small_init(), we use * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to * obtain the maximum ipers possible for offpage slab headers. * * It should be noted that ipers versus objsize is an inversly * proportional function which drops off rather quickly so as * long as our UMA_MAX_WASTE is such that the objsize we calculate * falls into the portion of the inverse relation AFTER the steep * falloff, then uma_max_ipers shouldn't be too high (~10 on i386). * * Note that we have 8-bits (1 byte) to use as a freelist index * inside the actual slab header itself and this is enough to * accomodate us. In the worst case, a UMA_SMALLEST_UNIT sized * object with offpage slab header would have ipers = * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is * 1 greater than what our byte-integer freelist index can * accomodate, but we know that this situation never occurs as * for UMA_SMALLEST_UNIT-sized objects, we will never calculate * that we need to go to offpage slab headers. Or, if we do, * then we trap that condition below and panic in the INVARIANTS case. */ wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) - UMA_MAX_WASTE; totsize = wsize; objsize = UMA_SMALLEST_UNIT; while (totsize >= wsize) { totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / (objsize + UMA_FRITM_SZ); totsize *= (UMA_FRITM_SZ + objsize); objsize++; } if (objsize > UMA_SMALLEST_UNIT) objsize--; uma_max_ipers = UMA_SLAB_SIZE / objsize; wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - UMA_MAX_WASTE; totsize = wsize; objsize = UMA_SMALLEST_UNIT; while (totsize >= wsize) { totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) / (objsize + UMA_FRITMREF_SZ); totsize *= (UMA_FRITMREF_SZ + objsize); objsize++; } if (objsize > UMA_SMALLEST_UNIT) objsize--; uma_max_ipers_ref = UMA_SLAB_SIZE / objsize; KASSERT((uma_max_ipers_ref <= 255) && (uma_max_ipers <= 255), ("uma_startup: calculated uma_max_ipers values too large!")); #ifdef UMA_DEBUG printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers); printf("Calculated uma_max_ipers_slab (for OFFPAGE) is %d\n", uma_max_ipers_ref); #endif /* "manually" create the initial zone */ args.name = "UMA Kegs"; args.size = sizeof(struct uma_keg); args.ctor = keg_ctor; args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = &masterkeg; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK); #ifdef UMA_DEBUG printf("Filling boot free list.\n"); #endif for (i = 0; i < boot_pages; i++) { slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE)); slab->us_data = (u_int8_t *)slab; slab->us_flags = UMA_SLAB_BOOT; LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link); } mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF); #ifdef UMA_DEBUG printf("Creating uma zone headers zone and keg.\n"); #endif args.name = "UMA Zones"; args.size = sizeof(struct uma_zone) + (sizeof(struct uma_cache) * (mp_maxid + 1)); args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = NULL; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK); #ifdef UMA_DEBUG printf("Initializing pcpu cache locks.\n"); #endif #ifdef UMA_DEBUG printf("Creating slab and hash zones.\n"); #endif /* * This is the max number of free list items we'll have with * offpage slabs. */ slabsize = uma_max_ipers * UMA_FRITM_SZ; slabsize += sizeof(struct uma_slab); /* Now make a zone for slab headers */ slabzone = uma_zcreate("UMA Slabs", slabsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); /* * We also create a zone for the bigger slabs with reference * counts in them, to accomodate UMA_ZONE_REFCNT zones. */ slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ; slabsize += sizeof(struct uma_slab_refcnt); slabrefzone = uma_zcreate("UMA RCntSlabs", slabsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); bucket_init(); #ifdef UMA_MD_SMALL_ALLOC booted = 1; #endif #ifdef UMA_DEBUG printf("UMA startup complete.\n"); #endif } /* see uma.h */ void uma_startup2(void) { booted = 1; bucket_enable(); #ifdef UMA_DEBUG printf("UMA startup2 complete.\n"); #endif } /* * Initialize our callout handle * */ static void uma_startup3(void) { #ifdef UMA_DEBUG printf("Starting callout.\n"); #endif callout_init(&uma_callout, CALLOUT_MPSAFE); callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); #ifdef UMA_DEBUG printf("UMA startup3 complete.\n"); #endif } static uma_zone_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, u_int32_t flags) { struct uma_kctor_args args; args.size = size; args.uminit = uminit; args.fini = fini; args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align; args.flags = flags; args.zone = zone; return (uma_zalloc_internal(kegs, &args, M_WAITOK)); } /* See uma.h */ void uma_set_align(int align) { if (align != UMA_ALIGN_CACHE) uma_align_cache = align; } /* See uma.h */ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, u_int32_t flags) { struct uma_zctor_args args; /* This stuff is essential for the zone ctor */ args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = uminit; args.fini = fini; args.align = align; args.flags = flags; args.keg = NULL; return (uma_zalloc_internal(zones, &args, M_WAITOK)); } /* See uma.h */ uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t master) { struct uma_zctor_args args; args.name = name; args.size = master->uz_keg->uk_size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.align = master->uz_keg->uk_align; args.flags = master->uz_keg->uk_flags | UMA_ZONE_SECONDARY; args.keg = master->uz_keg; return (uma_zalloc_internal(zones, &args, M_WAITOK)); } /* See uma.h */ void uma_zdestroy(uma_zone_t zone) { uma_zfree_internal(zones, zone, NULL, SKIP_NONE, ZFREE_STATFREE); } /* See uma.h */ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { void *item; uma_cache_t cache; uma_bucket_t bucket; int cpu; /* This is the fast path allocation */ #ifdef UMA_DEBUG_ALLOC_1 printf("Allocating one item from %s(%p)\n", zone->uz_name, zone); #endif CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread, zone->uz_name, flags); if (flags & M_WAITOK) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_arg: zone \"%s\"", zone->uz_name); } /* * If possible, allocate from the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to allocate from * the current cache; when we re-acquire the critical section, we * must detect and handle migration if it has occurred. */ zalloc_restart: critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; zalloc_start: bucket = cache->uc_allocbucket; if (bucket) { if (bucket->ub_cnt > 0) { bucket->ub_cnt--; item = bucket->ub_bucket[bucket->ub_cnt]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt] = NULL; #endif KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); cache->uc_allocs++; critical_exit(); #ifdef INVARIANTS ZONE_LOCK(zone); uma_dbg_alloc(zone, NULL, item); ZONE_UNLOCK(zone); #endif if (zone->uz_ctor != NULL) { if (zone->uz_ctor(item, zone->uz_keg->uk_size, udata, flags) != 0) { uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFAIL | ZFREE_STATFREE); return (NULL); } } if (flags & M_ZERO) bzero(item, zone->uz_keg->uk_size); return (item); } else if (cache->uc_freebucket) { /* * We have run out of items in our allocbucket. * See if we can switch with our free bucket. */ if (cache->uc_freebucket->ub_cnt > 0) { #ifdef UMA_DEBUG_ALLOC printf("uma_zalloc: Swapping empty with" " alloc.\n"); #endif bucket = cache->uc_freebucket; cache->uc_freebucket = cache->uc_allocbucket; cache->uc_allocbucket = bucket; goto zalloc_start; } } } /* * Attempt to retrieve the item from the per-CPU cache has failed, so * we must go back to the zone. This requires the zone lock, so we * must drop the critical section, then re-acquire it when we go back * to the cache. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ critical_exit(); ZONE_LOCK(zone); critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; bucket = cache->uc_allocbucket; if (bucket != NULL) { if (bucket->ub_cnt > 0) { ZONE_UNLOCK(zone); goto zalloc_start; } bucket = cache->uc_freebucket; if (bucket != NULL && bucket->ub_cnt > 0) { ZONE_UNLOCK(zone); goto zalloc_start; } } /* Since we have locked the zone we may as well send back our stats */ zone->uz_allocs += cache->uc_allocs; cache->uc_allocs = 0; zone->uz_frees += cache->uc_frees; cache->uc_frees = 0; /* Our old one is now a free bucket */ if (cache->uc_allocbucket) { KASSERT(cache->uc_allocbucket->ub_cnt == 0, ("uma_zalloc_arg: Freeing a non free bucket.")); LIST_INSERT_HEAD(&zone->uz_free_bucket, cache->uc_allocbucket, ub_link); cache->uc_allocbucket = NULL; } /* Check the free list for a new alloc bucket */ if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) { KASSERT(bucket->ub_cnt != 0, ("uma_zalloc_arg: Returning an empty bucket.")); LIST_REMOVE(bucket, ub_link); cache->uc_allocbucket = bucket; ZONE_UNLOCK(zone); goto zalloc_start; } /* We are no longer associated with this CPU. */ critical_exit(); /* Bump up our uz_count so we get here less */ if (zone->uz_count < BUCKET_MAX) zone->uz_count++; /* * Now lets just fill a bucket and put it on the free list. If that * works we'll restart the allocation from the begining. */ if (uma_zalloc_bucket(zone, flags)) { ZONE_UNLOCK(zone); goto zalloc_restart; } ZONE_UNLOCK(zone); /* * We may not be able to get a bucket so return an actual item. */ #ifdef UMA_DEBUG printf("uma_zalloc_arg: Bucketzone returned NULL\n"); #endif return (uma_zalloc_internal(zone, udata, flags)); } static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags) { uma_slab_t slab; uma_keg_t keg; keg = zone->uz_keg; /* * This is to prevent us from recursively trying to allocate * buckets. The problem is that if an allocation forces us to * grab a new bucket we will call page_alloc, which will go off * and cause the vm to allocate vm_map_entries. If we need new * buckets there too we will recurse in kmem_alloc and bad * things happen. So instead we return a NULL bucket, and make * the code that allocates buckets smart enough to deal with it * * XXX: While we want this protection for the bucket zones so that * recursion from the VM is handled (and the calling code that * allocates buckets knows how to deal with it), we do not want * to prevent allocation from the slab header zones (slabzone * and slabrefzone) if uk_recurse is not zero for them. The * reason is that it could lead to NULL being returned for * slab header allocations even in the M_WAITOK case, and the * caller can't handle that. */ if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0) if (zone != slabzone && zone != slabrefzone && zone != zones) return (NULL); slab = NULL; for (;;) { /* * Find a slab with some space. Prefer slabs that are partially * used over those that are totally full. This helps to reduce * fragmentation. */ if (keg->uk_free != 0) { if (!LIST_EMPTY(&keg->uk_part_slab)) { slab = LIST_FIRST(&keg->uk_part_slab); } else { slab = LIST_FIRST(&keg->uk_free_slab); LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); } return (slab); } /* * M_NOVM means don't ask at all! */ if (flags & M_NOVM) break; if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) { keg->uk_flags |= UMA_ZFLAG_FULL; if (flags & M_NOWAIT) break; else msleep(keg, &keg->uk_lock, PVM, "zonelimit", 0); continue; } keg->uk_recurse++; slab = slab_zalloc(zone, flags); keg->uk_recurse--; /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove * at least one item. */ if (slab) { LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); return (slab); } /* * We might not have been able to get a slab but another cpu * could have while we were unlocked. Check again before we * fail. */ if (flags & M_NOWAIT) flags |= M_NOVM; } return (slab); } static void * uma_slab_alloc(uma_zone_t zone, uma_slab_t slab) { uma_keg_t keg; uma_slabrefcnt_t slabref; void *item; u_int8_t freei; keg = zone->uz_keg; freei = slab->us_firstfree; if (keg->uk_flags & UMA_ZONE_REFCNT) { slabref = (uma_slabrefcnt_t)slab; slab->us_firstfree = slabref->us_freelist[freei].us_item; } else { slab->us_firstfree = slab->us_freelist[freei].us_item; } item = slab->us_data + (keg->uk_rsize * freei); slab->us_freecount--; keg->uk_free--; #ifdef INVARIANTS uma_dbg_alloc(zone, slab, item); #endif /* Move this slab to the full list */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link); } return (item); } static int uma_zalloc_bucket(uma_zone_t zone, int flags) { uma_bucket_t bucket; uma_slab_t slab; int16_t saved; int max, origflags = flags; /* * Try this zone's free list first so we don't allocate extra buckets. */ if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { KASSERT(bucket->ub_cnt == 0, ("uma_zalloc_bucket: Bucket on free list is not empty.")); LIST_REMOVE(bucket, ub_link); } else { int bflags; bflags = (flags & ~M_ZERO); if (zone->uz_keg->uk_flags & UMA_ZFLAG_CACHEONLY) bflags |= M_NOVM; ZONE_UNLOCK(zone); bucket = bucket_alloc(zone->uz_count, bflags); ZONE_LOCK(zone); } if (bucket == NULL) return (0); #ifdef SMP /* * This code is here to limit the number of simultaneous bucket fills * for any given zone to the number of per cpu caches in this zone. This * is done so that we don't allocate more memory than we really need. */ if (zone->uz_fills >= mp_ncpus) goto done; #endif zone->uz_fills++; max = MIN(bucket->ub_entries, zone->uz_count); /* Try to keep the buckets totally full */ saved = bucket->ub_cnt; while (bucket->ub_cnt < max && (slab = uma_zone_slab(zone, flags)) != NULL) { while (slab->us_freecount && bucket->ub_cnt < max) { bucket->ub_bucket[bucket->ub_cnt++] = uma_slab_alloc(zone, slab); } /* Don't block on the next fill */ flags |= M_NOWAIT; } /* * We unlock here because we need to call the zone's init. * It should be safe to unlock because the slab dealt with * above is already on the appropriate list within the keg * and the bucket we filled is not yet on any list, so we * own it. */ if (zone->uz_init != NULL) { int i; ZONE_UNLOCK(zone); for (i = saved; i < bucket->ub_cnt; i++) if (zone->uz_init(bucket->ub_bucket[i], zone->uz_keg->uk_size, origflags) != 0) break; /* * If we couldn't initialize the whole bucket, put the * rest back onto the freelist. */ if (i != bucket->ub_cnt) { int j; for (j = i; j < bucket->ub_cnt; j++) { uma_zfree_internal(zone, bucket->ub_bucket[j], NULL, SKIP_FINI, 0); #ifdef INVARIANTS bucket->ub_bucket[j] = NULL; #endif } bucket->ub_cnt = i; } ZONE_LOCK(zone); } zone->uz_fills--; if (bucket->ub_cnt != 0) { LIST_INSERT_HEAD(&zone->uz_full_bucket, bucket, ub_link); return (1); } #ifdef SMP done: #endif bucket_free(bucket); return (0); } /* * Allocates an item for an internal zone * * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns * NULL if there is no memory and M_NOWAIT is set * An item if successful */ static void * uma_zalloc_internal(uma_zone_t zone, void *udata, int flags) { uma_keg_t keg; uma_slab_t slab; void *item; item = NULL; keg = zone->uz_keg; #ifdef UMA_DEBUG_ALLOC printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone); #endif ZONE_LOCK(zone); slab = uma_zone_slab(zone, flags); if (slab == NULL) { zone->uz_fails++; ZONE_UNLOCK(zone); return (NULL); } item = uma_slab_alloc(zone, slab); zone->uz_allocs++; ZONE_UNLOCK(zone); /* * We have to call both the zone's init (not the keg's init) * and the zone's ctor. This is because the item is going from * a keg slab directly to the user, and the user is expecting it * to be both zone-init'd as well as zone-ctor'd. */ if (zone->uz_init != NULL) { if (zone->uz_init(item, keg->uk_size, flags) != 0) { uma_zfree_internal(zone, item, udata, SKIP_FINI, ZFREE_STATFAIL | ZFREE_STATFREE); return (NULL); } } if (zone->uz_ctor != NULL) { if (zone->uz_ctor(item, keg->uk_size, udata, flags) != 0) { uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFAIL | ZFREE_STATFREE); return (NULL); } } if (flags & M_ZERO) bzero(item, keg->uk_size); return (item); } /* See uma.h */ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_keg_t keg; uma_cache_t cache; uma_bucket_t bucket; int bflags; int cpu; keg = zone->uz_keg; #ifdef UMA_DEBUG_ALLOC_1 printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone); #endif CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread, zone->uz_name); if (zone->uz_dtor) zone->uz_dtor(item, keg->uk_size, udata); #ifdef INVARIANTS ZONE_LOCK(zone); if (keg->uk_flags & UMA_ZONE_MALLOC) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); ZONE_UNLOCK(zone); #endif /* * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. */ if (keg->uk_flags & UMA_ZFLAG_FULL) goto zfree_internal; /* * If possible, free to the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to free to the * current cache; when we re-acquire the critical section, we must * detect and handle migration if it has occurred. */ zfree_restart: critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; zfree_start: bucket = cache->uc_freebucket; if (bucket) { /* * Do we have room in our bucket? It is OK for this uz count * check to be slightly out of sync. */ if (bucket->ub_cnt < bucket->ub_entries) { KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL, ("uma_zfree: Freeing to non free bucket index.")); bucket->ub_bucket[bucket->ub_cnt] = item; bucket->ub_cnt++; cache->uc_frees++; critical_exit(); return; } else if (cache->uc_allocbucket) { #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Swapping buckets.\n"); #endif /* * We have run out of space in our freebucket. * See if we can switch with our alloc bucket. */ if (cache->uc_allocbucket->ub_cnt < cache->uc_freebucket->ub_cnt) { bucket = cache->uc_freebucket; cache->uc_freebucket = cache->uc_allocbucket; cache->uc_allocbucket = bucket; goto zfree_start; } } } /* * We can get here for two reasons: * * 1) The buckets are NULL * 2) The alloc and free buckets are both somewhat full. * * We must go back the zone, which requires acquiring the zone lock, * which in turn means we must release and re-acquire the critical * section. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ critical_exit(); ZONE_LOCK(zone); critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; if (cache->uc_freebucket != NULL) { if (cache->uc_freebucket->ub_cnt < cache->uc_freebucket->ub_entries) { ZONE_UNLOCK(zone); goto zfree_start; } if (cache->uc_allocbucket != NULL && (cache->uc_allocbucket->ub_cnt < cache->uc_freebucket->ub_cnt)) { ZONE_UNLOCK(zone); goto zfree_start; } } /* Since we have locked the zone we may as well send back our stats */ zone->uz_allocs += cache->uc_allocs; cache->uc_allocs = 0; zone->uz_frees += cache->uc_frees; cache->uc_frees = 0; bucket = cache->uc_freebucket; cache->uc_freebucket = NULL; /* Can we throw this on the zone full list? */ if (bucket != NULL) { #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Putting old bucket on the free list.\n"); #endif /* ub_cnt is pointing to the last free item */ KASSERT(bucket->ub_cnt != 0, ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n")); LIST_INSERT_HEAD(&zone->uz_full_bucket, bucket, ub_link); } if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); ZONE_UNLOCK(zone); cache->uc_freebucket = bucket; goto zfree_start; } /* We are no longer associated with this CPU. */ critical_exit(); /* And the zone.. */ ZONE_UNLOCK(zone); #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Allocating new free bucket.\n"); #endif bflags = M_NOWAIT; if (keg->uk_flags & UMA_ZFLAG_CACHEONLY) bflags |= M_NOVM; bucket = bucket_alloc(zone->uz_count, bflags); if (bucket) { ZONE_LOCK(zone); LIST_INSERT_HEAD(&zone->uz_free_bucket, bucket, ub_link); ZONE_UNLOCK(zone); goto zfree_restart; } /* * If nothing else caught this, we'll just do an internal free. */ zfree_internal: uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE); return; } /* * Frees an item to an INTERNAL zone or allocates a free bucket * * Arguments: * zone The zone to free to * item The item we're freeing * udata User supplied data for the dtor * skip Skip dtors and finis */ static void uma_zfree_internal(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip, int flags) { uma_slab_t slab; uma_slabrefcnt_t slabref; uma_keg_t keg; u_int8_t *mem; u_int8_t freei; keg = zone->uz_keg; if (skip < SKIP_DTOR && zone->uz_dtor) zone->uz_dtor(item, keg->uk_size, udata); if (skip < SKIP_FINI && zone->uz_fini) zone->uz_fini(item, keg->uk_size); ZONE_LOCK(zone); if (flags & ZFREE_STATFAIL) zone->uz_fails++; if (flags & ZFREE_STATFREE) zone->uz_frees++; if (!(keg->uk_flags & UMA_ZONE_MALLOC)) { mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK)); if (keg->uk_flags & UMA_ZONE_HASH) slab = hash_sfind(&keg->uk_hash, mem); else { mem += keg->uk_pgoff; slab = (uma_slab_t)mem; } } else { slab = (uma_slab_t)udata; } /* Do we need to remove from any lists? */ if (slab->us_freecount+1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); } /* Slab management stuff */ freei = ((unsigned long)item - (unsigned long)slab->us_data) / keg->uk_rsize; #ifdef INVARIANTS if (!skip) uma_dbg_free(zone, slab, item); #endif if (keg->uk_flags & UMA_ZONE_REFCNT) { slabref = (uma_slabrefcnt_t)slab; slabref->us_freelist[freei].us_item = slab->us_firstfree; } else { slab->us_freelist[freei].us_item = slab->us_firstfree; } slab->us_firstfree = freei; slab->us_freecount++; /* Zone statistics */ keg->uk_free++; if (keg->uk_flags & UMA_ZFLAG_FULL) { if (keg->uk_pages < keg->uk_maxpages) keg->uk_flags &= ~UMA_ZFLAG_FULL; /* * We can handle one more allocation. Since we're clearing ZFLAG_FULL, * wake up all procs blocked on pages. This should be uncommon, so * keeping this simple for now (rather than adding count of blocked * threads etc). */ wakeup(keg); } ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_max(uma_zone_t zone, int nitems) { uma_keg_t keg; keg = zone->uz_keg; ZONE_LOCK(zone); if (keg->uk_ppera > 1) keg->uk_maxpages = nitems * keg->uk_ppera; else keg->uk_maxpages = nitems / keg->uk_ipers; if (keg->uk_maxpages * keg->uk_ipers < nitems) keg->uk_maxpages++; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_init on non-empty keg")); zone->uz_keg->uk_init = uminit; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_fini(uma_zone_t zone, uma_fini fini) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_fini on non-empty keg")); zone->uz_keg->uk_fini = fini; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_zinit on non-empty keg")); zone->uz_init = zinit; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_zfini on non-empty keg")); zone->uz_fini = zfini; ZONE_UNLOCK(zone); } /* See uma.h */ /* XXX uk_freef is not actually used with the zone locked */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef) { ZONE_LOCK(zone); zone->uz_keg->uk_freef = freef; ZONE_UNLOCK(zone); } /* See uma.h */ /* XXX uk_allocf is not actually used with the zone locked */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { ZONE_LOCK(zone); zone->uz_keg->uk_flags |= UMA_ZFLAG_PRIVALLOC; zone->uz_keg->uk_allocf = allocf; ZONE_UNLOCK(zone); } /* See uma.h */ int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count) { uma_keg_t keg; vm_offset_t kva; int pages; keg = zone->uz_keg; pages = count / keg->uk_ipers; if (pages * keg->uk_ipers < count) pages++; kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE); if (kva == 0) return (0); if (obj == NULL) { obj = vm_object_allocate(OBJT_DEFAULT, pages); } else { VM_OBJECT_LOCK_INIT(obj, "uma object"); _vm_object_allocate(OBJT_DEFAULT, pages, obj); } ZONE_LOCK(zone); keg->uk_kva = kva; keg->uk_obj = obj; keg->uk_maxpages = pages; keg->uk_allocf = obj_alloc; keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC; ZONE_UNLOCK(zone); return (1); } /* See uma.h */ void uma_prealloc(uma_zone_t zone, int items) { int slabs; uma_slab_t slab; uma_keg_t keg; keg = zone->uz_keg; ZONE_LOCK(zone); slabs = items / keg->uk_ipers; if (slabs * keg->uk_ipers < items) slabs++; while (slabs > 0) { slab = slab_zalloc(zone, M_WAITOK); LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); slabs--; } ZONE_UNLOCK(zone); } /* See uma.h */ u_int32_t * uma_find_refcnt(uma_zone_t zone, void *item) { uma_slabrefcnt_t slabref; uma_keg_t keg; u_int32_t *refcnt; int idx; keg = zone->uz_keg; slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK)); KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT, ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT")); idx = ((unsigned long)item - (unsigned long)slabref->us_data) / keg->uk_rsize; refcnt = &slabref->us_freelist[idx].us_refcnt; return refcnt; } /* See uma.h */ void uma_reclaim(void) { #ifdef UMA_DEBUG printf("UMA: vm asked us to release pages!\n"); #endif bucket_enable(); zone_foreach(zone_drain); /* * Some slabs may have been freed but this zone will be visited early * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ zone_drain(slabzone); zone_drain(slabrefzone); bucket_zone_drain(); } /* See uma.h */ int uma_zone_exhausted(uma_zone_t zone) { int full; ZONE_LOCK(zone); full = (zone->uz_keg->uk_flags & UMA_ZFLAG_FULL); ZONE_UNLOCK(zone); return (full); } int uma_zone_exhausted_nolock(uma_zone_t zone) { return (zone->uz_keg->uk_flags & UMA_ZFLAG_FULL); } void * uma_large_malloc(int size, int wait) { void *mem; uma_slab_t slab; u_int8_t flags; slab = uma_zalloc_internal(slabzone, NULL, wait); if (slab == NULL) return (NULL); mem = page_alloc(NULL, size, &flags, wait); if (mem) { vsetslab((vm_offset_t)mem, slab); slab->us_data = mem; slab->us_flags = flags | UMA_SLAB_MALLOC; slab->us_size = size; } else { uma_zfree_internal(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFAIL | ZFREE_STATFREE); } return (mem); } void uma_large_free(uma_slab_t slab) { vsetobj((vm_offset_t)slab->us_data, kmem_object); page_free(slab->us_data, slab->us_size, slab->us_flags); uma_zfree_internal(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); } void uma_print_stats(void) { zone_foreach(uma_print_zone); } static void slab_print(uma_slab_t slab) { printf("slab: keg %p, data %p, freecount %d, firstfree %d\n", slab->us_keg, slab->us_data, slab->us_freecount, slab->us_firstfree); } static void cache_print(uma_cache_t cache) { printf("alloc: %p(%d), free: %p(%d)\n", cache->uc_allocbucket, cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0, cache->uc_freebucket, cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0); } void uma_print_zone(uma_zone_t zone) { uma_cache_t cache; uma_keg_t keg; uma_slab_t slab; int i; keg = zone->uz_keg; printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n", zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags, keg->uk_ipers, keg->uk_ppera, (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free); printf("Part slabs:\n"); LIST_FOREACH(slab, &keg->uk_part_slab, us_link) slab_print(slab); printf("Free slabs:\n"); LIST_FOREACH(slab, &keg->uk_free_slab, us_link) slab_print(slab); printf("Full slabs:\n"); LIST_FOREACH(slab, &keg->uk_full_slab, us_link) slab_print(slab); for (i = 0; i <= mp_maxid; i++) { if (CPU_ABSENT(i)) continue; cache = &zone->uz_cpu[i]; printf("CPU %d Cache:\n", i); cache_print(cache); } } #ifdef DDB /* * Generate statistics across both the zone and its per-cpu cache's. Return * desired statistics if the pointer is non-NULL for that statistic. * * Note: does not update the zone statistics, as it can't safely clear the * per-CPU cache statistic. * * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't * safe from off-CPU; we should modify the caches to track this information * directly so that we don't have to. */ static void uma_zone_sumstat(uma_zone_t z, int *cachefreep, u_int64_t *allocsp, u_int64_t *freesp) { uma_cache_t cache; u_int64_t allocs, frees; int cachefree, cpu; allocs = frees = 0; cachefree = 0; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; cache = &z->uz_cpu[cpu]; if (cache->uc_allocbucket != NULL) cachefree += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) cachefree += cache->uc_freebucket->ub_cnt; allocs += cache->uc_allocs; frees += cache->uc_frees; } allocs += z->uz_allocs; frees += z->uz_frees; if (cachefreep != NULL) *cachefreep = cachefree; if (allocsp != NULL) *allocsp = allocs; if (freesp != NULL) *freesp = frees; } #endif /* DDB */ static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) { uma_keg_t kz; uma_zone_t z; int count; count = 0; mtx_lock(&uma_mtx); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } mtx_unlock(&uma_mtx); return (sysctl_handle_int(oidp, &count, 0, req)); } static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) { struct uma_stream_header ush; struct uma_type_header uth; struct uma_percpu_stat ups; uma_bucket_t bucket; struct sbuf sbuf; uma_cache_t cache; uma_keg_t kz; uma_zone_t z; char *buffer; int buflen, count, error, i; mtx_lock(&uma_mtx); restart: mtx_assert(&uma_mtx, MA_OWNED); count = 0; LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } mtx_unlock(&uma_mtx); buflen = sizeof(ush) + count * (sizeof(uth) + sizeof(ups) * (mp_maxid + 1)) + 1; buffer = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); mtx_lock(&uma_mtx); i = 0; LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) i++; } if (i > count) { free(buffer, M_TEMP); goto restart; } count = i; sbuf_new(&sbuf, buffer, buflen, SBUF_FIXEDLEN); /* * Insert stream header. */ bzero(&ush, sizeof(ush)); ush.ush_version = UMA_STREAM_VERSION; ush.ush_maxcpus = (mp_maxid + 1); ush.ush_count = count; if (sbuf_bcat(&sbuf, &ush, sizeof(ush)) < 0) { mtx_unlock(&uma_mtx); error = ENOMEM; goto out; } LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { bzero(&uth, sizeof(uth)); ZONE_LOCK(z); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_align = kz->uk_align; uth.uth_pages = kz->uk_pages; uth.uth_keg_free = kz->uk_free; uth.uth_size = kz->uk_size; uth.uth_rsize = kz->uk_rsize; uth.uth_maxpages = kz->uk_maxpages; if (kz->uk_ppera > 1) uth.uth_limit = kz->uk_maxpages / kz->uk_ppera; else uth.uth_limit = kz->uk_maxpages * kz->uk_ipers; /* * A zone is secondary is it is not the first entry * on the keg's zone list. */ if ((kz->uk_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z)) uth.uth_zone_flags = UTH_ZONE_SECONDARY; LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) uth.uth_zone_free += bucket->ub_cnt; uth.uth_allocs = z->uz_allocs; uth.uth_frees = z->uz_frees; uth.uth_fails = z->uz_fails; if (sbuf_bcat(&sbuf, &uth, sizeof(uth)) < 0) { ZONE_UNLOCK(z); mtx_unlock(&uma_mtx); error = ENOMEM; goto out; } /* * While it is not normally safe to access the cache * bucket pointers while not on the CPU that owns the * cache, we only allow the pointers to be exchanged * without the zone lock held, not invalidated, so * accept the possible race associated with bucket * exchange during monitoring. */ for (i = 0; i < (mp_maxid + 1); i++) { bzero(&ups, sizeof(ups)); if (kz->uk_flags & UMA_ZFLAG_INTERNAL) goto skip; if (CPU_ABSENT(i)) goto skip; cache = &z->uz_cpu[i]; if (cache->uc_allocbucket != NULL) ups.ups_cache_free += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) ups.ups_cache_free += cache->uc_freebucket->ub_cnt; ups.ups_allocs = cache->uc_allocs; ups.ups_frees = cache->uc_frees; skip: if (sbuf_bcat(&sbuf, &ups, sizeof(ups)) < 0) { ZONE_UNLOCK(z); mtx_unlock(&uma_mtx); error = ENOMEM; goto out; } } ZONE_UNLOCK(z); } } mtx_unlock(&uma_mtx); sbuf_finish(&sbuf); error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf)); out: free(buffer, M_TEMP); return (error); } #ifdef DDB DB_SHOW_COMMAND(uma, db_show_uma) { u_int64_t allocs, frees; uma_bucket_t bucket; uma_keg_t kz; uma_zone_t z; int cachefree; db_printf("%18s %8s %8s %8s %12s\n", "Zone", "Size", "Used", "Free", "Requests"); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { allocs = z->uz_allocs; frees = z->uz_frees; cachefree = 0; } else uma_zone_sumstat(z, &cachefree, &allocs, &frees); if (!((kz->uk_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z))) cachefree += kz->uk_free; LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) cachefree += bucket->ub_cnt; db_printf("%18s %8ju %8jd %8d %12ju\n", z->uz_name, (uintmax_t)kz->uk_size, (intmax_t)(allocs - frees), cachefree, (uintmax_t)allocs); } } } #endif