Index: projects/numa2/lib/libmemstat/memstat_uma.c =================================================================== --- projects/numa2/lib/libmemstat/memstat_uma.c (revision 321505) +++ projects/numa2/lib/libmemstat/memstat_uma.c (revision 321506) @@ -1,463 +1,475 @@ /*- * Copyright (c) 2005-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "memstat.h" #include "memstat_internal.h" static struct nlist namelist[] = { #define X_UMA_KEGS 0 { .n_name = "_uma_kegs" }, #define X_MP_MAXID 1 { .n_name = "_mp_maxid" }, #define X_ALL_CPUS 2 { .n_name = "_all_cpus" }, +#define X_VM_NDOMAINS 3 + { .n_name = "_vm_ndomains" }, { .n_name = "" }, }; /* * Extract uma(9) statistics from the running kernel, and store all memory * type information in the passed list. For each type, check the list for an * existing entry with the right name/allocator -- if present, update that * entry. Otherwise, add a new entry. On error, the entire list will be * cleared, as entries will be in an inconsistent state. * * To reduce the level of work for a list that starts empty, we keep around a * hint as to whether it was empty when we began, so we can avoid searching * the list for entries to update. Updates are O(n^2) due to searching for * each entry before adding it. */ int memstat_sysctl_uma(struct memory_type_list *list, int flags) { struct uma_stream_header *ushp; struct uma_type_header *uthp; struct uma_percpu_stat *upsp; struct memory_type *mtp; int count, hint_dontsearch, i, j, maxcpus, maxid; char *buffer, *p; size_t size; hint_dontsearch = LIST_EMPTY(&list->mtl_list); /* * Query the number of CPUs, number of malloc types so that we can * guess an initial buffer size. We loop until we succeed or really * fail. Note that the value of maxcpus we query using sysctl is not * the version we use when processing the real data -- that is read * from the header. */ retry: size = sizeof(maxid); if (sysctlbyname("kern.smp.maxid", &maxid, &size, NULL, 0) < 0) { if (errno == EACCES || errno == EPERM) list->mtl_error = MEMSTAT_ERROR_PERMISSION; else list->mtl_error = MEMSTAT_ERROR_DATAERROR; return (-1); } if (size != sizeof(maxid)) { list->mtl_error = MEMSTAT_ERROR_DATAERROR; return (-1); } size = sizeof(count); if (sysctlbyname("vm.zone_count", &count, &size, NULL, 0) < 0) { if (errno == EACCES || errno == EPERM) list->mtl_error = MEMSTAT_ERROR_PERMISSION; else list->mtl_error = MEMSTAT_ERROR_VERSION; return (-1); } if (size != sizeof(count)) { list->mtl_error = MEMSTAT_ERROR_DATAERROR; return (-1); } size = sizeof(*uthp) + count * (sizeof(*uthp) + sizeof(*upsp) * (maxid + 1)); buffer = malloc(size); if (buffer == NULL) { list->mtl_error = MEMSTAT_ERROR_NOMEMORY; return (-1); } if (sysctlbyname("vm.zone_stats", buffer, &size, NULL, 0) < 0) { /* * XXXRW: ENOMEM is an ambiguous return, we should bound the * number of loops, perhaps. */ if (errno == ENOMEM) { free(buffer); goto retry; } if (errno == EACCES || errno == EPERM) list->mtl_error = MEMSTAT_ERROR_PERMISSION; else list->mtl_error = MEMSTAT_ERROR_VERSION; free(buffer); return (-1); } if (size == 0) { free(buffer); return (0); } if (size < sizeof(*ushp)) { list->mtl_error = MEMSTAT_ERROR_VERSION; free(buffer); return (-1); } p = buffer; ushp = (struct uma_stream_header *)p; p += sizeof(*ushp); if (ushp->ush_version != UMA_STREAM_VERSION) { list->mtl_error = MEMSTAT_ERROR_VERSION; free(buffer); return (-1); } /* * For the remainder of this function, we are quite trusting about * the layout of structures and sizes, since we've determined we have * a matching version and acceptable CPU count. */ maxcpus = ushp->ush_maxcpus; count = ushp->ush_count; for (i = 0; i < count; i++) { uthp = (struct uma_type_header *)p; p += sizeof(*uthp); if (hint_dontsearch == 0) { mtp = memstat_mtl_find(list, ALLOCATOR_UMA, uthp->uth_name); } else mtp = NULL; if (mtp == NULL) mtp = _memstat_mt_allocate(list, ALLOCATOR_UMA, uthp->uth_name, maxid + 1); if (mtp == NULL) { _memstat_mtl_empty(list); free(buffer); list->mtl_error = MEMSTAT_ERROR_NOMEMORY; return (-1); } /* * Reset the statistics on a current node. */ _memstat_mt_reset_stats(mtp, maxid + 1); mtp->mt_numallocs = uthp->uth_allocs; mtp->mt_numfrees = uthp->uth_frees; mtp->mt_failures = uthp->uth_fails; mtp->mt_sleeps = uthp->uth_sleeps; for (j = 0; j < maxcpus; j++) { upsp = (struct uma_percpu_stat *)p; p += sizeof(*upsp); mtp->mt_percpu_cache[j].mtp_free = upsp->ups_cache_free; mtp->mt_free += upsp->ups_cache_free; mtp->mt_numallocs += upsp->ups_allocs; mtp->mt_numfrees += upsp->ups_frees; } mtp->mt_size = uthp->uth_size; mtp->mt_rsize = uthp->uth_rsize; mtp->mt_memalloced = mtp->mt_numallocs * uthp->uth_size; mtp->mt_memfreed = mtp->mt_numfrees * uthp->uth_size; mtp->mt_bytes = mtp->mt_memalloced - mtp->mt_memfreed; mtp->mt_countlimit = uthp->uth_limit; mtp->mt_byteslimit = uthp->uth_limit * uthp->uth_size; mtp->mt_count = mtp->mt_numallocs - mtp->mt_numfrees; mtp->mt_zonefree = uthp->uth_zone_free; /* * UMA secondary zones share a keg with the primary zone. To * avoid double-reporting of free items, report keg free * items only in the primary zone. */ if (!(uthp->uth_zone_flags & UTH_ZONE_SECONDARY)) { mtp->mt_kegfree = uthp->uth_keg_free; mtp->mt_free += mtp->mt_kegfree; } mtp->mt_free += mtp->mt_zonefree; } free(buffer); return (0); } static int kread(kvm_t *kvm, void *kvm_pointer, void *address, size_t size, size_t offset) { ssize_t ret; ret = kvm_read(kvm, (unsigned long)kvm_pointer + offset, address, size); if (ret < 0) return (MEMSTAT_ERROR_KVM); if ((size_t)ret != size) return (MEMSTAT_ERROR_KVM_SHORTREAD); return (0); } static int kread_string(kvm_t *kvm, const void *kvm_pointer, char *buffer, int buflen) { ssize_t ret; int i; for (i = 0; i < buflen; i++) { ret = kvm_read(kvm, (unsigned long)kvm_pointer + i, &(buffer[i]), sizeof(char)); if (ret < 0) return (MEMSTAT_ERROR_KVM); if ((size_t)ret != sizeof(char)) return (MEMSTAT_ERROR_KVM_SHORTREAD); if (buffer[i] == '\0') return (0); } /* Truncate. */ buffer[i-1] = '\0'; return (0); } static int kread_symbol(kvm_t *kvm, int index, void *address, size_t size, size_t offset) { ssize_t ret; ret = kvm_read(kvm, namelist[index].n_value + offset, address, size); if (ret < 0) return (MEMSTAT_ERROR_KVM); if ((size_t)ret != size) return (MEMSTAT_ERROR_KVM_SHORTREAD); return (0); } /* * memstat_kvm_uma() is similar to memstat_sysctl_uma(), only it extracts * UMA(9) statistics from a kernel core/memory file. */ int memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle) { LIST_HEAD(, uma_keg) uma_kegs; struct memory_type *mtp; struct uma_bucket *ubp, ub; struct uma_cache *ucp, *ucp_array; struct uma_zone *uzp, uz; struct uma_keg *kzp, kz; - int hint_dontsearch, i, mp_maxid, ret; + int hint_dontsearch, i, mp_maxid, ndomains, ret; char name[MEMTYPE_MAXNAME]; cpuset_t all_cpus; long cpusetsize; kvm_t *kvm; kvm = (kvm_t *)kvm_handle; hint_dontsearch = LIST_EMPTY(&list->mtl_list); if (kvm_nlist(kvm, namelist) != 0) { list->mtl_error = MEMSTAT_ERROR_KVM; return (-1); } if (namelist[X_UMA_KEGS].n_type == 0 || namelist[X_UMA_KEGS].n_value == 0) { list->mtl_error = MEMSTAT_ERROR_KVM_NOSYMBOL; return (-1); } ret = kread_symbol(kvm, X_MP_MAXID, &mp_maxid, sizeof(mp_maxid), 0); if (ret != 0) { list->mtl_error = ret; return (-1); } + ret = kread_symbol(kvm, X_VM_NDOMAINS, &ndomains, + sizeof(ndomains), 0); + if (ret != 0) { + list->mtl_error = ret; + return (-1); + } ret = kread_symbol(kvm, X_UMA_KEGS, &uma_kegs, sizeof(uma_kegs), 0); if (ret != 0) { list->mtl_error = ret; return (-1); } cpusetsize = sysconf(_SC_CPUSET_SIZE); if (cpusetsize == -1 || (u_long)cpusetsize > sizeof(cpuset_t)) { list->mtl_error = MEMSTAT_ERROR_KVM_NOSYMBOL; return (-1); } CPU_ZERO(&all_cpus); ret = kread_symbol(kvm, X_ALL_CPUS, &all_cpus, cpusetsize, 0); if (ret != 0) { list->mtl_error = ret; return (-1); } ucp_array = malloc(sizeof(struct uma_cache) * (mp_maxid + 1)); if (ucp_array == NULL) { list->mtl_error = MEMSTAT_ERROR_NOMEMORY; return (-1); } for (kzp = LIST_FIRST(&uma_kegs); kzp != NULL; kzp = LIST_NEXT(&kz, uk_link)) { ret = kread(kvm, kzp, &kz, sizeof(kz), 0); if (ret != 0) { free(ucp_array); _memstat_mtl_empty(list); list->mtl_error = ret; return (-1); } for (uzp = LIST_FIRST(&kz.uk_zones); uzp != NULL; uzp = LIST_NEXT(&uz, uz_link)) { ret = kread(kvm, uzp, &uz, sizeof(uz), 0); if (ret != 0) { free(ucp_array); _memstat_mtl_empty(list); list->mtl_error = ret; return (-1); } ret = kread(kvm, uzp, ucp_array, sizeof(struct uma_cache) * (mp_maxid + 1), offsetof(struct uma_zone, uz_cpu[0])); if (ret != 0) { free(ucp_array); _memstat_mtl_empty(list); list->mtl_error = ret; return (-1); } ret = kread_string(kvm, uz.uz_name, name, MEMTYPE_MAXNAME); if (ret != 0) { free(ucp_array); _memstat_mtl_empty(list); list->mtl_error = ret; return (-1); } if (hint_dontsearch == 0) { mtp = memstat_mtl_find(list, ALLOCATOR_UMA, name); } else mtp = NULL; if (mtp == NULL) mtp = _memstat_mt_allocate(list, ALLOCATOR_UMA, name, mp_maxid + 1); if (mtp == NULL) { free(ucp_array); _memstat_mtl_empty(list); list->mtl_error = MEMSTAT_ERROR_NOMEMORY; return (-1); } /* * Reset the statistics on a current node. */ _memstat_mt_reset_stats(mtp, mp_maxid + 1); mtp->mt_numallocs = uz.uz_allocs; mtp->mt_numfrees = uz.uz_frees; mtp->mt_failures = uz.uz_fails; mtp->mt_sleeps = uz.uz_sleeps; if (kz.uk_flags & UMA_ZFLAG_INTERNAL) goto skip_percpu; for (i = 0; i < mp_maxid + 1; i++) { if (!CPU_ISSET(i, &all_cpus)) continue; ucp = &ucp_array[i]; mtp->mt_numallocs += ucp->uc_allocs; mtp->mt_numfrees += ucp->uc_frees; if (ucp->uc_allocbucket != NULL) { ret = kread(kvm, ucp->uc_allocbucket, &ub, sizeof(ub), 0); if (ret != 0) { free(ucp_array); _memstat_mtl_empty(list); list->mtl_error = ret; return (-1); } mtp->mt_free += ub.ub_cnt; } if (ucp->uc_freebucket != NULL) { ret = kread(kvm, ucp->uc_freebucket, &ub, sizeof(ub), 0); if (ret != 0) { free(ucp_array); _memstat_mtl_empty(list); list->mtl_error = ret; return (-1); } mtp->mt_free += ub.ub_cnt; } } skip_percpu: mtp->mt_size = kz.uk_size; mtp->mt_rsize = kz.uk_rsize; mtp->mt_memalloced = mtp->mt_numallocs * mtp->mt_size; mtp->mt_memfreed = mtp->mt_numfrees * mtp->mt_size; mtp->mt_bytes = mtp->mt_memalloced - mtp->mt_memfreed; if (kz.uk_ppera > 1) mtp->mt_countlimit = kz.uk_maxpages / kz.uk_ipers; else mtp->mt_countlimit = kz.uk_maxpages * kz.uk_ipers; mtp->mt_byteslimit = mtp->mt_countlimit * mtp->mt_size; mtp->mt_count = mtp->mt_numallocs - mtp->mt_numfrees; - for (ubp = LIST_FIRST(&uz.uz_buckets); ubp != - NULL; ubp = LIST_NEXT(&ub, ub_link)) { - ret = kread(kvm, ubp, &ub, sizeof(ub), 0); - mtp->mt_zonefree += ub.ub_cnt; - } + for (i = 0; i < ndomains; i++) + for (ubp = + LIST_FIRST(&uz.uz_domain[i].uzd_buckets); + ubp != NULL; + ubp = LIST_NEXT(&ub, ub_link)) { + ret = kread(kvm, ubp, &ub, sizeof(ub), + 0); + mtp->mt_zonefree += ub.ub_cnt; + } if (!((kz.uk_flags & UMA_ZONE_SECONDARY) && LIST_FIRST(&kz.uk_zones) != uzp)) { mtp->mt_kegfree = kz.uk_free; mtp->mt_free += mtp->mt_kegfree; } mtp->mt_free += mtp->mt_zonefree; } } free(ucp_array); return (0); } Index: projects/numa2/sys/amd64/amd64/uma_machdep.c =================================================================== --- projects/numa2/sys/amd64/amd64/uma_machdep.c (revision 321505) +++ projects/numa2/sys/amd64/amd64/uma_machdep.c (revision 321506) @@ -1,85 +1,86 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_page_t m; vm_paddr_t pa; void *va; int pflags; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED; for (;;) { - m = vm_page_alloc(NULL, 0, pflags); + m = vm_page_alloc_domain(NULL, 0, domain, pflags); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else VM_WAIT; } else break; } pa = m->phys_addr; if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)PHYS_TO_DMAP(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) pagezero(va); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = DMAP_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); m->wire_count--; vm_page_free(m); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } Index: projects/numa2/sys/arm64/arm64/uma_machdep.c =================================================================== --- projects/numa2/sys/arm64/arm64/uma_machdep.c (revision 321505) +++ projects/numa2/sys/arm64/arm64/uma_machdep.c (revision 321506) @@ -1,85 +1,86 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_page_t m; vm_paddr_t pa; void *va; int pflags; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc(NULL, 0, pflags); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else VM_WAIT; } else break; } pa = m->phys_addr; if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)PHYS_TO_DMAP(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = DMAP_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); m->wire_count--; vm_page_free(m); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } Index: projects/numa2/sys/i386/i386/pmap.c =================================================================== --- projects/numa2/sys/i386/i386/pmap.c (revision 321505) +++ projects/numa2/sys/i386/i386/pmap.c (revision 321506) @@ -1,5686 +1,5687 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_apic.h" #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #include #endif #include #include #include #include #include #ifdef SMP #include #endif #ifdef XBOX #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; extern u_int32_t KERNend; extern u_int32_t KPTphys; #if defined(PAE) || defined(PAE_TABLES) pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pat_works = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, "Is page attribute table fully functional?"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP3; static pd_entry_t *KPTD; caddr_t ptvmmap = 0; caddr_t CADDR3; struct msgbuf *msgbufp = NULL; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = NULL, *PMAP2; static pt_entry_t *PADDR1 = NULL, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_flush_page(vm_page_t m); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #if defined(PAE) || defined(PAE_TABLES) -static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, - int wait); +static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, + uint8_t *flags, int wait); #endif static void pmap_set_pg(void); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * If you get an error here, then you set KVA_PAGES wrong! See the * description of KVA_PAGES in sys/i386/include/pmap.h. It must be * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. */ CTASSERT(KERNBASE % (1 << 24) == 0); /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct pcpu *pc; int i; /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Initialize the first available kernel virtual address. However, * using "firstaddr" may waste a few pages of the kernel virtual * address space, because locore may not have mapped every physical * page that it allocated. Preferably, locore would provide a first * unused virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). Otherwise, * pmap_update_pde_kernel() could access allpmaps while it is * being changed. */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Initialize temporary map objects on the current CPU for use * during early boot. * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the boot-time memory test. */ pc = get_pcpu(); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) /* * KPTmap is used by pmap_kextract(). * * KPTmap is first initialized by locore. However, that initial * KPTmap can only support NKPT page table pages. Here, a larger * KPTmap is created that can support KVA_PAGES page table pages. */ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; /* * Adjust the start of the KPTD and KPTmap so that the implementation * of pmap_kextract() and pmap_growkernel() can be made simpler. */ KPTD -= KPTDI; KPTmap -= i386_btop(KPTDI << PDRSHIFT); /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), * respectively. */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; /* * Leave in place an identity mapping (virt == phys) for the low 1 MB * physical memory region that is used by the ACPI wakeup code. This * mapping must not have PG_G set. */ #ifdef XBOX /* FIXME: This is gross, but needed for the XBOX. Since we are in such * an early stadium, we cannot yet neatly map video memory ... :-( * Better fixes are very welcome! */ if (!arch_i386_is_xbox) #endif for (i = 1; i < NKPT; i++) PTD[i] = 0; /* * Initialize the PAT MSR if present. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. We assume * that PAT support implies PGE and in reverse, PGE presence * comes with PAT. Both features were added for Pentium Pro. */ pmap_init_pat(); /* Turn on PG_G on kernel page(s) */ pmap_set_pg(); } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; CPU_FOREACH(i) { pc = pcpu_find(i); /* * Skip if the mapping has already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap_addr1 != 0) continue; mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("%s: unable to allocate KVA", __func__); pc->pc_cmap_pte1 = vtopte(pages); pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); pc->pc_cmap_addr1 = (caddr_t)pages; pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * Setup the PAT MSR. */ void pmap_init_pat(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* * Bail if this CPU doesn't implement PAT. * We assume that PAT support implies PGE. */ if ((cpu_feature & CPUID_PAT) == 0) { for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; pat_works = 0; return; } /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ if (cpu_vendor_id == CPU_VENDOR_INTEL && !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) pat_works = 0; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. */ static void pmap_set_pg(void) { pt_entry_t *pte; vm_offset_t va, endva; if (pgeflag == 0) return; endva = KERNBASE + KERNend; if (pseflag) { va = KERNBASE + KERNLOAD; while (va < endva) { pdir_pde(PTD, va) |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += NBPDR; } } else { va = (vm_offset_t)btext; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += PAGE_SIZE; } } } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } #if defined(PAE) || defined(PAE_TABLES) static void * -pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) +pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, + int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif /* * Abuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < NKPT; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings supported and enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pseflag == 0) pg_ps_enabled = 0; else if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Calculate the size of the pv head table for superpages. * Handle the possibility that "vm_phys_segs[...].end" is zero. */ pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) / NBPDR + 1; /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #if defined(PAE) || defined(PAE_TABLES) pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif pmap_initialized = 1; if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } } SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2/4MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2/4MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2/4MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2/4MB page promotions"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } /* * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; pmap_t pmap; boolean_t PTD_updated; PTD_updated = FALSE; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) PTD_updated = TRUE; pde = pmap_pde(pmap, va); pde_store(pde, newpde); } mtx_unlock_spin(&allpmaps_lock); KASSERT(PTD_updated, ("pmap_kenter_pde: current page table is not in allpmaps")); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { u_long cr4; if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* * Although preemption at this point could be detrimental to * performance, it would not lead to an error. PG_G is simply * ignored if CR4.PGE is clear. Moreover, in case this block * is re-entered, the load_cr4() either above or below will * modify CR4.PGE flushing the TLB. */ load_cr4(cr4 | CR4_PGE); } } void invltlb_glob(void) { uint64_t cr4; if (pgeflag == 0) { invltlb(); } else { cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); load_cr4(cr4 | CR4_PGE); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { invlpg(va); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invlpg(va); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg(*mask, va); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask, other_cpus; vm_offset_t addr; u_int cpuid; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg_range(*mask, sva, eva); sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invltlb_glob(); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { invltlb(); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invltlb(); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_kernel(void *arg) { struct pde_action *act = arg; pd_entry_t *pde; pmap_t pmap; if (act->store == PCPU_GET(cpuid)) { /* * Elsewhere, this operation requires allpmaps_lock for * synchronization. Here, it does not because it is being * performed in the context of an all_cpus rendezvous. */ LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, act->va); pde_store(pde, act->newpde); } } } static void pmap_update_pde_user(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pde_store(act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; else active = pmap->pm_active; if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendezvous_barrier, pmap == kernel_pmap ? pmap_update_pde_kernel : pmap_update_pde_user, pmap_update_pde_teardown, &act); } else { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (!CPU_EMPTY(&pmap->pm_active)) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); } #endif /* !SMP */ static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was * created by a promotion that did not invalidate the 512 or 1024 4KB * page mappings that might exist in the TLB. Consequently, at this * point, the TLB may hold both 4KB and 2- or 4MB page mappings for * the address range [va, va + NBPDR). Therefore, the entire range * must be invalidated here. In contrast, when PG_PROMOTED is clear, * the TLB will not hold any 4KB page mappings for the address range * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the * 2- or 4MB page mapping from the TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + NBPDR - 1); else pmap_invalidate_page(pmap, va); } #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) { if (force) { sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); } else { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC * range. The local APIC is always uncached, so we * don't need to flush for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Otherwise, do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else if ((cpu_feature & CPUID_CLFSH) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } else { /* * No targeted cache flush methods are supported by CPU, * or the supplied range is bigger than 2MB. * Globally invalidate cache. */ pmap_invalidate_cache(); } } void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { int i; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || (cpu_feature & CPUID_CLFSH) == 0) { pmap_invalidate_cache(); } else { for (i = 0; i < count; i++) pmap_flush_page(pages[i]); } } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || pmap == vmspace_pmap(curthread->td_proc->p_vmspace)); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (NULL); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } /* * NB: The sequence of updating a page table followed by accesses to the * corresponding pages is subject to the situation described in the "AMD64 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG * right after modifying the PTE bits is crucial. */ static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); else { pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); pmap_pte_release(pte); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte, *ptep; vm_page_t m; vm_paddr_t pa; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { ptep = pmap_pte(pmap, va); pte = *ptep; pmap_pte_release(ptep); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t superpage_offset; pd_entry_t newpde; va = *virt; /* * Does the physical address range's size and alignment permit at * least one superpage mapping to be created? */ superpage_offset = start & PDRMASK; if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of superpage mappings. */ if ((va & PDRMASK) < superpage_offset) va = (va & ~PDRMASK) + superpage_offset; else if ((va & PDRMASK) > superpage_offset) va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; } sva = va; while (start < end) { if ((start & PDRMASK) == 0 && end - start >= NBPDR && pseflag) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); newpde = start | PG_PS | pgeflag | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; } else { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pgeflag | PG_RW | PG_V); } pte++; } if (__predict_false((oldpte & PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) { pd_entry_t ptepde; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, mpte, free)); } /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); /* * Since the page table directory is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; vm_paddr_t pa; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); if (pmap->pm_pdir == NULL) return (0); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root.rt_root = 0; } KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; else { ptdpg[i++] = m; } } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) if ((ptdpg[i]->flags & PG_ZERO) == 0) pagezero(pmap->pm_pdir + (i * NPDEPG)); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); /* Copy the kernel page table directory entries. */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); mtx_unlock_spin(&allpmaps_lock); /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt[i] = pa | PG_V; #endif } CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); VM_WAIT; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { u_int ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); for (i = 0; i < NPGPTD; i++) ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & PG_FRAME); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = ptdpg[i]; #if defined(PAE) || defined(PAE_TABLES) KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, NBPDR); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfl(inuse); pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pte(pmap, va); tpte = *pte; if ((tpte & PG_W) == 0) tpte = pte_load_clear(pte); pmap_pte_release(pte); if ((tpte & PG_W) != 0) continue; KASSERT(tpte != 0, ("pmap_pv_reclaim: pmap %p va %x zero pte", pmap, va)); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&vm_cnt.v_wire_count, 1); } pmap_free_zero_pages(&free); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, PQ_NONE); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2- or 4MB page mapping. If demotion fails, the * 2- or 4MB page mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2- or 4MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_4mpage(va); pmap_remove_pde(pmap, pde, sva, &free); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); pmap_free_zero_pages(&free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; } mptepa = VM_PAGE_TO_PHYS(mpte); /* * If the page mapping is in the kernel's address space, then the * KPTmap can provide access to the page table page. Otherwise, * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ if (va >= KERNBASE) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the page table page is new, initialize it. */ if (mpte->wire_count == 1) { mpte->wire_count = NPTEPG; pmap_fill_ptp(firstpte, newpte); } KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * Removes a 2- or 4MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* * Initialize the page table page. */ pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pmap_kenter_pde(va, newpde); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. */ if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, struct spglist *free) { pt_entry_t oldpte; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); KASSERT(oldpte != 0, ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt_entry_t *pte; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { u_int pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, &free)) break; } } out: sched_unpin(); if (anyvalid) pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", pmap, pv->pv_va)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; boolean_t anychanged, pv_lists_locked; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #if defined(PAE) || defined(PAE_TABLES) if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; u_int pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = TRUE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* * The large page mapping was * destroyed. */ continue; } } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; #endif if (pbits != obits) { #if defined(PAE) || defined(PAE_TABLES) if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page (PTP) to a single 2- or 4MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_offset_t oldpteva; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2- or 4MB page. */ firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pd_entry_t *pde; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t invlva, wired; va = trunc_page(va); mpte = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); pde = pmap_pde(pmap, va); if (va < VM_MAXUSER_ADDRESS) { /* * va is for UVA. * In the case that a page table page is not resident, * we are creating it here. pmap_allocpte() handles * demotion. */ mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } else { /* * va is for KVA, so pmap_demote_pde() will never fail * to install a page table page. PG_V is also * asserted by pmap_demote_pde(). */ KASSERT(pde != NULL && (*pde & PG_V) != 0, ("KVA %#x invalid pde pdir %#jx", va, (uintmax_t)pmap->pm_pdir[PTDPTDI])); if ((*pde & PG_PS) != 0) pmap_demote_pde(pmap, pde, va); } pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry is not valid, which should not * happen. We should have either allocated the page table * page or demoted the existing mapping above. */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); pa |= PG_MANAGED; } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; if ((newpte & PG_MANAGED) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; #endif if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { newpte |= PG_A; if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_aflag_set(om, PGA_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m)) invlva = TRUE; #if defined(PAE) || defined(PAE_TABLES) if ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0) invlva = TRUE; #endif } if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) invlva = TRUE; } if ((origpte & PG_MANAGED) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); } /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t *pde, newpde; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (*pde != 0) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) { newpde |= PG_MANAGED; /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; /* * Increment counters. */ pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pg_ps_enabled && pmap_enter_pde(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; struct spglist free; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; #endif /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) != 0) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; vm_paddr_t pa, ptepa; vm_page_t p; int pat_mode; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if (pseflag && (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2/4MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and * "size" is a multiple of 2/4M, adding the PAT setting to * "pa" will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_pde(pmap, addr); if (*pde == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } /* Else continue on if the PDE is already valid. */ addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t *pde; pt_entry_t *pte; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * Regardless of whether a pde (or pte) is 32 * or 64 bits in size, PG_W is among the least * significant 32 bits. */ atomic_clear_int((u_int *)pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. * * PG_W is among the least significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; u_int ptepindex; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) pdnxt = end_addr; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; if (dst_pmap->pm_pdir[ptepindex] == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & PG_PS_FRAME))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dstmpte == NULL) goto out; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Zero 1 page of virtual memory mapped from a hardware page by the caller. */ static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * Zero the specified hardware page. */ void pmap_zero_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); pagezero(pc->pc_cmap_addr2); *cmap_pte2 = 0; /* * Unpin the thread before releasing the lock. Otherwise the thread * could be rescheduled while still bound to the current CPU, only * to unpin itself immediately upon resuming execution. */ sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Zero an an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page_area: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap_addr2); else bzero(pc->pc_cmap_addr2 + off, size); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Copy 1 specified hardware page to another. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1) panic("pmap_copy_page: CMAP1 busy"); if (*cmap_pte2) panic("pmap_copy_page: CMAP2 busy"); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | pmap_cache_bits(src->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | pmap_cache_bits(dst->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; int cnt; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*cmap_pte2 != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | pmap_cache_bits(a_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); a_cp = pc->pc_cmap_addr1 + a_pg_offset; b_cp = pc->pc_cmap_addr2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, pc->pc_pmap)); allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { pte = vtopte(pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } sched_unpin(); pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 2mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (*pde != 0 && (*pde & PG_PS) == 0) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 4mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "*pde" is mapping a 2/4MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by either 1024 * or 512 4KB pages, it should not be cleared every * time it is tested. Apply a simple "hash" function * on the physical page number, the virtual superpage * number, and the pmap address to select one 4KB page * out of the 1024 or 512 on which testing the * reference bit will result in clearing that bit. * This function is designed to avoid the selection of * the same 4KB page for every 2- or 4MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { atomic_clear_int((u_int *)pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va, pdnxt; vm_page_t m; boolean_t anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pte_quick(pmap, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, NULL); anychanged = TRUE; } } if (pdnxt > eva) pdnxt = eva; va = pdnxt; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_int((u_int *)pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_int((u_int *)pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == pdnxt) va = sva; } else anychanged = TRUE; continue; maybe_invlrng: if (va != pdnxt) { pmap_invalidate_range(pmap, va, sva); va = pdnxt; } } if (va != pdnxt) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde(pmap, pde, va)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pte_quick(pmap, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ while (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~PG_PTE_CACHE; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~PG_PDE_CACHE; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = pa & PG_FRAME; if (pa < KERNLOAD && pa + size <= KERNLOAD) va = KERNBASE + pa; else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + size, FALSE); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; /* * If "m" is a normal page, flush it from the cache. * See pmap_invalidate_cache_range(). * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m)) return; /* * If page is not mapped by sf buffer, but CPU does not * support self snoop, map the page transient and do * invalidation. In the worst case, whole cache is flushed by * pmap_invalidate_cache_range(). */ if ((cpu_feature & CPUID_SS) == 0) pmap_flush_page(m); } static void pmap_flush_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; vm_offset_t sva, eva; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_flush_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); sva = (vm_offset_t)pc->pc_cmap_addr2; eva = sva + PAGE_SIZE; /* * Use mfence or sfence despite the ordering implied by * mtx_{un,}lock() because clflush on non-Intel CPUs * and clflushopt are not guaranteed to be ordered by * any other instruction. */ if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) { if (useclflushopt) clflushopt(sva); else clflush(sva); } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } else pmap_invalidate_cache(); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde; boolean_t changed; base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses above the recursive map. */ if (base < VM_MIN_KERNEL_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(mode, 1); cache_bits_pte = pmap_cache_bits(mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down * 2/4MB pages into 4KB pages if required. */ PMAP_LOCK(kernel_pmap); for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2/4MB page already has * the required memory type, then we need not * demote this page. Just increment tmpva to * the next 2/4MB page frame. */ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_4mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2/4MB * page frame and there is at least 2/4MB left * within the range, then we need not break * down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { PMAP_UNLOCK(kernel_pmap); return (ENOMEM); } } pte = vtopte(tmpva); if (*pte == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } tmpva += PAGE_SIZE; } PMAP_UNLOCK(kernel_pmap); /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); changed = TRUE; } tmpva = trunc_4mpage(tmpva) + NBPDR; } else { pte = vtopte(tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); changed = TRUE; } tmpva += PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva, FALSE); } return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t *ptep, pte; vm_paddr_t pa; int val; PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (*pdep != 0) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { ptep = pmap_pte(pmap, addr); pte = *ptep; pmap_pte_release(ptep); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif #if defined(PAE) || defined(PAE_TABLES) cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_offset_t qaddr; pt_entry_t *pte; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy")); *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(pmap_page_get_memattr(m), 0); invlpg(qaddr); return (qaddr); } void pmap_quick_remove_page(vm_offset_t addr) { vm_offset_t qaddr; pt_entry_t *pte; qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); *pte = 0; critical_exit(); } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte); } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa & PG_FRAME); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return (npte); } #endif Index: projects/numa2/sys/kern/init_main.c =================================================================== --- projects/numa2/sys/kern/init_main.c (revision 321505) +++ projects/numa2/sys/kern/init_main.c (revision 321506) @@ -1,857 +1,859 @@ /*- * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_init_path.h" #include "opt_verbose_sysinit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread0_storage thread0_st __aligned(32); struct vmspace vmspace0; struct proc *initproc; #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif int boothowto = BOOTHOWTO; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "Boot control flags, passed from loader"); #ifndef BOOTVERBOSE #define BOOTVERBOSE 0 #endif int bootverbose = BOOTVERBOSE; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "Control the output of verbose kernel messages"); #ifdef INVARIANTS FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance"); #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { struct sysinit **sipp; /* system initialization*/ struct sysinit **xipp; /* interior loop of sort*/ struct sysinit *save; /* bubble*/ #if defined(VERBOSE_SYSINIT) int last; int verbose; #endif if (boothowto & RB_VERBOSE) bootverbose++; if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } #if defined(VERBOSE_SYSINIT) last = SI_SUB_COPYRIGHT; verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last) { verbose = 1; last = (*sipp)->subsystem; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)(*sipp)->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)(*sipp)->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, (*sipp)->udata); else #endif printf(" %p(%p)... ", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. */ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); #endif static int null_fetch_syscall_args(struct thread *td __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td __unused, int error __unused) { panic("null_set_syscall_retval"); } struct sysentvec null_sysvec = { .sv_size = 0, .sv_table = NULL, .sv_mask = 0, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = NULL, .sv_sendsig = NULL, .sv_sigcode = NULL, .sv_szsigcode = NULL, .sv_name = "null", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = NULL, .sv_setregs = NULL, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = 0, .sv_set_syscall_retval = null_set_syscall_retval, .sv_fetch_syscall_args = null_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; /* * The two following SYSINIT's are proc0 specific glue code. I am not * convinced that they can not be safely combined, but their order of * operation has been maintained as the same as the original init_main.c * for right now. */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; struct thread *td; struct ucred *newcred; vm_paddr_t pageablemem; int i; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number and osrel. */ p->p_magic = P_MAGIC; p->p_osrel = osreldate; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. */ schedinit(); /* scheduler gets its house in order */ /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); refcount_init(&session0.s_count, 1); session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM | P_INMEM | P_KPROC; p->p_flag2 = 0; p->p_state = PRS_NORMAL; p->p_klist = knlist_alloc(&p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; /* pid_max cannot be greater than PID_MAX */ td->td_tid = PID_MAX + 1; LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); td->td_state = TDS_RUNNING; td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_lend_user_pri = PRI_MAX; td->td_priority = PVM; td->td_base_pri = PVM; td->td_oncpu = curcpu; td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); vm_domain_policy_init(&td->td_vm_dom_policy); vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1); + vm_domain_iterator_set_policy(&td->td_dom_selector, + &td->td_vm_dom_policy); vm_domain_policy_init(&p->p_vm_dom_policy); vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1); prison0_init(); p->p_peers = 0; p->p_leader = p; p->p_reaper = p; LIST_INIT(&p->p_reaplist); strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); callout_init(&td->td_slpcallout, 1); /* Create credentials. */ newcred = crget(); newcred->cr_ngroups = 1; /* group 0 */ newcred->cr_uidinfo = uifind(0); newcred->cr_ruidinfo = uifind(0); newcred->cr_prison = &prison0; newcred->cr_loginclass = loginclass_find("default"); proc_set_cred_init(p, newcred); #ifdef AUDIT audit_cred_kproc0(newcred); #endif #ifdef MAC mac_cred_create_swapper(newcred); #endif /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL, false); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. */ pageablemem = ptoa((vm_paddr_t)vm_cnt.v_free_count); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; PROC_LOCK(p); thread_cow_get_proc(td, p); PROC_UNLOCK(p); /* Initialize resource accounting structures. */ racct_create(&p->p_racct); p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; pmap_pinit0(vmspace_pmap(&vmspace0)); /* * proc0 is not expected to enter usermode, so there is no special * handling for sv_minuser here, like is done for exec_new_vmspace(). */ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); /* * Call the init and ctor for the new thread and proc. We wait * to do this until all other structures are fairly sane. */ EVENTHANDLER_INVOKE(process_init, p); EVENTHANDLER_INVOKE(thread_init, td); EVENTHANDLER_INVOKE(process_ctor, p); EVENTHANDLER_INVOKE(thread_ctor, td); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); PROC_LOCK(p); racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; struct rusage ru; struct thread *td; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { microuptime(&p->p_stats->p_start); PROC_STATLOCK(p); rufetch(p, &ru); /* Clears thread stats */ PROC_STATUNLOCK(p); p->p_rux.rux_runtime = 0; p->p_rux.rux_uticks = 0; p->p_rux.rux_sticks = 0; p->p_rux.rux_iticks = 0; FOREACH_THREAD_IN_PROC(p, td) { td->td_runtime = 0; } } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL); static void random_init(void *dummy __unused) { /* * After CPU has been started we have some randomness on most * platforms via get_cyclecount(). For platforms that don't * we will reseed random(9) in proc0_post() as well. */ srandom(get_cyclecount()); } SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL); /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). " "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct thread *td; struct proc *p; mtx_lock(&Giant); GIANT_REQUIRED; td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = p->p_sysent->sv_usrstack - PAGE_SIZE; if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = kern_getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. */ options = 0; ucp = (char *)p->p_sysent->sv_usrstack; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)rounddown2((intptr_t)ucp, sizeof(intptr_t)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = sys_execve(td, &args)) == 0) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kproc_create(), but runs in its own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. */ static void create_init(const void *udata __unused) { struct fork_req fr; struct ucred *newcred, *oldcred; struct thread *td; int error; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &initproc; error = fork1(&thread0, &fr); if (error) panic("cannot fork init: %d\n", error); KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); sx_xlock(&proctree_lock); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM | P_INMEM; initproc->p_treeflag |= P_TREE_REAPER; LIST_INSERT_HEAD(&initproc->p_reaplist, &proc0, p_reapsibling); oldcred = initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif proc_set_cred(initproc, newcred); td = FIRST_THREAD_IN_PROC(initproc); crfree(td->td_ucred); td->td_ucred = crhold(initproc->p_ucred); PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); thread_unlock(td); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); Index: projects/numa2/sys/kern/kern_fork.c =================================================================== --- projects/numa2/sys/kern/kern_fork.c (revision 321505) +++ projects/numa2/sys/kern/kern_fork.c (revision 321506) @@ -1,1116 +1,1118 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_fork_func_t dtrace_fasttrap_fork; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int"); #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif /* ARGSUSED */ int sys_fork(struct thread *td, struct fork_args *uap) { struct fork_req fr; int error, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } /* ARGUSED */ int sys_pdfork(struct thread *td, struct pdfork_args *uap) { struct fork_req fr; int error, fd, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFPROCDESC; fr.fr_pidp = &pid; fr.fr_pd_fd = &fd; fr.fr_pd_flags = uap->flags; /* * It is necessary to return fd by reference because 0 is a valid file * descriptor number, and the child needs to be able to distinguish * itself from the parent using the return value. */ error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; error = copyout(&fd, uap->fdp, sizeof(fd)); } return (error); } /* ARGSUSED */ int sys_vfork(struct thread *td, struct vfork_args *uap) { struct fork_req fr; int error, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } int sys_rfork(struct thread *td, struct rfork_args *uap) { struct fork_req fr; int error, pid; /* Don't allow kernel-only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); AUDIT_ARG_FFLAGS(uap->flags); bzero(&fr, sizeof(fr)); fr.fr_flags = uap->flags; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } int nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. */ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error != 0) return(error); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid < 0 || pid > pid_max - 100) /* out of range */ pid = pid_max - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); static int fork_findpid(int flags) { struct proc *p; int trypid; static int pidchecked = 0; /* * Requires allproc_lock in order to iterate over the list * of processes, and proctree_lock to access p_pgrp. */ sx_assert(&allproc_lock, SX_LOCKED); sx_assert(&proctree_lock, SX_LOCKED); /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= pid_max) { trypid = trypid % pid_max; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. * * Avoid reuse of the process group id, session id or * the reaper subtree id. Note that for process group * and sessions, the amount of reserved pids is * limited by process limit. For the subtree ids, the * id is kept reserved only while there is a * non-reaped process in the subtree, so amount of * reserved pids is limited by process limit times * two. */ p = LIST_FIRST(&allproc); again: for (; p != NULL; p = LIST_NEXT(p, p_list)) { while (p->p_pid == trypid || p->p_reapsubtree == trypid || (p->p_pgrp != NULL && (p->p_pgrp->pg_id == trypid || (p->p_session != NULL && p->p_session->s_sid == trypid)))) { trypid++; if (trypid >= pidchecked) goto retry; } if (p->p_pid > trypid && pidchecked > p->p_pid) pidchecked = p->p_pid; if (p->p_pgrp != NULL) { if (p->p_pgrp->pg_id > trypid && pidchecked > p->p_pgrp->pg_id) pidchecked = p->p_pgrp->pg_id; if (p->p_session != NULL && p->p_session->s_sid > trypid && pidchecked > p->p_session->s_sid) pidchecked = p->p_session->s_sid; } } if (!doingzomb) { doingzomb = 1; p = LIST_FIRST(&zombproc); goto again; } } /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; return (trypid); } static int fork_norfproc(struct thread *td, int flags) { int error; struct proc *p1; KASSERT((flags & RFPROC) == 0, ("fork_norfproc called with RFPROC set")); p1 = td->td_proc; if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); if (thread_single(p1, SINGLE_BOUNDARY)) { PROC_UNLOCK(p1); return (ERESTART); } PROC_UNLOCK(p1); } error = vm_forkproc(td, NULL, NULL, NULL, flags); if (error) goto fail; /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td->td_proc->p_fd, false); fdescfree(td); p1->p_fd = fdtmp; } /* * Unshare file descriptors (from parent). */ if (flags & RFFDG) fdunshare(td); fail: if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); thread_single_end(p1, SINGLE_BOUNDARY); PROC_UNLOCK(p1); } return (error); } static void do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *td2, struct vmspace *vm2, struct file *fp_procdesc) { struct proc *p1, *pptr; int trypid; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct sigacts *newsigacts; sx_assert(&proctree_lock, SX_SLOCKED); sx_assert(&allproc_lock, SX_XLOCKED); p1 = td->td_proc; trypid = fork_findpid(fr->fr_flags); sx_sunlock(&proctree_lock); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; AUDIT_ARG_PID(p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); allproc_gen++; LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); tidhash_add(td2); PROC_LOCK(p2); PROC_LOCK(p1); sx_xunlock(&allproc_lock); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); pargs_hold(p2->p_args); PROC_UNLOCK(p1); bzero(&p2->p_startzero, __rangeof(struct proc, p_startzero, p_endzero)); /* Tell the prison that we exist. */ prison_proc_hold(p2->p_ucred->cr_prison); PROC_UNLOCK(p2); /* * Malloc things while we don't hold any locks. */ if (fr->fr_flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (fr->fr_flags & RFCFDG) { fd = fdinit(p1->p_fd, false); fdtol = NULL; } else if (fr->fr_flags & RFFDG) { fd = fdcopy(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((fr->fr_flags & RFTHREAD) != 0) { /* * Shared file descriptor table, and shared * process leaders. */ fdtol = p1->p_fdtol; FILEDESC_XLOCK(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_XUNLOCK(p1->p_fd); } else { /* * Shared file descriptor table, and different * process leaders. */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ PROC_LOCK(p2); PROC_LOCK(p1); bzero(&td2->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name)); td2->td_sigstk = td->td_sigstk; td2->td_flags = TDF_INMEM; td2->td_lend_user_pri = PRI_MAX; #ifdef VIMAGE td2->td_vnet = NULL; td2->td_vnet_lpush = NULL; #endif /* * Allow the scheduler to initialize the child. */ thread_lock(td); sched_fork(td, td2); thread_unlock(td); /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. */ p2->p_flag = P_INMEM; p2->p_flag2 = p1->p_flag2 & (P2_NOTRACE | P2_NOTRACE_EXEC | P2_TRAPCAP); p2->p_swtick = ticks; if (p1->p_flag & P_PROFIL) startprofclock(p2); /* * Whilst the proc lock is held, copy the VM domain data out * using the VM domain method. */ vm_domain_policy_init(&p2->p_vm_dom_policy); vm_domain_policy_localcopy(&p2->p_vm_dom_policy, &p1->p_vm_dom_policy); + vm_domain_iterator_set_policy(&td2->td_dom_selector, + &p2->p_vm_dom_policy); if (fr->fr_flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (fr->fr_flags & RFTSIGZMB) p2->p_sigparent = RFTSIGNUM(fr->fr_flags); else if (fr->fr_flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; p2->p_textvp = p1->p_textvp; p2->p_fd = fd; p2->p_fdtol = fdtol; if (p1->p_flag2 & P2_INHERIT_PROTECTED) { p2->p_flag |= P_PROTECTED; p2->p_flag2 |= P2_INHERIT_PROTECTED; } /* * p_limit is copy-on-write. Bump its refcount. */ lim_fork(p1, p2); thread_cow_get_proc(td2, p2); pstats_fork(p1->p_stats, p2->p_stats); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* Bump references to the text vnode (for procfs). */ if (p2->p_textvp) vrefact(p2->p_textvp); /* * Set up linkage for kernel based threading. */ if ((fr->fr_flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); kern_psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & P_SUGID; td2->td_pflags |= (td->td_pflags & TDP_ALTSTACK) | TDP_FORKING; SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (fr->fr_flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; p2->p_pgrp = p1->p_pgrp; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); LIST_INIT(&p2->p_orphans); callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0); /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if ((fr->fr_flags & RFNOWAIT) != 0) { pptr = p1->p_reaper; p2->p_reaper = pptr; } else { p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ? p1 : p1->p_reaper; pptr = p1; } p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_reaplist); LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling); if (p2->p_reaper == p1) p2->p_reapsubtree = p2->p_pid; sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); #ifdef KTRACE ktrprocfork(p1, p2); #endif /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, vm2, fr->fr_flags); if (fr->fr_flags == (RFFDG | RFPROC)) { VM_CNT_INC(v_forks); VM_CNT_ADD(v_forkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (fr->fr_flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { VM_CNT_INC(v_vforks); VM_CNT_ADD(v_vforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { VM_CNT_INC(v_kthreads); VM_CNT_ADD(v_kthreadpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else { VM_CNT_INC(v_rforks); VM_CNT_ADD(v_rforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } /* * Associate the process descriptor with the process before anything * can happen that might cause that process to need the descriptor. * However, don't do this until after fork(2) can no longer fail. */ if (fr->fr_flags & RFPROCDESC) procdesc_new(p2, fr->fr_pd_flags); /* * Both processes are set up, now check if any loadable modules want * to adjust anything. */ EVENTHANDLER_INVOKE(process_fork, p1, p2, fr->fr_flags); /* * Set the child start time and mark the process as being complete. */ PROC_LOCK(p2); PROC_LOCK(p1); microuptime(&p2->p_stats->p_start); PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; PROC_SUNLOCK(p2); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the new process so that any * tracepoints inherited from the parent can be removed. We have to do * this only after p_state is PRS_NORMAL since the fasttrap module will * use pfind() later on. */ if ((fr->fr_flags & RFMEM) == 0 && dtrace_fasttrap_fork) dtrace_fasttrap_fork(p1, p2); #endif /* * Hold the process so that it cannot exit after we make it runnable, * but before we wait for the debugger. */ _PHOLD(p2); if (p1->p_ptevents & PTRACE_FORK) { /* * Arrange for debugger to receive the fork event. * * We can report PL_FLAG_FORKED regardless of * P_FOLLOWFORK settings, but it does not make a sense * for runaway child. */ td->td_dbgflags |= TDB_FORK; td->td_dbg_forked = p2->p_pid; td2->td_dbgflags |= TDB_STOPATFORK; } if (fr->fr_flags & RFPPWAIT) { td->td_pflags |= TDP_RFPPWAIT; td->td_rfppwait_p = p2; td->td_dbgflags |= TDB_VFORK; } PROC_UNLOCK(p2); /* * Now can be swapped. */ _PRELE(p1); PROC_UNLOCK(p1); /* * Tell any interested parties about the new process. */ knote_fork(p1->p_klist, p2->p_pid); SDT_PROBE3(proc, , , create, p2, p1, fr->fr_flags); if (fr->fr_flags & RFPROCDESC) { procdesc_finit(p2->p_procdesc, fp_procdesc); fdrop(fp_procdesc, td); } if ((fr->fr_flags & RFSTOPPED) == 0) { /* * If RFSTOPPED not requested, make child runnable and * add to run queue. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); if (fr->fr_pidp != NULL) *fr->fr_pidp = p2->p_pid; } else { *fr->fr_procp = p2; } PROC_LOCK(p2); /* * Wait until debugger is attached to child. */ while (td2->td_proc == p2 && (td2->td_dbgflags & TDB_STOPATFORK) != 0) cv_wait(&p2->p_dbgwait, &p2->p_mtx); _PRELE(p2); racct_proc_fork_done(p2); PROC_UNLOCK(p2); } int fork1(struct thread *td, struct fork_req *fr) { struct proc *p1, *newproc; struct thread *td2; struct vmspace *vm2; struct file *fp_procdesc; vm_ooffset_t mem_charged; int error, nprocs_new, ok; static int curfail; static struct timeval lastfail; int flags, pages; flags = fr->fr_flags; pages = fr->fr_pages; if ((flags & RFSTOPPED) != 0) MPASS(fr->fr_procp != NULL && fr->fr_pidp == NULL); else MPASS(fr->fr_procp == NULL); /* Check for the undefined or unimplemented flags. */ if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0) return (EINVAL); /* Signal value requires RFTSIGZMB. */ if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0) return (EINVAL); /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); /* Check the validity of the signal number. */ if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG) return (EINVAL); if ((flags & RFPROCDESC) != 0) { /* Can't not create a process yet get a process descriptor. */ if ((flags & RFPROC) == 0) return (EINVAL); /* Must provide a place to put a procdesc if creating one. */ if (fr->fr_pd_fd == NULL) return (EINVAL); /* Check if we are using supported flags. */ if ((fr->fr_pd_flags & ~PD_ALLOWED_AT_FORK) != 0) return (EINVAL); } p1 = td->td_proc; /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { if (fr->fr_procp != NULL) *fr->fr_procp = NULL; else if (fr->fr_pidp != NULL) *fr->fr_pidp = 0; return (fork_norfproc(td, flags)); } fp_procdesc = NULL; newproc = NULL; vm2 = NULL; /* * Increment the nprocs resource before allocations occur. * Although process entries are dynamically created, we still * keep a global limit on the maximum number we will * create. There are hard-limits as to the number of processes * that can run, established by the KVA and memory usage for * the process data. * * Don't allow a nonprivileged user to use the last ten * processes; don't let root exceed the limit. */ nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1; if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred, PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) { error = EAGAIN; sx_xlock(&allproc_lock); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("maxproc limit exceeded by uid %u (pid %d); " "see tuning(7) and login.conf(5)\n", td->td_ucred->cr_ruid, p1->p_pid); } sx_xunlock(&allproc_lock); goto fail2; } /* * If required, create a process descriptor in the parent first; we * will abandon it if something goes wrong. We don't finit() until * later. */ if (flags & RFPROCDESC) { error = procdesc_falloc(td, &fp_procdesc, fr->fr_pd_fd, fr->fr_pd_flags, fr->fr_pd_fcaps); if (error != 0) goto fail2; } mem_charged = 0; if (pages == 0) pages = kstack_pages; /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); td2 = FIRST_THREAD_IN_PROC(newproc); if (td2 == NULL) { td2 = thread_alloc(pages); if (td2 == NULL) { error = ENOMEM; goto fail2; } proc_linkup(newproc, td2); } else { if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) { if (td2->td_kstack != 0) vm_thread_dispose(td2); if (!thread_alloc_stack(td2, pages)) { error = ENOMEM; goto fail2; } } } if ((flags & RFMEM) == 0) { vm2 = vmspace_fork(p1->p_vmspace, &mem_charged); if (vm2 == NULL) { error = ENOMEM; goto fail2; } if (!swap_reserve(mem_charged)) { /* * The swap reservation failed. The accounting * from the entries of the copied vm2 will be * subtracted in vmspace_free(), so force the * reservation there. */ swap_reserve_force(mem_charged); error = ENOMEM; goto fail2; } } else vm2 = NULL; /* * XXX: This is ugly; when we copy resource usage, we need to bump * per-cred resource counters. */ proc_set_cred_init(newproc, crhold(td->td_ucred)); /* * Initialize resource accounting for the child process. */ error = racct_proc_fork(p1, newproc); if (error != 0) { error = EAGAIN; goto fail1; } #ifdef MAC mac_proc_init(newproc); #endif newproc->p_klist = knlist_alloc(&newproc->p_mtx); STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ sx_slock(&proctree_lock); sx_xlock(&allproc_lock); /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. * * XXXRW: Can we avoid privilege here if it's not needed? */ error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0); if (error == 0) ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0); else { ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPROC)); } if (ok) { do_fork(td, fr, newproc, td2, vm2, fp_procdesc); return (0); } error = EAGAIN; sx_sunlock(&proctree_lock); sx_xunlock(&allproc_lock); #ifdef MAC mac_proc_destroy(newproc); #endif racct_proc_exit(newproc); fail1: crfree(newproc->p_ucred); newproc->p_ucred = NULL; fail2: if (vm2 != NULL) vmspace_free(vm2); uma_zfree(proc_zone, newproc); if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) { fdclose(td, fp_procdesc, *fr->fr_pd_fd); fdrop(fp_procdesc, td); } atomic_add_int(&nprocs, -1); pause("fork", hz / 2); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(void (*callout)(void *, struct trapframe *), void *arg, struct trapframe *frame) { struct proc *p; struct thread *td; struct thread *dtd; td = curthread; p = td->td_proc; KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)", td, td_get_sched(td), p->p_pid, td->td_name); sched_fork_exit(td); /* * Processes normally resume in mi_switch() after being * cpu_switch()'ed to, but when children start up they arrive here * instead, so we must do much the same things as mi_switch() would. */ if ((dtd = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(dtd); } thread_unlock(td); /* * cpu_fork_kthread_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ if (p->p_flag & P_KPROC) { printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", td->td_name, p->p_pid); kthread_exit(); } mtx_assert(&Giant, MA_NOTOWNED); if (p->p_sysent->sv_schedtail != NULL) (p->p_sysent->sv_schedtail)(td); td->td_pflags &= ~TDP_FORKING; } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. This function is passed in to fork_exit() * as the first parameter and is called when returning to a new * userland process. */ void fork_return(struct thread *td, struct trapframe *frame) { struct proc *p, *dbg; p = td->td_proc; if (td->td_dbgflags & TDB_STOPATFORK) { sx_xlock(&proctree_lock); PROC_LOCK(p); if (p->p_pptr->p_ptevents & PTRACE_FORK) { /* * If debugger still wants auto-attach for the * parent's children, do it now. */ dbg = p->p_pptr->p_pptr; proc_set_traced(p, true); CTR2(KTR_PTRACE, "fork_return: attaching to new child pid %d: oppid %d", p->p_pid, p->p_oppid); proc_reparent(p, dbg); sx_xunlock(&proctree_lock); td->td_dbgflags |= TDB_CHILD | TDB_SCX | TDB_FSTP; ptracestop(td, SIGSTOP, NULL); td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX); } else { /* * ... otherwise clear the request. */ sx_xunlock(&proctree_lock); td->td_dbgflags &= ~TDB_STOPATFORK; cv_broadcast(&p->p_dbgwait); } PROC_UNLOCK(p); } else if (p->p_flag & P_TRACED || td->td_dbgflags & TDB_BORN) { /* * This is the start of a new thread in a traced * process. Report a system call exit event. */ PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; _STOPEVENT(p, S_SCX, td->td_sa.code); if ((p->p_ptevents & PTRACE_SCX) != 0 || (td->td_dbgflags & TDB_BORN) != 0) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~(TDB_SCX | TDB_BORN); PROC_UNLOCK(p); } userret(td, frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(SYS_fork, 0, 0); #endif } Index: projects/numa2/sys/kern/kern_mbuf.c =================================================================== --- projects/numa2/sys/kern/kern_mbuf.c (revision 321505) +++ projects/numa2/sys/kern/kern_mbuf.c (revision 321506) @@ -1,955 +1,956 @@ /*- * Copyright (c) 2004, 2005, * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA * Zones. * * Mbuf Clusters (2K, contiguous) are allocated from the Cluster * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the * administrator so desires. * * Mbufs are allocated from a UMA Master Zone called the Mbuf * Zone. * * Additionally, FreeBSD provides a Packet Zone, which it * configures as a Secondary Zone to the Mbuf Master Zone, * thus sharing backend Slab kegs with the Mbuf Master Zone. * * Thus common-case allocations and locking are simplified: * * m_clget() m_getcl() * | | * | .------------>[(Packet Cache)] m_get(), m_gethdr() * | | [ Packet ] | * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ] * | \________ | * [ Cluster Keg ] \ / * | [ Mbuf Keg ] * [ Cluster Slabs ] | * | [ Mbuf Slabs ] * \____________(VM)_________________/ * * * Whenever an object is allocated with uma_zalloc() out of * one of the Zones its _ctor_ function is executed. The same * for any deallocation through uma_zfree() the _dtor_ function * is executed. * * Caches are per-CPU and are filled from the Master Zone. * * Whenever an object is allocated from the underlying global * memory pool it gets pre-initialized with the _zinit_ functions. * When the Keg's are overfull objects get decommissioned with * _zfini_ functions and free'd back to the global memory pool. * */ int nmbufs; /* limits number of mbufs */ int nmbclusters; /* limits number of mbuf clusters */ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types"); /* * tunable_mbinit() has to be run before any mbuf allocations are done. */ static void tunable_mbinit(void *dummy) { quad_t realmem; /* * The default limit for all mbuf related memory is 1/2 of all * available kernel memory (physical or kmem). * At most it can be 3/4 of available kernel memory. */ realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); maxmbufmem = realmem / 2; TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); if (maxmbufmem > realmem / 4 * 3) maxmbufmem = realmem / 4 * 3; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); if (nmbclusters == 0) nmbclusters = maxmbufmem / MCLBYTES / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); if (nmbjumbop == 0) nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); if (nmbjumbo9 == 0) nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); if (nmbjumbo16 == 0) nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; /* * We need at least as many mbufs as we have clusters of * the various types added together. */ TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) nmbufs = lmax(maxmbufmem / MSIZE / 5, nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); } SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); static int sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) { int error, newnmbclusters; newnmbclusters = nmbclusters; error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { if (newnmbclusters > nmbclusters && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbclusters = newnmbclusters; nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); EVENTHANDLER_INVOKE(nmbclusters_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW, &nmbclusters, 0, sysctl_nmbclusters, "IU", "Maximum number of mbuf clusters allowed"); static int sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbop; newnmbjumbop = nmbjumbop; error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { if (newnmbjumbop > nmbjumbop && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbop = newnmbjumbop; nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbop, 0, sysctl_nmbjumbop, "IU", "Maximum number of mbuf page size jumbo clusters allowed"); static int sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo9; newnmbjumbo9 = nmbjumbo9; error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { if (newnmbjumbo9 > nmbjumbo9 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo9 = newnmbjumbo9; nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", "Maximum number of mbuf 9k jumbo clusters allowed"); static int sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo16; newnmbjumbo16 = nmbjumbo16; error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { if (newnmbjumbo16 > nmbjumbo16 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo16 = newnmbjumbo16; nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU", "Maximum number of mbuf 16k jumbo clusters allowed"); static int sysctl_nmbufs(SYSCTL_HANDLER_ARGS) { int error, newnmbufs; newnmbufs = nmbufs; error = sysctl_handle_int(oidp, &newnmbufs, 0, req); if (error == 0 && req->newptr && newnmbufs != nmbufs) { if (newnmbufs > nmbufs) { nmbufs = newnmbufs; nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); EVENTHANDLER_INVOKE(nmbufs_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW, &nmbufs, 0, sysctl_nmbufs, "IU", "Maximum number of mbufs allowed"); /* * Zones from which we allocate. */ uma_zone_t zone_mbuf; uma_zone_t zone_clust; uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; /* * Local prototypes. */ static int mb_ctor_mbuf(void *, int, void *, int); static int mb_ctor_clust(void *, int, void *, int); static int mb_ctor_pack(void *, int, void *, int); static void mb_dtor_mbuf(void *, int, void *); static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(uma_zone_t, int); -static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); /* * Initialize FreeBSD Network buffer allocation. */ static void mbuf_init(void *dummy) { /* * Configure UMA zones for Mbufs, Clusters, and Packets. */ zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif MSIZE - 1, UMA_ZONE_MAXBUCKET); if (nmbufs > 0) nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); uma_zone_set_maxaction(zone_mbuf, mb_reclaim); zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); if (nmbclusters > 0) nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); uma_zone_set_maxaction(zone_clust, mb_reclaim); zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); /* Make jumbo frame zone too. Page size, 9k and 16k. */ zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); if (nmbjumbop > 0) nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); uma_zone_set_maxaction(zone_jumbop, mb_reclaim); zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc); if (nmbjumbo9 > 0) nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); uma_zone_set_maxaction(zone_jumbo9, mb_reclaim); zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc); if (nmbjumbo16 > 0) nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); /* * Hook event handler for low-memory situation, used to * drain protocols and push data back to the caches (UMA * later pushes it back to VM). */ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); /* * UMA backend page allocator for the jumbo frame zones. * * Allocates kernel virtual memory that is backed by contiguous physical * pages. */ static void * -mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) +mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, + int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); } /* * Constructor for Mbuf master zone. * * The 'arg' pointer points to a mb_args structure which * contains call-specific information required to support the * mbuf allocation API. See mbuf.h. */ static int mb_ctor_mbuf(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error; int flags; short type; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif args = (struct mb_args *)arg; type = args->type; /* * The mbuf is initialized later. The caller has the * responsibility to set up any MAC labels too. */ if (type == MT_NOINIT) return (0); m = (struct mbuf *)mem; flags = args->flags; MPASS((flags & M_NOFREE) == 0); error = m_init(m, how, type, flags); return (error); } /* * The Mbuf master zone destructor. */ static void mb_dtor_mbuf(void *mem, int size, void *arg) { struct mbuf *m; unsigned long flags; m = (struct mbuf *)mem; flags = (unsigned long)arg; KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) m_tag_delete_chain(m, NULL); #ifdef INVARIANTS trash_dtor(mem, size, arg); #endif } /* * The Mbuf Packet zone destructor. */ static void mb_dtor_pack(void *mem, int size, void *arg) { struct mbuf *m; m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); /* Make sure we've got a clean cluster back. */ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); #ifdef INVARIANTS trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); #endif /* * If there are processes blocked on zone_clust, waiting for pages * to be freed up, * cause them to be woken up by draining the * packet zone. We are exposed to a race here * (in the check for * the UMA_ZFLAG_FULL) where we might miss the flag set, but that * is deliberate. We don't want to acquire the zone lock for every * mbuf free. */ if (uma_zone_exhausted_nolock(zone_clust)) zone_drain(zone_pack); } /* * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. * * Here the 'arg' pointer points to the Mbuf which we * are configuring cluster storage for. If 'arg' is * empty we allocate just the cluster without setting * the mbuf to it. See mbuf.h. */ static int mb_ctor_clust(void *mem, int size, void *arg, int how) { struct mbuf *m; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif m = (struct mbuf *)arg; if (m != NULL) { m->m_ext.ext_buf = (caddr_t)mem; m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT; m->m_ext.ext_free = NULL; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = m_gettype(size); m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; } return (0); } /* * The Packet secondary zone's init routine, executed on the * object's transition from mbuf keg slab to zone cache. */ static int mb_zinit_pack(void *mem, int size, int how) { struct mbuf *m; m = (struct mbuf *)mem; /* m is virgin. */ if (uma_zalloc_arg(zone_clust, m, how) == NULL || m->m_ext.ext_buf == NULL) return (ENOMEM); m->m_ext.ext_type = EXT_PACKET; /* Override. */ #ifdef INVARIANTS trash_init(m->m_ext.ext_buf, MCLBYTES, how); #endif return (0); } /* * The Packet secondary zone's fini routine, executed on the * object's transition from zone cache to keg slab. */ static void mb_zfini_pack(void *mem, int size) { struct mbuf *m; m = (struct mbuf *)mem; #ifdef INVARIANTS trash_fini(m->m_ext.ext_buf, MCLBYTES); #endif uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); #ifdef INVARIANTS trash_dtor(mem, size, NULL); #endif } /* * The "packet" keg constructor. */ static int mb_ctor_pack(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error, flags; short type; m = (struct mbuf *)mem; args = (struct mb_args *)arg; flags = args->flags; type = args->type; MPASS((flags & M_NOFREE) == 0); #ifdef INVARIANTS trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); #endif error = m_init(m, how, type, flags); /* m_ext is already initialized. */ m->m_data = m->m_ext.ext_buf; m->m_flags = (flags | M_EXT); return (error); } /* * This is the protocol drain routine. Called by UMA whenever any of the * mbuf zones is closed to its limit. * * No locks should be held when this is called. The drain routines have to * presently acquire some locks which raises the possibility of lock order * reversal. */ static void mb_reclaim(uma_zone_t zone __unused, int pending __unused) { struct domain *dp; struct protosw *pr; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__); for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) (*pr->pr_drain)(); } /* * Clean up after mbufs with M_EXT storage attached to them if the * reference count hits 1. */ void mb_free_ext(struct mbuf *m) { volatile u_int *refcnt; struct mbuf *mref; int freembuf; KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; mref = m; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); } /* * Check if the header is embedded in the cluster. It is * important that we can't touch any of the mbuf fields * after we have freed the external storage, since mbuf * could have been embedded in it. For now, the mbufs * embedded into the cluster are always of type EXT_EXTREF, * and for this type we won't free the mref. */ if (m->m_flags & M_NOFREE) { freembuf = 0; KASSERT(m->m_ext.ext_type == EXT_EXTREF, ("%s: no-free mbuf %p has wrong type", __func__, m)); } else freembuf = 1; /* Free attached storage if this mbuf is the only reference to it. */ if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { switch (m->m_ext.ext_type) { case EXT_PACKET: /* The packet zone is special. */ if (*refcnt == 0) *refcnt = 1; uma_zfree(zone_pack, mref); break; case EXT_CLUSTER: uma_zfree(zone_clust, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBOP: uma_zfree(zone_jumbop, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBO9: uma_zfree(zone_jumbo9, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBO16: uma_zfree(zone_jumbo16, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_SFBUF: sf_ext_free(m->m_ext.ext_arg1, m->m_ext.ext_arg2); uma_zfree(zone_mbuf, mref); break; case EXT_SFBUF_NOCACHE: sf_ext_free_nocache(m->m_ext.ext_arg1, m->m_ext.ext_arg2); uma_zfree(zone_mbuf, mref); break; case EXT_NET_DRV: case EXT_MOD_TYPE: case EXT_DISPOSABLE: KASSERT(m->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); (*(m->m_ext.ext_free))(m, m->m_ext.ext_arg1, m->m_ext.ext_arg2); uma_zfree(zone_mbuf, mref); break; case EXT_EXTREF: KASSERT(m->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); (*(m->m_ext.ext_free))(m, m->m_ext.ext_arg1, m->m_ext.ext_arg2); break; default: KASSERT(m->m_ext.ext_type == 0, ("%s: unknown ext_type", __func__)); } } if (freembuf && m != mref) uma_zfree(zone_mbuf, m); } /* * Official mbuf(9) allocation KPI for stack and drivers: * * m_get() - a single mbuf without any attachments, sys/mbuf.h. * m_gethdr() - a single mbuf initialized as M_PKTHDR, sys/mbuf.h. * m_getcl() - an mbuf + 2k cluster, sys/mbuf.h. * m_clget() - attach cluster to already allocated mbuf. * m_cljget() - attach jumbo cluster to already allocated mbuf. * m_get2() - allocate minimum mbuf that would fit size argument. * m_getm2() - allocate a chain of mbufs/clusters. * m_extadd() - attach external cluster to mbuf. * * m_free() - free single mbuf with its tags and ext, sys/mbuf.h. * m_freem() - free chain of mbufs. */ int m_clget(struct mbuf *m, int how) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = (char *)NULL; uma_zalloc_arg(zone_clust, m, how); /* * On a cluster allocation failure, drain the packet zone and retry, * we might be able to loosen a few clusters up on the drain. */ if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { zone_drain(zone_pack); uma_zalloc_arg(zone_clust, m, how); } MBUF_PROBE2(m__clget, m, how); return (m->m_flags & M_EXT); } /* * m_cljget() is different from m_clget() as it can allocate clusters without * attaching them to an mbuf. In that case the return value is the pointer * to the cluster of the requested size. If an mbuf was specified, it gets * the cluster attached to it and the return value can be safely ignored. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; void *retval; if (m != NULL) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = NULL; } zone = m_getzone(size); retval = uma_zalloc_arg(zone, m, how); MBUF_PROBE4(m__cljget, m, how, size, retval); return (retval); } /* * m_get2() allocates minimum mbuf that would fit "size" argument. */ struct mbuf * m_get2(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; args.flags = flags; args.type = type; if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) return (uma_zalloc_arg(zone_mbuf, &args, how)); if (size <= MCLBYTES) return (uma_zalloc_arg(zone_pack, &args, how)); if (size > MJUMPAGESIZE) return (NULL); m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); n = uma_zalloc_arg(zone_jumbop, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } return (m); } /* * m_getjcl() returns an mbuf with a cluster of the specified size attached. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ struct mbuf * m_getjcl(int how, short type, int flags, int size) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size == MCLBYTES) return m_getcl(how, type, flags); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); zone = m_getzone(size); n = uma_zalloc_arg(zone, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } return (m); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one but still return the top of the newly allocated * chain. */ struct mbuf * m_getm2(struct mbuf *m, int len, int how, short type, int flags) { struct mbuf *mb, *nm = NULL, *mtail = NULL; KASSERT(len >= 0, ("%s: len is < 0", __func__)); /* Validate flags. */ flags &= (M_PKTHDR | M_EOR); /* Packet header mbuf must be first in chain. */ if ((flags & M_PKTHDR) && m != NULL) flags &= ~M_PKTHDR; /* Loop and append maximum sized mbufs to the chain tail. */ while (len > 0) { if (len > MCLBYTES) mb = m_getjcl(how, type, (flags & M_PKTHDR), MJUMPAGESIZE); else if (len >= MINCLSIZE) mb = m_getcl(how, type, (flags & M_PKTHDR)); else if (flags & M_PKTHDR) mb = m_gethdr(how, type); else mb = m_get(how, type); /* Fail the whole operation if one mbuf can't be allocated. */ if (mb == NULL) { if (nm != NULL) m_freem(nm); return (NULL); } /* Book keeping. */ len -= M_SIZE(mb); if (mtail != NULL) mtail->m_next = mb; else nm = mb; mtail = mb; flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ } if (flags & M_EOR) mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ /* If mbuf was supplied, append new chain to the end of it. */ if (m != NULL) { for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) ; mtail->m_next = nm; mtail->m_flags &= ~M_EOR; } else m = nm; return (m); } /*- * Configure a provided mbuf to refer to the provided external storage * buffer and setup a reference count for said buffer. * * Arguments: * mb The existing mbuf to which to attach the provided buffer. * buf The address of the provided external storage buffer. * size The size of the provided buffer. * freef A pointer to a routine that is responsible for freeing the * provided external storage buffer. * args A pointer to an argument structure (of any type) to be passed * to the provided freef routine (may be NULL). * flags Any other flags to be passed to the provided mbuf. * type The type that the external storage buffer should be * labeled with. * * Returns: * Nothing. */ void m_extadd(struct mbuf *mb, caddr_t buf, u_int size, void (*freef)(struct mbuf *, void *, void *), void *arg1, void *arg2, int flags, int type) { KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_arg1 = arg1; mb->m_ext.ext_arg2 = arg2; mb->m_ext.ext_type = type; if (type != EXT_EXTREF) { mb->m_ext.ext_count = 1; mb->m_ext.ext_flags = EXT_FLAG_EMBREF; } else mb->m_ext.ext_flags = 0; } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. */ void m_freem(struct mbuf *mb) { MBUF_PROBE1(m__freem, mb); while (mb != NULL) mb = m_free(mb); } Index: projects/numa2/sys/kern/kern_numa.c =================================================================== --- projects/numa2/sys/kern/kern_numa.c (revision 321505) +++ projects/numa2/sys/kern/kern_numa.c (revision 321506) @@ -1,169 +1,178 @@ /*- * Copyright (c) 2015, Adrian Chadd * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int sys_numa_setaffinity(struct thread *td, struct numa_setaffinity_args *uap) { int error; struct vm_domain_policy vp; struct thread *ttd; struct proc *p; struct cpuset *set; set = NULL; p = NULL; /* * Copy in just the policy information into the policy * struct. Userland only supplies vm_domain_policy_entry. */ error = copyin(uap->policy, &vp.p, sizeof(vp.p)); if (error) goto out; /* * Ensure the seq number is zero - otherwise seq.h * may get very confused. */ vp.seq = 0; /* * Validate policy. */ if (vm_domain_policy_validate(&vp) != 0) { error = EINVAL; goto out; } /* * Go find the desired proc/tid for this operation. */ error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); if (error) goto out; /* Only handle CPU_WHICH_TID and CPU_WHICH_PID */ /* * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset, * it'll return ESRCH. We should just return EINVAL. + * + * XXXMJ nothing synchronizes updates to the thread iterators. */ switch (uap->which) { case CPU_WHICH_TID: vm_domain_policy_copy(&ttd->td_vm_dom_policy, &vp); + vm_domain_iterator_set_policy(&ttd->td_dom_selector, &vp); break; case CPU_WHICH_PID: vm_domain_policy_copy(&p->p_vm_dom_policy, &vp); + PROC_LOCK(p); + FOREACH_THREAD_IN_PROC(p, ttd) { + vm_domain_iterator_set_policy(&ttd->td_dom_selector, + &vp); + } + PROC_UNLOCK(p); break; default: error = EINVAL; break; } PROC_UNLOCK(p); out: if (set) cpuset_rel(set); return (error); } int sys_numa_getaffinity(struct thread *td, struct numa_getaffinity_args *uap) { int error; struct vm_domain_policy vp; struct thread *ttd; struct proc *p; struct cpuset *set; set = NULL; p = NULL; error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); if (error) goto out; /* Only handle CPU_WHICH_TID and CPU_WHICH_PID */ /* * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset, * it'll return ESRCH. We should just return EINVAL. */ switch (uap->which) { case CPU_WHICH_TID: vm_domain_policy_localcopy(&vp, &ttd->td_vm_dom_policy); break; case CPU_WHICH_PID: vm_domain_policy_localcopy(&vp, &p->p_vm_dom_policy); break; default: error = EINVAL; break; } if (p) PROC_UNLOCK(p); /* * Copy out only the vm_domain_policy_entry part. */ if (error == 0) error = copyout(&vp.p, uap->policy, sizeof(vp.p)); out: if (set) cpuset_rel(set); return (error); } Index: projects/numa2/sys/kern/kern_thread.c =================================================================== --- projects/numa2/sys/kern/kern_thread.c (revision 321505) +++ projects/numa2/sys/kern/kern_thread.c (revision 321506) @@ -1,1260 +1,1260 @@ /*- * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include /* * Asserts below verify the stability of struct thread and struct proc * layout, as exposed by KBI to modules. On head, the KBI is allowed * to drift, change to the structures must be accompanied by the * assert update. * * On the stable branches after KBI freeze, conditions must not be * violated. Typically new fields are moved to the end of the * structures. */ #ifdef __amd64__ _Static_assert(offsetof(struct thread, td_flags) == 0xf4, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xfc, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x460, +_Static_assert(offsetof(struct thread, td_frame) == 0x468, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x508, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x510, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0xbc, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x3d0, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x3e0, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8, "struct proc KBI p_emuldata"); #endif #ifdef __i386__ _Static_assert(offsetof(struct thread, td_flags) == 0x9c, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xa4, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x2ec, "struct thread KBI td_frame"); _Static_assert(offsetof(struct thread, td_emuldata) == 0x338, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0x74, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x27c, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x288, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x314, "struct proc KBI p_emuldata"); #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. */ static uma_zone_t thread_zone; TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); static struct mtx zombie_lock; MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); static void thread_zombie(struct thread *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); #define TID_BUFFER_SIZE 1024 struct mtx tid_lock; static struct unrhdr *tid_unrhdr; static lwpid_t tid_buffer[TID_BUFFER_SIZE]; static int tid_head, tid_tail; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); struct tidhashhead *tidhashtbl; u_long tidhash; struct rwlock tidhash_lock; static lwpid_t tid_alloc(void) { lwpid_t tid; tid = alloc_unr(tid_unrhdr); if (tid != -1) return (tid); mtx_lock(&tid_lock); if (tid_head == tid_tail) { mtx_unlock(&tid_lock); return (-1); } tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); return (tid); } static void tid_free(lwpid_t tid) { lwpid_t tmp_tid = -1; mtx_lock(&tid_lock); if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) { tmp_tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; } tid_buffer[tid_tail] = tid; tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); if (tmp_tid != -1) free_unr(tid_unrhdr, tmp_tid); } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_tid = tid_alloc(); /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; EVENTHANDLER_INVOKE(thread_ctor, td); #ifdef AUDIT audit_thread_alloc(td); #endif umtx_thread_alloc(td); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); td_softdep_cleanup(td); MPASS(td->td_su == NULL); EVENTHANDLER_INVOKE(thread_dtor, td); tid_free(td->td_tid); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_INVOKE(thread_init, td); umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); seltdfini(td); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } /* * Initialize global thread allocation resources. */ void threadinit(void) { mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); /* * pid_max cannot be greater than PID_MAX. * leave one number for thread0. */ tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock); thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 32 - 1, UMA_ZONE_NOFREE); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); rw_init(&tidhash_lock, "tidhash"); } /* * Place an unused thread on the zombie list. * Use the slpq as that must be unused by now. */ void thread_zombie(struct thread *td) { mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); mtx_unlock_spin(&zombie_lock); } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombie resources. */ void thread_reap(void) { struct thread *td_first, *td_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant. */ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); thread_cow_free(td_first); thread_free(td_first); td_first = td_next; } } } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; thread_reap(); /* check if any zombies to get */ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); return (NULL); } cpu_thread_alloc(td); vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); vm_domain_policy_cleanup(&td->td_vm_dom_policy); callout_drain(&td->td_slpcallout); uma_zfree(thread_zone, td); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_ucred = crhold(p->p_ucred); newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { newtd->td_ucred = crhold(td->td_ucred); newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { if (td->td_ucred != NULL) crfree(td->td_ucred); if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; struct ucred *oldcred; struct plimit *oldlimit; p = td->td_proc; oldcred = NULL; oldlimit = NULL; PROC_LOCK(p); if (td->td_ucred != p->p_ucred) { oldcred = td->td_ucred; td->td_ucred = crhold(p->p_ucred); } if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); if (oldcred != NULL) crfree(oldcred); if (oldlimit != NULL) lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); SDT_PROBE0(proc, , , lwp__exit); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); #ifdef AUDIT AUDIT_SYSCALL_EXIT(0, td); #endif /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. */ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); VM_CNT_INC(v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); td->td_state = TDS_INACTIVE; #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); callout_drain(&td->td_slpcallout); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) wakeup_swapper |= thread_unsuspend_one(td2, p, true); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, EINTR); break; case SINGLE_BOUNDARY: case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, ERESTART); break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptable, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); } } break; } return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading? */ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); #endif } thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * It is safe to access p->p_singlethread unlocked * because it can only be set to our address by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); umtx_thread_exit(td); kern_thr_exit(td); panic("stopped thread did not exit"); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND, NULL); thread_unlock(td); PROC_LOCK(p); } return (0); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. */ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND, NULL); thread_unlock(td); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td)); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. */ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } struct thread * thread_find(struct proc *p, lwpid_t tid) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } return (td); } /* Locate a thread by number; return with proc lock held. */ struct thread * tdfind(lwpid_t tid, pid_t pid) { #define RUN_THRESH 16 struct thread *td; int run = 0; rw_rlock(&tidhash_lock); LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) { td = NULL; break; } PROC_LOCK(td->td_proc); if (td->td_proc->p_state == PRS_NEW) { PROC_UNLOCK(td->td_proc); td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(&tidhash_lock)) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); return (td); } } break; } run++; } rw_runlock(&tidhash_lock); return (td); } void tidhash_add(struct thread *td) { rw_wlock(&tidhash_lock); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); } void tidhash_remove(struct thread *td) { rw_wlock(&tidhash_lock); LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); } Index: projects/numa2/sys/kern/subr_busdma_bufalloc.c =================================================================== --- projects/numa2/sys/kern/subr_busdma_bufalloc.c (revision 321505) +++ projects/numa2/sys/kern/subr_busdma_bufalloc.c (revision 321506) @@ -1,174 +1,174 @@ /*- * Copyright (c) 2012 Ian Lepore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Buffer allocation support routines for bus_dmamem_alloc implementations. */ #include #include #include #include #include #include #include #include #include /* * We manage buffer zones up to a page in size. Buffers larger than a page can * be managed by one of the kernel's page-oriented memory allocation routines as * efficiently as what we can do here. Also, a page is the largest size for * which we can g'tee contiguity when using uma, and contiguity is one of the * requirements we have to fulfill. */ #define MIN_ZONE_BUFSIZE 32 #define MAX_ZONE_BUFSIZE PAGE_SIZE /* * The static array of 12 bufzones is big enough to handle all the zones for the * smallest supported allocation size of 32 through the largest supported page * size of 64K. If you up the biggest page size number, up the array size too. * Basically the size of the array needs to be log2(maxsize)-log2(minsize)+1, * but I don't know of an easy way to express that as a compile-time constant. */ #if PAGE_SIZE > 65536 #error Unsupported page size #endif struct busdma_bufalloc { bus_size_t min_size; size_t num_zones; struct busdma_bufzone buf_zones[12]; }; busdma_bufalloc_t busdma_bufalloc_create(const char *name, bus_size_t minimum_alignment, uma_alloc alloc_func, uma_free free_func, u_int32_t zcreate_flags) { struct busdma_bufalloc *ba; struct busdma_bufzone *bz; int i; bus_size_t cursize; ba = malloc(sizeof(struct busdma_bufalloc), M_DEVBUF, M_ZERO | M_WAITOK); ba->min_size = MAX(MIN_ZONE_BUFSIZE, minimum_alignment); /* * Each uma zone is created with an alignment of size-1, meaning that * the alignment is equal to the size (I.E., 64 byte buffers are aligned * to 64 byte boundaries, etc). This allows for a fast efficient test * when deciding whether a pool buffer meets the constraints of a given * tag used for allocation: the buffer is usable if tag->alignment <= * bufzone->size. */ for (i = 0, bz = ba->buf_zones, cursize = ba->min_size; i < nitems(ba->buf_zones) && cursize <= MAX_ZONE_BUFSIZE; ++i, ++bz, cursize <<= 1) { snprintf(bz->name, sizeof(bz->name), "dma %.10s %ju", name, (uintmax_t)cursize); bz->size = cursize; bz->umazone = uma_zcreate(bz->name, bz->size, NULL, NULL, NULL, NULL, bz->size - 1, zcreate_flags); if (bz->umazone == NULL) { busdma_bufalloc_destroy(ba); return (NULL); } if (alloc_func != NULL) uma_zone_set_allocf(bz->umazone, alloc_func); if (free_func != NULL) uma_zone_set_freef(bz->umazone, free_func); ++ba->num_zones; } return (ba); } void busdma_bufalloc_destroy(busdma_bufalloc_t ba) { struct busdma_bufzone *bz; int i; if (ba == NULL) return; for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) { uma_zdestroy(bz->umazone); } free(ba, M_DEVBUF); } struct busdma_bufzone * busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size) { struct busdma_bufzone *bz; int i; if (size > MAX_ZONE_BUFSIZE) return (NULL); for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) { if (bz->size >= size) return (bz); } panic("Didn't find a buffer zone of the right size"); } void * -busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, +busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, int domain, uint8_t *pflag, int wait) { #ifdef VM_MEMATTR_UNCACHEABLE /* Inform UMA that this allocator uses kernel_arena/object. */ *pflag = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_attr(kernel_arena, size, wait, 0, BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE)); #else panic("VM_MEMATTR_UNCACHEABLE unavailable"); #endif /* VM_MEMATTR_UNCACHEABLE */ } void busdma_bufalloc_free_uncacheable(void *item, vm_size_t size, uint8_t pflag) { kmem_free(kernel_arena, (vm_offset_t)item, size); } Index: projects/numa2/sys/kern/subr_vmem.c =================================================================== --- projects/numa2/sys/kern/subr_vmem.c (revision 321505) +++ projects/numa2/sys/kern/subr_vmem.c (revision 321506) @@ -1,1586 +1,1587 @@ /*- * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi, * Copyright (c) 2013 EMC Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * From: * $NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $ * $NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $ */ /* * reference: * - Magazines and Vmem: Extending the Slab Allocator * to Many CPUs and Arbitrary Resources * http://www.usenix.org/event/usenix01/bonwick.html */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #define VMEM_OPTORDER 5 #define VMEM_OPTVALUE (1 << VMEM_OPTORDER) #define VMEM_MAXORDER \ (VMEM_OPTVALUE - 1 + sizeof(vmem_size_t) * NBBY - VMEM_OPTORDER) #define VMEM_HASHSIZE_MIN 16 #define VMEM_HASHSIZE_MAX 131072 #define VMEM_QCACHE_IDX_MAX 16 #define VMEM_FITMASK (M_BESTFIT | M_FIRSTFIT) #define VMEM_FLAGS \ (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM | M_BESTFIT | M_FIRSTFIT) #define BT_FLAGS (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM) #define QC_NAME_MAX 16 /* * Data structures private to vmem. */ MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures"); typedef struct vmem_btag bt_t; TAILQ_HEAD(vmem_seglist, vmem_btag); LIST_HEAD(vmem_freelist, vmem_btag); LIST_HEAD(vmem_hashlist, vmem_btag); struct qcache { uma_zone_t qc_cache; vmem_t *qc_vmem; vmem_size_t qc_size; char qc_name[QC_NAME_MAX]; }; typedef struct qcache qcache_t; #define QC_POOL_TO_QCACHE(pool) ((qcache_t *)(pool->pr_qcache)) #define VMEM_NAME_MAX 16 /* vmem arena */ struct vmem { struct mtx_padalign vm_lock; struct cv vm_cv; char vm_name[VMEM_NAME_MAX+1]; LIST_ENTRY(vmem) vm_alllist; struct vmem_hashlist vm_hash0[VMEM_HASHSIZE_MIN]; struct vmem_freelist vm_freelist[VMEM_MAXORDER]; struct vmem_seglist vm_seglist; struct vmem_hashlist *vm_hashlist; vmem_size_t vm_hashsize; /* Constant after init */ vmem_size_t vm_qcache_max; vmem_size_t vm_quantum_mask; vmem_size_t vm_import_quantum; int vm_quantum_shift; /* Written on alloc/free */ LIST_HEAD(, vmem_btag) vm_freetags; int vm_nfreetags; int vm_nbusytag; vmem_size_t vm_inuse; vmem_size_t vm_size; /* Used on import. */ vmem_import_t *vm_importfn; vmem_release_t *vm_releasefn; void *vm_arg; /* Space exhaustion callback. */ vmem_reclaim_t *vm_reclaimfn; /* quantum cache */ qcache_t vm_qcache[VMEM_QCACHE_IDX_MAX]; }; /* boundary tag */ struct vmem_btag { TAILQ_ENTRY(vmem_btag) bt_seglist; union { LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */ LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */ } bt_u; #define bt_hashlist bt_u.u_hashlist #define bt_freelist bt_u.u_freelist vmem_addr_t bt_start; vmem_size_t bt_size; int bt_type; }; #define BT_TYPE_SPAN 1 /* Allocated from importfn */ #define BT_TYPE_SPAN_STATIC 2 /* vmem_add() or create. */ #define BT_TYPE_FREE 3 /* Available space. */ #define BT_TYPE_BUSY 4 /* Used space. */ #define BT_ISSPAN_P(bt) ((bt)->bt_type <= BT_TYPE_SPAN_STATIC) #define BT_END(bt) ((bt)->bt_start + (bt)->bt_size - 1) #if defined(DIAGNOSTIC) static int enable_vmem_check = 1; SYSCTL_INT(_debug, OID_AUTO, vmem_check, CTLFLAG_RWTUN, &enable_vmem_check, 0, "Enable vmem check"); static void vmem_check(vmem_t *); #endif static struct callout vmem_periodic_ch; static int vmem_periodic_interval; static struct task vmem_periodic_wk; static struct mtx_padalign vmem_list_lock; static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list); /* ---- misc */ #define VMEM_CONDVAR_INIT(vm, wchan) cv_init(&vm->vm_cv, wchan) #define VMEM_CONDVAR_DESTROY(vm) cv_destroy(&vm->vm_cv) #define VMEM_CONDVAR_WAIT(vm) cv_wait(&vm->vm_cv, &vm->vm_lock) #define VMEM_CONDVAR_BROADCAST(vm) cv_broadcast(&vm->vm_cv) #define VMEM_LOCK(vm) mtx_lock(&vm->vm_lock) #define VMEM_TRYLOCK(vm) mtx_trylock(&vm->vm_lock) #define VMEM_UNLOCK(vm) mtx_unlock(&vm->vm_lock) #define VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF) #define VMEM_LOCK_DESTROY(vm) mtx_destroy(&vm->vm_lock) #define VMEM_ASSERT_LOCKED(vm) mtx_assert(&vm->vm_lock, MA_OWNED); #define VMEM_ALIGNUP(addr, align) (-(-(addr) & -(align))) #define VMEM_CROSS_P(addr1, addr2, boundary) \ ((((addr1) ^ (addr2)) & -(boundary)) != 0) #define ORDER2SIZE(order) ((order) < VMEM_OPTVALUE ? ((order) + 1) : \ (vmem_size_t)1 << ((order) - (VMEM_OPTVALUE - VMEM_OPTORDER - 1))) #define SIZE2ORDER(size) ((size) <= VMEM_OPTVALUE ? ((size) - 1) : \ (flsl(size) + (VMEM_OPTVALUE - VMEM_OPTORDER - 2))) /* * Maximum number of boundary tags that may be required to satisfy an * allocation. Two may be required to import. Another two may be * required to clip edges. */ #define BT_MAXALLOC 4 /* * Max free limits the number of locally cached boundary tags. We * just want to avoid hitting the zone allocator for every call. */ #define BT_MAXFREE (BT_MAXALLOC * 8) /* Allocator for boundary tags. */ static uma_zone_t vmem_bt_zone; /* boot time arena storage. */ static struct vmem kernel_arena_storage; static struct vmem kmem_arena_storage; static struct vmem buffer_arena_storage; static struct vmem transient_arena_storage; vmem_t *kernel_arena = &kernel_arena_storage; vmem_t *kmem_arena = &kmem_arena_storage; vmem_t *buffer_arena = &buffer_arena_storage; vmem_t *transient_arena = &transient_arena_storage; #ifdef DEBUG_MEMGUARD static struct vmem memguard_arena_storage; vmem_t *memguard_arena = &memguard_arena_storage; #endif /* * Fill the vmem's boundary tag cache. We guarantee that boundary tag * allocation will not fail once bt_fill() passes. To do so we cache * at least the maximum possible tag allocations in the arena. */ static int bt_fill(vmem_t *vm, int flags) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); /* * Only allow the kmem arena to dip into reserve tags. It is the * vmem where new tags come from. */ flags &= BT_FLAGS; if (vm != kmem_arena) flags &= ~M_USE_RESERVE; /* * Loop until we meet the reserve. To minimize the lock shuffle * and prevent simultaneous fills we first try a NOWAIT regardless * of the caller's flags. Specify M_NOVM so we don't recurse while * holding a vmem lock. */ while (vm->vm_nfreetags < BT_MAXALLOC) { bt = uma_zalloc(vmem_bt_zone, (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM); if (bt == NULL) { VMEM_UNLOCK(vm); bt = uma_zalloc(vmem_bt_zone, flags); VMEM_LOCK(vm); if (bt == NULL && (flags & M_NOWAIT) != 0) break; } LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } if (vm->vm_nfreetags < BT_MAXALLOC) return ENOMEM; return 0; } /* * Pop a tag off of the freetag stack. */ static bt_t * bt_alloc(vmem_t *vm) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); bt = LIST_FIRST(&vm->vm_freetags); MPASS(bt != NULL); LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; return bt; } /* * Trim the per-vmem free list. Returns with the lock released to * avoid allocator recursions. */ static void bt_freetrim(vmem_t *vm, int freelimit) { LIST_HEAD(, vmem_btag) freetags; bt_t *bt; LIST_INIT(&freetags); VMEM_ASSERT_LOCKED(vm); while (vm->vm_nfreetags > freelimit) { bt = LIST_FIRST(&vm->vm_freetags); LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; LIST_INSERT_HEAD(&freetags, bt, bt_freelist); } VMEM_UNLOCK(vm); while ((bt = LIST_FIRST(&freetags)) != NULL) { LIST_REMOVE(bt, bt_freelist); uma_zfree(vmem_bt_zone, bt); } } static inline void bt_free(vmem_t *vm, bt_t *bt) { VMEM_ASSERT_LOCKED(vm); MPASS(LIST_FIRST(&vm->vm_freetags) != bt); LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } /* * freelist[0] ... [1, 1] * freelist[1] ... [2, 2] * : * freelist[29] ... [30, 30] * freelist[30] ... [31, 31] * freelist[31] ... [32, 63] * freelist[33] ... [64, 127] * : * freelist[n] ... [(1 << (n - 26)), (1 << (n - 25)) - 1] * : */ static struct vmem_freelist * bt_freehead_tofree(vmem_t *vm, vmem_size_t size) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; const int idx = SIZE2ORDER(qsize); MPASS(size != 0 && qsize != 0); MPASS((size & vm->vm_quantum_mask) == 0); MPASS(idx >= 0); MPASS(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* * bt_freehead_toalloc: return the freelist for the given size and allocation * strategy. * * For M_FIRSTFIT, return the list in which any blocks are large enough * for the requested size. otherwise, return the list which can have blocks * large enough for the requested size. */ static struct vmem_freelist * bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; int idx = SIZE2ORDER(qsize); MPASS(size != 0 && qsize != 0); MPASS((size & vm->vm_quantum_mask) == 0); if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize) { idx++; /* check too large request? */ } MPASS(idx >= 0); MPASS(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* ---- boundary tag hash */ static struct vmem_hashlist * bt_hashhead(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; unsigned int hash; hash = hash32_buf(&addr, sizeof(addr), 0); list = &vm->vm_hashlist[hash % vm->vm_hashsize]; return list; } static bt_t * bt_lookupbusy(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; bt_t *bt; VMEM_ASSERT_LOCKED(vm); list = bt_hashhead(vm, addr); LIST_FOREACH(bt, list, bt_hashlist) { if (bt->bt_start == addr) { break; } } return bt; } static void bt_rembusy(vmem_t *vm, bt_t *bt) { VMEM_ASSERT_LOCKED(vm); MPASS(vm->vm_nbusytag > 0); vm->vm_inuse -= bt->bt_size; vm->vm_nbusytag--; LIST_REMOVE(bt, bt_hashlist); } static void bt_insbusy(vmem_t *vm, bt_t *bt) { struct vmem_hashlist *list; VMEM_ASSERT_LOCKED(vm); MPASS(bt->bt_type == BT_TYPE_BUSY); list = bt_hashhead(vm, bt->bt_start); LIST_INSERT_HEAD(list, bt, bt_hashlist); vm->vm_nbusytag++; vm->vm_inuse += bt->bt_size; } /* ---- boundary tag list */ static void bt_remseg(vmem_t *vm, bt_t *bt) { TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist); bt_free(vm, bt); } static void bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev) { TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist); } static void bt_insseg_tail(vmem_t *vm, bt_t *bt) { TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist); } static void bt_remfree(vmem_t *vm, bt_t *bt) { MPASS(bt->bt_type == BT_TYPE_FREE); LIST_REMOVE(bt, bt_freelist); } static void bt_insfree(vmem_t *vm, bt_t *bt) { struct vmem_freelist *list; list = bt_freehead_tofree(vm, bt->bt_size); LIST_INSERT_HEAD(list, bt, bt_freelist); } /* ---- vmem internal functions */ /* * Import from the arena into the quantum cache in UMA. */ static int -qc_import(void *arg, void **store, int cnt, int flags) +qc_import(void *arg, void **store, int cnt, int domain, int flags) { qcache_t *qc; vmem_addr_t addr; int i; qc = arg; if ((flags & VMEM_FITMASK) == 0) flags |= M_BESTFIT; for (i = 0; i < cnt; i++) { if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, &addr) != 0) break; store[i] = (void *)addr; /* Only guarantee one allocation. */ flags &= ~M_WAITOK; flags |= M_NOWAIT; } return i; } /* * Release memory from the UMA cache to the arena. */ static void qc_release(void *arg, void **store, int cnt) { qcache_t *qc; int i; qc = arg; for (i = 0; i < cnt; i++) vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size); } static void qc_init(vmem_t *vm, vmem_size_t qcache_max) { qcache_t *qc; vmem_size_t size; int qcache_idx_max; int i; MPASS((qcache_max & vm->vm_quantum_mask) == 0); qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift, VMEM_QCACHE_IDX_MAX); vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) { qc = &vm->vm_qcache[i]; size = (i + 1) << vm->vm_quantum_shift; snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu", vm->vm_name, size); qc->qc_vmem = vm; qc->qc_size = size; qc->qc_cache = uma_zcache_create(qc->qc_name, size, NULL, NULL, NULL, NULL, qc_import, qc_release, qc, UMA_ZONE_VM); MPASS(qc->qc_cache); } } static void qc_destroy(vmem_t *vm) { int qcache_idx_max; int i; qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) uma_zdestroy(vm->vm_qcache[i].qc_cache); } static void qc_drain(vmem_t *vm) { int qcache_idx_max; int i; qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) zone_drain(vm->vm_qcache[i].qc_cache); } #ifndef UMA_MD_SMALL_ALLOC static struct mtx_padalign vmem_bt_lock; /* * vmem_bt_alloc: Allocate a new page of boundary tags. * * On architectures with uma_small_alloc there is no recursion; no address * space need be allocated to allocate boundary tags. For the others, we * must handle recursion. Boundary tags are necessary to allocate new * boundary tags. * * UMA guarantees that enough tags are held in reserve to allocate a new * page of kva. We dip into this reserve by specifying M_USE_RESERVE only * when allocating the page to hold new boundary tags. In this way the * reserve is automatically filled by the allocation that uses the reserve. * * We still have to guarantee that the new tags are allocated atomically since * many threads may try concurrently. The bt_lock provides this guarantee. * We convert WAITOK allocations to NOWAIT and then handle the blocking here * on failure. It's ok to return NULL for a WAITOK allocation as UMA will * loop again after checking to see if we lost the race to allocate. * * There is a small race between vmem_bt_alloc() returning the page and the * zone lock being acquired to add the page to the zone. For WAITOK * allocations we just pause briefly. NOWAIT may experience a transient * failure. To alleviate this we permit a small number of simultaneous * fills to proceed concurrently so NOWAIT is less likely to fail unless * we are really out of KVA. */ static void * -vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) +vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, + int wait) { vmem_addr_t addr; *pflag = UMA_SLAB_KMEM; /* * Single thread boundary tag allocation so that the address space * and memory are added in one atomic operation. */ mtx_lock(&vmem_bt_lock); if (vmem_xalloc(kmem_arena, bytes, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT, &addr) == 0) { if (kmem_back(kmem_object, addr, bytes, M_NOWAIT | M_USE_RESERVE) == 0) { mtx_unlock(&vmem_bt_lock); return ((void *)addr); } vmem_xfree(kmem_arena, addr, bytes); mtx_unlock(&vmem_bt_lock); /* * Out of memory, not address space. This may not even be * possible due to M_USE_RESERVE page allocation. */ if (wait & M_WAITOK) VM_WAIT; return (NULL); } mtx_unlock(&vmem_bt_lock); /* * We're either out of address space or lost a fill race. */ if (wait & M_WAITOK) pause("btalloc", 1); return (NULL); } #endif void vmem_startup(void) { mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF); vmem_bt_zone = uma_zcreate("vmem btag", sizeof(struct vmem_btag), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); #ifndef UMA_MD_SMALL_ALLOC mtx_init(&vmem_bt_lock, "btag lock", NULL, MTX_DEF); uma_prealloc(vmem_bt_zone, BT_MAXALLOC); /* * Reserve enough tags to allocate new tags. We allow multiple * CPUs to attempt to allocate new tags concurrently to limit * false restarts in UMA. */ uma_zone_reserve(vmem_bt_zone, BT_MAXALLOC * (mp_ncpus + 1) / 2); uma_zone_set_allocf(vmem_bt_zone, vmem_bt_alloc); #endif } /* ---- rehash */ static int vmem_rehash(vmem_t *vm, vmem_size_t newhashsize) { bt_t *bt; int i; struct vmem_hashlist *newhashlist; struct vmem_hashlist *oldhashlist; vmem_size_t oldhashsize; MPASS(newhashsize > 0); newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize, M_VMEM, M_NOWAIT); if (newhashlist == NULL) return ENOMEM; for (i = 0; i < newhashsize; i++) { LIST_INIT(&newhashlist[i]); } VMEM_LOCK(vm); oldhashlist = vm->vm_hashlist; oldhashsize = vm->vm_hashsize; vm->vm_hashlist = newhashlist; vm->vm_hashsize = newhashsize; if (oldhashlist == NULL) { VMEM_UNLOCK(vm); return 0; } for (i = 0; i < oldhashsize; i++) { while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) { bt_rembusy(vm, bt); bt_insbusy(vm, bt); } } VMEM_UNLOCK(vm); if (oldhashlist != vm->vm_hash0) { free(oldhashlist, M_VMEM); } return 0; } static void vmem_periodic_kick(void *dummy) { taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk); } static void vmem_periodic(void *unused, int pending) { vmem_t *vm; vmem_size_t desired; vmem_size_t current; mtx_lock(&vmem_list_lock); LIST_FOREACH(vm, &vmem_list, vm_alllist) { #ifdef DIAGNOSTIC /* Convenient time to verify vmem state. */ if (enable_vmem_check == 1) { VMEM_LOCK(vm); vmem_check(vm); VMEM_UNLOCK(vm); } #endif desired = 1 << flsl(vm->vm_nbusytag); desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN), VMEM_HASHSIZE_MAX); current = vm->vm_hashsize; /* Grow in powers of two. Shrink less aggressively. */ if (desired >= current * 2 || desired * 4 <= current) vmem_rehash(vm, desired); /* * Periodically wake up threads waiting for resources, * so they could ask for reclamation again. */ VMEM_CONDVAR_BROADCAST(vm); } mtx_unlock(&vmem_list_lock); callout_reset(&vmem_periodic_ch, vmem_periodic_interval, vmem_periodic_kick, NULL); } static void vmem_start_callout(void *unused) { TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL); vmem_periodic_interval = hz * 10; callout_init(&vmem_periodic_ch, 1); callout_reset(&vmem_periodic_ch, vmem_periodic_interval, vmem_periodic_kick, NULL); } SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL); static void vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int type) { bt_t *btspan; bt_t *btfree; MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC); MPASS((size & vm->vm_quantum_mask) == 0); btspan = bt_alloc(vm); btspan->bt_type = type; btspan->bt_start = addr; btspan->bt_size = size; bt_insseg_tail(vm, btspan); btfree = bt_alloc(vm); btfree->bt_type = BT_TYPE_FREE; btfree->bt_start = addr; btfree->bt_size = size; bt_insseg(vm, btfree, btspan); bt_insfree(vm, btfree); vm->vm_size += size; } static void vmem_destroy1(vmem_t *vm) { bt_t *bt; /* * Drain per-cpu quantum caches. */ qc_destroy(vm); /* * The vmem should now only contain empty segments. */ VMEM_LOCK(vm); MPASS(vm->vm_nbusytag == 0); while ((bt = TAILQ_FIRST(&vm->vm_seglist)) != NULL) bt_remseg(vm, bt); if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0) free(vm->vm_hashlist, M_VMEM); bt_freetrim(vm, 0); VMEM_CONDVAR_DESTROY(vm); VMEM_LOCK_DESTROY(vm); free(vm, M_VMEM); } static int vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags) { vmem_addr_t addr; int error; if (vm->vm_importfn == NULL) return EINVAL; /* * To make sure we get a span that meets the alignment we double it * and add the size to the tail. This slightly overestimates. */ if (align != vm->vm_quantum_mask + 1) size = (align * 2) + size; size = roundup(size, vm->vm_import_quantum); /* * Hide MAXALLOC tags so we're guaranteed to be able to add this * span and the tag we want to allocate from it. */ MPASS(vm->vm_nfreetags >= BT_MAXALLOC); vm->vm_nfreetags -= BT_MAXALLOC; VMEM_UNLOCK(vm); error = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr); VMEM_LOCK(vm); vm->vm_nfreetags += BT_MAXALLOC; if (error) return ENOMEM; vmem_add1(vm, addr, size, BT_TYPE_SPAN); return 0; } /* * vmem_fit: check if a bt can satisfy the given restrictions. * * it's a caller's responsibility to ensure the region is big enough * before calling us. */ static int vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align, vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr, vmem_addr_t maxaddr, vmem_addr_t *addrp) { vmem_addr_t start; vmem_addr_t end; MPASS(size > 0); MPASS(bt->bt_size >= size); /* caller's responsibility */ /* * XXX assumption: vmem_addr_t and vmem_size_t are * unsigned integer of the same size. */ start = bt->bt_start; if (start < minaddr) { start = minaddr; } end = BT_END(bt); if (end > maxaddr) end = maxaddr; if (start > end) return (ENOMEM); start = VMEM_ALIGNUP(start - phase, align) + phase; if (start < bt->bt_start) start += align; if (VMEM_CROSS_P(start, start + size - 1, nocross)) { MPASS(align < nocross); start = VMEM_ALIGNUP(start - phase, nocross) + phase; } if (start <= end && end - start >= size - 1) { MPASS((start & (align - 1)) == phase); MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross)); MPASS(minaddr <= start); MPASS(maxaddr == 0 || start + size - 1 <= maxaddr); MPASS(bt->bt_start <= start); MPASS(BT_END(bt) - start >= size - 1); *addrp = start; return (0); } return (ENOMEM); } /* * vmem_clip: Trim the boundary tag edges to the requested start and size. */ static void vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size) { bt_t *btnew; bt_t *btprev; VMEM_ASSERT_LOCKED(vm); MPASS(bt->bt_type == BT_TYPE_FREE); MPASS(bt->bt_size >= size); bt_remfree(vm, bt); if (bt->bt_start != start) { btprev = bt_alloc(vm); btprev->bt_type = BT_TYPE_FREE; btprev->bt_start = bt->bt_start; btprev->bt_size = start - bt->bt_start; bt->bt_start = start; bt->bt_size -= btprev->bt_size; bt_insfree(vm, btprev); bt_insseg(vm, btprev, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); } MPASS(bt->bt_start == start); if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) { /* split */ btnew = bt_alloc(vm); btnew->bt_type = BT_TYPE_BUSY; btnew->bt_start = bt->bt_start; btnew->bt_size = size; bt->bt_start = bt->bt_start + size; bt->bt_size -= size; bt_insfree(vm, bt); bt_insseg(vm, btnew, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); bt_insbusy(vm, btnew); bt = btnew; } else { bt->bt_type = BT_TYPE_BUSY; bt_insbusy(vm, bt); } MPASS(bt->bt_size >= size); bt->bt_type = BT_TYPE_BUSY; } /* ---- vmem API */ void vmem_set_import(vmem_t *vm, vmem_import_t *importfn, vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum) { VMEM_LOCK(vm); vm->vm_importfn = importfn; vm->vm_releasefn = releasefn; vm->vm_arg = arg; vm->vm_import_quantum = import_quantum; VMEM_UNLOCK(vm); } void vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn) { VMEM_LOCK(vm); vm->vm_reclaimfn = reclaimfn; VMEM_UNLOCK(vm); } /* * vmem_init: Initializes vmem arena. */ vmem_t * vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags) { int i; MPASS(quantum > 0); MPASS((quantum & (quantum - 1)) == 0); bzero(vm, sizeof(*vm)); VMEM_CONDVAR_INIT(vm, name); VMEM_LOCK_INIT(vm, name); vm->vm_nfreetags = 0; LIST_INIT(&vm->vm_freetags); strlcpy(vm->vm_name, name, sizeof(vm->vm_name)); vm->vm_quantum_mask = quantum - 1; vm->vm_quantum_shift = flsl(quantum) - 1; vm->vm_nbusytag = 0; vm->vm_size = 0; vm->vm_inuse = 0; qc_init(vm, qcache_max); TAILQ_INIT(&vm->vm_seglist); for (i = 0; i < VMEM_MAXORDER; i++) { LIST_INIT(&vm->vm_freelist[i]); } memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0)); vm->vm_hashsize = VMEM_HASHSIZE_MIN; vm->vm_hashlist = vm->vm_hash0; if (size != 0) { if (vmem_add(vm, base, size, flags) != 0) { vmem_destroy1(vm); return NULL; } } mtx_lock(&vmem_list_lock); LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist); mtx_unlock(&vmem_list_lock); return vm; } /* * vmem_create: create an arena. */ vmem_t * vmem_create(const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags) { vmem_t *vm; vm = malloc(sizeof(*vm), M_VMEM, flags & (M_WAITOK|M_NOWAIT)); if (vm == NULL) return (NULL); if (vmem_init(vm, name, base, size, quantum, qcache_max, flags) == NULL) return (NULL); return (vm); } void vmem_destroy(vmem_t *vm) { mtx_lock(&vmem_list_lock); LIST_REMOVE(vm, vm_alllist); mtx_unlock(&vmem_list_lock); vmem_destroy1(vm); } vmem_size_t vmem_roundup_size(vmem_t *vm, vmem_size_t size) { return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask; } /* * vmem_alloc: allocate resource from the arena. */ int vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp) { const int strat __unused = flags & VMEM_FITMASK; qcache_t *qc; flags &= VMEM_FLAGS; MPASS(size > 0); MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT); if ((flags & M_NOWAIT) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc"); if (size <= vm->vm_qcache_max) { qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift]; *addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache, flags); if (*addrp == 0) return (ENOMEM); return (0); } return vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, addrp); } int vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align, const vmem_size_t phase, const vmem_size_t nocross, const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags, vmem_addr_t *addrp) { const vmem_size_t size = vmem_roundup_size(vm, size0); struct vmem_freelist *list; struct vmem_freelist *first; struct vmem_freelist *end; vmem_size_t avail; bt_t *bt; int error; int strat; flags &= VMEM_FLAGS; strat = flags & VMEM_FITMASK; MPASS(size0 > 0); MPASS(size > 0); MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT); MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK)); if ((flags & M_NOWAIT) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc"); MPASS((align & vm->vm_quantum_mask) == 0); MPASS((align & (align - 1)) == 0); MPASS((phase & vm->vm_quantum_mask) == 0); MPASS((nocross & vm->vm_quantum_mask) == 0); MPASS((nocross & (nocross - 1)) == 0); MPASS((align == 0 && phase == 0) || phase < align); MPASS(nocross == 0 || nocross >= size); MPASS(minaddr <= maxaddr); MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross)); if (align == 0) align = vm->vm_quantum_mask + 1; *addrp = 0; end = &vm->vm_freelist[VMEM_MAXORDER]; /* * choose a free block from which we allocate. */ first = bt_freehead_toalloc(vm, size, strat); VMEM_LOCK(vm); for (;;) { /* * Make sure we have enough tags to complete the * operation. */ if (vm->vm_nfreetags < BT_MAXALLOC && bt_fill(vm, flags) != 0) { error = ENOMEM; break; } /* * Scan freelists looking for a tag that satisfies the * allocation. If we're doing BESTFIT we may encounter * sizes below the request. If we're doing FIRSTFIT we * inspect only the first element from each list. */ for (list = first; list < end; list++) { LIST_FOREACH(bt, list, bt_freelist) { if (bt->bt_size >= size) { error = vmem_fit(bt, size, align, phase, nocross, minaddr, maxaddr, addrp); if (error == 0) { vmem_clip(vm, bt, *addrp, size); goto out; } } /* FIRST skips to the next list. */ if (strat == M_FIRSTFIT) break; } } /* * Retry if the fast algorithm failed. */ if (strat == M_FIRSTFIT) { strat = M_BESTFIT; first = bt_freehead_toalloc(vm, size, strat); continue; } /* * XXX it is possible to fail to meet restrictions with the * imported region. It is up to the user to specify the * import quantum such that it can satisfy any allocation. */ if (vmem_import(vm, size, align, flags) == 0) continue; /* * Try to free some space from the quantum cache or reclaim * functions if available. */ if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) { avail = vm->vm_size - vm->vm_inuse; VMEM_UNLOCK(vm); if (vm->vm_qcache_max != 0) qc_drain(vm); if (vm->vm_reclaimfn != NULL) vm->vm_reclaimfn(vm, flags); VMEM_LOCK(vm); /* If we were successful retry even NOWAIT. */ if (vm->vm_size - vm->vm_inuse > avail) continue; } if ((flags & M_NOWAIT) != 0) { error = ENOMEM; break; } VMEM_CONDVAR_WAIT(vm); } out: VMEM_UNLOCK(vm); if (error != 0 && (flags & M_NOWAIT) == 0) panic("failed to allocate waiting allocation\n"); return (error); } /* * vmem_free: free the resource to the arena. */ void vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { qcache_t *qc; MPASS(size > 0); if (size <= vm->vm_qcache_max) { qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift]; uma_zfree(qc->qc_cache, (void *)addr); } else vmem_xfree(vm, addr, size); } void vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { bt_t *bt; bt_t *t; MPASS(size > 0); VMEM_LOCK(vm); bt = bt_lookupbusy(vm, addr); MPASS(bt != NULL); MPASS(bt->bt_start == addr); MPASS(bt->bt_size == vmem_roundup_size(vm, size) || bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask); MPASS(bt->bt_type == BT_TYPE_BUSY); bt_rembusy(vm, bt); bt->bt_type = BT_TYPE_FREE; /* coalesce */ t = TAILQ_NEXT(bt, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { MPASS(BT_END(bt) < t->bt_start); /* YYY */ bt->bt_size += t->bt_size; bt_remfree(vm, t); bt_remseg(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { MPASS(BT_END(t) < bt->bt_start); /* YYY */ bt->bt_size += t->bt_size; bt->bt_start = t->bt_start; bt_remfree(vm, t); bt_remseg(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); MPASS(t != NULL); MPASS(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY); if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN && t->bt_size == bt->bt_size) { vmem_addr_t spanaddr; vmem_size_t spansize; MPASS(t->bt_start == bt->bt_start); spanaddr = bt->bt_start; spansize = bt->bt_size; bt_remseg(vm, bt); bt_remseg(vm, t); vm->vm_size -= spansize; VMEM_CONDVAR_BROADCAST(vm); bt_freetrim(vm, BT_MAXFREE); (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize); } else { bt_insfree(vm, bt); VMEM_CONDVAR_BROADCAST(vm); bt_freetrim(vm, BT_MAXFREE); } } /* * vmem_add: * */ int vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags) { int error; error = 0; flags &= VMEM_FLAGS; VMEM_LOCK(vm); if (vm->vm_nfreetags >= BT_MAXALLOC || bt_fill(vm, flags) == 0) vmem_add1(vm, addr, size, BT_TYPE_SPAN_STATIC); else error = ENOMEM; VMEM_UNLOCK(vm); return (error); } /* * vmem_size: information about arenas size */ vmem_size_t vmem_size(vmem_t *vm, int typemask) { int i; switch (typemask) { case VMEM_ALLOC: return vm->vm_inuse; case VMEM_FREE: return vm->vm_size - vm->vm_inuse; case VMEM_FREE|VMEM_ALLOC: return vm->vm_size; case VMEM_MAXFREE: VMEM_LOCK(vm); for (i = VMEM_MAXORDER - 1; i >= 0; i--) { if (LIST_EMPTY(&vm->vm_freelist[i])) continue; VMEM_UNLOCK(vm); return ((vmem_size_t)ORDER2SIZE(i) << vm->vm_quantum_shift); } VMEM_UNLOCK(vm); return (0); default: panic("vmem_size"); } } /* ---- debug */ #if defined(DDB) || defined(DIAGNOSTIC) static void bt_dump(const bt_t *, int (*)(const char *, ...) __printflike(1, 2)); static const char * bt_type_string(int type) { switch (type) { case BT_TYPE_BUSY: return "busy"; case BT_TYPE_FREE: return "free"; case BT_TYPE_SPAN: return "span"; case BT_TYPE_SPAN_STATIC: return "static span"; default: break; } return "BOGUS"; } static void bt_dump(const bt_t *bt, int (*pr)(const char *, ...)) { (*pr)("\t%p: %jx %jx, %d(%s)\n", bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size, bt->bt_type, bt_type_string(bt->bt_type)); } static void vmem_dump(const vmem_t *vm , int (*pr)(const char *, ...) __printflike(1, 2)) { const bt_t *bt; int i; (*pr)("vmem %p '%s'\n", vm, vm->vm_name); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { bt_dump(bt, pr); } for (i = 0; i < VMEM_MAXORDER; i++) { const struct vmem_freelist *fl = &vm->vm_freelist[i]; if (LIST_EMPTY(fl)) { continue; } (*pr)("freelist[%d]\n", i); LIST_FOREACH(bt, fl, bt_freelist) { bt_dump(bt, pr); } } } #endif /* defined(DDB) || defined(DIAGNOSTIC) */ #if defined(DDB) #include static bt_t * vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr) { bt_t *bt; TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (BT_ISSPAN_P(bt)) { continue; } if (bt->bt_start <= addr && addr <= BT_END(bt)) { return bt; } } return NULL; } void vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...)) { vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { bt_t *bt; bt = vmem_whatis_lookup(vm, addr); if (bt == NULL) { continue; } (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n", (void *)addr, (void *)bt->bt_start, (vmem_size_t)(addr - bt->bt_start), vm->vm_name, (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free"); } } void vmem_printall(const char *modif, int (*pr)(const char *, ...)) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { vmem_dump(vm, pr); } } void vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...)) { const vmem_t *vm = (const void *)addr; vmem_dump(vm, pr); } DB_SHOW_COMMAND(vmemdump, vmemdump) { if (!have_addr) { db_printf("usage: show vmemdump \n"); return; } vmem_dump((const vmem_t *)addr, db_printf); } DB_SHOW_ALL_COMMAND(vmemdump, vmemdumpall) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) vmem_dump(vm, db_printf); } DB_SHOW_COMMAND(vmem, vmem_summ) { const vmem_t *vm = (const void *)addr; const bt_t *bt; size_t ft[VMEM_MAXORDER], ut[VMEM_MAXORDER]; size_t fs[VMEM_MAXORDER], us[VMEM_MAXORDER]; int ord; if (!have_addr) { db_printf("usage: show vmem \n"); return; } db_printf("vmem %p '%s'\n", vm, vm->vm_name); db_printf("\tquantum:\t%zu\n", vm->vm_quantum_mask + 1); db_printf("\tsize:\t%zu\n", vm->vm_size); db_printf("\tinuse:\t%zu\n", vm->vm_inuse); db_printf("\tfree:\t%zu\n", vm->vm_size - vm->vm_inuse); db_printf("\tbusy tags:\t%d\n", vm->vm_nbusytag); db_printf("\tfree tags:\t%d\n", vm->vm_nfreetags); memset(&ft, 0, sizeof(ft)); memset(&ut, 0, sizeof(ut)); memset(&fs, 0, sizeof(fs)); memset(&us, 0, sizeof(us)); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { ord = SIZE2ORDER(bt->bt_size >> vm->vm_quantum_shift); if (bt->bt_type == BT_TYPE_BUSY) { ut[ord]++; us[ord] += bt->bt_size; } else if (bt->bt_type == BT_TYPE_FREE) { ft[ord]++; fs[ord] += bt->bt_size; } } db_printf("\t\t\tinuse\tsize\t\tfree\tsize\n"); for (ord = 0; ord < VMEM_MAXORDER; ord++) { if (ut[ord] == 0 && ft[ord] == 0) continue; db_printf("\t%-15zu %zu\t%-15zu %zu\t%-16zu\n", ORDER2SIZE(ord) << vm->vm_quantum_shift, ut[ord], us[ord], ft[ord], fs[ord]); } } DB_SHOW_ALL_COMMAND(vmem, vmem_summall) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) vmem_summ((db_expr_t)vm, TRUE, count, modif); } #endif /* defined(DDB) */ #define vmem_printf printf #if defined(DIAGNOSTIC) static bool vmem_check_sanity(vmem_t *vm) { const bt_t *bt, *bt2; MPASS(vm != NULL); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (bt->bt_start > BT_END(bt)) { printf("corrupted tag\n"); bt_dump(bt, vmem_printf); return false; } } TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) { if (bt == bt2) { continue; } if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) { continue; } if (bt->bt_start <= BT_END(bt2) && bt2->bt_start <= BT_END(bt)) { printf("overwrapped tags\n"); bt_dump(bt, vmem_printf); bt_dump(bt2, vmem_printf); return false; } } } return true; } static void vmem_check(vmem_t *vm) { if (!vmem_check_sanity(vm)) { panic("insanity vmem %p", vm); } } #endif /* defined(DIAGNOSTIC) */ Index: projects/numa2/sys/kern/vfs_bio.c =================================================================== --- projects/numa2/sys/kern/vfs_bio.c (revision 321505) +++ projects/numa2/sys/kern/vfs_bio.c (revision 321506) @@ -1,5033 +1,5033 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_compat.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; static struct buf *buf; /* buffer header pool */ extern struct buf *swbuf; /* Swap buffer header pool. */ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int buf_flush(struct vnode *vp, int); static int buf_recycle(bool); static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); -static int buf_import(void *, void **, int, int); +static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); #endif int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static long bufspace; #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); #else SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Physical memory used for buffers"); #endif static long bufkvaspace; SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0, "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0, "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, 0, "Bufspace consumed before waking the daemon to free some"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "Target number of free buffers"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "Threshold for clean buffer recycling"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer acquisition"); static int mappingrestarts; SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static int numbufallocfails; SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static long notbufdflushes; SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, 0, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); int maxbcachebuf = MAXBCACHEBUF; SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, "Maximum size of a buffer cache block"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign rbreqlock; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct rwlock_padalign nblock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request/wakeup point for the bufspace daemon. */ static int bufspace_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static volatile int needsbuffer; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ /* Maximum number of clean buffer queues. */ #define CLEAN_QUEUES 16 /* Configured number of clean queues. */ static int clean_queues; /* Maximum number of buffer queues. */ #define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; #ifdef INVARIANTS static int bq_len[BUFFER_QUEUES]; #endif /* * Lock for each bufqueue */ static struct mtx_padalign bqlocks[BUFFER_QUEUES]; /* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, arg1, arg2, req)); lvalue = *(long *)arg1; if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #endif static int bqcleanq(void) { static int nextq; return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); } static int bqisclean(int qindex) { return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); } /* * bqlock: * * Return the appropriate queue lock based on the index. */ static inline struct mtx * bqlock(int qindex) { return (struct mtx *)&bqlocks[qindex]; } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). */ static void bdirtysub(void) { if (atomic_fetchadd_int(&numdirtybuffers, -1) == (lodirtybuffers + hidirtybuffers) / 2) bdirtywakeup(); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(void) { /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ if (atomic_fetchadd_int(&numdirtybuffers, 1) == (lodirtybuffers + hidirtybuffers) / 2) bd_wakeup(); } /* * bufspace_wakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static void bufspace_wakeup(void) { /* * If someone is waiting for bufspace, wake them up. * * Since needsbuffer is set prior to doing an additional queue * scan it is safe to check for the flag prior to acquiring the * lock. The thread that is preparing to scan again before * blocking would discover the buf we released. */ if (needsbuffer) { rw_rlock(&nblock); if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) wakeup(__DEVOLATILE(void *, &needsbuffer)); rw_runlock(&nblock); } } /* * bufspace_daemonwakeup: * * Wakeup the daemon responsible for freeing clean bufs. */ static void bufspace_daemonwakeup(void) { rw_rlock(&nblock); if (bufspace_request == 0) { bufspace_request = 1; wakeup(&bufspace_request); } rw_runlock(&nblock); } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void bufspace_adjust(struct buf *bp, int bufsize) { long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bufspace, -diff); bufspace_wakeup(); } else { space = atomic_fetchadd_long(&bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + diff >= bufspacethresh) bufspace_daemonwakeup(); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(int size, bool metadata) { long limit; long space; if (metadata) limit = maxbufspace; else limit = hibufspace; do { space = bufspace; if (space + size > limit) return (ENOSPC); } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + size >= bufspacethresh) bufspace_daemonwakeup(); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(int size) { atomic_subtract_long(&bufspace, size); bufspace_wakeup(); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. needsbuffer must be set in a safe fashion prior to * polling for space. The operation must be re-tried on return. */ static void bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; rw_wlock(&nblock); while (needsbuffer != 0) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { rw_wunlock(&nblock); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, flushbufqtarget); td->td_pflags &= norunbuf; rw_wlock(&nblock); if (fl != 0) continue; if (needsbuffer == 0) break; } error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } rw_wunlock(&nblock); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void) { for (;;) { kproc_suspend_check(bufspacedaemonproc); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * demand: * * 1) The daemon wakes up voluntarily once per-second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). * * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ while (bufspace > lobufspace || numfreebuffers < hifreebuffers) { if (buf_recycle(false) != 0) { atomic_set_int(&needsbuffer, 1); if (buf_recycle(false) != 0) { rw_wlock(&nblock); if (needsbuffer) rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, PRIBIO|PDROP, "bufspace", hz/10); else rw_wunlock(&nblock); } } maybe_yield(); } /* * Re-check our limits under the exclusive nblock. */ rw_wlock(&nblock); if (bufspace < bufspacethresh && numfreebuffers > lofreebuffers) { bufspace_request = 0; rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, "-", hz); } else rw_wunlock(&nblock); } } static struct kproc_desc bufspace_kp = { "bufspacedaemon", bufspace_daemon, &bufspacedaemonproc }; SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &bufspace_kp); /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. */ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { VM_OBJECT_ASSERT_LOCKED(m->object); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static __inline void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * Adjust the maxbcachbuf tunable. */ static void maxbcachebuf_adjust(void) { int i; /* * maxbcachebuf must be a power of 2 >= MAXBSIZE. */ i = 2; while (i * 2 <= maxbcachebuf) i *= 2; maxbcachebuf = i; if (maxbcachebuf < MAXBSIZE) maxbcachebuf = MAXBSIZE; if (maxbcachebuf > MAXPHYS) maxbcachebuf = MAXPHYS; if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF) printf("maxbcachebuf=%d\n", maxbcachebuf); } /* * bd_speedup - speedup the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifndef NSWBUF_MIN #define NSWBUF_MIN 16 #endif #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); maxbcachebuf_adjust(); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. * * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > MAXPHYS) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / MAXPHYS; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = min(nbuf / 4, 256); TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); #ifdef INVARIANTS bq_len[QUEUE_EMPTY]++; #endif } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per-iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; /* Setup the kva and free list allocators. */ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. */ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) if (isbufbusy(bp)) nbusy++; if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); #ifdef PREEMPTION /* * Drop Giant and spin for a while to allow * interrupt threads to run. */ DROP_GIANT(); DELAY(50000 * iter); PICKUP_GIANT(); #else /* * Drop Giant and context switch several times to * allow interrupt threads to run. */ DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { thread_lock(curthread); mi_switch(SW_VOL, NULL); thread_unlock(curthread); DELAY(1000); } PICKUP_GIANT(); #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ } else { if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems */ if (panicstr == NULL) vfs_unmountall(); } swapoff_all(); DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } /* * binsfree: * * Insert the buffer into the appropriate free list. */ static void binsfree(struct buf *bp, int qindex) { struct mtx *olock, *nlock; if (qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } /* * Stick to the same clean queue for the lifetime of the buf to * limit locking below. Otherwise pick ont sequentially. */ if (qindex == QUEUE_CLEAN) { if (bqisclean(bp->b_qindex)) qindex = bp->b_qindex; else qindex = bqcleanq(); } /* * Handle delayed bremfree() processing. */ nlock = bqlock(qindex); if (bp->b_flags & B_REMFREE) { olock = bqlock(bp->b_qindex); mtx_lock(olock); bremfreel(bp); if (olock != nlock) { mtx_unlock(olock); mtx_lock(nlock); } } else mtx_lock(nlock); if (bp->b_qindex != QUEUE_NONE) panic("binsfree: free buffer onto another queue???"); bp->b_qindex = qindex; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS bq_len[bp->b_qindex]++; #endif mtx_unlock(nlock); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents. */ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); atomic_add_int(&numfreebuffers, 1); bufspace_wakeup(); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int -buf_import(void *arg, void **store, int cnt, int flags) +buf_import(void *arg, void **store, int cnt, int domain, int flags) { struct buf *bp; int i; mtx_lock(&bqlocks[QUEUE_EMPTY]); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); if (bp == NULL) break; bremfreel(bp); store[i] = bp; } mtx_unlock(&bqlocks[QUEUE_EMPTY]); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { int i; for (i = 0; i < cnt; i++) binsfree(store[i], QUEUE_EMPTY); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(void) { struct buf *bp; bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { bufspace_daemonwakeup(); atomic_add_int(&numbufallocfails, 1); return (NULL); } /* * Wake-up the bufspace daemon on transition. */ if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) bufspace_daemonwakeup(); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf_empty: Locked buf %p on free queue.", bp); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_qrecycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int buf_qrecycle(int qindex, bool kva) { struct buf *bp, *nbp; if (kva) atomic_add_int(&bufdefragcnt, 1); nbp = NULL; mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bremfreel(bp); mtx_unlock(&bqlocks[qindex]); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } mtx_unlock(&bqlocks[qindex]); return (ENOBUFS); } /* * buf_recycle: * * Iterate through all clean queues until we find a buf to recycle or * exhaust the search. */ static int buf_recycle(bool kva) { int qindex, first_qindex; qindex = first_qindex = bqcleanq(); do { if (buf_qrecycle(qindex, kva) == 0) return (0); if (++qindex == QUEUE_CLEAN + clean_queues) qindex = QUEUE_CLEAN; } while (qindex != first_qindex); return (ENOBUFS); } /* * buf_scan: * * Scan the clean queues looking for a buffer to recycle. needsbuffer * is set on failure so that the caller may optionally bufspace_wait() * in a race-free fashion. */ static int buf_scan(bool defrag) { int error; /* * To avoid heavy synchronization and wakeup races we set * needsbuffer and re-poll before failing. This ensures that * no frees can be missed between an unsuccessful poll and * going to sleep in a synchronized fashion. */ if ((error = buf_recycle(defrag)) != 0) { atomic_set_int(&needsbuffer, 1); bufspace_daemonwakeup(); error = buf_recycle(defrag); } if (error == 0) atomic_add_int(&getnewbufrestarts, 1); return (error); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct mtx *qlock; qlock = bqlock(bp->b_qindex); mtx_lock(qlock); bremfreel(bp); mtx_unlock(qlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void bremfreel(struct buf *bp) { CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } mtx_assert(bqlock(bp->b_qindex), MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", bp->b_qindex)); bq_len[bp->b_qindex]--; #endif bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~B_REMFREE; } /* * bufkva_free: * * Free the kva allocation for a buffer. * */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); atomic_subtract_long(&bufkvaspace, bp->b_kvasize); atomic_add_int(&buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. */ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; atomic_add_long(&bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback that fires to avoid returning failure. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { int i; for (i = 0; i < 5; i++) if (buf_scan(true) != 0) break; return; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred) { struct buf *rabp; int i; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rabp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } else { brelse(rabp); } } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. */ int breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp) { struct buf *bp; int rv = 0, readwait = 0; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); /* * Can only return NULL if GB_LOCK_NOWAIT flag is specified. */ *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); if (bp == NULL) return (EBUSY); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } breada(vp, rablkno, rabsize, cnt, cred); if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) barrierwrites++; oldflags = bp->b_flags; BUF_ASSERT_HELD(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; } if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); buf_track(bp, __func__); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buf_track(bp, __func__); /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { mtx_lock(&bdirtylock); while (numdirtybuffers >= hidirtybuffers) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. All errors except ENXIO (which * means the device is gone) are expected to be potentially * transient - underlying media might work if tried again * after EIO, and memory might be available after an ENOMEM. * * Do this also for buffers that failed with ENXIO, but have * non-empty dependencies - the soft updates code might need * to access the buffer to untangle them. * * Must clear BIO_ERROR to prevent pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O, or we were asked to free or not * cache the buffer, or we failed to write to a device that's * no longer present. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } buf_track(bp, __func__); /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; binsfree(bp, qindex); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } binsfree(bp, qindex); out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. */ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp; int i, iosize, resid; bool bogus; obj = bp->b_bufobj->bo_object; KASSERT(obj->paging_in_progress >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", obj->paging_in_progress, bp->b_npages)); vp = bp->b_vp; KASSERT(vp->v_holdcnt > 0, ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); KASSERT(vp->v_object != NULL, ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = false; iosize = bp->b_bcount - bp->b_resid; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = true; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Unwire a page held by a buf and place it on the appropriate vm queue. */ static void vfs_vmio_unwire(struct buf *bp, vm_page_t m) { bool freed; vm_page_lock(m); if (vm_page_unwire(m, PQ_NONE)) { /* * Determine if the page should be freed before adding * it to the inactive queue. */ if (m->valid == 0) { freed = !vm_page_busied(m); if (freed) vm_page_free(m); } else if ((bp->b_flags & B_DIRECT) != 0) freed = vm_page_try_to_free(m); else freed = false; if (!freed) { /* * If the page is unlikely to be reused, let the * VM know. Otherwise, maintain LRU page * ordering and put the page at the tail of the * inactive queue. */ if ((bp->b_flags & B_NOREUSE) != 0) vm_page_deactivate_noreuse(m); else vm_page_deactivate(m); } } vm_page_unlock(m); } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). */ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, "mbncsh", true); VM_OBJECT_WLOCK(obj); } if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vfs_vmio_unwire(bp, m); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. */ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); obj = bp->b_bufobj->bo_object; if (obj != NULL) VM_OBJECT_WLOCK(obj); for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; vfs_vmio_unwire(bp, m); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); while (bp->b_npages < desiredpages) { /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_COUNT(desiredpages - bp->b_npages)); if (m->valid == 0) bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_WUNLOCK(obj); /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; atomic_add_int(&getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(maxsize, metadata) != 0) continue; reserved = true; if ((bp = buf_alloc()) == NULL) continue; if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while(buf_scan(false) == 0); if (reserved) atomic_subtract_long(&bufspace, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, int target) { int flushed; flushed = flushbufqueues(vp, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, target, 1); } return (flushed); } static void buf_daemon() { int lodirty; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); lodirty = lodirtybuffers; if (bd_speedupreq) { lodirty = numdirtybuffers / 2; bd_speedupreq = 0; } /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. */ while (numdirtybuffers > lodirty) { if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, int target, int flushdeps) { struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int queue; int error; bool unlock; flushed = 0; queue = QUEUE_DIRTY; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; mtx_lock(&bqlocks[queue]); TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); while (flushed != target) { maybe_yield(); mtx_lock(&bqlocks[queue]); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, b_freelist); } else { mtx_unlock(&bqlocks[queue]); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { mtx_unlock(&bqlocks[queue]); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); mtx_unlock(&bqlocks[queue]); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); notbufdflushes++; } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp, 0); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } mtx_lock(&bqlocks[queue]); TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_RLOCK(bo); bp = gbincore(bo, blkno); BO_RUNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_RLOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_RUNLOCK(obj); return 1; notinmem: VM_OBJECT_RUNLOCK(obj); return (0); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); vfs_drain_busy_pages(bp); vfs_setdirty_locked_object(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_ASSERT_WLOCKED(object); /* * We qualify the scan for modified pages on whether the * object has been flushed yet. */ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, take * also care to properly setup mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } atomic_add_int(&mappingrestarts, 1); bufspace_wait(bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. */ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > maxbcachebuf) panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, maxbcachebuf); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* If recursed, assume caller knows the rules. */ else if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_RUNLOCK(bo); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return NULL; if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread)) return NULL; bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); } else { maxsize = size; /* Do not allow non-VMIO notmapped buffers. */ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return NULL; /* * XXX This is here until the sleep path is diagnosed * enough to work under very low memory conditions. * * There's an issue on low memory, 4BSD+non-preempt * systems (eg MIPS routers with 32MB RAM) where buffer * exhaustion occurs without sleeping for buffer * reclaimation. This just sticks in a loop and * constantly attempts to allocate a buffer, which * hits exhaustion and tries to wakeup bufdaemon. * This never happens because we never yield. * * The real solution is to identify and fix these cases * so we aren't effectively busy-waiting in a loop * until the reclaimation path has cycles to run. */ kern_yield(PRI_USER); goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; brelse(bp); bufspace_release(maxsize); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); BUF_ASSERT_HELD(bp); end: buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ BUF_ASSERT_HELD(bp); return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. * * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize; BUF_ASSERT_HELD(bp); if (bp->b_bcount == size) return (1); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; biotrack(bp, __func__); if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, atop(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else done(bp); } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location) { buf_track(bp->bio_track_bp, location); } #endif /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); BUF_ASSERT_HELD(bp); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { BUF_ASSERT_HELD(bp); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Ensure that all buffer pages are not exclusive busied. If any page is * exclusive busy, drain it. */ void vfs_drain_busy_pages(struct buf *bp) { vm_page_t m; int i, last_busied; VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); last_busied = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_xbusied(m)) { for (; last_busied < i; last_busied++) vm_page_sbusy(bp->b_pages[last_busied]); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); vm_page_busy_sleep(m, "vbpage", true); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); } } } for (i = 0; i < last_busied; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { vm_object_t obj; vm_ooffset_t foff; vm_page_t m; int i; bool bogus; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_WLOCK(obj); vfs_drain_busy_pages(bp); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); bogus = false; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_sbusy(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * Update buffer flags based on I/O request parameters, optionally releasing the * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, * where they may be placed on a page queue (VMIO) or freed immediately (direct * I/O). Otherwise the buffer is released to the cache. */ static void b_io_dismiss(struct buf *bp, int ioflag, bool release) { KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, ("buf %p non-VMIO noreuse", bp)); if ((ioflag & IO_DIRECT) != 0) bp->b_flags |= B_DIRECT; if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; if ((ioflag & IO_NOREUSE) != 0) bp->b_flags |= B_NOREUSE; if (release) brelse(bp); } else if (release) bqrelse(bp); } void vfs_bio_brelse(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, true); } void vfs_bio_set_flags(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, false); } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT)); if (p == NULL) { VM_WAIT; goto tryagain; } pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; if (vm_page_sbusied(p)) printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); p->wire_count--; vm_page_free(p); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, int mapbuf) { vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. */ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } /* * The MIPS pmap code currently doesn't handle aliased pages. * The VIPT caches may not handle page aliasing themselves, leading * to data corruption. * * As such, this code makes a system extremely unhappy if said * system doesn't support unaliasing the above situation in hardware. * Some "recent" systems (eg some mips24k/mips74k cores) don't enable * this feature at build time, so it has to be handled in software. * * Once the MIPS pmap/cache code grows to support this function on * earlier chips, it should be flipped back off. */ #ifdef __mips__ static int buf_pager_relbuf = 1; #else static int buf_pager_relbuf = 0; #endif SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. * * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. */ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; struct mount *mp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bsize; int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b; bool redo, lpart; object = vp->v_object; mp = vp->v_mount; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size; bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); /* * Calculate read-ahead, behind and total pages. */ pgsin = count; lb = IDX_TO_OFF(ma[0]->pindex); pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); pgsin += pgsin_b; if (rbehind != NULL) *rbehind = pgsin_b; pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, PAGE_SIZE) - la); pgsin += pgsin_a; if (rahead != NULL) *rahead = pgsin_a; VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, pgsin); br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) ? GB_UNMAPPED : 0; VM_OBJECT_WLOCK(object); again: for (i = 0; i < count; i++) vm_page_busy_downgrade(ma[i]); VM_OBJECT_WUNLOCK(object); lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). */ if (m->valid == VM_PAGE_BITS_ALL) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; bsize = get_blksize(vp, lbn); error = bread_gb(vp, lbn, bsize, curthread->td_ucred, br_flags, &bp); if (error != 0) goto end_pages; if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, enable to reduce buf * cache pressure. */ if (buf_pager_relbuf || m->valid != VM_PAGE_BITS_ALL) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || m->valid == VM_PAGE_BITS_ALL || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { VM_OBJECT_WLOCK(object); if (m->valid != 0 && m->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m, TRUE); VM_OBJECT_WUNLOCK(object); } next_page:; } end_pages: VM_OBJECT_WLOCK(object); redo = false; for (i = 0; i < count; i++) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinguish, they could have been invalidated. * Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. */ if (ma[i]->valid != VM_PAGE_BITS_ALL) redo = true; } if (redo && error == 0) goto again; VM_OBJECT_WUNLOCK(object); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; #ifdef FULL_BUF_TRACKING uint32_t i, j; #endif if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " "b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? )"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); i = bp->b_io_tcnt % BUF_TRACKING_SIZE; for (j = 1; j <= BUF_TRACKING_SIZE; j++) { if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) continue; db_printf(" %2u: %s\n", j, bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); } #elif defined(BUF_TRACKING) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); BUF_LOCKPRINTINFO(bp); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); if (db_pager_quit) break; } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: projects/numa2/sys/mips/mips/pmap.c =================================================================== --- projects/numa2/sys/mips/mips/pmap.c (revision 321505) +++ projects/numa2/sys/mips/mips/pmap.c (revision 321506) @@ -1,3643 +1,3643 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: src/sys/i386/i386/pmap.c,v 1.250.2.8 2000/11/21 00:09:14 ps * JNPR: pmap.c,v 1.11.2.1 2007/08/16 11:51:06 girish */ /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #undef PMAP_DEBUG #if !defined(DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_seg_index(v) (((v) >> SEGSHIFT) & (NPDEPG - 1)) #define pmap_pde_index(v) (((v) >> PDRSHIFT) & (NPDEPG - 1)) #define pmap_pte_index(v) (((v) >> PAGE_SHIFT) & (NPTEPG - 1)) #define pmap_pde_pindex(v) ((v) >> PDRSHIFT) #ifdef __mips_n64 #define NUPDE (NPDEPG * NPDEPG) #define NUSERPGTBLS (NUPDE + NPDEPG) #else #define NUPDE (NPDEPG) #define NUSERPGTBLS (NUPDE) #endif #define is_kernel_pmap(x) ((x) == kernel_pmap) struct pmap kernel_pmap_store; pd_entry_t *kernel_segmap; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static int nkpt; unsigned pmap_max_asid; /* max ASID supported by the system */ #define PMAP_ASID_RESERVED 0 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; static void pmap_asid_alloc(pmap_t pmap); static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static vm_page_t pmap_alloc_direct_page(unsigned int index, int req); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_grow_direct_page(int req); static int pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t pde); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_page_t mpte, vm_offset_t va, vm_page_t m); static void pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte); static void pmap_invalidate_all(pmap_t pmap); static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t); static pt_entry_t init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot); static void pmap_invalidate_page_action(void *arg); static void pmap_invalidate_range_action(void *arg); static void pmap_update_page_action(void *arg); #ifndef __mips_n64 /* * This structure is for high memory (memory above 512Meg in 32 bit) support. * The highmem area does not have a KSEG0 mapping, and we need a mechanism to * do temporary per-CPU mappings for pmap_zero_page, pmap_copy_page etc. * * At bootup, we reserve 2 virtual pages per CPU for mapping highmem pages. To * access a highmem physical address on a CPU, we map the physical address to * the reserved virtual address for the CPU in the kernel pagetable. This is * done with interrupts disabled(although a spinlock and sched_pin would be * sufficient). */ struct local_sysmaps { vm_offset_t base; uint32_t saved_intr; uint16_t valid1, valid2; }; static struct local_sysmaps sysmap_lmem[MAXCPU]; static __inline void pmap_alloc_lmem_map(void) { int i; for (i = 0; i < MAXCPU; i++) { sysmap_lmem[i].base = virtual_avail; virtual_avail += PAGE_SIZE * 2; sysmap_lmem[i].valid1 = sysmap_lmem[i].valid2 = 0; } } static __inline vm_offset_t pmap_lmem_map1(vm_paddr_t phys) { struct local_sysmaps *sysm; pt_entry_t *pte, npte; vm_offset_t va; uint32_t intr; int cpu; intr = intr_disable(); cpu = PCPU_GET(cpuid); sysm = &sysmap_lmem[cpu]; sysm->saved_intr = intr; va = sysm->base; npte = TLBLO_PA_TO_PFN(phys) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; pte = pmap_pte(kernel_pmap, va); *pte = npte; sysm->valid1 = 1; return (va); } static __inline vm_offset_t pmap_lmem_map2(vm_paddr_t phys1, vm_paddr_t phys2) { struct local_sysmaps *sysm; pt_entry_t *pte, npte; vm_offset_t va1, va2; uint32_t intr; int cpu; intr = intr_disable(); cpu = PCPU_GET(cpuid); sysm = &sysmap_lmem[cpu]; sysm->saved_intr = intr; va1 = sysm->base; va2 = sysm->base + PAGE_SIZE; npte = TLBLO_PA_TO_PFN(phys1) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; pte = pmap_pte(kernel_pmap, va1); *pte = npte; npte = TLBLO_PA_TO_PFN(phys2) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; pte = pmap_pte(kernel_pmap, va2); *pte = npte; sysm->valid1 = 1; sysm->valid2 = 1; return (va1); } static __inline void pmap_lmem_unmap(void) { struct local_sysmaps *sysm; pt_entry_t *pte; int cpu; cpu = PCPU_GET(cpuid); sysm = &sysmap_lmem[cpu]; pte = pmap_pte(kernel_pmap, sysm->base); *pte = PTE_G; tlb_invalidate_address(kernel_pmap, sysm->base); sysm->valid1 = 0; if (sysm->valid2) { pte = pmap_pte(kernel_pmap, sysm->base + PAGE_SIZE); *pte = PTE_G; tlb_invalidate_address(kernel_pmap, sysm->base + PAGE_SIZE); sysm->valid2 = 0; } intr_restore(sysm->saved_intr); } #else /* __mips_n64 */ static __inline void pmap_alloc_lmem_map(void) { } static __inline vm_offset_t pmap_lmem_map1(vm_paddr_t phys) { return (0); } static __inline vm_offset_t pmap_lmem_map2(vm_paddr_t phys1, vm_paddr_t phys2) { return (0); } static __inline vm_offset_t pmap_lmem_unmap(void) { return (0); } #endif /* !__mips_n64 */ static __inline int pmap_pte_cache_bits(vm_paddr_t pa, vm_page_t m) { vm_memattr_t ma; ma = pmap_page_get_memattr(m); if (ma == VM_MEMATTR_WRITE_BACK && !is_cacheable_mem(pa)) ma = VM_MEMATTR_UNCACHEABLE; return PTE_C(ma); } #define PMAP_PTE_SET_CACHE_BITS(pte, ps, m) { \ pte &= ~PTE_C_MASK; \ pte |= pmap_pte_cache_bits(pa, m); \ } /* * Page table entry lookup routines. */ static __inline pd_entry_t * pmap_segmap(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_segtab[pmap_seg_index(va)]); } #ifdef __mips_n64 static __inline pd_entry_t * pmap_pdpe_to_pde(pd_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)*pdpe; return (&pde[pmap_pde_index(va)]); } static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pd_entry_t *pdpe; pdpe = pmap_segmap(pmap, va); if (*pdpe == NULL) return (NULL); return (pmap_pdpe_to_pde(pdpe, va)); } #else static __inline pd_entry_t * pmap_pdpe_to_pde(pd_entry_t *pdpe, vm_offset_t va) { return (pdpe); } static __inline pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va) { return (pmap_segmap(pmap, va)); } #endif static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)*pde; return (&pte[pmap_pte_index(va)]); } pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pde = pmap_pde(pmap, va); if (pde == NULL || *pde == NULL) return (NULL); return (pmap_pde_to_pte(pde, va)); } vm_offset_t pmap_steal_memory(vm_size_t size) { vm_paddr_t bank_size, pa; vm_offset_t va; size = round_page(size); bank_size = phys_avail[1] - phys_avail[0]; while (size > bank_size) { int i; for (i = 0; phys_avail[i + 2]; i += 2) { phys_avail[i] = phys_avail[i + 2]; phys_avail[i + 1] = phys_avail[i + 3]; } phys_avail[i] = 0; phys_avail[i + 1] = 0; if (!phys_avail[0]) panic("pmap_steal_memory: out of memory"); bank_size = phys_avail[1] - phys_avail[0]; } pa = phys_avail[0]; phys_avail[0] += size; if (MIPS_DIRECT_MAPPABLE(pa) == 0) panic("Out of memory below 512Meg?"); va = MIPS_PHYS_TO_DIRECT(pa); bzero((caddr_t)va, size); return (va); } /* * Bootstrap the system enough to run with virtual memory. This * assumes that the phys_avail array has been initialized. */ static void pmap_create_kernel_pagetable(void) { int i, j; vm_offset_t ptaddr; pt_entry_t *pte; #ifdef __mips_n64 pd_entry_t *pde; vm_offset_t pdaddr; int npt, npde; #endif /* * Allocate segment table for the kernel */ kernel_segmap = (pd_entry_t *)pmap_steal_memory(PAGE_SIZE); /* * Allocate second level page tables for the kernel */ #ifdef __mips_n64 npde = howmany(NKPT, NPDEPG); pdaddr = pmap_steal_memory(PAGE_SIZE * npde); #endif nkpt = NKPT; ptaddr = pmap_steal_memory(PAGE_SIZE * nkpt); /* * The R[4-7]?00 stores only one copy of the Global bit in the * translation lookaside buffer for each 2 page entry. Thus invalid * entrys must have the Global bit set so when Entry LO and Entry HI * G bits are anded together they will produce a global bit to store * in the tlb. */ for (i = 0, pte = (pt_entry_t *)ptaddr; i < (nkpt * NPTEPG); i++, pte++) *pte = PTE_G; #ifdef __mips_n64 for (i = 0, npt = nkpt; npt > 0; i++) { kernel_segmap[i] = (pd_entry_t)(pdaddr + i * PAGE_SIZE); pde = (pd_entry_t *)kernel_segmap[i]; for (j = 0; j < NPDEPG && npt > 0; j++, npt--) pde[j] = (pd_entry_t)(ptaddr + (i * NPDEPG + j) * PAGE_SIZE); } #else for (i = 0, j = pmap_seg_index(VM_MIN_KERNEL_ADDRESS); i < nkpt; i++, j++) kernel_segmap[j] = (pd_entry_t)(ptaddr + (i * PAGE_SIZE)); #endif PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_segtab = kernel_segmap; CPU_FILL(&kernel_pmap->pm_active); TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_asid[0].asid = PMAP_ASID_RESERVED; kernel_pmap->pm_asid[0].gen = 0; kernel_vm_end += nkpt * NPTEPG * PAGE_SIZE; } void pmap_bootstrap(void) { int i; int need_local_mappings = 0; /* Sort. */ again: for (i = 0; phys_avail[i + 1] != 0; i += 2) { /* * Keep the memory aligned on page boundary. */ phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); if (i < 2) continue; if (phys_avail[i - 2] > phys_avail[i]) { vm_paddr_t ptemp[2]; ptemp[0] = phys_avail[i + 0]; ptemp[1] = phys_avail[i + 1]; phys_avail[i + 0] = phys_avail[i - 2]; phys_avail[i + 1] = phys_avail[i - 1]; phys_avail[i - 2] = ptemp[0]; phys_avail[i - 1] = ptemp[1]; goto again; } } /* * In 32 bit, we may have memory which cannot be mapped directly. * This memory will need temporary mapping before it can be * accessed. */ if (!MIPS_DIRECT_MAPPABLE(phys_avail[i - 1] - 1)) need_local_mappings = 1; /* * Copy the phys_avail[] array before we start stealing memory from it. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { physmem_desc[i] = phys_avail[i]; physmem_desc[i + 1] = phys_avail[i + 1]; } Maxmem = atop(phys_avail[i - 1]); if (bootverbose) { printf("Physical memory chunk(s):\n"); for (i = 0; phys_avail[i + 1] != 0; i += 2) { vm_paddr_t size; size = phys_avail[i + 1] - phys_avail[i]; printf("%#08jx - %#08jx, %ju bytes (%ju pages)\n", (uintmax_t) phys_avail[i], (uintmax_t) phys_avail[i + 1] - 1, (uintmax_t) size, (uintmax_t) size / PAGE_SIZE); } printf("Maxmem is 0x%0jx\n", ptoa((uintmax_t)Maxmem)); } /* * Steal the message buffer from the beginning of memory. */ msgbufp = (struct msgbuf *)pmap_steal_memory(msgbufsize); msgbufinit(msgbufp, msgbufsize); /* * Steal thread0 kstack. */ kstack0 = pmap_steal_memory(KSTACK_PAGES << PAGE_SHIFT); virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_KERNEL_ADDRESS; #ifdef SMP /* * Steal some virtual address space to map the pcpu area. */ virtual_avail = roundup2(virtual_avail, PAGE_SIZE * 2); pcpup = (struct pcpu *)virtual_avail; virtual_avail += PAGE_SIZE * 2; /* * Initialize the wired TLB entry mapping the pcpu region for * the BSP at 'pcpup'. Up until this point we were operating * with the 'pcpup' for the BSP pointing to a virtual address * in KSEG0 so there was no need for a TLB mapping. */ mips_pcpu_tlb_init(PCPU_ADDR(0)); if (bootverbose) printf("pcpu is available at virtual address %p.\n", pcpup); #endif if (need_local_mappings) pmap_alloc_lmem_map(); pmap_create_kernel_pagetable(); pmap_max_asid = VMNUM_PIDS; mips_wr_entryhi(0); mips_wr_pagemask(0); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_flags = VM_MEMATTR_DEFAULT << PV_MEMATTR_SHIFT; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { } /*************************************************** * Low level helper routines..... ***************************************************/ #ifdef SMP static __inline void pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) { int cpuid, cpu, self; cpuset_t active_cpus; sched_pin(); if (is_kernel_pmap(pmap)) { smp_rendezvous(NULL, fn, NULL, arg); goto out; } /* Force ASID update on inactive CPUs */ CPU_FOREACH(cpu) { if (!CPU_ISSET(cpu, &pmap->pm_active)) pmap->pm_asid[cpu].gen = 0; } cpuid = PCPU_GET(cpuid); /* * XXX: barrier/locking for active? * * Take a snapshot of active here, any further changes are ignored. * tlb update/invalidate should be harmless on inactive CPUs */ active_cpus = pmap->pm_active; self = CPU_ISSET(cpuid, &active_cpus); CPU_CLR(cpuid, &active_cpus); /* Optimize for the case where this cpu is the only active one */ if (CPU_EMPTY(&active_cpus)) { if (self) fn(arg); } else { if (self) CPU_SET(cpuid, &active_cpus); smp_rendezvous_cpus(active_cpus, NULL, fn, NULL, arg); } out: sched_unpin(); } #else /* !SMP */ static __inline void pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) { int cpuid; if (is_kernel_pmap(pmap)) { fn(arg); return; } cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &pmap->pm_active)) pmap->pm_asid[cpuid].gen = 0; else fn(arg); } #endif /* SMP */ static void pmap_invalidate_all(pmap_t pmap) { pmap_call_on_active_cpus(pmap, (void (*)(void *))tlb_invalidate_all_user, pmap); } struct pmap_invalidate_page_arg { pmap_t pmap; vm_offset_t va; }; static void pmap_invalidate_page_action(void *arg) { struct pmap_invalidate_page_arg *p = arg; tlb_invalidate_address(p->pmap, p->va); } static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct pmap_invalidate_page_arg arg; arg.pmap = pmap; arg.va = va; pmap_call_on_active_cpus(pmap, pmap_invalidate_page_action, &arg); } struct pmap_invalidate_range_arg { pmap_t pmap; vm_offset_t sva; vm_offset_t eva; }; static void pmap_invalidate_range_action(void *arg) { struct pmap_invalidate_range_arg *p = arg; tlb_invalidate_range(p->pmap, p->sva, p->eva); } static void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct pmap_invalidate_range_arg arg; arg.pmap = pmap; arg.sva = sva; arg.eva = eva; pmap_call_on_active_cpus(pmap, pmap_invalidate_range_action, &arg); } struct pmap_update_page_arg { pmap_t pmap; vm_offset_t va; pt_entry_t pte; }; static void pmap_update_page_action(void *arg) { struct pmap_update_page_arg *p = arg; tlb_update(p->pmap, p->va, p->pte); } static void pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte) { struct pmap_update_page_arg arg; arg.pmap = pmap; arg.va = va; arg.pte = pte; pmap_call_on_active_cpus(pmap, pmap_update_page_action, &arg); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; vm_offset_t retval = 0; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (pte) { retval = TLBLO_PTE_TO_PA(*pte) | (va & PAGE_MASK); } PMAP_UNLOCK(pmap); return (retval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t pte, *ptep; vm_paddr_t pa, pte_pa; vm_page_t m; m = NULL; pa = 0; PMAP_LOCK(pmap); retry: ptep = pmap_pte(pmap, va); if (ptep != NULL) { pte = *ptep; if (pte_test(&pte, PTE_V) && (!pte_test(&pte, PTE_RO) || (prot & VM_PROT_WRITE) == 0)) { pte_pa = TLBLO_PTE_TO_PA(pte); if (vm_page_pa_tryrelock(pmap, pte_pa, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte_pa); vm_page_hold(m); } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * add a wired page to the kva */ void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { pt_entry_t *pte; pt_entry_t opte, npte; #ifdef PMAP_DEBUG printf("pmap_kenter: va: %p -> pa: %p\n", (void *)va, (void *)pa); #endif pte = pmap_pte(kernel_pmap, va); opte = *pte; npte = TLBLO_PA_TO_PFN(pa) | PTE_C(ma) | PTE_D | PTE_V | PTE_G; *pte = npte; if (pte_test(&opte, PTE_V) && opte != npte) pmap_update_page(kernel_pmap, va, npte); } void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { KASSERT(is_cacheable_mem(pa), ("pmap_kenter: memory at 0x%lx is not cacheable", (u_long)pa)); pmap_kenter_attr(va, pa, VM_MEMATTR_DEFAULT); } /* * remove a page from the kernel pagetables */ /* PMAP_INLINE */ void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; /* * Write back all caches from the page being destroyed */ mips_dcache_wbinv_range_index(va, PAGE_SIZE); pte = pmap_pte(kernel_pmap, va); *pte = PTE_G; pmap_invalidate_page(kernel_pmap, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; if (MIPS_DIRECT_MAPPABLE(end - 1)) return (MIPS_PHYS_TO_DIRECT(start)); va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; vm_offset_t origva = va; for (i = 0; i < count; i++) { pmap_flush_pvcache(m[i]); pmap_kenter(va, VM_PAGE_TO_PHYS(m[i])); va += PAGE_SIZE; } mips_dcache_wbinv_range_index(origva, PAGE_SIZE*count); } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t va, int count) { pt_entry_t *pte; vm_offset_t origva; if (count < 1) return; mips_dcache_wbinv_range_index(va, PAGE_SIZE * count); origva = va; do { pte = pmap_pte(kernel_pmap, va); *pte = PTE_G; va += PAGE_SIZE; } while (--count > 0); pmap_invalidate_range(kernel_pmap, origva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static PMAP_INLINE boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) { pd_entry_t *pde; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ #ifdef __mips_n64 if (m->pindex < NUPDE) pde = pmap_pde(pmap, va); else pde = pmap_segmap(pmap, va); #else pde = pmap_pde(pmap, va); #endif *pde = 0; pmap->pm_stats.resident_count--; #ifdef __mips_n64 if (m->pindex < NUPDE) { pd_entry_t *pdp; vm_page_t pdpg; /* * Recursively decrement next level pagetable refcount */ pdp = (pd_entry_t *)*pmap_segmap(pmap, va); pdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(pdp)); pmap_unwire_ptp(pmap, va, pdpg); } #endif /* * If the page is finally unwired, simply free it. */ vm_page_free_zero(m); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(pde != 0, ("pmap_unuse_pt: pde != 0")); mpte = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(pde)); return (pmap_unwire_ptp(pmap, va, mpte)); } void pmap_pinit0(pmap_t pmap) { int i; PMAP_LOCK_INIT(pmap); pmap->pm_segtab = kernel_segmap; CPU_ZERO(&pmap->pm_active); for (i = 0; i < MAXCPU; i++) { pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; pmap->pm_asid[i].gen = 0; } PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } static void pmap_grow_direct_page(int req) { #ifdef __mips_n64 VM_WAIT; #else if (!vm_page_reclaim_contig(req, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) VM_WAIT; #endif } static vm_page_t pmap_alloc_direct_page(unsigned int index, int req) { vm_page_t m; - m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, req | VM_ALLOC_WIRED | + m = vm_page_alloc_freelist(0, VM_FREELIST_DIRECT, req | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) return (NULL); if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->pindex = index; return (m); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_offset_t ptdva; vm_page_t ptdpg; int i, req_class; /* * allocate the page directory page */ req_class = VM_ALLOC_NORMAL; while ((ptdpg = pmap_alloc_direct_page(NUSERPGTBLS, req_class)) == NULL) pmap_grow_direct_page(req_class); ptdva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(ptdpg)); pmap->pm_segtab = (pd_entry_t *)ptdva; CPU_ZERO(&pmap->pm_active); for (i = 0; i < MAXCPU; i++) { pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; pmap->pm_asid[i].gen = 0; } TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags) { vm_offset_t pageva; vm_page_t m; int req_class; /* * Find or fabricate a new pagetable page */ req_class = VM_ALLOC_NORMAL; if ((m = pmap_alloc_direct_page(ptepindex, req_class)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); pmap_grow_direct_page(req_class); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page * table page may have been allocated. */ return (NULL); } /* * Map the pagetable page into the process address space, if it * isn't already there. */ pageva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); #ifdef __mips_n64 if (ptepindex >= NUPDE) { pmap->pm_segtab[ptepindex - NUPDE] = (pd_entry_t)pageva; } else { pd_entry_t *pdep, *pde; int segindex = ptepindex >> (SEGSHIFT - PDRSHIFT); int pdeindex = ptepindex & (NPDEPG - 1); vm_page_t pg; pdep = &pmap->pm_segtab[segindex]; if (*pdep == NULL) { /* recurse for allocating page dir */ if (_pmap_allocpte(pmap, NUPDE + segindex, flags) == NULL) { /* alloc failed, release current */ --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { pg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pdep)); pg->wire_count++; } /* Next level entry */ pde = (pd_entry_t *)*pdep; pde[pdeindex] = (pd_entry_t)pageva; } #else pmap->pm_segtab[ptepindex] = (pd_entry_t)pageva; #endif pmap->pm_stats.resident_count++; return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { unsigned ptepindex; pd_entry_t *pde; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment the hold * count, and activate it. */ if (pde != NULL && *pde != NULL) { m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pde)); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_offset_t ptdva; vm_page_t ptdpg; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); ptdva = (vm_offset_t)pmap->pm_segtab; ptdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(ptdva)); ptdpg->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(ptdpg); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_page_t nkpg; pd_entry_t *pde, *pdpe; pt_entry_t *pte; int i, req_class; mtx_assert(&kernel_map->system_mtx, MA_OWNED); req_class = VM_ALLOC_INTERRUPT; addr = roundup2(addr, NBSEG); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pdpe = pmap_segmap(kernel_pmap, kernel_vm_end); #ifdef __mips_n64 if (*pdpe == 0) { /* new intermediate page table entry */ nkpg = pmap_alloc_direct_page(nkpt, req_class); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); *pdpe = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg)); continue; /* try again */ } #endif pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if (*pde != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } /* * This index is bogus, but out of the way */ nkpg = pmap_alloc_direct_page(nkpt, req_class); #ifndef __mips_n64 if (nkpg == NULL && vm_page_reclaim_contig(req_class, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) nkpg = pmap_alloc_direct_page(nkpt, req_class); #endif if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; *pde = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg)); /* * The R[4-7]?00 stores only one copy of the Global bit in * the translation lookaside buffer for each 2 page entry. * Thus invalid entrys must have the Global bit set so when * Entry LO and Entry HI G bits are anded together they will * produce a global bit to store in the tlb. */ pte = (pt_entry_t *)*pde; for (i = 0; i < NPTEPG; i++) pte[i] = PTE_G; kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); #ifdef __mips_n64 CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); #else CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); #endif static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #ifdef __mips_n64 #define PC_FREE0_1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful #else #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ #endif static const u_long pc_freemask[_NPCM] = { #ifdef __mips_n64 PC_FREE0_1, PC_FREE0_1, PC_FREE2 #else PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 #endif }; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, oldpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; u_long inuse; int bit, field, freed, idx; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; idx = field * sizeof(inuse) * NBBY + bit; pv = &pc->pc_pventry[idx]; va = pv->pv_va; pde = pmap_pde(pmap, va); KASSERT(pde != NULL && *pde != 0, ("pmap_pv_reclaim: pde")); pte = pmap_pde_to_pte(pde, va); oldpte = *pte; if (pte_test(&oldpte, PTE_W)) continue; if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(oldpte)); if (pte_test(&oldpte, PTE_D)) vm_page_dirty(m); if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); m->md.pv_flags &= ~PV_TABLE_REF; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS( (vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int bit, field, idx; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / (sizeof(u_long) * NBBY); bit = idx % (sizeof(u_long) * NBBY); pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire(m, PQ_NONE); vm_page_free(m); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { struct pv_chunk *pc; pv_entry_t pv; vm_page_t m; int bit, field, idx; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { idx = field * sizeof(pc->pc_map[field]) * NBBY + bit; pv = &pc->pc_pventry[idx]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* No free items, allocate another chunk */ - m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, VM_ALLOC_NORMAL | + m = vm_page_alloc_freelist(0, VM_FREELIST_DIRECT, VM_ALLOC_NORMAL | VM_ALLOC_WIRED); if (m == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); dump_add_page(m->phys_addr); pc = (struct pv_chunk *)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); break; } } return (pv); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found, pa %lx va %lx", (u_long)VM_PAGE_TO_PHYS(__containerof(pvh, struct vm_page, md)), (u_long)va)); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_page_t mpte, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); return (TRUE); } else return (FALSE); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t pde) { pt_entry_t oldpte; vm_page_t m; vm_paddr_t pa; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Write back all cache lines from the page being unmapped. */ mips_dcache_wbinv_range_index(va, PAGE_SIZE); oldpte = *ptq; if (is_kernel_pmap(pmap)) *ptq = PTE_G; else *ptq = 0; if (pte_test(&oldpte, PTE_W)) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pte_test(&oldpte, PTE_MANAGED)) { pa = TLBLO_PTE_TO_PA(oldpte); m = PHYS_TO_VM_PAGE(pa); if (pte_test(&oldpte, PTE_D)) { KASSERT(!pte_test(&oldpte, PTE_RO), ("%s: modified page not writable: va: %p, pte: %#jx", __func__, (void *)va, (uintmax_t)oldpte)); vm_page_dirty(m); } if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); m->md.pv_flags &= ~PV_TABLE_REF; pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, pde)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(struct pmap *pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t *ptq; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (pde == NULL || *pde == 0) return; ptq = pmap_pde_to_pte(pde, va); /* * If there is no pte for this address, just skip it! */ if (!pte_test(ptq, PTE_V)) return; (void)pmap_remove_pte(pmap, ptq, va, *pde); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va, va_next; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); /* * special handling of removing one page. a very common operation * and easy to short circuit some code. */ if ((sva + PAGE_SIZE) == eva) { pmap_remove_page(pmap, sva); goto out; } for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_V)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; if (pmap_remove_pte(pmap, pte, sva, *pde)) { sva += PAGE_SIZE; break; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } out: rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte, tpte; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); rw_wlock(&pvh_global_lock); if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); /* * If it's last mapping writeback all caches from * the page being destroyed */ if (TAILQ_NEXT(pv, pv_list) == NULL) mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT(pde != NULL && *pde != 0, ("pmap_remove_all: pde")); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = *pte; if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; if (pte_test(&tpte, PTE_W)) pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ if (pte_test(&tpte, PTE_D)) { KASSERT(!pte_test(&tpte, PTE_RO), ("%s: modified page not writable: va: %p, pte: %#jx", __func__, (void *)pv->pv_va, (uintmax_t)tpte)); vm_page_dirty(m); } pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); pmap_unuse_pt(pmap, pv->pv_va, *pde); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); m->md.pv_flags &= ~PV_TABLE_REF; rw_wunlock(&pvh_global_lock); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pt_entry_t pbits, *pte; pd_entry_t *pde, *pdpe; vm_offset_t va, va_next; vm_paddr_t pa; vm_page_t m; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being write protected. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pbits = *pte; if (!pte_test(&pbits, PTE_V) || pte_test(&pbits, PTE_RO)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } pte_set(&pbits, PTE_RO); if (pte_test(&pbits, PTE_D)) { pte_clear(&pbits, PTE_D); if (pte_test(&pbits, PTE_MANAGED)) { pa = TLBLO_PTE_TO_PA(pbits); m = PHYS_TO_VM_PAGE(pa); vm_page_dirty(m); } if (va == va_next) va = sva; } else { /* * Unless PTE_D is set, any TLB entries * mapping "sva" don't allow write access, so * they needn't be invalidated. */ if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } *pte = pbits; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { vm_paddr_t pa, opa; pt_entry_t *pte; pt_entry_t origpte, newpte; pv_entry_t pv; vm_page_t mpte, om; va &= ~PAGE_MASK; KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); newpte = TLBLO_PA_TO_PFN(pa) | init_pte_prot(m, flags, prot); if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PTE_W; if (is_kernel_pmap(pmap)) newpte |= PTE_G; PMAP_PTE_SET_CACHE_BITS(newpte, pa, m); mpte = NULL; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); /* * In the case that a page table page is not resident, we are * creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=%p", (void *)pmap->pm_segtab, (void *)va); } om = NULL; origpte = *pte; opa = TLBLO_PTE_TO_PA(origpte); /* * Mapping has not changed, must be protection or wiring change. */ if (pte_test(&origpte, PTE_V) && opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is * wired, the PT page will be also. */ if (pte_test(&newpte, PTE_W) && !pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count++; else if (!pte_test(&newpte, PTE_W) && pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count--; KASSERT(!pte_test(&origpte, PTE_D | PTE_RO), ("%s: modified page not writable: va: %p, pte: %#jx", __func__, (void *)va, (uintmax_t)origpte)); /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; if (pte_test(&origpte, PTE_MANAGED)) { m->md.pv_flags |= PV_TABLE_REF; om = m; newpte |= PTE_MANAGED; if (!pte_test(&newpte, PTE_RO)) vm_page_aflag_set(m, PGA_WRITEABLE); } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count--; if (pte_test(&origpte, PTE_MANAGED)) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: %p", (void *)va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { m->md.pv_flags |= PV_TABLE_REF; if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); newpte |= PTE_MANAGED; if (!pte_test(&newpte, PTE_RO)) vm_page_aflag_set(m, PGA_WRITEABLE); } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (pte_test(&newpte, PTE_W)) pmap->pm_stats.wired_count++; validate: #ifdef PMAP_DEBUG printf("pmap_enter: va: %p -> pa: %p\n", (void *)va, (void *)pa); #endif /* * if the mapping or permission bits are different, we need to * update the pte. */ if (origpte != newpte) { *pte = newpte; if (pte_test(&origpte, PTE_V)) { if (pte_test(&origpte, PTE_MANAGED) && opa != pa) { if (om->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(om, PGA_REFERENCED); om->md.pv_flags &= ~PV_TABLE_REF; } if (pte_test(&origpte, PTE_D)) { KASSERT(!pte_test(&origpte, PTE_RO), ("pmap_enter: modified page not writable:" " va: %p, pte: %#jx", (void *)va, (uintmax_t)origpte)); if (pte_test(&origpte, PTE_MANAGED)) vm_page_dirty(om); } if (pte_test(&origpte, PTE_MANAGED) && TAILQ_EMPTY(&om->md.pv_list)) vm_page_aflag_clear(om, PGA_WRITEABLE); pmap_update_page(pmap, va, newpte); } } /* * Sync I & D caches for executable pages. Do this only if the * target pmap belongs to the current process. Otherwise, an * unresolvable TLB miss may occur. */ if (!is_kernel_pmap(pmap) && (pmap == &curproc->p_vmspace->vm_pmap) && (prot & VM_PROT_EXECUTE)) { mips_icache_sync_range(va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte, npte; vm_paddr_t pa; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not resident, we are * creating it here. */ if (va < VM_MAXUSER_ADDRESS) { pd_entry_t *pde; unsigned ptepindex; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ pde = pmap_pde(pmap, va); /* * If the page table page is mapped, we just * increment the hold count, and activate it. */ if (pde && *pde != 0) { mpte = PHYS_TO_VM_PAGE( MIPS_DIRECT_TO_PHYS(*pde)); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } pte = pmap_pte(pmap, va); if (pte_test(pte, PTE_V)) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, mpte, va, m)) { if (mpte != NULL) { pmap_unwire_ptp(pmap, va, mpte); mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ npte = PTE_RO | TLBLO_PA_TO_PFN(pa) | PTE_V; if ((m->oflags & VPO_UNMANAGED) == 0) npte |= PTE_MANAGED; PMAP_PTE_SET_CACHE_BITS(npte, pa, m); if (is_kernel_pmap(pmap)) *pte = npte | PTE_G; else { *pte = npte; /* * Sync I & D caches. Do this only if the target pmap * belongs to the current process. Otherwise, an * unresolvable TLB miss may occur. */ if (pmap == &curproc->p_vmspace->vm_pmap) { va &= ~PAGE_MASK; mips_icache_sync_range(va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } } return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; if (i != 0) printf("%s: ERROR!!! More than one page of virtual address mapping not supported\n", __func__); if (MIPS_DIRECT_MAPPABLE(pa)) { va = MIPS_PHYS_TO_DIRECT(pa); } else { #ifndef __mips_n64 /* XXX : to be converted to new style */ int cpu; register_t intr; struct local_sysmaps *sysm; pt_entry_t *pte, npte; /* If this is used other than for dumps, we may need to leave * interrupts disasbled on return. If crash dumps don't work when * we get to this point, we might want to consider this (leaving things * disabled as a starting point ;-) */ intr = intr_disable(); cpu = PCPU_GET(cpuid); sysm = &sysmap_lmem[cpu]; /* Since this is for the debugger, no locks or any other fun */ npte = TLBLO_PA_TO_PFN(pa) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; pte = pmap_pte(kernel_pmap, sysm->base); *pte = npte; sysm->valid1 = 1; pmap_update_page(kernel_pmap, sysm->base, npte); va = sysm->base; intr_restore(intr); #endif } return ((void *)va); } void pmap_kenter_temporary_free(vm_paddr_t pa) { #ifndef __mips_n64 /* XXX : to be converted to new style */ int cpu; register_t intr; struct local_sysmaps *sysm; #endif if (MIPS_DIRECT_MAPPABLE(pa)) { /* nothing to do for this case */ return; } #ifndef __mips_n64 /* XXX : to be converted to new style */ cpu = PCPU_GET(cpuid); sysm = &sysmap_lmem[cpu]; if (sysm->valid1) { pt_entry_t *pte; intr = intr_disable(); pte = pmap_pte(kernel_pmap, sysm->base); *pte = PTE_G; pmap_invalidate_page(kernel_pmap, sysm->base); intr_restore(intr); sysm->valid1 = 0; } #endif } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va_next; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == NULL) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_V)) continue; if (!pte_test(pte, PTE_W)) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); pte_clear(pte, PTE_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(phys)) { va = MIPS_PHYS_TO_DIRECT(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } else { va = pmap_lmem_map1(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); pmap_lmem_unmap(); } } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(phys)) { va = MIPS_PHYS_TO_DIRECT(phys); bzero((char *)(caddr_t)va + off, size); mips_dcache_wbinv_range(va + off, size); } else { va = pmap_lmem_map1(phys); bzero((char *)va + off, size); mips_dcache_wbinv_range(va + off, size); pmap_lmem_unmap(); } } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { vm_offset_t va_src, va_dst; vm_paddr_t phys_src = VM_PAGE_TO_PHYS(src); vm_paddr_t phys_dst = VM_PAGE_TO_PHYS(dst); if (MIPS_DIRECT_MAPPABLE(phys_src) && MIPS_DIRECT_MAPPABLE(phys_dst)) { /* easy case, all can be accessed via KSEG0 */ /* * Flush all caches for VA that are mapped to this page * to make sure that data in SDRAM is up to date */ pmap_flush_pvcache(src); mips_dcache_wbinv_range_index( MIPS_PHYS_TO_DIRECT(phys_dst), PAGE_SIZE); va_src = MIPS_PHYS_TO_DIRECT(phys_src); va_dst = MIPS_PHYS_TO_DIRECT(phys_dst); bcopy((caddr_t)va_src, (caddr_t)va_dst, PAGE_SIZE); mips_dcache_wbinv_range(va_dst, PAGE_SIZE); } else { va_src = pmap_lmem_map2(phys_src, phys_dst); va_dst = va_src + PAGE_SIZE; bcopy((void *)va_src, (void *)va_dst, PAGE_SIZE); mips_dcache_wbinv_range(va_dst, PAGE_SIZE); pmap_lmem_unmap(); } } int unmapped_buf_allowed; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { char *a_cp, *b_cp; vm_page_t a_m, b_m; vm_offset_t a_pg_offset, b_pg_offset; vm_paddr_t a_phys, b_phys; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_m = ma[a_offset >> PAGE_SHIFT]; a_phys = VM_PAGE_TO_PHYS(a_m); b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_m = mb[b_offset >> PAGE_SHIFT]; b_phys = VM_PAGE_TO_PHYS(b_m); if (MIPS_DIRECT_MAPPABLE(a_phys) && MIPS_DIRECT_MAPPABLE(b_phys)) { pmap_flush_pvcache(a_m); mips_dcache_wbinv_range_index( MIPS_PHYS_TO_DIRECT(b_phys), PAGE_SIZE); a_cp = (char *)MIPS_PHYS_TO_DIRECT(a_phys) + a_pg_offset; b_cp = (char *)MIPS_PHYS_TO_DIRECT(b_phys) + b_pg_offset; bcopy(a_cp, b_cp, cnt); mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); } else { a_cp = (char *)pmap_lmem_map2(a_phys, b_phys); b_cp = (char *)a_cp + PAGE_SIZE; a_cp += a_pg_offset; b_cp += b_pg_offset; bcopy(a_cp, b_cp, cnt); mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); pmap_lmem_unmap(); } a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { #if defined(__mips_n64) return MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); #else vm_paddr_t pa; struct local_sysmaps *sysm; pt_entry_t *pte, npte; pa = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(pa)) { if (pmap_page_get_memattr(m) != VM_MEMATTR_WRITE_BACK) return (MIPS_PHYS_TO_DIRECT_UNCACHED(pa)); else return (MIPS_PHYS_TO_DIRECT(pa)); } critical_enter(); sysm = &sysmap_lmem[PCPU_GET(cpuid)]; KASSERT(sysm->valid1 == 0, ("pmap_quick_enter_page: PTE busy")); pte = pmap_pte(kernel_pmap, sysm->base); npte = TLBLO_PA_TO_PFN(pa) | PTE_D | PTE_V | PTE_G; PMAP_PTE_SET_CACHE_BITS(npte, pa, m); *pte = npte; sysm->valid1 = 1; return (sysm->base); #endif } void pmap_quick_remove_page(vm_offset_t addr) { mips_dcache_wbinv_range(addr, PAGE_SIZE); #if !defined(__mips_n64) struct local_sysmaps *sysm; pt_entry_t *pte; if (addr >= MIPS_KSEG0_START && addr < MIPS_KSEG0_END) return; sysm = &sysmap_lmem[PCPU_GET(cpuid)]; KASSERT(sysm->valid1 != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(sysm->base == addr, ("pmap_quick_remove_page: invalid address")); pte = pmap_pte(kernel_pmap, addr); *pte = PTE_G; tlb_invalidate_address(kernel_pmap, addr); sysm->valid1 = 0; critical_exit(); #endif } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; pv_entry_t pv; vm_page_t m; struct pv_chunk *pc, *npc; u_long inuse, bitmask; int allfree, bit, field, idx; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * sizeof(inuse) * NBBY + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va); KASSERT(pde != NULL && *pde != 0, ("pmap_remove_pages: pde")); pte = pmap_pde_to_pte(pde, pv->pv_va); if (!pte_test(pte, PTE_V)) panic("pmap_remove_pages: bad pte"); tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (pte_test(&tpte, PTE_W)) { allfree = 0; continue; } *pte = is_kernel_pmap(pmap) ? PTE_G : 0; m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(tpte)); KASSERT(m != NULL, ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); /* * Update the vm_page_t clean and reference bits. */ if (pte_test(&tpte, PTE_D)) vm_page_dirty(m); /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_unuse_pt(pmap, pv->pv_va, *pde); } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); } /* * pmap_testbit tests bits in pte's */ static boolean_t pmap_testbit(vm_page_t m, int bit) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; boolean_t rv = FALSE; if (m->oflags & VPO_UNMANAGED) return (rv); rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); rv = pte_test(pte, bit); PMAP_UNLOCK(pmap); if (rv) break; } return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (pte_test(pte, PTE_W)) count++; PMAP_UNLOCK(pmap); } rw_wunlock(&pvh_global_lock); return (count); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { pmap_t pmap; pt_entry_t pbits, *pte; pv_entry_t pv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); KASSERT(pte != NULL && pte_test(pte, PTE_V), ("page on pv_list has no pte")); pbits = *pte; if (pte_test(&pbits, PTE_D)) { pte_clear(&pbits, PTE_D); vm_page_dirty(m); } pte_set(&pbits, PTE_RO); if (pbits != *pte) { *pte = pbits; pmap_update_page(pmap, pv->pv_va, pbits); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. */ int pmap_ts_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); if (m->md.pv_flags & PV_TABLE_REF) { rw_wlock(&pvh_global_lock); m->md.pv_flags &= ~PV_TABLE_REF; rw_wunlock(&pvh_global_lock); return (1); } return (0); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PTE_D set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_testbit(m, PTE_D); rw_wunlock(&pvh_global_lock); return (rv); } /* N/C */ /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && *pde != 0) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte == 0); } PMAP_UNLOCK(pmap); return (rv); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va, va_next; vm_paddr_t pa; vm_page_t m; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being write protected. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_MANAGED | PTE_V)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } pa = TLBLO_PTE_TO_PA(*pte); m = PHYS_TO_VM_PAGE(pa); m->md.pv_flags &= ~PV_TABLE_REF; if (pte_test(pte, PTE_D)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ vm_page_dirty(m); } else { pte_clear(pte, PTE_D); if (va == va_next) va = sva; } } else { /* * Unless PTE_D is set, any TLB entries * mapping "sva" don't allow write access, so * they needn't be invalidated. */ if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PTE_D set. * If the object containing the page is locked and the page is not * write busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (pte_test(pte, PTE_D)) { pte_clear(pte, PTE_D); pmap_update_page(pmap, pv->pv_va, *pte); } PMAP_UNLOCK(pmap); } rw_wunlock(&pvh_global_lock); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return ((m->md.pv_flags & PV_TABLE_REF) != 0); } /* * Miscellaneous support routines follow */ /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. * * Use XKPHYS uncached for 64 bit, and KSEG1 where possible for 32 bit. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, offset; /* * KSEG1 maps only first 512M of phys address space. For * pa > 0x20000000 we should make proper mapping * using pmap_kenter. */ if (MIPS_DIRECT_MAPPABLE(pa + size - 1) && ma == VM_MEMATTR_UNCACHEABLE) return ((void *)MIPS_PHYS_TO_DIRECT_UNCACHED(pa)); else { offset = pa & PAGE_MASK; size = roundup(size + offset, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = trunc_page(pa); for (tmpva = va; size > 0;) { pmap_kenter_attr(tmpva, pa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } } return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return pmap_mapdev_attr(pa, size, VM_MEMATTR_UNCACHEABLE); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { #ifndef __mips_n64 vm_offset_t base, offset; /* If the address is within KSEG1 then there is nothing to do */ if (va >= MIPS_KSEG1_START && va <= MIPS_KSEG1_END) return; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(size + offset, PAGE_SIZE); kva_free(base, size); #endif } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt_entry_t *ptep, pte; vm_paddr_t pa; vm_page_t m; int val; PMAP_LOCK(pmap); retry: ptep = pmap_pte(pmap, addr); pte = (ptep != NULL) ? *ptep : 0; if (!pte_test(&pte, PTE_V)) { val = 0; goto out; } val = MINCORE_INCORE; if (pte_test(&pte, PTE_D)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; pa = TLBLO_PTE_TO_PA(pte); if (pte_test(&pte, PTE_MANAGED)) { /* * This may falsely report the given address as * MINCORE_REFERENCED. Unfortunately, due to the lack of * per-PTE reference information, it is impossible to * determine if the address is MINCORE_REFERENCED. */ m = PHYS_TO_VM_PAGE(pa); if ((m->aflags & PGA_REFERENCED) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && pte_test(&pte, PTE_MANAGED)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else out: PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; struct proc *p = td->td_proc; u_int cpuid; critical_enter(); pmap = vmspace_pmap(p->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); if (oldpmap) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); pmap_asid_alloc(pmap); if (td == curthread) { PCPU_SET(segbase, pmap->pm_segtab); mips_wr_entryhi(pmap->pm_asid[cpuid].asid); } PCPU_SET(curpmap, pmap); critical_exit(); } static void pmap_sync_icache_one(void *arg __unused) { mips_icache_sync_all(); mips_dcache_wbinv_all(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { smp_rendezvous(NULL, pmap_sync_icache_one, NULL, NULL); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < PDRSIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((PDRSIZE - superpage_offset) & PDRMASK) < PDRSIZE || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef DDB DB_SHOW_COMMAND(ptable, ddb_pid_dump) { pmap_t pmap; struct thread *td = NULL; struct proc *p; int i, j, k; vm_paddr_t pa; vm_offset_t va; if (have_addr) { td = db_lookup_thread(addr, true); if (td == NULL) { db_printf("Invalid pid or tid"); return; } p = td->td_proc; if (p->p_vmspace == NULL) { db_printf("No vmspace for process"); return; } pmap = vmspace_pmap(p->p_vmspace); } else pmap = kernel_pmap; db_printf("pmap:%p segtab:%p asid:%x generation:%x\n", pmap, pmap->pm_segtab, pmap->pm_asid[0].asid, pmap->pm_asid[0].gen); for (i = 0; i < NPDEPG; i++) { pd_entry_t *pdpe; pt_entry_t *pde; pt_entry_t pte; pdpe = (pd_entry_t *)pmap->pm_segtab[i]; if (pdpe == NULL) continue; db_printf("[%4d] %p\n", i, pdpe); #ifdef __mips_n64 for (j = 0; j < NPDEPG; j++) { pde = (pt_entry_t *)pdpe[j]; if (pde == NULL) continue; db_printf("\t[%4d] %p\n", j, pde); #else { j = 0; pde = (pt_entry_t *)pdpe; #endif for (k = 0; k < NPTEPG; k++) { pte = pde[k]; if (pte == 0 || !pte_test(&pte, PTE_V)) continue; pa = TLBLO_PTE_TO_PA(pte); va = ((u_long)i << SEGSHIFT) | (j << PDRSHIFT) | (k << PAGE_SHIFT); db_printf("\t\t[%04d] va: %p pte: %8jx pa:%jx\n", k, (void *)va, (uintmax_t)pte, (uintmax_t)pa); } } } } #endif /* * Allocate TLB address space tag (called ASID or TLBPID) and return it. * It takes almost as much or more time to search the TLB for a * specific ASID and flush those entries as it does to flush the entire TLB. * Therefore, when we allocate a new ASID, we just take the next number. When * we run out of numbers, we flush the TLB, increment the generation count * and start over. ASID zero is reserved for kernel use. */ static void pmap_asid_alloc(pmap) pmap_t pmap; { if (pmap->pm_asid[PCPU_GET(cpuid)].asid != PMAP_ASID_RESERVED && pmap->pm_asid[PCPU_GET(cpuid)].gen == PCPU_GET(asid_generation)); else { if (PCPU_GET(next_asid) == pmap_max_asid) { tlb_invalidate_all_user(NULL); PCPU_SET(asid_generation, (PCPU_GET(asid_generation) + 1) & ASIDGEN_MASK); if (PCPU_GET(asid_generation) == 0) { PCPU_SET(asid_generation, 1); } PCPU_SET(next_asid, 1); /* 0 means invalid */ } pmap->pm_asid[PCPU_GET(cpuid)].asid = PCPU_GET(next_asid); pmap->pm_asid[PCPU_GET(cpuid)].gen = PCPU_GET(asid_generation); PCPU_SET(next_asid, PCPU_GET(next_asid) + 1); } } static pt_entry_t init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot) { pt_entry_t rw; if (!(prot & VM_PROT_WRITE)) rw = PTE_V | PTE_RO; else if ((m->oflags & VPO_UNMANAGED) == 0) { if ((access & VM_PROT_WRITE) != 0) rw = PTE_V | PTE_D; else rw = PTE_V; } else /* Needn't emulate a modified bit for unmanaged pages. */ rw = PTE_V | PTE_D; return (rw); } /* * pmap_emulate_modified : do dirty bit emulation * * On SMP, update just the local TLB, other CPUs will update their * TLBs from PTE lazily, if they get the exception. * Returns 0 in case of sucess, 1 if the page is read only and we * need to fault. */ int pmap_emulate_modified(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (pte == NULL) panic("pmap_emulate_modified: can't find PTE"); #ifdef SMP /* It is possible that some other CPU changed m-bit */ if (!pte_test(pte, PTE_V) || pte_test(pte, PTE_D)) { tlb_update(pmap, va, *pte); PMAP_UNLOCK(pmap); return (0); } #else if (!pte_test(pte, PTE_V) || pte_test(pte, PTE_D)) panic("pmap_emulate_modified: invalid pte"); #endif if (pte_test(pte, PTE_RO)) { PMAP_UNLOCK(pmap); return (1); } pte_set(pte, PTE_D); tlb_update(pmap, va, *pte); if (!pte_test(pte, PTE_MANAGED)) panic("pmap_emulate_modified: unmanaged page"); PMAP_UNLOCK(pmap); return (0); } /* * Routine: pmap_kextract * Function: * Extract the physical page address associated * virtual address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { int mapped; /* * First, the direct-mapped regions. */ #if defined(__mips_n64) if (va >= MIPS_XKPHYS_START && va < MIPS_XKPHYS_END) return (MIPS_XKPHYS_TO_PHYS(va)); #endif if (va >= MIPS_KSEG0_START && va < MIPS_KSEG0_END) return (MIPS_KSEG0_TO_PHYS(va)); if (va >= MIPS_KSEG1_START && va < MIPS_KSEG1_END) return (MIPS_KSEG1_TO_PHYS(va)); /* * User virtual addresses. */ if (va < VM_MAXUSER_ADDRESS) { pt_entry_t *ptep; if (curproc && curproc->p_vmspace) { ptep = pmap_pte(&curproc->p_vmspace->vm_pmap, va); if (ptep) { return (TLBLO_PTE_TO_PA(*ptep) | (va & PAGE_MASK)); } return (0); } } /* * Should be kernel virtual here, otherwise fail */ mapped = (va >= MIPS_KSEG2_START || va < MIPS_KSEG2_END); #if defined(__mips_n64) mapped = mapped || (va >= MIPS_XKSEG_START || va < MIPS_XKSEG_END); #endif /* * Kernel virtual. */ if (mapped) { pt_entry_t *ptep; /* Is the kernel pmap initialized? */ if (!CPU_EMPTY(&kernel_pmap->pm_active)) { /* It's inside the virtual address range */ ptep = pmap_pte(kernel_pmap, va); if (ptep) { return (TLBLO_PTE_TO_PA(*ptep) | (va & PAGE_MASK)); } } return (0); } panic("%s for unknown address space %p.", __func__, (void *)va); } void pmap_flush_pvcache(vm_page_t m) { pv_entry_t pv; if (m != NULL) { for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); } } } void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { /* * It appears that this function can only be called before any mappings * for the page are established. If this ever changes, this code will * need to walk the pv_list and make each of the existing mappings * uncacheable, being careful to sync caches and PTEs (and maybe * invalidate TLB?) for any current mapping it modifies. */ if (TAILQ_FIRST(&m->md.pv_list) != NULL) panic("Can't change memattr on page with existing mappings"); /* Clean memattr portion of pv_flags */ m->md.pv_flags &= ~PV_MEMATTR_MASK; m->md.pv_flags |= (ma << PV_MEMATTR_SHIFT) & PV_MEMATTR_MASK; } static __inline void pmap_pte_attr(pt_entry_t *pte, vm_memattr_t ma) { u_int npte; npte = *(u_int *)pte; npte &= ~PTE_C_MASK; npte |= PTE_C(ma); *pte = npte; } int pmap_change_attr(vm_offset_t sva, vm_size_t size, vm_memattr_t ma) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t ova, eva, va, va_next; pmap_t pmap; ova = sva; eva = sva + size; if (eva < sva) return (EINVAL); pmap = kernel_pmap; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_V) || pte_cache_bits(pte) == ma) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; pmap_pte_attr(pte, ma); } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); /* Flush caches to be in the safe side */ mips_dcache_wbinv_range(ova, size); return 0; } Index: projects/numa2/sys/mips/mips/uma_machdep.c =================================================================== --- projects/numa2/sys/mips/mips/uma_machdep.c (revision 321505) +++ projects/numa2/sys/mips/mips/uma_machdep.c (revision 321506) @@ -1,92 +1,93 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_paddr_t pa; vm_page_t m; int pflags; void *va; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { - m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags); + m = vm_page_alloc_freelist(domain, VM_FREELIST_DIRECT, pflags); #ifndef __mips_n64 if (m == NULL && vm_page_reclaim_contig(pflags, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) continue; #endif if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else VM_WAIT; } else break; } pa = VM_PAGE_TO_PHYS(m); if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)MIPS_PHYS_TO_DIRECT(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = MIPS_DIRECT_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); m->wire_count--; vm_page_free(m); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } Index: projects/numa2/sys/powerpc/aim/mmu_oea64.c =================================================================== --- projects/numa2/sys/powerpc/aim/mmu_oea64.c (revision 321505) +++ projects/numa2/sys/powerpc/aim/mmu_oea64.c (revision 321506) @@ -1,2718 +1,2718 @@ /*- * Copyright (c) 2008-2015 Nathan Whitehorn * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_compat.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_oea64.h" #include "mmu_if.h" #include "moea64_if.h" void moea64_release_vsid(uint64_t vsid); uintptr_t moea64_get_unique_vsid(void); #define DISABLE_TRANS(msr) msr = mfmsr(); mtmsr(msr & ~PSL_DR) #define ENABLE_TRANS(msr) mtmsr(msr) #define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) #define VSID_HASH_MASK 0x0000007fffffffffULL /* * Locking semantics: * * There are two locks of interest: the page locks and the pmap locks, which * protect their individual PVO lists and are locked in that order. The contents * of all PVO entries are protected by the locks of their respective pmaps. * The pmap of any PVO is guaranteed not to change so long as the PVO is linked * into any list. * */ #define PV_LOCK_COUNT PA_LOCK_COUNT*3 static struct mtx_padalign pv_lock[PV_LOCK_COUNT]; #define PV_LOCKPTR(pa) ((struct mtx *)(&pv_lock[pa_index(pa) % PV_LOCK_COUNT])) #define PV_LOCK(pa) mtx_lock(PV_LOCKPTR(pa)) #define PV_UNLOCK(pa) mtx_unlock(PV_LOCKPTR(pa)) #define PV_LOCKASSERT(pa) mtx_assert(PV_LOCKPTR(pa), MA_OWNED) #define PV_PAGE_LOCK(m) PV_LOCK(VM_PAGE_TO_PHYS(m)) #define PV_PAGE_UNLOCK(m) PV_UNLOCK(VM_PAGE_TO_PHYS(m)) #define PV_PAGE_LOCKASSERT(m) PV_LOCKASSERT(VM_PAGE_TO_PHYS(m)) struct ofw_map { cell_t om_va; cell_t om_len; uint64_t om_pa; cell_t om_mode; }; extern unsigned char _etext[]; extern unsigned char _end[]; /* * Map of physical memory regions. */ static struct mem_region *regions; static struct mem_region *pregions; static u_int phys_avail_count; static int regions_sz, pregions_sz; extern void bs_remap_earlyboot(void); /* * Lock for the SLB tables. */ struct mtx moea64_slb_mutex; /* * PTEG data. */ u_int moea64_pteg_count; u_int moea64_pteg_mask; /* * PVO data. */ uma_zone_t moea64_pvo_zone; /* zone for pvo entries */ static struct pvo_entry *moea64_bpvo_pool; static int moea64_bpvo_pool_index = 0; static int moea64_bpvo_pool_size = 327680; TUNABLE_INT("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size); SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD, &moea64_bpvo_pool_index, 0, ""); #define VSID_NBPW (sizeof(u_int32_t) * 8) #ifdef __powerpc64__ #define NVSIDS (NPMAPS * 16) #define VSID_HASHMASK 0xffffffffUL #else #define NVSIDS NPMAPS #define VSID_HASHMASK 0xfffffUL #endif static u_int moea64_vsid_bitmap[NVSIDS / VSID_NBPW]; static boolean_t moea64_initialized = FALSE; /* * Statistics. */ u_int moea64_pte_valid = 0; u_int moea64_pte_overflow = 0; u_int moea64_pvo_entries = 0; u_int moea64_pvo_enter_calls = 0; u_int moea64_pvo_remove_calls = 0; SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD, &moea64_pte_valid, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD, &moea64_pte_overflow, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD, &moea64_pvo_entries, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD, &moea64_pvo_enter_calls, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD, &moea64_pvo_remove_calls, 0, ""); vm_offset_t moea64_scratchpage_va[2]; struct pvo_entry *moea64_scratchpage_pvo[2]; struct mtx moea64_scratchpage_mtx; uint64_t moea64_large_page_mask = 0; uint64_t moea64_large_page_size = 0; int moea64_large_page_shift = 0; /* * PVO calls. */ static int moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, struct pvo_head *pvo_head); static void moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo); static void moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo); static struct pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t); /* * Utility routines. */ static boolean_t moea64_query_bit(mmu_t, vm_page_t, uint64_t); static u_int moea64_clear_bit(mmu_t, vm_page_t, uint64_t); static void moea64_kremove(mmu_t, vm_offset_t); static void moea64_syncicache(mmu_t, pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz); static void moea64_pmap_init_qpages(void); /* * Kernel MMU interface */ void moea64_clear_modify(mmu_t, vm_page_t); void moea64_copy_page(mmu_t, vm_page_t, vm_page_t); void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); int moea64_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); void moea64_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void moea64_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t moea64_extract(mmu_t, pmap_t, vm_offset_t); vm_page_t moea64_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); void moea64_init(mmu_t); boolean_t moea64_is_modified(mmu_t, vm_page_t); boolean_t moea64_is_prefaultable(mmu_t, pmap_t, vm_offset_t); boolean_t moea64_is_referenced(mmu_t, vm_page_t); int moea64_ts_referenced(mmu_t, vm_page_t); vm_offset_t moea64_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); boolean_t moea64_page_exists_quick(mmu_t, pmap_t, vm_page_t); int moea64_page_wired_mappings(mmu_t, vm_page_t); void moea64_pinit(mmu_t, pmap_t); void moea64_pinit0(mmu_t, pmap_t); void moea64_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void moea64_qenter(mmu_t, vm_offset_t, vm_page_t *, int); void moea64_qremove(mmu_t, vm_offset_t, int); void moea64_release(mmu_t, pmap_t); void moea64_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_remove_pages(mmu_t, pmap_t); void moea64_remove_all(mmu_t, vm_page_t); void moea64_remove_write(mmu_t, vm_page_t); void moea64_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_zero_page(mmu_t, vm_page_t); void moea64_zero_page_area(mmu_t, vm_page_t, int, int); void moea64_activate(mmu_t, struct thread *); void moea64_deactivate(mmu_t, struct thread *); void *moea64_mapdev(mmu_t, vm_paddr_t, vm_size_t); void *moea64_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t); void moea64_unmapdev(mmu_t, vm_offset_t, vm_size_t); vm_paddr_t moea64_kextract(mmu_t, vm_offset_t); void moea64_page_set_memattr(mmu_t, vm_page_t m, vm_memattr_t ma); void moea64_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t ma); void moea64_kenter(mmu_t, vm_offset_t, vm_paddr_t); boolean_t moea64_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); static void moea64_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t); void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va); void moea64_scan_init(mmu_t mmu); vm_offset_t moea64_quick_enter_page(mmu_t mmu, vm_page_t m); void moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr); static mmu_method_t moea64_methods[] = { MMUMETHOD(mmu_clear_modify, moea64_clear_modify), MMUMETHOD(mmu_copy_page, moea64_copy_page), MMUMETHOD(mmu_copy_pages, moea64_copy_pages), MMUMETHOD(mmu_enter, moea64_enter), MMUMETHOD(mmu_enter_object, moea64_enter_object), MMUMETHOD(mmu_enter_quick, moea64_enter_quick), MMUMETHOD(mmu_extract, moea64_extract), MMUMETHOD(mmu_extract_and_hold, moea64_extract_and_hold), MMUMETHOD(mmu_init, moea64_init), MMUMETHOD(mmu_is_modified, moea64_is_modified), MMUMETHOD(mmu_is_prefaultable, moea64_is_prefaultable), MMUMETHOD(mmu_is_referenced, moea64_is_referenced), MMUMETHOD(mmu_ts_referenced, moea64_ts_referenced), MMUMETHOD(mmu_map, moea64_map), MMUMETHOD(mmu_page_exists_quick,moea64_page_exists_quick), MMUMETHOD(mmu_page_wired_mappings,moea64_page_wired_mappings), MMUMETHOD(mmu_pinit, moea64_pinit), MMUMETHOD(mmu_pinit0, moea64_pinit0), MMUMETHOD(mmu_protect, moea64_protect), MMUMETHOD(mmu_qenter, moea64_qenter), MMUMETHOD(mmu_qremove, moea64_qremove), MMUMETHOD(mmu_release, moea64_release), MMUMETHOD(mmu_remove, moea64_remove), MMUMETHOD(mmu_remove_pages, moea64_remove_pages), MMUMETHOD(mmu_remove_all, moea64_remove_all), MMUMETHOD(mmu_remove_write, moea64_remove_write), MMUMETHOD(mmu_sync_icache, moea64_sync_icache), MMUMETHOD(mmu_unwire, moea64_unwire), MMUMETHOD(mmu_zero_page, moea64_zero_page), MMUMETHOD(mmu_zero_page_area, moea64_zero_page_area), MMUMETHOD(mmu_activate, moea64_activate), MMUMETHOD(mmu_deactivate, moea64_deactivate), MMUMETHOD(mmu_page_set_memattr, moea64_page_set_memattr), MMUMETHOD(mmu_quick_enter_page, moea64_quick_enter_page), MMUMETHOD(mmu_quick_remove_page, moea64_quick_remove_page), /* Internal interfaces */ MMUMETHOD(mmu_mapdev, moea64_mapdev), MMUMETHOD(mmu_mapdev_attr, moea64_mapdev_attr), MMUMETHOD(mmu_unmapdev, moea64_unmapdev), MMUMETHOD(mmu_kextract, moea64_kextract), MMUMETHOD(mmu_kenter, moea64_kenter), MMUMETHOD(mmu_kenter_attr, moea64_kenter_attr), MMUMETHOD(mmu_dev_direct_mapped,moea64_dev_direct_mapped), MMUMETHOD(mmu_scan_init, moea64_scan_init), MMUMETHOD(mmu_dumpsys_map, moea64_dumpsys_map), { 0, 0 } }; MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods, 0); static struct pvo_head * vm_page_to_pvoh(vm_page_t m) { mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED); return (&m->md.mdpg_pvoh); } static struct pvo_entry * alloc_pvo_entry(int bootstrap) { struct pvo_entry *pvo; if (!moea64_initialized || bootstrap) { if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) { panic("moea64_enter: bpvo pool exhausted, %d, %d, %zd", moea64_bpvo_pool_index, moea64_bpvo_pool_size, moea64_bpvo_pool_size * sizeof(struct pvo_entry)); } pvo = &moea64_bpvo_pool[ atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)]; bzero(pvo, sizeof(*pvo)); pvo->pvo_vaddr = PVO_BOOTSTRAP; } else { pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT); bzero(pvo, sizeof(*pvo)); } return (pvo); } static void init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va) { uint64_t vsid; uint64_t hash; int shift; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pvo->pvo_pmap = pmap; va &= ~ADDR_POFF; pvo->pvo_vaddr |= va; vsid = va_to_vsid(pmap, va); pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) | (vsid << 16); shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift : ADDR_PIDX_SHFT; hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; } static void free_pvo_entry(struct pvo_entry *pvo) { if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) uma_zfree(moea64_pvo_zone, pvo); } void moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte) { lpte->pte_hi = (pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) & LPTE_AVPN_MASK; lpte->pte_hi |= LPTE_VALID; if (pvo->pvo_vaddr & PVO_LARGE) lpte->pte_hi |= LPTE_BIG; if (pvo->pvo_vaddr & PVO_WIRED) lpte->pte_hi |= LPTE_WIRED; if (pvo->pvo_vaddr & PVO_HID) lpte->pte_hi |= LPTE_HID; lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) lpte->pte_lo |= LPTE_BW; else lpte->pte_lo |= LPTE_BR; if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE)) lpte->pte_lo |= LPTE_NOEXEC; } static __inline uint64_t moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { uint64_t pte_lo; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (LPTE_I | LPTE_G); case VM_MEMATTR_CACHEABLE: return (LPTE_M); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (LPTE_I); case VM_MEMATTR_WRITE_THROUGH: return (LPTE_W | LPTE_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ pte_lo = LPTE_I | LPTE_G; for (i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) { pte_lo &= ~(LPTE_I | LPTE_G); pte_lo |= LPTE_M; break; } } return pte_lo; } /* * Quick sort callout for comparing memory regions. */ static int om_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b) { const struct ofw_map *mapa; const struct ofw_map *mapb; mapa = a; mapb = b; if (mapa->om_pa < mapb->om_pa) return (-1); else if (mapa->om_pa > mapb->om_pa) return (1); else return (0); } static void moea64_add_ofw_mappings(mmu_t mmup, phandle_t mmu, size_t sz) { struct ofw_map translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */ pcell_t acells, trans_cells[sz/sizeof(cell_t)]; struct pvo_entry *pvo; register_t msr; vm_offset_t off; vm_paddr_t pa_base; int i, j; bzero(translations, sz); OF_getencprop(OF_finddevice("/"), "#address-cells", &acells, sizeof(acells)); if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1) panic("moea64_bootstrap: can't get ofw translations"); CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations"); sz /= sizeof(cell_t); for (i = 0, j = 0; i < sz; j++) { translations[j].om_va = trans_cells[i++]; translations[j].om_len = trans_cells[i++]; translations[j].om_pa = trans_cells[i++]; if (acells == 2) { translations[j].om_pa <<= 32; translations[j].om_pa |= trans_cells[i++]; } translations[j].om_mode = trans_cells[i++]; } KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)", i, sz)); sz = j; qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { pa_base = translations[i].om_pa; #ifndef __powerpc64__ if ((translations[i].om_pa >> 32) != 0) panic("OFW translations above 32-bit boundary!"); #endif if (pa_base % PAGE_SIZE) panic("OFW translation not page-aligned (phys)!"); if (translations[i].om_va % PAGE_SIZE) panic("OFW translation not page-aligned (virt)!"); CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x", pa_base, translations[i].om_va, translations[i].om_len); /* Now enter the pages for this mapping */ DISABLE_TRANS(msr); for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) { /* If this address is direct-mapped, skip remapping */ if (hw_direct_map && translations[i].om_va == pa_base && moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT) == LPTE_M) continue; PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, translations[i].om_va + off); PMAP_UNLOCK(kernel_pmap); if (pvo != NULL) continue; moea64_kenter(mmup, translations[i].om_va + off, pa_base + off); } ENABLE_TRANS(msr); } } #ifdef __powerpc64__ static void moea64_probe_large_page(void) { uint16_t pvr = mfpvr() >> 16; switch (pvr) { case IBM970: case IBM970FX: case IBM970MP: powerpc_sync(); isync(); mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG); powerpc_sync(); isync(); /* FALLTHROUGH */ default: moea64_large_page_size = 0x1000000; /* 16 MB */ moea64_large_page_shift = 24; } moea64_large_page_mask = moea64_large_page_size - 1; } static void moea64_bootstrap_slb_prefault(vm_offset_t va, int large) { struct slb *cache; struct slb entry; uint64_t esid, slbe; uint64_t i; cache = PCPU_GET(slb); esid = va >> ADDR_SR_SHFT; slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; for (i = 0; i < 64; i++) { if (cache[i].slbe == (slbe | i)) return; } entry.slbe = slbe; entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT; if (large) entry.slbv |= SLBV_L; slb_insert_kernel(entry.slbe, entry.slbv); } #endif static void moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { struct pvo_entry *pvo; register_t msr; vm_paddr_t pa; vm_offset_t size, off; uint64_t pte_lo; int i; if (moea64_large_page_size == 0) hw_direct_map = 0; DISABLE_TRANS(msr); if (hw_direct_map) { PMAP_LOCK(kernel_pmap); for (i = 0; i < pregions_sz; i++) { for (pa = pregions[i].mr_start; pa < pregions[i].mr_start + pregions[i].mr_size; pa += moea64_large_page_size) { pte_lo = LPTE_M; pvo = alloc_pvo_entry(1 /* bootstrap */); pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE; init_pvo_entry(pvo, kernel_pmap, pa); /* * Set memory access as guarded if prefetch within * the page could exit the available physmem area. */ if (pa & moea64_large_page_mask) { pa &= moea64_large_page_mask; pte_lo |= LPTE_G; } if (pa + moea64_large_page_size > pregions[i].mr_start + pregions[i].mr_size) pte_lo |= LPTE_G; pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = pa | pte_lo; moea64_pvo_enter(mmup, pvo, NULL); } } PMAP_UNLOCK(kernel_pmap); } else { size = moea64_bpvo_pool_size*sizeof(struct pvo_entry); off = (vm_offset_t)(moea64_bpvo_pool); for (pa = off; pa < off + size; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); /* * Map certain important things, like ourselves. * * NOTE: We do not map the exception vector space. That code is * used only in real mode, and leaving it unmapped allows us to * catch NULL pointer deferences, instead of making NULL a valid * address. */ for (pa = kernelstart & ~PAGE_MASK; pa < kernelend; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); } ENABLE_TRANS(msr); /* * Allow user to override unmapped_buf_allowed for testing. * XXXKIB Only direct map implementation was tested. */ if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed)) unmapped_buf_allowed = hw_direct_map; } void moea64_early_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { int i, j; vm_size_t physsz, hwphyssz; #ifndef __powerpc64__ /* We don't have a direct map since there is no BAT */ hw_direct_map = 0; /* Make sure battable is zero, since we have no BAT */ for (i = 0; i < 16; i++) { battable[i].batu = 0; battable[i].batl = 0; } #else moea64_probe_large_page(); /* Use a direct map if we have large page support */ if (moea64_large_page_size > 0) hw_direct_map = 1; else hw_direct_map = 0; #endif /* Get physical memory regions from firmware */ mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "moea64_bootstrap: physical memory"); if (sizeof(phys_avail)/sizeof(phys_avail[0]) < regions_sz) panic("moea64_bootstrap: phys_avail too small"); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++, j += 2) { CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)", regions[i].mr_start, regions[i].mr_start + regions[i].mr_size, regions[i].mr_size); if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; phys_avail_count++; physsz += regions[i].mr_size; } /* Check for overlap with the kernel and exception vectors */ for (j = 0; j < 2*phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (kernelstart >= phys_avail[j] && kernelstart < phys_avail[j+1]) { if (kernelend < phys_avail[j+1]) { phys_avail[2*phys_avail_count] = (kernelend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2*phys_avail_count + 1] = phys_avail[j+1]; phys_avail_count++; } phys_avail[j+1] = kernelstart & ~PAGE_MASK; } if (kernelend >= phys_avail[j] && kernelend < phys_avail[j+1]) { if (kernelstart > phys_avail[j]) { phys_avail[2*phys_avail_count] = phys_avail[j]; phys_avail[2*phys_avail_count + 1] = kernelstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kernelend & ~PAGE_MASK) + PAGE_SIZE; } } physmem = btoc(physsz); #ifdef PTEGCOUNT moea64_pteg_count = PTEGCOUNT; #else moea64_pteg_count = 0x1000; while (moea64_pteg_count < physmem) moea64_pteg_count <<= 1; moea64_pteg_count >>= 1; #endif /* PTEGCOUNT */ } void moea64_mid_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { int i; /* * Set PTEG mask */ moea64_pteg_mask = moea64_pteg_count - 1; /* * Initialize SLB table lock and page locks */ mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF); for (i = 0; i < PV_LOCK_COUNT; i++) mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF); /* * Initialise the bootstrap pvo pool. */ moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc( moea64_bpvo_pool_size*sizeof(struct pvo_entry), 0); moea64_bpvo_pool_index = 0; /* * Make sure kernel vsid is allocated as well as VSID 0. */ #ifndef __powerpc64__ moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW] |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); moea64_vsid_bitmap[0] |= 1; #endif /* * Initialize the kernel pmap (which is statically allocated). */ #ifdef __powerpc64__ for (i = 0; i < 64; i++) { pcpup->pc_slb[i].slbv = 0; pcpup->pc_slb[i].slbe = 0; } #else for (i = 0; i < 16; i++) kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; #endif kernel_pmap->pmap_phys = kernel_pmap; CPU_FILL(&kernel_pmap->pm_active); RB_INIT(&kernel_pmap->pmap_pvo); PMAP_LOCK_INIT(kernel_pmap); /* * Now map in all the other buffers we allocated earlier */ moea64_setup_direct_map(mmup, kernelstart, kernelend); } void moea64_late_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { ihandle_t mmui; phandle_t chosen; phandle_t mmu; ssize_t sz; int i; vm_offset_t pa, va; void *dpcpu; /* * Set up the Open Firmware pmap and add its mappings if not in real * mode. */ chosen = OF_finddevice("/chosen"); if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) { mmu = OF_instance_to_package(mmui); if (mmu == -1 || (sz = OF_getproplen(mmu, "translations")) == -1) sz = 0; if (sz > 6144 /* tmpstksz - 2 KB headroom */) panic("moea64_bootstrap: too many ofw translations"); if (sz > 0) moea64_add_ofw_mappings(mmup, mmu, sz); } /* * Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) ; Maxmem = powerpc_btop(phys_avail[i + 1]); /* * Initialize MMU and remap early physical mappings */ MMU_CPU_BOOTSTRAP(mmup,0); mtmsr(mfmsr() | PSL_DR | PSL_IR); pmap_bootstrapped++; bs_remap_earlyboot(); /* * Set the start and end of kva. */ virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; /* * Map the entire KVA range into the SLB. We must not fault there. */ #ifdef __powerpc64__ for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH) moea64_bootstrap_slb_prefault(va, 0); #endif /* * Figure out how far we can extend virtual_end into segment 16 * without running into existing mappings. Segment 16 is guaranteed * to contain neither RAM nor devices (at least on Apple hardware), * but will generally contain some OFW mappings we should not * step on. */ #ifndef __powerpc64__ /* KVA is in high memory on PPC64 */ PMAP_LOCK(kernel_pmap); while (virtual_end < VM_MAX_KERNEL_ADDRESS && moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL) virtual_end += PAGE_SIZE; PMAP_UNLOCK(kernel_pmap); #endif /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. */ pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE); va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; virtual_avail = va + kstack_pages * PAGE_SIZE; CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); thread0.td_kstack = va; thread0.td_kstack_pages = kstack_pages; for (i = 0; i < kstack_pages; i++) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the message buffer. */ pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE); msgbufp = (struct msgbuf *)virtual_avail; va = virtual_avail; virtual_avail += round_page(msgbufsize); while (va < virtual_avail) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the dynamic percpu area. */ pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE); dpcpu = (void *)virtual_avail; va = virtual_avail; virtual_avail += DPCPU_SIZE; while (va < virtual_avail) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } dpcpu_init(dpcpu, 0); /* * Allocate some things for page zeroing. We put this directly * in the page table and use MOEA64_PTE_REPLACE to avoid any * of the PVO book-keeping or other parts of the VM system * from even knowing that this hack exists. */ if (!hw_direct_map) { mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL, MTX_DEF); for (i = 0; i < 2; i++) { moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE; virtual_end -= PAGE_SIZE; moea64_kenter(mmup, moea64_scratchpage_va[i], 0); PMAP_LOCK(kernel_pmap); moea64_scratchpage_pvo[i] = moea64_pvo_find_va( kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]); PMAP_UNLOCK(kernel_pmap); } } } static void moea64_pmap_init_qpages(void) { struct pcpu *pc; int i; if (hw_direct_map) return; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) panic("pmap_init_qpages: unable to allocate KVA"); PMAP_LOCK(kernel_pmap); pc->pc_qmap_pvo = moea64_pvo_find_va(kernel_pmap, pc->pc_qmap_addr); PMAP_UNLOCK(kernel_pmap); mtx_init(&pc->pc_qmap_lock, "qmap lock", NULL, MTX_DEF); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL); /* * Activate a user pmap. This mostly involves setting some non-CPU * state. */ void moea64_activate(mmu_t mmu, struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_SET(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(userslb, pm->pm_slb); __asm __volatile("slbmte %0, %1; isync" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); #else PCPU_SET(curpmap, pm->pmap_phys); mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid); #endif } void moea64_deactivate(mmu_t mmu, struct thread *td) { pmap_t pm; __asm __volatile("isync; slbie %0" :: "r"(USER_ADDR)); pm = &td->td_proc->p_vmspace->vm_pmap; CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(userslb, NULL); #else PCPU_SET(curpmap, NULL); #endif } void moea64_unwire(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry key, *pvo; vm_page_t m; int64_t refchg; key.pvo_vaddr = sva; PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); pvo->pvo_vaddr &= ~PVO_WIRED; refchg = MOEA64_PTE_REPLACE(mmu, pvo, 0 /* No invalidation */); if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { if (refchg < 0) refchg = LPTE_CHG; m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } pm->pm_stats.wired_count--; } PMAP_UNLOCK(pm); } /* * This goes through and sets the physical address of our * special scratch PTE to the PA we want to zero or copy. Because * of locking issues (this can get called in pvo_enter() by * the UMA allocator), we can't use most other utility functions here */ static __inline void moea64_set_scratchpage_pa(mmu_t mmup, int which, vm_paddr_t pa) { KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!")); mtx_assert(&moea64_scratchpage_mtx, MA_OWNED); moea64_scratchpage_pvo[which]->pvo_pte.pa = moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa; MOEA64_PTE_REPLACE(mmup, moea64_scratchpage_pvo[which], MOEA64_PTE_INVALIDATE); isync(); } void moea64_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) { vm_offset_t dst; vm_offset_t src; dst = VM_PAGE_TO_PHYS(mdst); src = VM_PAGE_TO_PHYS(msrc); if (hw_direct_map) { bcopy((void *)src, (void *)dst, PAGE_SIZE); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, src); moea64_set_scratchpage_pa(mmu, 1, dst); bcopy((void *)moea64_scratchpage_va[0], (void *)moea64_scratchpage_va[1], PAGE_SIZE); mtx_unlock(&moea64_scratchpage_mtx); } } static inline void moea64_copy_pages_dmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_cp = (char *)VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]) + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_cp = (char *)VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]) + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } static inline void moea64_copy_pages_nodmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; mtx_lock(&moea64_scratchpage_mtx); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); moea64_set_scratchpage_pa(mmu, 0, VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); moea64_set_scratchpage_pa(mmu, 1, VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } mtx_unlock(&moea64_scratchpage_mtx); } void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { if (hw_direct_map) { moea64_copy_pages_dmap(mmu, ma, a_offset, mb, b_offset, xfersize); } else { moea64_copy_pages_nodmap(mmu, ma, a_offset, mb, b_offset, xfersize); } } void moea64_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); if (size + off > PAGE_SIZE) panic("moea64_zero_page: size + off > PAGE_SIZE"); if (hw_direct_map) { bzero((caddr_t)pa + off, size); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); bzero((caddr_t)moea64_scratchpage_va[0] + off, size); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Zero a page of physical memory by temporarily mapping it */ void moea64_zero_page(mmu_t mmu, vm_page_t m) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); vm_offset_t va, off; if (!hw_direct_map) { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); va = moea64_scratchpage_va[0]; } else { va = pa; } for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); if (!hw_direct_map) mtx_unlock(&moea64_scratchpage_mtx); } vm_offset_t moea64_quick_enter_page(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; vm_paddr_t pa = VM_PAGE_TO_PHYS(m); if (hw_direct_map) return (pa); /* * MOEA64_PTE_REPLACE does some locking, so we can't just grab * a critical section and access the PCPU data like on i386. * Instead, pin the thread and grab the PCPU lock to prevent * a preempting thread from using the same PCPU data. */ sched_pin(); mtx_assert(PCPU_PTR(qmap_lock), MA_NOTOWNED); pvo = PCPU_GET(qmap_pvo); mtx_lock(PCPU_PTR(qmap_lock)); pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) | (uint64_t)pa; MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_INVALIDATE); isync(); return (PCPU_GET(qmap_addr)); } void moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr) { if (hw_direct_map) return; mtx_assert(PCPU_PTR(qmap_lock), MA_OWNED); KASSERT(PCPU_GET(qmap_addr) == addr, ("moea64_quick_remove_page: invalid address")); mtx_unlock(PCPU_PTR(qmap_lock)); sched_unpin(); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ int moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct pvo_entry *pvo, *oldpvo; struct pvo_head *pvo_head; uint64_t pte_lo; int error; if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pvo = alloc_pvo_entry(0); pvo->pvo_pmap = NULL; /* to be filled in later */ pvo->pvo_pte.prot = prot; pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | pte_lo; if ((flags & PMAP_ENTER_WIRED) != 0) pvo->pvo_vaddr |= PVO_WIRED; if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) { pvo_head = NULL; } else { pvo_head = &m->md.mdpg_pvoh; pvo->pvo_vaddr |= PVO_MANAGED; } for (;;) { PV_PAGE_LOCK(m); PMAP_LOCK(pmap); if (pvo->pvo_pmap == NULL) init_pvo_entry(pvo, pmap, va); if (prot & VM_PROT_WRITE) if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); oldpvo = moea64_pvo_find_va(pmap, va); if (oldpvo != NULL) { if (oldpvo->pvo_vaddr == pvo->pvo_vaddr && oldpvo->pvo_pte.pa == pvo->pvo_pte.pa && oldpvo->pvo_pte.prot == prot) { /* Identical mapping already exists */ error = 0; /* If not in page table, reinsert it */ if (MOEA64_PTE_SYNCH(mmu, oldpvo) < 0) { moea64_pte_overflow--; MOEA64_PTE_INSERT(mmu, oldpvo); } /* Then just clean up and go home */ PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); free_pvo_entry(pvo); break; } /* Otherwise, need to kill it first */ KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old " "mapping does not match new mapping")); moea64_pvo_remove_from_pmap(mmu, oldpvo); } error = moea64_pvo_enter(mmu, pvo, pvo_head); PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); /* Free any dead pages */ if (oldpvo != NULL) { PV_LOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); moea64_pvo_remove_from_page(mmu, oldpvo); PV_UNLOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); free_pvo_entry(oldpvo); } if (error != ENOMEM) break; if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); VM_OBJECT_ASSERT_UNLOCKED(m->object); VM_WAIT; } /* * Flush the page from the instruction cache if this page is * mapped executable and cacheable. */ if (pmap != kernel_pmap && !(m->aflags & PGA_EXECUTABLE) && (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(mmu, pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); } return (KERN_SUCCESS); } static void moea64_syncicache(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz) { /* * This is much trickier than on older systems because * we can't sync the icache on physical addresses directly * without a direct map. Instead we check a couple of cases * where the memory is already mapped in and, failing that, * use the same trick we use for page zeroing to create * a temporary mapping for this physical address. */ if (!pmap_bootstrapped) { /* * If PMAP is not bootstrapped, we are likely to be * in real mode. */ __syncicache((void *)pa, sz); } else if (pmap == kernel_pmap) { __syncicache((void *)va, sz); } else if (hw_direct_map) { __syncicache((void *)pa, sz); } else { /* Use the scratch page to set up a temp mapping */ mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 1, pa & ~ADDR_POFF); __syncicache((void *)(moea64_scratchpage_va[1] + (va & ADDR_POFF)), sz); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void moea64_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { moea64_enter(mmu, pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); m = TAILQ_NEXT(m, listq); } } void moea64_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { moea64_enter(mmu, pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); } vm_paddr_t moea64_extract(mmu_t mmu, pmap_t pm, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; PMAP_LOCK(pm); pvo = moea64_pvo_find_va(pm, va); if (pvo == NULL) pa = 0; else pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t moea64_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct pvo_entry *pvo; vm_page_t m; vm_paddr_t pa; m = NULL; pa = 0; PMAP_LOCK(pmap); retry: pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) { if (vm_page_pa_tryrelock(pmap, pvo->pvo_pte.pa & LPTE_RPGN, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); vm_page_hold(m); } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } static mmu_t installed_mmu; static void * -moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, - int wait) +moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, + uint8_t *flags, int wait) { struct pvo_entry *pvo; vm_offset_t va; vm_page_t m; int pflags, needed_lock; /* * This entire routine is a horrible hack to avoid bothering kmem * for new KVA addresses. Because this can get called from inside * kmem allocation routines, calling kmem for a new address here * can lead to multiply locking non-recursive mutexes. */ *flags = UMA_SLAB_PRIV; needed_lock = !PMAP_LOCKED(kernel_pmap); pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc(NULL, 0, pflags | VM_ALLOC_NOOBJ); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); VM_WAIT; } else break; } va = VM_PAGE_TO_PHYS(m); pvo = alloc_pvo_entry(1 /* bootstrap */); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE; pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M; if (needed_lock) PMAP_LOCK(kernel_pmap); init_pvo_entry(pvo, kernel_pmap, va); pvo->pvo_vaddr |= PVO_WIRED; moea64_pvo_enter(installed_mmu, pvo, NULL); if (needed_lock) PMAP_UNLOCK(kernel_pmap); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero((void *)va, PAGE_SIZE); return (void *)va; } extern int elf32_nxstack; void moea64_init(mmu_t mmu) { CTR0(KTR_PMAP, "moea64_init"); moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); if (!hw_direct_map) { installed_mmu = mmu; uma_zone_set_allocf(moea64_pvo_zone,moea64_uma_page_alloc); } #ifdef COMPAT_FREEBSD32 elf32_nxstack = 1; #endif moea64_initialized = TRUE; } boolean_t moea64_is_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_referenced: page %p is not managed", m)); return (moea64_query_bit(mmu, m, LPTE_REF)); } boolean_t moea64_is_modified(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have LPTE_CHG set. */ VM_OBJECT_ASSERT_LOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (moea64_query_bit(mmu, m, LPTE_CHG)); } boolean_t moea64_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va) { struct pvo_entry *pvo; boolean_t rv = TRUE; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL) rv = FALSE; PMAP_UNLOCK(pmap); return (rv); } void moea64_clear_modify(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("moea64_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have LPTE_CHG * set. If the object containing the page is locked and the page is * not exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; moea64_clear_bit(mmu, m, LPTE_CHG); } /* * Clear the write and modified bits in each of the given page's mappings. */ void moea64_remove_write(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int64_t refchg, ret; pmap_t pmap; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; powerpc_sync(); PV_PAGE_LOCK(m); refchg = 0; LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { pvo->pvo_pte.prot &= ~VM_PROT_WRITE; ret = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_PROT_UPDATE); if (ret < 0) ret = LPTE_CHG; refchg |= ret; if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG) vm_page_dirty(m); vm_page_aflag_clear(m, PGA_WRITEABLE); PV_PAGE_UNLOCK(m); } /* * moea64_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int moea64_ts_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_ts_referenced: page %p is not managed", m)); return (moea64_clear_bit(mmu, m, LPTE_REF)); } /* * Modify the WIMG settings of all mappings for a page. */ void moea64_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) { struct pvo_entry *pvo; int64_t refchg; pmap_t pmap; uint64_t lo; if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; } lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) { pvo->pvo_pte.pa &= ~LPTE_WIMG; pvo->pvo_pte.pa |= lo; refchg = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_INVALIDATE); if (refchg < 0) refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ? LPTE_CHG : 0; if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } m->md.mdpg_cache_attrs = ma; PV_PAGE_UNLOCK(m); } /* * Map a wired page into kernel virtual address space. */ void moea64_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { int error; struct pvo_entry *pvo, *oldpvo; pvo = alloc_pvo_entry(0); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma); pvo->pvo_vaddr |= PVO_WIRED; PMAP_LOCK(kernel_pmap); oldpvo = moea64_pvo_find_va(kernel_pmap, va); if (oldpvo != NULL) moea64_pvo_remove_from_pmap(mmu, oldpvo); init_pvo_entry(pvo, kernel_pmap, va); error = moea64_pvo_enter(mmu, pvo, NULL); PMAP_UNLOCK(kernel_pmap); /* Free any dead pages */ if (oldpvo != NULL) { PV_LOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); moea64_pvo_remove_from_page(mmu, oldpvo); PV_UNLOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); free_pvo_entry(oldpvo); } if (error != 0 && error != ENOENT) panic("moea64_kenter: failed to enter va %#zx pa %#zx: %d", va, pa, error); } void moea64_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) { moea64_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t moea64_kextract(mmu_t mmu, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; /* * Shortcut the direct-mapped case when applicable. We never put * anything but 1:1 mappings below VM_MIN_KERNEL_ADDRESS. */ if (va < VM_MIN_KERNEL_ADDRESS) return (va); PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, va); KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR, va)); pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(kernel_pmap); return (pa); } /* * Remove a wired page from kernel virtual address space. */ void moea64_kremove(mmu_t mmu, vm_offset_t va) { moea64_remove(mmu, kernel_pmap, va, va + PAGE_SIZE); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. Other architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped region. */ vm_offset_t moea64_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva, va; if (hw_direct_map) { /* * Check if every page in the region is covered by the direct * map. The direct map covers all of physical memory. Use * moea64_calc_wimg() as a shortcut to see if the page is in * physical memory as a way to see if the direct map covers it. */ for (va = pa_start; va < pa_end; va += PAGE_SIZE) if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M) break; if (va == pa_end) return (pa_start); } sva = *virt; va = sva; /* XXX respect prot argument */ for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) moea64_kenter(mmu, va, pa_start); *virt = va; return (sva); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t moea64_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) { int loops; struct pvo_entry *pvo; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } PV_PAGE_UNLOCK(m); return (rv); } /* * Return the number of managed mappings to the given physical page * that are wired. */ int moea64_page_wired_mappings(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED) count++; PV_PAGE_UNLOCK(m); return (count); } static uintptr_t moea64_vsidcontext; uintptr_t moea64_get_unique_vsid(void) { u_int entropy; register_t hash; uint32_t mask; int i; entropy = 0; __asm __volatile("mftb %0" : "=r"(entropy)); mtx_lock(&moea64_slb_mutex); for (i = 0; i < NVSIDS; i += VSID_NBPW) { u_int n; /* * Create a new value by mutiplying by a prime and adding in * entropy from the timebase register. This is to make the * VSID more random so that the PT hash function collides * less often. (Note that the prime casues gcc to do shifts * instead of a multiply.) */ moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy; hash = moea64_vsidcontext & (NVSIDS - 1); if (hash == 0) /* 0 is special, avoid it */ continue; n = hash >> 5; mask = 1 << (hash & (VSID_NBPW - 1)); hash = (moea64_vsidcontext & VSID_HASHMASK); if (moea64_vsid_bitmap[n] & mask) { /* collision? */ /* anything free in this bucket? */ if (moea64_vsid_bitmap[n] == 0xffffffff) { entropy = (moea64_vsidcontext >> 20); continue; } i = ffs(~moea64_vsid_bitmap[n]) - 1; mask = 1 << i; hash &= rounddown2(VSID_HASHMASK, VSID_NBPW); hash |= i; } if (hash == VSID_VRMA) /* also special, avoid this too */ continue; KASSERT(!(moea64_vsid_bitmap[n] & mask), ("Allocating in-use VSID %#zx\n", hash)); moea64_vsid_bitmap[n] |= mask; mtx_unlock(&moea64_slb_mutex); return (hash); } mtx_unlock(&moea64_slb_mutex); panic("%s: out of segments",__func__); } #ifdef __powerpc64__ void moea64_pinit(mmu_t mmu, pmap_t pmap) { RB_INIT(&pmap->pmap_pvo); pmap->pm_slb_tree_root = slb_alloc_tree(); pmap->pm_slb = slb_alloc_user_cache(); pmap->pm_slb_len = 0; } #else void moea64_pinit(mmu_t mmu, pmap_t pmap) { int i; uint32_t hash; RB_INIT(&pmap->pmap_pvo); if (pmap_bootstrapped) pmap->pmap_phys = (pmap_t)moea64_kextract(mmu, (vm_offset_t)pmap); else pmap->pmap_phys = pmap; /* * Allocate some segment registers for this pmap. */ hash = moea64_get_unique_vsid(); for (i = 0; i < 16; i++) pmap->pm_sr[i] = VSID_MAKE(i, hash); KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0")); } #endif /* * Initialize the pmap associated with process 0. */ void moea64_pinit0(mmu_t mmu, pmap_t pm) { PMAP_LOCK_INIT(pm); moea64_pinit(mmu, pm); bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Set the physical protection on the specified range of this map as requested. */ static void moea64_pvo_protect(mmu_t mmu, pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) { struct vm_page *pg; vm_prot_t oldprot; int32_t refchg; PMAP_LOCK_ASSERT(pm, MA_OWNED); /* * Change the protection of the page. */ oldprot = pvo->pvo_pte.prot; pvo->pvo_pte.prot = prot; pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); /* * If the PVO is in the page table, update mapping */ refchg = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_PROT_UPDATE); if (refchg < 0) refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0; if (pm != kernel_pmap && pg != NULL && !(pg->aflags & PGA_EXECUTABLE) && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); moea64_syncicache(mmu, pm, PVO_VADDR(pvo), pvo->pvo_pte.pa & LPTE_RPGN, PAGE_SIZE); } /* * Update vm about the REF/CHG bits if the page is managed and we have * removed write access. */ if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) && (oldprot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } void moea64_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct pvo_entry *pvo, *tpvo, key; CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva, eva, prot); KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, ("moea64_protect: non current pmap")); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { moea64_remove(mmu, pm, sva, eva); return; } PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); moea64_pvo_protect(mmu, pm, pvo, prot); } PMAP_UNLOCK(pm); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void moea64_qenter(mmu_t mmu, vm_offset_t va, vm_page_t *m, int count) { while (count-- > 0) { moea64_kenter(mmu, va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by moea64_qenter. */ void moea64_qremove(mmu_t mmu, vm_offset_t va, int count) { while (count-- > 0) { moea64_kremove(mmu, va); va += PAGE_SIZE; } } void moea64_release_vsid(uint64_t vsid) { int idx, mask; mtx_lock(&moea64_slb_mutex); idx = vsid & (NVSIDS-1); mask = 1 << (idx % VSID_NBPW); idx /= VSID_NBPW; KASSERT(moea64_vsid_bitmap[idx] & mask, ("Freeing unallocated VSID %#jx", vsid)); moea64_vsid_bitmap[idx] &= ~mask; mtx_unlock(&moea64_slb_mutex); } void moea64_release(mmu_t mmu, pmap_t pmap) { /* * Free segment registers' VSIDs */ #ifdef __powerpc64__ slb_free_tree(pmap); slb_free_user_cache(pmap->pm_slb); #else KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0")); moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0])); #endif } /* * Remove all pages mapped by the specified pmap */ void moea64_remove_pages(mmu_t mmu, pmap_t pm) { struct pvo_entry *pvo, *tpvo; struct pvo_tree tofree; RB_INIT(&tofree); PMAP_LOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) { if (pvo->pvo_vaddr & PVO_WIRED) continue; /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(mmu, pvo); RB_INSERT(pvo_tree, &tofree, pvo); } PMAP_UNLOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &tofree, tpvo) { PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN); moea64_pvo_remove_from_page(mmu, pvo); PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN); RB_REMOVE(pvo_tree, &tofree, pvo); free_pvo_entry(pvo); } } /* * Remove the given range of addresses from the specified map. */ void moea64_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry *pvo, *tpvo, key; struct pvo_tree tofree; /* * Perform an unsynchronized read. This is, however, safe. */ if (pm->pm_stats.resident_count == 0) return; key.pvo_vaddr = sva; RB_INIT(&tofree); PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(mmu, pvo); RB_INSERT(pvo_tree, &tofree, pvo); } PMAP_UNLOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &tofree, tpvo) { PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN); moea64_pvo_remove_from_page(mmu, pvo); PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN); RB_REMOVE(pvo_tree, &tofree, pvo); free_pvo_entry(pvo); } } /* * Remove physical page from all pmaps in which it resides. moea64_pvo_remove() * will reflect changes in pte's back to the vm_page. */ void moea64_remove_all(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo, *next_pvo; struct pvo_head freequeue; int wasdead; pmap_t pmap; LIST_INIT(&freequeue); PV_PAGE_LOCK(m); LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); wasdead = (pvo->pvo_vaddr & PVO_DEAD); if (!wasdead) moea64_pvo_remove_from_pmap(mmu, pvo); moea64_pvo_remove_from_page(mmu, pvo); if (!wasdead) LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); PMAP_UNLOCK(pmap); } KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings")); KASSERT(!(m->aflags & PGA_WRITEABLE), ("Page still writable")); PV_PAGE_UNLOCK(m); /* Clean up UMA allocations */ LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo) free_pvo_entry(pvo); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from moea64_bootstrap before avail start and end are * calculated. */ vm_offset_t moea64_bootstrap_alloc(vm_size_t size, u_int align) { vm_offset_t s, e; int i, j; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (align != 0) s = roundup2(phys_avail[i], align); else s = phys_avail[i]; e = s + size; if (s < phys_avail[i] || e > phys_avail[i + 1]) continue; if (s + size > platform_real_maxaddr()) continue; if (s == phys_avail[i]) { phys_avail[i] += size; } else if (e == phys_avail[i + 1]) { phys_avail[i + 1] -= size; } else { for (j = phys_avail_count * 2; j > i; j -= 2) { phys_avail[j] = phys_avail[j - 2]; phys_avail[j + 1] = phys_avail[j - 1]; } phys_avail[i + 3] = phys_avail[i + 1]; phys_avail[i + 1] = s; phys_avail[i + 2] = e; phys_avail_count++; } return (s); } panic("moea64_bootstrap_alloc: could not allocate memory"); } static int moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, struct pvo_head *pvo_head) { int first, err; PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); KASSERT(moea64_pvo_find_va(pvo->pvo_pmap, PVO_VADDR(pvo)) == NULL, ("Existing mapping for VA %#jx", (uintmax_t)PVO_VADDR(pvo))); moea64_pvo_enter_calls++; /* * Add to pmap list */ RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Remember if the list was empty and therefore will be the first * item. */ if (pvo_head != NULL) { if (LIST_FIRST(pvo_head) == NULL) first = 1; LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); } if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count++; pvo->pvo_pmap->pm_stats.resident_count++; /* * Insert it into the hardware page table */ err = MOEA64_PTE_INSERT(mmu, pvo); if (err != 0) { panic("moea64_pvo_enter: overflow"); } moea64_pvo_entries++; if (pvo->pvo_pmap == kernel_pmap) isync(); #ifdef __powerpc64__ /* * Make sure all our bootstrap mappings are in the SLB as soon * as virtual memory is switched on. */ if (!pmap_bootstrapped) moea64_bootstrap_slb_prefault(PVO_VADDR(pvo), pvo->pvo_vaddr & PVO_LARGE); #endif return (first ? ENOENT : 0); } static void moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo) { struct vm_page *pg; int32_t refchg; KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap")); PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO")); /* * If there is an active pte entry, we need to deactivate it */ refchg = MOEA64_PTE_UNSET(mmu, pvo); if (refchg < 0) { /* * If it was evicted from the page table, be pessimistic and * dirty the page. */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) refchg = LPTE_CHG; else refchg = 0; } /* * Update our statistics. */ pvo->pvo_pmap->pm_stats.resident_count--; if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count--; /* * Remove this PVO from the pmap list. */ RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Mark this for the next sweep */ pvo->pvo_vaddr |= PVO_DEAD; /* Send RC bits to VM */ if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); if (pg != NULL) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } } static void moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo) { struct vm_page *pg; KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page")); /* Use NULL pmaps as a sentinel for races in page deletion */ if (pvo->pvo_pmap == NULL) return; pvo->pvo_pmap = NULL; /* * Update vm about page writeability/executability if managed */ PV_LOCKASSERT(pvo->pvo_pte.pa & LPTE_RPGN); pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); if ((pvo->pvo_vaddr & PVO_MANAGED) && pg != NULL) { LIST_REMOVE(pvo, pvo_vlink); if (LIST_EMPTY(vm_page_to_pvoh(pg))) vm_page_aflag_clear(pg, PGA_WRITEABLE | PGA_EXECUTABLE); } moea64_pvo_entries--; moea64_pvo_remove_calls++; } static struct pvo_entry * moea64_pvo_find_va(pmap_t pm, vm_offset_t va) { struct pvo_entry key; PMAP_LOCK_ASSERT(pm, MA_OWNED); key.pvo_vaddr = va & ~ADDR_POFF; return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key)); } static boolean_t moea64_query_bit(mmu_t mmu, vm_page_t m, uint64_t ptebit) { struct pvo_entry *pvo; int64_t ret; boolean_t rv; /* * See if this bit is stored in the page already. */ if (m->md.mdpg_attrs & ptebit) return (TRUE); /* * Examine each PTE. Sync so that any pending REF/CHG bits are * flushed to the PTEs. */ rv = FALSE; powerpc_sync(); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { ret = 0; /* * See if this pvo has a valid PTE. if so, fetch the * REF/CHG bits from the valid PTE. If the appropriate * ptebit is set, return success. */ PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = MOEA64_PTE_SYNCH(mmu, pvo); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0) { atomic_set_32(&m->md.mdpg_attrs, ret & (LPTE_CHG | LPTE_REF)); if (ret & ptebit) { rv = TRUE; break; } } } PV_PAGE_UNLOCK(m); return (rv); } static u_int moea64_clear_bit(mmu_t mmu, vm_page_t m, u_int64_t ptebit) { u_int count; struct pvo_entry *pvo; int64_t ret; /* * Sync so that any pending REF/CHG bits are flushed to the PTEs (so * we can reset the right ones). */ powerpc_sync(); /* * For each pvo entry, clear the pte's ptebit. */ count = 0; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { ret = 0; PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = MOEA64_PTE_CLEAR(mmu, pvo, ptebit); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0 && (ret & ptebit)) count++; } atomic_clear_32(&m->md.mdpg_attrs, ptebit); PV_PAGE_UNLOCK(m); return (count); } boolean_t moea64_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { struct pvo_entry *pvo, key; vm_offset_t ppa; int error = 0; PMAP_LOCK(kernel_pmap); key.pvo_vaddr = ppa = pa & ~ADDR_POFF; for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key); ppa < pa + size; ppa += PAGE_SIZE, pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) { if (pvo == NULL || (pvo->pvo_pte.pa & LPTE_RPGN) != ppa) { error = EFAULT; break; } } PMAP_UNLOCK(kernel_pmap); return (error); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * moea64_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, ppa, offset; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("moea64_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { moea64_kenter_attr(mmu, tmpva, ppa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } return ((void *)(va + offset)); } void * moea64_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { return moea64_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT); } void moea64_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) { vm_offset_t base, offset; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); kva_free(base, size); } void moea64_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz) { struct pvo_entry *pvo; vm_offset_t lim; vm_paddr_t pa; vm_size_t len; PMAP_LOCK(pm); while (sz > 0) { lim = round_page(va); len = MIN(lim - va, sz); pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) { pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va & ADDR_POFF); moea64_syncicache(mmu, pm, va, pa, len); } va += len; sz -= len; } PMAP_UNLOCK(pm); } void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va) { *va = (void *)pa; } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void moea64_scan_init(mmu_t mmu) { struct pvo_entry *pvo; vm_offset_t va; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). */ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); for (i = 0; i < pregions_sz; i++) { dump_map[i].pa_start = pregions[i].mr_start; dump_map[i].pa_size = pregions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = (vm_paddr_t)msgbufp->msg_ptr; dump_map[1].pa_size = round_page(msgbufp->msg_size); /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } Index: projects/numa2/sys/powerpc/aim/slb.c =================================================================== --- projects/numa2/sys/powerpc/aim/slb.c (revision 321505) +++ projects/numa2/sys/powerpc/aim/slb.c (revision 321506) @@ -1,542 +1,543 @@ /*- * Copyright (c) 2010 Nathan Whitehorn * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include uintptr_t moea64_get_unique_vsid(void); void moea64_release_vsid(uint64_t vsid); static void slb_zone_init(void *); static uma_zone_t slbt_zone; static uma_zone_t slb_cache_zone; int n_slbs = 64; SYSINIT(slb_zone_init, SI_SUB_KMEM, SI_ORDER_ANY, slb_zone_init, NULL); struct slbtnode { uint16_t ua_alloc; uint8_t ua_level; /* Only 36 bits needed for full 64-bit address space. */ uint64_t ua_base; union { struct slbtnode *ua_child[16]; struct slb slb_entries[16]; } u; }; /* * For a full 64-bit address space, there are 36 bits in play in an * esid, so 8 levels, with the leaf being at level 0. * * |3333|3322|2222|2222|1111|1111|11 | | | esid * |5432|1098|7654|3210|9876|5432|1098|7654|3210| bits * +----+----+----+----+----+----+----+----+----+-------- * | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | level */ #define UAD_ROOT_LEVEL 8 #define UAD_LEAF_LEVEL 0 static inline int esid2idx(uint64_t esid, int level) { int shift; shift = level * 4; return ((esid >> shift) & 0xF); } /* * The ua_base field should have 0 bits after the first 4*(level+1) * bits; i.e. only */ #define uad_baseok(ua) \ (esid2base(ua->ua_base, ua->ua_level) == ua->ua_base) static inline uint64_t esid2base(uint64_t esid, int level) { uint64_t mask; int shift; shift = (level + 1) * 4; mask = ~((1ULL << shift) - 1); return (esid & mask); } /* * Allocate a new leaf node for the specified esid/vmhandle from the * parent node. */ static struct slb * make_new_leaf(uint64_t esid, uint64_t slbv, struct slbtnode *parent) { struct slbtnode *child; struct slb *retval; int idx; idx = esid2idx(esid, parent->ua_level); KASSERT(parent->u.ua_child[idx] == NULL, ("Child already exists!")); /* unlock and M_WAITOK and loop? */ child = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO); KASSERT(child != NULL, ("unhandled NULL case")); child->ua_level = UAD_LEAF_LEVEL; child->ua_base = esid2base(esid, child->ua_level); idx = esid2idx(esid, child->ua_level); child->u.slb_entries[idx].slbv = slbv; child->u.slb_entries[idx].slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; setbit(&child->ua_alloc, idx); retval = &child->u.slb_entries[idx]; /* * The above stores must be visible before the next one, so * that a lockless searcher always sees a valid path through * the tree. */ powerpc_lwsync(); idx = esid2idx(esid, parent->ua_level); parent->u.ua_child[idx] = child; setbit(&parent->ua_alloc, idx); return (retval); } /* * Allocate a new intermediate node to fit between the parent and * esid. */ static struct slbtnode* make_intermediate(uint64_t esid, struct slbtnode *parent) { struct slbtnode *child, *inter; int idx, level; idx = esid2idx(esid, parent->ua_level); child = parent->u.ua_child[idx]; KASSERT(esid2base(esid, child->ua_level) != child->ua_base, ("No need for an intermediate node?")); /* * Find the level where the existing child and our new esid * meet. It must be lower than parent->ua_level or we would * have chosen a different index in parent. */ level = child->ua_level + 1; while (esid2base(esid, level) != esid2base(child->ua_base, level)) level++; KASSERT(level < parent->ua_level, ("Found splitting level %d for %09jx and %09jx, " "but it's the same as %p's", level, esid, child->ua_base, parent)); /* unlock and M_WAITOK and loop? */ inter = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO); KASSERT(inter != NULL, ("unhandled NULL case")); /* Set up intermediate node to point to child ... */ inter->ua_level = level; inter->ua_base = esid2base(esid, inter->ua_level); idx = esid2idx(child->ua_base, inter->ua_level); inter->u.ua_child[idx] = child; setbit(&inter->ua_alloc, idx); powerpc_lwsync(); /* Set up parent to point to intermediate node ... */ idx = esid2idx(inter->ua_base, parent->ua_level); parent->u.ua_child[idx] = inter; setbit(&parent->ua_alloc, idx); return (inter); } uint64_t kernel_va_to_slbv(vm_offset_t va) { uint64_t slbv; /* Set kernel VSID to deterministic value */ slbv = (KERNEL_VSID((uintptr_t)va >> ADDR_SR_SHFT)) << SLBV_VSID_SHIFT; /* Figure out if this is a large-page mapping */ if (hw_direct_map && va < VM_MIN_KERNEL_ADDRESS) { /* * XXX: If we have set up a direct map, assumes * all physical memory is mapped with large pages. */ if (mem_valid(va, 0) == 0) slbv |= SLBV_L; } return (slbv); } struct slb * user_va_to_slb_entry(pmap_t pm, vm_offset_t va) { uint64_t esid = va >> ADDR_SR_SHFT; struct slbtnode *ua; int idx; ua = pm->pm_slb_tree_root; for (;;) { KASSERT(uad_baseok(ua), ("uad base %016jx level %d bad!", ua->ua_base, ua->ua_level)); idx = esid2idx(esid, ua->ua_level); /* * This code is specific to ppc64 where a load is * atomic, so no need for atomic_load macro. */ if (ua->ua_level == UAD_LEAF_LEVEL) return ((ua->u.slb_entries[idx].slbe & SLBE_VALID) ? &ua->u.slb_entries[idx] : NULL); /* * The following accesses are implicitly ordered under the POWER * ISA by load dependencies (the store ordering is provided by * the powerpc_lwsync() calls elsewhere) and so are run without * barriers. */ ua = ua->u.ua_child[idx]; if (ua == NULL || esid2base(esid, ua->ua_level) != ua->ua_base) return (NULL); } return (NULL); } uint64_t va_to_vsid(pmap_t pm, vm_offset_t va) { struct slb *entry; /* Shortcut kernel case */ if (pm == kernel_pmap) return (KERNEL_VSID((uintptr_t)va >> ADDR_SR_SHFT)); /* * If there is no vsid for this VA, we need to add a new entry * to the PMAP's segment table. */ entry = user_va_to_slb_entry(pm, va); if (entry == NULL) return (allocate_user_vsid(pm, (uintptr_t)va >> ADDR_SR_SHFT, 0)); return ((entry->slbv & SLBV_VSID_MASK) >> SLBV_VSID_SHIFT); } uint64_t allocate_user_vsid(pmap_t pm, uint64_t esid, int large) { uint64_t vsid, slbv; struct slbtnode *ua, *next, *inter; struct slb *slb; int idx; KASSERT(pm != kernel_pmap, ("Attempting to allocate a kernel VSID")); PMAP_LOCK_ASSERT(pm, MA_OWNED); vsid = moea64_get_unique_vsid(); slbv = vsid << SLBV_VSID_SHIFT; if (large) slbv |= SLBV_L; ua = pm->pm_slb_tree_root; /* Descend to the correct leaf or NULL pointer. */ for (;;) { KASSERT(uad_baseok(ua), ("uad base %09jx level %d bad!", ua->ua_base, ua->ua_level)); idx = esid2idx(esid, ua->ua_level); if (ua->ua_level == UAD_LEAF_LEVEL) { ua->u.slb_entries[idx].slbv = slbv; eieio(); ua->u.slb_entries[idx].slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; setbit(&ua->ua_alloc, idx); slb = &ua->u.slb_entries[idx]; break; } next = ua->u.ua_child[idx]; if (next == NULL) { slb = make_new_leaf(esid, slbv, ua); break; } /* * Check if the next item down has an okay ua_base. * If not, we need to allocate an intermediate node. */ if (esid2base(esid, next->ua_level) != next->ua_base) { inter = make_intermediate(esid, ua); slb = make_new_leaf(esid, slbv, inter); break; } ua = next; } /* * Someone probably wants this soon, and it may be a wired * SLB mapping, so pre-spill this entry. */ eieio(); slb_insert_user(pm, slb); return (vsid); } void free_vsid(pmap_t pm, uint64_t esid, int large) { struct slbtnode *ua; int idx; PMAP_LOCK_ASSERT(pm, MA_OWNED); ua = pm->pm_slb_tree_root; /* Descend to the correct leaf. */ for (;;) { KASSERT(uad_baseok(ua), ("uad base %09jx level %d bad!", ua->ua_base, ua->ua_level)); idx = esid2idx(esid, ua->ua_level); if (ua->ua_level == UAD_LEAF_LEVEL) { ua->u.slb_entries[idx].slbv = 0; eieio(); ua->u.slb_entries[idx].slbe = 0; clrbit(&ua->ua_alloc, idx); return; } ua = ua->u.ua_child[idx]; if (ua == NULL || esid2base(esid, ua->ua_level) != ua->ua_base) { /* Perhaps just return instead of assert? */ KASSERT(0, ("Asked to remove an entry that was never inserted!")); return; } } } static void free_slb_tree_node(struct slbtnode *ua) { int idx; for (idx = 0; idx < 16; idx++) { if (ua->ua_level != UAD_LEAF_LEVEL) { if (ua->u.ua_child[idx] != NULL) free_slb_tree_node(ua->u.ua_child[idx]); } else { if (ua->u.slb_entries[idx].slbv != 0) moea64_release_vsid(ua->u.slb_entries[idx].slbv >> SLBV_VSID_SHIFT); } } uma_zfree(slbt_zone, ua); } void slb_free_tree(pmap_t pm) { free_slb_tree_node(pm->pm_slb_tree_root); } struct slbtnode * slb_alloc_tree(void) { struct slbtnode *root; root = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO); root->ua_level = UAD_ROOT_LEVEL; return (root); } /* Lock entries mapping kernel text and stacks */ void slb_insert_kernel(uint64_t slbe, uint64_t slbv) { struct slb *slbcache; int i; /* We don't want to be preempted while modifying the kernel map */ critical_enter(); slbcache = PCPU_GET(slb); /* Check for an unused slot, abusing the user slot as a full flag */ if (slbcache[USER_SLB_SLOT].slbe == 0) { for (i = 0; i < n_slbs; i++) { if (i == USER_SLB_SLOT) continue; if (!(slbcache[i].slbe & SLBE_VALID)) goto fillkernslb; } if (i == n_slbs) slbcache[USER_SLB_SLOT].slbe = 1; } i = mftb() % n_slbs; if (i == USER_SLB_SLOT) i = (i+1) % n_slbs; fillkernslb: KASSERT(i != USER_SLB_SLOT, ("Filling user SLB slot with a kernel mapping")); slbcache[i].slbv = slbv; slbcache[i].slbe = slbe | (uint64_t)i; /* If it is for this CPU, put it in the SLB right away */ if (pmap_bootstrapped) { /* slbie not required */ __asm __volatile ("slbmte %0, %1" :: "r"(slbcache[i].slbv), "r"(slbcache[i].slbe)); } critical_exit(); } void slb_insert_user(pmap_t pm, struct slb *slb) { int i; PMAP_LOCK_ASSERT(pm, MA_OWNED); if (pm->pm_slb_len < n_slbs) { i = pm->pm_slb_len; pm->pm_slb_len++; } else { i = mftb() % n_slbs; } /* Note that this replacement is atomic with respect to trap_subr */ pm->pm_slb[i] = slb; } static void * -slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, int domain, + u_int8_t *flags, int wait) { static vm_offset_t realmax = 0; void *va; vm_page_t m; int pflags; if (realmax == 0) realmax = platform_real_maxaddr(); *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc_contig(NULL, 0, pflags, 1, 0, realmax, PAGE_SIZE, PAGE_SIZE, VM_MEMATTR_DEFAULT); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); VM_WAIT; } else break; } va = (void *) VM_PAGE_TO_PHYS(m); if (!hw_direct_map) pmap_kenter((vm_offset_t)va, VM_PAGE_TO_PHYS(m)); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } static void slb_zone_init(void *dummy) { slbt_zone = uma_zcreate("SLB tree node", sizeof(struct slbtnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); slb_cache_zone = uma_zcreate("SLB cache", (n_slbs + 1)*sizeof(struct slb *), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); if (platform_real_maxaddr() != VM_MAX_ADDRESS) { uma_zone_set_allocf(slb_cache_zone, slb_uma_real_alloc); uma_zone_set_allocf(slbt_zone, slb_uma_real_alloc); } } struct slb ** slb_alloc_user_cache(void) { return (uma_zalloc(slb_cache_zone, M_ZERO)); } void slb_free_user_cache(struct slb **slb) { uma_zfree(slb_cache_zone, slb); } Index: projects/numa2/sys/powerpc/powerpc/uma_machdep.c =================================================================== --- projects/numa2/sys/powerpc/powerpc/uma_machdep.c (revision 321505) +++ projects/numa2/sys/powerpc/powerpc/uma_machdep.c (revision 321506) @@ -1,106 +1,107 @@ /*- * Copyright (c) 2003 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int hw_uma_mdpages; SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw_uma_mdpages, 0, "UMA MD pages in use"); void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { void *va; vm_paddr_t pa; vm_page_t m; int pflags; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc(NULL, 0, pflags | VM_ALLOC_NOOBJ); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); VM_WAIT; } else break; } pa = VM_PAGE_TO_PHYS(m); /* On book-e sizeof(void *) < sizeof(vm_paddr_t) */ if ((vm_offset_t)pa != pa) return (NULL); va = (void *)(vm_offset_t)pa; if (!hw_direct_map) pmap_kenter((vm_offset_t)va, VM_PAGE_TO_PHYS(m)); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); atomic_add_int(&hw_uma_mdpages, 1); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; if (!hw_direct_map) pmap_remove(kernel_pmap,(vm_offset_t)mem, (vm_offset_t)mem + PAGE_SIZE); m = PHYS_TO_VM_PAGE((vm_offset_t)mem); m->wire_count--; vm_page_free(m); atomic_subtract_int(&vm_cnt.v_wire_count, 1); atomic_subtract_int(&hw_uma_mdpages, 1); } Index: projects/numa2/sys/riscv/riscv/uma_machdep.c =================================================================== --- projects/numa2/sys/riscv/riscv/uma_machdep.c (revision 321505) +++ projects/numa2/sys/riscv/riscv/uma_machdep.c (revision 321506) @@ -1,55 +1,56 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { panic("uma_small_alloc"); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { panic("uma_small_free"); } Index: projects/numa2/sys/sparc64/sparc64/vm_machdep.c =================================================================== --- projects/numa2/sys/sparc64/sparc64/vm_machdep.c (revision 321505) +++ projects/numa2/sys/sparc64/sparc64/vm_machdep.c (revision 321506) @@ -1,455 +1,456 @@ /*- * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * Copyright (c) 1994 John Dyson * Copyright (c) 2001 Jake Burkholder. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ * from: FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.167 2001/07/12 */ #include __FBSDID("$FreeBSD$"); #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include PMAP_STATS_VAR(uma_nsmall_alloc); PMAP_STATS_VAR(uma_nsmall_alloc_oc); PMAP_STATS_VAR(uma_nsmall_free); void cpu_exit(struct thread *td) { struct proc *p; p = td->td_proc; p->p_md.md_sigtramp = NULL; if (p->p_md.md_utrap != NULL) { utrap_free(p->p_md.md_utrap); p->p_md.md_utrap = NULL; } } void cpu_thread_exit(struct thread *td) { } void cpu_thread_clean(struct thread *td) { } void cpu_thread_alloc(struct thread *td) { struct pcb *pcb; pcb = (struct pcb *)((td->td_kstack + td->td_kstack_pages * PAGE_SIZE - sizeof(struct pcb)) & ~0x3fUL); pcb->pcb_nsaved = 0; td->td_frame = (struct trapframe *)pcb - 1; td->td_pcb = pcb; } void cpu_thread_free(struct thread *td) { } void cpu_thread_swapin(struct thread *td) { } void cpu_thread_swapout(struct thread *td) { } void cpu_set_syscall_retval(struct thread *td, int error) { switch (error) { case 0: td->td_frame->tf_out[0] = td->td_retval[0]; td->td_frame->tf_out[1] = td->td_retval[1]; td->td_frame->tf_tstate &= ~TSTATE_XCC_C; break; case ERESTART: /* * Undo the tpc advancement we have done on syscall * enter, we want to reexecute the system call. */ td->td_frame->tf_tpc = td->td_pcb->pcb_tpc; td->td_frame->tf_tnpc -= 4; break; case EJUSTRETURN: break; default: td->td_frame->tf_out[0] = SV_ABI_ERRNO(td->td_proc, error); td->td_frame->tf_tstate |= TSTATE_XCC_C; break; } } void cpu_copy_thread(struct thread *td, struct thread *td0) { struct trapframe *tf; struct frame *fr; struct pcb *pcb; bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe)); pcb = td->td_pcb; tf = td->td_frame; fr = (struct frame *)tf - 1; fr->fr_local[0] = (u_long)fork_return; fr->fr_local[1] = (u_long)td; fr->fr_local[2] = (u_long)tf; pcb->pcb_pc = (u_long)fork_trampoline - 8; pcb->pcb_sp = (u_long)fr - SPOFF; /* Setup to release the spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_pil = 0; } void cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg, stack_t *stack) { struct trapframe *tf; uint64_t sp; if (td == curthread) flushw(); tf = td->td_frame; sp = (uint64_t)stack->ss_sp + stack->ss_size; tf->tf_out[0] = (uint64_t)arg; tf->tf_out[6] = sp - SPOFF - sizeof(struct frame); tf->tf_tpc = (uint64_t)entry; tf->tf_tnpc = tf->tf_tpc + 4; td->td_retval[0] = tf->tf_out[0]; td->td_retval[1] = tf->tf_out[1]; } int cpu_set_user_tls(struct thread *td, void *tls_base) { if (td == curthread) flushw(); td->td_frame->tf_global[7] = (uint64_t)tls_base; return (0); } /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the pcb, set up the stack so that the child * ready to run and return to user mode. */ void cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags) { struct trapframe *tf; struct frame *fp; struct pcb *pcb1; struct pcb *pcb2; vm_offset_t sp; int error; int i; KASSERT(td1 == curthread || td1 == &thread0, ("cpu_fork: p1 not curproc and not proc0")); if ((flags & RFPROC) == 0) return; p2->p_md.md_sigtramp = td1->td_proc->p_md.md_sigtramp; p2->p_md.md_utrap = utrap_hold(td1->td_proc->p_md.md_utrap); /* The pcb must be aligned on a 64-byte boundary. */ pcb1 = td1->td_pcb; pcb2 = (struct pcb *)((td2->td_kstack + td2->td_kstack_pages * PAGE_SIZE - sizeof(struct pcb)) & ~0x3fUL); td2->td_pcb = pcb2; /* * Ensure that p1's pcb is up to date. */ critical_enter(); if ((td1->td_frame->tf_fprs & FPRS_FEF) != 0) savefpctx(pcb1->pcb_ufp); critical_exit(); /* Make sure the copied windows are spilled. */ flushw(); /* Copy the pcb (this will copy the windows saved in the pcb, too). */ bcopy(pcb1, pcb2, sizeof(*pcb1)); /* * If we're creating a new user process and we're sharing the address * space, the parent's top most frame must be saved in the pcb. The * child will pop the frame when it returns to user mode, and may * overwrite it with its own data causing much suffering for the * parent. We check if its already in the pcb, and if not copy it * in. Its unlikely that the copyin will fail, but if so there's not * much we can do. The parent will likely crash soon anyway in that * case. */ if ((flags & RFMEM) != 0 && td1 != &thread0) { sp = td1->td_frame->tf_sp; for (i = 0; i < pcb1->pcb_nsaved; i++) { if (pcb1->pcb_rwsp[i] == sp) break; } if (i == pcb1->pcb_nsaved) { error = copyin((caddr_t)sp + SPOFF, &pcb1->pcb_rw[i], sizeof(struct rwindow)); if (error == 0) { pcb1->pcb_rwsp[i] = sp; pcb1->pcb_nsaved++; } } } /* * Create a new fresh stack for the new process. * Copy the trap frame for the return to user mode as if from a * syscall. This copies most of the user mode register values. */ tf = (struct trapframe *)pcb2 - 1; bcopy(td1->td_frame, tf, sizeof(*tf)); tf->tf_out[0] = 0; /* Child returns zero */ tf->tf_out[1] = 0; tf->tf_tstate &= ~TSTATE_XCC_C; /* success */ tf->tf_fprs = 0; td2->td_frame = tf; fp = (struct frame *)tf - 1; fp->fr_local[0] = (u_long)fork_return; fp->fr_local[1] = (u_long)td2; fp->fr_local[2] = (u_long)tf; /* Terminate stack traces at this frame. */ fp->fr_pc = fp->fr_fp = 0; pcb2->pcb_sp = (u_long)fp - SPOFF; pcb2->pcb_pc = (u_long)fork_trampoline - 8; /* Setup to release the spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_pil = 0; /* * Now, cpu_switch() can schedule the new process. */ } void cpu_reset(void) { static char bspec[64] = ""; phandle_t chosen; static struct { cell_t name; cell_t nargs; cell_t nreturns; cell_t bootspec; } args = { (cell_t)"boot", 1, 0, (cell_t)bspec }; if ((chosen = OF_finddevice("/chosen")) != -1) { if (OF_getprop(chosen, "bootpath", bspec, sizeof(bspec)) == -1) bspec[0] = '\0'; bspec[sizeof(bspec) - 1] = '\0'; } cpu_shutdown(&args); } /* * Intercept the return address from a freshly forked process that has NOT * been scheduled yet. * * This is needed to make kernel threads stay in kernel mode. */ void cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg) { struct frame *fp; struct pcb *pcb; pcb = td->td_pcb; fp = (struct frame *)(pcb->pcb_sp + SPOFF); fp->fr_local[0] = (u_long)func; fp->fr_local[1] = (u_long)arg; } int is_physical_memory(vm_paddr_t addr) { struct ofw_mem_region *mr; for (mr = sparc64_memreg; mr < sparc64_memreg + sparc64_nmemreg; mr++) if (addr >= mr->mr_start && addr < mr->mr_start + mr->mr_size) return (1); return (0); } void swi_vm(void *v) { /* Nothing to do here - busdma bounce buffers are not implemented. */ } void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_paddr_t pa; vm_page_t m; int pflags; void *va; PMAP_STATS_INC(uma_nsmall_alloc); *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc(NULL, 0, pflags | VM_ALLOC_NOOBJ); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else VM_WAIT; } else break; } pa = VM_PAGE_TO_PHYS(m); if (dcache_color_ignore == 0 && m->md.color != DCACHE_COLOR(pa)) { KASSERT(m->md.colors[0] == 0 && m->md.colors[1] == 0, ("uma_small_alloc: free page %p still has mappings!", m)); PMAP_STATS_INC(uma_nsmall_alloc_oc); m->md.color = DCACHE_COLOR(pa); dcache_page_inval(pa); } va = (void *)TLB_PHYS_TO_DIRECT(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) cpu_block_zero(va, PAGE_SIZE); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; PMAP_STATS_INC(uma_nsmall_free); m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS((vm_offset_t)mem)); m->wire_count--; vm_page_free(m); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } void sf_buf_map(struct sf_buf *sf, int flags) { pmap_qenter(sf->kva, &sf->m, 1); } int sf_buf_unmap(struct sf_buf *sf) { pmap_qremove(sf->kva, 1); return (1); } Index: projects/numa2/sys/sys/_vm_domain.h =================================================================== --- projects/numa2/sys/sys/_vm_domain.h (revision 321505) +++ projects/numa2/sys/sys/_vm_domain.h (revision 321506) @@ -1,61 +1,66 @@ /*- * Copyright (c) 2015 Adrian Chadd . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. * * $FreeBSD$ */ #ifndef __SYS_VM_DOMAIN_H__ #define __SYS_VM_DOMAIN_H__ #include typedef enum { VM_POLICY_NONE, VM_POLICY_ROUND_ROBIN, VM_POLICY_FIXED_DOMAIN, VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN, VM_POLICY_FIRST_TOUCH, VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, - VM_POLICY_MAX } vm_domain_policy_type_t; struct vm_domain_policy_entry { vm_domain_policy_type_t policy; int domain; }; struct vm_domain_policy { seq_t seq; struct vm_domain_policy_entry p; +}; + +struct vm_domain_iterator { + vm_domain_policy_type_t policy; + int cursor; + int domain; }; #define VM_DOMAIN_POLICY_STATIC_INITIALISER(vt, vd) \ { .seq = 0, \ .p.policy = vt, \ .p.domain = vd } #endif /* __SYS_VM_DOMAIN_H__ */ Index: projects/numa2/sys/sys/busdma_bufalloc.h =================================================================== --- projects/numa2/sys/sys/busdma_bufalloc.h (revision 321505) +++ projects/numa2/sys/sys/busdma_bufalloc.h (revision 321506) @@ -1,118 +1,118 @@ /*- * Copyright (c) 2012 Ian Lepore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ */ /* * A buffer pool manager, for use by a platform's busdma implementation. */ #ifndef _MACHINE_BUSDMA_BUFALLOC_H_ #define _MACHINE_BUSDMA_BUFALLOC_H_ #include #include /* * Information about a buffer zone, returned by busdma_bufalloc_findzone(). */ struct busdma_bufzone { bus_size_t size; uma_zone_t umazone; char name[24]; }; /* * Opaque handle type returned by busdma_bufalloc_create(). */ struct busdma_bufalloc; typedef struct busdma_bufalloc *busdma_bufalloc_t; /* * Create an allocator that manages a pool of DMA buffers. * * The allocator manages a collection of uma(9) zones of buffers in power-of-two * sized increments ranging from minimum_alignment to the platform's PAGE_SIZE. * The buffers within each zone are aligned on boundaries corresponding to the * buffer size, and thus by implication each buffer is contiguous within a page * and does not cross a power of two boundary larger than the buffer size. * These rules are intended to make it easy for a busdma implementation to * check whether a tag's constraints allow use of a buffer from the allocator. * * minimum_alignment is also the minimum buffer allocation size. For platforms * with software-assisted cache coherency, this is typically the data cache line * size (and MUST not be smaller than the cache line size). * * name appears in zone stats as 'dma name nnnnn' where 'dma' is fixed and * 'nnnnn' is the size of buffers in that zone. * * If the alloc/free function pointers are NULL, the regular uma internal * allocators are used (I.E., you get "plain old kernel memory"). On a platform * with an exclusion zone that applies to all DMA operations, a custom allocator * could be used to ensure no buffer memory is ever allocated from that zone, * allowing the bus_dmamem_alloc() implementation to make the assumption that * buffers provided by the allocation could never lead to the need for a bounce. */ busdma_bufalloc_t busdma_bufalloc_create(const char *name, bus_size_t minimum_alignment, uma_alloc uma_alloc_func, uma_free uma_free_func, u_int32_t uma_zcreate_flags); /* * Destroy an allocator created by busdma_bufalloc_create(). * Safe to call with a NULL pointer. */ void busdma_bufalloc_destroy(busdma_bufalloc_t ba); /* * Return a pointer to the busdma_bufzone that should be used to allocate or * free a buffer of the given size. Returns NULL if the size is larger than the * largest zone handled by the allocator. */ struct busdma_bufzone * busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size); /* * These built-in allocation routines are available for managing a pools of * uncacheable memory on platforms that support VM_MEMATTR_UNCACHEABLE. * * Allocation is done using kmem_alloc_attr() with these parameters: * lowaddr = 0 * highaddr = BUS_SPACE_MAXADDR * memattr = VM_MEMATTR_UNCACHEABLE. * * If your platform has no exclusion region (lowaddr/highaddr), and its pmap * routines support pmap_page_set_memattr() and the VM_MEMATTR_UNCACHEABLE flag * you can probably use these when you need uncacheable buffers. */ void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, - uint8_t *pflag, int wait); + int domain, uint8_t *pflag, int wait); void busdma_bufalloc_free_uncacheable(void *item, vm_size_t size, uint8_t pflag); #endif /* _MACHINE_BUSDMA_BUFALLOC_H_ */ Index: projects/numa2/sys/sys/proc.h =================================================================== --- projects/numa2/sys/sys/proc.h (revision 321505) +++ projects/numa2/sys/sys/proc.h (revision 321506) @@ -1,1138 +1,1138 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #include #ifndef _KERNEL #include #endif #include #include #include #include #include #include #include /* XXX. */ #include #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #endif #include #include #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { u_int s_count; /* Ref cnt; pgrps in session - atomic. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct cdev_priv *s_ttydp; /* (m) Device of controlling tty. */ struct tty *s_ttyp; /* (e) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Process group id. */ int pg_jobc; /* (m) Job control process count. */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * kx- only accessed by curthread and by debugger * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * q - td_contested lock * r - p_peers lock * s - see sleepq_switch(), sleeping_on_old_rtc(), and sleep(9) * t - thread lock * u - process stat lock * w - process timer lock * x - created at fork, only changes during single threading in exec * y - created at first aio, doesn't change until exit or exec at which * point we are single-threaded and only curthread changes it * z - zombie threads lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct cpuset; struct filecaps; struct filemon; struct kaioinfo; struct kaudit_record; struct kdtrace_proc; struct kdtrace_thread; struct mqueue_notifier; struct nlminfo; struct p_sched; struct proc; struct procdesc; struct racct; struct sbuf; struct sleepqueue; struct syscall_args; struct td_sched; struct thread; struct trapframe; struct turnstile; /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux. * Locking for td_rux: (t) for all fields. */ struct rusage_ext { uint64_t rux_runtime; /* (cu) Real time. */ uint64_t rux_uticks; /* (cu) Statclock hits in user mode. */ uint64_t rux_sticks; /* (cu) Statclock hits in sys mode. */ uint64_t rux_iticks; /* (cu) Statclock hits in intr mode. */ uint64_t rux_uu; /* (c) Previous user time in usec. */ uint64_t rux_su; /* (c) Previous sys time in usec. */ uint64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ TAILQ_ENTRY(thread) td_runq; /* (t) Run queue. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ struct vm_domain_policy td_vm_dom_policy; /* (c) current numa domain policy */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals u_char td_lend_user_pri; /* (t) Lend user pri. */ /* Cleared during fork1() */ #define td_startzero td_flags int td_flags; /* (t) TDF_* flags. */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ short td_locks; /* (k) Debug: count of non-spin locks */ short td_rw_rlocks; /* (k) Count of rwlock read locks. */ short td_lk_slocks; /* (k) Count of lockmgr shared locks. */ short td_stopsched; /* (k) Scheduler stopped. */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct plimit *td_limit; /* (k) Resource limits. */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ int td_swvoltick; /* (t) Time at last SW_VOL switch. */ int td_swinvoltick; /* (t) Time at last SW_INVOL switch. */ u_int td_cow; /* (*) Number of copy-on-write faults */ struct rusage td_ru; /* (t) rusage information. */ struct rusage_ext td_rux; /* (t) Internal rusage information. */ uint64_t td_incruntime; /* (t) Cpu ticks to transfer to proc. */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. */ int td_intrval; /* (t) Return value for sleepq. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ siginfo_t td_si; /* (c) For debugger or core file */ int td_ng_outbound; /* (k) Thread entered ng from above. */ struct osd td_osd; /* (k) Object specific data. */ struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */ pid_t td_dbg_forked; /* (c) Child pid for debugger. */ u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ int td_no_sleeping; /* (k) Sleeping disabled count. */ - int td_dom_rr_idx; /* (k) RR Numa domain selection. */ + struct vm_domain_iterator td_dom_selector; /* (k) VM domain selector */ void *td_su; /* (k) FFS SU private */ sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ int td_rtcgen; /* (s) rtc_generation of abs. sleep */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ #define td_startcopy td_endzero sigset_t td_sigmask; /* (c) Current signal mask. */ u_char td_rqindex; /* (t) Run queue index. */ u_char td_base_pri; /* (t) Thread base kernel priority. */ u_char td_priority; /* (t) Thread active priority. */ u_char td_pri_class; /* (t) Scheduling class. */ u_char td_user_pri; /* (t) User pri from estcpu and nice. */ u_char td_base_user_pri; /* (t) Base user pri */ uintptr_t td_rb_list; /* (k) Robust list head. */ uintptr_t td_rbp_list; /* (k) Robust priv list head. */ uintptr_t td_rb_inact; /* (k) Current in-action mutex loc. */ struct syscall_args td_sa; /* (kx) Syscall parameters. Copied on fork for child tracing. */ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1() or create_thread() * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* (t) thread state */ union { register_t tdu_retval[2]; off_t tdu_off; } td_uretoff; /* (k) Syscall aux returns. */ #define td_retval td_uretoff.tdu_retval u_int td_cowgen; /* (k) Generation of COW pointers. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ volatile u_int td_critnest; /* (k*) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ int td_errno; /* Error returned by last syscall. */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ int td_ma_cnt; /* (k) size of *td_ma */ void *td_emuldata; /* Emulator state data */ int td_lastcpu; /* (t) Last cpu we were on. */ int td_oncpu; /* (t) Which cpu we are on. */ void *td_lkpi_task; /* LinuxKPI task struct pointer */ }; struct thread0_storage { struct thread t0st_thread; uint64_t t0st_sched[10]; }; struct mtx *thread_lock_block(struct thread *); void thread_lock_unblock(struct thread *, struct mtx *); void thread_lock_set(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) #ifdef INVARIANTS #define THREAD_LOCKPTR_ASSERT(td, lock) \ do { \ struct mtx *__m = (td)->td_lock; \ KASSERT((__m == &blocked_lock || __m == (lock)), \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define TD_LOCKS_INC(td) ((td)->td_locks++) #define TD_LOCKS_DEC(td) ((td)->td_locks--) #else #define THREAD_LOCKPTR_ASSERT(td, lock) #define TD_LOCKS_INC(td) #define TD_LOCKS_DEC(td) #endif /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_CANSWAP 0x00000040 /* Thread can be swapped. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_UNUSED12 0x00001000 /* --available-- */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_NOLOAD 0x00040000 /* Ignore during load avg calculations. */ #define TDF_SERESTART 0x00080000 /* ERESTART on stop attempts. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */ #define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_ALRMPEND 0x10000000 /* Pending SIGVTALRM needs to be posted. */ #define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */ #define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */ /* Userland debug flags */ #define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */ #define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */ #define TDB_USERWR 0x00000004 /* Debugger modified memory or registers */ #define TDB_SCE 0x00000008 /* Thread performs syscall enter */ #define TDB_SCX 0x00000010 /* Thread performs syscall exit */ #define TDB_EXEC 0x00000020 /* TDB_SCX from exec(2) family */ #define TDB_FORK 0x00000040 /* TDB_SCX from fork(2) that created new process */ #define TDB_STOPATFORK 0x00000080 /* Stop at the return from fork (child only) */ #define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */ #define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */ #define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */ #define TDB_VFORK 0x00000800 /* vfork indicator for ptrace() */ #define TDB_FSTP 0x00001000 /* The thread is PT_ATTACH leader */ /* * "Private" flags kept in td_pflags: * These are only written by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_BUFNEED 0x00000008 /* Do not recurse into the buf flush */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock acquisition - deadlock treatment. */ #define TDP_NOFAULTING 0x00000080 /* Do not handle page faults. */ #define TDP_UNUSED9 0x00000100 /* --available-- */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_SYNCIO 0x00000800 /* Local override, disable async i/o. */ #define TDP_SCHED1 0x00001000 /* Reserved for scheduler private use */ #define TDP_SCHED2 0x00002000 /* Reserved for scheduler private use */ #define TDP_SCHED3 0x00004000 /* Reserved for scheduler private use */ #define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */ #define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */ #define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */ #define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */ #define TDP_WAKEUP 0x00080000 /* Don't sleep in umtx cond_wait */ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_KTHREAD 0x00200000 /* This is an official kernel thread */ #define TDP_CALLCHAIN 0x00400000 /* Capture thread's callchain */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ #define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ #define TDP_FORKING 0x20000000 /* Thread is being created through fork() */ #define TDP_EXECVMSPC 0x40000000 /* Execve destroyed old vmspace */ /* * Reasons that the current thread can not be run yet. * More than one may apply. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem. Bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_ON_UPILOCK(td) ((td)->td_flags & TDF_UPIBLOCKED) #define TD_IS_IDLETHREAD(td) ((td)->td_flags & TDF_IDLETD) #define KTDSTATE(td) \ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \ ((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \ ((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \ ((td)->td_inhibitors & TDI_IWAIT) != 0 ? "iwait" : "yielding") #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_SET_RUNNING(td) (td)->td_state = TDS_RUNNING #define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ #define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN #define TD_SBDRY_INTR(td) \ (((td)->td_flags & (TDF_SEINTR | TDF_SERESTART)) != 0) #define TD_SBDRY_ERRNO(td) \ (((td)->td_flags & TDF_SEINTR) != 0 ? EINTR : ERESTART) /* * Process structure. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Resource limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ int p_flag; /* (c) P_* flags. */ int p_flag2; /* (c) P2_* flags. */ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) Process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct proc *p_reaper; /* (e) My reaper. */ LIST_HEAD(, proc) p_reaplist; /* (e) List of my descendants (if I am reaper). */ LIST_ENTRY(proc) p_reapsibling; /* (e) List of siblings - descendants of the same reaper. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct mtx p_statmtx; /* Lock for the stats */ struct mtx p_itimmtx; /* Lock for the virt/prof timers */ struct mtx p_profmtx; /* Lock for the profiling */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. */ #define p_siglist p_sigqueue.sq_signals /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (c) Tick when swapped in or out. */ u_int p_cowgen; /* (c) Generation of COW pointers. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. */ struct rusage_ext p_rux; /* (cu) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. */ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ u_int p_lock; /* (c) Proclock (prevent swap) count. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ u_int p_ptevents; /* (c) ptrace() event mask. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (y) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (j) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */ u_int p_treeflag; /* (e) P_TREE flags */ int p_pendingexits; /* (c) Count of pending thread exits. */ struct filemon *p_filemon; /* (c) filemon-specific data. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ char p_comm[MAXCOMLEN + 1]; /* (x) Process name. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ pid_t p_reapsubtree; /* (e) Pid of the direct child of the reaper which spawned our subtree. */ uint16_t p_elf_machine; /* (x) ELF machine type */ uint64_t p_elf_flags; /* (x) ELF flags */ /* End area that is copied on creation. */ #define p_endcopy p_xexit u_int p_xexit; /* (c) Exit code. */ u_int p_xsig; /* (c) Stop/kill sig. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct knlist *p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (c) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ struct cv p_pwait; /* (*) wait cv for exit/exec. */ struct cv p_dbgwait; /* (*) wait cv for debugger attach after fork. */ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ int p_throttled; /* (c) Flag for racct pcpu throttling */ struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */ /* * An orphan is the child that has beed re-parented to the * debugger as a result of attaching to it. Need to keep * track of them for parent to be able to collect the exit * status of what used to be children. */ LIST_ENTRY(proc) p_orphan; /* (e) List of orphan processes. */ LIST_HEAD(, proc) p_orphans; /* (e) Pointer to list of orphans. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU (-1) /* For when we aren't on a CPU. */ #define NOCPU_OLD (255) #define MAXCPU_OLD (254) #define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) #define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) #define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) #define PROC_STATLOCK(p) mtx_lock_spin(&(p)->p_statmtx) #define PROC_STATUNLOCK(p) mtx_unlock_spin(&(p)->p_statmtx) #define PROC_STATLOCK_ASSERT(p, type) mtx_assert(&(p)->p_statmtx, (type)) #define PROC_ITIMLOCK(p) mtx_lock_spin(&(p)->p_itimmtx) #define PROC_ITIMUNLOCK(p) mtx_unlock_spin(&(p)->p_itimmtx) #define PROC_ITIMLOCK_ASSERT(p, type) mtx_assert(&(p)->p_itimmtx, (type)) #define PROC_PROFLOCK(p) mtx_lock_spin(&(p)->p_profmtx) #define PROC_PROFUNLOCK(p) mtx_unlock_spin(&(p)->p_profmtx) #define PROC_PROFLOCK_ASSERT(p, type) mtx_assert(&(p)->p_profmtx, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KPROC 0x00004 /* Kernel process. */ #define P_UNUSED3 0x00008 /* --available-- */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_STOPPROF 0x00040 /* Has thread requesting to stop profiling. */ #define P_HADTHREADS 0x00080 /* Has had threads (no cleanup shortcuts) */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Someone is waiting for us. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ #define P_WKILLED 0x08000 /* Killed, go to kernel/user boundary ASAP. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */ #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */ #define P_HWPMC 0x800000 /* Process is using HWPMCs */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_TOTAL_STOP 0x2000000 /* Stopped in stop_all_proc. */ #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STATCHILD 0x8000000 /* Child process stopped or exited. */ #define P_INMEM 0x10000000 /* Loaded into memory. */ #define P_SWAPPINGOUT 0x20000000 /* Process is being swapped out. */ #define P_SWAPPINGIN 0x40000000 /* Process is being swapped in. */ #define P_PPTRACE 0x80000000 /* PT_TRACEME by vforked child. */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) #define P_KILLED(p) ((p)->p_flag & P_WKILLED) /* These flags are kept in p_flag2. */ #define P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */ #define P2_NOTRACE 0x00000002 /* No ptrace(2) attach or coredumps. */ #define P2_NOTRACE_EXEC 0x00000004 /* Keep P2_NOPTRACE on exec(2). */ #define P2_AST_SU 0x00000008 /* Handles SU ast for kthreads. */ #define P2_PTRACE_FSTP 0x00000010 /* SIGSTOP from PT_ATTACH not yet handled. */ #define P2_TRAPCAP 0x00000020 /* SIGTRAP on ENOTCAPABLE */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ #define P_TREE_FIRST_ORPHAN 0x00000002 /* First element of orphan list */ #define P_TREE_REAPER 0x00000004 /* Reaper of subtree */ /* * These were process status values (p_stat), now they are only used in * legacy conversion code. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. */ #define P_MAGIC 0xbeefface #ifdef _KERNEL /* Types and flags for mi_switch(). */ #define SW_TYPE_MASK 0xff /* First 8 bits are switch type */ #define SWT_NONE 0 /* Unspecified switch. */ #define SWT_PREEMPT 1 /* Switching due to preemption. */ #define SWT_OWEPREEMPT 2 /* Switching due to owepreempt. */ #define SWT_TURNSTILE 3 /* Turnstile contention. */ #define SWT_SLEEPQ 4 /* Sleepq wait. */ #define SWT_SLEEPQTIMO 5 /* Sleepq timeout wait. */ #define SWT_RELINQUISH 6 /* yield call. */ #define SWT_NEEDRESCHED 7 /* NEEDRESCHED was set. */ #define SWT_IDLE 8 /* Switching from the idle thread. */ #define SWT_IWAIT 9 /* Waiting for interrupts. */ #define SWT_SUSPEND 10 /* Thread suspended. */ #define SWT_REMOTEPREEMPT 11 /* Remote processor preempted. */ #define SWT_REMOTEWAKEIDLE 12 /* Remote processor preempted idle. */ #define SWT_COUNT 13 /* Number of switch types. */ /* Flags */ #define SW_VOL 0x0100 /* Voluntary switch. */ #define SW_INVOL 0x0200 /* Involuntary switch. */ #define SW_PREEMPT 0x0400 /* The invol switch is a preemption */ /* How values for thread_single(). */ #define SINGLE_NO_EXIT 0 #define SINGLE_EXIT 1 #define SINGLE_BOUNDARY 2 #define SINGLE_ALLPROC 3 #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_PGRP); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) /* * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit * in a pid_t, as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 extern pid_t pid_max; #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define STOPEVENT(p, e, v) do { \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) { \ PROC_LOCK(p); \ stopevent((p), (e), (v)); \ PROC_UNLOCK(p); \ } \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) \ stopevent((p), (e), (v)); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* * Non-zero p_lock ensures that: * - exit1() is not performed until p_lock reaches zero; * - the process' threads stack are not swapped out if they are currently * not (P_INMEM). * * PHOLD() asserts that the process (except the current process) is * not exiting, increments p_lock and swaps threads stacks into memory, * if needed. * _PHOLD() is same as PHOLD(), it takes the process locked. * _PHOLD_LITE() also takes the process locked, but comparing with * _PHOLD(), it only guarantees that exit1() is not executed, * faultin() is not called. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ if (((p)->p_flag & P_INMEM) == 0) \ faultin((p)); \ } while (0) #define _PHOLD_LITE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ } while (0) #define PROC_ASSERT_HELD(p) do { \ KASSERT((p)->p_lock > 0, ("process %p not held", p)); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ PROC_ASSERT_HELD(p); \ (--(p)->p_lock); \ if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0) \ wakeup(&(p)->p_lock); \ } while (0) #define PROC_ASSERT_NOT_HELD(p) do { \ KASSERT((p)->p_lock == 0, ("process %p held", p)); \ } while (0) #define PROC_UPDATE_COW(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (p)->p_cowgen++; \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) ((td)->td_flags & TDF_CANSWAP) /* Control whether or not it is safe for curthread to sleep. */ #define THREAD_NO_SLEEPING() ((curthread)->td_no_sleeping++) #define THREAD_SLEEPING_OK() ((curthread)->td_no_sleeping--) #define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) extern LIST_HEAD(tidhashhead, thread) *tidhashtbl; extern u_long tidhash; extern struct rwlock tidhash_lock; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern int allproc_gen; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread0_storage thread0_st; /* Primary thread in proc0. */ #define thread0 (thread0_st.t0st_thread) extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int lastpid; extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct proc *pfind_locked(pid_t pid); struct pgrp *pgfind(pid_t); /* Find process group by id. */ struct proc *zpfind(pid_t); /* Find zombie process by id. */ struct fork_req { int fr_flags; int fr_pages; int *fr_pidp; struct proc **fr_procp; int *fr_pd_fd; int fr_pd_flags; struct filecaps *fr_pd_fcaps; }; /* * pget() flags. */ #define PGET_HOLD 0x00001 /* Hold the process. */ #define PGET_CANSEE 0x00002 /* Check against p_cansee(). */ #define PGET_CANDEBUG 0x00004 /* Check against p_candebug(). */ #define PGET_ISCURRENT 0x00008 /* Check that the found process is current. */ #define PGET_NOTWEXIT 0x00010 /* Check that the process is not in P_WEXIT. */ #define PGET_NOTINEXEC 0x00020 /* Check that the process is not in P_INEXEC. */ #define PGET_NOTID 0x00040 /* Do not assume tid if pid > PID_MAX. */ #define PGET_WANTREAD (PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT) int pget(pid_t pid, int flags, struct proc **pp); void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansee(struct ucred *u1, struct ucred *u2); int cr_canseesocket(struct ucred *cred, struct socket *so); int cr_canseeothergids(struct ucred *u1, struct ucred *u2); int cr_canseeotheruids(struct ucred *u1, struct ucred *u2); int cr_canseejailproc(struct ucred *u1, struct ucred *u2); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); int fork1(struct thread *, struct fork_req *); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void kern_yield(int); void kick_proc0(void); void killjobc(void); int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void maybe_yield(void); void mi_switch(int flags, struct thread *newtd); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); int p_canwait(struct thread *td, struct proc *p); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_hold(struct pargs *pa); int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb); void procinit(void); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); void proc_reap(struct thread *td, struct proc *p, int *status, int options); void proc_reparent(struct proc *child, struct proc *newparent); void proc_set_traced(struct proc *p, bool stop); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); void reaper_abandon_children(struct proc *p, bool exiting); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void sess_hold(struct session *); void sess_release(struct session *); int setrunnable(struct thread *); void setsugid(struct proc *p); int should_yield(void); int sigonstack(size_t sp); void stopevent(struct proc *, u_int, u_int); struct thread *tdfind(lwpid_t, pid_t); void threadinit(void); void tidhash_add(struct thread *); void tidhash_remove(struct thread *); void cpu_idle(int); int cpu_idle_wakeup(int); extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. */ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int, int) __dead2; void cpu_copy_thread(struct thread *td, struct thread *td0); int cpu_fetch_syscall_args(struct thread *td); void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_fork_kthread_handler(struct thread *, void (*)(void *), void *); void cpu_set_syscall_retval(struct thread *, int); void cpu_set_upcall(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base); void cpu_thread_alloc(struct thread *); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_free(struct thread *); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(int pages); int thread_alloc_stack(struct thread *, int pages); void thread_cow_get_proc(struct thread *newtd, struct proc *p); void thread_cow_get(struct thread *newtd, struct thread *td); void thread_cow_free(struct thread *td); void thread_cow_update(struct thread *td); int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk); void thread_exit(void) __dead2; void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); void thread_reap(void); int thread_single(struct proc *p, int how); void thread_single_end(struct proc *p, int how); void thread_stash(struct thread *td); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); bool thread_suspend_check_needed(void); void thread_suspend_switch(struct thread *, struct proc *p); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_wait(struct proc *p); struct thread *thread_find(struct proc *p, lwpid_t tid); void stop_all_proc(void); void resume_all_proc(void); static __inline int curthread_pflags_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags & flags); td->td_pflags |= flags; return (save); } static __inline void curthread_pflags_restore(int save) { curthread->td_pflags &= save; } static __inline __pure2 struct td_sched * td_get_sched(struct thread *td) { return ((struct td_sched *)&td[1]); } extern void (*softdep_ast_cleanup)(struct thread *); static __inline void td_softdep_cleanup(struct thread *td) { if (td->td_su != NULL && softdep_ast_cleanup != NULL) softdep_ast_cleanup(td); } #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ Index: projects/numa2/sys/vm/uma.h =================================================================== --- projects/numa2/sys/vm/uma.h (revision 321505) +++ projects/numa2/sys/vm/uma.h (revision 321506) @@ -1,694 +1,703 @@ /*- * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ /* * uma.h - External definitions for the Universal Memory Allocator * */ #ifndef _VM_UMA_H_ #define _VM_UMA_H_ #include /* For NULL */ #include /* For M_* */ /* User visible parameters */ #define UMA_SMALLEST_UNIT (PAGE_SIZE / 256) /* Smallest item allocated */ /* Types and type defs */ struct uma_zone; +struct vm_domain_iterator; /* Opaque type used as a handle to the zone */ typedef struct uma_zone * uma_zone_t; void zone_drain(uma_zone_t); /* * Item constructor * * Arguments: * item A pointer to the memory which has been allocated. * arg The arg field passed to uma_zalloc_arg * size The size of the allocated item * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The constructor is called just before the memory is returned * to the user. It may block if necessary. */ typedef int (*uma_ctor)(void *mem, int size, void *arg, int flags); /* * Item destructor * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being destructed. * arg Argument passed through uma_zfree_arg * * Returns: * Nothing * * Discussion: * The destructor may perform operations that differ from those performed * by the initializer, but it must leave the object in the same state. * This IS type stable storage. This is called after EVERY zfree call. */ typedef void (*uma_dtor)(void *mem, int size, void *arg); /* * Item initializer * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being initialized. * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The initializer is called when the memory is cached in the uma zone. * The initializer and the destructor should leave the object in the same * state. */ typedef int (*uma_init)(void *mem, int size, int flags); /* * Item discard function * * Arguments: * item A pointer to memory which has been 'freed' but has not left the * zone's cache. * size The size of the item being discarded. * * Returns: * Nothing * * Discussion: * This routine is called when memory leaves a zone and is returned to the * system for other uses. It is the counter-part to the init function. */ typedef void (*uma_fini)(void *mem, int size); /* * Import new memory into a cache zone. */ -typedef int (*uma_import)(void *arg, void **store, int count, int flags); +typedef int (*uma_import)(void *arg, void **store, int count, int domain, + int flags); /* * Free memory from a cache zone. */ typedef void (*uma_release)(void *arg, void **store, int count); /* * What's the difference between initializing and constructing? * * The item is initialized when it is cached, and this is the state that the * object should be in when returned to the allocator. The purpose of this is * to remove some code which would otherwise be called on each allocation by * utilizing a known, stable state. This differs from the constructor which * will be called on EVERY allocation. * * For example, in the initializer you may want to initialize embedded locks, * NULL list pointers, set up initial states, magic numbers, etc. This way if * the object is held in the allocator and re-used it won't be necessary to * re-initialize it. * * The constructor may be used to lock a data structure, link it on to lists, * bump reference counts or total counts of outstanding structures, etc. * */ /* Function proto types */ /* * Create a new uma zone * * Arguments: * name The text name of the zone for debugging and stats. This memory * should not be freed until the zone has been deallocated. * size The size of the object that is being created. * ctor The constructor that is called when the object is allocated. * dtor The destructor that is called when the object is freed. * init An initializer that sets up the initial state of the memory. * fini A discard function that undoes initialization done by init. * ctor/dtor/init/fini may all be null, see notes above. * align A bitmask that corresponds to the requested alignment * eg 4 would be 0x3 * flags A set of parameters that control the behavior of the zone. * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, uint32_t flags); /* * Create a secondary uma zone * * Arguments: * name The text name of the zone for debugging and stats. This memory * should not be freed until the zone has been deallocated. * ctor The constructor that is called when the object is allocated. * dtor The destructor that is called when the object is freed. * zinit An initializer that sets up the initial state of the memory * as the object passes from the Keg's slab to the Zone's cache. * zfini A discard function that undoes initialization done by init * as the object passes from the Zone's cache to the Keg's slab. * * ctor/dtor/zinit/zfini may all be null, see notes above. * Note that the zinit and zfini specified here are NOT * exactly the same as the init/fini specified to uma_zcreate() * when creating a master zone. These zinit/zfini are called * on the TRANSITION from keg to zone (and vice-versa). Once * these are set, the primary zone may alter its init/fini * (which are called when the object passes from VM to keg) * using uma_zone_set_init/fini()) as well as its own * zinit/zfini (unset by default for master zone) with * uma_zone_set_zinit/zfini() (note subtle 'z' prefix). * * master A reference to this zone's Master Zone (Primary Zone), * which contains the backing Keg for the Secondary Zone * being added. * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t master); /* * Add a second master to a secondary zone. This provides multiple data * backends for objects with the same size. Both masters must have * compatible allocation flags. Presently, UMA_ZONE_MALLOC type zones are * the only supported. * * Returns: * Error on failure, 0 on success. */ int uma_zsecond_add(uma_zone_t zone, uma_zone_t master); /* * Create cache-only zones. * * This allows uma's per-cpu cache facilities to handle arbitrary * pointers. Consumers must specify the import and release functions to * fill and destroy caches. UMA does not allocate any memory for these * zones. The 'arg' parameter is passed to import/release and is caller * specific. */ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease, void *arg, int flags); /* * Definitions for uma_zcreate flags * * These flags share space with UMA_ZFLAGs in uma_int.h. Be careful not to * overlap when adding new features. 0xff000000 is in use by uma_int.h. */ #define UMA_ZONE_PAGEABLE 0x0001 /* Return items not fully backed by physical memory XXX Not yet */ #define UMA_ZONE_ZINIT 0x0002 /* Initialize with zeros */ #define UMA_ZONE_STATIC 0x0004 /* Statically sized zone */ #define UMA_ZONE_OFFPAGE 0x0008 /* Force the slab structure allocation off of the real memory */ #define UMA_ZONE_MALLOC 0x0010 /* For use by malloc(9) only! */ #define UMA_ZONE_NOFREE 0x0020 /* Do not free slabs of this type! */ #define UMA_ZONE_MTXCLASS 0x0040 /* Create a new lock class */ #define UMA_ZONE_VM 0x0080 /* * Used for internal vm datastructures * only. */ #define UMA_ZONE_HASH 0x0100 /* * Use a hash table instead of caching * information in the vm_page. */ #define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */ /* 0x0400 Unused */ #define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets */ #define UMA_ZONE_CACHESPREAD 0x1000 /* * Spread memory start locations across * all possible cache lines. May * require many virtually contiguous * backend pages and can fail early. */ #define UMA_ZONE_VTOSLAB 0x2000 /* Zone uses vtoslab for lookup. */ #define UMA_ZONE_NODUMP 0x4000 /* * Zone's pages will not be included in * mini-dumps. */ #define UMA_ZONE_PCPU 0x8000 /* * Allocates mp_maxid + 1 slabs sized to * sizeof(struct pcpu). */ /* * These flags are shared between the keg and zone. In zones wishing to add * new kegs these flags must be compatible. Some are determined based on * physical parameters of the request and may not be provided by the consumer. */ #define UMA_ZONE_INHERIT \ (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \ UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU) /* Definitions for align */ #define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */ #define UMA_ALIGN_LONG (sizeof(long) - 1) /* "" long */ #define UMA_ALIGN_INT (sizeof(int) - 1) /* "" int */ #define UMA_ALIGN_SHORT (sizeof(short) - 1) /* "" short */ #define UMA_ALIGN_CHAR (sizeof(char) - 1) /* "" char */ #define UMA_ALIGN_CACHE (0 - 1) /* Cache line size align */ /* * Destroys an empty uma zone. If the zone is not empty uma complains loudly. * * Arguments: * zone The zone we want to destroy. * */ void uma_zdestroy(uma_zone_t zone); /* * Allocates an item out of a zone * * Arguments: * zone The zone we are allocating from * arg This data is passed to the ctor function * flags See sys/malloc.h for available flags. * * Returns: * A non-null pointer to an initialized element from the zone is * guaranteed if the wait flag is M_WAITOK. Otherwise a null pointer * may be returned if the zone is empty or the ctor failed. */ void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags); /* * Allocates an item out of a zone without supplying an argument * * This is just a wrapper for uma_zalloc_arg for convenience. * */ static __inline void *uma_zalloc(uma_zone_t zone, int flags); static __inline void * uma_zalloc(uma_zone_t zone, int flags) { return uma_zalloc_arg(zone, NULL, flags); } /* * Frees an item back into the specified zone. * * Arguments: * zone The zone the item was originally allocated out of. * item The memory to be freed. * arg Argument passed to the destructor * * Returns: * Nothing. */ void uma_zfree_arg(uma_zone_t zone, void *item, void *arg); /* * Frees an item back to a zone without supplying an argument * * This is just a wrapper for uma_zfree_arg for convenience. * */ static __inline void uma_zfree(uma_zone_t zone, void *item); static __inline void uma_zfree(uma_zone_t zone, void *item) { uma_zfree_arg(zone, item, NULL); } /* - * XXX The rest of the prototypes in this header are h0h0 magic for the VM. - * If you think you need to use it for a normal zone you're probably incorrect. - */ - -/* * Backend page supplier routines * * Arguments: * zone The zone that is requesting pages. * size The number of bytes being requested. * pflag Flags for these memory pages, see below. + * domain The NUMA domain that we prefer for this allocation. * wait Indicates our willingness to block. * * Returns: * A pointer to the allocated memory or NULL on failure. */ -typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag, - int wait); +typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain, + uint8_t *pflag, int wait); /* * Backend page free routines * * Arguments: * item A pointer to the previously allocated pages. * size The original size of the allocation. * pflag The flags for the slab. See UMA_SLAB_* below. * * Returns: * None */ typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag); - - /* * Sets up the uma allocator. (Called by vm_mem_init) * * Arguments: * bootmem A pointer to memory used to bootstrap the system. * * Returns: * Nothing * * Discussion: * This memory is used for zones which allocate things before the * backend page supplier can give us pages. It should be * UMA_SLAB_SIZE * boot_pages bytes. (see uma_int.h) * */ void uma_startup(void *bootmem, int boot_pages); /* * Finishes starting up the allocator. This should * be called when kva is ready for normal allocs. * * Arguments: * None * * Returns: * Nothing * * Discussion: * uma_startup2 is called by kmeminit() to enable us of uma for malloc. */ void uma_startup2(void); /* * Reclaims unused memory for all zones * * Arguments: * None * Returns: * None * * This should only be called by the page out daemon. */ void uma_reclaim(void); /* * Sets the alignment mask to be used for all zones requesting cache * alignment. Should be called by MD boot code prior to starting VM/UMA. * * Arguments: * align The alignment mask * * Returns: * Nothing */ void uma_set_align(int align); /* * Set a reserved number of items to hold for M_USE_RESERVE allocations. All * other requests must allocate new backing pages. */ void uma_zone_reserve(uma_zone_t zone, int nitems); /* * Reserves the maximum KVA space required by the zone and configures the zone * to use a VM_ALLOC_NOOBJ-based backend allocator. * * Arguments: * zone The zone to update. * nitems The upper limit on the number of items that can be allocated. * * Returns: * 0 if KVA space can not be allocated * 1 if successful * * Discussion: * When the machine supports a direct map and the zone's items are smaller * than a page, the zone will use the direct map instead of allocating KVA * space. */ int uma_zone_reserve_kva(uma_zone_t zone, int nitems); /* * Sets a high limit on the number of items allowed in a zone * * Arguments: * zone The zone to limit * nitems The requested upper limit on the number of items allowed * * Returns: * int The effective value of nitems after rounding up based on page size */ int uma_zone_set_max(uma_zone_t zone, int nitems); /* * Obtains the effective limit on the number of items in a zone * * Arguments: * zone The zone to obtain the effective limit from * * Return: * 0 No limit * int The effective limit of the zone */ int uma_zone_get_max(uma_zone_t zone); /* * Sets a warning to be printed when limit is reached * * Arguments: * zone The zone we will warn about * warning Warning content * * Returns: * Nothing */ void uma_zone_set_warning(uma_zone_t zone, const char *warning); /* * Sets a function to run when limit is reached * * Arguments: * zone The zone to which this applies * fx The function ro run * * Returns: * Nothing */ typedef void (*uma_maxaction_t)(uma_zone_t, int); void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t); /* * Obtains the approximate current number of items allocated from a zone * * Arguments: * zone The zone to obtain the current allocation count from * * Return: * int The approximate current number of items allocated from the zone */ int uma_zone_get_cur(uma_zone_t zone); /* * The following two routines (uma_zone_set_init/fini) * are used to set the backend init/fini pair which acts on an * object as it becomes allocated and is placed in a slab within * the specified zone's backing keg. These should probably not * be changed once allocations have already begun, but only be set * immediately upon zone creation. */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit); void uma_zone_set_fini(uma_zone_t zone, uma_fini fini); /* * The following two routines (uma_zone_set_zinit/zfini) are * used to set the zinit/zfini pair which acts on an object as * it passes from the backing Keg's slab cache to the * specified Zone's bucket cache. These should probably not * be changed once allocations have already begun, but only be set * immediately upon zone creation. */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit); void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini); /* * Replaces the standard backend allocator for this zone. * * Arguments: * zone The zone whose backend allocator is being changed. * allocf A pointer to the allocation function * * Returns: * Nothing * * Discussion: * This could be used to implement pageable allocation, or perhaps * even DMA allocators if used in conjunction with the OFFPAGE * zone flag. */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf); /* * Used for freeing memory provided by the allocf above * * Arguments: * zone The zone that intends to use this free routine. * freef The page freeing routine. * * Returns: * Nothing */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef); + +/* + * XXX + * + * Arguments: + * zone The zone NUMA policy is being installed into. + * sel Selector of the NUMA policy requested. + * + * Returns: + * Nothing + */ +void uma_zone_set_domain_selector(uma_zone_t zone, + struct vm_domain_iterator *sel); /* * These flags are setable in the allocf and visible in the freef. */ #define UMA_SLAB_BOOT 0x01 /* Slab alloced from boot pages */ #define UMA_SLAB_KMEM 0x02 /* Slab alloced from kmem_map */ #define UMA_SLAB_KERNEL 0x04 /* Slab alloced from kernel_map */ #define UMA_SLAB_PRIV 0x08 /* Slab alloced from priv allocator */ #define UMA_SLAB_OFFP 0x10 /* Slab is managed separately */ #define UMA_SLAB_MALLOC 0x20 /* Slab is a large malloc slab */ /* 0x40 and 0x80 are available */ /* * Used to pre-fill a zone with some number of items * * Arguments: * zone The zone to fill * itemcnt The number of items to reserve * * Returns: * Nothing * * NOTE: This is blocking and should only be done at startup */ void uma_prealloc(uma_zone_t zone, int itemcnt); /* * Used to determine if a fixed-size zone is exhausted. * * Arguments: * zone The zone to check * * Returns: * Non-zero if zone is exhausted. */ int uma_zone_exhausted(uma_zone_t zone); int uma_zone_exhausted_nolock(uma_zone_t zone); /* * Common UMA_ZONE_PCPU zones. */ extern uma_zone_t pcpu_zone_64; extern uma_zone_t pcpu_zone_ptr; /* * Exported statistics structures to be used by user space monitoring tools. * Statistics stream consists of a uma_stream_header, followed by a series of * alternative uma_type_header and uma_type_stat structures. */ #define UMA_STREAM_VERSION 0x00000001 struct uma_stream_header { uint32_t ush_version; /* Stream format version. */ uint32_t ush_maxcpus; /* Value of MAXCPU for stream. */ uint32_t ush_count; /* Number of records. */ uint32_t _ush_pad; /* Pad/reserved field. */ }; #define UTH_MAX_NAME 32 #define UTH_ZONE_SECONDARY 0x00000001 struct uma_type_header { /* * Static per-zone data, some extracted from the supporting keg. */ char uth_name[UTH_MAX_NAME]; uint32_t uth_align; /* Keg: alignment. */ uint32_t uth_size; /* Keg: requested size of item. */ uint32_t uth_rsize; /* Keg: real size of item. */ uint32_t uth_maxpages; /* Keg: maximum number of pages. */ uint32_t uth_limit; /* Keg: max items to allocate. */ /* * Current dynamic zone/keg-derived statistics. */ uint32_t uth_pages; /* Keg: pages allocated. */ uint32_t uth_keg_free; /* Keg: items free. */ uint32_t uth_zone_free; /* Zone: items free. */ uint32_t uth_bucketsize; /* Zone: desired bucket size. */ uint32_t uth_zone_flags; /* Zone: flags. */ uint64_t uth_allocs; /* Zone: number of allocations. */ uint64_t uth_frees; /* Zone: number of frees. */ uint64_t uth_fails; /* Zone: number of alloc failures. */ uint64_t uth_sleeps; /* Zone: number of alloc sleeps. */ uint64_t _uth_reserved1[2]; /* Reserved. */ }; struct uma_percpu_stat { uint64_t ups_allocs; /* Cache: number of allocations. */ uint64_t ups_frees; /* Cache: number of frees. */ uint64_t ups_cache_free; /* Cache: free items in cache. */ uint64_t _ups_reserved[5]; /* Reserved. */ }; void uma_reclaim_wakeup(void); void uma_reclaim_worker(void *); #endif /* _VM_UMA_H_ */ Index: projects/numa2/sys/vm/uma_core.c =================================================================== --- projects/numa2/sys/vm/uma_core.c (revision 321505) +++ projects/numa2/sys/vm/uma_core.c (revision 321506) @@ -1,3619 +1,3764 @@ /*- * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * Copyright (c) 2004-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * uma_core.c Implementation of the Universal Memory allocator * * This allocator is intended to replace the multitude of similar object caches * in the standard FreeBSD kernel. The intent is to be flexible as well as * efficient. A primary design goal is to return unused memory to the rest of * the system. This will make the system as a whole more flexible due to the * ability to move memory to subsystems which most need it instead of leaving * pools of reserved memory unused. * * The basic ideas stem from similar slab/zone based allocators whose algorithms * are well known. * */ /* * TODO: * - Improve memory usage for large allocations * - Investigate cache size adjustments */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_param.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include +#include #include #include #include #include #include #include #include #ifdef DEBUG_MEMGUARD #include #endif /* * This is the zone and keg from which all zones are spawned. The idea is that * even the zone & keg heads are allocated from the allocator, so we use the * bss section to bootstrap us. */ static struct uma_keg masterkeg; static struct uma_zone masterzone_k; static struct uma_zone masterzone_z; static uma_zone_t kegs = &masterzone_k; static uma_zone_t zones = &masterzone_z; /* This is the zone from which all of uma_slab_t's are allocated. */ static uma_zone_t slabzone; /* * The initial hash tables come out of this zone so they can be allocated * prior to malloc coming up. */ static uma_zone_t hashzone; /* The boot-time adjusted value for cache line alignment. */ int uma_align_cache = 64 - 1; static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); /* * Are we allowed to allocate buckets? */ static int bucketdisable = 1; /* Linked list of all kegs in the system */ static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs); /* Linked list of all cache-only zones in the system */ static LIST_HEAD(,uma_zone) uma_cachezones = LIST_HEAD_INITIALIZER(uma_cachezones); /* This RW lock protects the keg list */ static struct rwlock_padalign uma_rwlock; /* * Pointer and counter to pool of pages, that is preallocated at * startup to bootstrap UMA. Early zones continue to use the pool * until it is depleted, so allocations may happen after boot, thus * we need a mutex to protect it. */ static char *bootmem; static int boot_pages; static struct mtx uma_boot_pages_mtx; static struct sx uma_drain_lock; /* Is the VM done starting up? */ static int booted = 0; #define UMA_STARTUP 1 #define UMA_STARTUP2 2 /* * This is the handle used to schedule events that need to happen * outside of the allocation fast path. */ static struct callout uma_callout; #define UMA_TIMEOUT 20 /* Seconds for callout interval. */ /* * This structure is passed as the zone ctor arg so that I don't have to create * a special allocation function just for zones. */ struct uma_zctor_args { const char *name; size_t size; uma_ctor ctor; uma_dtor dtor; uma_init uminit; uma_fini fini; uma_import import; uma_release release; void *arg; uma_keg_t keg; int align; uint32_t flags; }; struct uma_kctor_args { uma_zone_t zone; size_t size; uma_init uminit; uma_fini fini; int align; uint32_t flags; }; struct uma_bucket_zone { uma_zone_t ubz_zone; char *ubz_name; int ubz_entries; /* Number of items it can hold. */ int ubz_maxsize; /* Maximum allocation size per-item. */ }; /* * Compute the actual number of bucket entries to pack them in power * of two sizes for more efficient space utilization. */ #define BUCKET_SIZE(n) \ (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *)) #define BUCKET_MAX BUCKET_SIZE(256) struct uma_bucket_zone bucket_zones[] = { { NULL, "4 Bucket", BUCKET_SIZE(4), 4096 }, { NULL, "6 Bucket", BUCKET_SIZE(6), 3072 }, { NULL, "8 Bucket", BUCKET_SIZE(8), 2048 }, { NULL, "12 Bucket", BUCKET_SIZE(12), 1536 }, { NULL, "16 Bucket", BUCKET_SIZE(16), 1024 }, { NULL, "32 Bucket", BUCKET_SIZE(32), 512 }, { NULL, "64 Bucket", BUCKET_SIZE(64), 256 }, { NULL, "128 Bucket", BUCKET_SIZE(128), 128 }, { NULL, "256 Bucket", BUCKET_SIZE(256), 64 }, { NULL, NULL, 0} }; /* * Flags and enumerations to be passed to internal functions. */ enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI }; /* Prototypes.. */ -static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int); -static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int); -static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); +static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); +static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void page_free(void *, vm_size_t, uint8_t); -static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int); +static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_drain(uma_zone_t zone); static int keg_ctor(void *, int, void *, int); static void keg_dtor(void *, int, void *); static int zone_ctor(void *, int, void *, int); static void zone_dtor(void *, int, void *); static int zero_init(void *, int, int); static void keg_small_init(uma_keg_t keg); static void keg_large_init(uma_keg_t keg); static void zone_foreach(void (*zfunc)(uma_zone_t)); static void zone_timeout(uma_zone_t zone); static int hash_alloc(struct uma_hash *); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_startup3(void); -static void *zone_alloc_item(uma_zone_t, void *, int); +static void *zone_alloc_item(uma_zone_t, void *, int, int); static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); static void bucket_zone_drain(void); -static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags); -static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags); -static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags); +static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); +static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int); +static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int); static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item); static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags); -static int zone_import(uma_zone_t zone, void **bucket, int max, int flags); -static void zone_release(uma_zone_t zone, void **bucket, int cnt); -static void uma_zero_item(void *item, uma_zone_t zone); +static int zone_import(uma_zone_t, void **, int, int, int); +static void zone_release(uma_zone_t, void **, int); +static void uma_zero_item(void *, uma_zone_t); void uma_print_zone(uma_zone_t); void uma_print_stats(void); static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); #ifdef INVARIANTS static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item); static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item); #endif SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT, 0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones"); SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats"); static int zone_warnings = 1; SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0, "Warn when UMA zones becomes full"); /* * This routine checks to see whether or not it's safe to enable buckets. */ static void bucket_enable(void) { bucketdisable = vm_page_count_min(); } /* * Initialize bucket_zones, the array of zones of buckets of various sizes. * * For each zone, calculate the memory required for each bucket, consisting * of the header and an array of pointers. */ +static struct vm_domain_policy bucket_policy = + VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH, 0); +static struct vm_domain_iterator bucket_iterator; + static void bucket_init(void) { struct uma_bucket_zone *ubz; int size; + vm_domain_iterator_set_policy(&bucket_iterator, &bucket_policy); + for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) { size = roundup(sizeof(struct uma_bucket), sizeof(void *)); size += sizeof(void *) * ubz->ubz_entries; ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET); + uma_zone_set_domain_selector(ubz->ubz_zone, &bucket_iterator); } } /* * Given a desired number of entries for a bucket, return the zone from which * to allocate the bucket. */ static struct uma_bucket_zone * bucket_zone_lookup(int entries) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_entries >= entries) return (ubz); ubz--; return (ubz); } static int bucket_select(int size) { struct uma_bucket_zone *ubz; ubz = &bucket_zones[0]; if (size > ubz->ubz_maxsize) return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1); for (; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_maxsize < size) break; ubz--; return (ubz->ubz_entries); } static uma_bucket_t bucket_alloc(uma_zone_t zone, void *udata, int flags) { struct uma_bucket_zone *ubz; uma_bucket_t bucket; /* * This is to stop us from allocating per cpu buckets while we're * running out of vm.boot_pages. Otherwise, we would exhaust the * boot pages. This also prevents us from allocating buckets in * low memory situations. */ if (bucketdisable) return (NULL); /* * To limit bucket recursion we store the original zone flags * in a cookie passed via zalloc_arg/zfree_arg. This allows the * NOVM flag to persist even through deep recursions. We also * store ZFLAG_BUCKET once we have recursed attempting to allocate * a bucket for a bucket zone so we do not allow infinite bucket * recursion. This cookie will even persist to frees of unused * buckets via the allocation path or bucket allocations in the * free path. */ if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; else { if ((uintptr_t)udata & UMA_ZFLAG_BUCKET) return (NULL); udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET); } if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY) flags |= M_NOVM; ubz = bucket_zone_lookup(zone->uz_count); if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0) ubz++; bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags); if (bucket) { #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); #endif bucket->ub_cnt = 0; bucket->ub_entries = ubz->ubz_entries; } return (bucket); } static void bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata) { struct uma_bucket_zone *ubz; KASSERT(bucket->ub_cnt == 0, ("bucket_free: Freeing a non free bucket.")); if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; ubz = bucket_zone_lookup(bucket->ub_entries); uma_zfree_arg(ubz->ubz_zone, bucket, udata); } static void bucket_zone_drain(void) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) zone_drain(ubz->ubz_zone); } static void zone_log_warning(uma_zone_t zone) { static const struct timeval warninterval = { 300, 0 }; if (!zone_warnings || zone->uz_warning == NULL) return; if (ratecheck(&zone->uz_ratecheck, &warninterval)) printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); } static inline void zone_maxaction(uma_zone_t zone) { if (zone->uz_maxaction.ta_func != NULL) taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); } static void zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t)) { uma_klink_t klink; LIST_FOREACH(klink, &zone->uz_kegs, kl_link) kegfn(klink->kl_keg); } /* * Routine called by timeout which is used to fire off some time interval * based calculations. (stats, hash size, etc.) * * Arguments: * arg Unused * * Returns: * Nothing */ static void uma_timeout(void *unused) { bucket_enable(); zone_foreach(zone_timeout); /* Reschedule this event */ callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); } /* * Routine to perform timeout driven calculations. This expands the * hashes and does per cpu statistics aggregation. * * Returns nothing. */ static void keg_timeout(uma_keg_t keg) { KEG_LOCK(keg); /* * Expand the keg hash table. * * This is done if the number of slabs is larger than the hash size. * What I'm trying to do here is completely reduce collisions. This * may be a little aggressive. Should I allow for two collisions max? */ if (keg->uk_flags & UMA_ZONE_HASH && keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; /* * This is so involved because allocating and freeing * while the keg lock is held will lead to deadlock. * I have to do everything in stages and check for * races. */ newhash = keg->uk_hash; KEG_UNLOCK(keg); ret = hash_alloc(&newhash); KEG_LOCK(keg); if (ret) { if (hash_expand(&keg->uk_hash, &newhash)) { oldhash = keg->uk_hash; keg->uk_hash = newhash; } else oldhash = newhash; KEG_UNLOCK(keg); hash_free(&oldhash); return; } } KEG_UNLOCK(keg); } static void zone_timeout(uma_zone_t zone) { zone_foreach_keg(zone, &keg_timeout); } /* * Allocate and zero fill the next sized hash table from the appropriate * backing store. * * Arguments: * hash A new hash structure with the old hash size in uh_hashsize * * Returns: * 1 on success and 0 on failure. */ static int hash_alloc(struct uma_hash *hash) { int oldsize; int alloc; oldsize = hash->uh_hashsize; /* We're just going to go to a power of two greater */ if (oldsize) { hash->uh_hashsize = oldsize * 2; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; hash->uh_slab_hash = (struct slabhead *)malloc(alloc, M_UMAHASH, M_NOWAIT); } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, - M_WAITOK); + UMA_ANYDOMAIN, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { bzero(hash->uh_slab_hash, alloc); hash->uh_hashmask = hash->uh_hashsize - 1; return (1); } return (0); } /* * Expands the hash table for HASH zones. This is done from zone_timeout * to reduce collisions. This must not be done in the regular allocation * path, otherwise, we can recurse on the vm while allocating pages. * * Arguments: * oldhash The hash you want to expand * newhash The hash structure for the new table * * Returns: * Nothing * * Discussion: */ static int hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_slab_t slab; int hval; int i; if (!newhash->uh_slab_hash) return (0); if (oldhash->uh_hashsize >= newhash->uh_hashsize) return (0); /* * I need to investigate hash algorithms for resizing without a * full rehash. */ for (i = 0; i < oldhash->uh_hashsize; i++) while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) { slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]); SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink); hval = UMA_HASH(newhash, slab->us_data); SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, us_hlink); } return (1); } /* * Free the hash bucket to the appropriate backing store. * * Arguments: * slab_hash The hash bucket we're freeing * hashsize The number of entries in that hash bucket * * Returns: * Nothing */ static void hash_free(struct uma_hash *hash) { if (hash->uh_slab_hash == NULL) return; if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE); else free(hash->uh_slab_hash, M_UMAHASH); } /* * Frees all outstanding items in a bucket * * Arguments: * zone The zone to free to, must be unlocked. * bucket The free/alloc bucket with items, cpu queue must be locked. * * Returns: * Nothing */ static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { int i; if (bucket == NULL) return; if (zone->uz_fini) for (i = 0; i < bucket->ub_cnt; i++) zone->uz_fini(bucket->ub_bucket[i], zone->uz_size); zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt); bucket->ub_cnt = 0; } /* * Drains the per cpu caches for a zone. * * NOTE: This may only be called while the zone is being turn down, and not * during normal operation. This is necessary in order that we do not have * to migrate CPUs to drain the per-CPU caches. * * Arguments: * zone The zone to drain, must be unlocked. * * Returns: * Nothing */ static void cache_drain(uma_zone_t zone) { uma_cache_t cache; int cpu; /* * XXX: It is safe to not lock the per-CPU caches, because we're * tearing down the zone anyway. I.e., there will be no further use * of the caches at this point. * * XXX: It would good to be able to assert that the zone is being * torn down to prevent improper use of cache_drain(). * * XXX: We lock the zone before passing into bucket_cache_drain() as * it is used elsewhere. Should the tear-down path be made special * there in some form? */ CPU_FOREACH(cpu) { cache = &zone->uz_cpu[cpu]; bucket_drain(zone, cache->uc_allocbucket); bucket_drain(zone, cache->uc_freebucket); if (cache->uc_allocbucket != NULL) bucket_free(zone, cache->uc_allocbucket, NULL); if (cache->uc_freebucket != NULL) bucket_free(zone, cache->uc_freebucket, NULL); cache->uc_allocbucket = cache->uc_freebucket = NULL; } ZONE_LOCK(zone); bucket_cache_drain(zone); ZONE_UNLOCK(zone); } static void cache_shrink(uma_zone_t zone) { if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; ZONE_LOCK(zone); zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2; ZONE_UNLOCK(zone); } static void cache_drain_safe_cpu(uma_zone_t zone) { uma_cache_t cache; uma_bucket_t b1, b2; + int domain; if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; b1 = b2 = NULL; ZONE_LOCK(zone); critical_enter(); + if (zone->uz_sel == NULL) + domain = 0; + else + domain = vm_domain_select_first(zone->uz_sel); cache = &zone->uz_cpu[curcpu]; if (cache->uc_allocbucket) { if (cache->uc_allocbucket->ub_cnt != 0) - LIST_INSERT_HEAD(&zone->uz_buckets, + LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets, cache->uc_allocbucket, ub_link); else b1 = cache->uc_allocbucket; cache->uc_allocbucket = NULL; } if (cache->uc_freebucket) { if (cache->uc_freebucket->ub_cnt != 0) - LIST_INSERT_HEAD(&zone->uz_buckets, + LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets, cache->uc_freebucket, ub_link); else b2 = cache->uc_freebucket; cache->uc_freebucket = NULL; } critical_exit(); ZONE_UNLOCK(zone); if (b1) bucket_free(zone, b1, NULL); if (b2) bucket_free(zone, b2, NULL); } /* * Safely drain per-CPU caches of a zone(s) to alloc bucket. * This is an expensive call because it needs to bind to all CPUs * one by one and enter a critical section on each of them in order * to safely access their cache buckets. * Zone lock must not be held on call this function. */ static void cache_drain_safe(uma_zone_t zone) { int cpu; /* * Polite bucket sizes shrinking was not enouth, shrink aggressively. */ if (zone) cache_shrink(zone); else zone_foreach(cache_shrink); CPU_FOREACH(cpu) { thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); if (zone) cache_drain_safe_cpu(zone); else zone_foreach(cache_drain_safe_cpu); } thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } /* * Drain the cached buckets from a zone. Expects a locked zone on entry. */ static void bucket_cache_drain(uma_zone_t zone) { + uma_zone_domain_t zdom; uma_bucket_t bucket; + int i; /* - * Drain the bucket queues and free the buckets, we just keep two per - * cpu (alloc/free). + * Drain the bucket queues and free the buckets. */ - while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) { - LIST_REMOVE(bucket, ub_link); - ZONE_UNLOCK(zone); - bucket_drain(zone, bucket); - bucket_free(zone, bucket, NULL); - ZONE_LOCK(zone); + for (i = 0; i < vm_ndomains; i++) { + zdom = &zone->uz_domain[i]; + while ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) { + LIST_REMOVE(bucket, ub_link); + ZONE_UNLOCK(zone); + bucket_drain(zone, bucket); + bucket_free(zone, bucket, NULL); + ZONE_LOCK(zone); + } } /* * Shrink further bucket sizes. Price of single zone lock collision * is probably lower then price of global cache drain. */ if (zone->uz_count > zone->uz_count_min) zone->uz_count--; } static void keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start) { uint8_t *mem; int i; uint8_t flags; CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes", keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera); mem = slab->us_data; flags = slab->us_flags; i = start; if (keg->uk_fini != NULL) { for (i--; i > -1; i--) keg->uk_fini(slab->us_data + (keg->uk_rsize * i), keg->uk_size); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags); } /* * Frees pages from a keg back to the system. This is done on demand from * the pageout daemon. * * Returns nothing. */ static void keg_drain(uma_keg_t keg) { struct slabhead freeslabs = { 0 }; + uma_domain_t dom; uma_slab_t slab, tmp; + int i; /* * We don't want to take pages from statically allocated kegs at this * time */ if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL) return; CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u", keg->uk_name, keg, keg->uk_free); KEG_LOCK(keg); if (keg->uk_free == 0) goto finished; - LIST_FOREACH_SAFE(slab, &keg->uk_free_slab, us_link, tmp) { - /* We have nowhere to free these to. */ - if (slab->us_flags & UMA_SLAB_BOOT) - continue; + for (i = 0; i < vm_ndomains; i++) { + dom = &keg->uk_domain[i]; + LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) { + /* We have nowhere to free these to. */ + if (slab->us_flags & UMA_SLAB_BOOT) + continue; - LIST_REMOVE(slab, us_link); - keg->uk_pages -= keg->uk_ppera; - keg->uk_free -= keg->uk_ipers; + LIST_REMOVE(slab, us_link); + keg->uk_pages -= keg->uk_ppera; + keg->uk_free -= keg->uk_ipers; - if (keg->uk_flags & UMA_ZONE_HASH) - UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data); + if (keg->uk_flags & UMA_ZONE_HASH) + UMA_HASH_REMOVE(&keg->uk_hash, slab, + slab->us_data); - SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); + SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); + } } + finished: KEG_UNLOCK(keg); while ((slab = SLIST_FIRST(&freeslabs)) != NULL) { SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink); keg_free_slab(keg, slab, keg->uk_ipers); } } static void zone_drain_wait(uma_zone_t zone, int waitok) { /* * Set draining to interlock with zone_dtor() so we can release our * locks as we go. Only dtor() should do a WAITOK call since it * is the only call that knows the structure will still be available * when it wakes up. */ ZONE_LOCK(zone); while (zone->uz_flags & UMA_ZFLAG_DRAINING) { if (waitok == M_NOWAIT) goto out; msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1); } zone->uz_flags |= UMA_ZFLAG_DRAINING; bucket_cache_drain(zone); ZONE_UNLOCK(zone); /* * The DRAINING flag protects us from being freed while * we're running. Normally the uma_rwlock would protect us but we * must be able to release and acquire the right lock for each keg. */ zone_foreach_keg(zone, &keg_drain); ZONE_LOCK(zone); zone->uz_flags &= ~UMA_ZFLAG_DRAINING; wakeup(zone); out: ZONE_UNLOCK(zone); } void zone_drain(uma_zone_t zone) { zone_drain_wait(zone, M_NOWAIT); } /* * Allocate a new slab for a keg. This does not insert the slab onto a list. * * Arguments: * wait Shall we wait? * * Returns: * The slab that was allocated or NULL if there is no memory and the * caller specified M_NOWAIT. */ static uma_slab_t -keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) +keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait) { uma_alloc allocf; uma_slab_t slab; uint8_t *mem; uint8_t flags; int i; + KASSERT(domain >= 0 && domain < vm_ndomains, + ("keg_alloc_slab: domain %d out of range", domain)); mtx_assert(&keg->uk_lock, MA_OWNED); slab = NULL; mem = NULL; allocf = keg->uk_allocf; KEG_UNLOCK(keg); if (keg->uk_flags & UMA_ZONE_OFFPAGE) { - slab = zone_alloc_item(keg->uk_slabzone, NULL, wait); + slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, wait); if (slab == NULL) goto out; } /* * This reproduces the old vm_zone behavior of zero filling pages the * first time they are added to a zone. * * Malloced items are zeroed in uma_zalloc. */ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) wait |= M_ZERO; else wait &= ~M_ZERO; if (keg->uk_flags & UMA_ZONE_NODUMP) wait |= M_NODUMP; /* zone is passed for legacy reasons. */ - mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait); + mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, domain, &flags, wait); if (mem == NULL) { if (keg->uk_flags & UMA_ZONE_OFFPAGE) zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); slab = NULL; goto out; } /* Point the slab into the allocated memory */ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) slab = (uma_slab_t )(mem + keg->uk_pgoff); if (keg->uk_flags & UMA_ZONE_VTOSLAB) for (i = 0; i < keg->uk_ppera; i++) vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab); slab->us_keg = keg; slab->us_data = mem; slab->us_freecount = keg->uk_ipers; slab->us_flags = flags; BIT_FILL(SLAB_SETSIZE, &slab->us_free); #ifdef INVARIANTS BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree); #endif + /* + * Set the domain based on the first page. This may be incorrect for + * multi-page allocations depending on the NUMA policy specified. + */ +#if MAXMEMDOM > 1 + if ((flags & UMA_SLAB_BOOT) == 0) + slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE( + pmap_kextract((vm_offset_t)mem))); + else +#endif + slab->us_domain = 0; + if (keg->uk_init != NULL) { for (i = 0; i < keg->uk_ipers; i++) if (keg->uk_init(slab->us_data + (keg->uk_rsize * i), keg->uk_size, wait) != 0) break; if (i != keg->uk_ipers) { keg_free_slab(keg, slab, i); slab = NULL; goto out; } } out: KEG_LOCK(keg); CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)", slab, keg->uk_name, keg); if (slab != NULL) { if (keg->uk_flags & UMA_ZONE_HASH) UMA_HASH_INSERT(&keg->uk_hash, slab, mem); keg->uk_pages += keg->uk_ppera; keg->uk_free += keg->uk_ipers; } return (slab); } /* * This function is intended to be used early on in place of page_alloc() so * that we may use the boot time page cache to satisfy allocations before * the VM is ready. */ static void * -startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) +startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, + int wait) { uma_keg_t keg; void *mem; int pages; keg = zone_first_keg(zone); pages = howmany(bytes, PAGE_SIZE); KASSERT(pages > 0, ("startup_alloc can't reserve 0 pages\n")); /* * Check our small startup cache to see if it has pages remaining. */ mtx_lock(&uma_boot_pages_mtx); if (pages <= boot_pages) { mem = bootmem; boot_pages -= pages; bootmem += pages * PAGE_SIZE; mtx_unlock(&uma_boot_pages_mtx); *pflag = UMA_SLAB_BOOT; return (mem); } mtx_unlock(&uma_boot_pages_mtx); if (booted < UMA_STARTUP2) panic("UMA: Increase vm.boot_pages"); /* * Now that we've booted reset these users to their real allocator. */ #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = (keg->uk_ppera > 1) ? page_alloc : uma_small_alloc; #else keg->uk_allocf = page_alloc; #endif - return keg->uk_allocf(zone, bytes, pflag, wait); + return keg->uk_allocf(zone, bytes, domain, pflag, wait); } /* * Allocates a number of pages from the system * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * -page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) +page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, + int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KMEM; p = (void *) kmem_malloc(kmem_arena, bytes, wait); return (p); } /* * Allocates a number of pages from within an object * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * -noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) +noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, + int wait) { TAILQ_HEAD(, vm_page) alloctail; u_long npages; vm_offset_t retkva, zkva; vm_page_t p, p_next; uma_keg_t keg; TAILQ_INIT(&alloctail); keg = zone_first_keg(zone); npages = howmany(bytes, PAGE_SIZE); while (npages > 0) { - p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | - VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); + p = vm_page_alloc_domain(NULL, 0, domain, + VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (p != NULL) { /* * Since the page does not belong to an object, its * listq is unused. */ TAILQ_INSERT_TAIL(&alloctail, p, listq); npages--; continue; } if (wait & M_WAITOK) { VM_WAIT; continue; } /* * Page allocation failed, free intermediate pages and * exit. */ TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { vm_page_unwire(p, PQ_NONE); vm_page_free(p); } return (NULL); } *flags = UMA_SLAB_PRIV; zkva = keg->uk_kva + atomic_fetchadd_long(&keg->uk_offset, round_page(bytes)); retkva = zkva; TAILQ_FOREACH(p, &alloctail, listq) { pmap_qenter(zkva, &p, 1); zkva += PAGE_SIZE; } return ((void *)retkva); } /* * Frees a number of pages to the system * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void page_free(void *mem, vm_size_t size, uint8_t flags) { struct vmem *vmem; if (flags & UMA_SLAB_KMEM) vmem = kmem_arena; else if (flags & UMA_SLAB_KERNEL) vmem = kernel_arena; else panic("UMA: page_free used with invalid flags %x", flags); kmem_free(vmem, (vm_offset_t)mem, size); } /* * Zero fill initializer * * Arguments/Returns follow uma_init specifications */ static int zero_init(void *mem, int size, int flags) { bzero(mem, size); return (0); } /* * Finish creating a small uma keg. This calculates ipers, and the keg size. * * Arguments * keg The zone we should initialize * * Returns * Nothing */ static void keg_small_init(uma_keg_t keg) { u_int rsize; u_int memused; u_int wastedspace; u_int shsize; u_int slabsize; if (keg->uk_flags & UMA_ZONE_PCPU) { u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU; slabsize = sizeof(struct pcpu); keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu), PAGE_SIZE); } else { slabsize = UMA_SLAB_SIZE; keg->uk_ppera = 1; } /* * Calculate the size of each allocation (rsize) according to * alignment. If the requested size is smaller than we have * allocation bits for we round it up. */ rsize = keg->uk_size; if (rsize < slabsize / SLAB_SETSIZE) rsize = slabsize / SLAB_SETSIZE; if (rsize & keg->uk_align) rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1); keg->uk_rsize = rsize; KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || keg->uk_rsize < sizeof(struct pcpu), ("%s: size %u too large", __func__, keg->uk_rsize)); if (keg->uk_flags & UMA_ZONE_OFFPAGE) shsize = 0; else shsize = sizeof(struct uma_slab); keg->uk_ipers = (slabsize - shsize) / rsize; KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE, ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers)); memused = keg->uk_ipers * rsize + shsize; wastedspace = slabsize - memused; /* * We can't do OFFPAGE if we're internal or if we've been * asked to not go to the VM for buckets. If we do this we * may end up going to the VM for slabs which we do not * want to do if we're UMA_ZFLAG_CACHEONLY as a result * of UMA_ZONE_VM, which clearly forbids it. */ if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) || (keg->uk_flags & UMA_ZFLAG_CACHEONLY)) return; /* * See if using an OFFPAGE slab will limit our waste. Only do * this if it permits more items per-slab. * * XXX We could try growing slabsize to limit max waste as well. * Historically this was not done because the VM could not * efficiently handle contiguous allocations. */ if ((wastedspace >= slabsize / UMA_MAX_WASTE) && (keg->uk_ipers < (slabsize / keg->uk_rsize))) { keg->uk_ipers = slabsize / keg->uk_rsize; KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE, ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers)); CTR6(KTR_UMA, "UMA decided we need offpage slab headers for " "keg: %s(%p), calculated wastedspace = %d, " "maximum wasted space allowed = %d, " "calculated ipers = %d, " "new wasted space = %d\n", keg->uk_name, keg, wastedspace, slabsize / UMA_MAX_WASTE, keg->uk_ipers, slabsize - keg->uk_ipers * keg->uk_rsize); keg->uk_flags |= UMA_ZONE_OFFPAGE; } if ((keg->uk_flags & UMA_ZONE_OFFPAGE) && (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0) keg->uk_flags |= UMA_ZONE_HASH; } /* * Finish creating a large (> UMA_SLAB_SIZE) uma kegs. Just give in and do * OFFPAGE for now. When I can allow for more dynamic slab sizes this will be * more complicated. * * Arguments * keg The keg we should initialize * * Returns * Nothing */ static void keg_large_init(uma_keg_t keg) { u_int shsize; KASSERT(keg != NULL, ("Keg is null in keg_large_init")); KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0, ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg")); KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0, ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__)); keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE); keg->uk_ipers = 1; keg->uk_rsize = keg->uk_size; /* We can't do OFFPAGE if we're internal, bail out here. */ if (keg->uk_flags & UMA_ZFLAG_INTERNAL) return; /* Check whether we have enough space to not do OFFPAGE. */ if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) { shsize = sizeof(struct uma_slab); if (shsize & UMA_ALIGN_PTR) shsize = (shsize & ~UMA_ALIGN_PTR) + (UMA_ALIGN_PTR + 1); if ((PAGE_SIZE * keg->uk_ppera) - keg->uk_rsize < shsize) keg->uk_flags |= UMA_ZONE_OFFPAGE; } if ((keg->uk_flags & UMA_ZONE_OFFPAGE) && (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0) keg->uk_flags |= UMA_ZONE_HASH; } static void keg_cachespread_init(uma_keg_t keg) { int alignsize; int trailer; int pages; int rsize; KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0, ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__)); alignsize = keg->uk_align + 1; rsize = keg->uk_size; /* * We want one item to start on every align boundary in a page. To * do this we will span pages. We will also extend the item by the * size of align if it is an even multiple of align. Otherwise, it * would fall on the same boundary every time. */ if (rsize & keg->uk_align) rsize = (rsize & ~keg->uk_align) + alignsize; if ((rsize & alignsize) == 0) rsize += alignsize; trailer = rsize - keg->uk_size; pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE; pages = MIN(pages, (128 * 1024) / PAGE_SIZE); keg->uk_rsize = rsize; keg->uk_ppera = pages; keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize; keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB; KASSERT(keg->uk_ipers <= SLAB_SETSIZE, ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__, keg->uk_ipers)); } /* * Keg header ctor. This initializes all fields, locks, etc. And inserts * the keg onto the global keg list. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_kctor_args */ static int keg_ctor(void *mem, int size, void *udata, int flags) { struct uma_kctor_args *arg = udata; uma_keg_t keg = mem; uma_zone_t zone; bzero(keg, size); keg->uk_size = arg->size; keg->uk_init = arg->uminit; keg->uk_fini = arg->fini; keg->uk_align = arg->align; + keg->uk_cursor = 0; keg->uk_free = 0; keg->uk_reserve = 0; keg->uk_pages = 0; keg->uk_flags = arg->flags; keg->uk_slabzone = NULL; /* * The master zone is passed to us at keg-creation time. */ zone = arg->zone; keg->uk_name = zone->uz_name; if (arg->flags & UMA_ZONE_VM) keg->uk_flags |= UMA_ZFLAG_CACHEONLY; if (arg->flags & UMA_ZONE_ZINIT) keg->uk_init = zero_init; if (arg->flags & UMA_ZONE_MALLOC) keg->uk_flags |= UMA_ZONE_VTOSLAB; if (arg->flags & UMA_ZONE_PCPU) #ifdef SMP keg->uk_flags |= UMA_ZONE_OFFPAGE; #else keg->uk_flags &= ~UMA_ZONE_PCPU; #endif if (keg->uk_flags & UMA_ZONE_CACHESPREAD) { keg_cachespread_init(keg); } else { if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab))) keg_large_init(keg); else keg_small_init(keg); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) keg->uk_slabzone = slabzone; /* * If we haven't booted yet we need allocations to go through the * startup cache until the vm is ready. */ if (booted < UMA_STARTUP2) keg->uk_allocf = startup_alloc; #ifdef UMA_MD_SMALL_ALLOC else if (keg->uk_ppera == 1) keg->uk_allocf = uma_small_alloc; #endif else keg->uk_allocf = page_alloc; #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera == 1) keg->uk_freef = uma_small_free; else #endif keg->uk_freef = page_free; /* * Initialize keg's lock */ KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS)); /* * If we're putting the slab header in the actual page we need to * figure out where in each page it goes. This calculates a right * justified offset into the memory on an ALIGN_PTR boundary. */ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) { u_int totsize; /* Size of the slab struct and free list */ totsize = sizeof(struct uma_slab); if (totsize & UMA_ALIGN_PTR) totsize = (totsize & ~UMA_ALIGN_PTR) + (UMA_ALIGN_PTR + 1); keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize; /* * The only way the following is possible is if with our * UMA_ALIGN_PTR adjustments we are now bigger than * UMA_SLAB_SIZE. I haven't checked whether this is * mathematically possible for all cases, so we make * sure here anyway. */ totsize = keg->uk_pgoff + sizeof(struct uma_slab); if (totsize > PAGE_SIZE * keg->uk_ppera) { printf("zone %s ipers %d rsize %d size %d\n", zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size); panic("UMA slab won't fit."); } } if (keg->uk_flags & UMA_ZONE_HASH) hash_alloc(&keg->uk_hash); CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n", keg, zone->uz_name, zone, (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free, keg->uk_free); LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); rw_wunlock(&uma_rwlock); return (0); } /* * Zone header ctor. This initializes all fields, locks, etc. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_zctor_args */ static int zone_ctor(void *mem, int size, void *udata, int flags) { struct uma_zctor_args *arg = udata; uma_zone_t zone = mem; uma_zone_t z; uma_keg_t keg; bzero(zone, size); zone->uz_name = arg->name; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; zone->uz_slab = zone_fetch_slab; zone->uz_init = NULL; zone->uz_fini = NULL; zone->uz_allocs = 0; zone->uz_frees = 0; zone->uz_fails = 0; zone->uz_sleeps = 0; zone->uz_count = 0; zone->uz_count_min = 0; + zone->uz_sel = NULL; zone->uz_flags = 0; zone->uz_warning = NULL; timevalclear(&zone->uz_ratecheck); keg = arg->keg; ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS)); /* * This is a pure cache zone, no kegs. */ if (arg->import) { if (arg->flags & UMA_ZONE_VM) arg->flags |= UMA_ZFLAG_CACHEONLY; zone->uz_flags = arg->flags; zone->uz_size = arg->size; zone->uz_import = arg->import; zone->uz_release = arg->release; zone->uz_arg = arg->arg; zone->uz_lockptr = &zone->uz_lock; rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); rw_wunlock(&uma_rwlock); goto out; } /* * Use the regular zone/keg/slab allocator. */ zone->uz_import = (uma_import)zone_import; zone->uz_release = (uma_release)zone_release; zone->uz_arg = zone; if (arg->flags & UMA_ZONE_SECONDARY) { KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); zone->uz_init = arg->uminit; zone->uz_fini = arg->fini; zone->uz_lockptr = &keg->uk_lock; zone->uz_flags |= UMA_ZONE_SECONDARY; rw_wlock(&uma_rwlock); ZONE_LOCK(zone); LIST_FOREACH(z, &keg->uk_zones, uz_link) { if (LIST_NEXT(z, uz_link) == NULL) { LIST_INSERT_AFTER(z, zone, uz_link); break; } } ZONE_UNLOCK(zone); rw_wunlock(&uma_rwlock); } else if (keg == NULL) { if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini, arg->align, arg->flags)) == NULL) return (ENOMEM); } else { struct uma_kctor_args karg; int error; /* We should only be here from uma_startup() */ karg.size = arg->size; karg.uminit = arg->uminit; karg.fini = arg->fini; karg.align = arg->align; karg.flags = arg->flags; karg.zone = zone; error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, flags); if (error) return (error); } /* * Link in the first keg. */ zone->uz_klink.kl_keg = keg; LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link); zone->uz_lockptr = &keg->uk_lock; zone->uz_size = keg->uk_size; zone->uz_flags |= (keg->uk_flags & (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); /* * Some internal zones don't have room allocated for the per cpu * caches. If we're internal, bail out here. */ if (keg->uk_flags & UMA_ZFLAG_INTERNAL) { KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0, ("Secondary zone requested UMA_ZFLAG_INTERNAL")); return (0); } out: if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0) zone->uz_count = bucket_select(zone->uz_size); else zone->uz_count = BUCKET_MAX; zone->uz_count_min = zone->uz_count; return (0); } /* * Keg header dtor. This frees all data, destroys locks, frees the hash * table and removes the keg from the global list. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void keg_dtor(void *arg, int size, void *udata) { uma_keg_t keg; keg = (uma_keg_t)arg; KEG_LOCK(keg); if (keg->uk_free != 0) { printf("Freed UMA keg (%s) was not empty (%d items). " " Lost %d pages of memory.\n", keg->uk_name ? keg->uk_name : "", keg->uk_free, keg->uk_pages); } KEG_UNLOCK(keg); hash_free(&keg->uk_hash); KEG_LOCK_FINI(keg); } /* * Zone header dtor. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void zone_dtor(void *arg, int size, void *udata) { uma_klink_t klink; uma_zone_t zone; uma_keg_t keg; zone = (uma_zone_t)arg; keg = zone_first_keg(zone); if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone); rw_wlock(&uma_rwlock); LIST_REMOVE(zone, uz_link); rw_wunlock(&uma_rwlock); /* * XXX there are some races here where * the zone can be drained but zone lock * released and then refilled before we * remove it... we dont care for now */ zone_drain_wait(zone, M_WAITOK); /* * Unlink all of our kegs. */ while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) { klink->kl_keg = NULL; LIST_REMOVE(klink, kl_link); if (klink == &zone->uz_klink) continue; free(klink, M_TEMP); } /* * We only destroy kegs from non secondary zones. */ if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0) { rw_wlock(&uma_rwlock); LIST_REMOVE(keg, uk_link); rw_wunlock(&uma_rwlock); zone_free_item(kegs, keg, NULL, SKIP_NONE); } ZONE_LOCK_FINI(zone); } /* * Traverses every zone in the system and calls a callback * * Arguments: * zfunc A pointer to a function which accepts a zone * as an argument. * * Returns: * Nothing */ static void zone_foreach(void (*zfunc)(uma_zone_t)) { uma_keg_t keg; uma_zone_t zone; rw_rlock(&uma_rwlock); LIST_FOREACH(keg, &uma_kegs, uk_link) { LIST_FOREACH(zone, &keg->uk_zones, uz_link) zfunc(zone); } rw_runlock(&uma_rwlock); } /* Public functions */ /* See uma.h */ void uma_startup(void *mem, int npages) { struct uma_zctor_args args; rw_init(&uma_rwlock, "UMA lock"); /* "manually" create the initial zone */ memset(&args, 0, sizeof(args)); args.name = "UMA Kegs"; args.size = sizeof(struct uma_keg); args.ctor = keg_ctor; args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = &masterkeg; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK); mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF); bootmem = mem; boot_pages = npages; args.name = "UMA Zones"; args.size = sizeof(struct uma_zone) + (sizeof(struct uma_cache) * (mp_maxid + 1)); args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = NULL; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK); /* Now make a zone for slab headers */ slabzone = uma_zcreate("UMA Slabs", sizeof(struct uma_slab), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); bucket_init(); booted = UMA_STARTUP; } /* see uma.h */ void uma_startup2(void) { booted = UMA_STARTUP2; bucket_enable(); sx_init(&uma_drain_lock, "umadrain"); } /* * Initialize our callout handle * */ static void uma_startup3(void) { callout_init(&uma_callout, 1); callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); } static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_kctor_args args; args.size = size; args.uminit = uminit; args.fini = fini; args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align; args.flags = flags; args.zone = zone; - return (zone_alloc_item(kegs, &args, M_WAITOK)); + return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); } /* See uma.h */ void uma_set_align(int align) { if (align != UMA_ALIGN_CACHE) uma_align_cache = align; } /* See uma.h */ uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_zctor_args args; uma_zone_t res; bool locked; KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"", align, name)); /* This stuff is essential for the zone ctor */ memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = uminit; args.fini = fini; #ifdef INVARIANTS /* * If a zone is being created with an empty constructor and * destructor, pass UMA constructor/destructor which checks for * memory use after free. */ if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) && ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) { args.ctor = trash_ctor; args.dtor = trash_dtor; args.uminit = trash_init; args.fini = trash_fini; } #endif args.align = align; args.flags = flags; args.keg = NULL; if (booted < UMA_STARTUP2) { locked = false; } else { sx_slock(&uma_drain_lock); locked = true; } - res = zone_alloc_item(zones, &args, M_WAITOK); + res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); if (locked) sx_sunlock(&uma_drain_lock); return (res); } /* See uma.h */ uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t master) { struct uma_zctor_args args; uma_keg_t keg; uma_zone_t res; bool locked; keg = zone_first_keg(master); memset(&args, 0, sizeof(args)); args.name = name; args.size = keg->uk_size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.align = keg->uk_align; args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; args.keg = keg; if (booted < UMA_STARTUP2) { locked = false; } else { sx_slock(&uma_drain_lock); locked = true; } /* XXX Attaches only one keg of potentially many. */ - res = zone_alloc_item(zones, &args, M_WAITOK); + res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); if (locked) sx_sunlock(&uma_drain_lock); return (res); } /* See uma.h */ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease, void *arg, int flags) { struct uma_zctor_args args; memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.import = zimport; args.release = zrelease; args.arg = arg; args.align = 0; args.flags = flags; - return (zone_alloc_item(zones, &args, M_WAITOK)); + return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); } static void zone_lock_pair(uma_zone_t a, uma_zone_t b) { if (a < b) { ZONE_LOCK(a); mtx_lock_flags(b->uz_lockptr, MTX_DUPOK); } else { ZONE_LOCK(b); mtx_lock_flags(a->uz_lockptr, MTX_DUPOK); } } static void zone_unlock_pair(uma_zone_t a, uma_zone_t b) { ZONE_UNLOCK(a); ZONE_UNLOCK(b); } int uma_zsecond_add(uma_zone_t zone, uma_zone_t master) { uma_klink_t klink; uma_klink_t kl; int error; error = 0; klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO); zone_lock_pair(zone, master); /* * zone must use vtoslab() to resolve objects and must already be * a secondary. */ if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) { error = EINVAL; goto out; } /* * The new master must also use vtoslab(). */ if ((zone->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) { error = EINVAL; goto out; } /* * The underlying object must be the same size. rsize * may be different. */ if (master->uz_size != zone->uz_size) { error = E2BIG; goto out; } /* * Put it at the end of the list. */ klink->kl_keg = zone_first_keg(master); LIST_FOREACH(kl, &zone->uz_kegs, kl_link) { if (LIST_NEXT(kl, kl_link) == NULL) { LIST_INSERT_AFTER(kl, klink, kl_link); break; } } klink = NULL; zone->uz_flags |= UMA_ZFLAG_MULTI; zone->uz_slab = zone_fetch_slab_multi; out: zone_unlock_pair(zone, master); if (klink != NULL) free(klink, M_TEMP); return (error); } /* See uma.h */ void uma_zdestroy(uma_zone_t zone) { sx_slock(&uma_drain_lock); zone_free_item(zones, zone, NULL, SKIP_NONE); sx_sunlock(&uma_drain_lock); } /* See uma.h */ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { - void *item; - uma_cache_t cache; + uma_zone_domain_t zdom; uma_bucket_t bucket; - int lockfail; - int cpu; + uma_cache_t cache; + void *item; + int cpu, domain, lockfail; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); /* This is the fast path allocation */ CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d", curthread, zone->uz_name, zone, flags); if (flags & M_WAITOK) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_arg: zone \"%s\"", zone->uz_name); } KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zalloc_arg: called with spinlock or critical section held")); #ifdef DEBUG_MEMGUARD if (memguard_cmp_zone(zone)) { item = memguard_alloc(zone->uz_size, flags); if (item != NULL) { if (zone->uz_init != NULL && zone->uz_init(item, zone->uz_size, flags) != 0) return (NULL); if (zone->uz_ctor != NULL && zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { zone->uz_fini(item, zone->uz_size); return (NULL); } return (item); } /* This is unfortunate but should not be fatal. */ } #endif /* * If possible, allocate from the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to allocate from * the current cache; when we re-acquire the critical section, we * must detect and handle migration if it has occurred. */ critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; zalloc_start: bucket = cache->uc_allocbucket; if (bucket != NULL && bucket->ub_cnt > 0) { bucket->ub_cnt--; item = bucket->ub_bucket[bucket->ub_cnt]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt] = NULL; #endif KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); cache->uc_allocs++; critical_exit(); if (zone->uz_ctor != NULL && zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { atomic_add_long(&zone->uz_fails, 1); zone_free_item(zone, item, udata, SKIP_DTOR); return (NULL); } #ifdef INVARIANTS uma_dbg_alloc(zone, NULL, item); #endif if (flags & M_ZERO) uma_zero_item(item, zone); return (item); } /* * We have run out of items in our alloc bucket. * See if we can switch with our free bucket. */ bucket = cache->uc_freebucket; if (bucket != NULL && bucket->ub_cnt > 0) { CTR2(KTR_UMA, "uma_zalloc: zone %s(%p) swapping empty with alloc", zone->uz_name, zone); cache->uc_freebucket = cache->uc_allocbucket; cache->uc_allocbucket = bucket; goto zalloc_start; } /* * Discard any empty allocation bucket while we hold no locks. */ bucket = cache->uc_allocbucket; cache->uc_allocbucket = NULL; critical_exit(); if (bucket != NULL) bucket_free(zone, bucket, udata); /* Short-circuit for zones without buckets and low memory. */ - if (zone->uz_count == 0 || bucketdisable) + if (zone->uz_count == 0 || bucketdisable) { + domain = UMA_ANYDOMAIN; goto zalloc_item; + } /* * Attempt to retrieve the item from the per-CPU cache has failed, so * we must go back to the zone. This requires the zone lock, so we * must drop the critical section, then re-acquire it when we go back * to the cache. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ lockfail = 0; if (ZONE_TRYLOCK(zone) == 0) { /* Record contention to size the buckets. */ ZONE_LOCK(zone); lockfail = 1; } critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; /* * Since we have locked the zone we may as well send back our stats. */ atomic_add_long(&zone->uz_allocs, cache->uc_allocs); atomic_add_long(&zone->uz_frees, cache->uc_frees); cache->uc_allocs = 0; cache->uc_frees = 0; /* See if we lost the race to fill the cache. */ if (cache->uc_allocbucket != NULL) { ZONE_UNLOCK(zone); goto zalloc_start; } + if (zone->uz_sel == NULL) { + domain = UMA_ANYDOMAIN; + zdom = &zone->uz_domain[0]; + } else { + domain = vm_domain_select_first(zone->uz_sel); + zdom = &zone->uz_domain[domain]; + } + /* * Check the zone's cache of buckets. */ - if ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) { + if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) { KASSERT(bucket->ub_cnt != 0, ("uma_zalloc_arg: Returning an empty bucket.")); LIST_REMOVE(bucket, ub_link); cache->uc_allocbucket = bucket; ZONE_UNLOCK(zone); goto zalloc_start; } /* We are no longer associated with this CPU. */ critical_exit(); /* * We bump the uz count when the cache size is insufficient to * handle the working set. */ if (lockfail && zone->uz_count < BUCKET_MAX) zone->uz_count++; ZONE_UNLOCK(zone); /* * Now lets just fill a bucket and put it on the free list. If that * works we'll restart the allocation from the beginning and it * will use the just filled bucket. */ - bucket = zone_alloc_bucket(zone, udata, flags); + bucket = zone_alloc_bucket(zone, udata, domain, flags); CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", zone->uz_name, zone, bucket); if (bucket != NULL) { ZONE_LOCK(zone); critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; /* * See if we lost the race or were migrated. Cache the * initialized bucket to make this less likely or claim * the memory directly. */ - if (cache->uc_allocbucket == NULL) - cache->uc_allocbucket = bucket; + if (cache->uc_allocbucket != NULL || + (domain != UMA_ANYDOMAIN && domain != PCPU_GET(domain))) + LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); else - LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link); + cache->uc_allocbucket = bucket; ZONE_UNLOCK(zone); goto zalloc_start; } /* * We may not be able to get a bucket so return an actual item. */ zalloc_item: - item = zone_alloc_item(zone, udata, flags); + item = zone_alloc_item(zone, udata, domain, flags); return (item); } +/* + * Find a slab with some space. Prefer slabs that are partially used over those + * that are totally full. This helps to reduce fragmentation. + */ static uma_slab_t -keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags) +keg_first_slab(uma_keg_t keg, int domain) { + uma_domain_t dom; uma_slab_t slab; - int reserve; + KASSERT(domain >= 0 && domain < vm_ndomains, + ("keg_first_slab: domain %d out of range", domain)); + + dom = &keg->uk_domain[domain]; + if (!LIST_EMPTY(&dom->ud_part_slab)) + return (LIST_FIRST(&dom->ud_part_slab)); + if (LIST_EMPTY(&dom->ud_free_slab)) + return (NULL); + slab = LIST_FIRST(&dom->ud_free_slab); + LIST_REMOVE(slab, us_link); + LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); + return (slab); +} + +static uma_slab_t +keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, int flags) +{ + uma_domain_t dom; + uma_slab_t slab; + int domain, reserve, start; + mtx_assert(&keg->uk_lock, MA_OWNED); slab = NULL; reserve = 0; if ((flags & M_USE_RESERVE) == 0) reserve = keg->uk_reserve; - for (;;) { - /* - * Find a slab with some space. Prefer slabs that are partially - * used over those that are totally full. This helps to reduce - * fragmentation. - */ - if (keg->uk_free > reserve) { - if (!LIST_EMPTY(&keg->uk_part_slab)) { - slab = LIST_FIRST(&keg->uk_part_slab); - } else { - slab = LIST_FIRST(&keg->uk_free_slab); - LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_part_slab, slab, - us_link); - } + if (rdomain == UMA_ANYDOMAIN) { + keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + domain = start = keg->uk_cursor; + } else + domain = start = rdomain; + + do { + if (keg->uk_free > reserve && + (slab = keg_first_slab(keg, domain)) != NULL) { MPASS(slab->us_keg == keg); return (slab); } /* * M_NOVM means don't ask at all! */ if (flags & M_NOVM) break; if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) { keg->uk_flags |= UMA_ZFLAG_FULL; /* * If this is not a multi-zone, set the FULL bit. * Otherwise slab_multi() takes care of it. */ if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) { zone->uz_flags |= UMA_ZFLAG_FULL; zone_log_warning(zone); zone_maxaction(zone); } if (flags & M_NOWAIT) break; zone->uz_sleeps++; msleep(keg, &keg->uk_lock, PVM, "keglimit", 0); continue; } - slab = keg_alloc_slab(keg, zone, flags); + slab = keg_alloc_slab(keg, zone, domain, flags); /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove * at least one item. */ if (slab) { MPASS(slab->us_keg == keg); - LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); + dom = &keg->uk_domain[slab->us_domain]; + LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); return (slab); } /* * We might not have been able to get a slab but another cpu * could have while we were unlocked. Check again before we * fail. */ - flags |= M_NOVM; - } - return (slab); + if ((slab = keg_first_slab(keg, domain)) != NULL) { + MPASS(slab->us_keg == keg); + return (slab); + } + if (rdomain == UMA_ANYDOMAIN) { + keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + domain = keg->uk_cursor; + } + } while (domain != start); + + return (NULL); } static uma_slab_t -zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags) +zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags) { uma_slab_t slab; if (keg == NULL) { keg = zone_first_keg(zone); KEG_LOCK(keg); } for (;;) { - slab = keg_fetch_slab(keg, zone, flags); + slab = keg_fetch_slab(keg, zone, domain, flags); if (slab) return (slab); if (flags & (M_NOWAIT | M_NOVM)) break; } KEG_UNLOCK(keg); return (NULL); } /* * uma_zone_fetch_slab_multi: Fetches a slab from one available keg. Returns * with the keg locked. On NULL no lock is held. * * The last pointer is used to seed the search. It is not required. */ static uma_slab_t -zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags) +zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int domain, int rflags) { uma_klink_t klink; uma_slab_t slab; uma_keg_t keg; int flags; int empty; int full; /* * Don't wait on the first pass. This will skip limit tests * as well. We don't want to block if we can find a provider * without blocking. */ flags = (rflags & ~M_WAITOK) | M_NOWAIT; /* * Use the last slab allocated as a hint for where to start * the search. */ if (last != NULL) { - slab = keg_fetch_slab(last, zone, flags); + slab = keg_fetch_slab(last, zone, domain, flags); if (slab) return (slab); KEG_UNLOCK(last); } /* * Loop until we have a slab incase of transient failures * while M_WAITOK is specified. I'm not sure this is 100% * required but we've done it for so long now. */ for (;;) { empty = 0; full = 0; /* * Search the available kegs for slabs. Be careful to hold the * correct lock while calling into the keg layer. */ LIST_FOREACH(klink, &zone->uz_kegs, kl_link) { keg = klink->kl_keg; KEG_LOCK(keg); if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) { - slab = keg_fetch_slab(keg, zone, flags); + slab = keg_fetch_slab(keg, zone, domain, flags); if (slab) return (slab); } if (keg->uk_flags & UMA_ZFLAG_FULL) full++; else empty++; KEG_UNLOCK(keg); } if (rflags & (M_NOWAIT | M_NOVM)) break; flags = rflags; /* * All kegs are full. XXX We can't atomically check all kegs * and sleep so just sleep for a short period and retry. */ if (full && !empty) { ZONE_LOCK(zone); zone->uz_flags |= UMA_ZFLAG_FULL; zone->uz_sleeps++; zone_log_warning(zone); zone_maxaction(zone); msleep(zone, zone->uz_lockptr, PVM, "zonelimit", hz/100); zone->uz_flags &= ~UMA_ZFLAG_FULL; ZONE_UNLOCK(zone); continue; } } return (NULL); } static void * slab_alloc_item(uma_keg_t keg, uma_slab_t slab) { + uma_domain_t dom; void *item; uint8_t freei; MPASS(keg == slab->us_keg); mtx_assert(&keg->uk_lock, MA_OWNED); freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1; BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free); item = slab->us_data + (keg->uk_rsize * freei); slab->us_freecount--; keg->uk_free--; /* Move this slab to the full list */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link); + dom = &keg->uk_domain[slab->us_domain]; + LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); } return (item); } static int -zone_import(uma_zone_t zone, void **bucket, int max, int flags) +zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags) { uma_slab_t slab; uma_keg_t keg; int i; slab = NULL; keg = NULL; /* Try to keep the buckets totally full */ for (i = 0; i < max; ) { - if ((slab = zone->uz_slab(zone, keg, flags)) == NULL) + if ((slab = zone->uz_slab(zone, keg, domain, flags)) == NULL) break; keg = slab->us_keg; while (slab->us_freecount && i < max) { bucket[i++] = slab_alloc_item(keg, slab); if (keg->uk_free <= keg->uk_reserve) break; +#if MAXMEMDOM > 1 + /* + * If the zone is striped we pick a new slab for every + * allocation. Eliminating this conditional will + * instead pick a new domain for each bucket rather than + * stripe within each bucket. The current option + * produces more fragmentation but yields better + * distribution. + */ + if (domain == UMA_ANYDOMAIN && vm_ndomains > 1) + break; +#endif } /* Don't grab more than one slab at a time. */ flags &= ~M_WAITOK; flags |= M_NOWAIT; } if (slab != NULL) KEG_UNLOCK(keg); return i; } static uma_bucket_t -zone_alloc_bucket(uma_zone_t zone, void *udata, int flags) +zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) { uma_bucket_t bucket; int max; /* Don't wait for buckets, preserve caller's NOVM setting. */ bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); if (bucket == NULL) return (NULL); max = MIN(bucket->ub_entries, zone->uz_count); bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, - max, flags); + max, domain, flags); /* * Initialize the memory if necessary. */ if (bucket->ub_cnt != 0 && zone->uz_init != NULL) { int i; for (i = 0; i < bucket->ub_cnt; i++) if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size, flags) != 0) break; /* * If we couldn't initialize the whole bucket, put the * rest back onto the freelist. */ if (i != bucket->ub_cnt) { zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i], bucket->ub_cnt - i); #ifdef INVARIANTS bzero(&bucket->ub_bucket[i], sizeof(void *) * (bucket->ub_cnt - i)); #endif bucket->ub_cnt = i; } } if (bucket->ub_cnt == 0) { bucket_free(zone, bucket, udata); atomic_add_long(&zone->uz_fails, 1); return (NULL); } return (bucket); } /* * Allocates a single item from a zone. * * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns * NULL if there is no memory and M_NOWAIT is set * An item if successful */ static void * -zone_alloc_item(uma_zone_t zone, void *udata, int flags) +zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) { void *item; item = NULL; - if (zone->uz_import(zone->uz_arg, &item, 1, flags) != 1) + if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) goto fail; atomic_add_long(&zone->uz_allocs, 1); /* * We have to call both the zone's init (not the keg's init) * and the zone's ctor. This is because the item is going from * a keg slab directly to the user, and the user is expecting it * to be both zone-init'd as well as zone-ctor'd. */ if (zone->uz_init != NULL) { if (zone->uz_init(item, zone->uz_size, flags) != 0) { zone_free_item(zone, item, udata, SKIP_FINI); goto fail; } } if (zone->uz_ctor != NULL) { if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { zone_free_item(zone, item, udata, SKIP_DTOR); goto fail; } } #ifdef INVARIANTS uma_dbg_alloc(zone, NULL, item); #endif if (flags & M_ZERO) uma_zero_item(item, zone); CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item, zone->uz_name, zone); return (item); fail: CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)", zone->uz_name, zone); atomic_add_long(&zone->uz_fails, 1); return (NULL); } /* See uma.h */ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_cache_t cache; uma_bucket_t bucket; - int lockfail; - int cpu; + uma_zone_domain_t zdom; + int cpu, domain, lockfail; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread, zone->uz_name); KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zfree_arg: called with spinlock or critical section held")); /* uma_zfree(..., NULL) does nothing, to match free(9). */ if (item == NULL) return; #ifdef DEBUG_MEMGUARD if (is_memguard_addr(item)) { if (zone->uz_dtor != NULL) zone->uz_dtor(item, zone->uz_size, udata); if (zone->uz_fini != NULL) zone->uz_fini(item, zone->uz_size); memguard_free(item); return; } #endif #ifdef INVARIANTS if (zone->uz_flags & UMA_ZONE_MALLOC) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); #endif if (zone->uz_dtor != NULL) zone->uz_dtor(item, zone->uz_size, udata); /* * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. */ if (zone->uz_flags & UMA_ZFLAG_FULL) goto zfree_item; /* * If possible, free to the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to free to the * current cache; when we re-acquire the critical section, we must * detect and handle migration if it has occurred. */ zfree_restart: critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; zfree_start: /* * Try to free into the allocbucket first to give LIFO ordering * for cache-hot datastructures. Spill over into the freebucket * if necessary. Alloc will swap them if one runs dry. */ bucket = cache->uc_allocbucket; if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries) bucket = cache->uc_freebucket; if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) { KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL, ("uma_zfree: Freeing to non free bucket index.")); bucket->ub_bucket[bucket->ub_cnt] = item; bucket->ub_cnt++; cache->uc_frees++; critical_exit(); return; } /* * We must go back the zone, which requires acquiring the zone lock, * which in turn means we must release and re-acquire the critical * section. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ critical_exit(); if (zone->uz_count == 0 || bucketdisable) goto zfree_item; lockfail = 0; if (ZONE_TRYLOCK(zone) == 0) { /* Record contention to size the buckets. */ ZONE_LOCK(zone); lockfail = 1; } critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; /* * Since we have locked the zone we may as well send back our stats. */ atomic_add_long(&zone->uz_allocs, cache->uc_allocs); atomic_add_long(&zone->uz_frees, cache->uc_frees); cache->uc_allocs = 0; cache->uc_frees = 0; bucket = cache->uc_freebucket; if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) { ZONE_UNLOCK(zone); goto zfree_start; } cache->uc_freebucket = NULL; /* We are no longer associated with this CPU. */ critical_exit(); + if (zone->uz_sel == NULL) { + zdom = &zone->uz_domain[0]; + domain = UMA_ANYDOMAIN; + } else { + domain = vm_domain_select_first(zone->uz_sel); + zdom = &zone->uz_domain[domain]; + } + /* Can we throw this on the zone full list? */ if (bucket != NULL) { CTR3(KTR_UMA, "uma_zfree: zone %s(%p) putting bucket %p on free list", zone->uz_name, zone, bucket); /* ub_cnt is pointing to the last free item */ KASSERT(bucket->ub_cnt != 0, ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n")); - LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link); + LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); } /* * We bump the uz count when the cache size is insufficient to * handle the working set. */ if (lockfail && zone->uz_count < BUCKET_MAX) zone->uz_count++; ZONE_UNLOCK(zone); bucket = bucket_alloc(zone, udata, M_NOWAIT); CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p", zone->uz_name, zone, bucket); if (bucket) { critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; - if (cache->uc_freebucket == NULL) { + if (cache->uc_freebucket == NULL && + (domain == UMA_ANYDOMAIN || domain == PCPU_GET(domain))) { cache->uc_freebucket = bucket; goto zfree_start; } /* * We lost the race, start over. We have to drop our * critical section to free the bucket. */ critical_exit(); bucket_free(zone, bucket, udata); goto zfree_restart; } /* * If nothing else caught this, we'll just do an internal free. */ zfree_item: zone_free_item(zone, item, udata, SKIP_DTOR); return; } static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item) { + uma_domain_t dom; uint8_t freei; mtx_assert(&keg->uk_lock, MA_OWNED); MPASS(keg == slab->us_keg); + dom = &keg->uk_domain[slab->us_domain]; + /* Do we need to remove from any lists? */ if (slab->us_freecount+1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); + LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); + LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); } /* Slab management. */ freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize; BIT_SET(SLAB_SETSIZE, freei, &slab->us_free); slab->us_freecount++; /* Keg statistics. */ keg->uk_free++; } static void zone_release(uma_zone_t zone, void **bucket, int cnt) { void *item; uma_slab_t slab; uma_keg_t keg; uint8_t *mem; int clearfull; int i; clearfull = 0; keg = zone_first_keg(zone); KEG_LOCK(keg); for (i = 0; i < cnt; i++) { item = bucket[i]; if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) { mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); if (zone->uz_flags & UMA_ZONE_HASH) { slab = hash_sfind(&keg->uk_hash, mem); } else { mem += keg->uk_pgoff; slab = (uma_slab_t)mem; } } else { slab = vtoslab((vm_offset_t)item); if (slab->us_keg != keg) { KEG_UNLOCK(keg); keg = slab->us_keg; KEG_LOCK(keg); } } slab_free_item(keg, slab, item); if (keg->uk_flags & UMA_ZFLAG_FULL) { if (keg->uk_pages < keg->uk_maxpages) { keg->uk_flags &= ~UMA_ZFLAG_FULL; clearfull = 1; } /* * We can handle one more allocation. Since we're * clearing ZFLAG_FULL, wake up all procs blocked * on pages. This should be uncommon, so keeping this * simple for now (rather than adding count of blocked * threads etc). */ wakeup(keg); } } KEG_UNLOCK(keg); if (clearfull) { ZONE_LOCK(zone); zone->uz_flags &= ~UMA_ZFLAG_FULL; wakeup(zone); ZONE_UNLOCK(zone); } } /* * Frees a single item to any zone. * * Arguments: * zone The zone to free to * item The item we're freeing * udata User supplied data for the dtor * skip Skip dtors and finis */ static void zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) { #ifdef INVARIANTS if (skip == SKIP_NONE) { if (zone->uz_flags & UMA_ZONE_MALLOC) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); } #endif if (skip < SKIP_DTOR && zone->uz_dtor) zone->uz_dtor(item, zone->uz_size, udata); if (skip < SKIP_FINI && zone->uz_fini) zone->uz_fini(item, zone->uz_size); atomic_add_long(&zone->uz_frees, 1); zone->uz_release(zone->uz_arg, &item, 1); } /* See uma.h */ int uma_zone_set_max(uma_zone_t zone, int nitems) { uma_keg_t keg; keg = zone_first_keg(zone); if (keg == NULL) return (0); KEG_LOCK(keg); keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera; if (keg->uk_maxpages * keg->uk_ipers < nitems) keg->uk_maxpages += keg->uk_ppera; nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers; KEG_UNLOCK(keg); return (nitems); } /* See uma.h */ int uma_zone_get_max(uma_zone_t zone) { int nitems; uma_keg_t keg; keg = zone_first_keg(zone); if (keg == NULL) return (0); KEG_LOCK(keg); nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers; KEG_UNLOCK(keg); return (nitems); } /* See uma.h */ void uma_zone_set_warning(uma_zone_t zone, const char *warning) { ZONE_LOCK(zone); zone->uz_warning = warning; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction) { ZONE_LOCK(zone); TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone); ZONE_UNLOCK(zone); } /* See uma.h */ int uma_zone_get_cur(uma_zone_t zone) { int64_t nitems; u_int i; ZONE_LOCK(zone); nitems = zone->uz_allocs - zone->uz_frees; CPU_FOREACH(i) { /* * See the comment in sysctl_vm_zone_stats() regarding the * safety of accessing the per-cpu caches. With the zone lock * held, it is safe, but can potentially result in stale data. */ nitems += zone->uz_cpu[i].uc_allocs - zone->uz_cpu[i].uc_frees; } ZONE_UNLOCK(zone); return (nitems < 0 ? 0 : nitems); } /* See uma.h */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit) { uma_keg_t keg; keg = zone_first_keg(zone); KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type")); KEG_LOCK(keg); KASSERT(keg->uk_pages == 0, ("uma_zone_set_init on non-empty keg")); keg->uk_init = uminit; KEG_UNLOCK(keg); } /* See uma.h */ void uma_zone_set_fini(uma_zone_t zone, uma_fini fini) { uma_keg_t keg; keg = zone_first_keg(zone); KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type")); KEG_LOCK(keg); KASSERT(keg->uk_pages == 0, ("uma_zone_set_fini on non-empty keg")); keg->uk_fini = fini; KEG_UNLOCK(keg); } /* See uma.h */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) { ZONE_LOCK(zone); KASSERT(zone_first_keg(zone)->uk_pages == 0, ("uma_zone_set_zinit on non-empty keg")); zone->uz_init = zinit; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) { ZONE_LOCK(zone); KASSERT(zone_first_keg(zone)->uk_pages == 0, ("uma_zone_set_zfini on non-empty keg")); zone->uz_fini = zfini; ZONE_UNLOCK(zone); } /* See uma.h */ /* XXX uk_freef is not actually used with the zone locked */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef) { uma_keg_t keg; keg = zone_first_keg(zone); KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type")); KEG_LOCK(keg); keg->uk_freef = freef; KEG_UNLOCK(keg); } /* See uma.h */ /* XXX uk_allocf is not actually used with the zone locked */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { uma_keg_t keg; keg = zone_first_keg(zone); KEG_LOCK(keg); keg->uk_allocf = allocf; KEG_UNLOCK(keg); } /* See uma.h */ void +uma_zone_set_domain_selector(uma_zone_t zone, struct vm_domain_iterator *policy) +{ + + ZONE_LOCK(zone); + zone->uz_sel = policy; + ZONE_UNLOCK(zone); +} + +/* See uma.h */ +void uma_zone_reserve(uma_zone_t zone, int items) { uma_keg_t keg; keg = zone_first_keg(zone); if (keg == NULL) return; KEG_LOCK(keg); keg->uk_reserve = items; KEG_UNLOCK(keg); return; } /* See uma.h */ int uma_zone_reserve_kva(uma_zone_t zone, int count) { uma_keg_t keg; vm_offset_t kva; u_int pages; keg = zone_first_keg(zone); if (keg == NULL) return (0); pages = count / keg->uk_ipers; if (pages * keg->uk_ipers < count) pages++; pages *= keg->uk_ppera; #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera > 1) { #else if (1) { #endif kva = kva_alloc((vm_size_t)pages * PAGE_SIZE); if (kva == 0) return (0); } else kva = 0; KEG_LOCK(keg); keg->uk_kva = kva; keg->uk_offset = 0; keg->uk_maxpages = pages; #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc; #else keg->uk_allocf = noobj_alloc; #endif keg->uk_flags |= UMA_ZONE_NOFREE; KEG_UNLOCK(keg); return (1); } /* See uma.h */ void uma_prealloc(uma_zone_t zone, int items) { - int slabs; + uma_domain_t dom; uma_slab_t slab; uma_keg_t keg; + int domain, slabs; keg = zone_first_keg(zone); if (keg == NULL) return; KEG_LOCK(keg); slabs = items / keg->uk_ipers; + domain = 0; if (slabs * keg->uk_ipers < items) slabs++; while (slabs > 0) { - slab = keg_alloc_slab(keg, zone, M_WAITOK); + slab = keg_alloc_slab(keg, zone, domain, M_WAITOK); if (slab == NULL) break; MPASS(slab->us_keg == keg); - LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); + dom = &keg->uk_domain[slab->us_domain]; + LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); slabs--; + domain = (domain + 1) % vm_ndomains; } KEG_UNLOCK(keg); } /* See uma.h */ static void uma_reclaim_locked(bool kmem_danger) { CTR0(KTR_UMA, "UMA: vm asked us to release pages!"); sx_assert(&uma_drain_lock, SA_XLOCKED); bucket_enable(); zone_foreach(zone_drain); if (vm_page_count_min() || kmem_danger) { cache_drain_safe(NULL); zone_foreach(zone_drain); } /* * Some slabs may have been freed but this zone will be visited early * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ zone_drain(slabzone); bucket_zone_drain(); } void uma_reclaim(void) { sx_xlock(&uma_drain_lock); uma_reclaim_locked(false); sx_xunlock(&uma_drain_lock); } static int uma_reclaim_needed; void uma_reclaim_wakeup(void) { uma_reclaim_needed = 1; wakeup(&uma_reclaim_needed); } void uma_reclaim_worker(void *arg __unused) { sx_xlock(&uma_drain_lock); for (;;) { sx_sleep(&uma_reclaim_needed, &uma_drain_lock, PVM, "umarcl", 0); if (uma_reclaim_needed) { uma_reclaim_needed = 0; sx_xunlock(&uma_drain_lock); EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM); sx_xlock(&uma_drain_lock); uma_reclaim_locked(true); } } } /* See uma.h */ int uma_zone_exhausted(uma_zone_t zone) { int full; ZONE_LOCK(zone); full = (zone->uz_flags & UMA_ZFLAG_FULL); ZONE_UNLOCK(zone); return (full); } int uma_zone_exhausted_nolock(uma_zone_t zone) { return (zone->uz_flags & UMA_ZFLAG_FULL); } void * uma_large_malloc(vm_size_t size, int wait) { + static unsigned int large_domain; void *mem; uma_slab_t slab; + int domain; uint8_t flags; - slab = zone_alloc_item(slabzone, NULL, wait); + slab = zone_alloc_item(slabzone, NULL, UMA_ANYDOMAIN, wait); if (slab == NULL) return (NULL); - mem = page_alloc(NULL, size, &flags, wait); + domain = atomic_fetchadd_int(&large_domain, 1) % vm_ndomains; + mem = page_alloc(NULL, size, domain, &flags, wait); if (mem) { vsetslab((vm_offset_t)mem, slab); slab->us_data = mem; slab->us_flags = flags | UMA_SLAB_MALLOC; slab->us_size = size; } else { zone_free_item(slabzone, slab, NULL, SKIP_NONE); } return (mem); } void uma_large_free(uma_slab_t slab) { page_free(slab->us_data, slab->us_size, slab->us_flags); zone_free_item(slabzone, slab, NULL, SKIP_NONE); } static void uma_zero_item(void *item, uma_zone_t zone) { int i; if (zone->uz_flags & UMA_ZONE_PCPU) { CPU_FOREACH(i) bzero(zpcpu_get_cpu(item, i), zone->uz_size); } else bzero(item, zone->uz_size); } void uma_print_stats(void) { zone_foreach(uma_print_zone); } static void slab_print(uma_slab_t slab) { printf("slab: keg %p, data %p, freecount %d\n", slab->us_keg, slab->us_data, slab->us_freecount); } static void cache_print(uma_cache_t cache) { printf("alloc: %p(%d), free: %p(%d)\n", cache->uc_allocbucket, cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0, cache->uc_freebucket, cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0); } static void uma_print_keg(uma_keg_t keg) { + uma_domain_t dom; uma_slab_t slab; + int i; printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d " "out %d free %d limit %d\n", keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags, keg->uk_ipers, keg->uk_ppera, (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free, keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers); - printf("Part slabs:\n"); - LIST_FOREACH(slab, &keg->uk_part_slab, us_link) - slab_print(slab); - printf("Free slabs:\n"); - LIST_FOREACH(slab, &keg->uk_free_slab, us_link) - slab_print(slab); - printf("Full slabs:\n"); - LIST_FOREACH(slab, &keg->uk_full_slab, us_link) - slab_print(slab); + for (i = 0; i < vm_ndomains; i++) { + dom = &keg->uk_domain[i]; + printf("Part slabs:\n"); + LIST_FOREACH(slab, &dom->ud_part_slab, us_link) + slab_print(slab); + printf("Free slabs:\n"); + LIST_FOREACH(slab, &dom->ud_free_slab, us_link) + slab_print(slab); + printf("Full slabs:\n"); + LIST_FOREACH(slab, &dom->ud_full_slab, us_link) + slab_print(slab); + } } void uma_print_zone(uma_zone_t zone) { uma_cache_t cache; uma_klink_t kl; int i; printf("zone: %s(%p) size %d flags %#x\n", zone->uz_name, zone, zone->uz_size, zone->uz_flags); LIST_FOREACH(kl, &zone->uz_kegs, kl_link) uma_print_keg(kl->kl_keg); CPU_FOREACH(i) { cache = &zone->uz_cpu[i]; printf("CPU %d Cache:\n", i); cache_print(cache); } } #ifdef DDB /* * Generate statistics across both the zone and its per-cpu cache's. Return * desired statistics if the pointer is non-NULL for that statistic. * * Note: does not update the zone statistics, as it can't safely clear the * per-CPU cache statistic. * * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't * safe from off-CPU; we should modify the caches to track this information * directly so that we don't have to. */ static void uma_zone_sumstat(uma_zone_t z, int *cachefreep, uint64_t *allocsp, uint64_t *freesp, uint64_t *sleepsp) { uma_cache_t cache; uint64_t allocs, frees, sleeps; int cachefree, cpu; allocs = frees = sleeps = 0; cachefree = 0; CPU_FOREACH(cpu) { cache = &z->uz_cpu[cpu]; if (cache->uc_allocbucket != NULL) cachefree += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) cachefree += cache->uc_freebucket->ub_cnt; allocs += cache->uc_allocs; frees += cache->uc_frees; } allocs += z->uz_allocs; frees += z->uz_frees; sleeps += z->uz_sleeps; if (cachefreep != NULL) *cachefreep = cachefree; if (allocsp != NULL) *allocsp = allocs; if (freesp != NULL) *freesp = frees; if (sleepsp != NULL) *sleepsp = sleeps; } #endif /* DDB */ static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) { uma_keg_t kz; uma_zone_t z; int count; count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } rw_runlock(&uma_rwlock); return (sysctl_handle_int(oidp, &count, 0, req)); } static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) { struct uma_stream_header ush; struct uma_type_header uth; struct uma_percpu_stat ups; uma_bucket_t bucket; + uma_zone_domain_t zdom; struct sbuf sbuf; uma_cache_t cache; uma_klink_t kl; uma_keg_t kz; uma_zone_t z; uma_keg_t k; int count, error, i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } /* * Insert stream header. */ bzero(&ush, sizeof(ush)); ush.ush_version = UMA_STREAM_VERSION; ush.ush_maxcpus = (mp_maxid + 1); ush.ush_count = count; (void)sbuf_bcat(&sbuf, &ush, sizeof(ush)); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { bzero(&uth, sizeof(uth)); ZONE_LOCK(z); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_align = kz->uk_align; uth.uth_size = kz->uk_size; uth.uth_rsize = kz->uk_rsize; LIST_FOREACH(kl, &z->uz_kegs, kl_link) { k = kl->kl_keg; uth.uth_maxpages += k->uk_maxpages; uth.uth_pages += k->uk_pages; uth.uth_keg_free += k->uk_free; uth.uth_limit = (k->uk_maxpages / k->uk_ppera) * k->uk_ipers; } /* * A zone is secondary is it is not the first entry * on the keg's zone list. */ if ((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z)) uth.uth_zone_flags = UTH_ZONE_SECONDARY; - LIST_FOREACH(bucket, &z->uz_buckets, ub_link) - uth.uth_zone_free += bucket->ub_cnt; + for (i = 0; i < vm_ndomains; i++) { + zdom = &z->uz_domain[i]; + LIST_FOREACH(bucket, &zdom->uzd_buckets, + ub_link) + uth.uth_zone_free += bucket->ub_cnt; + } uth.uth_allocs = z->uz_allocs; uth.uth_frees = z->uz_frees; uth.uth_fails = z->uz_fails; uth.uth_sleeps = z->uz_sleeps; (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); /* * While it is not normally safe to access the cache * bucket pointers while not on the CPU that owns the * cache, we only allow the pointers to be exchanged * without the zone lock held, not invalidated, so * accept the possible race associated with bucket * exchange during monitoring. */ for (i = 0; i < (mp_maxid + 1); i++) { bzero(&ups, sizeof(ups)); if (kz->uk_flags & UMA_ZFLAG_INTERNAL) goto skip; if (CPU_ABSENT(i)) goto skip; cache = &z->uz_cpu[i]; if (cache->uc_allocbucket != NULL) ups.ups_cache_free += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) ups.ups_cache_free += cache->uc_freebucket->ub_cnt; ups.ups_allocs = cache->uc_allocs; ups.ups_frees = cache->uc_frees; skip: (void)sbuf_bcat(&sbuf, &ups, sizeof(ups)); } ZONE_UNLOCK(z); } } rw_runlock(&uma_rwlock); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = *(uma_zone_t *)arg1; int error, max; max = uma_zone_get_max(zone); error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); uma_zone_set_max(zone, max); return (0); } int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = *(uma_zone_t *)arg1; int cur; cur = uma_zone_get_cur(zone); return (sysctl_handle_int(oidp, &cur, 0, req)); } #ifdef INVARIANTS static uma_slab_t uma_dbg_getslab(uma_zone_t zone, void *item) { uma_slab_t slab; uma_keg_t keg; uint8_t *mem; mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); if (zone->uz_flags & UMA_ZONE_VTOSLAB) { slab = vtoslab((vm_offset_t)mem); } else { /* * It is safe to return the slab here even though the * zone is unlocked because the item's allocation state * essentially holds a reference. */ ZONE_LOCK(zone); keg = LIST_FIRST(&zone->uz_kegs)->kl_keg; if (keg->uk_flags & UMA_ZONE_HASH) slab = hash_sfind(&keg->uk_hash, mem); else slab = (uma_slab_t)(mem + keg->uk_pgoff); ZONE_UNLOCK(zone); } return (slab); } /* * Set up the slab's freei data such that uma_dbg_free can function. * */ static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; int freei; if (zone_first_keg(zone) == NULL) return; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) panic("uma: item %p did not belong to zone %s\n", item, zone->uz_name); } keg = slab->us_keg; freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize; if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree)) panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n", item, zone, zone->uz_name, slab, freei); BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree); return; } /* * Verifies freed addresses. Checks for alignment, valid slab membership * and duplicate frees. * */ static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; int freei; if (zone_first_keg(zone) == NULL) return; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) panic("uma: Freed item %p did not belong to zone %s\n", item, zone->uz_name); } keg = slab->us_keg; freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize; if (freei >= keg->uk_ipers) panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n", item, zone, zone->uz_name, slab, freei); if (((freei * keg->uk_rsize) + slab->us_data) != item) panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n", item, zone, zone->uz_name, slab, freei); if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree)) panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n", item, zone, zone->uz_name, slab, freei); BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree); } #endif /* INVARIANTS */ #ifdef DDB DB_SHOW_COMMAND(uma, db_show_uma) { - uint64_t allocs, frees, sleeps; uma_bucket_t bucket; uma_keg_t kz; uma_zone_t z; - int cachefree; + uma_zone_domain_t zdom; + uint64_t allocs, frees, sleeps; + int cachefree, i; db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used", "Free", "Requests", "Sleeps", "Bucket"); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { allocs = z->uz_allocs; frees = z->uz_frees; sleeps = z->uz_sleeps; cachefree = 0; } else uma_zone_sumstat(z, &cachefree, &allocs, &frees, &sleeps); if (!((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z))) cachefree += kz->uk_free; - LIST_FOREACH(bucket, &z->uz_buckets, ub_link) - cachefree += bucket->ub_cnt; + for (i = 0; i < vm_ndomains; i++) { + zdom = &z->uz_domain[i]; + LIST_FOREACH(bucket, &zdom->uzd_buckets, + ub_link) + cachefree += bucket->ub_cnt; + } db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n", z->uz_name, (uintmax_t)kz->uk_size, (intmax_t)(allocs - frees), cachefree, (uintmax_t)allocs, sleeps, z->uz_count); if (db_pager_quit) return; } } } DB_SHOW_COMMAND(umacache, db_show_umacache) { - uint64_t allocs, frees; uma_bucket_t bucket; uma_zone_t z; - int cachefree; + uma_zone_domain_t zdom; + uint64_t allocs, frees; + int cachefree, i; db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free", "Requests", "Bucket"); LIST_FOREACH(z, &uma_cachezones, uz_link) { uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL); - LIST_FOREACH(bucket, &z->uz_buckets, ub_link) - cachefree += bucket->ub_cnt; + for (i = 0; i < vm_ndomains; i++) { + zdom = &z->uz_domain[i]; + LIST_FOREACH(bucket, &zdom->uzd_buckets, ub_link) + cachefree += bucket->ub_cnt; + } db_printf("%18s %8ju %8jd %8d %12ju %8u\n", z->uz_name, (uintmax_t)z->uz_size, (intmax_t)(allocs - frees), cachefree, (uintmax_t)allocs, z->uz_count); if (db_pager_quit) return; } } #endif /* DDB */ Index: projects/numa2/sys/vm/uma_int.h =================================================================== --- projects/numa2/sys/vm/uma_int.h (revision 321505) +++ projects/numa2/sys/vm/uma_int.h (revision 321506) @@ -1,427 +1,473 @@ /*- * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ #include /* * This file includes definitions, structures, prototypes, and inlines that * should not be used outside of the actual implementation of UMA. */ /* - * Here's a quick description of the relationship between the objects: + * The brief summary; Zones describe unique allocation types. Zones are + * organized into per-CPU caches which are filled by buckets. Buckets are + * organized according to memory domains. Buckets are filled from kegs which + * are also organized according to memory domains. Kegs describe a unique + * allocation type, backend memory provider, and layout. Kegs are associated + * with one or more zones and zones reference one or more kegs. Kegs provide + * slabs which are virtually contiguous collections of pages. Each slab is + * broken down int one or more items that will satisfy an individual allocation. * + * Allocation is satisfied in the following order: + * 1) Per-CPU cache + * 2) Per-domain cache of buckets + * 3) Slab from any of N kegs + * 4) Backend page provider + * + * More detail on individual objects is contained below: + * * Kegs contain lists of slabs which are stored in either the full bin, empty * bin, or partially allocated bin, to reduce fragmentation. They also contain * the user supplied value for size, which is adjusted for alignment purposes * and rsize is the result of that. The Keg also stores information for * managing a hash of page addresses that maps pages to uma_slab_t structures * for pages that don't have embedded uma_slab_t's. + * + * Keg slab lists are organized by memory domain to support NUMA allocation + * policies. By default allocations are spread across domains to reduce the + * potential for hotspots. Special keg creation flags may be specified to + * prefer location allocation. However there is no strict enforcement as frees + * may happen on any CPU and these are returned to the CPU-local cache + * regardless of the originating domain. * * The uma_slab_t may be embedded in a UMA_SLAB_SIZE chunk of memory or it may * be allocated off the page from a special slab zone. The free list within a * slab is managed with a bitmask. For item sizes that would yield more than * 10% memory waste we potentially allocate a separate uma_slab_t if this will * improve the number of items per slab that will fit. * * The only really gross cases, with regards to memory waste, are for those * items that are just over half the page size. You can get nearly 50% waste, * so you fall back to the memory footprint of the power of two allocator. I * have looked at memory allocation sizes on many of the machines available to * me, and there does not seem to be an abundance of allocations at this range * so at this time it may not make sense to optimize for it. This can, of * course, be solved with dynamic slab sizes. * * Kegs may serve multiple Zones but by far most of the time they only serve * one. When a Zone is created, a Keg is allocated and setup for it. While * the backing Keg stores slabs, the Zone caches Buckets of items allocated * from the slabs. Each Zone is equipped with an init/fini and ctor/dtor * pair, as well as with its own set of small per-CPU caches, layered above * the Zone's general Bucket cache. * * The PCPU caches are protected by critical sections, and may be accessed * safely only from their associated CPU, while the Zones backed by the same * Keg all share a common Keg lock (to coalesce contention on the backing * slabs). The backing Keg typically only serves one Zone but in the case of * multiple Zones, one of the Zones is considered the Master Zone and all * Zone-related stats from the Keg are done in the Master Zone. For an * example of a Multi-Zone setup, refer to the Mbuf allocation code. */ /* * This is the representation for normal (Non OFFPAGE slab) * * i == item * s == slab pointer * * <---------------- Page (UMA_SLAB_SIZE) ------------------> * ___________________________________________________________ * | _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ___________ | * ||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i| |slab header|| * ||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_| |___________|| * |___________________________________________________________| * * * This is an OFFPAGE slab. These can be larger than UMA_SLAB_SIZE. * * ___________________________________________________________ * | _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ | * ||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i||i| | * ||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_||_| | * |___________________________________________________________| * ___________ ^ * |slab header| | * |___________|---* * */ #ifndef VM_UMA_INT_H #define VM_UMA_INT_H #define UMA_SLAB_SIZE PAGE_SIZE /* How big are our slabs? */ #define UMA_SLAB_MASK (PAGE_SIZE - 1) /* Mask to get back to the page */ #define UMA_SLAB_SHIFT PAGE_SHIFT /* Number of bits PAGE_MASK */ #define UMA_BOOT_PAGES 64 /* Pages allocated for startup */ #define UMA_BOOT_PAGES_ZONES 32 /* Multiplier for pages to reserve */ /* if uma_zone > PAGE_SIZE */ /* Max waste percentage before going to off page slab management */ #define UMA_MAX_WASTE 10 /* * I doubt there will be many cases where this is exceeded. This is the initial * size of the hash table for uma_slabs that are managed off page. This hash * does expand by powers of two. Currently it doesn't get smaller. */ #define UMA_HASH_SIZE_INIT 32 /* * I should investigate other hashing algorithms. This should yield a low * number of collisions if the pages are relatively contiguous. */ #define UMA_HASH(h, s) ((((uintptr_t)s) >> UMA_SLAB_SHIFT) & (h)->uh_hashmask) #define UMA_HASH_INSERT(h, s, mem) \ SLIST_INSERT_HEAD(&(h)->uh_slab_hash[UMA_HASH((h), \ (mem))], (s), us_hlink) #define UMA_HASH_REMOVE(h, s, mem) \ SLIST_REMOVE(&(h)->uh_slab_hash[UMA_HASH((h), \ (mem))], (s), uma_slab, us_hlink) /* Hash table for freed address -> slab translation */ SLIST_HEAD(slabhead, uma_slab); struct uma_hash { struct slabhead *uh_slab_hash; /* Hash table for slabs */ int uh_hashsize; /* Current size of the hash table */ int uh_hashmask; /* Mask used during hashing */ }; /* * align field or structure to cache line */ #if defined(__amd64__) #define UMA_ALIGN __aligned(CACHE_LINE_SIZE) #else #define UMA_ALIGN #endif /* * Structures for per cpu queues. */ struct uma_bucket { LIST_ENTRY(uma_bucket) ub_link; /* Link into the zone */ int16_t ub_cnt; /* Count of free items. */ int16_t ub_entries; /* Max items. */ void *ub_bucket[]; /* actual allocation storage */ }; typedef struct uma_bucket * uma_bucket_t; struct uma_cache { uma_bucket_t uc_freebucket; /* Bucket we're freeing to */ uma_bucket_t uc_allocbucket; /* Bucket to allocate from */ uint64_t uc_allocs; /* Count of allocations */ uint64_t uc_frees; /* Count of frees */ } UMA_ALIGN; typedef struct uma_cache * uma_cache_t; /* + * Per-domain memory list. Embedded in the kegs. + */ +struct uma_domain { + LIST_HEAD(,uma_slab) ud_part_slab; /* partially allocated slabs */ + LIST_HEAD(,uma_slab) ud_free_slab; /* empty slab list */ + LIST_HEAD(,uma_slab) ud_full_slab; /* full slabs */ +}; + +typedef struct uma_domain * uma_domain_t; + +/* * Keg management structure * * TODO: Optimize for cache line size * */ struct uma_keg { struct mtx_padalign uk_lock; /* Lock for the keg */ struct uma_hash uk_hash; LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */ - LIST_HEAD(,uma_slab) uk_part_slab; /* partially allocated slabs */ - LIST_HEAD(,uma_slab) uk_free_slab; /* empty slab list */ - LIST_HEAD(,uma_slab) uk_full_slab; /* full slabs */ + struct uma_domain uk_domain[MAXMEMDOM]; /* Keg's slab lists. */ + uint32_t uk_cursor; /* Domain alloc cursor. */ uint32_t uk_align; /* Alignment mask */ uint32_t uk_pages; /* Total page count */ uint32_t uk_free; /* Count of items free in slabs */ uint32_t uk_reserve; /* Number of reserved items. */ uint32_t uk_size; /* Requested size of each item */ uint32_t uk_rsize; /* Real size of each item */ uint32_t uk_maxpages; /* Maximum number of pages to alloc */ uma_init uk_init; /* Keg's init routine */ uma_fini uk_fini; /* Keg's fini routine */ uma_alloc uk_allocf; /* Allocation function */ uma_free uk_freef; /* Free routine */ u_long uk_offset; /* Next free offset from base KVA */ vm_offset_t uk_kva; /* Zone base KVA */ uma_zone_t uk_slabzone; /* Slab zone backing us, if OFFPAGE */ uint16_t uk_pgoff; /* Offset to uma_slab struct */ uint16_t uk_ppera; /* pages per allocation from backend */ uint16_t uk_ipers; /* Items per slab */ uint32_t uk_flags; /* Internal flags */ /* Least used fields go to the last cache line. */ const char *uk_name; /* Name of creating zone. */ LIST_ENTRY(uma_keg) uk_link; /* List of all kegs */ }; typedef struct uma_keg * uma_keg_t; /* * Free bits per-slab. */ #define SLAB_SETSIZE (PAGE_SIZE / UMA_SMALLEST_UNIT) BITSET_DEFINE(slabbits, SLAB_SETSIZE); /* * The slab structure manages a single contiguous allocation from backing * store and subdivides it into individually allocatable items. */ struct uma_slab { uma_keg_t us_keg; /* Keg we live in */ union { LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */ unsigned long _us_size; /* Size of allocation */ } us_type; SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */ uint8_t *us_data; /* First item */ struct slabbits us_free; /* Free bitmask. */ #ifdef INVARIANTS struct slabbits us_debugfree; /* Debug bitmask. */ #endif uint16_t us_freecount; /* How many are free? */ uint8_t us_flags; /* Page flags see uma.h */ - uint8_t us_pad; /* Pad to 32bits, unused. */ + uint8_t us_domain; /* Backing NUMA domain. */ }; #define us_link us_type._us_link #define us_size us_type._us_size +#if MAXMEMDOM >= 255 +#error "Slab domain type insufficient" +#endif + +#define UMA_ANYDOMAIN -1 + typedef struct uma_slab * uma_slab_t; -typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int); +typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int, int); struct uma_klink { LIST_ENTRY(uma_klink) kl_link; uma_keg_t kl_keg; }; typedef struct uma_klink *uma_klink_t; +struct uma_zone_domain { + LIST_HEAD(,uma_bucket) uzd_buckets; /* full buckets */ +}; + +typedef struct uma_zone_domain * uma_zone_domain_t; + /* * Zone management structure * * TODO: Optimize for cache line size * */ struct uma_zone { struct mtx_padalign uz_lock; /* Lock for the zone */ struct mtx_padalign *uz_lockptr; const char *uz_name; /* Text name of the zone */ LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */ - LIST_HEAD(,uma_bucket) uz_buckets; /* full buckets */ + struct uma_zone_domain uz_domain[MAXMEMDOM]; /* per-domain buckets */ LIST_HEAD(,uma_klink) uz_kegs; /* List of kegs. */ struct uma_klink uz_klink; /* klink for first keg. */ uma_slaballoc uz_slab; /* Allocate a slab from the backend. */ uma_ctor uz_ctor; /* Constructor for each allocation */ uma_dtor uz_dtor; /* Destructor */ uma_init uz_init; /* Initializer for each item */ uma_fini uz_fini; /* Finalizer for each item. */ uma_import uz_import; /* Import new memory to cache. */ uma_release uz_release; /* Release memory from cache. */ void *uz_arg; /* Import/release argument. */ uint32_t uz_flags; /* Flags inherited from kegs */ uint32_t uz_size; /* Size inherited from kegs */ volatile u_long uz_allocs UMA_ALIGN; /* Total number of allocations */ volatile u_long uz_fails; /* Total number of alloc failures */ volatile u_long uz_frees; /* Total number of frees */ uint64_t uz_sleeps; /* Total number of alloc sleeps */ uint16_t uz_count; /* Amount of items in full bucket */ uint16_t uz_count_min; /* Minimal amount of items there */ + struct vm_domain_iterator *uz_sel; /* Domain selector. */ + /* The next two fields are used to print a rate-limited warnings. */ const char *uz_warning; /* Warning to print on failure */ struct timeval uz_ratecheck; /* Warnings rate-limiting */ struct task uz_maxaction; /* Task to run when at limit */ /* * This HAS to be the last item because we adjust the zone size * based on NCPU and then allocate the space for the zones. */ struct uma_cache uz_cpu[1]; /* Per cpu caches */ }; /* * These flags must not overlap with the UMA_ZONE flags specified in uma.h. */ #define UMA_ZFLAG_MULTI 0x04000000 /* Multiple kegs in the zone. */ #define UMA_ZFLAG_DRAINING 0x08000000 /* Running zone_drain. */ #define UMA_ZFLAG_BUCKET 0x10000000 /* Bucket zone. */ #define UMA_ZFLAG_INTERNAL 0x20000000 /* No offpage no PCPU. */ #define UMA_ZFLAG_FULL 0x40000000 /* Reached uz_maxpages */ #define UMA_ZFLAG_CACHEONLY 0x80000000 /* Don't ask VM for buckets. */ #define UMA_ZFLAG_INHERIT \ (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY | UMA_ZFLAG_BUCKET) static inline uma_keg_t zone_first_keg(uma_zone_t zone) { uma_klink_t klink; klink = LIST_FIRST(&zone->uz_kegs); return (klink != NULL) ? klink->kl_keg : NULL; } #undef UMA_ALIGN #ifdef _KERNEL /* Internal prototypes */ static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data); void *uma_large_malloc(vm_size_t size, int wait); void uma_large_free(uma_slab_t slab); /* Lock Macros */ #define KEG_LOCK_INIT(k, lc) \ do { \ if ((lc)) \ mtx_init(&(k)->uk_lock, (k)->uk_name, \ (k)->uk_name, MTX_DEF | MTX_DUPOK); \ else \ mtx_init(&(k)->uk_lock, (k)->uk_name, \ "UMA zone", MTX_DEF | MTX_DUPOK); \ } while (0) #define KEG_LOCK_FINI(k) mtx_destroy(&(k)->uk_lock) #define KEG_LOCK(k) mtx_lock(&(k)->uk_lock) #define KEG_UNLOCK(k) mtx_unlock(&(k)->uk_lock) #define ZONE_LOCK_INIT(z, lc) \ do { \ if ((lc)) \ mtx_init(&(z)->uz_lock, (z)->uz_name, \ (z)->uz_name, MTX_DEF | MTX_DUPOK); \ else \ mtx_init(&(z)->uz_lock, (z)->uz_name, \ "UMA zone", MTX_DEF | MTX_DUPOK); \ } while (0) #define ZONE_LOCK(z) mtx_lock((z)->uz_lockptr) #define ZONE_TRYLOCK(z) mtx_trylock((z)->uz_lockptr) #define ZONE_UNLOCK(z) mtx_unlock((z)->uz_lockptr) #define ZONE_LOCK_FINI(z) mtx_destroy(&(z)->uz_lock) /* * Find a slab within a hash table. This is used for OFFPAGE zones to lookup * the slab structure. * * Arguments: * hash The hash table to search. * data The base page of the item. * * Returns: * A pointer to a slab if successful, else NULL. */ static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data) { uma_slab_t slab; int hval; hval = UMA_HASH(hash, data); SLIST_FOREACH(slab, &hash->uh_slab_hash[hval], us_hlink) { if ((uint8_t *)slab->us_data == data) return (slab); } return (NULL); } static __inline uma_slab_t vtoslab(vm_offset_t va) { vm_page_t p; p = PHYS_TO_VM_PAGE(pmap_kextract(va)); return ((uma_slab_t)p->plinks.s.pv); } static __inline void vsetslab(vm_offset_t va, uma_slab_t slab) { vm_page_t p; p = PHYS_TO_VM_PAGE(pmap_kextract(va)); p->plinks.s.pv = slab; } /* * The following two functions may be defined by architecture specific code * if they can provide more efficient allocation functions. This is useful * for using direct mapped addresses. */ -void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, - int wait); +void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, + uint8_t *pflag, int wait); void uma_small_free(void *mem, vm_size_t size, uint8_t flags); #endif /* _KERNEL */ #endif /* VM_UMA_INT_H */ Index: projects/numa2/sys/vm/vm_domain.c =================================================================== --- projects/numa2/sys/vm/vm_domain.c (revision 321505) +++ projects/numa2/sys/vm/vm_domain.c (revision 321506) @@ -1,400 +1,398 @@ /*- * Copyright (c) 2015 Adrian Chadd . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_ddb.h" #include #include #include #include #include #include #ifdef VM_NUMA_ALLOC #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#if 0 #ifdef VM_NUMA_ALLOC static __inline int vm_domain_rr_selectdomain(int skip_domain) { struct thread *td; td = curthread; td->td_dom_rr_idx++; td->td_dom_rr_idx %= vm_ndomains; /* * If skip_domain is provided then skip over that * domain. This is intended for round robin variants * which first try a fixed domain. */ if ((skip_domain > -1) && (td->td_dom_rr_idx == skip_domain)) { td->td_dom_rr_idx++; td->td_dom_rr_idx %= vm_ndomains; } return (td->td_dom_rr_idx); } #endif +#endif /* * This implements a very simple set of VM domain memory allocation * policies and iterators. */ /* * A VM domain policy represents a desired VM domain policy. * Iterators implement searching through VM domains in a specific * order. */ /* * When setting a policy, the caller must establish their own * exclusive write protection for the contents of the domain * policy. */ int vm_domain_policy_init(struct vm_domain_policy *vp) { bzero(vp, sizeof(*vp)); vp->p.policy = VM_POLICY_NONE; vp->p.domain = -1; return (0); } int vm_domain_policy_set(struct vm_domain_policy *vp, vm_domain_policy_type_t vt, int domain) { seq_write_begin(&vp->seq); vp->p.policy = vt; vp->p.domain = domain; seq_write_end(&vp->seq); return (0); } /* * Take a local copy of a policy. * * The destination policy isn't write-barriered; this is used * for doing local copies into something that isn't shared. */ void vm_domain_policy_localcopy(struct vm_domain_policy *dst, const struct vm_domain_policy *src) { seq_t seq; for (;;) { seq = seq_read(&src->seq); *dst = *src; if (seq_consistent(&src->seq, seq)) return; } } /* * Take a write-barrier copy of a policy. * * The destination policy is write -barriered; this is used * for doing copies into policies that may be read by other * threads. */ void vm_domain_policy_copy(struct vm_domain_policy *dst, const struct vm_domain_policy *src) { seq_t seq; struct vm_domain_policy d; for (;;) { seq = seq_read(&src->seq); d = *src; if (seq_consistent(&src->seq, seq)) { seq_write_begin(&dst->seq); dst->p.domain = d.p.domain; dst->p.policy = d.p.policy; seq_write_end(&dst->seq); return; } } } int vm_domain_policy_validate(const struct vm_domain_policy *vp) { switch (vp->p.policy) { case VM_POLICY_NONE: case VM_POLICY_ROUND_ROBIN: case VM_POLICY_FIRST_TOUCH: case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: if (vp->p.domain == -1) return (0); return (-1); case VM_POLICY_FIXED_DOMAIN: case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: #ifdef VM_NUMA_ALLOC if (vp->p.domain >= 0 && vp->p.domain < vm_ndomains) return (0); #else if (vp->p.domain == 0) return (0); #endif return (-1); default: return (-1); } return (-1); } int vm_domain_policy_cleanup(struct vm_domain_policy *vp) { /* For now, empty */ return (0); } int vm_domain_iterator_init(struct vm_domain_iterator *vi) { /* Nothing to do for now */ return (0); } +#if 0 /* * Manually setup an iterator with the given details. */ int vm_domain_iterator_set(struct vm_domain_iterator *vi, vm_domain_policy_type_t vt, int domain) { #ifdef VM_NUMA_ALLOC switch (vt) { case VM_POLICY_FIXED_DOMAIN: vi->policy = VM_POLICY_FIXED_DOMAIN; vi->domain = domain; vi->n = 1; break; case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: vi->policy = VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN; vi->domain = domain; vi->n = vm_ndomains; break; case VM_POLICY_FIRST_TOUCH: vi->policy = VM_POLICY_FIRST_TOUCH; vi->domain = PCPU_GET(domain); vi->n = 1; break; case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: vi->policy = VM_POLICY_FIRST_TOUCH_ROUND_ROBIN; vi->domain = PCPU_GET(domain); vi->n = vm_ndomains; break; case VM_POLICY_ROUND_ROBIN: default: vi->policy = VM_POLICY_ROUND_ROBIN; vi->domain = -1; vi->n = vm_ndomains; break; } #else vi->domain = 0; vi->n = 1; #endif return (0); } +#endif /* * Setup an iterator based on the given policy. */ static inline void _vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, const struct vm_domain_policy *vt) { + #ifdef VM_NUMA_ALLOC /* * Initialise the iterator. * * For first-touch, the initial domain is set * via the current thread CPU domain. * * For fixed-domain, it's assumed that the * caller has initialised the specific domain * it is after. */ switch (vt->p.policy) { case VM_POLICY_FIXED_DOMAIN: - vi->policy = vt->p.policy; - vi->domain = vt->p.domain; - vi->n = 1; - break; case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: vi->policy = vt->p.policy; + vi->cursor = -1; vi->domain = vt->p.domain; - vi->n = vm_ndomains; break; case VM_POLICY_FIRST_TOUCH: - vi->policy = vt->p.policy; - vi->domain = PCPU_GET(domain); - vi->n = 1; - break; case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: vi->policy = vt->p.policy; - vi->domain = PCPU_GET(domain); - vi->n = vm_ndomains; + vi->cursor = PCPU_GET(domain); + vi->domain = -1; break; case VM_POLICY_ROUND_ROBIN: default: /* * Default to round-robin policy. */ vi->policy = VM_POLICY_ROUND_ROBIN; - vi->domain = -1; - vi->n = vm_ndomains; + vi->cursor = vi->domain = -1; break; } #else + vi->cursor = -1; vi->domain = 0; - vi->n = 1; #endif } void vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, const struct vm_domain_policy *vt) { seq_t seq; struct vm_domain_policy vt_lcl; for (;;) { seq = seq_read(&vt->seq); vt_lcl = *vt; if (seq_consistent(&vt->seq, seq)) { _vm_domain_iterator_set_policy(vi, &vt_lcl); return; } } } +#if 0 /* * Return the next VM domain to use. * * Returns 0 w/ domain set to the next domain to use, or * -1 to indicate no more domains are available. */ int vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain) { /* General catch-all */ if (vi->n <= 0) return (-1); #ifdef VM_NUMA_ALLOC switch (vi->policy) { case VM_POLICY_FIXED_DOMAIN: case VM_POLICY_FIRST_TOUCH: *domain = vi->domain; vi->n--; break; case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: /* * XXX TODO: skip over the rr'ed domain * if it equals the one we started with. */ if (vi->n == vm_ndomains) *domain = vi->domain; else *domain = vm_domain_rr_selectdomain(vi->domain); vi->n--; break; case VM_POLICY_ROUND_ROBIN: default: *domain = vm_domain_rr_selectdomain(-1); vi->n--; break; } #else *domain = 0; vi->n--; #endif return (0); } /* * Returns 1 if the iteration is done, or 0 if it has not. * This can only be called after at least one loop through * the iterator. Ie, it's designed to be used as a tail * check of a loop, not the head check of a loop. */ int vm_domain_iterator_isdone(struct vm_domain_iterator *vi) { return (vi->n <= 0); } +#endif int vm_domain_iterator_cleanup(struct vm_domain_iterator *vi) { return (0); } Index: projects/numa2/sys/vm/vm_domain.h =================================================================== --- projects/numa2/sys/vm/vm_domain.h (revision 321505) +++ projects/numa2/sys/vm/vm_domain.h (revision 321506) @@ -1,66 +1,100 @@ /*- * Copyright (c) 2015 Adrian Chadd . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. * * $FreeBSD$ */ #ifndef __VM_DOMAIN_H__ #define __VM_DOMAIN_H__ #include -struct vm_domain_iterator { - vm_domain_policy_type_t policy; +extern int vm_ndomains; +extern struct vm_domain_policy *vm_default_policy; + +static inline int +vm_domain_select_first(struct vm_domain_iterator *vi) +{ int domain; - int n; -}; + + switch (vi->policy) { + case VM_POLICY_NONE: + domain = 0; + break; + case VM_POLICY_ROUND_ROBIN: + domain = atomic_fetchadd_int(&vi->cursor, 1) % vm_ndomains; + break; + case VM_POLICY_FIXED_DOMAIN: + case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: + domain = vi->domain; + break; + case VM_POLICY_FIRST_TOUCH: + case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: + domain = PCPU_GET(domain); + break; + } + return (domain); +} + +static inline int +vm_domain_select_next(struct vm_domain_iterator *vi, int domain) +{ + + switch (vi->policy) { + case VM_POLICY_FIXED_DOMAIN: + case VM_POLICY_FIRST_TOUCH: + return (-1); + default: + return ((domain + 1) % vm_ndomains); + } +} /* * TODO: check to see if these should just become inline functions * at some point. */ extern int vm_domain_policy_init(struct vm_domain_policy *vp); extern int vm_domain_policy_set(struct vm_domain_policy *vp, vm_domain_policy_type_t vt, int domain); extern int vm_domain_policy_cleanup(struct vm_domain_policy *vp); extern void vm_domain_policy_localcopy(struct vm_domain_policy *dst, const struct vm_domain_policy *src); extern void vm_domain_policy_copy(struct vm_domain_policy *dst, const struct vm_domain_policy *src); extern int vm_domain_policy_validate(const struct vm_domain_policy *vp); extern int vm_domain_iterator_init(struct vm_domain_iterator *vi); extern int vm_domain_iterator_set(struct vm_domain_iterator *vi, vm_domain_policy_type_t vt, int domain); extern void vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, const struct vm_domain_policy *vt); extern int vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain); extern int vm_domain_iterator_isdone(struct vm_domain_iterator *vi); extern int vm_domain_iterator_cleanup(struct vm_domain_iterator *vi); #endif /* __VM_DOMAIN_H__ */ Index: projects/numa2/sys/vm/vm_object.c =================================================================== --- projects/numa2/sys/vm/vm_object.c (revision 321505) +++ projects/numa2/sys/vm/vm_object.c (revision 321506) @@ -1,2649 +1,2659 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags); static void vm_object_qcollapse(vm_object_t object); static void vm_object_vndeallocate(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; struct vm_object kmem_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats"); static long object_collapses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, 0, "VM object collapses"); static long object_bypasses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, 0, "VM object bypasses"); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; object->ref_count = 0; vm_radix_init(&object->rtree); object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; switch (type) { case OBJT_DEAD: panic("_vm_object_allocate: can't create OBJT_DEAD"); case OBJT_DEFAULT: case OBJT_SWAP: object->flags = OBJ_ONEMAPPING; break; case OBJT_DEVICE: case OBJT_SG: object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: object->flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: object->flags = OBJ_UNMANAGED; break; case OBJT_VNODE: object->flags = 0; break; default: panic("_vm_object_allocate: type %d is undefined", type); } object->size = size; +#if MAXMEMDOM > 1 + vm_domain_iterator_set_policy(&object->selector, vm_default_policy); +#endif object->generation = 1; object->ref_count = 1; object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif rw_init(&kmem_object->lock, "kmem vm object"); _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); #if VM_NRESERVLEVEL > 0 kmem_object->flags |= OBJ_COLORED; kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_zinit(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. */ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); switch (object->type) { case OBJT_DEFAULT: case OBJT_DEVICE: case OBJT_MGTDEVICE: case OBJT_PHYS: case OBJT_SG: case OBJT_SWAP: case OBJT_VNODE: if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); break; case OBJT_DEAD: return (KERN_INVALID_ARGUMENT); default: panic("vm_object_set_memattr: object %p is of undefined type", object); } object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress += i; } void vm_object_pip_subtract(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress -= i; } void vm_object_pip_wakeup(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress--; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wakeupn(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); if (i) object->paging_in_progress -= i; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PVM, waitid, 0); } } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, object); return (object); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. */ void vm_object_reference(vm_object_t object) { if (object == NULL) return; VM_OBJECT_WLOCK(object); vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vn_printf(vp, "vm_object_vndeallocate "); panic("vm_object_vndeallocate: bad object reference count"); } #endif if (!umtx_shm_vnobj_persistent && object->ref_count == 1) umtx_shm_object_terminated(object); /* * The test for text of vp vnode does not need a bypass to * reach right VV_TEXT there, since it is obtained from * object->handle. */ if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) { object->ref_count--; VM_OBJECT_WUNLOCK(object); /* vrele may need the vnode lock. */ vrele(vp); } else { vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vdrop(vp); VM_OBJECT_WLOCK(object); object->ref_count--; if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); } else { if (object->ref_count == 0) VOP_UNSET_TEXT(vp); VM_OBJECT_WUNLOCK(object); vput(vp); } } } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; struct vnode *vp; while (object != NULL) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_VNODE) { vm_object_vndeallocate(object); return; } KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. */ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_WUNLOCK(object); return; } else if (object->ref_count == 1) { if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD || object->ref_count != 1) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); vdrop(vp); return; } if ((object->flags & OBJ_TMPFS) != 0) VOP_UNSET_TEXT(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } if (object->shadow_count == 0 && object->handle == NULL && (object->type == OBJT_DEFAULT || (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS_NODE) == 0))) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object %p", object)); if (!VM_OBJECT_TRYWLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_WUNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. */ pause("vmo_de", 1); continue; } /* * Collapse object into its shadow unless its * shadow is dead. In that case, object will * be deallocated by the thread that is * deallocating its shadow. */ if ((robject->flags & OBJ_DEAD) == 0 && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (robject->paging_in_progress) { VM_OBJECT_WUNLOCK(object); vm_object_pip_wait(robject, "objde1"); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else if (object->paging_in_progress) { VM_OBJECT_WUNLOCK(robject); object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PDROP | PVM, "objde2", 0); VM_OBJECT_WLOCK(robject); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else VM_OBJECT_WUNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); continue; } VM_OBJECT_WUNLOCK(robject); } VM_OBJECT_WUNLOCK(object); return; } doterm: umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT((object->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object 2 %p", object)); VM_OBJECT_WLOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; VM_OBJECT_WUNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) vm_object_terminate(object); else VM_OBJECT_WUNLOCK(object); object = temp; } } /* * vm_object_destroy removes the object from the global object list * and frees the space for the object. */ void vm_object_destroy(vm_object_t object) { /* * Release the allocation charge. */ if (object->cred != NULL) { swap_release_by_cred(object->charge, object->cred); object->charge = 0; crfree(object->cred); object->cred = NULL; } /* * Free the space for the object. */ uma_zfree(obj_zone, object); } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Make sure no one uses us. */ vm_object_set_flag(object, OBJ_DEAD); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!object->paging_in_progress, ("vm_object_terminate: pageout in progress")); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *)object->handle; /* * Clean pages and flush buffers. */ vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(object); vinvalbuf(vp, V_SAVE, 0, 0); BO_LOCK(&vp->v_bufobj); vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); VM_OBJECT_WLOCK(object); } KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); vm_page_lock(p); /* * Optimize the page's removal from the object by resetting * its "object" field. Specifically, if the page is not * wired, then the effect of this assignment is that * vm_page_free()'s call to vm_page_remove() will return * immediately without modifying the page or the object. */ p->object = NULL; if (p->wire_count == 0) { vm_page_free(p); VM_CNT_INC(v_pfree); } vm_page_unlock(p); } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. */ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags) { /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->oflags & VPO_NOSYNC) != 0) { *clearobjflags = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t clearobjflags, eio, res; VM_OBJECT_ASSERT_WLOCKED(object); /* * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE * objects. The check below prevents the function from * operating on non-vnode objects. */ if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK); clearobjflags = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (p->valid == 0) continue; if (vm_page_sleep_if_busy(p, "vpcwai")) { if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &clearobjflags)) continue; n = vm_object_page_collect_flush(object, p, pagerflags, flags, &clearobjflags, &eio); if (eio) { res = FALSE; clearobjflags = FALSE; } if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; clearobjflags = FALSE; } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0); #endif if (clearobjflags) vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY); return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && (object->flags & OBJ_MIGHTBEDIRTY) != 0) { vp = object->handle; VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && atop(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * Determine whether the given advice can be applied to the object. Advice is * not applied to unmanaged pages since they never belong to page queues, and * since MADV_FREE is destructive, it can apply only to anonymous pages that * have been mapped at most once. */ static bool vm_object_advice_applies(vm_object_t object, int advice) { if ((object->flags & OBJ_UNMANAGED) != 0) return (false); if (advice != MADV_FREE) return (true); return ((object->type == OBJT_DEFAULT || object->type == OBJT_SWAP) && (object->flags & OBJ_ONEMAPPING) != 0); } static void vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex, vm_size_t size) { if (advice == MADV_FREE && object->type == OBJT_SWAP) swap_pager_freespace(object, pindex, size); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, int advice) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m, tm; if (object == NULL) return; relookup: VM_OBJECT_WLOCK(object); if (!vm_object_advice_applies(object, advice)) { VM_OBJECT_WUNLOCK(object); return; } for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) { tobject = object; /* * If the next page isn't resident in the top-level object, we * need to search the shadow chain. When applying MADV_FREE, we * take care to release any swap space used to store * non-resident pages. */ if (m == NULL || pindex < m->pindex) { /* * Optimize a common case: if the top-level object has * no backing object, we can skip over the non-resident * range in constant time. */ if (object->backing_object == NULL) { tpindex = (m != NULL && m->pindex < end) ? m->pindex : end; vm_object_madvise_freespace(object, advice, pindex, tpindex - pindex); if ((pindex = tpindex) == end) break; goto next_page; } tpindex = pindex; do { vm_object_madvise_freespace(tobject, advice, tpindex, 1); /* * Prepare to search the next object in the * chain. */ backing_object = tobject->backing_object; if (backing_object == NULL) goto next_pindex; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; if (!vm_object_advice_applies(tobject, advice)) goto next_pindex; } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { next_page: tm = m; m = TAILQ_NEXT(m, listq); } /* * If the page is not in a normal state, skip it. */ if (tm->valid != VM_PAGE_BITS_ALL) goto next_pindex; vm_page_lock(tm); if (tm->hold_count != 0 || tm->wire_count != 0) { vm_page_unlock(tm); goto next_pindex; } KASSERT((tm->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", tm)); KASSERT((tm->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", tm)); if (vm_page_busied(tm)) { if (object != tobject) VM_OBJECT_WUNLOCK(tobject); VM_OBJECT_WUNLOCK(object); if (advice == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(tm, PGA_REFERENCED); } vm_page_busy_sleep(tm, "madvpo", false); goto relookup; } vm_page_advise(tm, advice); vm_page_unlock(tm); vm_object_madvise_freespace(tobject, advice, tm->pindex, 1); next_pindex: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_WLOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_WUNLOCK(source); return; } VM_OBJECT_WUNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, atop(length)); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_WLOCK(source); LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & OBJ_COLORED; result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif +#if MAXMEMDOM > 1 + result->selector = source->selector; +#endif VM_OBJECT_WUNLOCK(source); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, source; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate(OBJT_DEFAULT, size); +#if MAXMEMDOM > 1 + new_object->selector = orig_object->selector; +#endif /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_WLOCK(new_object); VM_OBJECT_WLOCK(orig_object); source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_WLOCK(source); if ((source->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(source); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); vm_object_deallocate(new_object); VM_OBJECT_WLOCK(orig_object); return; } LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } if (orig_object->cred != NULL) { new_object->cred = orig_object->cred; crhold(orig_object->cred); new_object->charge = ptoa(size); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } retry: m = vm_page_find_least(orig_object, offidxstart); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_busied(m)) { VM_OBJECT_WUNLOCK(new_object); vm_page_lock(m); VM_OBJECT_WUNLOCK(orig_object); vm_page_busy_sleep(m, "spltwt", false); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); VM_WAIT; VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif if (orig_object->type == OBJT_SWAP) vm_page_xbusy(m); } if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); } VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, int op) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p)); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); if ((op & OBSC_COLLAPSE_NOWAIT) != 0) return (next); if (p != NULL) vm_page_lock(p); VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); if (p == NULL) VM_WAIT; else vm_page_busy_sleep(p, "vmocol", false); VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex, pi, ps; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; if (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) return (false); pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset); p = vm_page_find_least(backing_object, pi); ps = swap_pager_find_least(backing_object, pi); /* * Only check pages inside the parent object's range and * inside the parent object's mapping of the backing object. */ for (;; pi++) { if (p != NULL && p->pindex < pi) p = TAILQ_NEXT(p, listq); if (ps < pi) ps = swap_pager_find_least(backing_object, pi); if (p == NULL && ps >= backing_object->size) break; else if (p == NULL) pi = ps; else pi = MIN(p->pindex, ps); new_pindex = pi - backing_offset_index; if (new_pindex >= object->size) break; /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); if ((pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) return (false); } return (true); } static bool vm_object_collapse_scan(vm_object_t object, int op) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if ((op & OBSC_COLLAPSE_WAIT) != 0) vm_object_set_flag(backing_object, OBJ_DEAD); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_busied(p)) { next = vm_object_collapse_scan_wait(object, p, next, op); continue; } KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch")); if (p->pindex < backing_offset_index || new_pindex >= object->size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); /* * Page is out of the parent object's range, we can * simply destroy it. */ vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_busied(pp)) { /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. Therefore, we must either skip it * and the original (backing_object) page or wait for * its state to be finalized. * * This is due to a race with vm_fault() where we must * unbusy the original (backing_obj) page before we can * (re)lock the parent. Hence we can get here. */ next = vm_object_collapse_scan_wait(object, pp, next, op); continue; } KASSERT(pp == NULL || pp->valid != 0, ("unbusy invalid page %p", pp)); if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. Destroy the original page from the * backing object. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped * through the rename. vm_page_rename() will dirty the page. */ if (vm_page_rename(p, object, new_pindex)) { next = vm_object_collapse_scan_wait(object, NULL, next, op); continue; } /* Use the old pindex to free the right page. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif } return (true); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); if (backing_object->ref_count != 1) return; vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT); } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { vm_object_t backing_object, new_backing_object; VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_WLOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_WUNLOCK(backing_object); break; } if (object->paging_in_progress != 0 || backing_object->paging_in_progress != 0) { vm_object_qcollapse(object); VM_OBJECT_WUNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_collapse_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { vm_object_pip_add(object, 1); vm_object_pip_add(backing_object, 1); /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. * Since swap_pager_copy() is being asked to * destroy the source, it will change the * backing_object's type to OBJT_DEFAULT. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; if (backing_object->backing_object) { VM_OBJECT_WLOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ VM_OBJECT_WUNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); vm_object_pip_wakeup(backing_object); backing_object->type = OBJT_DEAD; backing_object->ref_count = 0; VM_OBJECT_WUNLOCK(backing_object); vm_object_destroy(backing_object); vm_object_pip_wakeup(object); object_collapses++; } else { /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && !vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_WLOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; vm_object_reference_locked(new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ backing_object->ref_count--; VM_OBJECT_WUNLOCK(backing_object); object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) return; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. */ vm_page_lock(p); if (vm_page_xbusied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopax", true); VM_OBJECT_WLOCK(object); goto again; } if (p->wire_count != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { p->valid = 0; vm_page_undirty(p); } goto next; } if (vm_page_busied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopar", false); VM_OBJECT_WLOCK(object); goto again; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_write(p); if (p->dirty) goto next; } if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); vm_page_free(p); next: vm_page_unlock(p); } vm_object_pip_wakeup(object); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { struct mtx *mtx, *new_mtx; vm_page_t p, next; VM_OBJECT_ASSERT_LOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ mtx = NULL; for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(p); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_deactivate_noreuse(p); } if (mtx != NULL) mtx_unlock(mtx); } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. */ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_ASSERT_WLOCKED(object); for (pindex = start; pindex < end; pindex++) { m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); break; } } /* * Keep "m" busy because a subsequent iteration may unlock * the object. */ } if (pindex > start) { m = vm_page_lookup(object, start); while (m != NULL && m->pindex < pindex) { vm_page_xunbusy(m); m = TAILQ_NEXT(m, listq); } } return (pindex == end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * reserved Indicator that extension region has * swap accounted for * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); VM_OBJECT_WLOCK(prev_object); if ((prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) || (prev_object->flags & OBJ_TMPFS_NODE) != 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if ((prev_object->ref_count > 1) && (prev_object->size != next_pindex)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Account for the charge. */ if (prev_object->cred != NULL) { /* * If prev_object was charged, then this mapping, * although not charged now, may become writable * later. Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation * cause allocation of the separate object for the map * entry, and swap reservation for this entry is * managed in appropriate time. */ if (!reserved && !swap_reserve_by_cred(ptoa(next_size), prev_object->cred)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_object->charge += ptoa(next_size); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, 0); if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); #if 0 if (prev_object->cred != NULL) { KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), ("object %p overcharged 1 %jx %jx", prev_object, (uintmax_t)next_pindex, (uintmax_t)next_size)); prev_object->charge -= ptoa(prev_object->size - next_pindex); } #endif } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_VNODE) { if ((object->flags & OBJ_TMPFS_NODE) != 0) { KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs")); vm_object_set_flag(object, OBJ_TMPFS_DIRTY); } return; } object->generation++; if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) return; vm_object_set_flag(object, OBJ_MIGHTBEDIRTY); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. */ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } vm_page_lock(tm); vm_page_unwire(tm, queue); vm_page_unlock(tm); next_page: pindex++; } /* Release the accumulated object locks. */ for (depth = 0; depth < locked_depth; depth++) { tobject = object->backing_object; VM_OBJECT_RUNLOCK(object); object = tobject; } } struct vnode * vm_object_vnode(vm_object_t object) { VM_OBJECT_ASSERT_LOCKED(object); if (object->type == OBJT_VNODE) return (object->handle); if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) return (object->un_pager.swp.swp_tmpfs); return (NULL); } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { struct kinfo_vmobject *kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } kvo = malloc(sizeof(*kvo), M_TEMP, M_WAITOK); error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo->kvo_size = ptoa(obj->size); kvo->kvo_resident = obj->resident_page_count; kvo->kvo_ref_count = obj->ref_count; kvo->kvo_shadow_count = obj->shadow_count; kvo->kvo_memattr = obj->memattr; kvo->kvo_active = 0; kvo->kvo_inactive = 0; TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ if (vm_page_active(m)) kvo->kvo_active++; else if (vm_page_inactive(m)) kvo->kvo_inactive++; } kvo->kvo_vn_fileid = 0; kvo->kvo_vn_fsid = 0; kvo->kvo_vn_fsid_freebsd11 = 0; freepath = NULL; fullpath = ""; vp = NULL; switch (obj->type) { case OBJT_DEFAULT: kvo->kvo_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kvo->kvo_type = KVME_TYPE_VNODE; vp = obj->handle; vref(vp); break; case OBJT_SWAP: kvo->kvo_type = KVME_TYPE_SWAP; break; case OBJT_DEVICE: kvo->kvo_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kvo->kvo_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kvo->kvo_type = KVME_TYPE_DEAD; break; case OBJT_SG: kvo->kvo_type = KVME_TYPE_SG; break; case OBJT_MGTDEVICE: kvo->kvo_type = KVME_TYPE_MGTDEVICE; break; default: kvo->kvo_type = KVME_TYPE_UNKNOWN; break; } VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo->kvo_vn_fileid = va.va_fileid; kvo->kvo_vn_fsid = va.va_fsid; kvo->kvo_vn_fsid_freebsd11 = va.va_fsid; /* truncate */ } vput(vp); } strlcpy(kvo->kvo_path, fullpath, sizeof(kvo->kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo->kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo->kvo_path) + 1; kvo->kvo_structsize = roundup(kvo->kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, kvo, kvo->kvo_structsize); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); free(kvo, M_TEMP); return (error); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount, nl, c; nl = 0; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: projects/numa2/sys/vm/vm_object.h =================================================================== --- projects/numa2/sys/vm/vm_object.h (revision 321505) +++ projects/numa2/sys/vm/vm_object.h (revision 321506) @@ -1,331 +1,335 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.h 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory object module definitions. */ #ifndef _VM_OBJECT_ #define _VM_OBJECT_ #include #include #include #include +#include #include /* * Types defined: * * vm_object_t Virtual memory object. * * List of locks * (c) const until freed * (o) per-object lock * (f) free pages queue mutex * */ struct vm_object { struct rwlock lock; TAILQ_ENTRY(vm_object) object_list; /* list of all objects */ LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */ LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */ TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */ struct vm_radix rtree; /* root of the resident page radix trie*/ vm_pindex_t size; /* Object size */ +#if MAXMEMDOM > 1 + struct vm_domain_iterator selector; /* NUMA domain selector. */ +#endif int generation; /* generation ID */ int ref_count; /* How many refs?? */ int shadow_count; /* how many objects that this is a shadow for */ vm_memattr_t memattr; /* default memory attribute for pages */ objtype_t type; /* type of pager */ u_short flags; /* see below */ u_short pg_color; /* (c) color of first page in obj */ u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ int resident_page_count; /* number of resident pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ void *handle; union { /* * VNode pager * * vnp_size - current size of file */ struct { off_t vnp_size; vm_ooffset_t writemappings; } vnp; /* * Device pager * * devp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) devp_pglist; struct cdev_pager_ops *ops; struct cdev *dev; } devp; /* * SG pager * * sgp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) sgp_pglist; } sgp; /* * Swap pager * * swp_tmpfs - back-pointer to the tmpfs vnode, * if any, which uses the vm object * as backing store. The handle * cannot be reused for linking, * because the vnode can be * reclaimed and recreated, making * the handle changed and hash-chain * invalid. * * swp_bcount - number of swap 'swblock' metablocks, each * contains up to 16 swapblk assignments. * see vm/swap_pager.h */ struct { void *swp_tmpfs; int swp_bcount; } swp; } un_pager; struct ucred *cred; vm_ooffset_t charge; void *umtx_data; }; /* * Flags */ #define OBJ_FICTITIOUS 0x0001 /* (c) contains fictitious pages */ #define OBJ_UNMANAGED 0x0002 /* (c) contains unmanaged pages */ #define OBJ_POPULATE 0x0004 /* pager implements populate() */ #define OBJ_DEAD 0x0008 /* dead objects (during rundown) */ #define OBJ_NOSPLIT 0x0010 /* dont split this object */ #define OBJ_UMTXDEAD 0x0020 /* umtx pshared was terminated */ #define OBJ_PIPWNT 0x0040 /* paging in progress wanted */ #define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */ #define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */ #define OBJ_TMPFS_DIRTY 0x0400 /* dirty tmpfs obj */ #define OBJ_COLORED 0x1000 /* pg_color is defined */ #define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */ #define OBJ_DISCONNECTWNT 0x4000 /* disconnect from vnode wanted */ #define OBJ_TMPFS 0x8000 /* has tmpfs vnode allocated */ /* * Helpers to perform conversion between vm_object page indexes and offsets. * IDX_TO_OFF() converts an index into an offset. * OFF_TO_IDX() converts an offset into an index. Since offsets are signed * by default, the sign propagation in OFF_TO_IDX(), when applied to * negative offsets, is intentional and returns a vm_object page index * that cannot be created by a userspace mapping. * UOFF_TO_IDX() treats the offset as an unsigned value and converts it * into an index accordingly. Use it only when the full range of offset * values are allowed. Currently, this only applies to device mappings. * OBJ_MAX_SIZE specifies the maximum page index corresponding to the * maximum unsigned offset. */ #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT)) #define UOFF_TO_IDX(off) (((vm_pindex_t)(off)) >> PAGE_SHIFT) #define OBJ_MAX_SIZE (UOFF_TO_IDX(UINT64_MAX) + 1) #ifdef _KERNEL #define OBJPC_SYNC 0x1 /* sync I/O */ #define OBJPC_INVAL 0x2 /* invalidate */ #define OBJPC_NOSYNC 0x4 /* skip if VPO_NOSYNC */ /* * The following options are supported by vm_object_page_remove(). */ #define OBJPR_CLEANONLY 0x1 /* Don't remove dirty pages. */ #define OBJPR_NOTMAPPED 0x2 /* Don't unmap pages. */ TAILQ_HEAD(object_q, vm_object); extern struct object_q vm_object_list; /* list of allocated objects */ extern struct mtx vm_object_list_mtx; /* lock for object list and count */ extern struct vm_object kernel_object_store; extern struct vm_object kmem_object_store; #define kernel_object (&kernel_object_store) #define kmem_object (&kmem_object_store) #define VM_OBJECT_ASSERT_LOCKED(object) \ rw_assert(&(object)->lock, RA_LOCKED) #define VM_OBJECT_ASSERT_RLOCKED(object) \ rw_assert(&(object)->lock, RA_RLOCKED) #define VM_OBJECT_ASSERT_WLOCKED(object) \ rw_assert(&(object)->lock, RA_WLOCKED) #define VM_OBJECT_ASSERT_UNLOCKED(object) \ rw_assert(&(object)->lock, RA_UNLOCKED) #define VM_OBJECT_LOCK_DOWNGRADE(object) \ rw_downgrade(&(object)->lock) #define VM_OBJECT_RLOCK(object) \ rw_rlock(&(object)->lock) #define VM_OBJECT_RUNLOCK(object) \ rw_runlock(&(object)->lock) #define VM_OBJECT_SLEEP(object, wchan, pri, wmesg, timo) \ rw_sleep((wchan), &(object)->lock, (pri), (wmesg), (timo)) #define VM_OBJECT_TRYRLOCK(object) \ rw_try_rlock(&(object)->lock) #define VM_OBJECT_TRYWLOCK(object) \ rw_try_wlock(&(object)->lock) #define VM_OBJECT_TRYUPGRADE(object) \ rw_try_upgrade(&(object)->lock) #define VM_OBJECT_WLOCK(object) \ rw_wlock(&(object)->lock) #define VM_OBJECT_WOWNED(object) \ rw_wowned(&(object)->lock) #define VM_OBJECT_WUNLOCK(object) \ rw_wunlock(&(object)->lock) /* * The object must be locked or thread private. */ static __inline void vm_object_set_flag(vm_object_t object, u_short bits) { object->flags |= bits; } /* * Conditionally set the object's color, which (1) enables the allocation * of physical memory reservations for anonymous objects and larger-than- * superpage-sized named objects and (2) determines the first page offset * within the object at which a reservation may be allocated. In other * words, the color determines the alignment of the object with respect * to the largest superpage boundary. When mapping named objects, like * files or POSIX shared memory objects, the color should be set to zero * before a virtual address is selected for the mapping. In contrast, * for anonymous objects, the color may be set after the virtual address * is selected. * * The object must be locked. */ static __inline void vm_object_color(vm_object_t object, u_short color) { if ((object->flags & OBJ_COLORED) == 0) { object->pg_color = color; object->flags |= OBJ_COLORED; } } void vm_object_clear_flag(vm_object_t object, u_short bits); void vm_object_pip_add(vm_object_t object, short i); void vm_object_pip_subtract(vm_object_t object, short i); void vm_object_pip_wakeup(vm_object_t object); void vm_object_pip_wakeupn(vm_object_t object, short i); void vm_object_pip_wait(vm_object_t object, char *waitid); void umtx_shm_object_init(vm_object_t object); void umtx_shm_object_terminated(vm_object_t object); extern int umtx_shm_vnobj_persistent; vm_object_t vm_object_allocate (objtype_t, vm_pindex_t); boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t, boolean_t); void vm_object_collapse (vm_object_t); void vm_object_deallocate (vm_object_t); void vm_object_destroy (vm_object_t); void vm_object_terminate (vm_object_t); void vm_object_set_writeable_dirty (vm_object_t); void vm_object_init (void); void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int); boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags); void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end); void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options); boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t); void vm_object_print(long addr, boolean_t have_addr, long count, char *modif); void vm_object_reference (vm_object_t); void vm_object_reference_locked(vm_object_t); int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr); void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t); void vm_object_split(vm_map_entry_t); boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t, boolean_t); void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue); struct vnode *vm_object_vnode(vm_object_t object); #endif /* _KERNEL */ #endif /* _VM_OBJECT_ */ Index: projects/numa2/sys/vm/vm_page.c =================================================================== --- projects/numa2/sys/vm/vm_page.c (revision 321505) +++ projects/numa2/sys/vm/vm_page.c (revision 321506) @@ -1,3661 +1,3719 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * GENERAL RULES ON VM_PAGE MANIPULATION * * - A page queue lock is required when adding or removing a page from a * page queue regardless of other locks or the busy state of a page. * * * In general, no thread besides the page daemon can acquire or * hold more than one page queue lock at a time. * * * The page daemon can acquire and hold any pair of page queue * locks in any order. * * - The object lock is required when inserting or removing * pages from an object (vm_page_insert() or vm_page_remove()). * */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include /* * Associated with page of user-allocatable memory is a * page structure. */ struct vm_domain vm_dom[MAXMEMDOM]; struct mtx_padalign vm_page_queue_free_mtx; struct mtx_padalign pa_lock[PA_LOCK_COUNT]; /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions. */ vm_page_t bogus_page; vm_page_t vm_page_array; long vm_page_array_size; long first_page; static int boot_pages = UMA_BOOT_PAGES; SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &boot_pages, 0, "number of pages allocated for bootstrapping the VM system"); static int pa_tryrelock_restart; SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); static TAILQ_HEAD(, vm_page) blacklist_head; static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); /* Is the page daemon waiting for free pages? */ static int vm_pageout_pages_needed; static uma_zone_t fakepg_zone; static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); static void vm_page_enqueue(uint8_t queue, vm_page_t m); static void vm_page_free_wakeup(void); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, vm_paddr_t high); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); static void vm_page_init(void *dummy) { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * Try to acquire a physical address lock while a pmap is locked. If we * fail to trylock we unlock and lock the pmap directly and cache the * locked pa in *locked. The caller should then restart their loop in case * the virtual to physical mapping has changed. */ int vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) { vm_paddr_t lockpa; lockpa = *locked; *locked = pa; if (lockpa) { PA_LOCK_ASSERT(lockpa, MA_OWNED); if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) return (0); PA_UNLOCK(lockpa); } if (PA_TRYLOCK(pa)) return (0); PMAP_UNLOCK(pmap); atomic_add_int(&pa_tryrelock_restart, 1); PA_LOCK(pa); PMAP_LOCK(pmap); return (EAGAIN); } /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_next: * * Find the next entry in the provided string of blacklist * addresses. Entries are separated by space, comma, or newline. * If an invalid integer is encountered then the rest of the * string is skipped. Updates the list pointer to the next * character, or NULL if the string is exhausted or invalid. */ static vm_paddr_t vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; if (list == NULL || *list == NULL) return (0); if (**list =='\0') { *list = NULL; return (0); } /* * If there's no end pointer then the buffer is coming from * the kenv and we know it's null-terminated. */ if (end == NULL) end = *list + strlen(*list); /* Ensure that strtoq() won't walk off the end */ if (*end != '\0') { if (*end == '\n' || *end == ' ' || *end == ',') *end = '\0'; else { printf("Blacklist not terminated, skipping\n"); *list = NULL; return (0); } } for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { if (bad == 0) { if (++cp < end) continue; else break; } } else break; if (*cp == '\0' || ++cp >= end) *list = NULL; else *list = cp; return (trunc_page(bad)); } printf("Garbage in RAM blacklist, skipping\n"); *list = NULL; return (0); } /* * vm_page_blacklist_check: * * Iterate through the provided string of blacklist addresses, pulling * each entry out of the physical allocator free list and putting it * onto a list for reporting via the vm.page_blacklist sysctl. */ static void vm_page_blacklist_check(char *list, char *end) { vm_paddr_t pa; vm_page_t m; char *next; int ret; next = list; while (next != NULL) { if ((pa = vm_page_blacklist_next(&next, end)) == 0) continue; m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) continue; mtx_lock(&vm_page_queue_free_mtx); ret = vm_phys_unfree_page(m); mtx_unlock(&vm_page_queue_free_mtx); if (ret == TRUE) { TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (bootverbose) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); } } } /* * vm_page_blacklist_load: * * Search for a special module named "ram_blacklist". It'll be a * plain text file provided by the user via the loader directive * of the same name. */ static void vm_page_blacklist_load(char **list, char **end) { void *mod; u_char *ptr; u_int len; mod = NULL; ptr = NULL; mod = preload_search_by_type("ram_blacklist"); if (mod != NULL) { ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); } *list = ptr; if (ptr != NULL) *end = ptr + len; else *end = NULL; return; } static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) { vm_page_t m; struct sbuf sbuf; int error, first; first = 1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); TAILQ_FOREACH(m, &blacklist_head, listq) { sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", (uintmax_t)m->phys_addr); first = 0; } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static void vm_page_domain_init(struct vm_domain *vmd) { struct vm_pagequeue *pq; int i; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = &vm_cnt.v_inactive_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = &vm_cnt.v_active_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = "vm laundry pagequeue"; *__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) = &vm_cnt.v_laundry_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = "vm unswappable pagequeue"; /* Unswappable dirty pages are counted as being in the laundry. */ *__DECONST(int **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_vcnt) = &vm_cnt.v_laundry_count; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); } } /* * vm_page_startup: * * Initializes the resident memory module. Allocates physical memory for * bootstrapping UMA and some data structures that are used to manage * physical pages. Initializes these structures, and populates the free * page queues. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { vm_offset_t mapped; vm_paddr_t high_avail, low_avail, page_range, size; vm_paddr_t new_end; int i; vm_paddr_t pa; vm_paddr_t last_pa; char *list, *listend; vm_paddr_t end; vm_paddr_t biggestsize; int biggestone; int pages_per_zone; biggestsize = 0; biggestone = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } for (i = 0; phys_avail[i + 1]; i += 2) { size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } } end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(&vm_dom[i]); /* * Almost all of the pages needed for bootstrapping UMA are used * for zone structures, so if the number of CPUs results in those * structures taking more than one page each, we set aside more pages * in proportion to the zone structure size. */ pages_per_zone = howmany(sizeof(struct uma_zone) + sizeof(struct uma_cache) * (mp_maxid + 1), UMA_SLAB_SIZE); if (pages_per_zone > 1) { /* Reserve more pages so that we don't run out. */ boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone; } /* * Allocate memory for use when boot strapping the kernel memory * allocator. * * CTFLAG_RDTUN doesn't work during the early boot process, so we must * manually fetch the value. */ TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages); new_end = end - (boot_pages * UMA_SLAB_SIZE); new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, end - new_end); uma_startup((void *)mapped, boot_pages); #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ defined(__i386__) || defined(__mips__) /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ last_pa = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; page_range = last_pa / PAGE_SIZE; vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) /* * Include the UMA bootstrap pages and vm_page_dump in a crash dump. * When pmap_map() uses the direct map, they are not automatically * included. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use, taking into account the overhead of a page structure per page. * In other words, solve * "available physical memory" - round_page(page_range * * sizeof(struct vm_page)) = page_range * PAGE_SIZE * for page_range. */ low_avail = phys_avail[0]; high_avail = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].start < low_avail) low_avail = vm_phys_segs[i].start; if (vm_phys_segs[i].end > high_avail) high_avail = vm_phys_segs[i].end; } /* Skip the first chunk. It is already accounted for. */ for (i = 2; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i] < low_avail) low_avail = phys_avail[i]; if (phys_avail[i + 1] > high_avail) high_avail = phys_avail[i + 1]; } first_page = low_avail / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE size = 0; for (i = 0; i < vm_phys_nsegs; i++) size += vm_phys_segs[i].end - vm_phys_segs[i].start; for (i = 0; phys_avail[i + 1] != 0; i += 2) size += phys_avail[i + 1] - phys_avail[i]; #elif defined(VM_PHYSSEG_DENSE) size = high_avail - low_avail; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif #ifdef VM_PHYSSEG_DENSE /* * In the VM_PHYSSEG_DENSE case, the number of pages can account for * the overhead of a page structure per page only if vm_page_array is * allocated from the last physical memory chunk. Otherwise, we must * allocate page structures representing the physical memory * underlying vm_page_array, even though they will not be used. */ if (new_end != high_avail) page_range = size / PAGE_SIZE; else #endif { page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); /* * If the partial bytes remaining are large enough for * a page (PAGE_SIZE) without a corresponding * 'struct vm_page', then new_end will contain an * extra page after subtracting the length of the VM * page array. Compensate by subtracting an extra * page from new_end. */ if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) { if (new_end == high_avail) high_avail -= PAGE_SIZE; new_end -= PAGE_SIZE; } } end = new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. * However, because this page is allocated from KVM, out-of-bounds * accesses using the direct map will not be trapped. */ vaddr += PAGE_SIZE; /* * Allocate physical memory for the page structures, and map it. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array = (vm_page_t) mapped; #if VM_NRESERVLEVEL > 0 /* * Allocate physical memory for the reservation management system's * data structures, and map it. */ if (high_avail == end) high_avail = new_end; new_end = vm_reserv_startup(&vaddr, new_end, high_avail); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) /* * Include vm_page_array and vm_reserv_array in a crash dump. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Add physical memory segments corresponding to the available * physical pages. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); for (i = 0; i < page_range; i++) vm_page_array[i].order = VM_NFREEORDER; vm_page_array_size = page_range; /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Add every available physical page that is not blacklisted to * the free lists. */ vm_cnt.v_page_count = 0; vm_cnt.v_free_count = 0; for (i = 0; phys_avail[i + 1] != 0; i += 2) { pa = phys_avail[i]; last_pa = phys_avail[i + 1]; while (pa < last_pa) { vm_phys_add_page(pa); pa += PAGE_SIZE; } } TAILQ_INIT(&blacklist_head); vm_page_blacklist_load(&list, &listend); vm_page_blacklist_check(list, listend); list = kern_getenv("vm.blacklist"); vm_page_blacklist_check(list, NULL); freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; bool locked; vm_page_assert_xbusied(m); locked = mtx_owned(vm_page_lockptr(m)); for (;;) { x = m->busy_lock; x &= VPB_BIT_WAITERS; if (x != 0 && !locked) vm_page_lock(m); if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1))) break; if (x != 0 && !locked) vm_page_unlock(m); } if (x != 0) { wakeup(m); if (!locked) vm_page_unlock(m); } } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = m->busy_lock; return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. */ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_assert_sbusied(m); for (;;) { x = m->busy_lock; if (VPB_SHARERS(x) > 1) { if (atomic_cmpset_int(&m->busy_lock, x, x - VPB_ONE_SHARER)) break; continue; } if ((x & VPB_BIT_WAITERS) == 0) { KASSERT(x == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (atomic_cmpset_int(&m->busy_lock, VPB_SHARERS_WORD(1), VPB_UNBUSIED)) break; continue; } KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS), ("vm_page_sunbusy: invalid lock state for waiters")); vm_page_lock(m); if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) { vm_page_unlock(m); continue; } wakeup(m); vm_page_unlock(m); break; } } /* * vm_page_busy_sleep: * * Sleep and release the page lock, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * The given page must be locked. * * If nonshared is true, sleep only if the page is xbusy. */ void vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared) { u_int x; vm_page_assert_locked(m); x = m->busy_lock; if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) || ((x & VPB_BIT_WAITERS) == 0 && !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) { vm_page_unlock(m); return; } msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { u_int x; for (;;) { x = m->busy_lock; if ((x & VPB_BIT_SHARED) == 0) return (0); if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER)) return (1); } } static void vm_page_xunbusy_locked(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_assert_locked(m); atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); /* There is a waiter, do wakeup() instead of vm_page_flash(). */ wakeup(m); } void vm_page_xunbusy_maybelocked(vm_page_t m) { bool lockacq; vm_page_assert_xbusied(m); /* * Fast path for unbusy. If it succeeds, we know that there * are no waiters, so we do not need a wakeup. */ if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) return; lockacq = !mtx_owned(vm_page_lockptr(m)); if (lockacq) vm_page_lock(m); vm_page_xunbusy_locked(m); if (lockacq) vm_page_unlock(m); } /* * vm_page_xunbusy_hard: * * Called after the first try the exclusive unbusy of a page failed. * It is assumed that the waiters bit is on. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_lock(m); vm_page_xunbusy_locked(m); vm_page_unlock(m); } /* * vm_page_flash: * * Wakeup anyone waiting for the page. * The ownership bits do not change. * * The given page must be locked. */ void vm_page_flash(vm_page_t m) { u_int x; vm_page_lock_assert(m, MA_OWNED); for (;;) { x = m->busy_lock; if ((x & VPB_BIT_WAITERS) == 0) return; if (atomic_cmpset_int(&m->busy_lock, x, x & (~VPB_BIT_WAITERS))) break; } wakeup(m); } /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). */ void vm_page_hold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); mem->hold_count++; } void vm_page_unhold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!")); --mem->hold_count; if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0) vm_page_free_toq(mem); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. */ void vm_page_unhold_pages(vm_page_t *ma, int count) { struct mtx *mtx, *new_mtx; mtx = NULL; for (; count != 0; count--) { /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(*ma); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_unhold(*ma); ma++; } if (mtx != NULL) mtx_unlock(mtx); } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. */ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; m->queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_SINGLE_EXCLUSIVER; m->wire_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from a getpages request that * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. */ KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the page queues lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_busy(vm_page_t m, const char *msg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (vm_page_busied(m)) { /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, msg, false); VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* Refer to this operation by its public name. */ KASSERT(m->valid == VM_PAGE_BITS_ALL, ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page */ m->object = object; m->pindex = pindex; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = NULL; m->pindex = 0; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. */ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's OBJ_MIGHTBEDIRTY flag. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * vm_page_remove: * * Removes the specified page from its containing object, but does not * invalidate any backing storage. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_remove(vm_page_t m) { vm_object_t object; vm_page_t mrem; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((object = m->object) == NULL) return; VM_OBJECT_ASSERT_WLOCKED(object); if (vm_page_xbusied(m)) vm_page_xunbusy_maybelocked(m); mrem = vm_radix_remove(&object->rtree, m->pindex); KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); m->object = NULL; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) next = NULL; } return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) prev = NULL; } return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * The existing page must not be on a paging queue. */ vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) { vm_page_t mold; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(mnew->object == NULL, ("vm_page_replace: page already in object")); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mnew->object = object; mnew->pindex = pindex; mold = vm_radix_replace(&object->rtree, mnew); KASSERT(mold->queue == PQ_NONE, ("vm_page_replace: mold is on a paging queue")); /* Keep the resident page list in sorted order. */ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; vm_page_xunbusy_maybelocked(mold); /* * The object's resident_page_count does not change because we have * swapped one page for another, but OBJ_MIGHTBEDIRTY. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); return (mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_lock(m); vm_page_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_unlock(m); vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { +#ifdef VM_NUMA_ALLOC + struct vm_domain_iterator *vip; + vm_page_t m; + int domain, i; + + if (object != NULL) + vip = &object->selector; + else + vip = &curthread->td_dom_selector; + + for (i = 0, domain = vm_domain_select_first(vip); + i < vm_ndomains && domain != -1; + i++, domain = vm_domain_select_next(vip, domain)) + if ((m = vm_page_alloc_domain(object, pindex, domain, + req)) != NULL) + return (m); + return (NULL); +#else + return (vm_page_alloc_domain(object, pindex, 0, req)); +#endif +} + +vm_page_t +vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain, + int req) +{ vm_page_t m, mpred; int flags, req_class; mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc: inconsistent object(%p)/req(%x)", object, req)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc: pindex already allocated")); } /* * Allocate a page if the number of free pages exceeds the minimum * for the request class. */ mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count > 0)) { /* * Can we allocate the page from a reservation? */ #if VM_NRESERVLEVEL > 0 if (object == NULL || (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) != OBJ_COLORED || (m = - vm_reserv_alloc_page(object, pindex, mpred)) == NULL) + vm_reserv_alloc_page(object, pindex, domain, + mpred)) == NULL) #endif { /* * If not, allocate it from the free page queues. */ - m = vm_phys_alloc_pages(object != NULL ? + m = vm_phys_alloc_pages(domain, object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); #if VM_NRESERVLEVEL > 0 - if (m == NULL && vm_reserv_reclaim_inactive()) { - m = vm_phys_alloc_pages(object != NULL ? + if (m == NULL && vm_reserv_reclaim_inactive(domain)) { + m = vm_phys_alloc_pages(domain, object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); } #endif } } else { /* * Not allocatable, give up. */ mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); pagedaemon_wakeup(); return (NULL); } /* * At this point we had better have found a good page. */ KASSERT(m != NULL, ("vm_page_alloc: missing page")); vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; flags &= m->flags; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; m->aflags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; m->busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); if (req & VM_ALLOC_WIRED) { /* * The page lock is not required for wiring a page until that * page is inserted into the object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } m->act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { pagedaemon_wakeup(); if (req & VM_ALLOC_WIRED) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); m->wire_count = 0; } KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); return (NULL); } /* Ignore device objects; the pager sets "memattr" for them. */ if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); } else m->pindex = pindex; /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } /* * vm_page_alloc_contig: * * Allocate a contiguous set of physical pages of the given size "npages" * from the free lists. All of the physical pages must be at or above * the given physical address "low" and below the given physical address * "high". The given value "alignment" determines the alignment of the * first physical page in the set. If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The specified object may not contain fictitious pages. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { +#ifdef VM_NUMA_ALLOC + struct vm_domain_iterator *vip; + vm_page_t m; + int domain, i; + + if (object != NULL) + vip = &object->selector; + else + vip = &curthread->td_dom_selector; + + for (i = 0, domain = vm_domain_select_first(vip); + i < vm_ndomains && domain != -1; + i++, domain = vm_domain_select_next(vip, domain)) + if ((m = vm_page_alloc_contig_domain(object, pindex, domain, + req, npages, low, high, alignment, boundary, + memattr)) != NULL) + return (m); + return (NULL); +#else + return (vm_page_alloc_contig_domain(object, pindex, 0, req, npages, + low, high, alignment, boundary, memattr)); +#endif +} + +vm_page_t +vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, + int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, + vm_paddr_t boundary, vm_memattr_t memattr) +{ vm_page_t m, m_ret, mpred; u_int busy_lock, flags, oflags; int req_class; mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, req)); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_FICTITIOUS) == 0, ("vm_page_alloc_contig: object %p has fictitious pages", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc_contig: pindex already allocated")); } /* * Can we allocate the pages without the number of free pages falling * below the lower bound for the allocation class? */ mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count >= npages)) { /* * Can we allocate the pages from a reservation? */ #if VM_NRESERVLEVEL > 0 retry: if (object == NULL || (object->flags & OBJ_COLORED) == 0 || - (m_ret = vm_reserv_alloc_contig(object, pindex, npages, - low, high, alignment, boundary, mpred)) == NULL) + (m_ret = vm_reserv_alloc_contig(object, pindex, domain, + npages, low, high, alignment, boundary, mpred)) == NULL) #endif /* * If not, allocate them from the free page queues. */ - m_ret = vm_phys_alloc_contig(npages, low, high, + m_ret = vm_phys_alloc_contig(domain, npages, low, high, alignment, boundary); } else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, npages); pagedaemon_wakeup(); return (NULL); } if (m_ret != NULL) vm_phys_freecnt_adj(m_ret, -npages); else { #if VM_NRESERVLEVEL > 0 - if (vm_reserv_reclaim_contig(npages, low, high, alignment, - boundary)) + if (vm_reserv_reclaim_contig(domain, npages, low, high, + alignment, boundary)) goto retry; #endif } mtx_unlock(&vm_page_queue_free_mtx); if (m_ret == NULL) return (NULL); for (m = m_ret; m < &m_ret[npages]; m++) vm_page_alloc_check(m); /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) busy_lock = VPB_SHARERS_WORD(1); if ((req & VM_ALLOC_WIRED) != 0) atomic_add_int(&vm_cnt.v_wire_count, npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->wire_count = 1; m->act_count = 0; m->oflags = oflags; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { pagedaemon_wakeup(); if ((req & VM_ALLOC_WIRED) != 0) atomic_subtract_int( &vm_cnt.v_wire_count, npages); KASSERT(m->object == NULL, ("page %p has object", m)); mpred = m; for (m = m_ret; m < &m_ret[npages]; m++) { if (m <= mpred && (req & VM_ALLOC_WIRED) != 0) m->wire_count = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); } return (NULL); } mpred = m; } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } if (vm_paging_needed()) pagedaemon_wakeup(); return (m_ret); } /* * Check a page that has been freshly dequeued from a freelist. */ static void vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->queue == PQ_NONE, ("page %p has unexpected queue %d", m, m->queue)); KASSERT(m->wire_count == 0, ("page %p is wired", m)); KASSERT(m->hold_count == 0, ("page %p is held", m)); KASSERT(!vm_page_busied(m), ("page %p is busy", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); } /* * vm_page_alloc_freelist: * * Allocate a physical page from the specified free page list. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t -vm_page_alloc_freelist(int flind, int req) +vm_page_alloc_freelist(int domain, int flind, int req) { vm_page_t m; u_int flags; int req_class; req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Do not allocate reserved pages unless the req has asked for it. */ mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count > 0)) - m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0); + m = vm_phys_alloc_freelist_pages(domain, flind, + VM_FREEPOOL_DIRECT, 0); else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); pagedaemon_wakeup(); return (NULL); } if (m == NULL) { mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ m->aflags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; m->flags &= flags; if ((req & VM_ALLOC_WIRED) != 0) { /* * The page lock is not required for wiring a page that does * not belong to an object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ /* * vm_page_scan_contig: * * Scan vm_page_array[] between the specified entries "m_start" and * "m_end" for a run of contiguous physical pages that satisfy the * specified conditions, and return the lowest page in the run. The * specified "alignment" determines the alignment of the lowest physical * page in the run. If the specified "boundary" is non-zero, then the * run of physical pages cannot span a physical address that is a * multiple of "boundary". * * "m_end" is never dereferenced, so it need not point to a vm_page * structure within vm_page_array[]. * * "npages" must be greater than zero. "m_start" and "m_end" must not * span a hole (or discontiguity) in the physical address space. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options) { struct mtx *m_mtx, *new_mtx; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_run; #if VM_NRESERVLEVEL > 0 int level; #endif int m_inc, order, run_ext, run_len; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); m_run = NULL; run_len = 0; m_mtx = NULL; for (m = m_start; m < m_end && run_len < npages; m += m_inc) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * If the current page would be the start of a run, check its * physical address against the end, alignment, and boundary * conditions. If it doesn't satisfy these conditions, either * terminate the scan or advance to the next page that * satisfies the failed condition. */ if (run_len == 0) { KASSERT(m_run == NULL, ("m_run != NULL")); if (m + npages > m_end) break; pa = VM_PAGE_TO_PHYS(m); if ((pa & (alignment - 1)) != 0) { m_inc = atop(roundup2(pa, alignment) - pa); continue; } if (rounddown2(pa ^ (pa + ptoa(npages) - 1), boundary) != 0) { m_inc = atop(roundup2(pa, boundary) - pa); continue; } } else KASSERT(m_run != NULL, ("m_run == NULL")); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(m); if (m_mtx != new_mtx) { if (m_mtx != NULL) mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } m_inc = 1; retry: if (m->wire_count != 0 || m->hold_count != 0) run_ext = 0; #if VM_NRESERVLEVEL > 0 else if ((level = vm_reserv_level(m)) >= 0 && (options & VPSC_NORESERV) != 0) { run_ext = 0; /* Advance to the end of the reservation. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); } #endif else if ((object = m->object) != NULL) { /* * The page is considered eligible for relocation if * and only if it could be laundered or reclaimed by * the page daemon. */ if (!VM_OBJECT_TRYRLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_RLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_RUNLOCK(object); goto retry; } else if (m->wire_count != 0 || m->hold_count != 0) { run_ext = 0; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) { run_ext = 0; #if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { run_ext = 0; /* Advance to the end of the superpage. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && m->queue != PQ_NONE && !vm_page_busied(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one * page. */ KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ run_ext = 1; } else run_ext = 0; unlock: VM_OBJECT_RUNLOCK(object); #if VM_NRESERVLEVEL > 0 } else if (level >= 0) { /* * The page is reserved but not yet allocated. In * other words, it is still free. Extend the current * run by one page. */ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it is the * first page in a power-of-two-sized run of * contiguous free pages. Add these pages to the end * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; } else { /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's * free page queues. However, it is not the first * page in a run of contiguous free pages. (This case * rarely occurs because the scan is performed in * ascending order.) (2) It is not reserved, and it is * transitioning from free to allocated. (Conversely, * the transition from allocated to free for managed * pages is blocked by the page lock.) (3) It is * allocated but not contained by an object and not * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } /* * Extend or reset the current run of pages. */ if (run_ext > 0) { if (run_len == 0) m_run = m; run_len += run_ext; } else { if (run_len > 0) { m_run = NULL; run_len = 0; } } } if (m_mtx != NULL) mtx_unlock(m_mtx); if (run_len >= npages) return (m_run); return (NULL); } /* * vm_page_reclaim_run: * * Try to relocate each of the allocated virtual pages within the * specified run of physical pages to a new physical address. Free the * physical pages underlying the relocated virtual pages. A virtual page * is relocatable if and only if it could be laundered or reclaimed by * the page daemon. Whenever possible, a virtual page is relocated to a * physical address above "high". * * Returns 0 if every physical page within the run was already free or * just freed by a successful relocation. Otherwise, returns a non-zero * value indicating why the last attempt to relocate a virtual page was * unsuccessful. * * "req_class" must be an allocation class. */ static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, vm_paddr_t high) { struct mtx *m_mtx, *new_mtx; struct spglist free; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_end, m_new; int error, order, req; KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, ("req_class is not an allocation class")); SLIST_INIT(&free); error = 0; m = m_run; m_end = m_run + npages; m_mtx = NULL; for (; error == 0 && m < m_end; m++) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(m); if (m_mtx != new_mtx) { if (m_mtx != NULL) mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } retry: if (m->wire_count != 0 || m->hold_count != 0) error = EBUSY; else if ((object = m->object) != NULL) { /* * The page is relocated if and only if it could be * laundered or reclaimed by the page daemon. */ if (!VM_OBJECT_TRYWLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_WLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_WUNLOCK(object); goto retry; } else if (m->wire_count != 0 || m->hold_count != 0) { error = EBUSY; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; else if (m->queue != PQ_NONE && !vm_page_busied(m)) { KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ if (m->valid != 0) { /* * First, try to allocate a new page * that is above "high". Failing * that, try to allocate a new page * that is below "m_run". Allocate * the new page between the end of * "m_run" and "high" only as a last * resort. */ req = req_class | VM_ALLOC_NOOBJ; if ((m->flags & PG_NODUMP) != 0) req |= VM_ALLOC_NODUMP; if (trunc_page(high) != ~(vm_paddr_t)PAGE_MASK) { m_new = vm_page_alloc_contig( NULL, 0, req, 1, round_page(high), ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } else m_new = NULL; if (m_new == NULL) { pa = VM_PAGE_TO_PHYS(m_run); m_new = vm_page_alloc_contig( NULL, 0, req, 1, 0, pa - 1, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { pa += ptoa(npages); m_new = vm_page_alloc_contig( NULL, 0, req, 1, pa, high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { error = ENOMEM; goto unlock; } KASSERT(m_new->wire_count == 0, ("page %p is wired", m)); /* * Replace "m" with the new page. For * vm_page_replace(), "m" must be busy * and dequeued. Finally, change "m" * as if vm_page_free() was called. */ if (object->ref_count != 0) pmap_remove_all(m); m_new->aflags = m->aflags; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m)); m_new->oflags = m->oflags & VPO_NOSYNC; pmap_copy_page(m, m_new); m_new->valid = m->valid; m_new->dirty = m->dirty; m->flags &= ~PG_ZERO; vm_page_xbusy(m); vm_page_remque(m); vm_page_replace_checked(m_new, object, m->pindex, m); m->valid = 0; vm_page_undirty(m); /* * The new page must be deactivated * before the object is unlocked. */ new_mtx = vm_page_lockptr(m_new); if (m_mtx != new_mtx) { mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } vm_page_deactivate(m_new); } else { m->flags &= ~PG_ZERO; vm_page_remque(m); vm_page_remove(m); KASSERT(m->dirty == 0, ("page %p is dirty", m)); } SLIST_INSERT_HEAD(&free, m, plinks.s.ss); } else error = EBUSY; unlock: VM_OBJECT_WUNLOCK(object); } else { mtx_lock(&vm_page_queue_free_mtx); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it * is the first page in a power-of-two-sized * run of contiguous free pages. Jump ahead * to the last page within that run, and * continue from there. */ m += (1 << order) - 1; } #if VM_NRESERVLEVEL > 0 else if (vm_reserv_is_page_free(m)) order = 0; #endif mtx_unlock(&vm_page_queue_free_mtx); if (order == VM_NFREEORDER) error = EINVAL; } } if (m_mtx != NULL) mtx_unlock(m_mtx); if ((m = SLIST_FIRST(&free)) != NULL) { mtx_lock(&vm_page_queue_free_mtx); do { SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else if (true) #endif vm_phys_free_pages(m, 0); } while ((m = SLIST_FIRST(&free)) != NULL); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } return (error); } #define NRUNS 16 CTASSERT(powerof2(NRUNS)); #define RUN_INDEX(count) ((count) & (NRUNS - 1)) #define MIN_RECLAIM 8 /* * vm_page_reclaim_contig: * * Reclaim allocated, contiguous physical memory satisfying the specified * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. Since * relocation requires the allocation of physical pages, reclamation may * fail due to a shortage of free pages. When reclamation fails, callers * are expected to perform VM_WAIT before retrying a failed allocation * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * The optional allocation flags are ignored. * * "npages" must be greater than zero. Both "alignment" and "boundary" * must be a power of two. */ bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, reclaimed; int error, i, options, req_class; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Return if the number of free pages cannot satisfy the requested * allocation. */ count = vm_cnt.v_free_count; if (count < npages + vm_cnt.v_free_reserved || (count < npages + vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); /* * Scan up to three times, relaxing the restrictions ("options") on * the reclamation of reservations and superpages each time. */ for (options = VPSC_NORESERV;;) { /* * Find the highest runs that satisfy the given constraints * and restrictions, and record them in "m_runs". */ curr_low = low; count = 0; for (;;) { m_run = vm_phys_scan_contig(npages, curr_low, high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); m_runs[RUN_INDEX(count)] = m_run; count++; } /* * Reclaim the highest runs in LIFO (descending) order until * the number of reclaimed pages, "reclaimed", is at least * MIN_RECLAIM. Reset "reclaimed" each time because each * reclamation is idempotent, and runs will (likely) recur * from one scan to the next as restrictions are relaxed. */ reclaimed = 0; for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; error = vm_page_reclaim_run(req_class, npages, m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) return (true); } } /* * Either relax the restrictions on the next scan or return if * the last scan had no restrictions. */ if (options == VPSC_NORESERV) options = VPSC_NOSUPER; else if (options == VPSC_NOSUPER) options = VPSC_ANY; else if (options == VPSC_ANY) return (reclaimed != 0); } } /* * vm_wait: (also see VM_WAIT macro) * * Sleep until free pages are available for allocation. * - Called in various places before memory allocations. */ void vm_wait(void) { mtx_lock(&vm_page_queue_free_mtx); if (curproc == pageproc) { vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, PDROP | PSWP, "VMWait", 0); } else { if (__predict_false(pageproc == NULL)) panic("vm_wait in early boot"); if (!vm_pageout_wanted) { vm_pageout_wanted = true; wakeup(&vm_pageout_wanted); } vm_pages_needed = true; msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, "vmwait", 0); } } /* * vm_waitpfault: (also see VM_WAITPFAULT macro) * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(void) { mtx_lock(&vm_page_queue_free_mtx); if (!vm_pageout_wanted) { vm_pageout_wanted = true; wakeup(&vm_pageout_wanted); } vm_pages_needed = true; msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, "pfault", 0); } struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { if (vm_page_in_laundry(m)) return (&vm_dom[0].vmd_pagequeues[m->queue]); else - return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); + return (&vm_page_domain(m)->vmd_pagequeues[m->queue]); } /* * vm_page_dequeue: * * Remove the given page from its current page queue. * * The page must be locked. */ void vm_page_dequeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_assert_locked(m); KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); vm_pagequeue_unlock(pq); } /* * vm_page_dequeue_locked: * * Remove the given page from its current page queue. * * The page and page queue must be locked. */ void vm_page_dequeue_locked(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); } /* * vm_page_enqueue: * * Add the given page to the specified page queue. * * The page must be locked. */ static void vm_page_enqueue(uint8_t queue, vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(queue < PQ_COUNT, ("vm_page_enqueue: invalid queue %u request for page %p", queue, m)); if (queue == PQ_LAUNDRY || queue == PQ_UNSWAPPABLE) pq = &vm_dom[0].vmd_pagequeues[queue]; else - pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; + pq = &vm_page_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } /* * vm_page_requeue: * * Move the given page to the tail of its current page queue. * * The page must be locked. */ void vm_page_requeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, ("vm_page_requeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_unlock(pq); } /* * vm_page_requeue_locked: * * Move the given page to the tail of its current page queue. * * The page queue must be locked. */ void vm_page_requeue_locked(vm_page_t m) { struct vm_pagequeue *pq; KASSERT(m->queue != PQ_NONE, ("vm_page_requeue_locked: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * Ensure that act_count is at least ACT_INIT but do not otherwise * mess with it. * * The page must be locked. */ void vm_page_activate(vm_page_t m) { int queue; vm_page_lock_assert(m, MA_OWNED); if ((queue = m->queue) != PQ_ACTIVE) { if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; if (queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_ACTIVE, m); } else KASSERT(queue == PQ_NONE, ("vm_page_activate: wired page %p is queued", m)); } else { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; } } /* * vm_page_free_wakeup: * * Helper routine for vm_page_free_toq(). This routine is called * when a page is added to the free queues. * * The page queues must be locked. */ static inline void vm_page_free_wakeup(void) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed && vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = false; wakeup(&vm_cnt.v_free_count); } } /* * vm_page_free_toq: * * Returns the given page to the free list, * disassociating it with any VM object. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_free_toq(vm_page_t m) { if ((m->oflags & VPO_UNMANAGED) == 0) { vm_page_lock_assert(m, MA_OWNED); KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_toq: freeing mapped page %p", m)); } else KASSERT(m->queue == PQ_NONE, ("vm_page_free_toq: unmanaged page %p is queued", m)); VM_CNT_INC(v_tfree); if (vm_page_sbusied(m)) panic("vm_page_free: freeing busy page %p", m); /* * Unqueue, then remove page. Note that we cannot destroy * the page here because we do not want to call the pager's * callback routine until after we've put the page on the * appropriate free queue. */ vm_page_remque(m); vm_page_remove(m); /* * If fictitious remove object association and * return, otherwise delay object association removal. */ if ((m->flags & PG_FICTITIOUS) != 0) { return; } m->valid = 0; vm_page_undirty(m); if (m->wire_count != 0) panic("vm_page_free: freeing wired page %p", m); if (m->hold_count != 0) { m->flags &= ~PG_ZERO; KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("vm_page_free: freeing PG_UNHOLDFREE page %p", m)); m->flags |= PG_UNHOLDFREE; } else { /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); /* * Insert the page into the physical memory allocator's free * page queues. */ mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else if (TRUE) #endif vm_phys_free_pages(m, 0); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * If the page is fictitious, then its wire count must remain one. * * The page must be locked. */ void vm_page_wire(vm_page_t m) { /* * Only bump the wire statistics if the page is not already wired, * and only unqueue the page if it is on some queue (if it is unmanaged * it is already off the queues). */ vm_page_lock_assert(m, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_wire: fictitious page %p's wire count isn't one", m)); return; } if (m->wire_count == 0) { KASSERT((m->oflags & VPO_UNMANAGED) == 0 || m->queue == PQ_NONE, ("vm_page_wire: unmanaged page %p is queued", m)); vm_page_remque(m); atomic_add_int(&vm_cnt.v_wire_count, 1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); } /* * vm_page_unwire: * * Release one wiring of the specified page, potentially allowing it to be * paged out. Returns TRUE if the number of wirings transitions to zero and * FALSE otherwise. * * Only managed pages belonging to an object can be paged out. If the number * of wirings transitions to zero and the page is eligible for page out, then * the page is added to the specified paging queue (unless PQ_NONE is * specified). * * If a page is fictitious, then its wire count must always be one. * * A managed page must be locked. */ boolean_t vm_page_unwire(vm_page_t m, uint8_t queue) { KASSERT(queue < PQ_COUNT || queue == PQ_NONE, ("vm_page_unwire: invalid queue %u request for page %p", queue, m)); if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); return (FALSE); } if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); if ((m->oflags & VPO_UNMANAGED) == 0 && m->object != NULL && queue != PQ_NONE) vm_page_enqueue(queue, m); return (TRUE); } else return (FALSE); } else panic("vm_page_unwire: page %p's wire count is zero", m); } /* * Move the specified page to the inactive queue. * * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive * queue. However, setting "noreuse" to TRUE will accelerate the specified * page's reclamation, but it will not unmap the page from any address space. * This is implemented by inserting the page near the head of the inactive * queue, using a marker page to guide FIFO insertion ordering. * * The page must be locked. */ static inline void _vm_page_deactivate(vm_page_t m, boolean_t noreuse) { struct vm_pagequeue *pq; int queue; vm_page_assert_locked(m); /* * Ignore if the page is already inactive, unless it is unlikely to be * reactivated. */ if ((queue = m->queue) == PQ_INACTIVE && !noreuse) return; if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { - pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE]; + pq = &vm_page_domain(m)->vmd_pagequeues[PQ_INACTIVE]; /* Avoid multiple acquisitions of the inactive queue lock. */ if (queue == PQ_INACTIVE) { vm_pagequeue_lock(pq); vm_page_dequeue_locked(m); } else { if (queue != PQ_NONE) vm_page_dequeue(m); vm_pagequeue_lock(pq); } m->queue = PQ_INACTIVE; if (noreuse) - TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead, + TAILQ_INSERT_BEFORE(&vm_page_domain(m)->vmd_inacthead, m, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } } /* * Move the specified page to the inactive queue. * * The page must be locked. */ void vm_page_deactivate(vm_page_t m) { _vm_page_deactivate(m, FALSE); } /* * Move the specified page to the inactive queue with the expectation * that it is unlikely to be reused. * * The page must be locked. */ void vm_page_deactivate_noreuse(vm_page_t m) { _vm_page_deactivate(m, TRUE); } /* * vm_page_launder * * Put a page in the laundry. */ void vm_page_launder(vm_page_t m) { int queue; vm_page_assert_locked(m); if ((queue = m->queue) != PQ_LAUNDRY) { if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_LAUNDRY, m); } else KASSERT(queue == PQ_NONE, ("wired page %p is queued", m)); } } /* * vm_page_unswappable * * Put a page in the PQ_UNSWAPPABLE holding queue. */ void vm_page_unswappable(vm_page_t m) { vm_page_assert_locked(m); KASSERT(m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0, ("page %p already unswappable", m)); if (m->queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_UNSWAPPABLE, m); } /* * vm_page_try_to_free() * * Attempt to free the page. If we cannot free it, we do nothing. * 1 is returned on success, 0 on failure. */ int vm_page_try_to_free(vm_page_t m) { vm_page_lock_assert(m, MA_OWNED); if (m->object != NULL) VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty || m->hold_count || m->wire_count || (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) return (0); pmap_remove_all(m); if (m->dirty) return (0); vm_page_free(m); return (1); } /* * vm_page_advise * * Apply the specified advice to the given page. * * The object and page must be locked. */ void vm_page_advise(vm_page_t m, int advice) { vm_page_assert_locked(m); VM_OBJECT_ASSERT_WLOCKED(m->object); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed * without first paging it out. MADV_FREE pages are often * quickly reused by malloc(3), so we do not do anything that * would result in a page fault on a later access. */ vm_page_undirty(m); else if (advice != MADV_DONTNEED) { if (advice == MADV_WILLNEED) vm_page_activate(m); return; } /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that * the page will be reused quickly. Dirty pages not already in the * laundry are moved there. */ if (m->dirty == 0) vm_page_deactivate_noreuse(m); else vm_page_launder(m); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int sleep; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); vm_page_lock(m); VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "pgrbwt", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); goto retrylookup; } else { if ((allocflags & VM_ALLOC_WIRED) != 0) { vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); } if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); return (m); } } m = vm_page_alloc(object, pindex, allocflags); if (m == NULL) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); VM_OBJECT_WUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(object); goto retrylookup; } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } /* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. */ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } /* * vm_page_set_valid_range: * * Sets portions of a page valid. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ m->valid |= vm_page_bits(base, size); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { uintptr_t addr; #if PAGE_SIZE < 16384 int shift; #endif /* * If the object is locked and the page is neither exclusive busy nor * write mapped, then the page's dirty field cannot possibly be * set by a concurrent pmap operation. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else { /* * The pmap layer can call vm_page_dirty() without * holding a distinguished lock. The combination of * the object's lock and an atomic operation suffice * to guarantee consistency of the page dirty field. * * For PAGE_SIZE == 32768 case, compiler already * properly aligns the dirty field, so no forcible * alignment is needed. Only require existence of * atomic_clear_64 when page size is 32768. */ addr = (uintptr_t)&m->dirty; #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)addr, pagebits); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)addr, pagebits); #else /* PAGE_SIZE <= 8192 */ /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_clear_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, pagebits << shift); #endif /* PAGE_SIZE */ } } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the VPO_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; m->oflags &= ~VPO_NOSYNC; } else if (oldvalid != VM_PAGE_BITS_ALL) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); m->valid &= ~bits; m->dirty &= ~bits; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_ASSERT_WLOCKED(m->object); /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zeroed by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; VM_OBJECT_ASSERT_LOCKED(m->object); bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * Returns true if all of the specified predicates are true for the entire * (super)page and false otherwise. */ bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m) { vm_object_t object; int i, npages; object = m->object; VM_OBJECT_ASSERT_LOCKED(object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { /* Always test object consistency, including "skip_m". */ if (m[i].object != object) return (false); if (&m[i] == skip_m) continue; if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i])) return (false); if ((flags & PS_ALL_DIRTY) != 0) { /* * Calling vm_page_test_dirty() or pmap_is_modified() * might stop this case from spuriously returning * "false". However, that would require a write lock * on the object containing "m[i]". */ if (m[i].dirty != VM_PAGE_BITS_ALL) return (false); } if ((flags & PS_ALL_VALID) != 0 && m[i].valid != VM_PAGE_BITS_ALL) return (false); } return (true); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of the containing object's lock or the exclusive busy. * holder. Unfortunately, the holder of the write busy is * not recorded, and thus cannot be checked here. */ if (m->object != NULL && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_WLOCKED(m->object); } void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits) { if ((bits & PGA_WRITEABLE) == 0) return; /* * The PGA_WRITEABLE flag can only be set if the page is * managed, is exclusively busied or the object is locked. * Currently, this flag is only set by pmap_enter(). */ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("PGA_WRITEABLE on unmanaged page")); if (!vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count); db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count); db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count); db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count); db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int dom; db_printf("pq_free %d\n", vm_cnt.v_free_count); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags, m->flags, m->act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ Index: projects/numa2/sys/vm/vm_page.h =================================================================== --- projects/numa2/sys/vm/vm_page.h (revision 321505) +++ projects/numa2/sys/vm/vm_page.h (revision 321506) @@ -1,734 +1,738 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Resident memory system definitions. */ #ifndef _VM_PAGE_ #define _VM_PAGE_ #include /* * Management of resident (logical) pages. * * A small structure is kept for each resident * page, indexed by page number. Each structure * is an element of several collections: * * A radix tree used to quickly * perform object/offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at * time of deallocation. * * An ordered list of pages due for pageout. * * In addition, the structure contains the object * and offset to which this page belongs (for pageout), * and sundry status bits. * * In general, operations on this structure's mutable fields are * synchronized using either one of or a combination of the lock on the * object that the page belongs to (O), the pool lock for the page (P), * or the lock for either the free or paging queue (Q). If a field is * annotated below with two of these locks, then holding either lock is * sufficient for read access, but both locks are required for write * access. * * In contrast, the synchronization of accesses to the page's * dirty field is machine dependent (M). In the * machine-independent layer, the lock on the object that the * page belongs to must be held in order to operate on the field. * However, the pmap layer is permitted to set all bits within * the field without holding that lock. If the underlying * architecture does not support atomic read-modify-write * operations on the field's type, then the machine-independent * layer uses a 32-bit atomic on the aligned 32-bit word that * contains the dirty field. In the machine-independent layer, * the implementation of read-modify-write operations on the * field is encapsulated in vm_page_clear_dirty_mask(). */ #if PAGE_SIZE == 4096 #define VM_PAGE_BITS_ALL 0xffu typedef uint8_t vm_page_bits_t; #elif PAGE_SIZE == 8192 #define VM_PAGE_BITS_ALL 0xffffu typedef uint16_t vm_page_bits_t; #elif PAGE_SIZE == 16384 #define VM_PAGE_BITS_ALL 0xffffffffu typedef uint32_t vm_page_bits_t; #elif PAGE_SIZE == 32768 #define VM_PAGE_BITS_ALL 0xfffffffffffffffflu typedef uint64_t vm_page_bits_t; #endif struct vm_page { union { TAILQ_ENTRY(vm_page) q; /* page queue or free list (Q) */ struct { SLIST_ENTRY(vm_page) ss; /* private slists */ void *pv; } s; struct { u_long p; u_long v; } memguard; } plinks; TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ vm_object_t object; /* which object am I in (O,P) */ vm_pindex_t pindex; /* offset into object (O,P) */ vm_paddr_t phys_addr; /* physical address of page */ struct md_page md; /* machine dependent stuff */ u_int wire_count; /* wired down maps refs (P) */ volatile u_int busy_lock; /* busy owners lock */ uint16_t hold_count; /* page hold count (P) */ uint16_t flags; /* page PG_* flags (P) */ uint8_t aflags; /* access is atomic */ uint8_t oflags; /* page VPO_* flags (O) */ uint8_t queue; /* page queue index (P,Q) */ int8_t psind; /* pagesizes[] index (O) */ int8_t segind; uint8_t order; /* index of the buddy queue */ uint8_t pool; u_char act_count; /* page usage count (P) */ /* NOTE that these must support one bit per DEV_BSIZE in a page */ /* so, on normal X86 kernels, they must be at least 8 bits wide */ vm_page_bits_t valid; /* map of valid DEV_BSIZE chunks (O) */ vm_page_bits_t dirty; /* map of dirty DEV_BSIZE chunks (M) */ }; /* * Page flags stored in oflags: * * Access to these page flags is synchronized by the lock on the object * containing the page (O). * * Note: VPO_UNMANAGED (used by OBJT_DEVICE, OBJT_PHYS and OBJT_SG) * indicates that the page is not under PV management but * otherwise should be treated as a normal page. Pages not * under PV management cannot be paged out via the * object/vm_page_t because there is no knowledge of their pte * mappings, and such pages are also not on any PQ queue. * */ #define VPO_UNUSED01 0x01 /* --available-- */ #define VPO_SWAPSLEEP 0x02 /* waiting for swap to finish */ #define VPO_UNMANAGED 0x04 /* no PV management for page */ #define VPO_SWAPINPROG 0x08 /* swap I/O in progress on page */ #define VPO_NOSYNC 0x10 /* do not collect for syncer */ /* * Busy page implementation details. * The algorithm is taken mostly by rwlock(9) and sx(9) locks implementation, * even if the support for owner identity is removed because of size * constraints. Checks on lock recursion are then not possible, while the * lock assertions effectiveness is someway reduced. */ #define VPB_BIT_SHARED 0x01 #define VPB_BIT_EXCLUSIVE 0x02 #define VPB_BIT_WAITERS 0x04 #define VPB_BIT_FLAGMASK \ (VPB_BIT_SHARED | VPB_BIT_EXCLUSIVE | VPB_BIT_WAITERS) #define VPB_SHARERS_SHIFT 3 #define VPB_SHARERS(x) \ (((x) & ~VPB_BIT_FLAGMASK) >> VPB_SHARERS_SHIFT) #define VPB_SHARERS_WORD(x) ((x) << VPB_SHARERS_SHIFT | VPB_BIT_SHARED) #define VPB_ONE_SHARER (1 << VPB_SHARERS_SHIFT) #define VPB_SINGLE_EXCLUSIVER VPB_BIT_EXCLUSIVE #define VPB_UNBUSIED VPB_SHARERS_WORD(0) #define PQ_NONE 255 #define PQ_INACTIVE 0 #define PQ_ACTIVE 1 #define PQ_LAUNDRY 2 #define PQ_UNSWAPPABLE 3 #define PQ_COUNT 4 TAILQ_HEAD(pglist, vm_page); SLIST_HEAD(spglist, vm_page); struct vm_pagequeue { struct mtx pq_mutex; struct pglist pq_pl; int pq_cnt; u_int * const pq_vcnt; const char * const pq_name; } __aligned(CACHE_LINE_SIZE); struct vm_domain { struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; u_int vmd_page_count; u_int vmd_free_count; long vmd_segs; /* bitmask of the segments */ boolean_t vmd_oom; int vmd_oom_seq; int vmd_last_active_scan; struct vm_page vmd_laundry_marker; struct vm_page vmd_marker; /* marker for pagedaemon private use */ struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */ }; extern struct vm_domain vm_dom[MAXMEMDOM]; #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) #define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex) #define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) #ifdef _KERNEL extern vm_page_t bogus_page; static __inline void vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend) { #ifdef notyet vm_pagequeue_assert_locked(pq); #endif pq->pq_cnt += addend; atomic_add_int(pq->pq_vcnt, addend); } #define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1) #define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1) #endif /* _KERNEL */ extern struct mtx_padalign vm_page_queue_free_mtx; extern struct mtx_padalign pa_lock[]; #if defined(__arm__) #define PDRSHIFT PDR_SHIFT #elif !defined(PDRSHIFT) #define PDRSHIFT 21 #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define PA_LOCKPTR(pa) ((struct mtx *)(&pa_lock[pa_index(pa) % PA_LOCK_COUNT])) #define PA_LOCKOBJPTR(pa) ((struct lock_object *)PA_LOCKPTR((pa))) #define PA_LOCK(pa) mtx_lock(PA_LOCKPTR(pa)) #define PA_TRYLOCK(pa) mtx_trylock(PA_LOCKPTR(pa)) #define PA_UNLOCK(pa) mtx_unlock(PA_LOCKPTR(pa)) #define PA_UNLOCK_COND(pa) \ do { \ if ((pa) != 0) { \ PA_UNLOCK((pa)); \ (pa) = 0; \ } \ } while (0) #define PA_LOCK_ASSERT(pa, a) mtx_assert(PA_LOCKPTR(pa), (a)) #ifdef KLD_MODULE #define vm_page_lock(m) vm_page_lock_KBI((m), LOCK_FILE, LOCK_LINE) #define vm_page_unlock(m) vm_page_unlock_KBI((m), LOCK_FILE, LOCK_LINE) #define vm_page_trylock(m) vm_page_trylock_KBI((m), LOCK_FILE, LOCK_LINE) #else /* !KLD_MODULE */ #define vm_page_lockptr(m) (PA_LOCKPTR(VM_PAGE_TO_PHYS((m)))) #define vm_page_lock(m) mtx_lock(vm_page_lockptr((m))) #define vm_page_unlock(m) mtx_unlock(vm_page_lockptr((m))) #define vm_page_trylock(m) mtx_trylock(vm_page_lockptr((m))) #endif #if defined(INVARIANTS) #define vm_page_assert_locked(m) \ vm_page_assert_locked_KBI((m), __FILE__, __LINE__) #define vm_page_lock_assert(m, a) \ vm_page_lock_assert_KBI((m), (a), __FILE__, __LINE__) #else #define vm_page_assert_locked(m) #define vm_page_lock_assert(m, a) #endif /* * The vm_page's aflags are updated using atomic operations. To set or clear * these flags, the functions vm_page_aflag_set() and vm_page_aflag_clear() * must be used. Neither these flags nor these functions are part of the KBI. * * PGA_REFERENCED may be cleared only if the page is locked. It is set by * both the MI and MD VM layers. However, kernel loadable modules should not * directly set this flag. They should call vm_page_reference() instead. * * PGA_WRITEABLE is set exclusively on managed pages by pmap_enter(). * When it does so, the object must be locked, or the page must be * exclusive busied. The MI VM layer must never access this flag * directly. Instead, it should call pmap_page_is_write_mapped(). * * PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has * at least one executable mapping. It is not consumed by the MI VM layer. */ #define PGA_WRITEABLE 0x01 /* page may be mapped writeable */ #define PGA_REFERENCED 0x02 /* page has been referenced */ #define PGA_EXECUTABLE 0x04 /* page may be mapped executable */ /* * Page flags. If changed at any other time than page allocation or * freeing, the modification must be protected by the vm_page lock. */ #define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */ #define PG_ZERO 0x0008 /* page is zeroed */ #define PG_MARKER 0x0010 /* special queue marker page */ #define PG_NODUMP 0x0080 /* don't include this page in a dump */ #define PG_UNHOLDFREE 0x0100 /* delayed free of a held page */ /* * Misc constants. */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_INIT 5 #define ACT_MAX 64 #ifdef _KERNEL #include #include /* * Each pageable resident page falls into one of five lists: * * free * Available for allocation now. * * inactive * Low activity, candidates for reclamation. * This list is approximately LRU ordered. * * laundry * This is the list of pages that should be * paged out next. * * unswappable * Dirty anonymous pages that cannot be paged * out because no swap device is configured. * * active * Pages that are "active", i.e., they have been * recently referenced. * */ extern int vm_page_zero_count; extern vm_page_t vm_page_array; /* First resident page in table */ extern long vm_page_array_size; /* number of vm_page_t's */ extern long first_page; /* first physical page number */ #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) /* * PHYS_TO_VM_PAGE() returns the vm_page_t object that represents a memory * page to which the given physical address belongs. The correct vm_page_t * object is returned for addresses that are not page-aligned. */ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa); /* * Page allocation parameters for vm_page for the functions * vm_page_alloc(), vm_page_grab(), vm_page_alloc_contig() and * vm_page_alloc_freelist(). Some functions support only a subset * of the flags, and ignore others, see the flags legend. * * Bits 0 - 1 define class. * Bits 2 - 15 dedicated for flags. * Legend: * (a) - vm_page_alloc() supports the flag. * (c) - vm_page_alloc_contig() supports the flag. * (f) - vm_page_alloc_freelist() supports the flag. * (g) - vm_page_grab() supports the flag. * Bits above 15 define the count of additional pages that the caller * intends to allocate. */ #define VM_ALLOC_NORMAL 0 #define VM_ALLOC_INTERRUPT 1 #define VM_ALLOC_SYSTEM 2 #define VM_ALLOC_CLASS_MASK 3 #define VM_ALLOC_WIRED 0x0020 /* (acfg) Allocate non pageable page */ #define VM_ALLOC_ZERO 0x0040 /* (acfg) Try to obtain a zeroed page */ #define VM_ALLOC_NOOBJ 0x0100 /* (acg) No associated object */ #define VM_ALLOC_NOBUSY 0x0200 /* (acg) Do not busy the page */ #define VM_ALLOC_IGN_SBUSY 0x1000 /* (g) Ignore shared busy flag */ #define VM_ALLOC_NODUMP 0x2000 /* (ag) don't include in dump */ #define VM_ALLOC_SBUSY 0x4000 /* (acg) Shared busy the page */ #define VM_ALLOC_NOWAIT 0x8000 /* (g) Do not sleep, return NULL */ #define VM_ALLOC_COUNT_SHIFT 16 #define VM_ALLOC_COUNT(count) ((count) << VM_ALLOC_COUNT_SHIFT) #ifdef M_NOWAIT static inline int malloc2vm_flags(int malloc_flags) { int pflags; KASSERT((malloc_flags & M_USE_RESERVE) == 0 || (malloc_flags & M_NOWAIT) != 0, ("M_USE_RESERVE requires M_NOWAIT")); pflags = (malloc_flags & M_USE_RESERVE) != 0 ? VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM; if ((malloc_flags & M_ZERO) != 0) pflags |= VM_ALLOC_ZERO; if ((malloc_flags & M_NODUMP) != 0) pflags |= VM_ALLOC_NODUMP; return (pflags); } #endif /* * Predicates supported by vm_page_ps_test(): * * PS_ALL_DIRTY is true only if the entire (super)page is dirty. * However, it can be spuriously false when the (super)page has become * dirty in the pmap but that information has not been propagated to the * machine-independent layer. */ #define PS_ALL_DIRTY 0x1 #define PS_ALL_VALID 0x2 #define PS_NONE_BUSY 0x4 void vm_page_busy_downgrade(vm_page_t m); void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared); void vm_page_flash(vm_page_t m); void vm_page_hold(vm_page_t mem); void vm_page_unhold(vm_page_t mem); void vm_page_free(vm_page_t m); void vm_page_free_zero(vm_page_t m); void vm_page_activate (vm_page_t); void vm_page_advise(vm_page_t m, int advice); -vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int); +vm_page_t vm_page_alloc(vm_object_t, vm_pindex_t, int); vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); -vm_page_t vm_page_alloc_freelist(int, int); +vm_page_t vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, + int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, + u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); +vm_page_t vm_page_alloc_domain(vm_object_t, vm_pindex_t, int, int); +vm_page_t vm_page_alloc_freelist(int, int, int); vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int); int vm_page_try_to_free (vm_page_t); void vm_page_deactivate (vm_page_t); void vm_page_deactivate_noreuse(vm_page_t); void vm_page_dequeue(vm_page_t m); void vm_page_dequeue_locked(vm_page_t m); vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); void vm_page_launder(vm_page_t m); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *); struct vm_pagequeue *vm_page_pagequeue(vm_page_t m); vm_page_t vm_page_prev(vm_page_t m); bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m); void vm_page_putfake(vm_page_t m); void vm_page_readahead_finish(vm_page_t m); bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); void vm_page_reference(vm_page_t m); void vm_page_remove (vm_page_t); int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t); vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex); void vm_page_requeue(vm_page_t m); void vm_page_requeue_locked(vm_page_t m); int vm_page_sbusied(vm_page_t m); vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options); void vm_page_set_valid_range(vm_page_t m, int base, int size); int vm_page_sleep_if_busy(vm_page_t m, const char *msg); vm_offset_t vm_page_startup(vm_offset_t vaddr); void vm_page_sunbusy(vm_page_t m); int vm_page_trysbusy(vm_page_t m); void vm_page_unhold_pages(vm_page_t *ma, int count); void vm_page_unswappable(vm_page_t m); boolean_t vm_page_unwire(vm_page_t m, uint8_t queue); void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_wire (vm_page_t); void vm_page_xunbusy_hard(vm_page_t m); void vm_page_xunbusy_maybelocked(vm_page_t m); void vm_page_set_validclean (vm_page_t, int, int); void vm_page_clear_dirty (vm_page_t, int, int); void vm_page_set_invalid (vm_page_t, int, int); int vm_page_is_valid (vm_page_t, int, int); void vm_page_test_dirty (vm_page_t); vm_page_bits_t vm_page_bits(int base, int size); void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_toq(vm_page_t m); void vm_page_dirty_KBI(vm_page_t m); void vm_page_lock_KBI(vm_page_t m, const char *file, int line); void vm_page_unlock_KBI(vm_page_t m, const char *file, int line); int vm_page_trylock_KBI(vm_page_t m, const char *file, int line); #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line); void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line); #endif #define vm_page_assert_sbusied(m) \ KASSERT(vm_page_sbusied(m), \ ("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_assert_unbusied(m) \ KASSERT(!vm_page_busied(m), \ ("vm_page_assert_unbusied: page %p busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_assert_xbusied(m) \ KASSERT(vm_page_xbusied(m), \ ("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_busied(m) \ ((m)->busy_lock != VPB_UNBUSIED) #define vm_page_sbusy(m) do { \ if (!vm_page_trysbusy(m)) \ panic("%s: page %p failed shared busying", __func__, \ (m)); \ } while (0) #define vm_page_tryxbusy(m) \ (atomic_cmpset_acq_int(&(m)->busy_lock, VPB_UNBUSIED, \ VPB_SINGLE_EXCLUSIVER)) #define vm_page_xbusied(m) \ (((m)->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0) #define vm_page_xbusy(m) do { \ if (!vm_page_tryxbusy(m)) \ panic("%s: page %p failed exclusive busying", __func__, \ (m)); \ } while (0) /* Note: page m's lock must not be owned by the caller. */ #define vm_page_xunbusy(m) do { \ if (!atomic_cmpset_rel_int(&(m)->busy_lock, \ VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) \ vm_page_xunbusy_hard(m); \ } while (0) #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m); #define VM_PAGE_OBJECT_LOCK_ASSERT(m) vm_page_object_lock_assert(m) void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits); #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) \ vm_page_assert_pga_writeable(m, bits) #else #define VM_PAGE_OBJECT_LOCK_ASSERT(m) (void)0 #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) (void)0 #endif /* * We want to use atomic updates for the aflags field, which is 8 bits wide. * However, not all architectures support atomic operations on 8-bit * destinations. In order that we can easily use a 32-bit operation, we * require that the aflags field be 32-bit aligned. */ CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0); /* * Clear the given bits in the specified page. */ static inline void vm_page_aflag_clear(vm_page_t m, uint8_t bits) { uint32_t *addr, val; /* * The PGA_REFERENCED flag can only be cleared if the page is locked. */ if ((bits & PGA_REFERENCED) != 0) vm_page_assert_locked(m); /* * Access the whole 32-bit word containing the aflags field with an * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ addr = (void *)&m->aflags; KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0, ("vm_page_aflag_clear: aflags is misaligned")); val = bits; #if BYTE_ORDER == BIG_ENDIAN val <<= 24; #endif atomic_clear_32(addr, val); } /* * Set the given bits in the specified page. */ static inline void vm_page_aflag_set(vm_page_t m, uint8_t bits) { uint32_t *addr, val; VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits); /* * Access the whole 32-bit word containing the aflags field with an * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ addr = (void *)&m->aflags; KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0, ("vm_page_aflag_set: aflags is misaligned")); val = bits; #if BYTE_ORDER == BIG_ENDIAN val <<= 24; #endif atomic_set_32(addr, val); } /* * vm_page_dirty: * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). */ static __inline void vm_page_dirty(vm_page_t m) { /* Use vm_page_dirty_KBI() under INVARIANTS to save memory. */ #if defined(KLD_MODULE) || defined(INVARIANTS) vm_page_dirty_KBI(m); #else m->dirty = VM_PAGE_BITS_ALL; #endif } /* * vm_page_remque: * * If the given page is in a page queue, then remove it from that page * queue. * * The page must be locked. */ static inline void vm_page_remque(vm_page_t m) { if (m->queue != PQ_NONE) vm_page_dequeue(m); } /* * vm_page_undirty: * * Set page to not be dirty. Note: does not clear pmap modify bits */ static __inline void vm_page_undirty(vm_page_t m) { VM_PAGE_OBJECT_LOCK_ASSERT(m); m->dirty = 0; } static inline void vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold) { vm_page_t mret; mret = vm_page_replace(mnew, object, pindex); KASSERT(mret == mold, ("invalid page replacement, mold=%p, mret=%p", mold, mret)); /* Unused if !INVARIANTS. */ (void)mold; (void)mret; } static inline bool vm_page_active(vm_page_t m) { return (m->queue == PQ_ACTIVE); } static inline bool vm_page_inactive(vm_page_t m) { return (m->queue == PQ_INACTIVE); } static inline bool vm_page_in_laundry(vm_page_t m) { return (m->queue == PQ_LAUNDRY || m->queue == PQ_UNSWAPPABLE); } #endif /* _KERNEL */ #endif /* !_VM_PAGE_ */ Index: projects/numa2/sys/vm/vm_phys.c =================================================================== --- projects/numa2/sys/vm/vm_phys.c (revision 321505) +++ projects/numa2/sys/vm/vm_phys.c (revision 321506) @@ -1,1481 +1,1424 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Physical memory system implementation * * Any external functions defined by this module are only to be used by the * virtual memory system. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, "Too many physsegs."); #ifdef VM_NUMA_ALLOC struct mem_affinity *mem_affinity; int *mem_locality; #endif int vm_ndomains = 1; struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX]; int vm_phys_nsegs; struct vm_phys_fictitious_seg; static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *, struct vm_phys_fictitious_seg *); RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree = RB_INITIALIZER(_vm_phys_fictitious_tree); struct vm_phys_fictitious_seg { RB_ENTRY(vm_phys_fictitious_seg) node; /* Memory region data */ vm_paddr_t start; vm_paddr_t end; vm_page_t first_page; }; RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node, vm_phys_fictitious_cmp); static struct rwlock vm_phys_fictitious_reg_lock; MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages"); static struct vm_freelist vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER]; static int vm_nfreelists; /* * Provides the mapping from VM_FREELIST_* to free list indices (flind). */ static int vm_freelist_to_flind[VM_NFREELIST]; CTASSERT(VM_FREELIST_DEFAULT == 0); #ifdef VM_FREELIST_ISADMA #define VM_ISADMA_BOUNDARY 16777216 #endif #ifdef VM_FREELIST_DMA32 #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32) #endif /* * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about * the ordering of the free list boundaries. */ #if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY) CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY); #endif #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY) CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY); #endif static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info"); static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info"); #ifdef VM_NUMA_ALLOC static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info"); #endif SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, &vm_ndomains, 0, "Number of physical memory domains available."); /* * Default to first-touch + round-robin. */ static struct mtx vm_default_policy_mtx; MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex", MTX_DEF); #ifdef VM_NUMA_ALLOC -static struct vm_domain_policy vm_default_policy = +static struct vm_domain_policy vm_default_policy_storage = VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); #else /* Use round-robin so the domain policy code will only try once per allocation */ -static struct vm_domain_policy vm_default_policy = +static struct vm_domain_policy vm_default_policy_storage = VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0); #endif -static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool, - int order); +struct vm_domain_policy *vm_default_policy = &vm_default_policy_storage; + static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); static int vm_phys_paddr_to_segind(vm_paddr_t pa); static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order); static int sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS) { char policy_name[32]; int error; mtx_lock(&vm_default_policy_mtx); /* Map policy to output string */ - switch (vm_default_policy.p.policy) { + switch (vm_default_policy->p.policy) { case VM_POLICY_FIRST_TOUCH: strcpy(policy_name, "first-touch"); break; case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: strcpy(policy_name, "first-touch-rr"); break; case VM_POLICY_ROUND_ROBIN: default: strcpy(policy_name, "rr"); break; } mtx_unlock(&vm_default_policy_mtx); error = sysctl_handle_string(oidp, &policy_name[0], sizeof(policy_name), req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&vm_default_policy_mtx); /* Set: match on the subset of policies that make sense as a default */ if (strcmp("first-touch-rr", policy_name) == 0) { - vm_domain_policy_set(&vm_default_policy, + vm_domain_policy_set(vm_default_policy, VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); } else if (strcmp("first-touch", policy_name) == 0) { - vm_domain_policy_set(&vm_default_policy, + vm_domain_policy_set(vm_default_policy, VM_POLICY_FIRST_TOUCH, 0); } else if (strcmp("rr", policy_name) == 0) { - vm_domain_policy_set(&vm_default_policy, + vm_domain_policy_set(vm_default_policy, VM_POLICY_ROUND_ROBIN, 0); } else { error = EINVAL; goto finish; } error = 0; finish: mtx_unlock(&vm_default_policy_mtx); return (error); } SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_vm_default_policy, "A", "Default policy (rr, first-touch, first-touch-rr"); /* * Red-black tree helpers for vm fictitious range management. */ static inline int vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, struct vm_phys_fictitious_seg *range) { KASSERT(range->start != 0 && range->end != 0, ("Invalid range passed on search for vm_fictitious page")); if (p->start >= range->end) return (1); if (p->start < range->start) return (-1); return (0); } static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, struct vm_phys_fictitious_seg *p2) { /* Check if this is a search for a page */ if (p1->end == 0) return (vm_phys_fictitious_in_range(p1, p2)); KASSERT(p2->end != 0, ("Invalid range passed as second parameter to vm fictitious comparison")); /* Searching to add a new range */ if (p1->end <= p2->start) return (-1); if (p1->start >= p2->end) return (1); panic("Trying to add overlapping vm fictitious ranges:\n" "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); } -#ifdef notyet -static __inline int -vm_rr_selectdomain(void) -{ -#ifdef VM_NUMA_ALLOC - struct thread *td; - - td = curthread; - - td->td_dom_rr_idx++; - td->td_dom_rr_idx %= vm_ndomains; - return (td->td_dom_rr_idx); -#else - return (0); -#endif -} -#endif /* notyet */ - +#if 0 /* * Initialise a VM domain iterator. * * Check the thread policy, then the proc policy, * then default to the system policy. * * Later on the various layers will have this logic * plumbed into them and the phys code will be explicitly * handed a VM domain policy to use. */ static void vm_policy_iterator_init(struct vm_domain_iterator *vi) { #ifdef VM_NUMA_ALLOC struct vm_domain_policy lcl; #endif vm_domain_iterator_init(vi); #ifdef VM_NUMA_ALLOC /* Copy out the thread policy */ vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy); if (lcl.p.policy != VM_POLICY_NONE) { /* Thread policy is present; use it */ vm_domain_iterator_set_policy(vi, &lcl); return; } vm_domain_policy_localcopy(&lcl, &curthread->td_proc->p_vm_dom_policy); if (lcl.p.policy != VM_POLICY_NONE) { /* Process policy is present; use it */ vm_domain_iterator_set_policy(vi, &lcl); return; } #endif /* Use system default policy */ - vm_domain_iterator_set_policy(vi, &vm_default_policy); + vm_domain_iterator_set_policy(vi, vm_default_policy); } static void vm_policy_iterator_finish(struct vm_domain_iterator *vi) { vm_domain_iterator_cleanup(vi); } +#endif boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high) { struct vm_phys_seg *s; int idx; while ((idx = ffsl(mask)) != 0) { idx--; /* ffsl counts from 1 */ mask &= ~(1UL << idx); s = &vm_phys_segs[idx]; if (low < s->end && high > s->start) return (TRUE); } return (FALSE); } /* * Outputs the state of the physical memory allocator, specifically, * the amount of physical memory in each free list. */ static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct vm_freelist *fl; int dom, error, flind, oind, pind; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); for (dom = 0; dom < vm_ndomains; dom++) { sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); for (flind = 0; flind < vm_nfreelists; flind++) { sbuf_printf(&sbuf, "\nFREE LIST %d:\n" "\n ORDER (SIZE) | NUMBER" "\n ", flind); for (pind = 0; pind < VM_NFREEPOOL; pind++) sbuf_printf(&sbuf, " | POOL %d", pind); sbuf_printf(&sbuf, "\n-- "); for (pind = 0; pind < VM_NFREEPOOL; pind++) sbuf_printf(&sbuf, "-- -- "); sbuf_printf(&sbuf, "--\n"); for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { sbuf_printf(&sbuf, " %2d (%6dK)", oind, 1 << (PAGE_SHIFT - 10 + oind)); for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; sbuf_printf(&sbuf, " | %6d", fl[oind].lcnt); } sbuf_printf(&sbuf, "\n"); } } } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Outputs the set of physical memory segments. */ static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct vm_phys_seg *seg; int error, segind; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); for (segind = 0; segind < vm_phys_nsegs; segind++) { sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); seg = &vm_phys_segs[segind]; sbuf_printf(&sbuf, "start: %#jx\n", (uintmax_t)seg->start); sbuf_printf(&sbuf, "end: %#jx\n", (uintmax_t)seg->end); sbuf_printf(&sbuf, "domain: %d\n", seg->domain); sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Return affinity, or -1 if there's no affinity information. */ int vm_phys_mem_affinity(int f, int t) { #ifdef VM_NUMA_ALLOC if (mem_locality == NULL) return (-1); if (f >= vm_ndomains || t >= vm_ndomains) return (-1); return (mem_locality[f * vm_ndomains + t]); #else return (-1); #endif } #ifdef VM_NUMA_ALLOC /* * Outputs the VM locality table. */ static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; int error, i, j; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\n"); for (i = 0; i < vm_ndomains; i++) { sbuf_printf(&sbuf, "%d: ", i); for (j = 0; j < vm_ndomains; j++) { sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); } sbuf_printf(&sbuf, "\n"); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } #endif static void vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) { m->order = order; if (tail) TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q); else TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q); fl[order].lcnt++; } static void vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) { TAILQ_REMOVE(&fl[order].pl, m, plinks.q); fl[order].lcnt--; m->order = VM_NFREEORDER; } /* * Create a physical memory segment. */ static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) { struct vm_phys_seg *seg; KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX, ("vm_phys_create_seg: increase VM_PHYSSEG_MAX")); KASSERT(domain < vm_ndomains, ("vm_phys_create_seg: invalid domain provided")); seg = &vm_phys_segs[vm_phys_nsegs++]; while (seg > vm_phys_segs && (seg - 1)->start >= end) { *seg = *(seg - 1); seg--; } seg->start = start; seg->end = end; seg->domain = domain; } static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) { #ifdef VM_NUMA_ALLOC int i; if (mem_affinity == NULL) { _vm_phys_create_seg(start, end, 0); return; } for (i = 0;; i++) { if (mem_affinity[i].end == 0) panic("Reached end of affinity info"); if (mem_affinity[i].end <= start) continue; if (mem_affinity[i].start > start) panic("No affinity info for start %jx", (uintmax_t)start); if (mem_affinity[i].end >= end) { _vm_phys_create_seg(start, end, mem_affinity[i].domain); break; } _vm_phys_create_seg(start, mem_affinity[i].end, mem_affinity[i].domain); start = mem_affinity[i].end; } #else _vm_phys_create_seg(start, end, 0); #endif } /* * Add a physical memory segment. */ void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) { vm_paddr_t paddr; KASSERT((start & PAGE_MASK) == 0, ("vm_phys_define_seg: start is not page aligned")); KASSERT((end & PAGE_MASK) == 0, ("vm_phys_define_seg: end is not page aligned")); /* * Split the physical memory segment if it spans two or more free * list boundaries. */ paddr = start; #ifdef VM_FREELIST_ISADMA if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) { vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY); paddr = VM_ISADMA_BOUNDARY; } #endif #ifdef VM_FREELIST_LOWMEM if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); paddr = VM_LOWMEM_BOUNDARY; } #endif #ifdef VM_FREELIST_DMA32 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); paddr = VM_DMA32_BOUNDARY; } #endif vm_phys_create_seg(paddr, end); } /* * Initialize the physical memory allocator. * * Requires that vm_page_array is initialized! */ void vm_phys_init(void) { struct vm_freelist *fl; struct vm_phys_seg *seg; u_long npages; int dom, flind, freelist, oind, pind, segind; /* * Compute the number of free lists, and generate the mapping from the * manifest constants VM_FREELIST_* to the free list indices. * * Initially, the entries of vm_freelist_to_flind[] are set to either * 0 or 1 to indicate which free lists should be created. */ npages = 0; for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { seg = &vm_phys_segs[segind]; #ifdef VM_FREELIST_ISADMA if (seg->end <= VM_ISADMA_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1; else #endif #ifdef VM_FREELIST_LOWMEM if (seg->end <= VM_LOWMEM_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; else #endif #ifdef VM_FREELIST_DMA32 if ( #ifdef VM_DMA32_NPAGES_THRESHOLD /* * Create the DMA32 free list only if the amount of * physical memory above physical address 4G exceeds the * given threshold. */ npages > VM_DMA32_NPAGES_THRESHOLD && #endif seg->end <= VM_DMA32_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; else #endif { npages += atop(seg->end - seg->start); vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; } } /* Change each entry into a running total of the free lists. */ for (freelist = 1; freelist < VM_NFREELIST; freelist++) { vm_freelist_to_flind[freelist] += vm_freelist_to_flind[freelist - 1]; } vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); /* Change each entry into a free list index. */ for (freelist = 0; freelist < VM_NFREELIST; freelist++) vm_freelist_to_flind[freelist]--; /* * Initialize the first_page and free_queues fields of each physical * memory segment. */ #ifdef VM_PHYSSEG_SPARSE npages = 0; #endif for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; #ifdef VM_PHYSSEG_SPARSE seg->first_page = &vm_page_array[npages]; npages += atop(seg->end - seg->start); #else seg->first_page = PHYS_TO_VM_PAGE(seg->start); #endif #ifdef VM_FREELIST_ISADMA if (seg->end <= VM_ISADMA_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_ISADMA]; KASSERT(flind >= 0, ("vm_phys_init: ISADMA flind < 0")); } else #endif #ifdef VM_FREELIST_LOWMEM if (seg->end <= VM_LOWMEM_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; KASSERT(flind >= 0, ("vm_phys_init: LOWMEM flind < 0")); } else #endif #ifdef VM_FREELIST_DMA32 if (seg->end <= VM_DMA32_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; KASSERT(flind >= 0, ("vm_phys_init: DMA32 flind < 0")); } else #endif { flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; KASSERT(flind >= 0, ("vm_phys_init: DEFAULT flind < 0")); } seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; } /* * Initialize the free queues. */ for (dom = 0; dom < vm_ndomains; dom++) { for (flind = 0; flind < vm_nfreelists; flind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; for (oind = 0; oind < VM_NFREEORDER; oind++) TAILQ_INIT(&fl[oind].pl); } } } rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); } /* * Split a contiguous, power of two-sized set of physical pages. */ static __inline void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order) { vm_page_t m_buddy; while (oind > order) { oind--; m_buddy = &m[1 << oind]; KASSERT(m_buddy->order == VM_NFREEORDER, ("vm_phys_split_pages: page %p has unexpected order %d", m_buddy, m_buddy->order)); vm_freelist_add(fl, m_buddy, oind, 0); } } /* * Initialize a physical page and add it to the free lists. */ void vm_phys_add_page(vm_paddr_t pa) { vm_page_t m; struct vm_domain *vmd; vm_cnt.v_page_count++; m = vm_phys_paddr_to_vm_page(pa); m->busy_lock = VPB_UNBUSIED; m->phys_addr = pa; m->queue = PQ_NONE; m->segind = vm_phys_paddr_to_segind(pa); - vmd = vm_phys_domain(m); + vmd = vm_page_domain(m); vmd->vmd_page_count++; vmd->vmd_segs |= 1UL << m->segind; KASSERT(m->order == VM_NFREEORDER, ("vm_phys_add_page: page %p has unexpected order %d", m, m->order)); m->pool = VM_FREEPOOL_DEFAULT; pmap_page_init(m); mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); vm_phys_free_pages(m, 0); mtx_unlock(&vm_page_queue_free_mtx); } /* * Allocate a contiguous, power of two-sized set of physical pages * from the free lists. * * The free page queues must be locked. */ vm_page_t -vm_phys_alloc_pages(int pool, int order) +vm_phys_alloc_pages(int domain, int pool, int order) { vm_page_t m; - int domain, flind; - struct vm_domain_iterator vi; + int flind; - KASSERT(pool < VM_NFREEPOOL, - ("vm_phys_alloc_pages: pool %d is out of range", pool)); - KASSERT(order < VM_NFREEORDER, - ("vm_phys_alloc_pages: order %d is out of range", order)); - - vm_policy_iterator_init(&vi); - - while ((vm_domain_iterator_run(&vi, &domain)) == 0) { - for (flind = 0; flind < vm_nfreelists; flind++) { - m = vm_phys_alloc_domain_pages(domain, flind, pool, - order); - if (m != NULL) - return (m); - } + for (flind = 0; flind < vm_nfreelists; flind++) { + m = vm_phys_alloc_freelist_pages(domain, flind, pool, order); + if (m != NULL) + return (m); } - - vm_policy_iterator_finish(&vi); return (NULL); } /* * Allocate a contiguous, power of two-sized set of physical pages from the * specified free list. The free list must be specified using one of the * manifest constants VM_FREELIST_*. * * The free page queues must be locked. */ vm_page_t -vm_phys_alloc_freelist_pages(int freelist, int pool, int order) +vm_phys_alloc_freelist_pages(int domain, int flind, int pool, int order) { + struct vm_freelist *alt, *fl; vm_page_t m; - struct vm_domain_iterator vi; - int domain; + int oind, pind; - KASSERT(freelist < VM_NFREELIST, + KASSERT(domain >= 0 && domain < vm_ndomains, + ("vm_phys_alloc_freelist_pages: domain %d is out of range", + domain)); + KASSERT(flind < VM_NFREELIST, ("vm_phys_alloc_freelist_pages: freelist %d is out of range", - freelist)); + flind)); KASSERT(pool < VM_NFREEPOOL, ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); - vm_policy_iterator_init(&vi); - - while ((vm_domain_iterator_run(&vi, &domain)) == 0) { - m = vm_phys_alloc_domain_pages(domain, - vm_freelist_to_flind[freelist], pool, order); - if (m != NULL) - return (m); - } - - vm_policy_iterator_finish(&vi); - return (NULL); -} - -static vm_page_t -vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order) -{ - struct vm_freelist *fl; - struct vm_freelist *alt; - int oind, pind; - vm_page_t m; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); fl = &vm_phys_free_queues[domain][flind][pool][0]; for (oind = order; oind < VM_NFREEORDER; oind++) { m = TAILQ_FIRST(&fl[oind].pl); if (m != NULL) { vm_freelist_rem(fl, m, oind); vm_phys_split_pages(m, oind, fl, order); return (m); } } /* * The given pool was empty. Find the largest * contiguous, power-of-two-sized set of pages in any * pool. Transfer these pages to the given pool, and * use them to satisfy the allocation. */ for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { alt = &vm_phys_free_queues[domain][flind][pind][0]; m = TAILQ_FIRST(&alt[oind].pl); if (m != NULL) { vm_freelist_rem(alt, m, oind); vm_phys_set_pool(pool, m, oind); vm_phys_split_pages(m, oind, fl, order); return (m); } } } return (NULL); } /* * Find the vm_page corresponding to the given physical address. */ vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa) { struct vm_phys_seg *seg; int segind; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (pa >= seg->start && pa < seg->end) return (&seg->first_page[atop(pa - seg->start)]); } return (NULL); } vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa) { struct vm_phys_fictitious_seg tmp, *seg; vm_page_t m; m = NULL; tmp.start = pa; tmp.end = 0; rw_rlock(&vm_phys_fictitious_reg_lock); seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); rw_runlock(&vm_phys_fictitious_reg_lock); if (seg == NULL) return (NULL); m = &seg->first_page[atop(pa - seg->start)]; KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); return (m); } static inline void vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, long page_count, vm_memattr_t memattr) { long i; for (i = 0; i < page_count; i++) { vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr); range[i].oflags &= ~VPO_UNMANAGED; range[i].busy_lock = VPB_UNBUSIED; } } int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr) { struct vm_phys_fictitious_seg *seg; vm_page_t fp; long page_count; #ifdef VM_PHYSSEG_DENSE long pi, pe; long dpage_count; #endif KASSERT(start < end, ("Start of segment isn't less than end (start: %jx end: %jx)", (uintmax_t)start, (uintmax_t)end)); page_count = (end - start) / PAGE_SIZE; #ifdef VM_PHYSSEG_DENSE pi = atop(start); pe = atop(end); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { fp = &vm_page_array[pi - first_page]; if ((pe - first_page) > vm_page_array_size) { /* * We have a segment that starts inside * of vm_page_array, but ends outside of it. * * Use vm_page_array pages for those that are * inside of the vm_page_array range, and * allocate the remaining ones. */ dpage_count = vm_page_array_size - (pi - first_page); vm_phys_fictitious_init_range(fp, start, dpage_count, memattr); page_count -= dpage_count; start += ptoa(dpage_count); goto alloc; } /* * We can allocate the full range from vm_page_array, * so there's no need to register the range in the tree. */ vm_phys_fictitious_init_range(fp, start, page_count, memattr); return (0); } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { /* * We have a segment that ends inside of vm_page_array, * but starts outside of it. */ fp = &vm_page_array[0]; dpage_count = pe - first_page; vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, memattr); end -= ptoa(dpage_count); page_count -= dpage_count; goto alloc; } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { /* * Trying to register a fictitious range that expands before * and after vm_page_array. */ return (EINVAL); } else { alloc: #endif fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, M_WAITOK | M_ZERO); #ifdef VM_PHYSSEG_DENSE } #endif vm_phys_fictitious_init_range(fp, start, page_count, memattr); seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); seg->start = start; seg->end = end; seg->first_page = fp; rw_wlock(&vm_phys_fictitious_reg_lock); RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); rw_wunlock(&vm_phys_fictitious_reg_lock); return (0); } void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) { struct vm_phys_fictitious_seg *seg, tmp; #ifdef VM_PHYSSEG_DENSE long pi, pe; #endif KASSERT(start < end, ("Start of segment isn't less than end (start: %jx end: %jx)", (uintmax_t)start, (uintmax_t)end)); #ifdef VM_PHYSSEG_DENSE pi = atop(start); pe = atop(end); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { if ((pe - first_page) <= vm_page_array_size) { /* * This segment was allocated using vm_page_array * only, there's nothing to do since those pages * were never added to the tree. */ return; } /* * We have a segment that starts inside * of vm_page_array, but ends outside of it. * * Calculate how many pages were added to the * tree and free them. */ start = ptoa(first_page + vm_page_array_size); } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { /* * We have a segment that ends inside of vm_page_array, * but starts outside of it. */ end = ptoa(first_page); } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { /* Since it's not possible to register such a range, panic. */ panic( "Unregistering not registered fictitious range [%#jx:%#jx]", (uintmax_t)start, (uintmax_t)end); } #endif tmp.start = start; tmp.end = 0; rw_wlock(&vm_phys_fictitious_reg_lock); seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); if (seg->start != start || seg->end != end) { rw_wunlock(&vm_phys_fictitious_reg_lock); panic( "Unregistering not registered fictitious range [%#jx:%#jx]", (uintmax_t)start, (uintmax_t)end); } RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); rw_wunlock(&vm_phys_fictitious_reg_lock); free(seg->first_page, M_FICT_PAGES); free(seg, M_FICT_PAGES); } /* * Find the segment containing the given physical address. */ static int vm_phys_paddr_to_segind(vm_paddr_t pa) { struct vm_phys_seg *seg; int segind; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (pa >= seg->start && pa < seg->end) return (segind); } panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" , (uintmax_t)pa); } /* * Free a contiguous, power of two-sized set of physical pages. * * The free page queues must be locked. */ void vm_phys_free_pages(vm_page_t m, int order) { struct vm_freelist *fl; struct vm_phys_seg *seg; vm_paddr_t pa; vm_page_t m_buddy; KASSERT(m->order == VM_NFREEORDER, ("vm_phys_free_pages: page %p has unexpected order %d", m, m->order)); KASSERT(m->pool < VM_NFREEPOOL, ("vm_phys_free_pages: page %p has unexpected pool %d", m, m->pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_free_pages: order %d is out of range", order)); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); seg = &vm_phys_segs[m->segind]; if (order < VM_NFREEORDER - 1) { pa = VM_PAGE_TO_PHYS(m); do { pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); if (pa < seg->start || pa >= seg->end) break; m_buddy = &seg->first_page[atop(pa - seg->start)]; if (m_buddy->order != order) break; fl = (*seg->free_queues)[m_buddy->pool]; vm_freelist_rem(fl, m_buddy, order); if (m_buddy->pool != m->pool) vm_phys_set_pool(m->pool, m_buddy, order); order++; pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); m = &seg->first_page[atop(pa - seg->start)]; } while (order < VM_NFREEORDER - 1); } fl = (*seg->free_queues)[m->pool]; vm_freelist_add(fl, m, order, 1); } /* * Free a contiguous, arbitrarily sized set of physical pages. * * The free page queues must be locked. */ void vm_phys_free_contig(vm_page_t m, u_long npages) { u_int n; int order; /* * Avoid unnecessary coalescing by freeing the pages in the largest * possible power-of-two-sized subsets. */ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); for (;; npages -= n) { /* * Unsigned "min" is used here so that "order" is assigned * "VM_NFREEORDER - 1" when "m"'s physical address is zero * or the low-order bits of its physical address are zero * because the size of a physical address exceeds the size of * a long. */ order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1, VM_NFREEORDER - 1); n = 1 << order; if (npages < n) break; vm_phys_free_pages(m, order); m += n; } /* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */ for (; npages > 0; npages -= n) { order = flsl(npages) - 1; n = 1 << order; vm_phys_free_pages(m, order); m += n; } } /* * Scan physical memory between the specified addresses "low" and "high" for a * run of contiguous physical pages that satisfy the specified conditions, and * return the lowest page in the run. The specified "alignment" determines * the alignment of the lowest physical page in the run. If the specified * "boundary" is non-zero, then the run of physical pages cannot span a * physical address that is a multiple of "boundary". * * "npages" must be greater than zero. Both "alignment" and "boundary" must * be a power of two. */ vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options) { vm_paddr_t pa_end; vm_page_t m_end, m_run, m_start; struct vm_phys_seg *seg; int segind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); if (low >= high) return (NULL); for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (seg->start >= high) break; if (low >= seg->end) continue; if (low <= seg->start) m_start = seg->first_page; else m_start = &seg->first_page[atop(low - seg->start)]; if (high < seg->end) pa_end = high; else pa_end = seg->end; if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages)) continue; m_end = &seg->first_page[atop(pa_end - seg->start)]; m_run = vm_page_scan_contig(npages, m_start, m_end, alignment, boundary, options); if (m_run != NULL) return (m_run); } return (NULL); } /* * Set the pool for a contiguous, power of two-sized set of physical pages. */ void vm_phys_set_pool(int pool, vm_page_t m, int order) { vm_page_t m_tmp; for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) m_tmp->pool = pool; } /* * Search for the given physical page "m" in the free lists. If the search * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return * FALSE, indicating that "m" is not in the free lists. * * The free page queues must be locked. */ boolean_t vm_phys_unfree_page(vm_page_t m) { struct vm_freelist *fl; struct vm_phys_seg *seg; vm_paddr_t pa, pa_half; vm_page_t m_set, m_tmp; int order; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* * First, find the contiguous, power of two-sized set of free * physical pages containing the given physical page "m" and * assign it to "m_set". */ seg = &vm_phys_segs[m->segind]; for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && order < VM_NFREEORDER - 1; ) { order++; pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); if (pa >= seg->start) m_set = &seg->first_page[atop(pa - seg->start)]; else return (FALSE); } if (m_set->order < order) return (FALSE); if (m_set->order == VM_NFREEORDER) return (FALSE); KASSERT(m_set->order < VM_NFREEORDER, ("vm_phys_unfree_page: page %p has unexpected order %d", m_set, m_set->order)); /* * Next, remove "m_set" from the free lists. Finally, extract * "m" from "m_set" using an iterative algorithm: While "m_set" * is larger than a page, shrink "m_set" by returning the half * of "m_set" that does not contain "m" to the free lists. */ fl = (*seg->free_queues)[m_set->pool]; order = m_set->order; vm_freelist_rem(fl, m_set, order); while (order > 0) { order--; pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); if (m->phys_addr < pa_half) m_tmp = &seg->first_page[atop(pa_half - seg->start)]; else { m_tmp = m_set; m_set = &seg->first_page[atop(pa_half - seg->start)]; } vm_freelist_add(fl, m_tmp, order, 0); } KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); return (TRUE); } /* * Allocate a contiguous set of physical pages of the given size * "npages" from the free lists. All of the physical pages must be at * or above the given physical address "low" and below the given * physical address "high". The given value "alignment" determines the * alignment of the first physical page in the set. If the given value * "boundary" is non-zero, then the set of physical pages cannot cross * any physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t -vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, +vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t pa_end, pa_start; vm_page_t m_run; - struct vm_domain_iterator vi; struct vm_phys_seg *seg; - int domain, segind; + int segind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if (low >= high) return (NULL); - vm_policy_iterator_init(&vi); -restartdom: - if (vm_domain_iterator_run(&vi, &domain) != 0) { - vm_policy_iterator_finish(&vi); - return (NULL); - } m_run = NULL; for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { seg = &vm_phys_segs[segind]; if (seg->start >= high || seg->domain != domain) continue; if (low >= seg->end) break; if (low <= seg->start) pa_start = seg->start; else pa_start = low; if (high < seg->end) pa_end = high; else pa_end = seg->end; if (pa_end - pa_start < ptoa(npages)) continue; m_run = vm_phys_alloc_seg_contig(seg, npages, low, high, alignment, boundary); if (m_run != NULL) break; } - if (m_run == NULL && !vm_domain_iterator_isdone(&vi)) - goto restartdom; - vm_policy_iterator_finish(&vi); return (m_run); } /* * Allocate a run of contiguous physical pages from the free list for the * specified segment. */ static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_freelist *fl; vm_paddr_t pa, pa_end, size; vm_page_t m, m_ret; u_long npages_end; int oind, order, pind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* Compute the queue that is the best fit for npages. */ for (order = 0; (1 << order) < npages; order++); /* Search for a run satisfying the specified conditions. */ size = npages << PAGE_SHIFT; for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = (*seg->free_queues)[pind]; TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) { /* * Is the size of this allocation request * larger than the largest block size? */ if (order >= VM_NFREEORDER) { /* * Determine if a sufficient number of * subsequent blocks to satisfy the * allocation request are free. */ pa = VM_PAGE_TO_PHYS(m_ret); pa_end = pa + size; for (;;) { pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1); if (pa >= pa_end || pa < seg->start || pa >= seg->end) break; m = &seg->first_page[atop(pa - seg->start)]; if (m->order != VM_NFREEORDER - 1) break; } /* If not, go to the next block. */ if (pa < pa_end) continue; } /* * Determine if the blocks are within the * given range, satisfy the given alignment, * and do not cross the given boundary. */ pa = VM_PAGE_TO_PHYS(m_ret); pa_end = pa + size; if (pa >= low && pa_end <= high && (pa & (alignment - 1)) == 0 && rounddown2(pa ^ (pa_end - 1), boundary) == 0) goto done; } } } return (NULL); done: for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { fl = (*seg->free_queues)[m->pool]; vm_freelist_rem(fl, m, m->order); } if (m_ret->pool != VM_FREEPOOL_DEFAULT) vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind); fl = (*seg->free_queues)[m_ret->pool]; vm_phys_split_pages(m_ret, oind, fl, order); /* Return excess pages to the free lists. */ npages_end = roundup2(npages, 1 << imin(oind, order)); if (npages < npages_end) vm_phys_free_contig(&m_ret[npages], npages_end - npages); return (m_ret); } #ifdef DDB /* * Show the number of physical pages in each of the free lists. */ DB_SHOW_COMMAND(freepages, db_show_freepages) { struct vm_freelist *fl; int flind, oind, pind, dom; for (dom = 0; dom < vm_ndomains; dom++) { db_printf("DOMAIN: %d\n", dom); for (flind = 0; flind < vm_nfreelists; flind++) { db_printf("FREE LIST %d:\n" "\n ORDER (SIZE) | NUMBER" "\n ", flind); for (pind = 0; pind < VM_NFREEPOOL; pind++) db_printf(" | POOL %d", pind); db_printf("\n-- "); for (pind = 0; pind < VM_NFREEPOOL; pind++) db_printf("-- -- "); db_printf("--\n"); for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { db_printf(" %2.2d (%6.6dK)", oind, 1 << (PAGE_SHIFT - 10 + oind)); for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; db_printf(" | %6.6d", fl[oind].lcnt); } db_printf("\n"); } db_printf("\n"); } db_printf("\n"); } } #endif Index: projects/numa2/sys/vm/vm_phys.h =================================================================== --- projects/numa2/sys/vm/vm_phys.h (revision 321505) +++ projects/numa2/sys/vm/vm_phys.h (revision 321506) @@ -1,125 +1,139 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * Physical memory system definitions */ #ifndef _VM_PHYS_H_ #define _VM_PHYS_H_ #ifdef _KERNEL /* Domains must be dense (non-sparse) and zero-based. */ struct mem_affinity { vm_paddr_t start; vm_paddr_t end; int domain; }; struct vm_freelist { struct pglist pl; int lcnt; }; struct vm_phys_seg { vm_paddr_t start; vm_paddr_t end; vm_page_t first_page; int domain; struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER]; }; extern struct mem_affinity *mem_affinity; extern int *mem_locality; extern int vm_ndomains; extern struct vm_phys_seg vm_phys_segs[]; extern int vm_phys_nsegs; /* * The following functions are only to be used by the virtual memory system. */ void vm_phys_add_page(vm_paddr_t pa); void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end); -vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, - u_long alignment, vm_paddr_t boundary); -vm_page_t vm_phys_alloc_freelist_pages(int freelist, int pool, int order); -vm_page_t vm_phys_alloc_pages(int pool, int order); +vm_page_t vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, + vm_paddr_t high, u_long alignment, vm_paddr_t boundary); +vm_page_t vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, + int order); +vm_page_t vm_phys_alloc_pages(int domain, int pool, int order); boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high); int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr); void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end); vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa); void vm_phys_free_contig(vm_page_t m, u_long npages); void vm_phys_free_pages(vm_page_t m, int order); void vm_phys_init(void); vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa); vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options); void vm_phys_set_pool(int pool, vm_page_t m, int order); boolean_t vm_phys_unfree_page(vm_page_t m); int vm_phys_mem_affinity(int f, int t); /* * vm_phys_domain: * * Return the memory domain the page belongs to. */ -static inline struct vm_domain * +static inline int vm_phys_domain(vm_page_t m) { #ifdef VM_NUMA_ALLOC - int domn, segind; + int segind; /* XXXKIB try to assert that the page is managed */ segind = m->segind; KASSERT(segind < vm_phys_nsegs, ("segind %d m %p", segind, m)); - domn = vm_phys_segs[segind].domain; - KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m)); - return (&vm_dom[domn]); + return (vm_phys_segs[segind].domain); #else - return (&vm_dom[0]); + return (0); #endif } +/* + * vm_page_domain: + * + * Return the memory domain structure the page belongs to. + */ +static inline struct vm_domain * +vm_page_domain(vm_page_t m) +{ + int domn; + + domn = vm_phys_domain(m); + KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m)); + return (&vm_dom[domn]); +} + static inline void vm_phys_freecnt_adj(vm_page_t m, int adj) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); vm_cnt.v_free_count += adj; - vm_phys_domain(m)->vmd_free_count += adj; + vm_page_domain(m)->vmd_free_count += adj; } #endif /* _KERNEL */ #endif /* !_VM_PHYS_H_ */ Index: projects/numa2/sys/vm/vm_reserv.c =================================================================== --- projects/numa2/sys/vm/vm_reserv.c (revision 321505) +++ projects/numa2/sys/vm/vm_reserv.c (revision 321506) @@ -1,1138 +1,1145 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007-2011 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Superpage reservation management module * * Any external functions defined by this module are only to be used by the * virtual memory system. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The reservation system supports the speculative allocation of large physical * pages ("superpages"). Speculative allocation enables the fully automatic * utilization of superpages by the virtual memory system. In other words, no * programmatic directives are required to use superpages. */ #if VM_NRESERVLEVEL > 0 /* * The number of small pages that are contained in a level 0 reservation */ #define VM_LEVEL_0_NPAGES (1 << VM_LEVEL_0_ORDER) /* * The number of bits by which a physical address is shifted to obtain the * reservation number */ #define VM_LEVEL_0_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) /* * The size of a level 0 reservation in bytes */ #define VM_LEVEL_0_SIZE (1 << VM_LEVEL_0_SHIFT) /* * Computes the index of the small page underlying the given (object, pindex) * within the reservation's array of small pages. */ #define VM_RESERV_INDEX(object, pindex) \ (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1)) /* * The size of a population map entry */ typedef u_long popmap_t; /* * The number of bits in a population map entry */ #define NBPOPMAP (NBBY * sizeof(popmap_t)) /* * The number of population map entries in a reservation */ #define NPOPMAP howmany(VM_LEVEL_0_NPAGES, NBPOPMAP) /* * Clear a bit in the population map. */ static __inline void popmap_clear(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP)); } /* * Set a bit in the population map. */ static __inline void popmap_set(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP); } /* * Is a bit in the population map clear? */ static __inline boolean_t popmap_is_clear(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0); } /* * Is a bit in the population map set? */ static __inline boolean_t popmap_is_set(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0); } /* * The reservation structure * * A reservation structure is constructed whenever a large physical page is * speculatively allocated to an object. The reservation provides the small * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets * within that object. The reservation's "popcnt" tracks the number of these * small physical pages that are in use at any given time. When and if the * reservation is not fully utilized, it appears in the queue of partially * populated reservations. The reservation always appears on the containing * object's list of reservations. * * A partially populated reservation can be broken and reclaimed at any time. */ struct vm_reserv { TAILQ_ENTRY(vm_reserv) partpopq; LIST_ENTRY(vm_reserv) objq; vm_object_t object; /* containing object */ vm_pindex_t pindex; /* offset within object */ vm_page_t pages; /* first page of a superpage */ + int domain; /* NUMA domain */ int popcnt; /* # of pages in use */ char inpartpopq; popmap_t popmap[NPOPMAP]; /* bit vector of used pages */ }; /* * The reservation array * * This array is analoguous in function to vm_page_array. It differs in the * respect that it may contain a greater number of useful reservation * structures than there are (physical) superpages. These "invalid" * reservation structures exist to trade-off space for time in the * implementation of vm_reserv_from_page(). Invalid reservation structures are * distinguishable from "valid" reservation structures by inspecting the * reservation's "pages" field. Invalid reservation structures have a NULL * "pages" field. * * vm_reserv_from_page() maps a small (physical) page to an element of this * array by computing a physical reservation number from the page's physical * address. The physical reservation number is used as the array index. * * An "active" reservation is a valid reservation structure that has a non-NULL * "object" field and a non-zero "popcnt" field. In other words, every active * reservation belongs to a particular object. Moreover, every active * reservation has an entry in the containing object's list of reservations. */ static vm_reserv_t vm_reserv_array; /* * The partially populated reservation queue * * This queue enables the fast recovery of an unused free small page from a * partially populated reservation. The reservation at the head of this queue * is the least recently changed, partially populated reservation. * * Access to this queue is synchronized by the free page queue lock. */ -static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop = - TAILQ_HEAD_INITIALIZER(vm_rvq_partpop); +static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop[MAXMEMDOM]; static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info"); static long vm_reserv_broken; SYSCTL_LONG(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD, &vm_reserv_broken, 0, "Cumulative number of broken reservations"); static long vm_reserv_freed; SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD, &vm_reserv_freed, 0, "Cumulative number of freed reservations"); static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vm_reserv_fullpop, "I", "Current number of full reservations"); static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues"); static long vm_reserv_reclaimed; SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD, &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations"); static void vm_reserv_break(vm_reserv_t rv, vm_page_t m); static void vm_reserv_depopulate(vm_reserv_t rv, int index); static vm_reserv_t vm_reserv_from_page(vm_page_t m); static boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex); static void vm_reserv_populate(vm_reserv_t rv, int index); static void vm_reserv_reclaim(vm_reserv_t rv); /* * Returns the current number of full reservations. * * Since the number of full reservations is computed without acquiring the * free page queue lock, the returned value may be inexact. */ static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS) { vm_paddr_t paddr; struct vm_phys_seg *seg; vm_reserv_t rv; int fullpop, segind; fullpop = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE <= seg->end) { rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT]; fullpop += rv->popcnt == VM_LEVEL_0_NPAGES; paddr += VM_LEVEL_0_SIZE; } } return (sysctl_handle_int(oidp, &fullpop, 0, req)); } /* * Describes the current state of the partially populated reservation queue. */ static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; vm_reserv_t rv; - int counter, error, level, unused_pages; + int counter, error, i, level, unused_pages; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\nLEVEL SIZE NUMBER\n\n"); for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) { counter = 0; unused_pages = 0; mtx_lock(&vm_page_queue_free_mtx); - TAILQ_FOREACH(rv, &vm_rvq_partpop/*[level]*/, partpopq) { - counter++; - unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt; + for (i = 0; i < vm_ndomains; i++) { + TAILQ_FOREACH(rv, &vm_rvq_partpop[i], partpopq) { + counter++; + unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt; + } } mtx_unlock(&vm_page_queue_free_mtx); sbuf_printf(&sbuf, "%5d: %6dK, %6d\n", level, unused_pages * ((int)PAGE_SIZE / 1024), counter); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Reduces the given reservation's population count. If the population count * becomes zero, the reservation is destroyed. Additionally, moves the * reservation to the tail of the partially populated reservation queue if the * population count is non-zero. * * The free page queue lock must be held. */ static void vm_reserv_depopulate(vm_reserv_t rv, int index) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_depopulate: reserv %p is free", rv)); KASSERT(popmap_is_set(rv->popmap, index), ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv, index)); KASSERT(rv->popcnt > 0, ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv)); if (rv->inpartpopq) { - TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); + TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq); rv->inpartpopq = FALSE; } else { KASSERT(rv->pages->psind == 1, ("vm_reserv_depopulate: reserv %p is already demoted", rv)); rv->pages->psind = 0; } popmap_clear(rv->popmap, index); rv->popcnt--; if (rv->popcnt == 0) { LIST_REMOVE(rv, objq); rv->object = NULL; vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER); vm_reserv_freed++; } else { rv->inpartpopq = TRUE; - TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq); + TAILQ_INSERT_TAIL(&vm_rvq_partpop[rv->domain], rv, partpopq); } } /* * Returns the reservation to which the given page might belong. */ static __inline vm_reserv_t vm_reserv_from_page(vm_page_t m) { return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]); } /* * Returns TRUE if the given reservation contains the given page index and * FALSE otherwise. */ static __inline boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex) { return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0); } /* * Increases the given reservation's population count. Moves the reservation * to the tail of the partially populated reservation queue. * * The free page queue must be locked. */ static void vm_reserv_populate(vm_reserv_t rv, int index) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_populate: reserv %p is free", rv)); KASSERT(popmap_is_clear(rv->popmap, index), ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv, index)); KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES, ("vm_reserv_populate: reserv %p is already full", rv)); KASSERT(rv->pages->psind == 0, ("vm_reserv_populate: reserv %p is already promoted", rv)); if (rv->inpartpopq) { - TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); + TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq); rv->inpartpopq = FALSE; } popmap_set(rv->popmap, index); rv->popcnt++; if (rv->popcnt < VM_LEVEL_0_NPAGES) { rv->inpartpopq = TRUE; - TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq); + TAILQ_INSERT_TAIL(&vm_rvq_partpop[rv->domain], rv, partpopq); } else rv->pages->psind = 1; } /* * Allocates a contiguous set of physical pages of the given size "npages" * from existing or newly created reservations. All of the physical pages * must be at or above the given physical address "low" and below the given * physical address "high". The given value "alignment" determines the * alignment of the first physical page in the set. If the given value * "boundary" is non-zero, then the set of physical pages cannot cross any * physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. * * The object and free page queue must be locked. */ vm_page_t -vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages, - vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, - vm_page_t mpred) +vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain, + u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, + vm_paddr_t boundary, vm_page_t mpred) { vm_paddr_t pa, size; vm_page_t m, m_ret, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; u_long allocpages, maxpages, minpages; int i, index, n; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0")); /* * Is a reservation fundamentally impossible? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex + npages > object->size) return (NULL); /* * All reservations of a particular size have the same alignment. * Assuming that the first page is allocated from a reservation, the * least significant bits of its physical address can be determined * from its offset from the beginning of the reservation and the size * of the reservation. * * Could the specified index within a reservation of the smallest * possible size satisfy the alignment and boundary requirements? */ pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT; if ((pa & (alignment - 1)) != 0) return (NULL); size = npages << PAGE_SHIFT; if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) return (NULL); /* * Look for an existing reservation. */ if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_reserv_alloc_contig: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_reserv_alloc_contig: mpred doesn't precede pindex")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) { KASSERT(msucc->pindex > pindex, ("vm_reserv_alloc_contig: msucc doesn't succeed pindex")); rv = vm_reserv_from_page(msucc); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; } /* * Could at least one reservation fit between the first index to the * left that can be used ("leftcap") and the first index to the right * that cannot be used ("rightcap")? */ first = pindex - VM_RESERV_INDEX(object, pindex); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; if (leftcap > first) return (NULL); } minpages = VM_RESERV_INDEX(object, pindex) + npages; maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES); allocpages = maxpages; if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; if (first + maxpages > rightcap) { if (maxpages == VM_LEVEL_0_NPAGES) return (NULL); /* * At least one reservation will fit between "leftcap" * and "rightcap". However, a reservation for the * last of the requested pages will not fit. Reduce * the size of the upcoming allocation accordingly. */ allocpages = minpages; } } /* * Would the last new reservation extend past the end of the object? */ if (first + maxpages > object->size) { /* * Don't allocate the last new reservation if the object is a * vnode or backed by another object that is a vnode. */ if (object->type == OBJT_VNODE || (object->backing_object != NULL && object->backing_object->type == OBJT_VNODE)) { if (maxpages == VM_LEVEL_0_NPAGES) return (NULL); allocpages = minpages; } /* Speculate that the object may grow. */ } /* * Allocate the physical pages. The alignment and boundary specified * for this allocation may be different from the alignment and * boundary specified for the requested pages. For instance, the * specified index may not be the first page within the first new * reservation. */ - m = vm_phys_alloc_contig(allocpages, low, high, ulmax(alignment, + m = vm_phys_alloc_contig(domain, allocpages, low, high, ulmax(alignment, VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0); if (m == NULL) return (NULL); /* * The allocated physical pages always begin at a reservation * boundary, but they do not always end at a reservation boundary. * Initialize every reservation that is completely covered by the * allocated physical pages. */ m_ret = NULL; index = VM_RESERV_INDEX(object, pindex); do { rv = vm_reserv_from_page(m); KASSERT(rv->pages == m, ("vm_reserv_alloc_contig: reserv %p's pages is corrupted", rv)); KASSERT(rv->object == NULL, ("vm_reserv_alloc_contig: reserv %p isn't free", rv)); LIST_INSERT_HEAD(&object->rvq, rv, objq); rv->object = object; rv->pindex = first; + rv->domain = vm_phys_domain(m); KASSERT(rv->popcnt == 0, ("vm_reserv_alloc_contig: reserv %p's popcnt is corrupted", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE", rv)); for (i = 0; i < NPOPMAP; i++) KASSERT(rv->popmap[i] == 0, ("vm_reserv_alloc_contig: reserv %p's popmap is corrupted", rv)); n = ulmin(VM_LEVEL_0_NPAGES - index, npages); for (i = 0; i < n; i++) vm_reserv_populate(rv, index + i); npages -= n; if (m_ret == NULL) { m_ret = &rv->pages[index]; index = 0; } m += VM_LEVEL_0_NPAGES; first += VM_LEVEL_0_NPAGES; allocpages -= VM_LEVEL_0_NPAGES; } while (allocpages >= VM_LEVEL_0_NPAGES); return (m_ret); /* * Found a matching reservation. */ found: index = VM_RESERV_INDEX(object, pindex); /* Does the allocation fit within the reservation? */ if (index + npages > VM_LEVEL_0_NPAGES) return (NULL); m = &rv->pages[index]; pa = VM_PAGE_TO_PHYS(m); if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 || ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) return (NULL); /* Handle vm_page_rename(m, new_object, ...). */ for (i = 0; i < npages; i++) if (popmap_is_set(rv->popmap, index + i)) return (NULL); for (i = 0; i < npages; i++) vm_reserv_populate(rv, index + i); return (m); } /* * Allocates a page from an existing or newly created reservation. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. * * The object and free page queue must be locked. */ vm_page_t -vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) +vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain, + vm_page_t mpred) { vm_page_t m, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; int i, index; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); VM_OBJECT_ASSERT_WLOCKED(object); /* * Is a reservation fundamentally impossible? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex >= object->size) return (NULL); /* * Look for an existing reservation. */ if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_reserv_alloc_page: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_reserv_alloc_page: mpred doesn't precede pindex")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) { KASSERT(msucc->pindex > pindex, ("vm_reserv_alloc_page: msucc doesn't succeed pindex")); rv = vm_reserv_from_page(msucc); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; } /* * Could a reservation fit between the first index to the left that * can be used and the first index to the right that cannot be used? */ first = pindex - VM_RESERV_INDEX(object, pindex); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; if (leftcap > first) return (NULL); } if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; if (first + VM_LEVEL_0_NPAGES > rightcap) return (NULL); } /* * Would a new reservation extend past the end of the object? */ if (first + VM_LEVEL_0_NPAGES > object->size) { /* * Don't allocate a new reservation if the object is a vnode or * backed by another object that is a vnode. */ if (object->type == OBJT_VNODE || (object->backing_object != NULL && object->backing_object->type == OBJT_VNODE)) return (NULL); /* Speculate that the object may grow. */ } /* * Allocate and populate the new reservation. */ - m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER); + m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER); if (m == NULL) return (NULL); rv = vm_reserv_from_page(m); KASSERT(rv->pages == m, ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv)); KASSERT(rv->object == NULL, ("vm_reserv_alloc_page: reserv %p isn't free", rv)); LIST_INSERT_HEAD(&object->rvq, rv, objq); rv->object = object; rv->pindex = first; + rv->domain = vm_phys_domain(m); KASSERT(rv->popcnt == 0, ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv)); for (i = 0; i < NPOPMAP; i++) KASSERT(rv->popmap[i] == 0, ("vm_reserv_alloc_page: reserv %p's popmap is corrupted", rv)); index = VM_RESERV_INDEX(object, pindex); vm_reserv_populate(rv, index); return (&rv->pages[index]); /* * Found a matching reservation. */ found: index = VM_RESERV_INDEX(object, pindex); m = &rv->pages[index]; /* Handle vm_page_rename(m, new_object, ...). */ if (popmap_is_set(rv->popmap, index)) return (NULL); vm_reserv_populate(rv, index); return (m); } /* * Breaks the given reservation. Except for the specified free page, all free * pages in the reservation are returned to the physical memory allocator. * The reservation's population count and map are reset to their initial * state. * * The given reservation must not be in the partially populated reservation * queue. The free page queue lock must be held. */ static void vm_reserv_break(vm_reserv_t rv, vm_page_t m) { int begin_zeroes, hi, i, lo; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_break: reserv %p is free", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_break: reserv %p's inpartpopq is TRUE", rv)); LIST_REMOVE(rv, objq); rv->object = NULL; if (m != NULL) { /* * Since the reservation is being broken, there is no harm in * abusing the population map to stop "m" from being returned * to the physical memory allocator. */ i = m - rv->pages; KASSERT(popmap_is_clear(rv->popmap, i), ("vm_reserv_break: reserv %p's popmap is corrupted", rv)); popmap_set(rv->popmap, i); rv->popcnt++; } i = hi = 0; do { /* Find the next 0 bit. Any previous 0 bits are < "hi". */ lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i])); if (lo == 0) { /* Redundantly clears bits < "hi". */ rv->popmap[i] = 0; rv->popcnt -= NBPOPMAP - hi; while (++i < NPOPMAP) { lo = ffsl(~rv->popmap[i]); if (lo == 0) { rv->popmap[i] = 0; rv->popcnt -= NBPOPMAP; } else break; } if (i == NPOPMAP) break; hi = 0; } KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo)); /* Convert from ffsl() to ordinary bit numbering. */ lo--; if (lo > 0) { /* Redundantly clears bits < "hi". */ rv->popmap[i] &= ~((1UL << lo) - 1); rv->popcnt -= lo - hi; } begin_zeroes = NBPOPMAP * i + lo; /* Find the next 1 bit. */ do hi = ffsl(rv->popmap[i]); while (hi == 0 && ++i < NPOPMAP); if (i != NPOPMAP) /* Convert from ffsl() to ordinary bit numbering. */ hi--; vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i + hi - begin_zeroes); } while (i < NPOPMAP); KASSERT(rv->popcnt == 0, ("vm_reserv_break: reserv %p's popcnt is corrupted", rv)); vm_reserv_broken++; } /* * Breaks all reservations belonging to the given object. */ void vm_reserv_break_all(vm_object_t object) { vm_reserv_t rv; mtx_lock(&vm_page_queue_free_mtx); while ((rv = LIST_FIRST(&object->rvq)) != NULL) { KASSERT(rv->object == object, ("vm_reserv_break_all: reserv %p is corrupted", rv)); if (rv->inpartpopq) { - TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); + TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq); rv->inpartpopq = FALSE; } vm_reserv_break(rv, NULL); } mtx_unlock(&vm_page_queue_free_mtx); } /* * Frees the given page if it belongs to a reservation. Returns TRUE if the * page is freed and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t vm_reserv_free_page(vm_page_t m) { vm_reserv_t rv; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); rv = vm_reserv_from_page(m); if (rv->object == NULL) return (FALSE); vm_reserv_depopulate(rv, m - rv->pages); return (TRUE); } /* * Initializes the reservation management system. Specifically, initializes * the reservation array. * * Requires that vm_page_array and first_page are initialized! */ void vm_reserv_init(void) { vm_paddr_t paddr; struct vm_phys_seg *seg; - int segind; + int i, segind; /* * Initialize the reservation array. Specifically, initialize the * "pages" field for every element that has an underlying superpage. */ for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE <= seg->end) { vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages = PHYS_TO_VM_PAGE(paddr); paddr += VM_LEVEL_0_SIZE; } } + for (i = 0; i < MAXMEMDOM; i++) + TAILQ_INIT(&vm_rvq_partpop[i]); } /* * Returns true if the given page belongs to a reservation and that page is * free. Otherwise, returns false. */ bool vm_reserv_is_page_free(vm_page_t m) { vm_reserv_t rv; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); rv = vm_reserv_from_page(m); if (rv->object == NULL) return (false); return (popmap_is_clear(rv->popmap, m - rv->pages)); } /* * If the given page belongs to a reservation, returns the level of that * reservation. Otherwise, returns -1. */ int vm_reserv_level(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); return (rv->object != NULL ? 0 : -1); } /* * Returns a reservation level if the given page belongs to a fully populated * reservation and -1 otherwise. */ int vm_reserv_level_iffullpop(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1); } /* * Breaks the given partially populated reservation, releasing its free pages * to the physical memory allocator. * * The free page queue lock must be held. */ static void vm_reserv_reclaim(vm_reserv_t rv) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->inpartpopq, ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv)); - TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); + TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq); rv->inpartpopq = FALSE; vm_reserv_break(rv, NULL); vm_reserv_reclaimed++; } /* * Breaks the reservation at the head of the partially populated reservation * queue, releasing its free pages to the physical memory allocator. Returns * TRUE if a reservation is broken and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t -vm_reserv_reclaim_inactive(void) +vm_reserv_reclaim_inactive(int domain) { vm_reserv_t rv; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - if ((rv = TAILQ_FIRST(&vm_rvq_partpop)) != NULL) { + if ((rv = TAILQ_FIRST(&vm_rvq_partpop[domain])) != NULL) { vm_reserv_reclaim(rv); return (TRUE); } return (FALSE); } /* * Searches the partially populated reservation queue for the least recently * changed reservation with free pages that satisfy the given request for * contiguous physical memory. If a satisfactory reservation is found, it is * broken. Returns TRUE if a reservation is broken and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t -vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, - u_long alignment, vm_paddr_t boundary) +vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low, + vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t pa, size; vm_reserv_t rv; int hi, i, lo, low_index, next_free; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if (npages > VM_LEVEL_0_NPAGES - 1) return (FALSE); size = npages << PAGE_SHIFT; - TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) { + TAILQ_FOREACH(rv, &vm_rvq_partpop[domain], partpopq) { pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]); if (pa + PAGE_SIZE - size < low) { /* This entire reservation is too low; go to next. */ continue; } pa = VM_PAGE_TO_PHYS(&rv->pages[0]); if (pa + size > high) { /* This entire reservation is too high; go to next. */ continue; } if (pa < low) { /* Start the search for free pages at "low". */ low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT; i = low_index / NBPOPMAP; hi = low_index % NBPOPMAP; } else i = hi = 0; do { /* Find the next free page. */ lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i])); while (lo == 0 && ++i < NPOPMAP) lo = ffsl(~rv->popmap[i]); if (i == NPOPMAP) break; /* Convert from ffsl() to ordinary bit numbering. */ lo--; next_free = NBPOPMAP * i + lo; pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]); KASSERT(pa >= low, ("vm_reserv_reclaim_contig: pa is too low")); if (pa + size > high) { /* The rest of this reservation is too high. */ break; } else if ((pa & (alignment - 1)) != 0 || ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) { /* * The current page doesn't meet the alignment * and/or boundary requirements. Continue * searching this reservation until the rest * of its free pages are either excluded or * exhausted. */ hi = lo + 1; if (hi >= NBPOPMAP) { hi = 0; i++; } continue; } /* Find the next used page. */ hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1)); while (hi == 0 && ++i < NPOPMAP) { if ((NBPOPMAP * i - next_free) * PAGE_SIZE >= size) { vm_reserv_reclaim(rv); return (TRUE); } hi = ffsl(rv->popmap[i]); } /* Convert from ffsl() to ordinary bit numbering. */ if (i != NPOPMAP) hi--; if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >= size) { vm_reserv_reclaim(rv); return (TRUE); } } while (i < NPOPMAP); } return (FALSE); } /* * Transfers the reservation underlying the given page to a new object. * * The object must be locked. */ void vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, vm_pindex_t old_object_offset) { vm_reserv_t rv; VM_OBJECT_ASSERT_WLOCKED(new_object); rv = vm_reserv_from_page(m); if (rv->object == old_object) { mtx_lock(&vm_page_queue_free_mtx); if (rv->object == old_object) { LIST_REMOVE(rv, objq); LIST_INSERT_HEAD(&new_object->rvq, rv, objq); rv->object = new_object; rv->pindex -= old_object_offset; } mtx_unlock(&vm_page_queue_free_mtx); } } /* * Returns the size (in bytes) of a reservation of the specified level. */ int vm_reserv_size(int level) { switch (level) { case 0: return (VM_LEVEL_0_SIZE); case -1: return (PAGE_SIZE); default: return (0); } } /* * Allocates the virtual and physical memory required by the reservation * management system's data structures, in particular, the reservation array. */ vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water) { vm_paddr_t new_end; size_t size; /* * Calculate the size (in bytes) of the reservation array. Round up * from "high_water" because every small page is mapped to an element * in the reservation array based on its physical address. Thus, the * number of elements in the reservation array can be greater than the * number of superpages. */ size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv); /* * Allocate and map the physical memory for the reservation array. The * next available virtual address is returned by reference. */ new_end = end - round_page(size); vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero(vm_reserv_array, size); /* * Return the next available physical address. */ return (new_end); } /* * Returns the superpage containing the given page. */ vm_page_t vm_reserv_to_superpage(vm_page_t m) { vm_reserv_t rv; VM_OBJECT_ASSERT_LOCKED(m->object); rv = vm_reserv_from_page(m); return (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES ? rv->pages : NULL); } #endif /* VM_NRESERVLEVEL > 0 */ Index: projects/numa2/sys/vm/vm_reserv.h =================================================================== --- projects/numa2/sys/vm/vm_reserv.h (revision 321505) +++ projects/numa2/sys/vm/vm_reserv.h (revision 321506) @@ -1,71 +1,72 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007-2008 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * Superpage reservation management definitions */ #ifndef _VM_RESERV_H_ #define _VM_RESERV_H_ #ifdef _KERNEL #if VM_NRESERVLEVEL > 0 /* * The following functions are only to be used by the virtual memory system. */ vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, - u_long npages, vm_paddr_t low, vm_paddr_t high, + int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_page_t mpred); vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, - vm_page_t mpred); + int domain, vm_page_t mpred); void vm_reserv_break_all(vm_object_t object); boolean_t vm_reserv_free_page(vm_page_t m); void vm_reserv_init(void); bool vm_reserv_is_page_free(vm_page_t m); int vm_reserv_level(vm_page_t m); int vm_reserv_level_iffullpop(vm_page_t m); -boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, - vm_paddr_t high, u_long alignment, vm_paddr_t boundary); -boolean_t vm_reserv_reclaim_inactive(void); +boolean_t vm_reserv_reclaim_contig(int domain, u_long npages, + vm_paddr_t low, vm_paddr_t high, u_long alignment, + vm_paddr_t boundary); +boolean_t vm_reserv_reclaim_inactive(int domain); void vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, vm_pindex_t old_object_offset); int vm_reserv_size(int level); vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water); vm_page_t vm_reserv_to_superpage(vm_page_t m); #endif /* VM_NRESERVLEVEL > 0 */ #endif /* _KERNEL */ #endif /* !_VM_RESERV_H_ */