Index: user/jeff/numa/sys/kern/subr_vmem.c =================================================================== --- user/jeff/numa/sys/kern/subr_vmem.c (revision 328171) +++ user/jeff/numa/sys/kern/subr_vmem.c (revision 328172) @@ -1,1611 +1,1611 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi, * Copyright (c) 2013 EMC Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * From: * $NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $ * $NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $ */ /* * reference: * - Magazines and Vmem: Extending the Slab Allocator * to Many CPUs and Arbitrary Resources * http://www.usenix.org/event/usenix01/bonwick.html */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #define VMEM_OPTORDER 5 #define VMEM_OPTVALUE (1 << VMEM_OPTORDER) #define VMEM_MAXORDER \ (VMEM_OPTVALUE - 1 + sizeof(vmem_size_t) * NBBY - VMEM_OPTORDER) #define VMEM_HASHSIZE_MIN 16 #define VMEM_HASHSIZE_MAX 131072 #define VMEM_QCACHE_IDX_MAX 16 #define VMEM_FITMASK (M_BESTFIT | M_FIRSTFIT) #define VMEM_FLAGS \ (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM | M_BESTFIT | M_FIRSTFIT) #define BT_FLAGS (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM) #define QC_NAME_MAX 16 /* * Data structures private to vmem. */ MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures"); typedef struct vmem_btag bt_t; TAILQ_HEAD(vmem_seglist, vmem_btag); LIST_HEAD(vmem_freelist, vmem_btag); LIST_HEAD(vmem_hashlist, vmem_btag); struct qcache { uma_zone_t qc_cache; vmem_t *qc_vmem; vmem_size_t qc_size; char qc_name[QC_NAME_MAX]; }; typedef struct qcache qcache_t; #define QC_POOL_TO_QCACHE(pool) ((qcache_t *)(pool->pr_qcache)) #define VMEM_NAME_MAX 16 /* vmem arena */ struct vmem { struct mtx_padalign vm_lock; struct cv vm_cv; char vm_name[VMEM_NAME_MAX+1]; LIST_ENTRY(vmem) vm_alllist; struct vmem_hashlist vm_hash0[VMEM_HASHSIZE_MIN]; struct vmem_freelist vm_freelist[VMEM_MAXORDER]; struct vmem_seglist vm_seglist; struct vmem_hashlist *vm_hashlist; vmem_size_t vm_hashsize; /* Constant after init */ vmem_size_t vm_qcache_max; vmem_size_t vm_quantum_mask; vmem_size_t vm_import_quantum; int vm_quantum_shift; /* Written on alloc/free */ LIST_HEAD(, vmem_btag) vm_freetags; int vm_nfreetags; int vm_nbusytag; vmem_size_t vm_inuse; vmem_size_t vm_size; vmem_size_t vm_limit; /* Used on import. */ vmem_import_t *vm_importfn; vmem_release_t *vm_releasefn; void *vm_arg; /* Space exhaustion callback. */ vmem_reclaim_t *vm_reclaimfn; /* quantum cache */ qcache_t vm_qcache[VMEM_QCACHE_IDX_MAX]; }; /* boundary tag */ struct vmem_btag { TAILQ_ENTRY(vmem_btag) bt_seglist; union { LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */ LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */ } bt_u; #define bt_hashlist bt_u.u_hashlist #define bt_freelist bt_u.u_freelist vmem_addr_t bt_start; vmem_size_t bt_size; int bt_type; }; #define BT_TYPE_SPAN 1 /* Allocated from importfn */ #define BT_TYPE_SPAN_STATIC 2 /* vmem_add() or create. */ #define BT_TYPE_FREE 3 /* Available space. */ #define BT_TYPE_BUSY 4 /* Used space. */ #define BT_ISSPAN_P(bt) ((bt)->bt_type <= BT_TYPE_SPAN_STATIC) #define BT_END(bt) ((bt)->bt_start + (bt)->bt_size - 1) #if defined(DIAGNOSTIC) static int enable_vmem_check = 1; SYSCTL_INT(_debug, OID_AUTO, vmem_check, CTLFLAG_RWTUN, &enable_vmem_check, 0, "Enable vmem check"); static void vmem_check(vmem_t *); #endif static struct callout vmem_periodic_ch; static int vmem_periodic_interval; static struct task vmem_periodic_wk; static struct mtx_padalign __exclusive_cache_line vmem_list_lock; static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list); static uma_zone_t vmem_zone; /* ---- misc */ #define VMEM_CONDVAR_INIT(vm, wchan) cv_init(&vm->vm_cv, wchan) #define VMEM_CONDVAR_DESTROY(vm) cv_destroy(&vm->vm_cv) #define VMEM_CONDVAR_WAIT(vm) cv_wait(&vm->vm_cv, &vm->vm_lock) #define VMEM_CONDVAR_BROADCAST(vm) cv_broadcast(&vm->vm_cv) #define VMEM_LOCK(vm) mtx_lock(&vm->vm_lock) #define VMEM_TRYLOCK(vm) mtx_trylock(&vm->vm_lock) #define VMEM_UNLOCK(vm) mtx_unlock(&vm->vm_lock) #define VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF) #define VMEM_LOCK_DESTROY(vm) mtx_destroy(&vm->vm_lock) #define VMEM_ASSERT_LOCKED(vm) mtx_assert(&vm->vm_lock, MA_OWNED); #define VMEM_ALIGNUP(addr, align) (-(-(addr) & -(align))) #define VMEM_CROSS_P(addr1, addr2, boundary) \ ((((addr1) ^ (addr2)) & -(boundary)) != 0) #define ORDER2SIZE(order) ((order) < VMEM_OPTVALUE ? ((order) + 1) : \ (vmem_size_t)1 << ((order) - (VMEM_OPTVALUE - VMEM_OPTORDER - 1))) #define SIZE2ORDER(size) ((size) <= VMEM_OPTVALUE ? ((size) - 1) : \ (flsl(size) + (VMEM_OPTVALUE - VMEM_OPTORDER - 2))) /* * Maximum number of boundary tags that may be required to satisfy an * allocation. Two may be required to import. Another two may be * required to clip edges. */ #define BT_MAXALLOC 4 /* * Max free limits the number of locally cached boundary tags. We * just want to avoid hitting the zone allocator for every call. */ #define BT_MAXFREE (BT_MAXALLOC * 8) /* Allocator for boundary tags. */ static uma_zone_t vmem_bt_zone; /* boot time arena storage. */ static struct vmem kernel_arena_storage; static struct vmem buffer_arena_storage; static struct vmem transient_arena_storage; /* kernel and kmem arenas are aliased for backwards KPI compat. */ vmem_t *kernel_arena = &kernel_arena_storage; vmem_t *kmem_arena = &kernel_arena_storage; vmem_t *buffer_arena = &buffer_arena_storage; vmem_t *transient_arena = &transient_arena_storage; #ifdef DEBUG_MEMGUARD static struct vmem memguard_arena_storage; vmem_t *memguard_arena = &memguard_arena_storage; #endif /* * Fill the vmem's boundary tag cache. We guarantee that boundary tag * allocation will not fail once bt_fill() passes. To do so we cache * at least the maximum possible tag allocations in the arena. */ static int bt_fill(vmem_t *vm, int flags) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); /* * Only allow the kernel arena and arenas derived from kernel arena to * dip into reserve tags. They are where new tags come from. */ flags &= BT_FLAGS; if (vm != kernel_arena && vm->vm_arg != kernel_arena) flags &= ~M_USE_RESERVE; /* * Loop until we meet the reserve. To minimize the lock shuffle * and prevent simultaneous fills we first try a NOWAIT regardless * of the caller's flags. Specify M_NOVM so we don't recurse while * holding a vmem lock. */ while (vm->vm_nfreetags < BT_MAXALLOC) { bt = uma_zalloc(vmem_bt_zone, (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM); if (bt == NULL) { VMEM_UNLOCK(vm); bt = uma_zalloc(vmem_bt_zone, flags); VMEM_LOCK(vm); if (bt == NULL && (flags & M_NOWAIT) != 0) break; } LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } if (vm->vm_nfreetags < BT_MAXALLOC) return ENOMEM; return 0; } /* * Pop a tag off of the freetag stack. */ static bt_t * bt_alloc(vmem_t *vm) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); bt = LIST_FIRST(&vm->vm_freetags); MPASS(bt != NULL); LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; return bt; } /* * Trim the per-vmem free list. Returns with the lock released to * avoid allocator recursions. */ static void bt_freetrim(vmem_t *vm, int freelimit) { LIST_HEAD(, vmem_btag) freetags; bt_t *bt; LIST_INIT(&freetags); VMEM_ASSERT_LOCKED(vm); while (vm->vm_nfreetags > freelimit) { bt = LIST_FIRST(&vm->vm_freetags); LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; LIST_INSERT_HEAD(&freetags, bt, bt_freelist); } VMEM_UNLOCK(vm); while ((bt = LIST_FIRST(&freetags)) != NULL) { LIST_REMOVE(bt, bt_freelist); uma_zfree(vmem_bt_zone, bt); } } static inline void bt_free(vmem_t *vm, bt_t *bt) { VMEM_ASSERT_LOCKED(vm); MPASS(LIST_FIRST(&vm->vm_freetags) != bt); LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } /* * freelist[0] ... [1, 1] * freelist[1] ... [2, 2] * : * freelist[29] ... [30, 30] * freelist[30] ... [31, 31] * freelist[31] ... [32, 63] * freelist[33] ... [64, 127] * : * freelist[n] ... [(1 << (n - 26)), (1 << (n - 25)) - 1] * : */ static struct vmem_freelist * bt_freehead_tofree(vmem_t *vm, vmem_size_t size) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; const int idx = SIZE2ORDER(qsize); MPASS(size != 0 && qsize != 0); MPASS((size & vm->vm_quantum_mask) == 0); MPASS(idx >= 0); MPASS(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* * bt_freehead_toalloc: return the freelist for the given size and allocation * strategy. * * For M_FIRSTFIT, return the list in which any blocks are large enough * for the requested size. otherwise, return the list which can have blocks * large enough for the requested size. */ static struct vmem_freelist * bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; int idx = SIZE2ORDER(qsize); MPASS(size != 0 && qsize != 0); MPASS((size & vm->vm_quantum_mask) == 0); if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize) { idx++; /* check too large request? */ } MPASS(idx >= 0); MPASS(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* ---- boundary tag hash */ static struct vmem_hashlist * bt_hashhead(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; unsigned int hash; hash = hash32_buf(&addr, sizeof(addr), 0); list = &vm->vm_hashlist[hash % vm->vm_hashsize]; return list; } static bt_t * bt_lookupbusy(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; bt_t *bt; VMEM_ASSERT_LOCKED(vm); list = bt_hashhead(vm, addr); LIST_FOREACH(bt, list, bt_hashlist) { if (bt->bt_start == addr) { break; } } return bt; } static void bt_rembusy(vmem_t *vm, bt_t *bt) { VMEM_ASSERT_LOCKED(vm); MPASS(vm->vm_nbusytag > 0); vm->vm_inuse -= bt->bt_size; vm->vm_nbusytag--; LIST_REMOVE(bt, bt_hashlist); } static void bt_insbusy(vmem_t *vm, bt_t *bt) { struct vmem_hashlist *list; VMEM_ASSERT_LOCKED(vm); MPASS(bt->bt_type == BT_TYPE_BUSY); list = bt_hashhead(vm, bt->bt_start); LIST_INSERT_HEAD(list, bt, bt_hashlist); vm->vm_nbusytag++; vm->vm_inuse += bt->bt_size; } /* ---- boundary tag list */ static void bt_remseg(vmem_t *vm, bt_t *bt) { TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist); bt_free(vm, bt); } static void bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev) { TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist); } static void bt_insseg_tail(vmem_t *vm, bt_t *bt) { TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist); } static void bt_remfree(vmem_t *vm, bt_t *bt) { MPASS(bt->bt_type == BT_TYPE_FREE); LIST_REMOVE(bt, bt_freelist); } static void bt_insfree(vmem_t *vm, bt_t *bt) { struct vmem_freelist *list; list = bt_freehead_tofree(vm, bt->bt_size); LIST_INSERT_HEAD(list, bt, bt_freelist); } /* ---- vmem internal functions */ /* * Import from the arena into the quantum cache in UMA. */ static int qc_import(void *arg, void **store, int cnt, int domain, int flags) { qcache_t *qc; vmem_addr_t addr; int i; qc = arg; if ((flags & VMEM_FITMASK) == 0) flags |= M_BESTFIT; for (i = 0; i < cnt; i++) { if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, &addr) != 0) break; store[i] = (void *)addr; /* Only guarantee one allocation. */ flags &= ~M_WAITOK; flags |= M_NOWAIT; } return i; } /* * Release memory from the UMA cache to the arena. */ static void qc_release(void *arg, void **store, int cnt) { qcache_t *qc; int i; qc = arg; for (i = 0; i < cnt; i++) vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size); } static void qc_init(vmem_t *vm, vmem_size_t qcache_max) { qcache_t *qc; vmem_size_t size; int qcache_idx_max; int i; MPASS((qcache_max & vm->vm_quantum_mask) == 0); qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift, VMEM_QCACHE_IDX_MAX); vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) { qc = &vm->vm_qcache[i]; size = (i + 1) << vm->vm_quantum_shift; snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu", vm->vm_name, size); qc->qc_vmem = vm; qc->qc_size = size; qc->qc_cache = uma_zcache_create(qc->qc_name, size, NULL, NULL, NULL, NULL, qc_import, qc_release, qc, UMA_ZONE_VM); MPASS(qc->qc_cache); } } static void qc_destroy(vmem_t *vm) { int qcache_idx_max; int i; qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) uma_zdestroy(vm->vm_qcache[i].qc_cache); } static void qc_drain(vmem_t *vm) { int qcache_idx_max; int i; qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) zone_drain(vm->vm_qcache[i].qc_cache); } #ifndef UMA_MD_SMALL_ALLOC static struct mtx_padalign __exclusive_cache_line vmem_bt_lock; /* * vmem_bt_alloc: Allocate a new page of boundary tags. * * On architectures with uma_small_alloc there is no recursion; no address * space need be allocated to allocate boundary tags. For the others, we * must handle recursion. Boundary tags are necessary to allocate new * boundary tags. * * UMA guarantees that enough tags are held in reserve to allocate a new * page of kva. We dip into this reserve by specifying M_USE_RESERVE only * when allocating the page to hold new boundary tags. In this way the * reserve is automatically filled by the allocation that uses the reserve. * * We still have to guarantee that the new tags are allocated atomically since * many threads may try concurrently. The bt_lock provides this guarantee. * We convert WAITOK allocations to NOWAIT and then handle the blocking here * on failure. It's ok to return NULL for a WAITOK allocation as UMA will * loop again after checking to see if we lost the race to allocate. * * There is a small race between vmem_bt_alloc() returning the page and the * zone lock being acquired to add the page to the zone. For WAITOK * allocations we just pause briefly. NOWAIT may experience a transient * failure. To alleviate this we permit a small number of simultaneous * fills to proceed concurrently so NOWAIT is less likely to fail unless * we are really out of KVA. */ static void * vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { vmem_addr_t addr; *pflag = UMA_SLAB_KERNEL; /* * Single thread boundary tag allocation so that the address space * and memory are added in one atomic operation. */ mtx_lock(&vmem_bt_lock); if (vmem_xalloc(vm_dom[domain].vmd_kernel_arena, bytes, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT, &addr) == 0) { if (kmem_back_domain(domain, kernel_object, addr, bytes, M_NOWAIT | M_USE_RESERVE) == 0) { mtx_unlock(&vmem_bt_lock); return ((void *)addr); } vmem_xfree(vm_dom[domain].vmd_kernel_arena, addr, bytes); mtx_unlock(&vmem_bt_lock); /* * Out of memory, not address space. This may not even be * possible due to M_USE_RESERVE page allocation. */ if (wait & M_WAITOK) - VM_WAIT; + vm_wait_domain(domain); return (NULL); } mtx_unlock(&vmem_bt_lock); /* * We're either out of address space or lost a fill race. */ if (wait & M_WAITOK) pause("btalloc", 1); return (NULL); } #endif void vmem_startup(void) { mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF); vmem_zone = uma_zcreate("vmem", sizeof(struct vmem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); vmem_bt_zone = uma_zcreate("vmem btag", sizeof(struct vmem_btag), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); #ifndef UMA_MD_SMALL_ALLOC mtx_init(&vmem_bt_lock, "btag lock", NULL, MTX_DEF); uma_prealloc(vmem_bt_zone, BT_MAXALLOC); /* * Reserve enough tags to allocate new tags. We allow multiple * CPUs to attempt to allocate new tags concurrently to limit * false restarts in UMA. */ uma_zone_reserve(vmem_bt_zone, BT_MAXALLOC * (mp_ncpus + 1) / 2); uma_zone_set_allocf(vmem_bt_zone, vmem_bt_alloc); #endif } /* ---- rehash */ static int vmem_rehash(vmem_t *vm, vmem_size_t newhashsize) { bt_t *bt; int i; struct vmem_hashlist *newhashlist; struct vmem_hashlist *oldhashlist; vmem_size_t oldhashsize; MPASS(newhashsize > 0); newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize, M_VMEM, M_NOWAIT); if (newhashlist == NULL) return ENOMEM; for (i = 0; i < newhashsize; i++) { LIST_INIT(&newhashlist[i]); } VMEM_LOCK(vm); oldhashlist = vm->vm_hashlist; oldhashsize = vm->vm_hashsize; vm->vm_hashlist = newhashlist; vm->vm_hashsize = newhashsize; if (oldhashlist == NULL) { VMEM_UNLOCK(vm); return 0; } for (i = 0; i < oldhashsize; i++) { while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) { bt_rembusy(vm, bt); bt_insbusy(vm, bt); } } VMEM_UNLOCK(vm); if (oldhashlist != vm->vm_hash0) { free(oldhashlist, M_VMEM); } return 0; } static void vmem_periodic_kick(void *dummy) { taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk); } static void vmem_periodic(void *unused, int pending) { vmem_t *vm; vmem_size_t desired; vmem_size_t current; mtx_lock(&vmem_list_lock); LIST_FOREACH(vm, &vmem_list, vm_alllist) { #ifdef DIAGNOSTIC /* Convenient time to verify vmem state. */ if (enable_vmem_check == 1) { VMEM_LOCK(vm); vmem_check(vm); VMEM_UNLOCK(vm); } #endif desired = 1 << flsl(vm->vm_nbusytag); desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN), VMEM_HASHSIZE_MAX); current = vm->vm_hashsize; /* Grow in powers of two. Shrink less aggressively. */ if (desired >= current * 2 || desired * 4 <= current) vmem_rehash(vm, desired); /* * Periodically wake up threads waiting for resources, * so they could ask for reclamation again. */ VMEM_CONDVAR_BROADCAST(vm); } mtx_unlock(&vmem_list_lock); callout_reset(&vmem_periodic_ch, vmem_periodic_interval, vmem_periodic_kick, NULL); } static void vmem_start_callout(void *unused) { TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL); vmem_periodic_interval = hz * 10; callout_init(&vmem_periodic_ch, 1); callout_reset(&vmem_periodic_ch, vmem_periodic_interval, vmem_periodic_kick, NULL); } SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL); static void vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int type) { bt_t *btspan; bt_t *btfree; MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC); MPASS((size & vm->vm_quantum_mask) == 0); btspan = bt_alloc(vm); btspan->bt_type = type; btspan->bt_start = addr; btspan->bt_size = size; bt_insseg_tail(vm, btspan); btfree = bt_alloc(vm); btfree->bt_type = BT_TYPE_FREE; btfree->bt_start = addr; btfree->bt_size = size; bt_insseg(vm, btfree, btspan); bt_insfree(vm, btfree); vm->vm_size += size; } static void vmem_destroy1(vmem_t *vm) { bt_t *bt; /* * Drain per-cpu quantum caches. */ qc_destroy(vm); /* * The vmem should now only contain empty segments. */ VMEM_LOCK(vm); MPASS(vm->vm_nbusytag == 0); while ((bt = TAILQ_FIRST(&vm->vm_seglist)) != NULL) bt_remseg(vm, bt); if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0) free(vm->vm_hashlist, M_VMEM); bt_freetrim(vm, 0); VMEM_CONDVAR_DESTROY(vm); VMEM_LOCK_DESTROY(vm); uma_zfree(vmem_zone, vm); } static int vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags) { vmem_addr_t addr; int error; if (vm->vm_importfn == NULL) return (EINVAL); /* * To make sure we get a span that meets the alignment we double it * and add the size to the tail. This slightly overestimates. */ if (align != vm->vm_quantum_mask + 1) size = (align * 2) + size; size = roundup(size, vm->vm_import_quantum); if (vm->vm_limit != 0 && vm->vm_limit < vm->vm_size + size) return (ENOMEM); /* * Hide MAXALLOC tags so we're guaranteed to be able to add this * span and the tag we want to allocate from it. */ MPASS(vm->vm_nfreetags >= BT_MAXALLOC); vm->vm_nfreetags -= BT_MAXALLOC; VMEM_UNLOCK(vm); error = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr); VMEM_LOCK(vm); vm->vm_nfreetags += BT_MAXALLOC; if (error) return (ENOMEM); vmem_add1(vm, addr, size, BT_TYPE_SPAN); return 0; } /* * vmem_fit: check if a bt can satisfy the given restrictions. * * it's a caller's responsibility to ensure the region is big enough * before calling us. */ static int vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align, vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr, vmem_addr_t maxaddr, vmem_addr_t *addrp) { vmem_addr_t start; vmem_addr_t end; MPASS(size > 0); MPASS(bt->bt_size >= size); /* caller's responsibility */ /* * XXX assumption: vmem_addr_t and vmem_size_t are * unsigned integer of the same size. */ start = bt->bt_start; if (start < minaddr) { start = minaddr; } end = BT_END(bt); if (end > maxaddr) end = maxaddr; if (start > end) return (ENOMEM); start = VMEM_ALIGNUP(start - phase, align) + phase; if (start < bt->bt_start) start += align; if (VMEM_CROSS_P(start, start + size - 1, nocross)) { MPASS(align < nocross); start = VMEM_ALIGNUP(start - phase, nocross) + phase; } if (start <= end && end - start >= size - 1) { MPASS((start & (align - 1)) == phase); MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross)); MPASS(minaddr <= start); MPASS(maxaddr == 0 || start + size - 1 <= maxaddr); MPASS(bt->bt_start <= start); MPASS(BT_END(bt) - start >= size - 1); *addrp = start; return (0); } return (ENOMEM); } /* * vmem_clip: Trim the boundary tag edges to the requested start and size. */ static void vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size) { bt_t *btnew; bt_t *btprev; VMEM_ASSERT_LOCKED(vm); MPASS(bt->bt_type == BT_TYPE_FREE); MPASS(bt->bt_size >= size); bt_remfree(vm, bt); if (bt->bt_start != start) { btprev = bt_alloc(vm); btprev->bt_type = BT_TYPE_FREE; btprev->bt_start = bt->bt_start; btprev->bt_size = start - bt->bt_start; bt->bt_start = start; bt->bt_size -= btprev->bt_size; bt_insfree(vm, btprev); bt_insseg(vm, btprev, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); } MPASS(bt->bt_start == start); if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) { /* split */ btnew = bt_alloc(vm); btnew->bt_type = BT_TYPE_BUSY; btnew->bt_start = bt->bt_start; btnew->bt_size = size; bt->bt_start = bt->bt_start + size; bt->bt_size -= size; bt_insfree(vm, bt); bt_insseg(vm, btnew, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); bt_insbusy(vm, btnew); bt = btnew; } else { bt->bt_type = BT_TYPE_BUSY; bt_insbusy(vm, bt); } MPASS(bt->bt_size >= size); bt->bt_type = BT_TYPE_BUSY; } /* ---- vmem API */ void vmem_set_import(vmem_t *vm, vmem_import_t *importfn, vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum) { VMEM_LOCK(vm); vm->vm_importfn = importfn; vm->vm_releasefn = releasefn; vm->vm_arg = arg; vm->vm_import_quantum = import_quantum; VMEM_UNLOCK(vm); } void vmem_set_limit(vmem_t *vm, vmem_size_t limit) { VMEM_LOCK(vm); vm->vm_limit = limit; VMEM_UNLOCK(vm); } void vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn) { VMEM_LOCK(vm); vm->vm_reclaimfn = reclaimfn; VMEM_UNLOCK(vm); } /* * vmem_init: Initializes vmem arena. */ vmem_t * vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags) { int i; MPASS(quantum > 0); MPASS((quantum & (quantum - 1)) == 0); bzero(vm, sizeof(*vm)); VMEM_CONDVAR_INIT(vm, name); VMEM_LOCK_INIT(vm, name); vm->vm_nfreetags = 0; LIST_INIT(&vm->vm_freetags); strlcpy(vm->vm_name, name, sizeof(vm->vm_name)); vm->vm_quantum_mask = quantum - 1; vm->vm_quantum_shift = flsl(quantum) - 1; vm->vm_nbusytag = 0; vm->vm_size = 0; vm->vm_limit = 0; vm->vm_inuse = 0; qc_init(vm, qcache_max); TAILQ_INIT(&vm->vm_seglist); for (i = 0; i < VMEM_MAXORDER; i++) { LIST_INIT(&vm->vm_freelist[i]); } memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0)); vm->vm_hashsize = VMEM_HASHSIZE_MIN; vm->vm_hashlist = vm->vm_hash0; if (size != 0) { if (vmem_add(vm, base, size, flags) != 0) { vmem_destroy1(vm); return NULL; } } mtx_lock(&vmem_list_lock); LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist); mtx_unlock(&vmem_list_lock); return vm; } /* * vmem_create: create an arena. */ vmem_t * vmem_create(const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags) { vmem_t *vm; vm = uma_zalloc(vmem_zone, flags & (M_WAITOK|M_NOWAIT)); if (vm == NULL) return (NULL); if (vmem_init(vm, name, base, size, quantum, qcache_max, flags) == NULL) return (NULL); return (vm); } void vmem_destroy(vmem_t *vm) { mtx_lock(&vmem_list_lock); LIST_REMOVE(vm, vm_alllist); mtx_unlock(&vmem_list_lock); vmem_destroy1(vm); } vmem_size_t vmem_roundup_size(vmem_t *vm, vmem_size_t size) { return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask; } /* * vmem_alloc: allocate resource from the arena. */ int vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp) { const int strat __unused = flags & VMEM_FITMASK; qcache_t *qc; flags &= VMEM_FLAGS; MPASS(size > 0); MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT); if ((flags & M_NOWAIT) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc"); if (size <= vm->vm_qcache_max) { qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift]; *addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache, flags); if (*addrp == 0) return (ENOMEM); return (0); } return vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, addrp); } int vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align, const vmem_size_t phase, const vmem_size_t nocross, const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags, vmem_addr_t *addrp) { const vmem_size_t size = vmem_roundup_size(vm, size0); struct vmem_freelist *list; struct vmem_freelist *first; struct vmem_freelist *end; vmem_size_t avail; bt_t *bt; int error; int strat; flags &= VMEM_FLAGS; strat = flags & VMEM_FITMASK; MPASS(size0 > 0); MPASS(size > 0); MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT); MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK)); if ((flags & M_NOWAIT) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc"); MPASS((align & vm->vm_quantum_mask) == 0); MPASS((align & (align - 1)) == 0); MPASS((phase & vm->vm_quantum_mask) == 0); MPASS((nocross & vm->vm_quantum_mask) == 0); MPASS((nocross & (nocross - 1)) == 0); MPASS((align == 0 && phase == 0) || phase < align); MPASS(nocross == 0 || nocross >= size); MPASS(minaddr <= maxaddr); MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross)); if (align == 0) align = vm->vm_quantum_mask + 1; *addrp = 0; end = &vm->vm_freelist[VMEM_MAXORDER]; /* * choose a free block from which we allocate. */ first = bt_freehead_toalloc(vm, size, strat); VMEM_LOCK(vm); for (;;) { /* * Make sure we have enough tags to complete the * operation. */ if (vm->vm_nfreetags < BT_MAXALLOC && bt_fill(vm, flags) != 0) { error = ENOMEM; break; } /* * Scan freelists looking for a tag that satisfies the * allocation. If we're doing BESTFIT we may encounter * sizes below the request. If we're doing FIRSTFIT we * inspect only the first element from each list. */ for (list = first; list < end; list++) { LIST_FOREACH(bt, list, bt_freelist) { if (bt->bt_size >= size) { error = vmem_fit(bt, size, align, phase, nocross, minaddr, maxaddr, addrp); if (error == 0) { vmem_clip(vm, bt, *addrp, size); goto out; } } /* FIRST skips to the next list. */ if (strat == M_FIRSTFIT) break; } } /* * Retry if the fast algorithm failed. */ if (strat == M_FIRSTFIT) { strat = M_BESTFIT; first = bt_freehead_toalloc(vm, size, strat); continue; } /* * XXX it is possible to fail to meet restrictions with the * imported region. It is up to the user to specify the * import quantum such that it can satisfy any allocation. */ if (vmem_import(vm, size, align, flags) == 0) continue; /* * Try to free some space from the quantum cache or reclaim * functions if available. */ if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) { avail = vm->vm_size - vm->vm_inuse; VMEM_UNLOCK(vm); if (vm->vm_qcache_max != 0) qc_drain(vm); if (vm->vm_reclaimfn != NULL) vm->vm_reclaimfn(vm, flags); VMEM_LOCK(vm); /* If we were successful retry even NOWAIT. */ if (vm->vm_size - vm->vm_inuse > avail) continue; } if ((flags & M_NOWAIT) != 0) { error = ENOMEM; break; } VMEM_CONDVAR_WAIT(vm); } out: VMEM_UNLOCK(vm); if (error != 0 && (flags & M_NOWAIT) == 0) panic("failed to allocate waiting allocation\n"); return (error); } /* * vmem_free: free the resource to the arena. */ void vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { qcache_t *qc; MPASS(size > 0); if (size <= vm->vm_qcache_max) { qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift]; uma_zfree(qc->qc_cache, (void *)addr); } else vmem_xfree(vm, addr, size); } void vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { bt_t *bt; bt_t *t; MPASS(size > 0); VMEM_LOCK(vm); bt = bt_lookupbusy(vm, addr); MPASS(bt != NULL); MPASS(bt->bt_start == addr); MPASS(bt->bt_size == vmem_roundup_size(vm, size) || bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask); MPASS(bt->bt_type == BT_TYPE_BUSY); bt_rembusy(vm, bt); bt->bt_type = BT_TYPE_FREE; /* coalesce */ t = TAILQ_NEXT(bt, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { MPASS(BT_END(bt) < t->bt_start); /* YYY */ bt->bt_size += t->bt_size; bt_remfree(vm, t); bt_remseg(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { MPASS(BT_END(t) < bt->bt_start); /* YYY */ bt->bt_size += t->bt_size; bt->bt_start = t->bt_start; bt_remfree(vm, t); bt_remseg(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); MPASS(t != NULL); MPASS(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY); if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN && t->bt_size == bt->bt_size) { vmem_addr_t spanaddr; vmem_size_t spansize; MPASS(t->bt_start == bt->bt_start); spanaddr = bt->bt_start; spansize = bt->bt_size; bt_remseg(vm, bt); bt_remseg(vm, t); vm->vm_size -= spansize; VMEM_CONDVAR_BROADCAST(vm); bt_freetrim(vm, BT_MAXFREE); (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize); } else { bt_insfree(vm, bt); VMEM_CONDVAR_BROADCAST(vm); bt_freetrim(vm, BT_MAXFREE); } } /* * vmem_add: * */ int vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags) { int error; error = 0; flags &= VMEM_FLAGS; VMEM_LOCK(vm); if (vm->vm_nfreetags >= BT_MAXALLOC || bt_fill(vm, flags) == 0) vmem_add1(vm, addr, size, BT_TYPE_SPAN_STATIC); else error = ENOMEM; VMEM_UNLOCK(vm); return (error); } /* * vmem_size: information about arenas size */ vmem_size_t vmem_size(vmem_t *vm, int typemask) { int i; switch (typemask) { case VMEM_ALLOC: return vm->vm_inuse; case VMEM_FREE: return vm->vm_size - vm->vm_inuse; case VMEM_FREE|VMEM_ALLOC: return vm->vm_size; case VMEM_MAXFREE: VMEM_LOCK(vm); for (i = VMEM_MAXORDER - 1; i >= 0; i--) { if (LIST_EMPTY(&vm->vm_freelist[i])) continue; VMEM_UNLOCK(vm); return ((vmem_size_t)ORDER2SIZE(i) << vm->vm_quantum_shift); } VMEM_UNLOCK(vm); return (0); default: panic("vmem_size"); } } /* ---- debug */ #if defined(DDB) || defined(DIAGNOSTIC) static void bt_dump(const bt_t *, int (*)(const char *, ...) __printflike(1, 2)); static const char * bt_type_string(int type) { switch (type) { case BT_TYPE_BUSY: return "busy"; case BT_TYPE_FREE: return "free"; case BT_TYPE_SPAN: return "span"; case BT_TYPE_SPAN_STATIC: return "static span"; default: break; } return "BOGUS"; } static void bt_dump(const bt_t *bt, int (*pr)(const char *, ...)) { (*pr)("\t%p: %jx %jx, %d(%s)\n", bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size, bt->bt_type, bt_type_string(bt->bt_type)); } static void vmem_dump(const vmem_t *vm , int (*pr)(const char *, ...) __printflike(1, 2)) { const bt_t *bt; int i; (*pr)("vmem %p '%s'\n", vm, vm->vm_name); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { bt_dump(bt, pr); } for (i = 0; i < VMEM_MAXORDER; i++) { const struct vmem_freelist *fl = &vm->vm_freelist[i]; if (LIST_EMPTY(fl)) { continue; } (*pr)("freelist[%d]\n", i); LIST_FOREACH(bt, fl, bt_freelist) { bt_dump(bt, pr); } } } #endif /* defined(DDB) || defined(DIAGNOSTIC) */ #if defined(DDB) #include static bt_t * vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr) { bt_t *bt; TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (BT_ISSPAN_P(bt)) { continue; } if (bt->bt_start <= addr && addr <= BT_END(bt)) { return bt; } } return NULL; } void vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...)) { vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { bt_t *bt; bt = vmem_whatis_lookup(vm, addr); if (bt == NULL) { continue; } (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n", (void *)addr, (void *)bt->bt_start, (vmem_size_t)(addr - bt->bt_start), vm->vm_name, (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free"); } } void vmem_printall(const char *modif, int (*pr)(const char *, ...)) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { vmem_dump(vm, pr); } } void vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...)) { const vmem_t *vm = (const void *)addr; vmem_dump(vm, pr); } DB_SHOW_COMMAND(vmemdump, vmemdump) { if (!have_addr) { db_printf("usage: show vmemdump \n"); return; } vmem_dump((const vmem_t *)addr, db_printf); } DB_SHOW_ALL_COMMAND(vmemdump, vmemdumpall) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) vmem_dump(vm, db_printf); } DB_SHOW_COMMAND(vmem, vmem_summ) { const vmem_t *vm = (const void *)addr; const bt_t *bt; size_t ft[VMEM_MAXORDER], ut[VMEM_MAXORDER]; size_t fs[VMEM_MAXORDER], us[VMEM_MAXORDER]; int ord; if (!have_addr) { db_printf("usage: show vmem \n"); return; } db_printf("vmem %p '%s'\n", vm, vm->vm_name); db_printf("\tquantum:\t%zu\n", vm->vm_quantum_mask + 1); db_printf("\tsize:\t%zu\n", vm->vm_size); db_printf("\tinuse:\t%zu\n", vm->vm_inuse); db_printf("\tfree:\t%zu\n", vm->vm_size - vm->vm_inuse); db_printf("\tbusy tags:\t%d\n", vm->vm_nbusytag); db_printf("\tfree tags:\t%d\n", vm->vm_nfreetags); memset(&ft, 0, sizeof(ft)); memset(&ut, 0, sizeof(ut)); memset(&fs, 0, sizeof(fs)); memset(&us, 0, sizeof(us)); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { ord = SIZE2ORDER(bt->bt_size >> vm->vm_quantum_shift); if (bt->bt_type == BT_TYPE_BUSY) { ut[ord]++; us[ord] += bt->bt_size; } else if (bt->bt_type == BT_TYPE_FREE) { ft[ord]++; fs[ord] += bt->bt_size; } } db_printf("\t\t\tinuse\tsize\t\tfree\tsize\n"); for (ord = 0; ord < VMEM_MAXORDER; ord++) { if (ut[ord] == 0 && ft[ord] == 0) continue; db_printf("\t%-15zu %zu\t%-15zu %zu\t%-16zu\n", ORDER2SIZE(ord) << vm->vm_quantum_shift, ut[ord], us[ord], ft[ord], fs[ord]); } } DB_SHOW_ALL_COMMAND(vmem, vmem_summall) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) vmem_summ((db_expr_t)vm, TRUE, count, modif); } #endif /* defined(DDB) */ #define vmem_printf printf #if defined(DIAGNOSTIC) static bool vmem_check_sanity(vmem_t *vm) { const bt_t *bt, *bt2; MPASS(vm != NULL); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (bt->bt_start > BT_END(bt)) { printf("corrupted tag\n"); bt_dump(bt, vmem_printf); return false; } } TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) { if (bt == bt2) { continue; } if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) { continue; } if (bt->bt_start <= BT_END(bt2) && bt2->bt_start <= BT_END(bt)) { printf("overwrapped tags\n"); bt_dump(bt, vmem_printf); bt_dump(bt2, vmem_printf); return false; } } } return true; } static void vmem_check(vmem_t *vm) { if (!vmem_check_sanity(vm)) { panic("insanity vmem %p", vm); } } #endif /* defined(DIAGNOSTIC) */ Index: user/jeff/numa/sys/sys/vmmeter.h =================================================================== --- user/jeff/numa/sys/sys/vmmeter.h (revision 328171) +++ user/jeff/numa/sys/sys/vmmeter.h (revision 328172) @@ -1,189 +1,191 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vmmeter.h 8.2 (Berkeley) 7/10/94 * $FreeBSD$ */ #ifndef _SYS_VMMETER_H_ #define _SYS_VMMETER_H_ /* * This value is used by ps(1) to change sleep state flag from 'S' to * 'I' and by the sched process to set the alarm clock. */ #define MAXSLP 20 /* Systemwide totals computed every five seconds. */ struct vmtotal { uint64_t t_vm; /* total virtual memory */ uint64_t t_avm; /* active virtual memory */ uint64_t t_rm; /* total real memory in use */ uint64_t t_arm; /* active real memory */ uint64_t t_vmshr; /* shared virtual memory */ uint64_t t_avmshr; /* active shared virtual memory */ uint64_t t_rmshr; /* shared real memory */ uint64_t t_armshr; /* active shared real memory */ uint64_t t_free; /* free memory pages */ int16_t t_rq; /* length of the run queue */ int16_t t_dw; /* jobs in ``disk wait'' (neg priority) */ int16_t t_pw; /* jobs in page wait */ int16_t t_sl; /* jobs sleeping in core */ int16_t t_sw; /* swapped out runnable/short block jobs */ uint16_t t_pad[3]; }; #if defined(_KERNEL) || defined(_WANT_VMMETER) #include #ifdef _KERNEL #define VMMETER_ALIGNED __aligned(CACHE_LINE_SIZE) #else #define VMMETER_ALIGNED #endif /* * System wide statistics counters. * Locking: * a - locked by atomic operations * c - constant after initialization * f - locked by vm_page_queue_free_mtx * p - uses counter(9) * q - changes are synchronized by the corresponding vm_pagequeue lock */ struct vmmeter { /* * General system activity. */ counter_u64_t v_swtch; /* (p) context switches */ counter_u64_t v_trap; /* (p) calls to trap */ counter_u64_t v_syscall; /* (p) calls to syscall() */ counter_u64_t v_intr; /* (p) device interrupts */ counter_u64_t v_soft; /* (p) software interrupts */ /* * Virtual memory activity. */ counter_u64_t v_vm_faults; /* (p) address memory faults */ counter_u64_t v_io_faults; /* (p) page faults requiring I/O */ counter_u64_t v_cow_faults; /* (p) copy-on-writes faults */ counter_u64_t v_cow_optim; /* (p) optimized COW faults */ counter_u64_t v_zfod; /* (p) pages zero filled on demand */ counter_u64_t v_ozfod; /* (p) optimized zero fill pages */ counter_u64_t v_swapin; /* (p) swap pager pageins */ counter_u64_t v_swapout; /* (p) swap pager pageouts */ counter_u64_t v_swappgsin; /* (p) swap pager pages paged in */ counter_u64_t v_swappgsout; /* (p) swap pager pages paged out */ counter_u64_t v_vnodein; /* (p) vnode pager pageins */ counter_u64_t v_vnodeout; /* (p) vnode pager pageouts */ counter_u64_t v_vnodepgsin; /* (p) vnode_pager pages paged in */ counter_u64_t v_vnodepgsout; /* (p) vnode pager pages paged out */ counter_u64_t v_intrans; /* (p) intransit blocking page faults */ counter_u64_t v_reactivated; /* (p) reactivated by the pagedaemon */ counter_u64_t v_pdwakeups; /* (p) times daemon has awaken */ counter_u64_t v_pdpages; /* (p) pages analyzed by daemon */ counter_u64_t v_pdshortfalls; /* (p) page reclamation shortfalls */ counter_u64_t v_dfree; /* (p) pages freed by daemon */ counter_u64_t v_pfree; /* (p) pages freed by processes */ counter_u64_t v_tfree; /* (p) total pages freed */ /* * Fork/vfork/rfork activity. */ counter_u64_t v_forks; /* (p) fork() calls */ counter_u64_t v_vforks; /* (p) vfork() calls */ counter_u64_t v_rforks; /* (p) rfork() calls */ counter_u64_t v_kthreads; /* (p) fork() calls by kernel */ counter_u64_t v_forkpages; /* (p) pages affected by fork() */ counter_u64_t v_vforkpages; /* (p) pages affected by vfork() */ counter_u64_t v_rforkpages; /* (p) pages affected by rfork() */ counter_u64_t v_kthreadpages; /* (p) ... and by kernel fork() */ #define VM_METER_NCOUNTERS \ (offsetof(struct vmmeter, v_page_size) / sizeof(counter_u64_t)) /* * Distribution of page usages. */ u_int v_page_size; /* (c) page size in bytes */ u_int v_page_count; /* (c) total number of pages in system */ u_int v_free_reserved; /* (c) pages reserved for deadlock */ u_int v_free_target; /* (c) pages desired free */ u_int v_free_min; /* (c) pages desired free */ u_int v_inactive_target; /* (c) pages desired inactive */ u_int v_pageout_free_min; /* (c) min pages reserved for kernel */ u_int v_interrupt_free_min; /* (c) reserved pages for int code */ u_int v_free_severe; /* (c) severe page depletion point */ u_int v_wire_count VMMETER_ALIGNED; /* (a) pages wired down */ }; #endif /* _KERNEL || _WANT_VMMETER */ #ifdef _KERNEL +#include + extern struct vmmeter vm_cnt; +extern domainset_t vm_min_domains; +extern domainset_t vm_severe_domains; #define VM_CNT_ADD(var, x) counter_u64_add(vm_cnt.var, x) #define VM_CNT_INC(var) VM_CNT_ADD(var, 1) #define VM_CNT_FETCH(var) counter_u64_fetch(vm_cnt.var) u_int vm_free_count(void); /* * Return TRUE if we are under our severe low-free-pages threshold * * This routine is typically used at the user<->system interface to determine * whether we need to block in order to avoid a low memory deadlock. */ static inline int vm_page_count_severe(void) { - /* XXX */ - return (vm_cnt.v_free_severe > vm_free_count()); + return (!DOMAINSET_EMPTY(&vm_severe_domains)); } /* * Return TRUE if we are under our minimum low-free-pages threshold. * * This routine is typically used within the system to determine whether * we can execute potentially very expensive code in terms of memory. It * is also used by the pageout daemon to calculate when to sleep, when * to wake waiters up, and when (after making a pass) to become more * desperate. */ static inline int vm_page_count_min(void) { - /* XXX */ - return (vm_cnt.v_free_min > vm_free_count()); + return (!DOMAINSET_EMPTY(&vm_min_domains)); } #endif /* _KERNEL */ #endif /* _SYS_VMMETER_H_ */ Index: user/jeff/numa/sys/vm/vm_glue.c =================================================================== --- user/jeff/numa/sys/vm/vm_glue.c (revision 328171) +++ user/jeff/numa/sys/vm/vm_glue.c (revision 328172) @@ -1,590 +1,590 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_kstack_pages.h" #include "opt_kstack_max_pages.h" #include "opt_kstack_usage_prof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. In most cases * just checking the vm_map_entry is sufficient within the kernel's address * space. */ int kernacc(void *addr, int len, int rw) { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); if ((vm_offset_t)addr + len > kernel_map->max_offset || (vm_offset_t)addr + len < (vm_offset_t)addr) return (FALSE); prot = rw; saddr = trunc_page((vm_offset_t)addr); eaddr = round_page((vm_offset_t)addr + len); vm_map_lock_read(kernel_map); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return (rv == TRUE); } /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. vmapbuf(), * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be * used in conjunction with this call. */ int useracc(void *addr, int len, int rw) { boolean_t rv; vm_prot_t prot; vm_map_t map; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to useracc (%x)\n", rw)); prot = rw; map = &curproc->p_vmspace->vm_map; if ((vm_offset_t)addr + len > vm_map_max(map) || (vm_offset_t)addr + len < (vm_offset_t)addr) { return (FALSE); } vm_map_lock_read(map); rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), prot); vm_map_unlock_read(map); return (rv == TRUE); } int vslock(void *addr, size_t len) { vm_offset_t end, last, start; vm_size_t npages; int error; last = (vm_offset_t)addr + len; start = trunc_page((vm_offset_t)addr); end = round_page(last); if (last < (vm_offset_t)addr || end < (vm_offset_t)addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); #if 0 /* * XXX - not yet * * The limit for transient usage of wired pages should be * larger than for "permanent" wired pages (mlock()). * * Also, the sysctl code, which is the only present user * of vslock(), does a hard loop on EAGAIN. */ if (npages + vm_cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #endif error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. */ return (error == KERN_SUCCESS ? 0 : EFAULT); } void vsunlock(void *addr, size_t len) { /* Rely on the parameter sanity checks performed by vslock(). */ (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); } /* * Pin the page contained within the given object at the given offset. If the * page is not resident, allocate and load it using the given object's pager. * Return the pinned page if successful; otherwise, return NULL. */ static vm_page_t vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_WLOCK(object); pindex = OFF_TO_IDX(offset); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (m->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(m); rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); m = NULL; goto out; } vm_page_xunbusy(m); } vm_page_lock(m); vm_page_hold(m); vm_page_activate(m); vm_page_unlock(m); out: VM_OBJECT_WUNLOCK(object); return (m); } /* * Return a CPU private mapping to the page at the given offset within the * given object. The page is pinned before it is mapped. */ struct sf_buf * vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m; m = vm_imgact_hold_page(object, offset); if (m == NULL) return (NULL); sched_pin(); return (sf_buf_alloc(m, SFB_CPUPRIVATE)); } /* * Destroy the given CPU private mapping and unpin the page that it mapped. */ void vm_imgact_unmap_page(struct sf_buf *sf) { vm_page_t m; m = sf_buf_page(sf); sf_buf_free(sf); sched_unpin(); vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } void vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz) { pmap_sync_icache(map->pmap, va, sz); } struct kstack_cache_entry *kstack_cache; static int kstack_cache_size = 128; static int kstacks; static struct mtx kstack_cache_mtx; MTX_SYSINIT(kstack_cache, &kstack_cache_mtx, "kstkch", MTX_DEF); SYSCTL_INT(_vm, OID_AUTO, kstack_cache_size, CTLFLAG_RW, &kstack_cache_size, 0, ""); SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0, ""); /* * Create the kernel stack (including pcb for i386) for a new thread. * This routine directly affects the fork perf for a process and * create performance for a thread. */ int vm_thread_new(struct thread *td, int pages) { vm_object_t ksobj; vm_offset_t ks; vm_page_t ma[KSTACK_MAX_PAGES]; struct kstack_cache_entry *ks_ce; int i; /* Bounds check */ if (pages <= 1) pages = kstack_pages; else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; if (pages == kstack_pages) { mtx_lock(&kstack_cache_mtx); if (kstack_cache != NULL) { ks_ce = kstack_cache; kstack_cache = ks_ce->next_ks_entry; mtx_unlock(&kstack_cache_mtx); td->td_kstack_obj = ks_ce->ksobj; td->td_kstack = (vm_offset_t)ks_ce; td->td_kstack_pages = kstack_pages; return (1); } mtx_unlock(&kstack_cache_mtx); } /* * Allocate an object for the kstack. */ ksobj = vm_object_allocate(OBJT_DEFAULT, pages); /* * Get a kernel virtual address for this thread's kstack. */ #if defined(__mips__) /* * We need to align the kstack's mapped address to fit within * a single TLB entry. */ if (vmem_xalloc(kernel_arena, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE, PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_BESTFIT | M_NOWAIT, &ks)) { ks = 0; } #else ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); #endif if (ks == 0) { printf("vm_thread_new: kstack allocation failed\n"); vm_object_deallocate(ksobj); return (0); } atomic_add_int(&kstacks, 1); if (KSTACK_GUARD_PAGES != 0) { pmap_qremove(ks, KSTACK_GUARD_PAGES); ks += KSTACK_GUARD_PAGES * PAGE_SIZE; } td->td_kstack_obj = ksobj; td->td_kstack = ks; /* * Knowing the number of pages allocated is useful when you * want to deallocate them. */ td->td_kstack_pages = pages; /* * For the length of the stack, link in a real page of ram for each * page of stack. */ VM_OBJECT_WLOCK(ksobj); (void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, ma, pages); for (i = 0; i < pages; i++) ma[i]->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(ksobj); pmap_qenter(ks, ma, pages); return (1); } static void vm_thread_stack_dispose(vm_object_t ksobj, vm_offset_t ks, int pages) { vm_page_t m; int i; atomic_add_int(&kstacks, -1); pmap_qremove(ks, pages); VM_OBJECT_WLOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_dispose: kstack already missing?"); vm_page_lock(m); vm_page_unwire(m, PQ_NONE); vm_page_free(m); vm_page_unlock(m); } VM_OBJECT_WUNLOCK(ksobj); vm_object_deallocate(ksobj); kva_free(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE), (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); } /* * Dispose of a thread's kernel stack. */ void vm_thread_dispose(struct thread *td) { vm_object_t ksobj; vm_offset_t ks; struct kstack_cache_entry *ks_ce; int pages; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; ks = td->td_kstack; td->td_kstack = 0; td->td_kstack_pages = 0; if (pages == kstack_pages && kstacks <= kstack_cache_size) { ks_ce = (struct kstack_cache_entry *)ks; ks_ce->ksobj = ksobj; mtx_lock(&kstack_cache_mtx); ks_ce->next_ks_entry = kstack_cache; kstack_cache = ks_ce; mtx_unlock(&kstack_cache_mtx); return; } vm_thread_stack_dispose(ksobj, ks, pages); } static void vm_thread_stack_lowmem(void *nulll) { struct kstack_cache_entry *ks_ce, *ks_ce1; mtx_lock(&kstack_cache_mtx); ks_ce = kstack_cache; kstack_cache = NULL; mtx_unlock(&kstack_cache_mtx); while (ks_ce != NULL) { ks_ce1 = ks_ce; ks_ce = ks_ce->next_ks_entry; vm_thread_stack_dispose(ks_ce1->ksobj, (vm_offset_t)ks_ce1, kstack_pages); } } static void kstack_cache_init(void *nulll) { EVENTHANDLER_REGISTER(vm_lowmem, vm_thread_stack_lowmem, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(vm_kstacks, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, kstack_cache_init, NULL); #ifdef KSTACK_USAGE_PROF /* * Track maximum stack used by a thread in kernel. */ static int max_kstack_used; SYSCTL_INT(_debug, OID_AUTO, max_kstack_used, CTLFLAG_RD, &max_kstack_used, 0, "Maxiumum stack depth used by a thread in kernel"); void intr_prof_stack_use(struct thread *td, struct trapframe *frame) { vm_offset_t stack_top; vm_offset_t current; int used, prev_used; /* * Testing for interrupted kernel mode isn't strictly * needed. It optimizes the execution, since interrupts from * usermode will have only the trap frame on the stack. */ if (TRAPF_USERMODE(frame)) return; stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE; current = (vm_offset_t)(uintptr_t)&stack_top; /* * Try to detect if interrupt is using kernel thread stack. * Hardware could use a dedicated stack for interrupt handling. */ if (stack_top <= current || current < td->td_kstack) return; used = stack_top - current; for (;;) { prev_used = max_kstack_used; if (prev_used >= used) break; if (atomic_cmpset_int(&max_kstack_used, prev_used, used)) break; } } #endif /* KSTACK_USAGE_PROF */ /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. The new process is set up so that it returns directly * to user mode to avoid stack copying and relocation problems. */ int vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2, struct vmspace *vm2, int flags) { struct proc *p1 = td->td_proc; int error; if ((flags & RFPROC) == 0) { /* * Divorce the memory, if it is shared, essentially * this changes shared memory amongst threads, into * COW locally. */ if ((flags & RFMEM) == 0) { if (p1->p_vmspace->vm_refcnt > 1) { error = vmspace_unshare(p1); if (error) return (error); } } cpu_fork(td, p2, td2, flags); return (0); } if (flags & RFMEM) { p2->p_vmspace = p1->p_vmspace; atomic_add_int(&p1->p_vmspace->vm_refcnt, 1); } while (vm_page_count_severe()) { - VM_WAIT; + vm_wait_severe(); } if ((flags & RFMEM) == 0) { p2->p_vmspace = vm2; if (p1->p_vmspace->vm_shm) shmfork(p1, p2); } /* * cpu_fork will copy and update the pcb, set up the kernel stack, * and make the child ready to run. */ cpu_fork(td, p2, td2, flags); return (0); } /* * Called after process has been wait(2)'ed upon and is being reaped. * The idea is to reclaim resources that we could not reclaim while * the process was still executing. */ void vm_waitproc(p) struct proc *p; { vmspace_exitfree(p); /* and clean-out the vmspace */ } void kick_proc0(void) { wakeup(&proc0); } Index: user/jeff/numa/sys/vm/vm_kern.c =================================================================== --- user/jeff/numa/sys/vm/vm_kern.c (revision 328171) +++ user/jeff/numa/sys/vm/vm_kern.c (revision 328172) @@ -1,688 +1,688 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Kernel memory management. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include /* for ticks and hz */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include vm_map_t kernel_map; vm_map_t exec_map; vm_map_t pipe_map; const void *zero_region; CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0); /* NB: Used by kernel debuggers. */ const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS; u_int exec_map_entry_size; u_int exec_map_entries; SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD, SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address"); SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD, #if defined(__arm__) || defined(__sparc64__) &vm_max_kernel_address, 0, #else SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS, #endif "Max kernel address"); /* * kva_alloc: * * Allocate a virtual address range with no underlying object and * no initial mapping to physical memory. Any mapping from this * range to physical memory must be explicitly created prior to * its use, typically with pmap_qenter(). Any attempt to create * a mapping on demand through vm_fault() will result in a panic. */ vm_offset_t kva_alloc(vm_size_t size) { vm_offset_t addr; size = round_page(size); if (vmem_alloc(kernel_arena, size, M_BESTFIT | M_NOWAIT, &addr)) return (0); return (addr); } /* * kva_free: * * Release a region of kernel virtual memory allocated * with kva_alloc, and return the physical pages * associated with that region. * * This routine may not block on kernel maps. */ void kva_free(vm_offset_t addr, vm_size_t size) { size = round_page(size); vmem_free(kernel_arena, addr, size); } /* * Allocates a region from the kernel address map and physical pages * within the specified address range to the kernel object. Creates a * wired mapping from this region to these pages, and returns the * region's starting virtual address. The allocated pages are not * necessarily physically contiguous. If M_ZERO is specified through the * given flags, then the pages are zeroed before they are mapped. */ vm_offset_t kmem_alloc_attr_domain(int domain, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr) { vmem_t *vmem; vm_object_t object = kernel_object; vm_offset_t addr, i, offset; vm_page_t m; int pflags, tries; size = round_page(size); vmem = vm_dom[domain].vmd_kernel_arena; if (vmem_alloc(vmem, size, M_BESTFIT | flags, &addr)) return (0); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); pflags |= VM_ALLOC_NOWAIT; VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += PAGE_SIZE) { tries = 0; retry: m = vm_page_alloc_contig_domain(object, atop(offset + i), domain, pflags, 1, low, high, PAGE_SIZE, 0, memattr); if (m == NULL) { VM_OBJECT_WUNLOCK(object); if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) { if (!vm_page_reclaim_contig_domain(domain, pflags, 1, low, high, PAGE_SIZE, 0) && (flags & M_WAITOK) != 0) - VM_WAIT; + vm_wait_domain(domain); VM_OBJECT_WLOCK(object); tries++; goto retry; } kmem_unback(object, addr, i); vmem_free(vmem, addr, size); return (0); } KASSERT(vm_phys_domain(m) == domain, ("kmem_alloc_attr_domain: Domain mismatch %d != %d", vm_phys_domain(m), domain)); if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, VM_PROT_ALL | PMAP_ENTER_WIRED, 0); } VM_OBJECT_WUNLOCK(object); return (addr); } vm_offset_t kmem_alloc_attr(vmem_t *vmem, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_offset_t addr; int domain; KASSERT(vmem == kernel_arena, ("kmem_alloc_attr: Only kernel_arena is supported.")); vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags); do { addr = kmem_alloc_attr_domain(domain, size, flags, low, high, memattr); if (addr != 0) break; } while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0); return (addr); } /* * Allocates a region from the kernel address map and physically * contiguous pages within the specified address range to the kernel * object. Creates a wired mapping from this region to these pages, and * returns the region's starting virtual address. If M_ZERO is specified * through the given flags, then the pages are zeroed before they are * mapped. */ vm_offset_t kmem_alloc_contig_domain(int domain, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vmem_t *vmem; vm_object_t object = kernel_object; vm_offset_t addr, offset, tmp; vm_page_t end_m, m; u_long npages; int pflags, tries; size = round_page(size); vmem = vm_dom[domain].vmd_kernel_arena; if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr)) return (0); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); pflags |= VM_ALLOC_NOWAIT; npages = atop(size); VM_OBJECT_WLOCK(object); tries = 0; retry: m = vm_page_alloc_contig_domain(object, atop(offset), domain, pflags, npages, low, high, alignment, boundary, memattr); if (m == NULL) { VM_OBJECT_WUNLOCK(object); if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) { if (!vm_page_reclaim_contig_domain(domain, pflags, npages, low, high, alignment, boundary) && (flags & M_WAITOK) != 0) - VM_WAIT; + vm_wait_domain(domain); VM_OBJECT_WLOCK(object); tries++; goto retry; } vmem_free(vmem, addr, size); return (0); } KASSERT(vm_phys_domain(m) == domain, ("kmem_alloc_contig_domain: Domain mismatch %d != %d", vm_phys_domain(m), domain)); end_m = m + npages; tmp = addr; for (; m < end_m; m++) { if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; pmap_enter(kernel_pmap, tmp, m, VM_PROT_ALL, VM_PROT_ALL | PMAP_ENTER_WIRED, 0); tmp += PAGE_SIZE; } VM_OBJECT_WUNLOCK(object); return (addr); } vm_offset_t kmem_alloc_contig(struct vmem *vmem, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_offset_t addr; int domain; KASSERT(vmem == kernel_arena, ("kmem_alloc_contig: Only kernel_arena is supported.")); vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags); do { addr = kmem_alloc_contig_domain(domain, size, flags, low, high, alignment, boundary, memattr); if (addr != 0) break; } while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0); return (addr); } /* * kmem_suballoc: * * Allocates a map to manage a subrange * of the kernel virtual address space. * * Arguments are as follows: * * parent Map to take range from * min, max Returned endpoints of map * size Size of range to find * superpage_align Request that min is superpage aligned */ vm_map_t kmem_suballoc(vm_map_t parent, vm_offset_t *min, vm_offset_t *max, vm_size_t size, boolean_t superpage_align) { int ret; vm_map_t result; size = round_page(size); *min = vm_map_min(parent); ret = vm_map_find(parent, NULL, 0, min, size, 0, superpage_align ? VMFS_SUPER_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_NO_CHARGE); if (ret != KERN_SUCCESS) panic("kmem_suballoc: bad status return of %d", ret); *max = *min + size; result = vm_map_create(vm_map_pmap(parent), *min, *max); if (result == NULL) panic("kmem_suballoc: cannot create submap"); if (vm_map_submap(parent, *min, *max, result) != KERN_SUCCESS) panic("kmem_suballoc: unable to change range to submap"); return (result); } /* * kmem_malloc: * * Allocate wired-down pages in the kernel's address space. */ vm_offset_t kmem_malloc_domain(int domain, vm_size_t size, int flags) { vmem_t *vmem; vm_offset_t addr; int rv; vmem = vm_dom[domain].vmd_kernel_arena; size = round_page(size); if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr)) return (0); rv = kmem_back_domain(domain, kernel_object, addr, size, flags); if (rv != KERN_SUCCESS) { vmem_free(vmem, addr, size); return (0); } return (addr); } vm_offset_t kmem_malloc(struct vmem *vmem, vm_size_t size, int flags) { struct vm_domainset_iter di; vm_offset_t addr; int domain; KASSERT(vmem == kernel_arena, ("kmem_malloc: Only kernel_arena is supported.")); vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags); do { addr = kmem_malloc_domain(domain, size, flags); if (addr != 0) break; } while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0); return (addr); } /* * kmem_back: * * Allocate physical pages for the specified virtual address range. */ int kmem_back_domain(int domain, vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) { vm_offset_t offset, i; vm_page_t m, mpred; int pflags; KASSERT(object == kernel_object, ("kmem_back_domain: only supports kernel object.")); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); if (flags & M_WAITOK) pflags |= VM_ALLOC_WAITFAIL; i = 0; VM_OBJECT_WLOCK(object); retry: mpred = vm_radix_lookup_le(&object->rtree, atop(offset + i)); for (; i < size; i += PAGE_SIZE, mpred = m) { m = vm_page_alloc_domain_after(object, atop(offset + i), domain, pflags, mpred); /* * Ran out of space, free everything up and return. Don't need * to lock page queues here as we know that the pages we got * aren't on any queues. */ if (m == NULL) { if ((flags & M_NOWAIT) == 0) goto retry; VM_OBJECT_WUNLOCK(object); kmem_unback(object, addr, i); return (KERN_NO_SPACE); } KASSERT(vm_phys_domain(m) == domain, ("kmem_back_domain: Domain mismatch %d != %d", vm_phys_domain(m), domain)); if (flags & M_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("kmem_malloc: page %p is managed", m)); m->valid = VM_PAGE_BITS_ALL; pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, VM_PROT_ALL | PMAP_ENTER_WIRED, 0); } VM_OBJECT_WUNLOCK(object); return (KERN_SUCCESS); } int kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) { struct vm_domainset_iter di; int domain; int ret; KASSERT(object == kernel_object, ("kmem_back: only supports kernel object.")); vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags); do { ret = kmem_back_domain(domain, object, addr, size, flags); if (ret == KERN_SUCCESS) break; } while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0); return (ret); } /* * kmem_unback: * * Unmap and free the physical pages underlying the specified virtual * address range. * * A physical page must exist within the specified object at each index * that is being unmapped. */ static int _kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size) { vm_page_t m, next; vm_offset_t end, offset; int domain; KASSERT(object == kernel_object, ("kmem_unback: only supports kernel object.")); if (size == 0) return (0); pmap_remove(kernel_pmap, addr, addr + size); offset = addr - VM_MIN_KERNEL_ADDRESS; end = offset + size; VM_OBJECT_WLOCK(object); m = vm_page_lookup(object, atop(offset)); domain = vm_phys_domain(m); for (; offset < end; offset += PAGE_SIZE, m = next) { next = vm_page_next(m); vm_page_unwire(m, PQ_NONE); vm_page_free(m); } VM_OBJECT_WUNLOCK(object); return (domain); } void kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size) { _kmem_unback(object, addr, size); } /* * kmem_free: * * Free memory allocated with kmem_malloc. The size must match the * original allocation. */ void kmem_free(struct vmem *vmem, vm_offset_t addr, vm_size_t size) { int domain; KASSERT(vmem == kernel_arena, ("kmem_free: Only kernel_arena is supported.")); size = round_page(size); domain = _kmem_unback(kernel_object, addr, size); vmem_free(vm_dom[domain].vmd_kernel_arena, addr, size); } /* * kmap_alloc_wait: * * Allocates pageable memory from a sub-map of the kernel. If the submap * has no room, the caller sleeps waiting for more memory in the submap. * * This routine may block. */ vm_offset_t kmap_alloc_wait(vm_map_t map, vm_size_t size) { vm_offset_t addr; size = round_page(size); if (!swap_reserve(size)) return (0); for (;;) { /* * To make this work for more than one map, use the map's lock * to lock out sleepers/wakers. */ vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, &addr) == 0) break; /* no space now; see if we can ever get space */ if (vm_map_max(map) - vm_map_min(map) < size) { vm_map_unlock(map); swap_release(size); return (0); } map->needs_wakeup = TRUE; vm_map_unlock_and_wait(map, 0); } vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_CHARGED); vm_map_unlock(map); return (addr); } /* * kmap_free_wakeup: * * Returns memory to a submap of the kernel, and wakes up any processes * waiting for memory in that map. */ void kmap_free_wakeup(vm_map_t map, vm_offset_t addr, vm_size_t size) { vm_map_lock(map); (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size)); if (map->needs_wakeup) { map->needs_wakeup = FALSE; vm_map_wakeup(map); } vm_map_unlock(map); } void kmem_init_zero_region(void) { vm_offset_t addr, i; vm_page_t m; /* * Map a single physical page of zeros to a larger virtual range. * This requires less looping in places that want large amounts of * zeros, while not using much more physical resources. */ addr = kva_alloc(ZERO_REGION_SIZE); m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); for (i = 0; i < ZERO_REGION_SIZE; i += PAGE_SIZE) pmap_qenter(addr + i, &m, 1); pmap_protect(kernel_pmap, addr, addr + ZERO_REGION_SIZE, VM_PROT_READ); zero_region = (const void *)addr; } /* * kmem_init: * * Create the kernel map; insert a mapping covering kernel text, * data, bss, and all space allocated thus far (`boostrap' data). The * new map will thus map the range between VM_MIN_KERNEL_ADDRESS and * `start' as allocated, and the range between `start' and `end' as free. */ void kmem_init(vm_offset_t start, vm_offset_t end) { vm_map_t m; m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end); m->system_map = 1; vm_map_lock(m); /* N.B.: cannot use kgdb to debug, starting with this assignment ... */ kernel_map = m; (void) vm_map_insert(m, NULL, (vm_ooffset_t) 0, #ifdef __amd64__ KERNBASE, #else VM_MIN_KERNEL_ADDRESS, #endif start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); } #ifdef DIAGNOSTIC /* * Allow userspace to directly trigger the VM drain routine for testing * purposes. */ static int debug_vm_lowmem(SYSCTL_HANDLER_ARGS) { int error, i; i = 0; error = sysctl_handle_int(oidp, &i, 0, req); if (error) return (error); if ((i & ~(VM_LOW_KMEM | VM_LOW_PAGES)) != 0) return (EINVAL); if (i != 0) EVENTHANDLER_INVOKE(vm_lowmem, i); return (0); } SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_RW, 0, 0, debug_vm_lowmem, "I", "set to trigger vm_lowmem event with given flags"); #endif Index: user/jeff/numa/sys/vm/vm_page.c =================================================================== --- user/jeff/numa/sys/vm/vm_page.c (revision 328171) +++ user/jeff/numa/sys/vm/vm_page.c (revision 328172) @@ -1,4042 +1,4127 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * GENERAL RULES ON VM_PAGE MANIPULATION * * - A page queue lock is required when adding or removing a page from a * page queue regardless of other locks or the busy state of a page. * * * In general, no thread besides the page daemon can acquire or * hold more than one page queue lock at a time. * * * The page daemon can acquire and hold any pair of page queue * locks in any order. * * - The object lock is required when inserting or removing * pages from an object (vm_page_insert() or vm_page_remove()). * */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Associated with page of user-allocatable memory is a * page structure. */ struct vm_domain vm_dom[MAXMEMDOM]; struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT]; +struct mtx_padalign __exclusive_cache_line vm_domainset_lock; +domainset_t __exclusive_cache_line vm_min_domains; +domainset_t __exclusive_cache_line vm_severe_domains; + /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions. */ vm_page_t bogus_page; vm_page_t vm_page_array; long vm_page_array_size; long first_page; static int boot_pages = UMA_BOOT_PAGES; SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &boot_pages, 0, "number of pages allocated for bootstrapping the VM system"); static int pa_tryrelock_restart; SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); static TAILQ_HEAD(, vm_page) blacklist_head; static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); static uma_zone_t fakepg_zone; static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); static void vm_page_enqueue(uint8_t queue, vm_page_t m); static void vm_page_free_phys(vm_page_t m); static void vm_page_free_wakeup(int domain); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high); static int vm_page_alloc_fail(vm_object_t object, int domain, int req); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); static void vm_page_init(void *dummy) { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * Try to acquire a physical address lock while a pmap is locked. If we * fail to trylock we unlock and lock the pmap directly and cache the * locked pa in *locked. The caller should then restart their loop in case * the virtual to physical mapping has changed. */ int vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) { vm_paddr_t lockpa; lockpa = *locked; *locked = pa; if (lockpa) { PA_LOCK_ASSERT(lockpa, MA_OWNED); if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) return (0); PA_UNLOCK(lockpa); } if (PA_TRYLOCK(pa)) return (0); PMAP_UNLOCK(pmap); atomic_add_int(&pa_tryrelock_restart, 1); PA_LOCK(pa); PMAP_LOCK(pmap); return (EAGAIN); } /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_next: * * Find the next entry in the provided string of blacklist * addresses. Entries are separated by space, comma, or newline. * If an invalid integer is encountered then the rest of the * string is skipped. Updates the list pointer to the next * character, or NULL if the string is exhausted or invalid. */ static vm_paddr_t vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; if (list == NULL || *list == NULL) return (0); if (**list =='\0') { *list = NULL; return (0); } /* * If there's no end pointer then the buffer is coming from * the kenv and we know it's null-terminated. */ if (end == NULL) end = *list + strlen(*list); /* Ensure that strtoq() won't walk off the end */ if (*end != '\0') { if (*end == '\n' || *end == ' ' || *end == ',') *end = '\0'; else { printf("Blacklist not terminated, skipping\n"); *list = NULL; return (0); } } for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { if (bad == 0) { if (++cp < end) continue; else break; } } else break; if (*cp == '\0' || ++cp >= end) *list = NULL; else *list = cp; return (trunc_page(bad)); } printf("Garbage in RAM blacklist, skipping\n"); *list = NULL; return (0); } /* * vm_page_blacklist_check: * * Iterate through the provided string of blacklist addresses, pulling * each entry out of the physical allocator free list and putting it * onto a list for reporting via the vm.page_blacklist sysctl. */ static void vm_page_blacklist_check(char *list, char *end) { vm_paddr_t pa; vm_page_t m; char *next; int ret, domain; next = list; while (next != NULL) { if ((pa = vm_page_blacklist_next(&next, end)) == 0) continue; m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) continue; domain = vm_phys_domain(m); vm_pagequeue_free_lock(domain); ret = vm_phys_unfree_page(m); vm_pagequeue_free_unlock(domain); if (ret == TRUE) { TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (bootverbose) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); } } } /* * vm_page_blacklist_load: * * Search for a special module named "ram_blacklist". It'll be a * plain text file provided by the user via the loader directive * of the same name. */ static void vm_page_blacklist_load(char **list, char **end) { void *mod; u_char *ptr; u_int len; mod = NULL; ptr = NULL; mod = preload_search_by_type("ram_blacklist"); if (mod != NULL) { ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); } *list = ptr; if (ptr != NULL) *end = ptr + len; else *end = NULL; return; } static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) { vm_page_t m; struct sbuf sbuf; int error, first; first = 1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); TAILQ_FOREACH(m, &blacklist_head, listq) { sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", (uintmax_t)m->phys_addr); first = 0; } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static void vm_page_domain_init(struct vm_domain *vmd) { struct vm_pagequeue *pq; int i; bzero(vmd, sizeof(*vmd)); *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = "vm laundry pagequeue"; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = "vm unswappable pagequeue"; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); } mtx_init(&vmd->vmd_pagequeue_free_mtx, "vm page free queue", NULL, MTX_DEF); } /* * Initialize a physical page in preparation for adding it to the free * lists. */ static void vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) { m->object = NULL; m->wire_count = 0; m->busy_lock = VPB_UNBUSIED; m->hold_count = 0; m->flags = 0; m->phys_addr = pa; m->queue = PQ_NONE; m->psind = 0; m->segind = segind; m->order = VM_NFREEORDER; m->pool = VM_FREEPOOL_DEFAULT; m->valid = m->dirty = 0; pmap_page_init(m); } /* * vm_page_startup: * * Initializes the resident memory module. Allocates physical memory for * bootstrapping UMA and some data structures that are used to manage * physical pages. Initializes these structures, and populates the free * page queues. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { struct vm_domain *vmd; struct vm_phys_seg *seg; vm_page_t m; char *list, *listend; vm_offset_t mapped; vm_paddr_t end, high_avail, low_avail, new_end, page_range, size; vm_paddr_t biggestsize, last_pa, pa; u_long pagecount; int biggestone, i, pages_per_zone, segind; biggestsize = 0; biggestone = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } for (i = 0; phys_avail[i + 1]; i += 2) { size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } } end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ + mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(VM_DOMAIN(i)); /* * Almost all of the pages needed for bootstrapping UMA are used * for zone structures, so if the number of CPUs results in those * structures taking more than one page each, we set aside more pages * in proportion to the zone structure size. */ pages_per_zone = howmany(sizeof(struct uma_zone) + sizeof(struct uma_cache) * (mp_maxid + 1) + roundup2(sizeof(struct uma_slab), sizeof(void *)), UMA_SLAB_SIZE); if (pages_per_zone > 1) { /* Reserve more pages so that we don't run out. */ boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone; } /* * Allocate memory for use when boot strapping the kernel memory * allocator. * * CTFLAG_RDTUN doesn't work during the early boot process, so we must * manually fetch the value. */ TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages); new_end = end - (boot_pages * UMA_SLAB_SIZE); new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, end - new_end); uma_startup((void *)mapped, boot_pages); #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ defined(__i386__) || defined(__mips__) /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ last_pa = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; page_range = last_pa / PAGE_SIZE; vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #else (void)last_pa; #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) /* * Include the UMA bootstrap pages and vm_page_dump in a crash dump. * When pmap_map() uses the direct map, they are not automatically * included. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use, taking into account the overhead of a page structure per page. * In other words, solve * "available physical memory" - round_page(page_range * * sizeof(struct vm_page)) = page_range * PAGE_SIZE * for page_range. */ low_avail = phys_avail[0]; high_avail = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].start < low_avail) low_avail = vm_phys_segs[i].start; if (vm_phys_segs[i].end > high_avail) high_avail = vm_phys_segs[i].end; } /* Skip the first chunk. It is already accounted for. */ for (i = 2; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i] < low_avail) low_avail = phys_avail[i]; if (phys_avail[i + 1] > high_avail) high_avail = phys_avail[i + 1]; } first_page = low_avail / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE size = 0; for (i = 0; i < vm_phys_nsegs; i++) size += vm_phys_segs[i].end - vm_phys_segs[i].start; for (i = 0; phys_avail[i + 1] != 0; i += 2) size += phys_avail[i + 1] - phys_avail[i]; #elif defined(VM_PHYSSEG_DENSE) size = high_avail - low_avail; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif #ifdef VM_PHYSSEG_DENSE /* * In the VM_PHYSSEG_DENSE case, the number of pages can account for * the overhead of a page structure per page only if vm_page_array is * allocated from the last physical memory chunk. Otherwise, we must * allocate page structures representing the physical memory * underlying vm_page_array, even though they will not be used. */ if (new_end != high_avail) page_range = size / PAGE_SIZE; else #endif { page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); /* * If the partial bytes remaining are large enough for * a page (PAGE_SIZE) without a corresponding * 'struct vm_page', then new_end will contain an * extra page after subtracting the length of the VM * page array. Compensate by subtracting an extra * page from new_end. */ if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) { if (new_end == high_avail) high_avail -= PAGE_SIZE; new_end -= PAGE_SIZE; } } end = new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. * However, because this page is allocated from KVM, out-of-bounds * accesses using the direct map will not be trapped. */ vaddr += PAGE_SIZE; /* * Allocate physical memory for the page structures, and map it. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array = (vm_page_t)mapped; vm_page_array_size = page_range; #if VM_NRESERVLEVEL > 0 /* * Allocate physical memory for the reservation management system's * data structures, and map it. */ if (high_avail == end) high_avail = new_end; new_end = vm_reserv_startup(&vaddr, new_end, high_avail); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) /* * Include vm_page_array and vm_reserv_array in a crash dump. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Add physical memory segments corresponding to the available * physical pages. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Initialize the page structures and add every available page to the * physical memory allocator's free lists. */ vm_cnt.v_page_count = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; for (m = seg->first_page, pa = seg->start; pa < seg->end; m++, pa += PAGE_SIZE) vm_page_init_page(m, pa, segind); /* * Add the segment to the free lists only if it is covered by * one of the ranges in phys_avail. Because we've added the * ranges to the vm_phys_segs array, we can assume that each * segment is either entirely contained in one of the ranges, * or doesn't overlap any of them. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (seg->start < phys_avail[i] || seg->end > phys_avail[i + 1]) continue; m = seg->first_page; pagecount = (u_long)atop(seg->end - seg->start); vm_pagequeue_free_lock(seg->domain); vm_phys_free_contig(m, pagecount); vm_pagequeue_freecnt_adj(seg->domain, (int)pagecount); vm_pagequeue_free_unlock(seg->domain); vm_cnt.v_page_count += (u_int)pagecount; vmd = VM_DOMAIN(seg->domain);; vmd->vmd_page_count += (u_int)pagecount; vmd->vmd_segs |= 1UL << m->segind; break; } } /* * Remove blacklisted pages from the physical memory allocator. */ TAILQ_INIT(&blacklist_head); vm_page_blacklist_load(&list, &listend); vm_page_blacklist_check(list, listend); list = kern_getenv("vm.blacklist"); vm_page_blacklist_check(list, NULL); freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif /* * Set an initial domain policy for thread0 so that allocations * can work. */ domainset_zero(); return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; bool locked; vm_page_assert_xbusied(m); locked = mtx_owned(vm_page_lockptr(m)); for (;;) { x = m->busy_lock; x &= VPB_BIT_WAITERS; if (x != 0 && !locked) vm_page_lock(m); if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1))) break; if (x != 0 && !locked) vm_page_unlock(m); } if (x != 0) { wakeup(m); if (!locked) vm_page_unlock(m); } } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = m->busy_lock; return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. */ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_lock_assert(m, MA_NOTOWNED); vm_page_assert_sbusied(m); for (;;) { x = m->busy_lock; if (VPB_SHARERS(x) > 1) { if (atomic_cmpset_int(&m->busy_lock, x, x - VPB_ONE_SHARER)) break; continue; } if ((x & VPB_BIT_WAITERS) == 0) { KASSERT(x == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (atomic_cmpset_int(&m->busy_lock, VPB_SHARERS_WORD(1), VPB_UNBUSIED)) break; continue; } KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS), ("vm_page_sunbusy: invalid lock state for waiters")); vm_page_lock(m); if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) { vm_page_unlock(m); continue; } wakeup(m); vm_page_unlock(m); break; } } /* * vm_page_busy_sleep: * * Sleep and release the page lock, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * The given page must be locked. * * If nonshared is true, sleep only if the page is xbusy. */ void vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared) { u_int x; vm_page_assert_locked(m); x = m->busy_lock; if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) || ((x & VPB_BIT_WAITERS) == 0 && !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) { vm_page_unlock(m); return; } msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { u_int x; for (;;) { x = m->busy_lock; if ((x & VPB_BIT_SHARED) == 0) return (0); if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER)) return (1); } } static void vm_page_xunbusy_locked(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_assert_locked(m); atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); /* There is a waiter, do wakeup() instead of vm_page_flash(). */ wakeup(m); } void vm_page_xunbusy_maybelocked(vm_page_t m) { bool lockacq; vm_page_assert_xbusied(m); /* * Fast path for unbusy. If it succeeds, we know that there * are no waiters, so we do not need a wakeup. */ if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) return; lockacq = !mtx_owned(vm_page_lockptr(m)); if (lockacq) vm_page_lock(m); vm_page_xunbusy_locked(m); if (lockacq) vm_page_unlock(m); } /* * vm_page_xunbusy_hard: * * Called after the first try the exclusive unbusy of a page failed. * It is assumed that the waiters bit is on. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_lock(m); vm_page_xunbusy_locked(m); vm_page_unlock(m); } /* * vm_page_flash: * * Wakeup anyone waiting for the page. * The ownership bits do not change. * * The given page must be locked. */ void vm_page_flash(vm_page_t m) { u_int x; vm_page_lock_assert(m, MA_OWNED); for (;;) { x = m->busy_lock; if ((x & VPB_BIT_WAITERS) == 0) return; if (atomic_cmpset_int(&m->busy_lock, x, x & (~VPB_BIT_WAITERS))) break; } wakeup(m); } /* * Avoid releasing and reacquiring the same page lock. */ void vm_page_change_lock(vm_page_t m, struct mtx **mtx) { struct mtx *mtx1; mtx1 = vm_page_lockptr(m); if (*mtx == mtx1) return; if (*mtx != NULL) mtx_unlock(*mtx); *mtx = mtx1; mtx_lock(mtx1); } /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). */ void vm_page_hold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); mem->hold_count++; } void vm_page_unhold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!")); --mem->hold_count; if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0) vm_page_free_toq(mem); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. */ void vm_page_unhold_pages(vm_page_t *ma, int count) { struct mtx *mtx; mtx = NULL; for (; count != 0; count--) { vm_page_change_lock(*ma, &mtx); vm_page_unhold(*ma); ma++; } if (mtx != NULL) mtx_unlock(mtx); } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. */ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; m->queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_SINGLE_EXCLUSIVER; m->wire_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from a getpages request that * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. */ KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the page queues lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_busy(vm_page_t m, const char *msg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (vm_page_busied(m)) { /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, msg, false); VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* Refer to this operation by its public name. */ KASSERT(m->valid == VM_PAGE_BITS_ALL, ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page */ m->object = object; m->pindex = pindex; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = NULL; m->pindex = 0; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. */ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's OBJ_MIGHTBEDIRTY flag. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * vm_page_remove: * * Removes the specified page from its containing object, but does not * invalidate any backing storage. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_remove(vm_page_t m) { vm_object_t object; vm_page_t mrem; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((object = m->object) == NULL) return; VM_OBJECT_ASSERT_WLOCKED(object); if (vm_page_xbusied(m)) vm_page_xunbusy_maybelocked(m); mrem = vm_radix_remove(&object->rtree, m->pindex); KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); m->object = NULL; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) next = NULL; } return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) prev = NULL; } return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * The existing page must not be on a paging queue. */ vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) { vm_page_t mold; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(mnew->object == NULL, ("vm_page_replace: page already in object")); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mnew->object = object; mnew->pindex = pindex; mold = vm_radix_replace(&object->rtree, mnew); KASSERT(mold->queue == PQ_NONE, ("vm_page_replace: mold is on a paging queue")); /* Keep the resident page list in sorted order. */ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; vm_page_xunbusy_maybelocked(mold); /* * The object's resident_page_count does not change because we have * swapped one page for another, but OBJ_MIGHTBEDIRTY. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); return (mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_lock(m); vm_page_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_unlock(m); vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { return (vm_page_alloc_after(object, pindex, req, object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) : NULL)); } vm_page_t vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req) { return (vm_page_alloc_domain_after(object, pindex, domain, req, object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) : NULL)); } /* * Allocate a page in the specified object with the given page index. To * optimize insertion of the page into the object, the caller must also specifiy * the resident page in the object with largest index smaller than the given * page index, or NULL if no such page exists. */ vm_page_t vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req, vm_page_t mpred) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, &domain, &req); do { m = vm_page_alloc_domain_after(object, pindex, domain, req, mpred); if (m != NULL) break; } while (vm_domainset_iter_page(&di, &domain, &req) == 0); return (m); } /* * Returns true if the number of free pages exceeds the minimum * for the request class and false otherwise. */ int vm_page_available(int domain, int req, int npages) { struct vm_domain *vmd; vm_pagequeue_free_assert_locked(domain); vmd = VM_DOMAIN(domain); req = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req != VM_ALLOC_INTERRUPT) req = VM_ALLOC_SYSTEM; if (vmd->vmd_free_count >= npages + vmd->vmd_free_reserved || (req == VM_ALLOC_SYSTEM && vmd->vmd_free_count >= npages + vmd->vmd_interrupt_free_min) || (req == VM_ALLOC_INTERRUPT && vmd->vmd_free_count >= npages)) return (1); return (0); } vm_page_t vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, int req, vm_page_t mpred) { vm_page_t m; int flags; u_int free_count; #if VM_NRESERVLEVEL > 0 int reserv; reserv = object != NULL && (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) == OBJ_COLORED; #endif KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("inconsistent object(%p)/req(%x)", object, req)); KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, ("Can't sleep and retry object insertion.")); KASSERT(mpred == NULL || mpred->pindex < pindex, ("mpred %p doesn't precede pindex 0x%jx", mpred, (uintmax_t)pindex)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); again: m = NULL; #if VM_NRESERVLEVEL > 0 if (reserv && (m = vm_reserv_extend(req, object, pindex, domain, mpred)) != NULL) { domain = vm_phys_domain(m); goto found; } #endif vm_pagequeue_free_lock(domain); if (vm_page_available(domain, req, 1)) { /* * Can we allocate the page from a reservation? */ #if VM_NRESERVLEVEL > 0 if (!reserv || (m = vm_reserv_alloc_page(object, pindex, domain, mpred)) == NULL) #endif { /* * If not, allocate it from the free page queues. */ m = vm_phys_alloc_pages(domain, object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); #if VM_NRESERVLEVEL > 0 if (m == NULL && vm_reserv_reclaim_inactive(domain)) { m = vm_phys_alloc_pages(domain, object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); } #endif } } if (m == NULL) { /* * Not allocatable, give up. */ if (vm_page_alloc_fail(object, domain, req)) goto again; return (NULL); } /* * At this point we had better have found a good page. */ KASSERT(m != NULL, ("missing page")); free_count = vm_pagequeue_freecnt_adj(domain, -1); vm_pagequeue_free_unlock(domain); /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ if (vm_paging_needed(VM_DOMAIN(domain), free_count)) pagedaemon_wakeup(domain); #if VM_NRESERVLEVEL > 0 found: #endif vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; flags &= m->flags; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; m->aflags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; m->busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); if (req & VM_ALLOC_WIRED) { /* * The page lock is not required for wiring a page until that * page is inserted into the object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } m->act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { pagedaemon_wakeup(domain); if (req & VM_ALLOC_WIRED) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); m->wire_count = 0; } KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } /* Ignore device objects; the pager sets "memattr" for them. */ if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); } else m->pindex = pindex; return (m); } /* * vm_page_alloc_contig: * * Allocate a contiguous set of physical pages of the given size "npages" * from the free lists. All of the physical pages must be at or above * the given physical address "low" and below the given physical address * "high". The given value "alignment" determines the alignment of the * first physical page in the set. If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The specified object may not contain fictitious pages. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, &domain, &req); do { m = vm_page_alloc_contig_domain(object, pindex, domain, req, npages, low, high, alignment, boundary, memattr); if (m != NULL) break; } while (vm_domainset_iter_page(&di, &domain, &req) == 0); return (m); } vm_page_t vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domain *vmd; vm_page_t m, m_ret, mpred; u_int busy_lock, flags, oflags; #if VM_NRESERVLEVEL > 0 int reserv; reserv = object != NULL && (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) == OBJ_COLORED; #endif mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, req)); KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, ("Can't sleep and retry object insertion.")); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_FICTITIOUS) == 0, ("vm_page_alloc_contig: object %p has fictitious pages", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc_contig: pindex already allocated")); } /* * Can we allocate the pages without the number of free pages falling * below the lower bound for the allocation class? */ again: #if VM_NRESERVLEVEL > 0 if (reserv && (m_ret = vm_reserv_extend_contig(req, object, pindex, domain, npages, low, high, alignment, boundary, mpred)) != NULL) { domain = vm_phys_domain(m_ret); goto found; } #endif m_ret = NULL; vm_pagequeue_free_lock(domain); if (vm_page_available(domain, req, npages)) { /* * Can we allocate the pages from a reservation? */ #if VM_NRESERVLEVEL > 0 retry: if (!reserv || (m_ret = vm_reserv_alloc_contig(object, pindex, domain, npages, low, high, alignment, boundary, mpred)) == NULL) #endif /* * If not, allocate them from the free page queues. */ m_ret = vm_phys_alloc_contig(domain, npages, low, high, alignment, boundary); #if VM_NRESERVLEVEL > 0 if (m_ret == NULL && vm_reserv_reclaim_contig( domain, npages, low, high, alignment, boundary)) goto retry; #endif } if (m_ret == NULL) { if (vm_page_alloc_fail(object, domain, req)) goto again; return (NULL); } vm_pagequeue_freecnt_adj(domain, -npages); vm_pagequeue_free_unlock(domain); #if VM_NRESERVLEVEL > 0 found: #endif for (m = m_ret; m < &m_ret[npages]; m++) vm_page_alloc_check(m); /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) busy_lock = VPB_SHARERS_WORD(1); if ((req & VM_ALLOC_WIRED) != 0) atomic_add_int(&vm_cnt.v_wire_count, npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->wire_count = 1; m->act_count = 0; m->oflags = oflags; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { pagedaemon_wakeup(domain); if ((req & VM_ALLOC_WIRED) != 0) atomic_subtract_int( &vm_cnt.v_wire_count, npages); KASSERT(m->object == NULL, ("page %p has object", m)); mpred = m; for (m = m_ret; m < &m_ret[npages]; m++) { if (m <= mpred && (req & VM_ALLOC_WIRED) != 0) m->wire_count = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); } if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } mpred = m; } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } vmd = VM_DOMAIN(domain); if (vm_paging_needed(vmd, vmd->vmd_free_count)) pagedaemon_wakeup(domain); return (m_ret); } /* * Check a page that has been freshly dequeued from a freelist. */ static void vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->queue == PQ_NONE, ("page %p has unexpected queue %d", m, m->queue)); KASSERT(m->wire_count == 0, ("page %p is wired", m)); KASSERT(m->hold_count == 0, ("page %p is held", m)); KASSERT(!vm_page_busied(m), ("page %p is busy", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); } /* * vm_page_alloc_freelist: * * Allocate a physical page from the specified free page list. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc_freelist(int freelist, int req) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, kernel_object, &domain, &req); do { m = vm_page_alloc_freelist_domain(domain, freelist, req); if (m != NULL) break; } while (vm_domainset_iter_page(&di, &domain, &req) == 0); return (m); } vm_page_t vm_page_alloc_freelist_domain(int domain, int freelist, int req) { vm_page_t m; u_int flags, free_count; /* * Do not allocate reserved pages unless the req has asked for it. */ again: vm_pagequeue_free_lock(domain); if (vm_page_available(domain, req, 1)) m = vm_phys_alloc_freelist_pages(domain, freelist, VM_FREEPOOL_DIRECT, 0); if (m == NULL) { if (vm_page_alloc_fail(NULL, domain, req)) goto again; return (NULL); } free_count = vm_pagequeue_freecnt_adj(domain, -1); vm_pagequeue_free_unlock(domain); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ m->aflags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; m->flags &= flags; if ((req & VM_ALLOC_WIRED) != 0) { /* * The page lock is not required for wiring a page that does * not belong to an object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; if (vm_paging_needed(VM_DOMAIN(domain), free_count)) pagedaemon_wakeup(domain); return (m); } #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ /* * vm_page_scan_contig: * * Scan vm_page_array[] between the specified entries "m_start" and * "m_end" for a run of contiguous physical pages that satisfy the * specified conditions, and return the lowest page in the run. The * specified "alignment" determines the alignment of the lowest physical * page in the run. If the specified "boundary" is non-zero, then the * run of physical pages cannot span a physical address that is a * multiple of "boundary". * * "m_end" is never dereferenced, so it need not point to a vm_page * structure within vm_page_array[]. * * "npages" must be greater than zero. "m_start" and "m_end" must not * span a hole (or discontiguity) in the physical address space. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options) { struct mtx *m_mtx; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_run; #if VM_NRESERVLEVEL > 0 int level; #endif int m_inc, order, run_ext, run_len; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); m_run = NULL; run_len = 0; m_mtx = NULL; for (m = m_start; m < m_end && run_len < npages; m += m_inc) { KASSERT((m->flags & PG_MARKER) == 0, ("page %p is PG_MARKER", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->wire_count == 1, ("fictitious page %p has invalid wire count", m)); /* * If the current page would be the start of a run, check its * physical address against the end, alignment, and boundary * conditions. If it doesn't satisfy these conditions, either * terminate the scan or advance to the next page that * satisfies the failed condition. */ if (run_len == 0) { KASSERT(m_run == NULL, ("m_run != NULL")); if (m + npages > m_end) break; pa = VM_PAGE_TO_PHYS(m); if ((pa & (alignment - 1)) != 0) { m_inc = atop(roundup2(pa, alignment) - pa); continue; } if (rounddown2(pa ^ (pa + ptoa(npages) - 1), boundary) != 0) { m_inc = atop(roundup2(pa, boundary) - pa); continue; } } else KASSERT(m_run != NULL, ("m_run == NULL")); vm_page_change_lock(m, &m_mtx); m_inc = 1; retry: if (m->wire_count != 0 || m->hold_count != 0) run_ext = 0; #if VM_NRESERVLEVEL > 0 else if ((level = vm_reserv_level(m)) >= 0 && (options & VPSC_NORESERV) != 0) { run_ext = 0; /* Advance to the end of the reservation. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); } #endif else if ((object = m->object) != NULL) { /* * The page is considered eligible for relocation if * and only if it could be laundered or reclaimed by * the page daemon. */ if (!VM_OBJECT_TRYRLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_RLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_RUNLOCK(object); goto retry; } else if (m->wire_count != 0 || m->hold_count != 0) { run_ext = 0; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) { run_ext = 0; #if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { run_ext = 0; /* Advance to the end of the superpage. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && m->queue != PQ_NONE && !vm_page_busied(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one * page. */ KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ run_ext = 1; } else run_ext = 0; unlock: VM_OBJECT_RUNLOCK(object); #if VM_NRESERVLEVEL > 0 } else if (level >= 0) { /* * The page is reserved but not yet allocated. In * other words, it is still free. Extend the current * run by one page. */ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it is the * first page in a power-of-two-sized run of * contiguous free pages. Add these pages to the end * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; } else { /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's * free page queues. However, it is not the first * page in a run of contiguous free pages. (This case * rarely occurs because the scan is performed in * ascending order.) (2) It is not reserved, and it is * transitioning from free to allocated. (Conversely, * the transition from allocated to free for managed * pages is blocked by the page lock.) (3) It is * allocated but not contained by an object and not * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } /* * Extend or reset the current run of pages. */ if (run_ext > 0) { if (run_len == 0) m_run = m; run_len += run_ext; } else { if (run_len > 0) { m_run = NULL; run_len = 0; } } } if (m_mtx != NULL) mtx_unlock(m_mtx); if (run_len >= npages) return (m_run); return (NULL); } /* * vm_page_reclaim_run: * * Try to relocate each of the allocated virtual pages within the * specified run of physical pages to a new physical address. Free the * physical pages underlying the relocated virtual pages. A virtual page * is relocatable if and only if it could be laundered or reclaimed by * the page daemon. Whenever possible, a virtual page is relocated to a * physical address above "high". * * Returns 0 if every physical page within the run was already free or * just freed by a successful relocation. Otherwise, returns a non-zero * value indicating why the last attempt to relocate a virtual page was * unsuccessful. * * "req_class" must be an allocation class. */ static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high) { struct mtx *m_mtx; struct spglist free; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_end, m_new; int error, order, req; KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, ("req_class is not an allocation class")); SLIST_INIT(&free); error = 0; m = m_run; m_end = m_run + npages; m_mtx = NULL; for (; error == 0 && m < m_end; m++) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * Avoid releasing and reacquiring the same page lock. */ vm_page_change_lock(m, &m_mtx); retry: if (m->wire_count != 0 || m->hold_count != 0) error = EBUSY; else if ((object = m->object) != NULL) { /* * The page is relocated if and only if it could be * laundered or reclaimed by the page daemon. */ if (!VM_OBJECT_TRYWLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_WLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_WUNLOCK(object); goto retry; } else if (m->wire_count != 0 || m->hold_count != 0) { error = EBUSY; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; else if (m->queue != PQ_NONE && !vm_page_busied(m)) { KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ if (m->valid != 0) { /* * First, try to allocate a new page * that is above "high". Failing * that, try to allocate a new page * that is below "m_run". Allocate * the new page between the end of * "m_run" and "high" only as a last * resort. */ req = req_class | VM_ALLOC_NOOBJ; if ((m->flags & PG_NODUMP) != 0) req |= VM_ALLOC_NODUMP; if (trunc_page(high) != ~(vm_paddr_t)PAGE_MASK) { m_new = vm_page_alloc_contig( NULL, 0, req, 1, round_page(high), ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } else m_new = NULL; if (m_new == NULL) { pa = VM_PAGE_TO_PHYS(m_run); m_new = vm_page_alloc_contig( NULL, 0, req, 1, 0, pa - 1, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { pa += ptoa(npages); m_new = vm_page_alloc_contig( NULL, 0, req, 1, pa, high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { error = ENOMEM; goto unlock; } KASSERT(m_new->wire_count == 0, ("page %p is wired", m)); /* * Replace "m" with the new page. For * vm_page_replace(), "m" must be busy * and dequeued. Finally, change "m" * as if vm_page_free() was called. */ if (object->ref_count != 0) pmap_remove_all(m); m_new->aflags = m->aflags; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m)); m_new->oflags = m->oflags & VPO_NOSYNC; pmap_copy_page(m, m_new); m_new->valid = m->valid; m_new->dirty = m->dirty; m->flags &= ~PG_ZERO; vm_page_xbusy(m); vm_page_remque(m); vm_page_replace_checked(m_new, object, m->pindex, m); m->valid = 0; vm_page_undirty(m); /* * The new page must be deactivated * before the object is unlocked. */ vm_page_change_lock(m_new, &m_mtx); vm_page_deactivate(m_new); } else { m->flags &= ~PG_ZERO; vm_page_remque(m); vm_page_remove(m); KASSERT(m->dirty == 0, ("page %p is dirty", m)); } SLIST_INSERT_HEAD(&free, m, plinks.s.ss); } else error = EBUSY; unlock: VM_OBJECT_WUNLOCK(object); } else { MPASS(vm_phys_domain(m) == domain); vm_pagequeue_free_lock(domain); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it * is the first page in a power-of-two-sized * run of contiguous free pages. Jump ahead * to the last page within that run, and * continue from there. */ m += (1 << order) - 1; } #if VM_NRESERVLEVEL > 0 else if (vm_reserv_is_page_free(m)) order = 0; #endif vm_pagequeue_free_unlock(domain); if (order == VM_NFREEORDER) error = EINVAL; } } if (m_mtx != NULL) mtx_unlock(m_mtx); if ((m = SLIST_FIRST(&free)) != NULL) { MPASS(vm_phys_domain(m) == domain); vm_pagequeue_free_lock(domain); do { SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_page_free_phys(m); } while ((m = SLIST_FIRST(&free)) != NULL); vm_page_free_wakeup(domain); vm_pagequeue_free_unlock(domain); } return (error); } #define NRUNS 16 CTASSERT(powerof2(NRUNS)); #define RUN_INDEX(count) ((count) & (NRUNS - 1)) #define MIN_RECLAIM 8 /* * vm_page_reclaim_contig: * * Reclaim allocated, contiguous physical memory satisfying the specified * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. Since * relocation requires the allocation of physical pages, reclamation may * fail due to a shortage of free pages. When reclamation fails, callers * are expected to perform VM_WAIT before retrying a failed allocation * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * The optional allocation flags are ignored. * * "npages" must be greater than zero. Both "alignment" and "boundary" * must be a power of two. */ bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domain *vmd; vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, reclaimed; int error, i, options, req_class; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Return if the number of free pages cannot satisfy the requested * allocation. */ vmd = VM_DOMAIN(domain); count = vmd->vmd_free_count; if (count < npages + vmd->vmd_free_reserved || (count < npages + vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); /* * Scan up to three times, relaxing the restrictions ("options") on * the reclamation of reservations and superpages each time. */ for (options = VPSC_NORESERV;;) { /* * Find the highest runs that satisfy the given constraints * and restrictions, and record them in "m_runs". */ curr_low = low; count = 0; for (;;) { m_run = vm_phys_scan_contig(domain, npages, curr_low, high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); m_runs[RUN_INDEX(count)] = m_run; count++; } /* * Reclaim the highest runs in LIFO (descending) order until * the number of reclaimed pages, "reclaimed", is at least * MIN_RECLAIM. Reset "reclaimed" each time because each * reclamation is idempotent, and runs will (likely) recur * from one scan to the next as restrictions are relaxed. */ reclaimed = 0; for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; error = vm_page_reclaim_run(req_class, domain, npages, m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) return (true); } } /* * Either relax the restrictions on the next scan or return if * the last scan had no restrictions. */ if (options == VPSC_NORESERV) options = VPSC_NOSUPER; else if (options == VPSC_NOSUPER) options = VPSC_ANY; else if (options == VPSC_ANY) return (reclaimed != 0); } } bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domainset_iter di; int domain; bool ret; vm_domainset_iter_page_init(&di, kernel_object, &domain, &req); do { ret = vm_page_reclaim_contig_domain(domain, req, npages, low, high, alignment, boundary); if (ret) break; } while (vm_domainset_iter_page(&di, &domain, &req) == 0); return (ret); } +/* + * Set the domain in the appropriate page level domainset. + */ +void +vm_domain_set(int domain) +{ + struct vm_domain *vmd; + vmd = VM_DOMAIN(domain); + mtx_lock(&vm_domainset_lock); + if (!vmd->vmd_minset && vm_paging_min(vmd)) { + vmd->vmd_minset = 1; + DOMAINSET_SET(domain, &vm_min_domains); + } + if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { + vmd->vmd_severeset = 1; + DOMAINSET_CLR(domain, &vm_severe_domains); + } + mtx_unlock(&vm_domainset_lock); +} + /* - * vm_wait: (also see VM_WAIT macro) + * Clear the domain from the appropriate page level domainset. + */ +static void +vm_domain_clear(int domain) +{ + struct vm_domain *vmd; + + vmd = VM_DOMAIN(domain); + mtx_lock(&vm_domainset_lock); + if (vmd->vmd_minset && !vm_paging_min(vmd)) { + vmd->vmd_minset = 0; + DOMAINSET_CLR(domain, &vm_min_domains); + if (!vm_page_count_min()) + wakeup(&vm_min_domains); + } + if (vmd->vmd_severeset && !vm_paging_severe(vmd)) { + vmd->vmd_severeset = 0; + DOMAINSET_CLR(domain, &vm_severe_domains); + if (!vm_page_count_severe()) + wakeup(&vm_severe_domains); + } + mtx_unlock(&vm_domainset_lock); +} + +/* + * Wait for free pages to exceed the min threshold globally. + */ +void +vm_wait_min(void) +{ + + mtx_lock(&vm_domainset_lock); + while (vm_page_count_min()) + msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); + mtx_unlock(&vm_domainset_lock); +} + +/* + * Wait for free pages to exceed the severe threshold globally. + */ +void +vm_wait_severe(void) +{ + + mtx_lock(&vm_domainset_lock); + while (vm_page_count_min()) + msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); + mtx_unlock(&vm_domainset_lock); +} + +/* + * vm_wait_domain: * * Sleep until free pages are available for allocation. - * - Called in various places before memory allocations. + * - Called in various places after failed memory allocations. */ -static void +void vm_wait_domain(int domain) { struct vm_domain *vmd; vm_pagequeue_free_assert_locked(domain); vmd = VM_DOMAIN(domain); if (curproc == pageproc) { vmd->vmd_pageout_pages_needed = 1; msleep(&vmd->vmd_pageout_pages_needed, &vmd->vmd_pagequeue_free_mtx, PDROP | PSWP, "VMWait", 0); } else { if (pageproc == NULL) panic("vm_wait in early boot"); pagedaemon_wait(domain, PVM, "vmwait"); } } +/* + * vm_wait: (also see VM_WAIT macro) + * + * Sleep until free pages are available for allocation. + * - Called in various places after failed memory allocations. + */ void vm_wait(void) { #if 0 /* XXX */ mtx_lock(&vm_page_queue_free_mtx); _vm_wait(); #else pause("vmxxx", 1); #endif } /* * vm_page_alloc_fail: * * Called when a page allocation function fails. Informs the * pagedaemon and performs the requested wait. Requires the * page_queue_free and object lock on entry. Returns with the * object lock held and free lock released. Returns an error when * retry is necessary. * */ static int vm_page_alloc_fail(vm_object_t object, int domain, int req) { struct vm_domain *vmd; vm_pagequeue_free_assert_locked(domain); vmd = VM_DOMAIN(domain); atomic_add_int(&vmd->vmd_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) { if (object != NULL) VM_OBJECT_WUNLOCK(object); vm_wait_domain(domain); if (object != NULL) VM_OBJECT_WLOCK(object); if (req & VM_ALLOC_WAITOK) return (EAGAIN); } else { vm_pagequeue_free_unlock(domain); pagedaemon_wakeup(domain); } return (0); } /* * vm_waitpfault: (also see VM_WAITPFAULT macro) * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(void) { #if 0 /* XXX */ mtx_lock(&vm_page_queue_free_mtx); pagedaemon_wait(PUSER, "pfault"); #else pause("vmxxx", 1); #endif } struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { return (&vm_pagequeue_domain(m)->vmd_pagequeues[m->queue]); } /* * vm_page_dequeue: * * Remove the given page from its current page queue. * * The page must be locked. */ void vm_page_dequeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_assert_locked(m); KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); vm_pagequeue_unlock(pq); } /* * vm_page_dequeue_locked: * * Remove the given page from its current page queue. * * The page and page queue must be locked. */ void vm_page_dequeue_locked(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); } /* * vm_page_enqueue: * * Add the given page to the specified page queue. * * The page must be locked. */ static void vm_page_enqueue(uint8_t queue, vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(queue < PQ_COUNT, ("vm_page_enqueue: invalid queue %u request for page %p", queue, m)); pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } /* * vm_page_requeue: * * Move the given page to the tail of its current page queue. * * The page must be locked. */ void vm_page_requeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, ("vm_page_requeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_unlock(pq); } /* * vm_page_requeue_locked: * * Move the given page to the tail of its current page queue. * * The page queue must be locked. */ void vm_page_requeue_locked(vm_page_t m) { struct vm_pagequeue *pq; KASSERT(m->queue != PQ_NONE, ("vm_page_requeue_locked: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * Ensure that act_count is at least ACT_INIT but do not otherwise * mess with it. * * The page must be locked. */ void vm_page_activate(vm_page_t m) { int queue; vm_page_lock_assert(m, MA_OWNED); if ((queue = m->queue) != PQ_ACTIVE) { if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; if (queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_ACTIVE, m); } else KASSERT(queue == PQ_NONE, ("vm_page_activate: wired page %p is queued", m)); } else { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; } } /* * vm_page_free_wakeup: * * Helper routine for vm_page_free_toq(). This routine is called * when a page is added to the free queues. * * The page queues must be locked. */ static void vm_page_free_wakeup(int domain) { struct vm_domain *vmd; vm_pagequeue_free_assert_locked(domain); vmd = VM_DOMAIN(domain); /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vmd->vmd_pageout_pages_needed && vmd->vmd_free_count >= vmd->vmd_pageout_free_min) { wakeup(&vmd->vmd_pageout_pages_needed); vmd->vmd_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if (vmd->vmd_pages_needed && !vm_paging_min(vmd)) { vmd->vmd_pages_needed = false; wakeup(&vmd->vmd_free_count); } + if ((vmd->vmd_minset && !vm_paging_min(vmd)) || + (vmd->vmd_severeset && !vm_paging_severe(vmd))) + vm_domain_clear(domain); } /* * vm_page_free_prep: * * Prepares the given page to be put on the free list, * disassociating it from any VM object. The caller may return * the page to the free list only if this function returns true. * * The object must be locked. The page must be locked if it is * managed. For a queued managed page, the pagequeue_locked * argument specifies whether the page queue is already locked. */ bool vm_page_free_prep(vm_page_t m, bool pagequeue_locked) { #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP) if ((m->flags & PG_ZERO) != 0) { uint64_t *p; int i; p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++) KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx", m, i, (uintmax_t)*p)); } #endif if ((m->oflags & VPO_UNMANAGED) == 0) { vm_page_lock_assert(m, MA_OWNED); KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_toq: freeing mapped page %p", m)); } else KASSERT(m->queue == PQ_NONE, ("vm_page_free_toq: unmanaged page %p is queued", m)); VM_CNT_INC(v_tfree); if (vm_page_sbusied(m)) panic("vm_page_free: freeing busy page %p", m); vm_page_remove(m); /* * If fictitious remove object association and * return. */ if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("fictitious page %p is not wired", m)); KASSERT(m->queue == PQ_NONE, ("fictitious page %p is queued", m)); return (false); } if (m->queue != PQ_NONE) { if (pagequeue_locked) vm_page_dequeue_locked(m); else vm_page_dequeue(m); } m->valid = 0; vm_page_undirty(m); if (m->wire_count != 0) panic("vm_page_free: freeing wired page %p", m); if (m->hold_count != 0) { m->flags &= ~PG_ZERO; KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("vm_page_free: freeing PG_UNHOLDFREE page %p", m)); m->flags |= PG_UNHOLDFREE; return (false); } /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); return (true); } /* * Insert the page into the physical memory allocator's free page * queues. This is the last step to free a page. */ static void vm_page_free_phys(vm_page_t m) { vm_pagequeue_free_assert_locked(vm_phys_domain(m)); vm_pagequeue_freecnt_adj(vm_phys_domain(m), 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #endif vm_phys_free_pages(m, 0); } void vm_page_free_phys_pglist(struct pglist *tq) { vm_page_t m; int domain; if (TAILQ_EMPTY(tq)) return; domain = -1; TAILQ_FOREACH(m, tq, listq) { if (domain != vm_phys_domain(m)) { if (domain != -1) { vm_page_free_wakeup(domain); vm_pagequeue_free_unlock(domain); } domain = vm_phys_domain(m); vm_pagequeue_free_lock(domain); } vm_page_free_phys(m); } if (domain != -1) { vm_page_free_wakeup(domain); vm_pagequeue_free_unlock(domain); } } /* * vm_page_free_toq: * * Returns the given page to the free list, disassociating it * from any VM object. * * The object must be locked. The page must be locked if it is * managed. */ void vm_page_free_toq(vm_page_t m) { int domain; if (!vm_page_free_prep(m, false)) return; domain = vm_phys_domain(m); vm_pagequeue_free_lock(domain); vm_page_free_phys(m); vm_page_free_wakeup(domain); vm_pagequeue_free_unlock(domain); } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * If the page is fictitious, then its wire count must remain one. * * The page must be locked. */ void vm_page_wire(vm_page_t m) { /* * Only bump the wire statistics if the page is not already wired, * and only unqueue the page if it is on some queue (if it is unmanaged * it is already off the queues). */ vm_page_lock_assert(m, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_wire: fictitious page %p's wire count isn't one", m)); return; } if (m->wire_count == 0) { KASSERT((m->oflags & VPO_UNMANAGED) == 0 || m->queue == PQ_NONE, ("vm_page_wire: unmanaged page %p is queued", m)); vm_page_remque(m); atomic_add_int(&vm_cnt.v_wire_count, 1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); } /* * vm_page_unwire: * * Release one wiring of the specified page, potentially allowing it to be * paged out. Returns TRUE if the number of wirings transitions to zero and * FALSE otherwise. * * Only managed pages belonging to an object can be paged out. If the number * of wirings transitions to zero and the page is eligible for page out, then * the page is added to the specified paging queue (unless PQ_NONE is * specified). * * If a page is fictitious, then its wire count must always be one. * * A managed page must be locked. */ boolean_t vm_page_unwire(vm_page_t m, uint8_t queue) { KASSERT(queue < PQ_COUNT || queue == PQ_NONE, ("vm_page_unwire: invalid queue %u request for page %p", queue, m)); if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); return (FALSE); } if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); if ((m->oflags & VPO_UNMANAGED) == 0 && m->object != NULL && queue != PQ_NONE) vm_page_enqueue(queue, m); return (TRUE); } else return (FALSE); } else panic("vm_page_unwire: page %p's wire count is zero", m); } /* * Move the specified page to the inactive queue. * * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive * queue. However, setting "noreuse" to TRUE will accelerate the specified * page's reclamation, but it will not unmap the page from any address space. * This is implemented by inserting the page near the head of the inactive * queue, using a marker page to guide FIFO insertion ordering. * * The page must be locked. */ static inline void _vm_page_deactivate(vm_page_t m, boolean_t noreuse) { struct vm_pagequeue *pq; int queue; vm_page_assert_locked(m); /* * Ignore if the page is already inactive, unless it is unlikely to be * reactivated. */ if ((queue = m->queue) == PQ_INACTIVE && !noreuse) return; if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { pq = &vm_pagequeue_domain(m)->vmd_pagequeues[PQ_INACTIVE]; /* Avoid multiple acquisitions of the inactive queue lock. */ if (queue == PQ_INACTIVE) { vm_pagequeue_lock(pq); vm_page_dequeue_locked(m); } else { if (queue != PQ_NONE) vm_page_dequeue(m); vm_pagequeue_lock(pq); } m->queue = PQ_INACTIVE; if (noreuse) TAILQ_INSERT_BEFORE( &vm_pagequeue_domain(m)->vmd_inacthead, m, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } } /* * Move the specified page to the inactive queue. * * The page must be locked. */ void vm_page_deactivate(vm_page_t m) { _vm_page_deactivate(m, FALSE); } /* * Move the specified page to the inactive queue with the expectation * that it is unlikely to be reused. * * The page must be locked. */ void vm_page_deactivate_noreuse(vm_page_t m) { _vm_page_deactivate(m, TRUE); } /* * vm_page_launder * * Put a page in the laundry. */ void vm_page_launder(vm_page_t m) { int queue; vm_page_assert_locked(m); if ((queue = m->queue) != PQ_LAUNDRY) { if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_LAUNDRY, m); } else KASSERT(queue == PQ_NONE, ("wired page %p is queued", m)); } } /* * vm_page_unswappable * * Put a page in the PQ_UNSWAPPABLE holding queue. */ void vm_page_unswappable(vm_page_t m) { vm_page_assert_locked(m); KASSERT(m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0, ("page %p already unswappable", m)); if (m->queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_UNSWAPPABLE, m); } /* * Attempt to free the page. If it cannot be freed, do nothing. Returns true * if the page is freed and false otherwise. * * The page must be managed. The page and its containing object must be * locked. */ bool vm_page_try_to_free(vm_page_t m) { vm_page_assert_locked(m); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("page %p is unmanaged", m)); if (m->dirty != 0 || m->hold_count != 0 || m->wire_count != 0 || vm_page_busied(m)) return (false); if (m->object->ref_count != 0) { pmap_remove_all(m); if (m->dirty != 0) return (false); } vm_page_free(m); return (true); } /* * vm_page_advise * * Apply the specified advice to the given page. * * The object and page must be locked. */ void vm_page_advise(vm_page_t m, int advice) { vm_page_assert_locked(m); VM_OBJECT_ASSERT_WLOCKED(m->object); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed * without first paging it out. MADV_FREE pages are often * quickly reused by malloc(3), so we do not do anything that * would result in a page fault on a later access. */ vm_page_undirty(m); else if (advice != MADV_DONTNEED) { if (advice == MADV_WILLNEED) vm_page_activate(m); return; } /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that * the page will be reused quickly. Dirty pages not already in the * laundry are moved there. */ if (m->dirty == 0) vm_page_deactivate_noreuse(m); else vm_page_launder(m); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int sleep; int pflags; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); if ((allocflags & VM_ALLOC_NOWAIT) == 0) pflags |= VM_ALLOC_WAITFAIL; retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); vm_page_lock(m); VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "pgrbwt", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); goto retrylookup; } else { if ((allocflags & VM_ALLOC_WIRED) != 0) { vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); } if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); return (m); } } m = vm_page_alloc(object, pindex, pflags); if (m == NULL) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); goto retrylookup; } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } /* * Return the specified range of pages from the given object. For each * page offset within the range, if a page already exists within the object * at that offset and it is busy, then wait for it to change state. If, * instead, the page doesn't exist, then allocate it. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs the pages * * The caller must always specify that the pages are to be busied and/or * wired. * * optional allocation flags: * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NOWAIT do not sleep * VM_ALLOC_SBUSY set page to sbusy state * VM_ALLOC_WIRED wire the pages * VM_ALLOC_ZERO zero and validate any invalid pages * * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it * may return a partial prefix of the requested range. */ int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count) { vm_page_t m, mpred; int pflags; int i; bool sleep; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0, ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed")); KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 || (allocflags & VM_ALLOC_WIRED) != 0, ("vm_page_grab_pages: the pages must be busied or wired")); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch")); if (count == 0) return (0); pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL | VM_ALLOC_IGN_SBUSY); if ((allocflags & VM_ALLOC_NOWAIT) == 0) pflags |= VM_ALLOC_WAITFAIL; i = 0; retrylookup: m = vm_radix_lookup_le(&object->rtree, pindex + i); if (m == NULL || m->pindex != pindex + i) { mpred = m; m = NULL; } else mpred = TAILQ_PREV(m, pglist, listq); for (; i < count; i++) { if (m != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) break; /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); vm_page_lock(m); VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "grbmaw", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); goto retrylookup; } if ((allocflags & VM_ALLOC_WIRED) != 0) { vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); } if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); } else { m = vm_page_alloc_after(object, pindex + i, pflags | VM_ALLOC_COUNT(count - i), mpred); if (m == NULL) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) break; goto retrylookup; } } if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) { if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; } ma[i] = mpred = m; m = vm_page_next(m); } return (i); } /* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. */ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } /* * vm_page_set_valid_range: * * Sets portions of a page valid. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ m->valid |= vm_page_bits(base, size); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { uintptr_t addr; #if PAGE_SIZE < 16384 int shift; #endif /* * If the object is locked and the page is neither exclusive busy nor * write mapped, then the page's dirty field cannot possibly be * set by a concurrent pmap operation. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else { /* * The pmap layer can call vm_page_dirty() without * holding a distinguished lock. The combination of * the object's lock and an atomic operation suffice * to guarantee consistency of the page dirty field. * * For PAGE_SIZE == 32768 case, compiler already * properly aligns the dirty field, so no forcible * alignment is needed. Only require existence of * atomic_clear_64 when page size is 32768. */ addr = (uintptr_t)&m->dirty; #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)addr, pagebits); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)addr, pagebits); #else /* PAGE_SIZE <= 8192 */ /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_clear_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, pagebits << shift); #endif /* PAGE_SIZE */ } } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the VPO_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; m->oflags &= ~VPO_NOSYNC; } else if (oldvalid != VM_PAGE_BITS_ALL) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); m->valid &= ~bits; m->dirty &= ~bits; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_ASSERT_WLOCKED(m->object); /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zeroed by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; VM_OBJECT_ASSERT_LOCKED(m->object); bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * Returns true if all of the specified predicates are true for the entire * (super)page and false otherwise. */ bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m) { vm_object_t object; int i, npages; object = m->object; VM_OBJECT_ASSERT_LOCKED(object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { /* Always test object consistency, including "skip_m". */ if (m[i].object != object) return (false); if (&m[i] == skip_m) continue; if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i])) return (false); if ((flags & PS_ALL_DIRTY) != 0) { /* * Calling vm_page_test_dirty() or pmap_is_modified() * might stop this case from spuriously returning * "false". However, that would require a write lock * on the object containing "m[i]". */ if (m[i].dirty != VM_PAGE_BITS_ALL) return (false); } if ((flags & PS_ALL_VALID) != 0 && m[i].valid != VM_PAGE_BITS_ALL) return (false); } return (true); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of the containing object's lock or the exclusive busy. * holder. Unfortunately, the holder of the write busy is * not recorded, and thus cannot be checked here. */ if (m->object != NULL && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_WLOCKED(m->object); } void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits) { if ((bits & PGA_WRITEABLE) == 0) return; /* * The PGA_WRITEABLE flag can only be set if the page is * managed, is exclusively busied or the object is locked. * Currently, this flag is only set by pmap_enter(). */ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("PGA_WRITEABLE on unmanaged page")); if (!vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("vm_cnt.v_free_count: %d\n", vm_free_count()); db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count()); db_printf("vm_cnt.v_active_count: %d\n", vm_active_count()); db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count()); db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int dom; db_printf("pq_free %d\n", vm_free_count()); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags, m->flags, m->act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ Index: user/jeff/numa/sys/vm/vm_pageout.h =================================================================== --- user/jeff/numa/sys/vm/vm_pageout.h (revision 328171) +++ user/jeff/numa/sys/vm/vm_pageout.h (revision 328172) @@ -1,110 +1,113 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.h 8.2 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Author: Avadis Tevanian, Jr. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #ifndef _VM_VM_PAGEOUT_H_ #define _VM_VM_PAGEOUT_H_ /* * Header file for pageout daemon. */ /* * Exported data structures. */ extern int vm_page_max_wired; extern int vm_pageout_page_count; #define VM_OOM_MEM 1 #define VM_OOM_SWAPZ 2 /* * vm_lowmem flags. */ #define VM_LOW_KMEM 0x01 #define VM_LOW_PAGES 0x02 /* * Exported routines. */ /* * Signal pageout-daemon and wait for it. */ void pagedaemon_wait(int domain, int pri, const char *wmesg); void pagedaemon_wakeup(int domain); #define VM_WAIT vm_wait() #define VM_WAITPFAULT vm_waitpfault() void vm_wait(void); void vm_waitpfault(void); +void vm_wait_domain(int domain); +void vm_wait_min(void); +void vm_wait_severe(void); #ifdef _KERNEL int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *); void vm_pageout_oom(int shortage); void vm_swapout_run(void); void vm_swapout_run_idle(void); #endif #endif /* _VM_VM_PAGEOUT_H_ */ Index: user/jeff/numa/sys/vm/vm_pagequeue.h =================================================================== --- user/jeff/numa/sys/vm/vm_pagequeue.h (revision 328171) +++ user/jeff/numa/sys/vm/vm_pagequeue.h (revision 328172) @@ -1,204 +1,230 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #ifndef _VM_PAGEQUEUE_ #define _VM_PAGEQUEUE_ #ifdef _KERNEL struct vm_pagequeue { struct mtx pq_mutex; struct pglist pq_pl; int pq_cnt; const char * const pq_name; } __aligned(CACHE_LINE_SIZE); struct vm_domain { struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; struct mtx_padalign vmd_pagequeue_free_mtx; struct vmem *vmd_kernel_arena; u_int vmd_page_count; u_int vmd_free_count; long vmd_segs; /* bitmask of the segments */ boolean_t vmd_oom; int vmd_oom_seq; int vmd_last_active_scan; struct vm_page vmd_laundry_marker; struct vm_page vmd_marker; /* marker for pagedaemon private use */ struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */ int vmd_pageout_pages_needed; /* page daemon waiting for pages? */ int vmd_pageout_deficit; /* Estimated number of pages deficit */ bool vmd_pages_needed; /* Are threads waiting for free pages? */ bool vmd_pageout_wanted; /* pageout daemon wait channel */ + bool vmd_minset; /* Are we in vm_min_domains? */ + bool vmd_severeset; /* Are we in vm_severe_domains? */ int vmd_inactq_scans; enum { VM_LAUNDRY_IDLE = 0, VM_LAUNDRY_BACKGROUND, VM_LAUNDRY_SHORTFALL } vmd_laundry_request; u_int vmd_free_reserved; /* (c) pages reserved for deadlock */ u_int vmd_free_target; /* (c) pages desired free */ u_int vmd_free_min; /* (c) pages desired free */ u_int vmd_inactive_target; /* (c) pages desired inactive */ u_int vmd_pageout_free_min; /* (c) min pages reserved for kernel */ u_int vmd_pageout_wakeup_thresh;/* (c) min pages to wake pagedaemon */ u_int vmd_interrupt_free_min; /* (c) reserved pages for int code */ u_int vmd_free_severe; /* (c) severe page depletion point */ -}; +} __aligned(CACHE_LINE_SIZE); extern struct vm_domain vm_dom[MAXMEMDOM]; #define VM_DOMAIN(n) (&vm_dom[(n)]) #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) #define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex) #define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) #define vm_pagequeue_free_assert_locked(n) \ mtx_assert(vm_pagequeue_free_lockptr((n)), MA_OWNED) #define vm_pagequeue_free_assert_unlocked(n) \ mtx_assert(vm_pagequeue_free_lockptr((n)), MA_NOTOWNED) #define vm_pagequeue_free_lock(n) \ mtx_lock(vm_pagequeue_free_lockptr((n))) #define vm_pagequeue_free_lockptr(n) \ (&VM_DOMAIN((n))->vmd_pagequeue_free_mtx) #define vm_pagequeue_free_unlock(n) \ mtx_unlock(vm_pagequeue_free_lockptr((n))) static __inline void vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend) { #ifdef notyet vm_pagequeue_assert_locked(pq); #endif pq->pq_cnt += addend; } #define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1) #define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1) -static inline u_int -vm_pagequeue_freecnt_adj(int domain, int adj) -{ +void vm_domain_set(int domain); - vm_pagequeue_free_assert_locked(domain); - return (vm_dom[domain].vmd_free_count += adj); -} - /* * vm_pagequeue_domain: * * Return the memory domain the page belongs to. */ static inline struct vm_domain * vm_pagequeue_domain(vm_page_t m) { return (VM_DOMAIN(vm_phys_domain(m))); } /* * Return the number of pages we need to free-up or cache * A positive number indicates that we do not have enough free pages. */ static inline int vm_paging_target(struct vm_domain *vmd) { return (vmd->vmd_free_target - vmd->vmd_free_count); } /* * Returns TRUE if the pagedaemon needs to be woken up. */ static inline int vm_paging_needed(struct vm_domain *vmd, u_int free_count) { return (free_count < vmd->vmd_pageout_wakeup_thresh); } +/* + * Returns TRUE if the domain is below the min paging target. + */ static inline int vm_paging_min(struct vm_domain *vmd) { return (vmd->vmd_free_min > vmd->vmd_free_count); } /* + * Returns TRUE if the domain is below the severe paging target. + */ +static inline int +vm_paging_severe(struct vm_domain *vmd) +{ + + return (vmd->vmd_free_severe > vmd->vmd_free_count); +} + +/* * Return the number of pages we need to launder. * A positive number indicates that we have a shortfall of clean pages. */ static inline int vm_laundry_target(struct vm_domain *vmd) { return (vm_paging_target(vmd)); } + +static inline u_int +vm_pagequeue_freecnt_adj(int domain, int adj) +{ + struct vm_domain *vmd; + u_int ret; + + vm_pagequeue_free_assert_locked(domain); + vmd = VM_DOMAIN(domain); + ret = vmd->vmd_free_count += adj; + if ((!vmd->vmd_minset && vm_paging_min(vmd)) || + (!vmd->vmd_severeset && vm_paging_severe(vmd))) + vm_domain_set(domain); + + return (ret); +} + #endif /* _KERNEL */ #endif /* !_VM_PAGEQUEUE_ */ Index: user/jeff/numa/sys/vm/vm_swapout.c =================================================================== --- user/jeff/numa/sys/vm/vm_swapout.c (revision 328171) +++ user/jeff/numa/sys/vm/vm_swapout.c (revision 328172) @@ -1,893 +1,893 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include "opt_kstack_pages.h" #include "opt_kstack_max_pages.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* the kernel process "vm_daemon" */ static void vm_daemon(void); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); static int vm_swap_enabled = 1; static int vm_swap_idle_enabled = 0; SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); /* * Swap_idle_threshold1 is the guaranteed swapped in time for a process */ static int swap_idle_threshold1 = 2; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW, &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process"); /* * Swap_idle_threshold2 is the time that a process can be idle before * it will be swapped out, if idle swapping is enabled. */ static int swap_idle_threshold2 = 10; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW, &swap_idle_threshold2, 0, "Time before a process will be swapped out"); static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; static struct mtx vm_daemon_mtx; /* Allow for use by vm_pageout before vm_daemon is initialized. */ MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); static void swapclear(struct proc *); static int swapout(struct proc *); static void vm_swapout_map_deactivate_pages(vm_map_t, long); static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long); static void swapout_procs(int action); static void vm_req_vmdaemon(int req); static void vm_thread_swapin(struct thread *td); static void vm_thread_swapout(struct thread *td); /* * vm_swapout_object_deactivate_pages * * Deactivate enough pages to satisfy the inactive target * requirements. * * The object and map must be locked. */ static void vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, long desired) { vm_object_t backing_object, object; vm_page_t p; int act_delta, remove_mode; VM_OBJECT_ASSERT_LOCKED(first_object); if ((first_object->flags & OBJ_FICTITIOUS) != 0) return; for (object = first_object;; object = backing_object) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; VM_OBJECT_ASSERT_LOCKED(object); if ((object->flags & OBJ_UNMANAGED) != 0 || object->paging_in_progress != 0) goto unlock_return; remove_mode = 0; if (object->shadow_count > 1) remove_mode = 1; /* * Scan the object's entire memory queue. */ TAILQ_FOREACH(p, &object->memq, listq) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; if (should_yield()) goto unlock_return; if (vm_page_busied(p)) continue; VM_CNT_INC(v_pdpages); vm_page_lock(p); if (p->wire_count != 0 || p->hold_count != 0 || !pmap_page_exists_quick(pmap, p)) { vm_page_unlock(p); continue; } act_delta = pmap_ts_referenced(p); if ((p->aflags & PGA_REFERENCED) != 0) { if (act_delta == 0) act_delta = 1; vm_page_aflag_clear(p, PGA_REFERENCED); } if (!vm_page_active(p) && act_delta != 0) { vm_page_activate(p); p->act_count += act_delta; } else if (vm_page_active(p)) { if (act_delta == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); if (!remove_mode && p->act_count == 0) { pmap_remove_all(p); vm_page_deactivate(p); } else vm_page_requeue(p); } else { vm_page_activate(p); if (p->act_count < ACT_MAX - ACT_ADVANCE) p->act_count += ACT_ADVANCE; vm_page_requeue(p); } } else if (vm_page_inactive(p)) pmap_remove_all(p); vm_page_unlock(p); } if ((backing_object = object->backing_object) == NULL) goto unlock_return; VM_OBJECT_RLOCK(backing_object); if (object != first_object) VM_OBJECT_RUNLOCK(object); } unlock_return: if (object != first_object) VM_OBJECT_RUNLOCK(object); } /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ static void vm_swapout_map_deactivate_pages(vm_map_t map, long desired) { vm_map_entry_t tmpe; vm_object_t obj, bigobj; int nothingwired; if (!vm_map_trylock_read(map)) return; bigobj = NULL; nothingwired = TRUE; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { if (obj->shadow_count <= 1 && (bigobj == NULL || bigobj->resident_page_count < obj->resident_page_count)) { if (bigobj != NULL) VM_OBJECT_RUNLOCK(bigobj); bigobj = obj; } else VM_OBJECT_RUNLOCK(obj); } } if (tmpe->wired_count > 0) nothingwired = FALSE; tmpe = tmpe->next; } if (bigobj != NULL) { vm_swapout_object_deactivate_pages(map->pmap, bigobj, desired); VM_OBJECT_RUNLOCK(bigobj); } /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); vm_swapout_object_deactivate_pages(map->pmap, obj, desired); VM_OBJECT_RUNLOCK(obj); } } tmpe = tmpe->next; } /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0 && nothingwired) { pmap_remove(vm_map_pmap(map), vm_map_min(map), vm_map_max(map)); } vm_map_unlock_read(map); } /* * Swap out requests */ #define VM_SWAP_NORMAL 1 #define VM_SWAP_IDLE 2 void vm_swapout_run(void) { if (vm_swap_enabled) vm_req_vmdaemon(VM_SWAP_NORMAL); } /* * Idle process swapout -- run once per second when pagedaemons are * reclaiming pages. */ void vm_swapout_run_idle(void) { static long lsec; if (!vm_swap_idle_enabled || time_second == lsec) return; vm_req_vmdaemon(VM_SWAP_IDLE); lsec = time_second; } static void vm_req_vmdaemon(int req) { static int lastrun = 0; mtx_lock(&vm_daemon_mtx); vm_pageout_req_swapout |= req; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } mtx_unlock(&vm_daemon_mtx); } static void vm_daemon(void) { struct rlimit rsslim; struct proc *p; struct thread *td; struct vmspace *vm; int breakout, swapout_flags, tryagain, attempts; #ifdef RACCT uint64_t rsize, ravailable; #endif while (TRUE) { mtx_lock(&vm_daemon_mtx); msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", #ifdef RACCT racct_enable ? hz : 0 #else 0 #endif ); swapout_flags = vm_pageout_req_swapout; vm_pageout_req_swapout = 0; mtx_unlock(&vm_daemon_mtx); if (swapout_flags) swapout_procs(swapout_flags); /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ tryagain = 0; attempts = 0; again: attempts++; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ PROC_LOCK(p); if (p->p_state != PRS_NORMAL || p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { PROC_UNLOCK(p); continue; } /* * if the process is in a non-running type state, * don't touch it. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get a limit */ lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); limit = OFF_TO_IDX( qmin(rsslim.rlim_cur, rsslim.rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ vm = vmspace_acquire_ref(p); _PHOLD_LITE(p); PROC_UNLOCK(p); if (vm == NULL) { PRELE(p); continue; } sx_sunlock(&allproc_lock); size = vmspace_resident_count(vm); if (size >= limit) { vm_swapout_map_deactivate_pages( &vm->vm_map, limit); size = vmspace_resident_count(vm); } #ifdef RACCT if (racct_enable) { rsize = IDX_TO_OFF(size); PROC_LOCK(p); if (p->p_state == PRS_NORMAL) racct_set(p, RACCT_RSS, rsize); ravailable = racct_get_available(p, RACCT_RSS); PROC_UNLOCK(p); if (rsize > ravailable) { /* * Don't be overly aggressive; this * might be an innocent process, * and the limit could've been exceeded * by some memory hog. Don't try * to deactivate more than 1/4th * of process' resident set size. */ if (attempts <= 8) { if (ravailable < rsize - (rsize / 4)) { ravailable = rsize - (rsize / 4); } } vm_swapout_map_deactivate_pages( &vm->vm_map, OFF_TO_IDX(ravailable)); /* Update RSS usage after paging out. */ size = vmspace_resident_count(vm); rsize = IDX_TO_OFF(size); PROC_LOCK(p); if (p->p_state == PRS_NORMAL) racct_set(p, RACCT_RSS, rsize); PROC_UNLOCK(p); if (rsize > ravailable) tryagain = 1; } } #endif vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); } sx_sunlock(&allproc_lock); if (tryagain != 0 && attempts <= 10) { maybe_yield(); goto again; } } } /* * Allow a thread's kernel stack to be paged out. */ static void vm_thread_swapout(struct thread *td) { vm_object_t ksobj; vm_page_t m; int i, pages; cpu_thread_swapout(td); pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; pmap_qremove(td->td_kstack, pages); VM_OBJECT_WLOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_swapout: kstack already missing?"); vm_page_dirty(m); vm_page_lock(m); vm_page_unwire(m, PQ_LAUNDRY); vm_page_unlock(m); } VM_OBJECT_WUNLOCK(ksobj); } /* * Bring the kernel stack for a specified thread back in. */ static void vm_thread_swapin(struct thread *td) { vm_object_t ksobj; vm_page_t ma[KSTACK_MAX_PAGES]; int a, count, i, j, pages, rv; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; VM_OBJECT_WLOCK(ksobj); (void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED, ma, pages); for (i = 0; i < pages;) { vm_page_assert_xbusied(ma[i]); if (ma[i]->valid == VM_PAGE_BITS_ALL) { vm_page_xunbusy(ma[i]); i++; continue; } vm_object_pip_add(ksobj, 1); for (j = i + 1; j < pages; j++) if (ma[j]->valid == VM_PAGE_BITS_ALL) break; rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a); KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i])); count = min(a + 1, j - i); rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL); KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d", __func__, td->td_proc->p_pid)); vm_object_pip_wakeup(ksobj); for (j = i; j < i + count; j++) vm_page_xunbusy(ma[j]); i += count; } VM_OBJECT_WUNLOCK(ksobj); pmap_qenter(td->td_kstack, ma, pages); cpu_thread_swapin(td); } void faultin(struct proc *p) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If another process is swapping in this process, * just wait until it finishes. */ if (p->p_flag & P_SWAPPINGIN) { while (p->p_flag & P_SWAPPINGIN) msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0); return; } if ((p->p_flag & P_INMEM) == 0) { /* * Don't let another thread swap process p out while we are * busy swapping it in. */ ++p->p_lock; p->p_flag |= P_SWAPPINGIN; PROC_UNLOCK(p); /* * We hold no lock here because the list of threads * can not change while all threads in the process are * swapped out. */ FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapin(td); PROC_LOCK(p); swapclear(p); p->p_swtick = ticks; wakeup(&p->p_flag); /* Allow other threads to swap p out now. */ --p->p_lock; } } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. */ void swapper(void) { struct proc *p, *pp; struct thread *td; int ppri, pri, slptime, swtime; loop: if (vm_page_count_min()) { - VM_WAIT; + vm_wait_min(); goto loop; } pp = NULL; ppri = INT_MIN; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW || p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) { PROC_UNLOCK(p); continue; } swtime = (ticks - p->p_swtick) / hz; FOREACH_THREAD_IN_PROC(p, td) { /* * An otherwise runnable thread of a process * swapped out has only the TDI_SWAPPED bit set. */ thread_lock(td); if (td->td_inhibitors == TDI_SWAPPED) { slptime = (ticks - td->td_slptick) / hz; pri = swtime + slptime; if ((td->td_flags & TDF_SWAPINREQ) == 0) pri -= p->p_nice * 8; /* * if this thread is higher priority * and there is enough space, then select * this process instead of the previous * selection. */ if (pri > ppri) { pp = p; ppri = pri; } } thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* * Nothing to do, back to sleep. */ if ((p = pp) == NULL) { tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2); goto loop; } PROC_LOCK(p); /* * Another process may be bringing or may have already * brought this process in while we traverse all threads. * Or, this process may even be being swapped out again. */ if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) { PROC_UNLOCK(p); goto loop; } /* * We would like to bring someone in. */ faultin(p); PROC_UNLOCK(p); goto loop; } /* * First, if any processes have been sleeping or stopped for at least * "swap_idle_threshold1" seconds, they are swapped out. If, however, * no such processes exist, then the longest-sleeping or stopped * process is swapped out. Finally, and only as a last resort, if * there are no sleeping or stopped processes, the longest-resident * process is swapped out. */ static void swapout_procs(int action) { struct proc *p; struct thread *td; int slptime; bool didswap, doswap; MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0); didswap = false; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { /* * Filter out not yet fully constructed processes. Do * not swap out held processes. Avoid processes which * are system, exiting, execing, traced, already swapped * out or are in the process of being swapped in or out. */ PROC_LOCK(p); if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag & (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE | P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) != P_INMEM) { PROC_UNLOCK(p); continue; } /* * Further consideration of this process for swap out * requires iterating over its threads. We release * allproc_lock here so that process creation and * destruction are not blocked while we iterate. * * To later reacquire allproc_lock and resume * iteration over the allproc list, we will first have * to release the lock on the process. We place a * hold on the process so that it remains in the * allproc list while it is unlocked. */ _PHOLD_LITE(p); sx_sunlock(&allproc_lock); /* * Do not swapout a realtime process. * Guarantee swap_idle_threshold1 time in memory. * If the system is under memory stress, or if we are * swapping idle processes >= swap_idle_threshold2, * then swap the process out. */ doswap = true; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); slptime = (ticks - td->td_slptick) / hz; if (PRI_IS_REALTIME(td->td_pri_class) || slptime < swap_idle_threshold1 || !thread_safetoswapout(td) || ((action & VM_SWAP_NORMAL) == 0 && slptime < swap_idle_threshold2)) doswap = false; thread_unlock(td); if (!doswap) break; } if (doswap && swapout(p) == 0) didswap = true; PROC_UNLOCK(p); sx_slock(&allproc_lock); PRELE(p); } sx_sunlock(&allproc_lock); /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. */ if (didswap) wakeup(&proc0); } static void swapclear(struct proc *p) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); td->td_flags |= TDF_INMEM; td->td_flags &= ~TDF_SWAPINREQ; TD_CLR_SWAPPED(td); if (TD_CAN_RUN(td)) if (setrunnable(td)) { #ifdef INVARIANTS /* * XXX: We just cleared TDI_SWAPPED * above and set TDF_INMEM, so this * should never happen. */ panic("not waking up swapper"); #endif } thread_unlock(td); } p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT); p->p_flag |= P_INMEM; } static int swapout(struct proc *p) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * The states of this process and its threads may have changed * by now. Assuming that there is only one pageout daemon thread, * this process should still be in memory. */ KASSERT((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) == P_INMEM, ("swapout: lost a swapout race?")); /* * Remember the resident count. */ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace); /* * Check and mark all threads before we proceed. */ p->p_flag &= ~P_INMEM; p->p_flag |= P_SWAPPINGOUT; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!thread_safetoswapout(td)) { thread_unlock(td); swapclear(p); return (EBUSY); } td->td_flags &= ~TDF_INMEM; TD_SET_SWAPPED(td); thread_unlock(td); } td = FIRST_THREAD_IN_PROC(p); ++td->td_ru.ru_nswap; PROC_UNLOCK(p); /* * This list is stable because all threads are now prevented from * running. The list is only modified in the context of a running * thread in this process. */ FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapout(td); PROC_LOCK(p); p->p_flag &= ~P_SWAPPINGOUT; p->p_swtick = ticks; return (0); }