diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 7b7e5d18e16d..46f49d24967b 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1,2574 +1,2576 @@ /*- * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * New Swap System * Matthew Dillon * * Radix Bitmap 'blists'. * * - The new swapper uses the new radix bitmap code. This should scale * to arbitrarily small or arbitrarily large swap spaces and an almost * arbitrary degree of fragmentation. * * Features: * * - on the fly reallocation of swap during putpages. The new system * does not try to keep previously allocated swap blocks for dirty * pages. * * - on the fly deallocation of swap * * - No more garbage collection required. Unnecessarily allocated swap * blocks only exist for dirty vm_page_t's now and these are already * cycled (in a high-load system) by the pager. We also do on-the-fly * removal of invalidated swap blocks when a page is destroyed * or renamed. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_swap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, or 16 * pages per allocation. We recommend you stick with the default of 8. * The 16-page limit is due to the radix code (kern/subr_blist.c). */ #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 16 #endif #if !defined(SWB_NPAGES) #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif /* * Piecemeal swap metadata structure. Swap is stored in a radix tree. * * If SWB_NPAGES is 8 and sizeof(char *) == sizeof(daddr_t), our radix * is basically 8. Assuming PAGE_SIZE == 4096, one tree level represents * 32K worth of data, two levels represent 256K, three levels represent * 2 MBytes. This is acceptable. * * Overall memory utilization is about the same as the old swap structure. */ #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t)) #define SWAP_META_PAGES (SWB_NPAGES * 2) #define SWAP_META_MASK (SWAP_META_PAGES - 1) typedef int32_t swblk_t; /* * swap offset. This is the type used to * address the "virtual swap device" and * therefore the maximum swap space is * 2^32 pages. */ struct swdevt; typedef void sw_strategy_t(struct buf *bp, struct swdevt *sw); typedef void sw_close_t(struct thread *td, struct swdevt *sw); /* * Swap device table */ struct swdevt { int sw_flags; int sw_nblks; int sw_used; dev_t sw_dev; struct vnode *sw_vp; void *sw_id; swblk_t sw_first; swblk_t sw_end; struct blist *sw_blist; TAILQ_ENTRY(swdevt) sw_list; sw_strategy_t *sw_strategy; sw_close_t *sw_close; }; #define SW_CLOSING 0x04 struct swblock { struct swblock *swb_hnext; vm_object_t swb_object; vm_pindex_t swb_index; int swb_count; daddr_t swb_pages[SWAP_META_PAGES]; }; static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static int swdev_syscall_active = 0; /* serialize swap(on|off) */ static void swapdev_strategy(struct buf *, struct swdevt *sw); #define SWM_FREE 0x02 /* free, period */ #define SWM_POP 0x04 /* pop out */ int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static int nsw_rcount; /* free read buffers */ static int nsw_wcount_sync; /* limit write buffers / synchronous */ static int nsw_wcount_async; /* limit write buffers / asynchronous */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ static struct swblock **swhash; static int swhash_mask; static struct mtx swhash_mtx; static int swap_async_max = 4; /* maximum in-progress async I/O's */ static struct sx sw_alloc_sx; SYSCTL_INT(_vm, OID_AUTO, swap_async_max, CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops"); /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. */ #define NOBJLISTS 8 #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct mtx sw_alloc_mtx; /* protect list manipulation */ static struct pagerlst swap_pager_object_list[NOBJLISTS]; static uma_zone_t swap_zone; static struct vm_object swap_zone_obj; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure * calls hooked from other parts of the VM system and do not appear here. * (see vm/swap_pager.h). */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset); static void swap_pager_dealloc(vm_object_t object); static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); static void swap_pager_init(void); static void swap_pager_unswapped(vm_page_t); static void swap_pager_swapoff(struct swdevt *sp); struct pagerops swappagerops = { .pgo_init = swap_pager_init, /* early system initialization of pager */ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ }; /* * dmmax is in page-sized chunks with the new swap system. It was * dev-bsized chunks in the old. dmmax is always a power of 2. * * swap_*() routines are externally accessible. swp_*() routines are * internal. */ static int dmmax; static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block"); static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); static int swapongeom(struct thread *, struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct thread *td); /* * Swap bitmap functions */ static void swp_pager_freeswapspace(daddr_t blk, int npages); static daddr_t swp_pager_getswapspace(int npages); /* * Metadata functions */ static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index); static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int); /* * SWP_SIZECHECK() - update swap_pager_full indication * * update the swap_pager_almost_full indication and warn when we are * about to run out of swap space, using lowat/hiwat hysteresis. * * Clear swap_pager_full ( task killing ) indication when lowat is met. * * No restrictions on call * This routine may not block. * This routine must be called at splvm() */ static void swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { if (swap_pager_almost_full == 0) { printf("swap_pager: out of swap space\n"); swap_pager_almost_full = 1; } } else { swap_pager_full = 0; if (swap_pager_avail > nswap_hiwat) swap_pager_almost_full = 0; } } /* * SWP_PAGER_HASH() - hash swap meta data * * This is an helper function which hashes the swapblk given * the object and page index. It returns a pointer to a pointer * to the object, or a pointer to a NULL pointer if it could not * find a swapblk. * * This routine must be called at splvm(). */ static struct swblock ** swp_pager_hash(vm_object_t object, vm_pindex_t index) { struct swblock **pswap; struct swblock *swap; index &= ~(vm_pindex_t)SWAP_META_MASK; pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask]; while ((swap = *pswap) != NULL) { if (swap->swb_object == object && swap->swb_index == index ) { break; } pswap = &swap->swb_hnext; } return (pswap); } /* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run * before much else so be careful what you depend on. Most of the VM * system has yet to be initialized at this point. */ static void swap_pager_init(void) { /* * Initialize object lists */ int i; for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); /* * Device Stripe, in PAGE_SIZE'd blocks */ dmmax = SWB_NPAGES * 2; } /* * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process * * Expected to be started from pageout process once, prior to entering * its main loop. */ void swap_pager_swap_init(void) { int n, n2; /* * Number of in-transit swap bp operations. Don't * exhaust the pbufs completely. Make sure we * initialize workable values (0 will work for hysteresis * but it isn't very efficient). * * The nsw_cluster_max is constrained by the bp->b_pages[] * array (MAXPHYS/PAGE_SIZE) and our locally defined * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are * constrained by the swap device interleave stripe size. * * Currently we hardwire nsw_wcount_async to 4. This limit is * designed to prevent other I/O from having high latencies due to * our pageout I/O. The value 4 works well for one or two active swap * devices but is probably a little low if you have more. Even so, * a higher value would probably generate only a limited improvement * with three or four active swap devices since the system does not * typically have to pageout at extreme bandwidths. We will want * at least 2 per swap devices, and 4 is a pretty good value if you * have one NFS swap device due to the command/ack latency over NFS. * So it all works out pretty well. */ nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); mtx_lock(&pbuf_mtx); nsw_rcount = (nswbuf + 1) / 2; nsw_wcount_sync = (nswbuf + 3) / 4; nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; mtx_unlock(&pbuf_mtx); /* * Initialize our zone. Right now I'm just guessing on the number * we need based on the number of pages in the system. Each swblock * can hold 16 pages, so this is probably overkill. This reservation * is typically limited to around 32MB by default. */ n = cnt.v_page_count / 2; if (maxswzone && n > maxswzone / sizeof(struct swblock)) n = maxswzone / sizeof(struct swblock); n2 = n; swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); if (swap_zone == NULL) panic("failed to create swap_zone."); do { if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n)) break; /* * if the allocation failed, try a zone two thirds the * size of the previous attempt. */ n -= ((n + 2) / 3); } while (n > 0); if (n2 != n) printf("Swap zone entries reduced from %d to %d.\n", n2, n); n2 = n; /* * Initialize our meta-data hash table. The swapper does not need to * be quite as efficient as the VM system, so we do not use an * oversized hash table. * * n: size of hash table, must be power of 2 * swhash_mask: hash table index mask */ for (n = 1; n < n2 / 8; n *= 2) ; swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO); swhash_mask = n - 1; mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF); } /* * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate * its metadata structures. * * This routine is called from the mmap and fork code to create a new * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object * and then converting it with swp_pager_meta_build(). * * This routine may block in vm_object_allocate() and create a named * object lookup race, so we must interlock. We must also run at * splvm() for the object lookup to handle races with interrupts, but * we do not have to maintain splvm() in between the lookup and the * add because (I believe) it is not possible to attempt to create * a new swap object w/handle when a default object with that handle * already exists. * * MPSAFE */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset) { vm_object_t object; vm_pindex_t pindex; pindex = OFF_TO_IDX(offset + PAGE_MASK + size); if (handle) { mtx_lock(&Giant); /* * Reference existing named region or allocate new one. There * should not be a race here against swp_pager_meta_build() * as called from vm_page_remove() in regards to the lookup * of the handle. */ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object != NULL) { vm_object_reference(object); } else { object = vm_object_allocate(OBJT_DEFAULT, pindex); object->handle = handle; VM_OBJECT_LOCK(object); swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_UNLOCK(object); } sx_xunlock(&sw_alloc_sx); mtx_unlock(&Giant); } else { object = vm_object_allocate(OBJT_DEFAULT, pindex); VM_OBJECT_LOCK(object); swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_UNLOCK(object); } return (object); } /* * SWAP_PAGER_DEALLOC() - remove swap metadata from object * * The swap backing for the object is destroyed. The code is * designed such that we can reinstantiate it later, but this * routine is typically called only when the entire object is * about to be destroyed. * * This routine may block, but no longer does. * * The object must be locked or unreferenceable. */ static void swap_pager_dealloc(vm_object_t object) { /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); mtx_unlock(&sw_alloc_mtx); } VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); vm_object_pip_wait(object, "swpdea"); /* * Free all remaining metadata. We only bother to free it from * the swap meta data. We do not attempt to free swapblk's still * associated with vm_page_t's for this object. We do not care * if paging is still in progress on some objects. */ swp_pager_meta_free_all(object); } /************************************************************************ * SWAP PAGER BITMAP ROUTINES * ************************************************************************/ /* * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space * * Allocate swap for the requested number of pages. The starting * swap block number (a page index) is returned or SWAPBLK_NONE * if the allocation failed. * * Also has the side effect of advising that somebody made a mistake * when they configured swap and didn't configure enough. * * Must be called at splvm() to avoid races with bitmap frees from * vm_page_remove() aka swap_pager_page_removed(). * * This routine may not block * This routine must be called at splvm(). * * We allocate in round-robin fashion from the configured devices. */ static daddr_t swp_pager_getswapspace(int npages) { daddr_t blk; struct swdevt *sp; int i; blk = SWAPBLK_NONE; mtx_lock(&sw_dev_mtx); sp = swdevhd; for (i = 0; i < nswapdev; i++) { if (sp == NULL) sp = TAILQ_FIRST(&swtailq); if (!(sp->sw_flags & SW_CLOSING)) { blk = blist_alloc(sp->sw_blist, npages); if (blk != SWAPBLK_NONE) { blk += sp->sw_first; sp->sw_used += npages; swap_pager_avail -= npages; swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); goto done; } } sp = TAILQ_NEXT(sp, sw_list); } if (swap_pager_full != 2) { printf("swap_pager_getswapspace(%d): failed\n", npages); swap_pager_full = 2; swap_pager_almost_full = 1; } swdevhd = NULL; done: mtx_unlock(&sw_dev_mtx); return (blk); } static int swp_pager_isondev(daddr_t blk, struct swdevt *sp) { return (blk >= sp->sw_first && blk < sp->sw_end); } static void swp_pager_strategy(struct buf *bp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) { mtx_unlock(&sw_dev_mtx); sp->sw_strategy(bp, sp); return; } } panic("Swapdev not found"); } /* * SWP_PAGER_FREESWAPSPACE() - free raw swap space * * This routine returns the specified swap blocks back to the bitmap. * * Note: This routine may not block (it could in the old swap code), * and through the use of the new blist routines it does not block. * * We must be called at splvm() to avoid races with bitmap frees from * vm_page_remove() aka swap_pager_page_removed(). * * This routine may not block * This routine must be called at splvm(). */ static void swp_pager_freeswapspace(daddr_t blk, int npages) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (blk >= sp->sw_first && blk < sp->sw_end) { sp->sw_used -= npages; /* * If we are attempting to stop swapping on * this device, we don't want to mark any * blocks free lest they be reused. */ if ((sp->sw_flags & SW_CLOSING) == 0) { blist_free(sp->sw_blist, blk - sp->sw_first, npages); swap_pager_avail += npages; swp_sizecheck(); } mtx_unlock(&sw_dev_mtx); return; } } panic("Swapdev not found"); } /* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * * This is a globally accessible routine. * * This routine removes swapblk assignments from swap metadata. * * The external callers of this routine typically have already destroyed * or renamed vm_page_t's associated with this range in the object so * we should be ok. * * This routine may be called at any spl. We up our spl to splvm temporarily * in order to perform the metadata removal. */ void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); swp_pager_meta_free(object, start, size); } /* * SWAP_PAGER_RESERVE() - reserve swap blocks in object * * Assigns swap blocks to the specified range within the object. The * swap blocks are not zerod. Any previous swap assignment is destroyed. * * Returns 0 on success, -1 on failure. */ int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) { int n = 0; daddr_t blk = SWAPBLK_NONE; vm_pindex_t beg = start; /* save start index */ VM_OBJECT_LOCK(object); while (size) { if (n == 0) { n = BLIST_MAX_ALLOC; while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) { n >>= 1; if (n == 0) { swp_pager_meta_free(object, beg, start - beg); VM_OBJECT_UNLOCK(object); return (-1); } } } swp_pager_meta_build(object, start, blk); --size; ++start; ++blk; --n; } swp_pager_meta_free(object, start, n); VM_OBJECT_UNLOCK(object); return (0); } /* * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager * and destroy the source. * * Copy any valid swapblks from the source to the destination. In * cases where both the source and destination have a valid swapblk, * we keep the destination's. * * This routine is allowed to block. It may block allocating metadata * indirectly through swp_pager_meta_build() or if paging is still in * progress on the source. * * This routine can be called at any spl * * XXX vm_page_collapse() kinda expects us not to block because we * supposedly do not need to allocate memory, but for the moment we * *may* have to get a little memory from the zone allocator, but * it is taken from the interrupt memory. We should be ok. * * The source object contains no vm_page_t's (which is just as well) * * The source object is of type OBJT_SWAP. * * The source and destination objects must be locked or * inaccessible (XXX are they ?) */ void swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t offset, int destroysource) { vm_pindex_t i; VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED); VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED); /* * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ if (destroysource) { if (srcobject->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_REMOVE( NOBJLIST(srcobject->handle), srcobject, pager_object_list ); mtx_unlock(&sw_alloc_mtx); } } /* * transfer source to destination. */ for (i = 0; i < dstobject->size; ++i) { daddr_t dstaddr; /* * Locate (without changing) the swapblk on the destination, * unless it is invalid in which case free it silently, or * if the destination is a resident page, in which case the * source is thrown away. */ dstaddr = swp_pager_meta_ctl(dstobject, i, 0); if (dstaddr == SWAPBLK_NONE) { /* * Destination has no swapblk and is not resident, * copy source. */ daddr_t srcaddr; srcaddr = swp_pager_meta_ctl( srcobject, i + offset, SWM_POP ); if (srcaddr != SWAPBLK_NONE) { /* * swp_pager_meta_build() can sleep. */ vm_object_pip_add(srcobject, 1); VM_OBJECT_UNLOCK(srcobject); vm_object_pip_add(dstobject, 1); swp_pager_meta_build(dstobject, i, srcaddr); vm_object_pip_wakeup(dstobject); VM_OBJECT_LOCK(srcobject); vm_object_pip_wakeup(srcobject); } } else { /* * Destination has valid swapblk or it is represented * by a resident page. We destroy the sourceblock. */ swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE); } } /* * Free left over swap blocks in source. * * We have to revert the type to OBJT_DEFAULT so we do not accidently * double-remove the object from the swap queues. */ if (destroysource) { swp_pager_meta_free_all(srcobject); /* * Reverting the type is not necessary, the caller is going * to destroy srcobject directly, but I'm doing it here * for consistency since we've removed the object from its * queues. */ srcobject->type = OBJT_DEFAULT; } } /* * SWAP_PAGER_HASPAGE() - determine if we have good backing store for * the requested page. * * We determine whether good backing store exists for the requested * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing * store exists before and after the requested page within a reasonable * distance. We do not try to restrict it to the swap device stripe * (that is handled in getpages/putpages). It probably isn't worth * doing here. */ static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { daddr_t blk0; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * do we have good backing store at the requested index ? */ blk0 = swp_pager_meta_ctl(object, pindex, 0); if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; if (after) *after = 0; return (FALSE); } /* * find backwards-looking contiguous good backing store */ if (before != NULL) { int i; for (i = 1; i < (SWB_NPAGES/2); ++i) { daddr_t blk; if (i > pindex) break; blk = swp_pager_meta_ctl(object, pindex - i, 0); if (blk != blk0 - i) break; } *before = (i - 1); } /* * find forward-looking contiguous good backing store */ if (after != NULL) { int i; for (i = 1; i < (SWB_NPAGES/2); ++i) { daddr_t blk; blk = swp_pager_meta_ctl(object, pindex + i, 0); if (blk != blk0 + i) break; } *after = (i - 1); } return (TRUE); } /* * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page * * This removes any associated swap backing store, whether valid or * not, from the page. * * This routine is typically called when a page is made dirty, at * which point any associated swap can be freed. MADV_FREE also * calls us in a special-case situation * * NOTE!!! If the page is clean and the swap was valid, the caller * should make the page dirty before calling this routine. This routine * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * * This routine may not block * This routine must be called at splvm() */ static void swap_pager_unswapped(vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE); } /* * SWAP_PAGER_GETPAGES() - bring pages in from swap * * Attempt to retrieve (m, count) pages from backing store, but make * sure we retrieve at least m[reqpage]. We try to load in as large * a chunk surrounding m[reqpage] as is contiguous in swap and which * belongs to the same object. * * The code is designed for asynchronous operation and * immediate-notification of 'reqpage' but tends not to be * used that way. Please do not optimize-out this algorithmic * feature, I intend to improve on it in the future. * * The parent has a single vm_object_pip_add() reference prior to * calling us and we should return with the same. * * The parent has BUSY'd the pages. We should return with 'm' * left busy, but the others adjusted. */ static int swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) { struct buf *bp; vm_page_t mreq; int i; int j; daddr_t blk; mreq = m[reqpage]; KASSERT(mreq->object == object, ("swap_pager_getpages: object mismatch %p/%p", object, mreq->object)); /* * Calculate range to retrieve. The pages have already been assigned * their swapblks. We require a *contiguous* range but we know it to * not span devices. If we do not supply it, bad things * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the * loops are set up such that the case(s) are handled implicitly. * * The swp_*() calls must be made at splvm(). vm_page_free() does * not need to be, but it will go a little faster if it is. */ blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0); for (i = reqpage - 1; i >= 0; --i) { daddr_t iblk; iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0); if (blk != iblk + (reqpage - i)) break; } ++i; for (j = reqpage + 1; j < count; ++j) { daddr_t jblk; jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0); if (blk != jblk - (j - reqpage)) break; } /* * free pages outside our collection range. Note: we never free * mreq, it must remain busy throughout. */ if (0 < i || j < count) { int k; vm_page_lock_queues(); for (k = 0; k < i; ++k) vm_page_free(m[k]); for (k = j; k < count; ++k) vm_page_free(m[k]); vm_page_unlock_queues(); } /* * Return VM_PAGER_FAIL if we have nothing to do. Return mreq * still busy, but the others unbusied. */ if (blk == SWAPBLK_NONE) return (VM_PAGER_FAIL); /* * Getpbuf() can sleep. */ VM_OBJECT_UNLOCK(object); /* * Get a swap buffer header to perform the IO */ bp = getpbuf(&nsw_rcount); bp->b_flags |= B_PAGING; /* * map our page(s) into kva for input */ pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i); bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_blkno = blk - (reqpage - i); bp->b_bcount = PAGE_SIZE * (j - i); bp->b_bufsize = PAGE_SIZE * (j - i); bp->b_pager.pg_reqpage = reqpage - i; VM_OBJECT_LOCK(object); { int k; for (k = i; k < j; ++k) { bp->b_pages[k - i] = m[k]; m[k]->oflags |= VPO_SWAPINPROG; } } bp->b_npages = j - i; cnt.v_swapin++; cnt.v_swappgsin += bp->b_npages; /* * We still hold the lock on mreq, and our automatic completion routine * does not remove it. */ vm_object_pip_add(object, bp->b_npages); VM_OBJECT_UNLOCK(object); /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * * The other pages in our m[] array are also released on completion, * so we cannot assume they are valid anymore either. * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ BUF_KERNPROC(bp); swp_pager_strategy(bp); /* * wait for the page we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE * is set in the meta-data. */ VM_OBJECT_LOCK(object); while ((mreq->oflags & VPO_SWAPINPROG) != 0) { mreq->oflags |= VPO_WANTED; vm_page_lock_queues(); vm_page_flag_set(mreq, PG_REFERENCED); vm_page_unlock_queues(); cnt.v_intrans++; if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) { printf( "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); } } /* * mreq is left busied after completion, but all the other pages * are freed. If we had an unrecoverable read error the page will * not be valid. */ if (mreq->valid != VM_PAGE_BITS_ALL) { return (VM_PAGER_ERROR); } else { return (VM_PAGER_OK); } /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark * the page clean when we return, causing the page to possibly revert * to all-zero's later. */ } /* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. * * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects * are automatically converted to SWAP objects. * * In a low memory situation we may block in VOP_STRATEGY(), but the new * vm_page reservation system coupled with properly written VFS devices * should ensure that no low-memory deadlock occurs. This is an area * which needs work. * * The parent has N vm_object_pip_add() references prior to * calling us and will remove references for rtvals[] that are * not set to VM_PAGER_PEND. We need to remove the rest on I/O * completion. * * The parent has soft-busy'd the pages it passes us and will unbusy * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. */ void swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync, int *rtvals) { int i; int n = 0; GIANT_REQUIRED; if (count && m[0]->object != object) { panic("swap_pager_getpages: object mismatch %p/%p", object, m[0]->object ); } /* * Step 1 * * Turn object into OBJT_SWAP * check for bogus sysops * force sync if not pageout process */ if (object->type != OBJT_SWAP) swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_UNLOCK(object); if (curproc != pageproc) sync = TRUE; /* * Step 2 * * Update nsw parameters from swap_async_max sysctl values. * Do not let the sysop crash the machine with bogus numbers. */ mtx_lock(&pbuf_mtx); if (swap_async_max != nsw_wcount_async_max) { int n; /* * limit range */ if ((n = swap_async_max) > nswbuf / 2) n = nswbuf / 2; if (n < 1) n = 1; swap_async_max = n; /* * Adjust difference ( if possible ). If the current async * count is too low, we may not be able to make the adjustment * at this time. */ n -= nsw_wcount_async_max; if (nsw_wcount_async + n >= 0) { nsw_wcount_async += n; nsw_wcount_async_max += n; wakeup(&nsw_wcount_async); } } mtx_unlock(&pbuf_mtx); /* * Step 3 * * Assign swap blocks and issue I/O. We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. */ for (i = 0; i < count; i += n) { int j; struct buf *bp; daddr_t blk; /* * Maximum I/O size is limited by a number of factors. */ n = min(BLIST_MAX_ALLOC, count - i); n = min(n, nsw_cluster_max); /* * Get biggest block of swap we can. If we fail, fall * back and try to allocate a smaller block. Don't go * overboard trying to allocate space if it would overly * fragment swap. */ while ( (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE && n > 4 ) { n >>= 1; } if (blk == SWAPBLK_NONE) { for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_FAIL; continue; } /* * All I/O parameters have been satisfied, build the I/O * request and assign the swap space. */ if (sync == TRUE) { bp = getpbuf(&nsw_wcount_sync); } else { bp = getpbuf(&nsw_wcount_async); bp->b_flags = B_ASYNC; } bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; pmap_qenter((vm_offset_t)bp->b_data, &m[i], n); bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; bp->b_bufsize = PAGE_SIZE * n; bp->b_blkno = blk; VM_OBJECT_LOCK(object); for (j = 0; j < n; ++j) { vm_page_t mreq = m[i+j]; swp_pager_meta_build( mreq->object, mreq->pindex, blk + j ); vm_page_dirty(mreq); rtvals[i+j] = VM_PAGER_OK; mreq->oflags |= VPO_SWAPINPROG; bp->b_pages[j] = mreq; } VM_OBJECT_UNLOCK(object); bp->b_npages = n; /* * Must set dirty range for NFS to work. */ bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; cnt.v_swapout++; cnt.v_swappgsout += bp->b_npages; /* * asynchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ if (sync == FALSE) { bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_PEND; /* restart outter loop */ continue; } /* * synchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ bp->b_iodone = bdone; swp_pager_strategy(bp); /* * Wait for the sync I/O to complete, then update rtvals. * We just set the rtvals[] to VM_PAGER_PEND so we can call * our async completion routine at the end, thus avoiding a * double-free. */ bwait(bp, PVM, "swwrt"); for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_PEND; /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. */ swp_pager_async_iodone(bp); } VM_OBJECT_LOCK(object); } /* * swp_pager_async_iodone: * * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * * For READ operations, the pages are PG_BUSY'd. For WRITE operations, * the pages are vm_page_t->busy'd. For READ operations, we PG_BUSY * unbusy all pages except the 'main' request page. For WRITE * operations, we vm_page_t->busy'd unbusy all pages ( we can do this * because we marked them all VM_PAGER_PEND on return from putpages ). * * This routine may not block. * This routine is called at splbio() or better * * We up ourselves to splvm() as required for various vm_page related * calls. */ static void swp_pager_async_iodone(struct buf *bp) { int i; vm_object_t object = NULL; /* * report error */ if (bp->b_ioflags & BIO_ERROR) { printf( "swap_pager: I/O error - %s failed; blkno %ld," "size %ld, error %d\n", ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error ); } /* * remove the mapping for kernel virtual */ pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); if (bp->b_npages) { object = bp->b_pages[0]->object; VM_OBJECT_LOCK(object); } vm_page_lock_queues(); /* * cleanup pages. If an error occurs writing to swap, we are in * very serious trouble. If it happens to be a disk error, though, * we may be able to recover by reassigning the swap later on. So * in this case we remove the m->swapblk assignment for the page * but do not free it in the rlist. The errornous block(s) are thus * never reallocated as swap. Redirty the page and continue. */ for (i = 0; i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; m->oflags &= ~VPO_SWAPINPROG; if (bp->b_ioflags & BIO_ERROR) { /* * If an error occurs I'd love to throw the swapblk * away without freeing it back to swapspace, so it * can never be used again. But I can't from an * interrupt. */ if (bp->b_iocmd == BIO_READ) { /* * When reading, reqpage needs to stay * locked for the parent, but all other * pages can be freed. We still want to * wakeup the parent waiting on the page, * though. ( also: pg_reqpage can be -1 and * not match anything ). * * We have to wake specifically requested pages * up too because we cleared VPO_SWAPINPROG and * someone may be waiting for that. * * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. */ m->valid = 0; if (i != bp->b_pager.pg_reqpage) vm_page_free(m); else vm_page_flash(m); /* * If i == bp->b_pager.pg_reqpage, do not wake * the page up. The caller needs to. */ } else { /* * If a write error occurs, reactivate page * so it doesn't clog the inactive list, * then finish the I/O. */ vm_page_dirty(m); vm_page_activate(m); vm_page_io_finish(m); } } else if (bp->b_iocmd == BIO_READ) { /* * For read success, clear dirty bits. Nobody should * have this page mapped but don't take any chances, * make sure the pmap modify bits are also cleared. * * NOTE: for reads, m->dirty will probably be * overridden by the original caller of getpages so * we cannot set them in order to free the underlying * swap in a low-swap situation. I don't think we'd * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. * * If not the requested page then deactivate it. * * Note that the requested page, reqpage, is left * busied, but we still have to wake it up. The * other pages are released (unbusied) by * vm_page_wakeup(). We do not set reqpage's * valid bits here, it is up to the caller. */ pmap_clear_modify(m); m->valid = VM_PAGE_BITS_ALL; vm_page_undirty(m); /* * We have to wake specifically requested pages * up too because we cleared VPO_SWAPINPROG and * could be waiting for it in getpages. However, * be sure to not unbusy getpages specifically * requested page - getpages expects it to be * left busy. */ if (i != bp->b_pager.pg_reqpage) { vm_page_deactivate(m); vm_page_wakeup(m); } else { vm_page_flash(m); } } else { /* * For write success, clear the modify and dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). */ pmap_clear_modify(m); vm_page_undirty(m); vm_page_io_finish(m); if (vm_page_count_severe()) vm_page_try_to_cache(m); } } vm_page_unlock_queues(); /* * adjust pip. NOTE: the original parent may still have its own * pip refs on the object. */ if (object != NULL) { vm_object_pip_wakeupn(object, bp->b_npages); VM_OBJECT_UNLOCK(object); } /* * swapdev_strategy() manually sets b_vp and b_bufobj before calling * bstrategy(). Set them back to NULL now we're done with it, or we'll * trigger a KASSERT in relpbuf(). */ if (bp->b_vp) { bp->b_vp = NULL; bp->b_bufobj = NULL; } /* * release the physical I/O buffer */ relpbuf( bp, ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : ((bp->b_flags & B_ASYNC) ? &nsw_wcount_async : &nsw_wcount_sync ) ) ); } /* * swap_pager_isswapped: * * Return 1 if at least one page in the given object is paged * out to the given swap device. * * This routine may not block. */ int swap_pager_isswapped(vm_object_t object, struct swdevt *sp) { daddr_t index = 0; int bcount; int i; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_SWAP) return (0); mtx_lock(&swhash_mtx); for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) { struct swblock *swap; if ((swap = *swp_pager_hash(object, index)) != NULL) { for (i = 0; i < SWAP_META_PAGES; ++i) { if (swp_pager_isondev(swap->swb_pages[i], sp)) { mtx_unlock(&swhash_mtx); return (1); } } } index += SWAP_META_PAGES; if (index > 0x20000000) panic("swap_pager_isswapped: failed to locate all swap meta blocks"); } mtx_unlock(&swhash_mtx); return (0); } /* * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in * * This routine dissociates the page at the given index within a * swap block from its backing store, paging it in if necessary. * If the page is paged in, it is placed in the inactive queue, * since it had its backing store ripped out from under it. * We also attempt to swap in all other pages in the swap block, * we only guarantee that the one at the specified index is * paged in. * * XXX - The code to page the whole block in doesn't work, so we * revert to the one-by-one behavior for now. Sigh. */ static inline void swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; vm_object_pip_add(object, 1); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY); if (m->valid == VM_PAGE_BITS_ALL) { vm_object_pip_subtract(object, 1); vm_page_lock_queues(); vm_page_activate(m); vm_page_dirty(m); vm_page_unlock_queues(); vm_page_wakeup(m); vm_pager_page_unswapped(m); return; } if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK) panic("swap_pager_force_pagein: read from swap failed");/*XXX*/ vm_object_pip_subtract(object, 1); vm_page_lock_queues(); vm_page_dirty(m); vm_page_dontneed(m); vm_page_unlock_queues(); vm_page_wakeup(m); vm_pager_page_unswapped(m); } /* * swap_pager_swapoff: * * Page in all of the pages that have been paged out to the * given device. The corresponding blocks in the bitmap must be * marked as allocated and the device must be flagged SW_CLOSING. * There may be no processes swapped out to the device. * * This routine may block. */ static void swap_pager_swapoff(struct swdevt *sp) { struct swblock *swap; int i, j, retries; GIANT_REQUIRED; retries = 0; full_rescan: mtx_lock(&swhash_mtx); for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */ restart: for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) { vm_object_t object = swap->swb_object; vm_pindex_t pindex = swap->swb_index; for (j = 0; j < SWAP_META_PAGES; ++j) { if (swp_pager_isondev(swap->swb_pages[j], sp)) { /* avoid deadlock */ if (!VM_OBJECT_TRYLOCK(object)) { break; } else { mtx_unlock(&swhash_mtx); swp_pager_force_pagein(object, pindex + j); VM_OBJECT_UNLOCK(object); mtx_lock(&swhash_mtx); goto restart; } } } } } mtx_unlock(&swhash_mtx); if (sp->sw_used) { int dummy; /* * Objects may be locked or paging to the device being * removed, so we will miss their pages and need to * make another pass. We have marked this device as * SW_CLOSING, so the activity should finish soon. */ retries++; if (retries > 100) { panic("swapoff: failed to locate %d swap blocks", sp->sw_used); } tsleep(&dummy, PVM, "swpoff", hz / 20); goto full_rescan; } } /************************************************************************ * SWAP META DATA * ************************************************************************ * * These routines manipulate the swap metadata stored in the * OBJT_SWAP object. All swp_*() routines must be called at * splvm() because swap can be freed up by the low level vm_page * code which might be called from interrupts beyond what splbio() covers. * * Swap metadata is implemented with a global hash and not directly * linked into the object. Instead the object simply contains * appropriate tracking counters. */ /* * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object * * We first convert the object to a swap object if it is a default * object. * * The specified swapblk is added to the object's swap metadata. If * the swapblk is not valid, it is freed instead. Any previously * assigned swapblk is freed. * * This routine must be called at splvm(), except when used to convert * an OBJT_DEFAULT object into an OBJT_SWAP object. */ static void swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { struct swblock *swap; struct swblock **pswap; int idx; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * Convert default object to swap object if necessary */ if (object->type != OBJT_SWAP) { object->type = OBJT_SWAP; object->un_pager.swp.swp_bcount = 0; if (object->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_INSERT_TAIL( NOBJLIST(object->handle), object, pager_object_list ); mtx_unlock(&sw_alloc_mtx); } } /* * Locate hash entry. If not found create, but if we aren't adding * anything just return. If we run out of space in the map we wait * and, since the hash table may have changed, retry. */ retry: mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) == NULL) { int i; if (swapblk == SWAPBLK_NONE) goto done; swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT); if (swap == NULL) { mtx_unlock(&swhash_mtx); VM_OBJECT_UNLOCK(object); + if (uma_zone_exhausted(swap_zone)) + printf("swap zone exhausted, increase kern.maxswzone\n"); VM_WAIT; VM_OBJECT_LOCK(object); goto retry; } swap->swb_hnext = NULL; swap->swb_object = object; swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK; swap->swb_count = 0; ++object->un_pager.swp.swp_bcount; for (i = 0; i < SWAP_META_PAGES; ++i) swap->swb_pages[i] = SWAPBLK_NONE; } /* * Delete prior contents of metadata */ idx = pindex & SWAP_META_MASK; if (swap->swb_pages[idx] != SWAPBLK_NONE) { swp_pager_freeswapspace(swap->swb_pages[idx], 1); --swap->swb_count; } /* * Enter block into metadata */ swap->swb_pages[idx] = swapblk; if (swapblk != SWAPBLK_NONE) ++swap->swb_count; done: mtx_unlock(&swhash_mtx); } /* * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata * * The requested range of blocks is freed, with any associated swap * returned to the swap bitmap. * * This routine will free swap metadata structures as they are cleaned * out. This routine does *NOT* operate on swap metadata associated * with resident pages. * * This routine must be called at splvm() */ static void swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_SWAP) return; while (count > 0) { struct swblock **pswap; struct swblock *swap; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, index); if ((swap = *pswap) != NULL) { daddr_t v = swap->swb_pages[index & SWAP_META_MASK]; if (v != SWAPBLK_NONE) { swp_pager_freeswapspace(v, 1); swap->swb_pages[index & SWAP_META_MASK] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } } --count; ++index; } else { int n = SWAP_META_PAGES - (index & SWAP_META_MASK); count -= n; index += n; } mtx_unlock(&swhash_mtx); } } /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * * This routine locates and destroys all swap metadata associated with * an object. * * This routine must be called at splvm() */ static void swp_pager_meta_free_all(vm_object_t object) { daddr_t index = 0; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_SWAP) return; while (object->un_pager.swp.swp_bcount) { struct swblock **pswap; struct swblock *swap; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, index); if ((swap = *pswap) != NULL) { int i; for (i = 0; i < SWAP_META_PAGES; ++i) { daddr_t v = swap->swb_pages[i]; if (v != SWAPBLK_NONE) { --swap->swb_count; swp_pager_freeswapspace(v, 1); } } if (swap->swb_count != 0) panic("swap_pager_meta_free_all: swb_count != 0"); *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } mtx_unlock(&swhash_mtx); index += SWAP_META_PAGES; if (index > 0x20000000) panic("swp_pager_meta_free_all: failed to locate all swap meta blocks"); } } /* * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data. * * This routine is capable of looking up, popping, or freeing * swapblk assignments in the swap meta data or in the vm_page_t. * The routine typically returns the swapblk being looked-up, or popped, * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block * was invalid. This routine will automatically free any invalid * meta-data swapblks. * * It is not possible to store invalid swapblks in the swap meta data * (other then a literal 'SWAPBLK_NONE'), so we don't bother checking. * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. * * This routine must be called at splvm(). * * SWM_FREE remove and free swap block from metadata * SWM_POP remove from meta data but do not free.. pop it out */ static daddr_t swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags) { struct swblock **pswap; struct swblock *swap; daddr_t r1; int idx; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * The meta data only exists of the object is OBJT_SWAP * and even then might not be allocated yet. */ if (object->type != OBJT_SWAP) return (SWAPBLK_NONE); r1 = SWAPBLK_NONE; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) != NULL) { idx = pindex & SWAP_META_MASK; r1 = swap->swb_pages[idx]; if (r1 != SWAPBLK_NONE) { if (flags & SWM_FREE) { swp_pager_freeswapspace(r1, 1); r1 = SWAPBLK_NONE; } if (flags & (SWM_FREE|SWM_POP)) { swap->swb_pages[idx] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } } } } mtx_unlock(&swhash_mtx); return (r1); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ #ifndef _SYS_SYSPROTO_H_ struct swapon_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int swapon(struct thread *td, struct swapon_args *uap) { struct vattr attr; struct vnode *vp; struct nameidata nd; int error; error = priv_check(td, PRIV_SWAPON); if (error) return (error); mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0); swdev_syscall_active = 1; /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. */ if (swap_zone == NULL) { error = ENOMEM; goto done; } NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vn_isdisk(vp, &error)) { error = swapongeom(td, vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) == 0) { /* * Allow direct swapping to NFS regular files in the same * way that nfs_mountroot() sets up diskless swapping. */ error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); } if (error) vrele(vp); done: swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); return (error); } static void swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev) { struct swdevt *sp, *tsp; swblk_t dvbase; u_long mblocks; /* * If we go beyond this, we get overflows in the radix * tree bitmap code. */ mblocks = 0x40000000 / BLIST_META_RADIX; if (nblks > mblocks) { printf("WARNING: reducing size to maximum of %lu blocks per swap unit\n", mblocks); nblks = mblocks; } /* * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. * First chop nblks off to page-align it, then convert. * * sw->sw_nblks is in page-sized chunks now too. */ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; sp->sw_flags = 0; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; sp->sw_blist = blist_create(nblks); /* * Do not free the first two block in order to avoid overwriting * any bsd label at the front of the partition */ blist_free(sp->sw_blist, 2, nblks - 2); dvbase = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(tsp, &swtailq, sw_list) { if (tsp->sw_end >= dvbase) { /* * We put one uncovered page between the devices * in order to definitively prevent any cross-device * I/O requests */ dvbase = tsp->sw_end + 1; } } sp->sw_first = dvbase; sp->sw_end = dvbase + nblks; TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks; swp_sizecheck(); mtx_unlock(&sw_dev_mtx); } /* * SYSCALL: swapoff(devname) * * Disable swapping on the given device. * * XXX: Badly designed system call: it should use a device index * rather than filename as specification. We keep sw_vp around * only to make this work. */ #ifndef _SYS_SYSPROTO_H_ struct swapoff_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int swapoff(struct thread *td, struct swapoff_args *uap) { struct vnode *vp; struct nameidata nd; struct swdevt *sp; int error; error = priv_check(td, PRIV_SWAPOFF); if (error) return (error); mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); swdev_syscall_active = 1; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_vp == vp) break; } mtx_unlock(&sw_dev_mtx); if (sp == NULL) { error = EINVAL; goto done; } error = swapoff_one(sp, td); done: swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); return (error); } static int swapoff_one(struct swdevt *sp, struct thread *td) { u_long nblks, dvbase; #ifdef MAC int error; #endif mtx_assert(&Giant, MA_OWNED); #ifdef MAC (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY, td); error = mac_check_system_swapoff(td->td_ucred, sp->sw_vp); (void) VOP_UNLOCK(sp->sw_vp, 0, td); if (error != 0) return (error); #endif nblks = sp->sw_nblks; /* * We can turn off this swap device safely only if the * available virtual memory in the system will fit the amount * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail < nblks + nswap_lowat) { return (ENOMEM); } /* * Prevent further allocations on this device. */ mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) { swap_pager_avail -= blist_fill(sp->sw_blist, dvbase, dmmax); } mtx_unlock(&sw_dev_mtx); /* * Page in the contents of the device and close it. */ swap_pager_swapoff(sp); sp->sw_close(td, sp); sp->sw_id = NULL; mtx_lock(&sw_dev_mtx); TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { swap_pager_full = 2; swap_pager_almost_full = 1; } if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); blist_destroy(sp->sw_blist); free(sp, M_VMPGDATA); return (0); } void swapoff_all(void) { struct swdevt *sp, *spt; const char *devname; int error; mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); swdev_syscall_active = 1; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { mtx_unlock(&sw_dev_mtx); if (vn_isdisk(sp->sw_vp, NULL)) devname = sp->sw_vp->v_rdev->si_name; else devname = "[file]"; error = swapoff_one(sp, &thread0); if (error != 0) { printf("Cannot remove swap device %s (error=%d), " "skipping.\n", devname, error); } else if (bootverbose) { printf("Swap device %s removed.\n", devname); } mtx_lock(&sw_dev_mtx); } mtx_unlock(&sw_dev_mtx); swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); } void swap_pager_status(int *total, int *used) { struct swdevt *sp; *total = 0; *used = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { *total += sp->sw_nblks; *used += sp->sw_used; } mtx_unlock(&sw_dev_mtx); } static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; int error, n; struct xswdev xs; struct swdevt *sp; if (arg2 != 1) /* name length */ return (EINVAL); n = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (n == *name) { mtx_unlock(&sw_dev_mtx); xs.xsw_version = XSWDEV_VERSION; xs.xsw_dev = sp->sw_dev; xs.xsw_flags = sp->sw_flags; xs.xsw_nblks = sp->sw_nblks; xs.xsw_used = sp->sw_used; error = SYSCTL_OUT(req, &xs, sizeof(xs)); return (error); } n++; } mtx_unlock(&sw_dev_mtx); return (ENOENT); } SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info, "Swap statistics by device"); /* * vmspace_swap_count() - count the approximate swap useage in pages for a * vmspace. * * The map must be locked. * * Swap useage is determined by taking the proportional swap used by * VM objects backing the VM map. To make up for fractional losses, * if the VM object has any swap use at all the associated map entries * count for at least 1 swap page. */ int vmspace_swap_count(struct vmspace *vmspace) { vm_map_t map = &vmspace->vm_map; vm_map_entry_t cur; int count = 0; for (cur = map->header.next; cur != &map->header; cur = cur->next) { vm_object_t object; if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object = cur->object.vm_object) != NULL) { VM_OBJECT_LOCK(object); if (object->type == OBJT_SWAP && object->un_pager.swp.swp_bcount != 0) { int n = (cur->end - cur->start) / PAGE_SIZE; count += object->un_pager.swp.swp_bcount * SWAP_META_PAGES * n / object->size + 1; } VM_OBJECT_UNLOCK(object); } } return (count); } /* * GEOM backend * * Swapping onto disk devices. * */ static g_orphan_t swapgeom_orphan; static struct g_class g_swap_class = { .name = "SWAP", .version = G_VERSION, .orphan = swapgeom_orphan, }; DECLARE_GEOM_CLASS(g_swap_class, g_class); static void swapgeom_done(struct bio *bp2) { struct buf *bp; bp = bp2->bio_caller2; bp->b_ioflags = bp2->bio_flags; if (bp2->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bufdone(bp); g_destroy_bio(bp2); } static void swapgeom_strategy(struct buf *bp, struct swdevt *sp) { struct bio *bio; struct g_consumer *cp; cp = sp->sw_id; if (cp == NULL) { bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } bio = g_alloc_bio(); #if 0 /* * XXX: We shouldn't really sleep here when we run out of buffers * XXX: but the alternative is worse right now. */ if (bio == NULL) { bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } #endif bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; bio->bio_data = bp->b_data; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; g_io_request(bio, cp); return; } static void swapgeom_orphan(struct g_consumer *cp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) if (sp->sw_id == cp) sp->sw_id = NULL; mtx_unlock(&sw_dev_mtx); } static void swapgeom_close_ev(void *arg, int flags) { struct g_consumer *cp; cp = arg; g_access(cp, -1, -1, 0); g_detach(cp); g_destroy_consumer(cp); } static void swapgeom_close(struct thread *td, struct swdevt *sw) { /* XXX: direct call when Giant untangled */ g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL); } struct swh0h0 { struct cdev *dev; struct vnode *vp; int error; }; static void swapongeom_ev(void *arg, int flags) { struct swh0h0 *swh; struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; struct swdevt *sp; u_long nblks; int error; swh = arg; swh->error = 0; pp = g_dev_getprovider(swh->dev); if (pp == NULL) { swh->error = ENODEV; return; } mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); swh->error = EBUSY; return; } } mtx_unlock(&sw_dev_mtx); if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap", NULL); cp = g_new_consumer(gp); g_attach(cp, pp); /* * XXX: Everytime you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); if (error) { g_detach(cp); g_destroy_consumer(cp); swh->error = error; return; } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(swh->vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(swh->dev)); swh->error = 0; return; } static int swapongeom(struct thread *td, struct vnode *vp) { int error; struct swh0h0 swh; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); swh.dev = vp->v_rdev; swh.vp = vp; swh.error = 0; /* XXX: direct call when Giant untangled */ error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL); if (!error) error = swh.error; VOP_UNLOCK(vp, 0, td); return (error); } /* * VNODE backend * * This is used mainly for network filesystem (read: probably only tested * with NFS) swapfiles. * */ static void swapdev_strategy(struct buf *bp, struct swdevt *sp) { struct vnode *vp2; bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); vp2 = sp->sw_id; vhold(vp2); if (bp->b_iocmd == BIO_WRITE) { if (bp->b_bufobj) bufobj_wdrop(bp->b_bufobj); bufobj_wref(&vp2->v_bufobj); } if (bp->b_bufobj != &vp2->v_bufobj) bp->b_bufobj = &vp2->v_bufobj; bp->b_vp = vp2; bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); return; } static void swapdev_close(struct thread *td, struct swdevt *sp) { VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); vrele(sp->sw_vp); } static int swaponvp(struct thread *td, struct vnode *vp, u_long nblks) { struct swdevt *sp; int error; if (nblks == 0) return (ENXIO); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == vp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_system_swapon(td->td_ucred, vp); if (error == 0) #endif error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, -1); (void) VOP_UNLOCK(vp, 0, td); if (error) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV); return (0); } diff --git a/sys/vm/uma.h b/sys/vm/uma.h index 2181ec72dfec..08a55d675e88 100644 --- a/sys/vm/uma.h +++ b/sys/vm/uma.h @@ -1,559 +1,570 @@ /*- * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ /* * uma.h - External definitions for the Universal Memory Allocator * */ #ifndef VM_UMA_H #define VM_UMA_H #include /* For NULL */ #include /* For M_* */ /* User visable parameters */ #define UMA_SMALLEST_UNIT (PAGE_SIZE / 256) /* Smallest item allocated */ /* Types and type defs */ struct uma_zone; /* Opaque type used as a handle to the zone */ typedef struct uma_zone * uma_zone_t; /* * Item constructor * * Arguments: * item A pointer to the memory which has been allocated. * arg The arg field passed to uma_zalloc_arg * size The size of the allocated item * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The constructor is called just before the memory is returned * to the user. It may block if necessary. */ typedef int (*uma_ctor)(void *mem, int size, void *arg, int flags); /* * Item destructor * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being destructed. * arg Argument passed through uma_zfree_arg * * Returns: * Nothing * * Discussion: * The destructor may perform operations that differ from those performed * by the initializer, but it must leave the object in the same state. * This IS type stable storage. This is called after EVERY zfree call. */ typedef void (*uma_dtor)(void *mem, int size, void *arg); /* * Item initializer * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being initialized. * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The initializer is called when the memory is cached in the uma zone. * this should be the same state that the destructor leaves the object in. */ typedef int (*uma_init)(void *mem, int size, int flags); /* * Item discard function * * Arguments: * item A pointer to memory which has been 'freed' but has not left the * zone's cache. * size The size of the item being discarded. * * Returns: * Nothing * * Discussion: * This routine is called when memory leaves a zone and is returned to the * system for other uses. It is the counter part to the init function. */ typedef void (*uma_fini)(void *mem, int size); /* * What's the difference between initializing and constructing? * * The item is initialized when it is cached, and this is the state that the * object should be in when returned to the allocator. The purpose of this is * to remove some code which would otherwise be called on each allocation by * utilizing a known, stable state. This differs from the constructor which * will be called on EVERY allocation. * * For example, in the initializer you may want to initialize embeded locks, * NULL list pointers, set up initial states, magic numbers, etc. This way if * the object is held in the allocator and re-used it won't be necessary to * re-initialize it. * * The constructor may be used to lock a data structure, link it on to lists, * bump reference counts or total counts of outstanding structures, etc. * */ /* Function proto types */ /* * Create a new uma zone * * Arguments: * name The text name of the zone for debugging and stats, this memory * should not be freed until the zone has been deallocated. * size The size of the object that is being created. * ctor The constructor that is called when the object is allocated * dtor The destructor that is called when the object is freed. * init An initializer that sets up the initial state of the memory. * fini A discard function that undoes initialization done by init. * ctor/dtor/init/fini may all be null, see notes above. * align A bitmask that corisponds to the requested alignment * eg 4 would be 0x3 * flags A set of parameters that control the behavior of the zone * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, u_int32_t flags); /* * Create a secondary uma zone * * Arguments: * name The text name of the zone for debugging and stats, this memory * should not be freed until the zone has been deallocated. * ctor The constructor that is called when the object is allocated * dtor The destructor that is called when the object is freed. * zinit An initializer that sets up the initial state of the memory * as the object passes from the Keg's slab to the Zone's cache. * zfini A discard function that undoes initialization done by init * as the object passes from the Zone's cache to the Keg's slab. * * ctor/dtor/zinit/zfini may all be null, see notes above. * Note that the zinit and zfini specified here are NOT * exactly the same as the init/fini specified to uma_zcreate() * when creating a master zone. These zinit/zfini are called * on the TRANSITION from keg to zone (and vice-versa). Once * these are set, the primary zone may alter its init/fini * (which are called when the object passes from VM to keg) * using uma_zone_set_init/fini()) as well as its own * zinit/zfini (unset by default for master zone) with * uma_zone_set_zinit/zfini() (note subtle 'z' prefix). * * master A reference to this zone's Master Zone (Primary Zone), * which contains the backing Keg for the Secondary Zone * being added. * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t master); /* * Definitions for uma_zcreate flags * * These flags share space with UMA_ZFLAGs in uma_int.h. Be careful not to * overlap when adding new features. 0xf0000000 is in use by uma_int.h. */ #define UMA_ZONE_PAGEABLE 0x0001 /* Return items not fully backed by physical memory XXX Not yet */ #define UMA_ZONE_ZINIT 0x0002 /* Initialize with zeros */ #define UMA_ZONE_STATIC 0x0004 /* Staticly sized zone */ #define UMA_ZONE_OFFPAGE 0x0008 /* Force the slab structure allocation off of the real memory */ #define UMA_ZONE_MALLOC 0x0010 /* For use by malloc(9) only! */ #define UMA_ZONE_NOFREE 0x0020 /* Do not free slabs of this type! */ #define UMA_ZONE_MTXCLASS 0x0040 /* Create a new lock class */ #define UMA_ZONE_VM 0x0080 /* * Used for internal vm datastructures * only. */ #define UMA_ZONE_HASH 0x0100 /* * Use a hash table instead of caching * information in the vm_page. */ #define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */ #define UMA_ZONE_REFCNT 0x0400 /* Allocate refcnts in slabs */ #define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets */ /* Definitions for align */ #define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */ #define UMA_ALIGN_LONG (sizeof(long) - 1) /* "" long */ #define UMA_ALIGN_INT (sizeof(int) - 1) /* "" int */ #define UMA_ALIGN_SHORT (sizeof(short) - 1) /* "" short */ #define UMA_ALIGN_CHAR (sizeof(char) - 1) /* "" char */ #define UMA_ALIGN_CACHE (16 - 1) /* Cache line size align */ /* * Destroys an empty uma zone. If the zone is not empty uma complains loudly. * * Arguments: * zone The zone we want to destroy. * */ void uma_zdestroy(uma_zone_t zone); /* * Allocates an item out of a zone * * Arguments: * zone The zone we are allocating from * arg This data is passed to the ctor function * flags See sys/malloc.h for available flags. * * Returns: * A non null pointer to an initialized element from the zone is * garanteed if the wait flag is M_WAITOK, otherwise a null pointer may be * returned if the zone is empty or the ctor failed. */ void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags); /* * Allocates an item out of a zone without supplying an argument * * This is just a wrapper for uma_zalloc_arg for convenience. * */ static __inline void *uma_zalloc(uma_zone_t zone, int flags); static __inline void * uma_zalloc(uma_zone_t zone, int flags) { return uma_zalloc_arg(zone, NULL, flags); } /* * Frees an item back into the specified zone. * * Arguments: * zone The zone the item was originally allocated out of. * item The memory to be freed. * arg Argument passed to the destructor * * Returns: * Nothing. */ void uma_zfree_arg(uma_zone_t zone, void *item, void *arg); /* * Frees an item back to a zone without supplying an argument * * This is just a wrapper for uma_zfree_arg for convenience. * */ static __inline void uma_zfree(uma_zone_t zone, void *item); static __inline void uma_zfree(uma_zone_t zone, void *item) { uma_zfree_arg(zone, item, NULL); } /* * XXX The rest of the prototypes in this header are h0h0 magic for the VM. * If you think you need to use it for a normal zone you're probably incorrect. */ /* * Backend page supplier routines * * Arguments: * zone The zone that is requesting pages * size The number of bytes being requested * pflag Flags for these memory pages, see below. * wait Indicates our willingness to block. * * Returns: * A pointer to the alloced memory or NULL on failure. */ typedef void *(*uma_alloc)(uma_zone_t zone, int size, u_int8_t *pflag, int wait); /* * Backend page free routines * * Arguments: * item A pointer to the previously allocated pages * size The original size of the allocation * pflag The flags for the slab. See UMA_SLAB_* below * * Returns: * None */ typedef void (*uma_free)(void *item, int size, u_int8_t pflag); /* * Sets up the uma allocator. (Called by vm_mem_init) * * Arguments: * bootmem A pointer to memory used to bootstrap the system. * * Returns: * Nothing * * Discussion: * This memory is used for zones which allocate things before the * backend page supplier can give us pages. It should be * UMA_SLAB_SIZE * boot_pages bytes. (see uma_int.h) * */ void uma_startup(void *bootmem, int boot_pages); /* * Finishes starting up the allocator. This should * be called when kva is ready for normal allocs. * * Arguments: * None * * Returns: * Nothing * * Discussion: * uma_startup2 is called by kmeminit() to enable us of uma for malloc. */ void uma_startup2(void); /* * Reclaims unused memory for all zones * * Arguments: * None * Returns: * None * * This should only be called by the page out daemon. */ void uma_reclaim(void); /* * Switches the backing object of a zone * * Arguments: * zone The zone to update * obj The obj to use for future allocations * size The size of the object to allocate * * Returns: * 0 if kva space can not be allocated * 1 if successful * * Discussion: * A NULL object can be used and uma will allocate one for you. Setting * the size will limit the amount of memory allocated to this zone. * */ struct vm_object; int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int size); /* * Sets a high limit on the number of items allowed in a zone * * Arguments: * zone The zone to limit * * Returns: * Nothing */ void uma_zone_set_max(uma_zone_t zone, int nitems); /* * The following two routines (uma_zone_set_init/fini) * are used to set the backend init/fini pair which acts on an * object as it becomes allocated and is placed in a slab within * the specified zone's backing keg. These should probably not * be changed once allocations have already begun and only * immediately upon zone creation. */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit); void uma_zone_set_fini(uma_zone_t zone, uma_fini fini); /* * The following two routines (uma_zone_set_zinit/zfini) are * used to set the zinit/zfini pair which acts on an object as * it passes from the backing Keg's slab cache to the * specified Zone's bucket cache. These should probably not * be changed once allocations have already begun and * only immediately upon zone creation. */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit); void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini); /* * Replaces the standard page_alloc or obj_alloc functions for this zone * * Arguments: * zone The zone whos back end allocator is being changed. * allocf A pointer to the allocation function * * Returns: * Nothing * * Discussion: * This could be used to implement pageable allocation, or perhaps * even DMA allocators if used in conjunction with the OFFPAGE * zone flag. */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf); /* * Used for freeing memory provided by the allocf above * * Arguments: * zone The zone that intends to use this free routine. * freef The page freeing routine. * * Returns: * Nothing */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef); /* * These flags are setable in the allocf and visable in the freef. */ #define UMA_SLAB_BOOT 0x01 /* Slab alloced from boot pages */ #define UMA_SLAB_KMEM 0x02 /* Slab alloced from kmem_map */ #define UMA_SLAB_PRIV 0x08 /* Slab alloced from priv allocator */ #define UMA_SLAB_OFFP 0x10 /* Slab is managed separately */ #define UMA_SLAB_MALLOC 0x20 /* Slab is a large malloc slab */ /* 0x40 and 0x80 are available */ /* * Used to pre-fill a zone with some number of items * * Arguments: * zone The zone to fill * itemcnt The number of items to reserve * * Returns: * Nothing * * NOTE: This is blocking and should only be done at startup */ void uma_prealloc(uma_zone_t zone, int itemcnt); /* * Used to lookup the reference counter allocated for an item * from a UMA_ZONE_REFCNT zone. For UMA_ZONE_REFCNT zones, * reference counters are allocated for items and stored in * the underlying slab header. * * Arguments: * zone The UMA_ZONE_REFCNT zone to which the item belongs. * item The address of the item for which we want a refcnt. * * Returns: * A pointer to a u_int32_t reference counter. */ u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item); +/* + * Used to determine if a fixed-size zone is exhausted. + * + * Arguments: + * zone The zone to check + * + * Returns: + * Non-zero if zone is exhausted. + */ +int uma_zone_exhausted(uma_zone_t zone); + /* * Exported statistics structures to be used by user space monitoring tools. * Statistics stream consusts of a uma_stream_header, followed by a series of * alternative uma_type_header and uma_type_stat structures. Statistics * structures */ #define UMA_STREAM_VERSION 0x00000001 struct uma_stream_header { u_int32_t ush_version; /* Stream format version. */ u_int32_t ush_maxcpus; /* Value of MAXCPU for stream. */ u_int32_t ush_count; /* Number of records. */ u_int32_t _ush_pad; /* Pad/reserved field. */ }; #define UTH_MAX_NAME 32 #define UTH_ZONE_SECONDARY 0x00000001 struct uma_type_header { /* * Static per-zone data, some extracted from the supporting keg. */ char uth_name[UTH_MAX_NAME]; u_int32_t uth_align; /* Keg: alignment. */ u_int32_t uth_size; /* Keg: requested size of item. */ u_int32_t uth_rsize; /* Keg: real size of item. */ u_int32_t uth_maxpages; /* Keg: maximum number of pages. */ u_int32_t uth_limit; /* Keg: max items to allocate. */ /* * Current dynamic zone/keg-derived statistics. */ u_int32_t uth_pages; /* Keg: pages allocated. */ u_int32_t uth_keg_free; /* Keg: items free. */ u_int32_t uth_zone_free; /* Zone: items free. */ u_int32_t uth_bucketsize; /* Zone: desired bucket size. */ u_int32_t uth_zone_flags; /* Zone: flags. */ u_int64_t uth_allocs; /* Zone: number of allocations. */ u_int64_t uth_frees; /* Zone: number of frees. */ u_int64_t uth_fails; /* Zone: number of alloc failures. */ u_int64_t _uth_reserved1[3]; /* Reserved. */ }; struct uma_percpu_stat { u_int64_t ups_allocs; /* Cache: number of alloctions. */ u_int64_t ups_frees; /* Cache: number of frees. */ u_int64_t ups_cache_free; /* Cache: free items in cache. */ u_int64_t _ups_reserved[5]; /* Reserved. */ }; #endif diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 3e8bb2e4a309..ac3ffc425d6a 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -1,2998 +1,3010 @@ /*- * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * Copyright (c) 2004-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * uma_core.c Implementation of the Universal Memory allocator * * This allocator is intended to replace the multitude of similar object caches * in the standard FreeBSD kernel. The intent is to be flexible as well as * effecient. A primary design goal is to return unused memory to the rest of * the system. This will make the system as a whole more flexible due to the * ability to move memory to subsystems which most need it instead of leaving * pools of reserved memory unused. * * The basic ideas stem from similar slab/zone based allocators whose algorithms * are well known. * */ /* * TODO: * - Improve memory usage for large allocations * - Investigate cache size adjustments */ #include __FBSDID("$FreeBSD$"); /* I should really use ktr.. */ /* #define UMA_DEBUG 1 #define UMA_DEBUG_ALLOC 1 #define UMA_DEBUG_ALLOC_1 1 */ #include "opt_ddb.h" #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This is the zone and keg from which all zones are spawned. The idea is that * even the zone & keg heads are allocated from the allocator, so we use the * bss section to bootstrap us. */ static struct uma_keg masterkeg; static struct uma_zone masterzone_k; static struct uma_zone masterzone_z; static uma_zone_t kegs = &masterzone_k; static uma_zone_t zones = &masterzone_z; /* This is the zone from which all of uma_slab_t's are allocated. */ static uma_zone_t slabzone; static uma_zone_t slabrefzone; /* With refcounters (for UMA_ZONE_REFCNT) */ /* * The initial hash tables come out of this zone so they can be allocated * prior to malloc coming up. */ static uma_zone_t hashzone; static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); /* * Are we allowed to allocate buckets? */ static int bucketdisable = 1; /* Linked list of all kegs in the system */ static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(&uma_kegs); /* This mutex protects the keg list */ static struct mtx uma_mtx; /* Linked list of boot time pages */ static LIST_HEAD(,uma_slab) uma_boot_pages = LIST_HEAD_INITIALIZER(&uma_boot_pages); /* This mutex protects the boot time pages list */ static struct mtx uma_boot_pages_mtx; /* Is the VM done starting up? */ static int booted = 0; /* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */ static u_int uma_max_ipers; static u_int uma_max_ipers_ref; /* * This is the handle used to schedule events that need to happen * outside of the allocation fast path. */ static struct callout uma_callout; #define UMA_TIMEOUT 20 /* Seconds for callout interval. */ /* * This structure is passed as the zone ctor arg so that I don't have to create * a special allocation function just for zones. */ struct uma_zctor_args { char *name; size_t size; uma_ctor ctor; uma_dtor dtor; uma_init uminit; uma_fini fini; uma_keg_t keg; int align; u_int32_t flags; }; struct uma_kctor_args { uma_zone_t zone; size_t size; uma_init uminit; uma_fini fini; int align; u_int32_t flags; }; struct uma_bucket_zone { uma_zone_t ubz_zone; char *ubz_name; int ubz_entries; }; #define BUCKET_MAX 128 struct uma_bucket_zone bucket_zones[] = { { NULL, "16 Bucket", 16 }, { NULL, "32 Bucket", 32 }, { NULL, "64 Bucket", 64 }, { NULL, "128 Bucket", 128 }, { NULL, NULL, 0} }; #define BUCKET_SHIFT 4 #define BUCKET_ZONES ((BUCKET_MAX >> BUCKET_SHIFT) + 1) /* * bucket_size[] maps requested bucket sizes to zones that allocate a bucket * of approximately the right size. */ static uint8_t bucket_size[BUCKET_ZONES]; /* * Flags and enumerations to be passed to internal functions. */ enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI }; #define ZFREE_STATFAIL 0x00000001 /* Update zone failure statistic. */ #define ZFREE_STATFREE 0x00000002 /* Update zone free statistic. */ /* Prototypes.. */ static void *obj_alloc(uma_zone_t, int, u_int8_t *, int); static void *page_alloc(uma_zone_t, int, u_int8_t *, int); static void *startup_alloc(uma_zone_t, int, u_int8_t *, int); static void page_free(void *, int, u_int8_t); static uma_slab_t slab_zalloc(uma_zone_t, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_drain(uma_zone_t zone); static int keg_ctor(void *, int, void *, int); static void keg_dtor(void *, int, void *); static int zone_ctor(void *, int, void *, int); static void zone_dtor(void *, int, void *); static int zero_init(void *, int, int); static void zone_small_init(uma_zone_t zone); static void zone_large_init(uma_zone_t zone); static void zone_foreach(void (*zfunc)(uma_zone_t)); static void zone_timeout(uma_zone_t zone); static int hash_alloc(struct uma_hash *); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_startup3(void); static void *uma_zalloc_internal(uma_zone_t, void *, int); static void uma_zfree_internal(uma_zone_t, void *, void *, enum zfreeskip, int); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(int, int); static void bucket_free(uma_bucket_t); static void bucket_zone_drain(void); static int uma_zalloc_bucket(uma_zone_t zone, int flags); static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags); static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab); static void zone_drain(uma_zone_t); static uma_zone_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, u_int32_t flags); void uma_print_zone(uma_zone_t); void uma_print_stats(void); static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); #ifdef WITNESS static int nosleepwithlocks = 1; #else static int nosleepwithlocks = 0; #endif SYSCTL_INT(_debug, OID_AUTO, nosleepwithlocks, CTLFLAG_RW, &nosleepwithlocks, 0, "Convert M_WAITOK to M_NOWAIT to avoid lock-held-across-sleep paths"); SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT, 0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones"); SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats"); /* * This routine checks to see whether or not it's safe to enable buckets. */ static void bucket_enable(void) { if (cnt.v_free_count < cnt.v_free_min) bucketdisable = 1; else bucketdisable = 0; } /* * Initialize bucket_zones, the array of zones of buckets of various sizes. * * For each zone, calculate the memory required for each bucket, consisting * of the header and an array of pointers. Initialize bucket_size[] to point * the range of appropriate bucket sizes at the zone. */ static void bucket_init(void) { struct uma_bucket_zone *ubz; int i; int j; for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) { int size; ubz = &bucket_zones[j]; size = roundup(sizeof(struct uma_bucket), sizeof(void *)); size += sizeof(void *) * ubz->ubz_entries; ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT)) bucket_size[i >> BUCKET_SHIFT] = j; } } /* * Given a desired number of entries for a bucket, return the zone from which * to allocate the bucket. */ static struct uma_bucket_zone * bucket_zone_lookup(int entries) { int idx; idx = howmany(entries, 1 << BUCKET_SHIFT); return (&bucket_zones[bucket_size[idx]]); } static uma_bucket_t bucket_alloc(int entries, int bflags) { struct uma_bucket_zone *ubz; uma_bucket_t bucket; /* * This is to stop us from allocating per cpu buckets while we're * running out of vm.boot_pages. Otherwise, we would exhaust the * boot pages. This also prevents us from allocating buckets in * low memory situations. */ if (bucketdisable) return (NULL); ubz = bucket_zone_lookup(entries); bucket = uma_zalloc_internal(ubz->ubz_zone, NULL, bflags); if (bucket) { #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); #endif bucket->ub_cnt = 0; bucket->ub_entries = ubz->ubz_entries; } return (bucket); } static void bucket_free(uma_bucket_t bucket) { struct uma_bucket_zone *ubz; ubz = bucket_zone_lookup(bucket->ub_entries); uma_zfree_internal(ubz->ubz_zone, bucket, NULL, SKIP_NONE, ZFREE_STATFREE); } static void bucket_zone_drain(void) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) zone_drain(ubz->ubz_zone); } /* * Routine called by timeout which is used to fire off some time interval * based calculations. (stats, hash size, etc.) * * Arguments: * arg Unused * * Returns: * Nothing */ static void uma_timeout(void *unused) { bucket_enable(); zone_foreach(zone_timeout); /* Reschedule this event */ callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); } /* * Routine to perform timeout driven calculations. This expands the * hashes and does per cpu statistics aggregation. * * Arguments: * zone The zone to operate on * * Returns: * Nothing */ static void zone_timeout(uma_zone_t zone) { uma_keg_t keg; u_int64_t alloc; keg = zone->uz_keg; alloc = 0; /* * Expand the zone hash table. * * This is done if the number of slabs is larger than the hash size. * What I'm trying to do here is completely reduce collisions. This * may be a little aggressive. Should I allow for two collisions max? */ ZONE_LOCK(zone); if (keg->uk_flags & UMA_ZONE_HASH && keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; /* * This is so involved because allocating and freeing * while the zone lock is held will lead to deadlock. * I have to do everything in stages and check for * races. */ newhash = keg->uk_hash; ZONE_UNLOCK(zone); ret = hash_alloc(&newhash); ZONE_LOCK(zone); if (ret) { if (hash_expand(&keg->uk_hash, &newhash)) { oldhash = keg->uk_hash; keg->uk_hash = newhash; } else oldhash = newhash; ZONE_UNLOCK(zone); hash_free(&oldhash); ZONE_LOCK(zone); } } ZONE_UNLOCK(zone); } /* * Allocate and zero fill the next sized hash table from the appropriate * backing store. * * Arguments: * hash A new hash structure with the old hash size in uh_hashsize * * Returns: * 1 on sucess and 0 on failure. */ static int hash_alloc(struct uma_hash *hash) { int oldsize; int alloc; oldsize = hash->uh_hashsize; /* We're just going to go to a power of two greater */ if (oldsize) { hash->uh_hashsize = oldsize * 2; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; hash->uh_slab_hash = (struct slabhead *)malloc(alloc, M_UMAHASH, M_NOWAIT); } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = uma_zalloc_internal(hashzone, NULL, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { bzero(hash->uh_slab_hash, alloc); hash->uh_hashmask = hash->uh_hashsize - 1; return (1); } return (0); } /* * Expands the hash table for HASH zones. This is done from zone_timeout * to reduce collisions. This must not be done in the regular allocation * path, otherwise, we can recurse on the vm while allocating pages. * * Arguments: * oldhash The hash you want to expand * newhash The hash structure for the new table * * Returns: * Nothing * * Discussion: */ static int hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_slab_t slab; int hval; int i; if (!newhash->uh_slab_hash) return (0); if (oldhash->uh_hashsize >= newhash->uh_hashsize) return (0); /* * I need to investigate hash algorithms for resizing without a * full rehash. */ for (i = 0; i < oldhash->uh_hashsize; i++) while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) { slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]); SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink); hval = UMA_HASH(newhash, slab->us_data); SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, us_hlink); } return (1); } /* * Free the hash bucket to the appropriate backing store. * * Arguments: * slab_hash The hash bucket we're freeing * hashsize The number of entries in that hash bucket * * Returns: * Nothing */ static void hash_free(struct uma_hash *hash) { if (hash->uh_slab_hash == NULL) return; if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) uma_zfree_internal(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE); else free(hash->uh_slab_hash, M_UMAHASH); } /* * Frees all outstanding items in a bucket * * Arguments: * zone The zone to free to, must be unlocked. * bucket The free/alloc bucket with items, cpu queue must be locked. * * Returns: * Nothing */ static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { uma_slab_t slab; int mzone; void *item; if (bucket == NULL) return; slab = NULL; mzone = 0; /* We have to lookup the slab again for malloc.. */ if (zone->uz_keg->uk_flags & UMA_ZONE_MALLOC) mzone = 1; while (bucket->ub_cnt > 0) { bucket->ub_cnt--; item = bucket->ub_bucket[bucket->ub_cnt]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt] = NULL; KASSERT(item != NULL, ("bucket_drain: botched ptr, item is NULL")); #endif /* * This is extremely inefficient. The slab pointer was passed * to uma_zfree_arg, but we lost it because the buckets don't * hold them. This will go away when free() gets a size passed * to it. */ if (mzone) slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK)); uma_zfree_internal(zone, item, slab, SKIP_DTOR, 0); } } /* * Drains the per cpu caches for a zone. * * NOTE: This may only be called while the zone is being turn down, and not * during normal operation. This is necessary in order that we do not have * to migrate CPUs to drain the per-CPU caches. * * Arguments: * zone The zone to drain, must be unlocked. * * Returns: * Nothing */ static void cache_drain(uma_zone_t zone) { uma_cache_t cache; int cpu; /* * XXX: It is safe to not lock the per-CPU caches, because we're * tearing down the zone anyway. I.e., there will be no further use * of the caches at this point. * * XXX: It would good to be able to assert that the zone is being * torn down to prevent improper use of cache_drain(). * * XXX: We lock the zone before passing into bucket_cache_drain() as * it is used elsewhere. Should the tear-down path be made special * there in some form? */ for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; cache = &zone->uz_cpu[cpu]; bucket_drain(zone, cache->uc_allocbucket); bucket_drain(zone, cache->uc_freebucket); if (cache->uc_allocbucket != NULL) bucket_free(cache->uc_allocbucket); if (cache->uc_freebucket != NULL) bucket_free(cache->uc_freebucket); cache->uc_allocbucket = cache->uc_freebucket = NULL; } ZONE_LOCK(zone); bucket_cache_drain(zone); ZONE_UNLOCK(zone); } /* * Drain the cached buckets from a zone. Expects a locked zone on entry. */ static void bucket_cache_drain(uma_zone_t zone) { uma_bucket_t bucket; /* * Drain the bucket queues and free the buckets, we just keep two per * cpu (alloc/free). */ while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); ZONE_UNLOCK(zone); bucket_drain(zone, bucket); bucket_free(bucket); ZONE_LOCK(zone); } /* Now we do the free queue.. */ while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); bucket_free(bucket); } } /* * Frees pages from a zone back to the system. This is done on demand from * the pageout daemon. * * Arguments: * zone The zone to free pages from * all Should we drain all items? * * Returns: * Nothing. */ static void zone_drain(uma_zone_t zone) { struct slabhead freeslabs = { 0 }; uma_keg_t keg; uma_slab_t slab; uma_slab_t n; u_int8_t flags; u_int8_t *mem; int i; keg = zone->uz_keg; /* * We don't want to take pages from statically allocated zones at this * time */ if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL) return; ZONE_LOCK(zone); #ifdef UMA_DEBUG printf("%s free items: %u\n", zone->uz_name, keg->uk_free); #endif bucket_cache_drain(zone); if (keg->uk_free == 0) goto finished; slab = LIST_FIRST(&keg->uk_free_slab); while (slab) { n = LIST_NEXT(slab, us_link); /* We have no where to free these to */ if (slab->us_flags & UMA_SLAB_BOOT) { slab = n; continue; } LIST_REMOVE(slab, us_link); keg->uk_pages -= keg->uk_ppera; keg->uk_free -= keg->uk_ipers; if (keg->uk_flags & UMA_ZONE_HASH) UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data); SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); slab = n; } finished: ZONE_UNLOCK(zone); while ((slab = SLIST_FIRST(&freeslabs)) != NULL) { SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink); if (keg->uk_fini) for (i = 0; i < keg->uk_ipers; i++) keg->uk_fini( slab->us_data + (keg->uk_rsize * i), keg->uk_size); flags = slab->us_flags; mem = slab->us_data; if ((keg->uk_flags & UMA_ZONE_MALLOC) || (keg->uk_flags & UMA_ZONE_REFCNT)) { vm_object_t obj; if (flags & UMA_SLAB_KMEM) obj = kmem_object; else obj = NULL; for (i = 0; i < keg->uk_ppera; i++) vsetobj((vm_offset_t)mem + (i * PAGE_SIZE), obj); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) uma_zfree_internal(keg->uk_slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); #ifdef UMA_DEBUG printf("%s: Returning %d bytes.\n", zone->uz_name, UMA_SLAB_SIZE * keg->uk_ppera); #endif keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags); } } /* * Allocate a new slab for a zone. This does not insert the slab onto a list. * * Arguments: * zone The zone to allocate slabs for * wait Shall we wait? * * Returns: * The slab that was allocated or NULL if there is no memory and the * caller specified M_NOWAIT. */ static uma_slab_t slab_zalloc(uma_zone_t zone, int wait) { uma_slabrefcnt_t slabref; uma_slab_t slab; uma_keg_t keg; u_int8_t *mem; u_int8_t flags; int i; slab = NULL; keg = zone->uz_keg; #ifdef UMA_DEBUG printf("slab_zalloc: Allocating a new slab for %s\n", zone->uz_name); #endif ZONE_UNLOCK(zone); if (keg->uk_flags & UMA_ZONE_OFFPAGE) { slab = uma_zalloc_internal(keg->uk_slabzone, NULL, wait); if (slab == NULL) { ZONE_LOCK(zone); return NULL; } } /* * This reproduces the old vm_zone behavior of zero filling pages the * first time they are added to a zone. * * Malloced items are zeroed in uma_zalloc. */ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) wait |= M_ZERO; else wait &= ~M_ZERO; mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE, &flags, wait); if (mem == NULL) { if (keg->uk_flags & UMA_ZONE_OFFPAGE) uma_zfree_internal(keg->uk_slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); ZONE_LOCK(zone); return (NULL); } /* Point the slab into the allocated memory */ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) slab = (uma_slab_t )(mem + keg->uk_pgoff); if ((keg->uk_flags & UMA_ZONE_MALLOC) || (keg->uk_flags & UMA_ZONE_REFCNT)) for (i = 0; i < keg->uk_ppera; i++) vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab); slab->us_keg = keg; slab->us_data = mem; slab->us_freecount = keg->uk_ipers; slab->us_firstfree = 0; slab->us_flags = flags; if (keg->uk_flags & UMA_ZONE_REFCNT) { slabref = (uma_slabrefcnt_t)slab; for (i = 0; i < keg->uk_ipers; i++) { slabref->us_freelist[i].us_refcnt = 0; slabref->us_freelist[i].us_item = i+1; } } else { for (i = 0; i < keg->uk_ipers; i++) slab->us_freelist[i].us_item = i+1; } if (keg->uk_init != NULL) { for (i = 0; i < keg->uk_ipers; i++) if (keg->uk_init(slab->us_data + (keg->uk_rsize * i), keg->uk_size, wait) != 0) break; if (i != keg->uk_ipers) { if (keg->uk_fini != NULL) { for (i--; i > -1; i--) keg->uk_fini(slab->us_data + (keg->uk_rsize * i), keg->uk_size); } if ((keg->uk_flags & UMA_ZONE_MALLOC) || (keg->uk_flags & UMA_ZONE_REFCNT)) { vm_object_t obj; if (flags & UMA_SLAB_KMEM) obj = kmem_object; else obj = NULL; for (i = 0; i < keg->uk_ppera; i++) vsetobj((vm_offset_t)mem + (i * PAGE_SIZE), obj); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) uma_zfree_internal(keg->uk_slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags); ZONE_LOCK(zone); return (NULL); } } ZONE_LOCK(zone); if (keg->uk_flags & UMA_ZONE_HASH) UMA_HASH_INSERT(&keg->uk_hash, slab, mem); keg->uk_pages += keg->uk_ppera; keg->uk_free += keg->uk_ipers; return (slab); } /* * This function is intended to be used early on in place of page_alloc() so * that we may use the boot time page cache to satisfy allocations before * the VM is ready. */ static void * startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait) { uma_keg_t keg; uma_slab_t tmps; keg = zone->uz_keg; /* * Check our small startup cache to see if it has pages remaining. */ mtx_lock(&uma_boot_pages_mtx); if ((tmps = LIST_FIRST(&uma_boot_pages)) != NULL) { LIST_REMOVE(tmps, us_link); mtx_unlock(&uma_boot_pages_mtx); *pflag = tmps->us_flags; return (tmps->us_data); } mtx_unlock(&uma_boot_pages_mtx); if (booted == 0) panic("UMA: Increase vm.boot_pages"); /* * Now that we've booted reset these users to their real allocator. */ #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = uma_small_alloc; #else keg->uk_allocf = page_alloc; #endif return keg->uk_allocf(zone, bytes, pflag, wait); } /* * Allocates a number of pages from the system * * Arguments: * zone Unused * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KMEM; p = (void *) kmem_malloc(kmem_map, bytes, wait); return (p); } /* * Allocates a number of pages from within an object * * Arguments: * zone Unused * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { vm_object_t object; vm_offset_t retkva, zkva; vm_page_t p; int pages, startpages; object = zone->uz_keg->uk_obj; retkva = 0; /* * This looks a little weird since we're getting one page at a time. */ VM_OBJECT_LOCK(object); p = TAILQ_LAST(&object->memq, pglist); pages = p != NULL ? p->pindex + 1 : 0; startpages = pages; zkva = zone->uz_keg->uk_kva + pages * PAGE_SIZE; for (; bytes > 0; bytes -= PAGE_SIZE) { p = vm_page_alloc(object, pages, VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (p == NULL) { if (pages != startpages) pmap_qremove(retkva, pages - startpages); while (pages != startpages) { pages--; p = TAILQ_LAST(&object->memq, pglist); vm_page_lock_queues(); vm_page_unwire(p, 0); vm_page_free(p); vm_page_unlock_queues(); } retkva = 0; goto done; } pmap_qenter(zkva, &p, 1); if (retkva == 0) retkva = zkva; zkva += PAGE_SIZE; pages += 1; } done: VM_OBJECT_UNLOCK(object); *flags = UMA_SLAB_PRIV; return ((void *)retkva); } /* * Frees a number of pages to the system * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void page_free(void *mem, int size, u_int8_t flags) { vm_map_t map; if (flags & UMA_SLAB_KMEM) map = kmem_map; else panic("UMA: page_free used with invalid flags %d\n", flags); kmem_free(map, (vm_offset_t)mem, size); } /* * Zero fill initializer * * Arguments/Returns follow uma_init specifications */ static int zero_init(void *mem, int size, int flags) { bzero(mem, size); return (0); } /* * Finish creating a small uma zone. This calculates ipers, and the zone size. * * Arguments * zone The zone we should initialize * * Returns * Nothing */ static void zone_small_init(uma_zone_t zone) { uma_keg_t keg; u_int rsize; u_int memused; u_int wastedspace; u_int shsize; keg = zone->uz_keg; KASSERT(keg != NULL, ("Keg is null in zone_small_init")); rsize = keg->uk_size; if (rsize < UMA_SMALLEST_UNIT) rsize = UMA_SMALLEST_UNIT; if (rsize & keg->uk_align) rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1); keg->uk_rsize = rsize; keg->uk_ppera = 1; if (keg->uk_flags & UMA_ZONE_REFCNT) { rsize += UMA_FRITMREF_SZ; /* linkage & refcnt */ shsize = sizeof(struct uma_slab_refcnt); } else { rsize += UMA_FRITM_SZ; /* Account for linkage */ shsize = sizeof(struct uma_slab); } keg->uk_ipers = (UMA_SLAB_SIZE - shsize) / rsize; KASSERT(keg->uk_ipers != 0, ("zone_small_init: ipers is 0")); memused = keg->uk_ipers * rsize + shsize; wastedspace = UMA_SLAB_SIZE - memused; /* * We can't do OFFPAGE if we're internal or if we've been * asked to not go to the VM for buckets. If we do this we * may end up going to the VM (kmem_map) for slabs which we * do not want to do if we're UMA_ZFLAG_CACHEONLY as a * result of UMA_ZONE_VM, which clearly forbids it. */ if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) || (keg->uk_flags & UMA_ZFLAG_CACHEONLY)) return; if ((wastedspace >= UMA_MAX_WASTE) && (keg->uk_ipers < (UMA_SLAB_SIZE / keg->uk_rsize))) { keg->uk_ipers = UMA_SLAB_SIZE / keg->uk_rsize; KASSERT(keg->uk_ipers <= 255, ("zone_small_init: keg->uk_ipers too high!")); #ifdef UMA_DEBUG printf("UMA decided we need offpage slab headers for " "zone: %s, calculated wastedspace = %d, " "maximum wasted space allowed = %d, " "calculated ipers = %d, " "new wasted space = %d\n", zone->uz_name, wastedspace, UMA_MAX_WASTE, keg->uk_ipers, UMA_SLAB_SIZE - keg->uk_ipers * keg->uk_rsize); #endif keg->uk_flags |= UMA_ZONE_OFFPAGE; if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) keg->uk_flags |= UMA_ZONE_HASH; } } /* * Finish creating a large (> UMA_SLAB_SIZE) uma zone. Just give in and do * OFFPAGE for now. When I can allow for more dynamic slab sizes this will be * more complicated. * * Arguments * zone The zone we should initialize * * Returns * Nothing */ static void zone_large_init(uma_zone_t zone) { uma_keg_t keg; int pages; keg = zone->uz_keg; KASSERT(keg != NULL, ("Keg is null in zone_large_init")); KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0, ("zone_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY zone")); pages = keg->uk_size / UMA_SLAB_SIZE; /* Account for remainder */ if ((pages * UMA_SLAB_SIZE) < keg->uk_size) pages++; keg->uk_ppera = pages; keg->uk_ipers = 1; keg->uk_flags |= UMA_ZONE_OFFPAGE; if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) keg->uk_flags |= UMA_ZONE_HASH; keg->uk_rsize = keg->uk_size; } /* * Keg header ctor. This initializes all fields, locks, etc. And inserts * the keg onto the global keg list. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_kctor_args */ static int keg_ctor(void *mem, int size, void *udata, int flags) { struct uma_kctor_args *arg = udata; uma_keg_t keg = mem; uma_zone_t zone; bzero(keg, size); keg->uk_size = arg->size; keg->uk_init = arg->uminit; keg->uk_fini = arg->fini; keg->uk_align = arg->align; keg->uk_free = 0; keg->uk_pages = 0; keg->uk_flags = arg->flags; keg->uk_allocf = page_alloc; keg->uk_freef = page_free; keg->uk_recurse = 0; keg->uk_slabzone = NULL; /* * The master zone is passed to us at keg-creation time. */ zone = arg->zone; zone->uz_keg = keg; if (arg->flags & UMA_ZONE_VM) keg->uk_flags |= UMA_ZFLAG_CACHEONLY; if (arg->flags & UMA_ZONE_ZINIT) keg->uk_init = zero_init; /* * The +UMA_FRITM_SZ added to uk_size is to account for the * linkage that is added to the size in zone_small_init(). If * we don't account for this here then we may end up in * zone_small_init() with a calculated 'ipers' of 0. */ if (keg->uk_flags & UMA_ZONE_REFCNT) { if ((keg->uk_size+UMA_FRITMREF_SZ) > (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt))) zone_large_init(zone); else zone_small_init(zone); } else { if ((keg->uk_size+UMA_FRITM_SZ) > (UMA_SLAB_SIZE - sizeof(struct uma_slab))) zone_large_init(zone); else zone_small_init(zone); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) { if (keg->uk_flags & UMA_ZONE_REFCNT) keg->uk_slabzone = slabrefzone; else keg->uk_slabzone = slabzone; } /* * If we haven't booted yet we need allocations to go through the * startup cache until the vm is ready. */ if (keg->uk_ppera == 1) { #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = uma_small_alloc; keg->uk_freef = uma_small_free; #endif if (booted == 0) keg->uk_allocf = startup_alloc; } /* * Initialize keg's lock (shared among zones) through * Master zone */ zone->uz_lock = &keg->uk_lock; if (arg->flags & UMA_ZONE_MTXCLASS) ZONE_LOCK_INIT(zone, 1); else ZONE_LOCK_INIT(zone, 0); /* * If we're putting the slab header in the actual page we need to * figure out where in each page it goes. This calculates a right * justified offset into the memory on an ALIGN_PTR boundary. */ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) { u_int totsize; /* Size of the slab struct and free list */ if (keg->uk_flags & UMA_ZONE_REFCNT) totsize = sizeof(struct uma_slab_refcnt) + keg->uk_ipers * UMA_FRITMREF_SZ; else totsize = sizeof(struct uma_slab) + keg->uk_ipers * UMA_FRITM_SZ; if (totsize & UMA_ALIGN_PTR) totsize = (totsize & ~UMA_ALIGN_PTR) + (UMA_ALIGN_PTR + 1); keg->uk_pgoff = UMA_SLAB_SIZE - totsize; if (keg->uk_flags & UMA_ZONE_REFCNT) totsize = keg->uk_pgoff + sizeof(struct uma_slab_refcnt) + keg->uk_ipers * UMA_FRITMREF_SZ; else totsize = keg->uk_pgoff + sizeof(struct uma_slab) + keg->uk_ipers * UMA_FRITM_SZ; /* * The only way the following is possible is if with our * UMA_ALIGN_PTR adjustments we are now bigger than * UMA_SLAB_SIZE. I haven't checked whether this is * mathematically possible for all cases, so we make * sure here anyway. */ if (totsize > UMA_SLAB_SIZE) { printf("zone %s ipers %d rsize %d size %d\n", zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size); panic("UMA slab won't fit.\n"); } } if (keg->uk_flags & UMA_ZONE_HASH) hash_alloc(&keg->uk_hash); #ifdef UMA_DEBUG printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n", zone->uz_name, zone, keg->uk_size, keg->uk_ipers, keg->uk_ppera, keg->uk_pgoff); #endif LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); mtx_lock(&uma_mtx); LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); mtx_unlock(&uma_mtx); return (0); } /* * Zone header ctor. This initializes all fields, locks, etc. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_zctor_args */ static int zone_ctor(void *mem, int size, void *udata, int flags) { struct uma_zctor_args *arg = udata; uma_zone_t zone = mem; uma_zone_t z; uma_keg_t keg; bzero(zone, size); zone->uz_name = arg->name; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; zone->uz_init = NULL; zone->uz_fini = NULL; zone->uz_allocs = 0; zone->uz_frees = 0; zone->uz_fails = 0; zone->uz_fills = zone->uz_count = 0; if (arg->flags & UMA_ZONE_SECONDARY) { KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); keg = arg->keg; zone->uz_keg = keg; zone->uz_init = arg->uminit; zone->uz_fini = arg->fini; zone->uz_lock = &keg->uk_lock; mtx_lock(&uma_mtx); ZONE_LOCK(zone); keg->uk_flags |= UMA_ZONE_SECONDARY; LIST_FOREACH(z, &keg->uk_zones, uz_link) { if (LIST_NEXT(z, uz_link) == NULL) { LIST_INSERT_AFTER(z, zone, uz_link); break; } } ZONE_UNLOCK(zone); mtx_unlock(&uma_mtx); } else if (arg->keg == NULL) { if (uma_kcreate(zone, arg->size, arg->uminit, arg->fini, arg->align, arg->flags) == NULL) return (ENOMEM); } else { struct uma_kctor_args karg; int error; /* We should only be here from uma_startup() */ karg.size = arg->size; karg.uminit = arg->uminit; karg.fini = arg->fini; karg.align = arg->align; karg.flags = arg->flags; karg.zone = zone; error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, flags); if (error) return (error); } keg = zone->uz_keg; zone->uz_lock = &keg->uk_lock; /* * Some internal zones don't have room allocated for the per cpu * caches. If we're internal, bail out here. */ if (keg->uk_flags & UMA_ZFLAG_INTERNAL) { KASSERT((keg->uk_flags & UMA_ZONE_SECONDARY) == 0, ("Secondary zone requested UMA_ZFLAG_INTERNAL")); return (0); } if (keg->uk_flags & UMA_ZONE_MAXBUCKET) zone->uz_count = BUCKET_MAX; else if (keg->uk_ipers <= BUCKET_MAX) zone->uz_count = keg->uk_ipers; else zone->uz_count = BUCKET_MAX; return (0); } /* * Keg header dtor. This frees all data, destroys locks, frees the hash * table and removes the keg from the global list. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void keg_dtor(void *arg, int size, void *udata) { uma_keg_t keg; keg = (uma_keg_t)arg; mtx_lock(&keg->uk_lock); if (keg->uk_free != 0) { printf("Freed UMA keg was not empty (%d items). " " Lost %d pages of memory.\n", keg->uk_free, keg->uk_pages); } mtx_unlock(&keg->uk_lock); if (keg->uk_flags & UMA_ZONE_HASH) hash_free(&keg->uk_hash); mtx_destroy(&keg->uk_lock); } /* * Zone header dtor. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void zone_dtor(void *arg, int size, void *udata) { uma_zone_t zone; uma_keg_t keg; zone = (uma_zone_t)arg; keg = zone->uz_keg; if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone); mtx_lock(&uma_mtx); zone_drain(zone); if (keg->uk_flags & UMA_ZONE_SECONDARY) { LIST_REMOVE(zone, uz_link); /* * XXX there are some races here where * the zone can be drained but zone lock * released and then refilled before we * remove it... we dont care for now */ ZONE_LOCK(zone); if (LIST_EMPTY(&keg->uk_zones)) keg->uk_flags &= ~UMA_ZONE_SECONDARY; ZONE_UNLOCK(zone); mtx_unlock(&uma_mtx); } else { LIST_REMOVE(keg, uk_link); LIST_REMOVE(zone, uz_link); mtx_unlock(&uma_mtx); uma_zfree_internal(kegs, keg, NULL, SKIP_NONE, ZFREE_STATFREE); } zone->uz_keg = NULL; } /* * Traverses every zone in the system and calls a callback * * Arguments: * zfunc A pointer to a function which accepts a zone * as an argument. * * Returns: * Nothing */ static void zone_foreach(void (*zfunc)(uma_zone_t)) { uma_keg_t keg; uma_zone_t zone; mtx_lock(&uma_mtx); LIST_FOREACH(keg, &uma_kegs, uk_link) { LIST_FOREACH(zone, &keg->uk_zones, uz_link) zfunc(zone); } mtx_unlock(&uma_mtx); } /* Public functions */ /* See uma.h */ void uma_startup(void *bootmem, int boot_pages) { struct uma_zctor_args args; uma_slab_t slab; u_int slabsize; u_int objsize, totsize, wsize; int i; #ifdef UMA_DEBUG printf("Creating uma keg headers zone and keg.\n"); #endif mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF); /* * Figure out the maximum number of items-per-slab we'll have if * we're using the OFFPAGE slab header to track free items, given * all possible object sizes and the maximum desired wastage * (UMA_MAX_WASTE). * * We iterate until we find an object size for * which the calculated wastage in zone_small_init() will be * enough to warrant OFFPAGE. Since wastedspace versus objsize * is an overall increasing see-saw function, we find the smallest * objsize such that the wastage is always acceptable for objects * with that objsize or smaller. Since a smaller objsize always * generates a larger possible uma_max_ipers, we use this computed * objsize to calculate the largest ipers possible. Since the * ipers calculated for OFFPAGE slab headers is always larger than * the ipers initially calculated in zone_small_init(), we use * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to * obtain the maximum ipers possible for offpage slab headers. * * It should be noted that ipers versus objsize is an inversly * proportional function which drops off rather quickly so as * long as our UMA_MAX_WASTE is such that the objsize we calculate * falls into the portion of the inverse relation AFTER the steep * falloff, then uma_max_ipers shouldn't be too high (~10 on i386). * * Note that we have 8-bits (1 byte) to use as a freelist index * inside the actual slab header itself and this is enough to * accomodate us. In the worst case, a UMA_SMALLEST_UNIT sized * object with offpage slab header would have ipers = * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is * 1 greater than what our byte-integer freelist index can * accomodate, but we know that this situation never occurs as * for UMA_SMALLEST_UNIT-sized objects, we will never calculate * that we need to go to offpage slab headers. Or, if we do, * then we trap that condition below and panic in the INVARIANTS case. */ wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) - UMA_MAX_WASTE; totsize = wsize; objsize = UMA_SMALLEST_UNIT; while (totsize >= wsize) { totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / (objsize + UMA_FRITM_SZ); totsize *= (UMA_FRITM_SZ + objsize); objsize++; } if (objsize > UMA_SMALLEST_UNIT) objsize--; uma_max_ipers = UMA_SLAB_SIZE / objsize; wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - UMA_MAX_WASTE; totsize = wsize; objsize = UMA_SMALLEST_UNIT; while (totsize >= wsize) { totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) / (objsize + UMA_FRITMREF_SZ); totsize *= (UMA_FRITMREF_SZ + objsize); objsize++; } if (objsize > UMA_SMALLEST_UNIT) objsize--; uma_max_ipers_ref = UMA_SLAB_SIZE / objsize; KASSERT((uma_max_ipers_ref <= 255) && (uma_max_ipers <= 255), ("uma_startup: calculated uma_max_ipers values too large!")); #ifdef UMA_DEBUG printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers); printf("Calculated uma_max_ipers_slab (for OFFPAGE) is %d\n", uma_max_ipers_ref); #endif /* "manually" create the initial zone */ args.name = "UMA Kegs"; args.size = sizeof(struct uma_keg); args.ctor = keg_ctor; args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = &masterkeg; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK); #ifdef UMA_DEBUG printf("Filling boot free list.\n"); #endif for (i = 0; i < boot_pages; i++) { slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE)); slab->us_data = (u_int8_t *)slab; slab->us_flags = UMA_SLAB_BOOT; LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link); } mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF); #ifdef UMA_DEBUG printf("Creating uma zone headers zone and keg.\n"); #endif args.name = "UMA Zones"; args.size = sizeof(struct uma_zone) + (sizeof(struct uma_cache) * (mp_maxid + 1)); args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = NULL; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK); #ifdef UMA_DEBUG printf("Initializing pcpu cache locks.\n"); #endif #ifdef UMA_DEBUG printf("Creating slab and hash zones.\n"); #endif /* * This is the max number of free list items we'll have with * offpage slabs. */ slabsize = uma_max_ipers * UMA_FRITM_SZ; slabsize += sizeof(struct uma_slab); /* Now make a zone for slab headers */ slabzone = uma_zcreate("UMA Slabs", slabsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); /* * We also create a zone for the bigger slabs with reference * counts in them, to accomodate UMA_ZONE_REFCNT zones. */ slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ; slabsize += sizeof(struct uma_slab_refcnt); slabrefzone = uma_zcreate("UMA RCntSlabs", slabsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); bucket_init(); #ifdef UMA_MD_SMALL_ALLOC booted = 1; #endif #ifdef UMA_DEBUG printf("UMA startup complete.\n"); #endif } /* see uma.h */ void uma_startup2(void) { booted = 1; bucket_enable(); #ifdef UMA_DEBUG printf("UMA startup2 complete.\n"); #endif } /* * Initialize our callout handle * */ static void uma_startup3(void) { #ifdef UMA_DEBUG printf("Starting callout.\n"); #endif callout_init(&uma_callout, CALLOUT_MPSAFE); callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); #ifdef UMA_DEBUG printf("UMA startup3 complete.\n"); #endif } static uma_zone_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, u_int32_t flags) { struct uma_kctor_args args; args.size = size; args.uminit = uminit; args.fini = fini; args.align = align; args.flags = flags; args.zone = zone; return (uma_zalloc_internal(kegs, &args, M_WAITOK)); } /* See uma.h */ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, u_int32_t flags) { struct uma_zctor_args args; /* This stuff is essential for the zone ctor */ args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = uminit; args.fini = fini; args.align = align; args.flags = flags; args.keg = NULL; return (uma_zalloc_internal(zones, &args, M_WAITOK)); } /* See uma.h */ uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t master) { struct uma_zctor_args args; args.name = name; args.size = master->uz_keg->uk_size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.align = master->uz_keg->uk_align; args.flags = master->uz_keg->uk_flags | UMA_ZONE_SECONDARY; args.keg = master->uz_keg; return (uma_zalloc_internal(zones, &args, M_WAITOK)); } /* See uma.h */ void uma_zdestroy(uma_zone_t zone) { uma_zfree_internal(zones, zone, NULL, SKIP_NONE, ZFREE_STATFREE); } /* See uma.h */ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { void *item; uma_cache_t cache; uma_bucket_t bucket; int cpu; int badness; /* This is the fast path allocation */ #ifdef UMA_DEBUG_ALLOC_1 printf("Allocating one item from %s(%p)\n", zone->uz_name, zone); #endif CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread, zone->uz_name, flags); if (!(flags & M_NOWAIT)) { KASSERT(curthread->td_intr_nesting_level == 0, ("malloc(M_WAITOK) in interrupt context")); if (nosleepwithlocks) { #ifdef WITNESS badness = WITNESS_CHECK(WARN_GIANTOK | WARN_SLEEPOK, NULL, "malloc(M_WAITOK) of \"%s\", forcing M_NOWAIT", zone->uz_name); #else badness = 1; #endif } else { badness = 0; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "malloc(M_WAITOK) of \"%s\"", zone->uz_name); } if (badness) { flags &= ~M_WAITOK; flags |= M_NOWAIT; } } /* * If possible, allocate from the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to allocate from * the current cache; when we re-acquire the critical section, we * must detect and handle migration if it has occurred. */ zalloc_restart: critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; zalloc_start: bucket = cache->uc_allocbucket; if (bucket) { if (bucket->ub_cnt > 0) { bucket->ub_cnt--; item = bucket->ub_bucket[bucket->ub_cnt]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt] = NULL; #endif KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); cache->uc_allocs++; critical_exit(); #ifdef INVARIANTS ZONE_LOCK(zone); uma_dbg_alloc(zone, NULL, item); ZONE_UNLOCK(zone); #endif if (zone->uz_ctor != NULL) { if (zone->uz_ctor(item, zone->uz_keg->uk_size, udata, flags) != 0) { uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFAIL | ZFREE_STATFREE); return (NULL); } } if (flags & M_ZERO) bzero(item, zone->uz_keg->uk_size); return (item); } else if (cache->uc_freebucket) { /* * We have run out of items in our allocbucket. * See if we can switch with our free bucket. */ if (cache->uc_freebucket->ub_cnt > 0) { #ifdef UMA_DEBUG_ALLOC printf("uma_zalloc: Swapping empty with" " alloc.\n"); #endif bucket = cache->uc_freebucket; cache->uc_freebucket = cache->uc_allocbucket; cache->uc_allocbucket = bucket; goto zalloc_start; } } } /* * Attempt to retrieve the item from the per-CPU cache has failed, so * we must go back to the zone. This requires the zone lock, so we * must drop the critical section, then re-acquire it when we go back * to the cache. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ critical_exit(); ZONE_LOCK(zone); critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; bucket = cache->uc_allocbucket; if (bucket != NULL) { if (bucket->ub_cnt > 0) { ZONE_UNLOCK(zone); goto zalloc_start; } bucket = cache->uc_freebucket; if (bucket != NULL && bucket->ub_cnt > 0) { ZONE_UNLOCK(zone); goto zalloc_start; } } /* Since we have locked the zone we may as well send back our stats */ zone->uz_allocs += cache->uc_allocs; cache->uc_allocs = 0; zone->uz_frees += cache->uc_frees; cache->uc_frees = 0; /* Our old one is now a free bucket */ if (cache->uc_allocbucket) { KASSERT(cache->uc_allocbucket->ub_cnt == 0, ("uma_zalloc_arg: Freeing a non free bucket.")); LIST_INSERT_HEAD(&zone->uz_free_bucket, cache->uc_allocbucket, ub_link); cache->uc_allocbucket = NULL; } /* Check the free list for a new alloc bucket */ if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) { KASSERT(bucket->ub_cnt != 0, ("uma_zalloc_arg: Returning an empty bucket.")); LIST_REMOVE(bucket, ub_link); cache->uc_allocbucket = bucket; ZONE_UNLOCK(zone); goto zalloc_start; } /* We are no longer associated with this CPU. */ critical_exit(); /* Bump up our uz_count so we get here less */ if (zone->uz_count < BUCKET_MAX) zone->uz_count++; /* * Now lets just fill a bucket and put it on the free list. If that * works we'll restart the allocation from the begining. */ if (uma_zalloc_bucket(zone, flags)) { ZONE_UNLOCK(zone); goto zalloc_restart; } ZONE_UNLOCK(zone); /* * We may not be able to get a bucket so return an actual item. */ #ifdef UMA_DEBUG printf("uma_zalloc_arg: Bucketzone returned NULL\n"); #endif return (uma_zalloc_internal(zone, udata, flags)); } static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags) { uma_slab_t slab; uma_keg_t keg; keg = zone->uz_keg; /* * This is to prevent us from recursively trying to allocate * buckets. The problem is that if an allocation forces us to * grab a new bucket we will call page_alloc, which will go off * and cause the vm to allocate vm_map_entries. If we need new * buckets there too we will recurse in kmem_alloc and bad * things happen. So instead we return a NULL bucket, and make * the code that allocates buckets smart enough to deal with it * * XXX: While we want this protection for the bucket zones so that * recursion from the VM is handled (and the calling code that * allocates buckets knows how to deal with it), we do not want * to prevent allocation from the slab header zones (slabzone * and slabrefzone) if uk_recurse is not zero for them. The * reason is that it could lead to NULL being returned for * slab header allocations even in the M_WAITOK case, and the * caller can't handle that. */ if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0) if ((zone != slabzone) && (zone != slabrefzone)) return (NULL); slab = NULL; for (;;) { /* * Find a slab with some space. Prefer slabs that are partially * used over those that are totally full. This helps to reduce * fragmentation. */ if (keg->uk_free != 0) { if (!LIST_EMPTY(&keg->uk_part_slab)) { slab = LIST_FIRST(&keg->uk_part_slab); } else { slab = LIST_FIRST(&keg->uk_free_slab); LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); } return (slab); } /* * M_NOVM means don't ask at all! */ if (flags & M_NOVM) break; if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) { keg->uk_flags |= UMA_ZFLAG_FULL; if (flags & M_NOWAIT) break; else msleep(keg, &keg->uk_lock, PVM, "zonelimit", 0); continue; } keg->uk_recurse++; slab = slab_zalloc(zone, flags); keg->uk_recurse--; /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove * at least one item. */ if (slab) { LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); return (slab); } /* * We might not have been able to get a slab but another cpu * could have while we were unlocked. Check again before we * fail. */ if (flags & M_NOWAIT) flags |= M_NOVM; } return (slab); } static void * uma_slab_alloc(uma_zone_t zone, uma_slab_t slab) { uma_keg_t keg; uma_slabrefcnt_t slabref; void *item; u_int8_t freei; keg = zone->uz_keg; freei = slab->us_firstfree; if (keg->uk_flags & UMA_ZONE_REFCNT) { slabref = (uma_slabrefcnt_t)slab; slab->us_firstfree = slabref->us_freelist[freei].us_item; } else { slab->us_firstfree = slab->us_freelist[freei].us_item; } item = slab->us_data + (keg->uk_rsize * freei); slab->us_freecount--; keg->uk_free--; #ifdef INVARIANTS uma_dbg_alloc(zone, slab, item); #endif /* Move this slab to the full list */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link); } return (item); } static int uma_zalloc_bucket(uma_zone_t zone, int flags) { uma_bucket_t bucket; uma_slab_t slab; int16_t saved; int max, origflags = flags; /* * Try this zone's free list first so we don't allocate extra buckets. */ if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { KASSERT(bucket->ub_cnt == 0, ("uma_zalloc_bucket: Bucket on free list is not empty.")); LIST_REMOVE(bucket, ub_link); } else { int bflags; bflags = (flags & ~M_ZERO); if (zone->uz_keg->uk_flags & UMA_ZFLAG_CACHEONLY) bflags |= M_NOVM; ZONE_UNLOCK(zone); bucket = bucket_alloc(zone->uz_count, bflags); ZONE_LOCK(zone); } if (bucket == NULL) return (0); #ifdef SMP /* * This code is here to limit the number of simultaneous bucket fills * for any given zone to the number of per cpu caches in this zone. This * is done so that we don't allocate more memory than we really need. */ if (zone->uz_fills >= mp_ncpus) goto done; #endif zone->uz_fills++; max = MIN(bucket->ub_entries, zone->uz_count); /* Try to keep the buckets totally full */ saved = bucket->ub_cnt; while (bucket->ub_cnt < max && (slab = uma_zone_slab(zone, flags)) != NULL) { while (slab->us_freecount && bucket->ub_cnt < max) { bucket->ub_bucket[bucket->ub_cnt++] = uma_slab_alloc(zone, slab); } /* Don't block on the next fill */ flags |= M_NOWAIT; } /* * We unlock here because we need to call the zone's init. * It should be safe to unlock because the slab dealt with * above is already on the appropriate list within the keg * and the bucket we filled is not yet on any list, so we * own it. */ if (zone->uz_init != NULL) { int i; ZONE_UNLOCK(zone); for (i = saved; i < bucket->ub_cnt; i++) if (zone->uz_init(bucket->ub_bucket[i], zone->uz_keg->uk_size, origflags) != 0) break; /* * If we couldn't initialize the whole bucket, put the * rest back onto the freelist. */ if (i != bucket->ub_cnt) { int j; for (j = i; j < bucket->ub_cnt; j++) { uma_zfree_internal(zone, bucket->ub_bucket[j], NULL, SKIP_FINI, 0); #ifdef INVARIANTS bucket->ub_bucket[j] = NULL; #endif } bucket->ub_cnt = i; } ZONE_LOCK(zone); } zone->uz_fills--; if (bucket->ub_cnt != 0) { LIST_INSERT_HEAD(&zone->uz_full_bucket, bucket, ub_link); return (1); } #ifdef SMP done: #endif bucket_free(bucket); return (0); } /* * Allocates an item for an internal zone * * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns * NULL if there is no memory and M_NOWAIT is set * An item if successful */ static void * uma_zalloc_internal(uma_zone_t zone, void *udata, int flags) { uma_keg_t keg; uma_slab_t slab; void *item; item = NULL; keg = zone->uz_keg; #ifdef UMA_DEBUG_ALLOC printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone); #endif ZONE_LOCK(zone); slab = uma_zone_slab(zone, flags); if (slab == NULL) { zone->uz_fails++; ZONE_UNLOCK(zone); return (NULL); } item = uma_slab_alloc(zone, slab); zone->uz_allocs++; ZONE_UNLOCK(zone); /* * We have to call both the zone's init (not the keg's init) * and the zone's ctor. This is because the item is going from * a keg slab directly to the user, and the user is expecting it * to be both zone-init'd as well as zone-ctor'd. */ if (zone->uz_init != NULL) { if (zone->uz_init(item, keg->uk_size, flags) != 0) { uma_zfree_internal(zone, item, udata, SKIP_FINI, ZFREE_STATFAIL | ZFREE_STATFREE); return (NULL); } } if (zone->uz_ctor != NULL) { if (zone->uz_ctor(item, keg->uk_size, udata, flags) != 0) { uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFAIL | ZFREE_STATFREE); return (NULL); } } if (flags & M_ZERO) bzero(item, keg->uk_size); return (item); } /* See uma.h */ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_keg_t keg; uma_cache_t cache; uma_bucket_t bucket; int bflags; int cpu; keg = zone->uz_keg; #ifdef UMA_DEBUG_ALLOC_1 printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone); #endif CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread, zone->uz_name); if (zone->uz_dtor) zone->uz_dtor(item, keg->uk_size, udata); #ifdef INVARIANTS ZONE_LOCK(zone); if (keg->uk_flags & UMA_ZONE_MALLOC) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); ZONE_UNLOCK(zone); #endif /* * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. */ if (keg->uk_flags & UMA_ZFLAG_FULL) goto zfree_internal; /* * If possible, free to the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to free to the * current cache; when we re-acquire the critical section, we must * detect and handle migration if it has occurred. */ zfree_restart: critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; zfree_start: bucket = cache->uc_freebucket; if (bucket) { /* * Do we have room in our bucket? It is OK for this uz count * check to be slightly out of sync. */ if (bucket->ub_cnt < bucket->ub_entries) { KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL, ("uma_zfree: Freeing to non free bucket index.")); bucket->ub_bucket[bucket->ub_cnt] = item; bucket->ub_cnt++; cache->uc_frees++; critical_exit(); return; } else if (cache->uc_allocbucket) { #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Swapping buckets.\n"); #endif /* * We have run out of space in our freebucket. * See if we can switch with our alloc bucket. */ if (cache->uc_allocbucket->ub_cnt < cache->uc_freebucket->ub_cnt) { bucket = cache->uc_freebucket; cache->uc_freebucket = cache->uc_allocbucket; cache->uc_allocbucket = bucket; goto zfree_start; } } } /* * We can get here for two reasons: * * 1) The buckets are NULL * 2) The alloc and free buckets are both somewhat full. * * We must go back the zone, which requires acquiring the zone lock, * which in turn means we must release and re-acquire the critical * section. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ critical_exit(); ZONE_LOCK(zone); critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; if (cache->uc_freebucket != NULL) { if (cache->uc_freebucket->ub_cnt < cache->uc_freebucket->ub_entries) { ZONE_UNLOCK(zone); goto zfree_start; } if (cache->uc_allocbucket != NULL && (cache->uc_allocbucket->ub_cnt < cache->uc_freebucket->ub_cnt)) { ZONE_UNLOCK(zone); goto zfree_start; } } /* Since we have locked the zone we may as well send back our stats */ zone->uz_allocs += cache->uc_allocs; cache->uc_allocs = 0; zone->uz_frees += cache->uc_frees; cache->uc_frees = 0; bucket = cache->uc_freebucket; cache->uc_freebucket = NULL; /* Can we throw this on the zone full list? */ if (bucket != NULL) { #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Putting old bucket on the free list.\n"); #endif /* ub_cnt is pointing to the last free item */ KASSERT(bucket->ub_cnt != 0, ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n")); LIST_INSERT_HEAD(&zone->uz_full_bucket, bucket, ub_link); } if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); ZONE_UNLOCK(zone); cache->uc_freebucket = bucket; goto zfree_start; } /* We are no longer associated with this CPU. */ critical_exit(); /* And the zone.. */ ZONE_UNLOCK(zone); #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Allocating new free bucket.\n"); #endif bflags = M_NOWAIT; if (keg->uk_flags & UMA_ZFLAG_CACHEONLY) bflags |= M_NOVM; bucket = bucket_alloc(zone->uz_count, bflags); if (bucket) { ZONE_LOCK(zone); LIST_INSERT_HEAD(&zone->uz_free_bucket, bucket, ub_link); ZONE_UNLOCK(zone); goto zfree_restart; } /* * If nothing else caught this, we'll just do an internal free. */ zfree_internal: uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE); return; } /* * Frees an item to an INTERNAL zone or allocates a free bucket * * Arguments: * zone The zone to free to * item The item we're freeing * udata User supplied data for the dtor * skip Skip dtors and finis */ static void uma_zfree_internal(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip, int flags) { uma_slab_t slab; uma_slabrefcnt_t slabref; uma_keg_t keg; u_int8_t *mem; u_int8_t freei; keg = zone->uz_keg; if (skip < SKIP_DTOR && zone->uz_dtor) zone->uz_dtor(item, keg->uk_size, udata); if (skip < SKIP_FINI && zone->uz_fini) zone->uz_fini(item, keg->uk_size); ZONE_LOCK(zone); if (flags & ZFREE_STATFAIL) zone->uz_fails++; if (flags & ZFREE_STATFREE) zone->uz_frees++; if (!(keg->uk_flags & UMA_ZONE_MALLOC)) { mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK)); if (keg->uk_flags & UMA_ZONE_HASH) slab = hash_sfind(&keg->uk_hash, mem); else { mem += keg->uk_pgoff; slab = (uma_slab_t)mem; } } else { slab = (uma_slab_t)udata; } /* Do we need to remove from any lists? */ if (slab->us_freecount+1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); } /* Slab management stuff */ freei = ((unsigned long)item - (unsigned long)slab->us_data) / keg->uk_rsize; #ifdef INVARIANTS if (!skip) uma_dbg_free(zone, slab, item); #endif if (keg->uk_flags & UMA_ZONE_REFCNT) { slabref = (uma_slabrefcnt_t)slab; slabref->us_freelist[freei].us_item = slab->us_firstfree; } else { slab->us_freelist[freei].us_item = slab->us_firstfree; } slab->us_firstfree = freei; slab->us_freecount++; /* Zone statistics */ keg->uk_free++; if (keg->uk_flags & UMA_ZFLAG_FULL) { if (keg->uk_pages < keg->uk_maxpages) keg->uk_flags &= ~UMA_ZFLAG_FULL; /* We can handle one more allocation */ wakeup_one(keg); } ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_max(uma_zone_t zone, int nitems) { uma_keg_t keg; keg = zone->uz_keg; ZONE_LOCK(zone); if (keg->uk_ppera > 1) keg->uk_maxpages = nitems * keg->uk_ppera; else keg->uk_maxpages = nitems / keg->uk_ipers; if (keg->uk_maxpages * keg->uk_ipers < nitems) keg->uk_maxpages++; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_init on non-empty keg")); zone->uz_keg->uk_init = uminit; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_fini(uma_zone_t zone, uma_fini fini) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_fini on non-empty keg")); zone->uz_keg->uk_fini = fini; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_zinit on non-empty keg")); zone->uz_init = zinit; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_zfini on non-empty keg")); zone->uz_fini = zfini; ZONE_UNLOCK(zone); } /* See uma.h */ /* XXX uk_freef is not actually used with the zone locked */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef) { ZONE_LOCK(zone); zone->uz_keg->uk_freef = freef; ZONE_UNLOCK(zone); } /* See uma.h */ /* XXX uk_allocf is not actually used with the zone locked */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { ZONE_LOCK(zone); zone->uz_keg->uk_flags |= UMA_ZFLAG_PRIVALLOC; zone->uz_keg->uk_allocf = allocf; ZONE_UNLOCK(zone); } /* See uma.h */ int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count) { uma_keg_t keg; vm_offset_t kva; int pages; keg = zone->uz_keg; pages = count / keg->uk_ipers; if (pages * keg->uk_ipers < count) pages++; kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE); if (kva == 0) return (0); if (obj == NULL) { obj = vm_object_allocate(OBJT_DEFAULT, pages); } else { VM_OBJECT_LOCK_INIT(obj, "uma object"); _vm_object_allocate(OBJT_DEFAULT, pages, obj); } ZONE_LOCK(zone); keg->uk_kva = kva; keg->uk_obj = obj; keg->uk_maxpages = pages; keg->uk_allocf = obj_alloc; keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC; ZONE_UNLOCK(zone); return (1); } /* See uma.h */ void uma_prealloc(uma_zone_t zone, int items) { int slabs; uma_slab_t slab; uma_keg_t keg; keg = zone->uz_keg; ZONE_LOCK(zone); slabs = items / keg->uk_ipers; if (slabs * keg->uk_ipers < items) slabs++; while (slabs > 0) { slab = slab_zalloc(zone, M_WAITOK); LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); slabs--; } ZONE_UNLOCK(zone); } /* See uma.h */ u_int32_t * uma_find_refcnt(uma_zone_t zone, void *item) { uma_slabrefcnt_t slabref; uma_keg_t keg; u_int32_t *refcnt; int idx; keg = zone->uz_keg; slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK)); KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT, ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT")); idx = ((unsigned long)item - (unsigned long)slabref->us_data) / keg->uk_rsize; refcnt = &slabref->us_freelist[idx].us_refcnt; return refcnt; } /* See uma.h */ void uma_reclaim(void) { #ifdef UMA_DEBUG printf("UMA: vm asked us to release pages!\n"); #endif bucket_enable(); zone_foreach(zone_drain); /* * Some slabs may have been freed but this zone will be visited early * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ zone_drain(slabzone); zone_drain(slabrefzone); bucket_zone_drain(); } +/* See uma.h */ +int +uma_zone_exhausted(uma_zone_t zone) +{ + int full; + + ZONE_LOCK(zone); + full = (zone->uz_keg->uk_flags & UMA_ZFLAG_FULL); + ZONE_UNLOCK(zone); + return (full); +} + void * uma_large_malloc(int size, int wait) { void *mem; uma_slab_t slab; u_int8_t flags; slab = uma_zalloc_internal(slabzone, NULL, wait); if (slab == NULL) return (NULL); mem = page_alloc(NULL, size, &flags, wait); if (mem) { vsetslab((vm_offset_t)mem, slab); slab->us_data = mem; slab->us_flags = flags | UMA_SLAB_MALLOC; slab->us_size = size; } else { uma_zfree_internal(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFAIL | ZFREE_STATFREE); } return (mem); } void uma_large_free(uma_slab_t slab) { vsetobj((vm_offset_t)slab->us_data, kmem_object); page_free(slab->us_data, slab->us_size, slab->us_flags); uma_zfree_internal(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); } void uma_print_stats(void) { zone_foreach(uma_print_zone); } static void slab_print(uma_slab_t slab) { printf("slab: keg %p, data %p, freecount %d, firstfree %d\n", slab->us_keg, slab->us_data, slab->us_freecount, slab->us_firstfree); } static void cache_print(uma_cache_t cache) { printf("alloc: %p(%d), free: %p(%d)\n", cache->uc_allocbucket, cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0, cache->uc_freebucket, cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0); } void uma_print_zone(uma_zone_t zone) { uma_cache_t cache; uma_keg_t keg; uma_slab_t slab; int i; keg = zone->uz_keg; printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n", zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags, keg->uk_ipers, keg->uk_ppera, (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free); printf("Part slabs:\n"); LIST_FOREACH(slab, &keg->uk_part_slab, us_link) slab_print(slab); printf("Free slabs:\n"); LIST_FOREACH(slab, &keg->uk_free_slab, us_link) slab_print(slab); printf("Full slabs:\n"); LIST_FOREACH(slab, &keg->uk_full_slab, us_link) slab_print(slab); for (i = 0; i <= mp_maxid; i++) { if (CPU_ABSENT(i)) continue; cache = &zone->uz_cpu[i]; printf("CPU %d Cache:\n", i); cache_print(cache); } } #ifdef DDB /* * Generate statistics across both the zone and its per-cpu cache's. Return * desired statistics if the pointer is non-NULL for that statistic. * * Note: does not update the zone statistics, as it can't safely clear the * per-CPU cache statistic. * * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't * safe from off-CPU; we should modify the caches to track this information * directly so that we don't have to. */ static void uma_zone_sumstat(uma_zone_t z, int *cachefreep, u_int64_t *allocsp, u_int64_t *freesp) { uma_cache_t cache; u_int64_t allocs, frees; int cachefree, cpu; allocs = frees = 0; cachefree = 0; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; cache = &z->uz_cpu[cpu]; if (cache->uc_allocbucket != NULL) cachefree += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) cachefree += cache->uc_freebucket->ub_cnt; allocs += cache->uc_allocs; frees += cache->uc_frees; } allocs += z->uz_allocs; frees += z->uz_frees; if (cachefreep != NULL) *cachefreep = cachefree; if (allocsp != NULL) *allocsp = allocs; if (freesp != NULL) *freesp = frees; } #endif /* DDB */ static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) { uma_keg_t kz; uma_zone_t z; int count; count = 0; mtx_lock(&uma_mtx); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } mtx_unlock(&uma_mtx); return (sysctl_handle_int(oidp, &count, 0, req)); } static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) { struct uma_stream_header ush; struct uma_type_header uth; struct uma_percpu_stat ups; uma_bucket_t bucket; struct sbuf sbuf; uma_cache_t cache; uma_keg_t kz; uma_zone_t z; char *buffer; int buflen, count, error, i; mtx_lock(&uma_mtx); restart: mtx_assert(&uma_mtx, MA_OWNED); count = 0; LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } mtx_unlock(&uma_mtx); buflen = sizeof(ush) + count * (sizeof(uth) + sizeof(ups) * (mp_maxid + 1)) + 1; buffer = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); mtx_lock(&uma_mtx); i = 0; LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) i++; } if (i > count) { free(buffer, M_TEMP); goto restart; } count = i; sbuf_new(&sbuf, buffer, buflen, SBUF_FIXEDLEN); /* * Insert stream header. */ bzero(&ush, sizeof(ush)); ush.ush_version = UMA_STREAM_VERSION; ush.ush_maxcpus = (mp_maxid + 1); ush.ush_count = count; if (sbuf_bcat(&sbuf, &ush, sizeof(ush)) < 0) { mtx_unlock(&uma_mtx); error = ENOMEM; goto out; } LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { bzero(&uth, sizeof(uth)); ZONE_LOCK(z); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_align = kz->uk_align; uth.uth_pages = kz->uk_pages; uth.uth_keg_free = kz->uk_free; uth.uth_size = kz->uk_size; uth.uth_rsize = kz->uk_rsize; uth.uth_maxpages = kz->uk_maxpages; if (kz->uk_ppera > 1) uth.uth_limit = kz->uk_maxpages / kz->uk_ppera; else uth.uth_limit = kz->uk_maxpages * kz->uk_ipers; /* * A zone is secondary is it is not the first entry * on the keg's zone list. */ if ((kz->uk_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z)) uth.uth_zone_flags = UTH_ZONE_SECONDARY; LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) uth.uth_zone_free += bucket->ub_cnt; uth.uth_allocs = z->uz_allocs; uth.uth_frees = z->uz_frees; uth.uth_fails = z->uz_fails; if (sbuf_bcat(&sbuf, &uth, sizeof(uth)) < 0) { ZONE_UNLOCK(z); mtx_unlock(&uma_mtx); error = ENOMEM; goto out; } /* * While it is not normally safe to access the cache * bucket pointers while not on the CPU that owns the * cache, we only allow the pointers to be exchanged * without the zone lock held, not invalidated, so * accept the possible race associated with bucket * exchange during monitoring. */ for (i = 0; i < (mp_maxid + 1); i++) { bzero(&ups, sizeof(ups)); if (kz->uk_flags & UMA_ZFLAG_INTERNAL) goto skip; if (CPU_ABSENT(i)) goto skip; cache = &z->uz_cpu[i]; if (cache->uc_allocbucket != NULL) ups.ups_cache_free += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) ups.ups_cache_free += cache->uc_freebucket->ub_cnt; ups.ups_allocs = cache->uc_allocs; ups.ups_frees = cache->uc_frees; skip: if (sbuf_bcat(&sbuf, &ups, sizeof(ups)) < 0) { ZONE_UNLOCK(z); mtx_unlock(&uma_mtx); error = ENOMEM; goto out; } } ZONE_UNLOCK(z); } } mtx_unlock(&uma_mtx); sbuf_finish(&sbuf); error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf)); out: free(buffer, M_TEMP); return (error); } #ifdef DDB DB_SHOW_COMMAND(uma, db_show_uma) { u_int64_t allocs, frees; uma_bucket_t bucket; uma_keg_t kz; uma_zone_t z; int cachefree; db_printf("%18s %8s %8s %8s %12s\n", "Zone", "Size", "Used", "Free", "Requests"); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { allocs = z->uz_allocs; frees = z->uz_frees; cachefree = 0; } else uma_zone_sumstat(z, &cachefree, &allocs, &frees); if (!((kz->uk_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z))) cachefree += kz->uk_free; LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) cachefree += bucket->ub_cnt; db_printf("%18s %8ju %8jd %8d %12ju\n", z->uz_name, (uintmax_t)kz->uk_size, (intmax_t)(allocs - frees), cachefree, (uintmax_t)allocs); } } } #endif