Index: lib/libmemstat/memstat_uma.c
===================================================================
--- lib/libmemstat/memstat_uma.c
+++ lib/libmemstat/memstat_uma.c
@@ -448,12 +448,7 @@
 		mtp->mt_memalloced = mtp->mt_numallocs * mtp->mt_size;
 		mtp->mt_memfreed = mtp->mt_numfrees * mtp->mt_size;
 		mtp->mt_bytes = mtp->mt_memalloced - mtp->mt_memfreed;
-		if (kz.uk_ppera > 1)
-			mtp->mt_countlimit = kz.uk_maxpages /
-			    kz.uk_ipers;
-		else
-			mtp->mt_countlimit = kz.uk_maxpages *
-			    kz.uk_ipers;
+		mtp->mt_countlimit = uz.uz_maxitems;
 		mtp->mt_byteslimit = mtp->mt_countlimit * mtp->mt_size;
 		mtp->mt_count = mtp->mt_numallocs - mtp->mt_numfrees;
 		for (i = 0; i < ndomains; i++) {
Index: sys/cam/cam_periph.c
===================================================================
--- sys/cam/cam_periph.c
+++ sys/cam/cam_periph.c
@@ -936,7 +936,7 @@
 		/*
 		 * Get the buffer.
 		 */
-		mapinfo->bp[i] = getpbuf(NULL);
+		mapinfo->bp[i] = uma_zalloc(pbuf_zone, M_WAITOK);

 		/* put our pointer in the data slot */
 		mapinfo->bp[i]->b_data = *data_ptrs[i];
@@ -962,9 +962,9 @@
 			for (j = 0; j < i; ++j) {
 				*data_ptrs[j] = mapinfo->bp[j]->b_caller1;
 				vunmapbuf(mapinfo->bp[j]);
-				relpbuf(mapinfo->bp[j], NULL);
+				uma_zfree(pbuf_zone, mapinfo->bp[j]);
 			}
-			relpbuf(mapinfo->bp[i], NULL);
+			uma_zfree(pbuf_zone, mapinfo->bp[i]);
 			PRELE(curproc);
 			return(EACCES);
 		}
@@ -1052,7 +1052,7 @@
 		vunmapbuf(mapinfo->bp[i]);

 		/* release the buffer */
-		relpbuf(mapinfo->bp[i], NULL);
+		uma_zfree(pbuf_zone, mapinfo->bp[i]);
 	}

 	/* allow ourselves to be swapped once again */
Index: sys/dev/md/md.c
===================================================================
--- sys/dev/md/md.c
+++ sys/dev/md/md.c
@@ -231,7 +231,7 @@
 #define NMASK	(NINDIR-1)
 static int nshift;

-static int md_vnode_pbuf_freecnt;
+static uma_zone_t md_pbuf_zone;

 struct indir {
 	uintptr_t	*array;
@@ -962,7 +962,7 @@
 		auio.uio_iovcnt = piov - auio.uio_iov;
 		piov = auio.uio_iov;
 	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
-		pb = getpbuf(&md_vnode_pbuf_freecnt);
+		pb = uma_zalloc(md_pbuf_zone, M_WAITOK);
 		bp->bio_resid = len;
 unmapped_step:
 		npages = atop(min(MAXPHYS, round_page(len + (ma_offs &
@@ -1011,7 +1011,7 @@
 			if (len > 0)
 				goto unmapped_step;
 		}
-		relpbuf(pb, &md_vnode_pbuf_freecnt);
+		uma_zfree(md_pbuf_zone, pb);
 	}

 	free(piov, M_MD);
@@ -2105,7 +2105,9 @@
 			sx_xunlock(&md_sx);
 		}
 	}
-	md_vnode_pbuf_freecnt = nswbuf / 10;
+	md_pbuf_zone = uma_zsecond_create("mdpbuf", pbuf_ctor, pbuf_dtor,
+	    pbuf_init, NULL, pbuf_zone);
+	uma_zone_set_max(md_pbuf_zone, nswbuf / 10);
 	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
 	    0600, MDCTL_NAME);
 	g_topology_lock();
@@ -2198,5 +2200,6 @@
 	sx_destroy(&md_sx);
 	if (status_dev != NULL)
 		destroy_dev(status_dev);
+	uma_zdestroy(md_pbuf_zone);
 	delete_unrhdr(md_uh);
 }
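The md(4) hunks above show the conversion pattern this patch applies to every pbuf consumer: the old per-subsystem free counter becomes a secondary UMA zone derived from the shared pbuf_zone master, and the old budget is carried over with uma_zone_set_max(). A minimal sketch of that pattern, with hypothetical "foo" names:

	static uma_zone_t foo_pbuf_zone;

	static void
	foo_init(void)
	{
		/* Secondary zone: shares pbuf_zone's items and their KVA. */
		foo_pbuf_zone = uma_zsecond_create("foopbuf", pbuf_ctor,
		    pbuf_dtor, pbuf_init, NULL, pbuf_zone);
		/* Reproduce the old per-subsystem pbuf budget. */
		uma_zone_set_max(foo_pbuf_zone, nswbuf / 10);
	}

	static void
	foo_uninit(void)
	{
		uma_zdestroy(foo_pbuf_zone);
	}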
Index: sys/dev/nvme/nvme_ctrlr.c
===================================================================
--- sys/dev/nvme/nvme_ctrlr.c
+++ sys/dev/nvme/nvme_ctrlr.c
@@ -1052,7 +1052,7 @@
 	 * this passthrough command.
 	 */
 	PHOLD(curproc);
-	buf = getpbuf(NULL);
+	buf = uma_zalloc(pbuf_zone, M_WAITOK);
 	buf->b_data = pt->buf;
 	buf->b_bufsize = pt->len;
 	buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
@@ -1099,7 +1099,7 @@
 err:
 	if (buf != NULL) {
-		relpbuf(buf, NULL);
+		uma_zfree(pbuf_zone, buf);
 		PRELE(curproc);
 	}
Index: sys/fs/fuse/fuse_main.c
===================================================================
--- sys/fs/fuse/fuse_main.c
+++ sys/fs/fuse/fuse_main.c
@@ -84,7 +84,7 @@
 extern struct vfsops fuse_vfsops;
 extern struct cdevsw fuse_cdevsw;
 extern struct vop_vector fuse_vnops;
-extern int fuse_pbuf_freecnt;
+extern uma_zone_t fuse_pbuf_zone;

 static struct vfsconf fuse_vfsconf = {
 	.vfc_version = VFS_VERSION,
@@ -122,7 +122,6 @@
 	switch (what) {
 	case MOD_LOAD:			/* kldload */
-		fuse_pbuf_freecnt = nswbuf / 2 + 1;
 		mtx_init(&fuse_mtx, "fuse_mtx", NULL, MTX_DEF);
 		err = fuse_device_init();
 		if (err) {
@@ -130,6 +129,9 @@
 			return (err);
 		}
 		fuse_ipc_init();
+		fuse_pbuf_zone = uma_zsecond_create("fusepbuf", pbuf_ctor,
+		    pbuf_dtor, pbuf_init, NULL, pbuf_zone);
+		uma_zone_set_max(fuse_pbuf_zone, nswbuf / 2 + 1);

 		/* vfs_modevent ignores its first arg */
 		if ((err = vfs_modevent(NULL, what, &fuse_vfsconf)))
@@ -144,6 +146,7 @@
 		if ((err = vfs_modevent(NULL, what, &fuse_vfsconf)))
 			return (err);
 		fuse_bringdown(eh_tag);
+		uma_zdestroy(fuse_pbuf_zone);
 		break;
 	default:
 		return (EINVAL);
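Call sites that used to pass a NULL freecnt to getpbuf() had no subsystem quota of their own; they now allocate straight from the master zone, whose cap of nswbuf items (set in vm_pager_bufferinit() below) preserves the old global bound. A minimal sketch of such a call site, with hypothetical names; the PHOLD/PRELE pairing mirrors the nvme hunk above:

	static int
	foo_user_io(void *ubuf, int len)
	{
		struct buf *bp;

		PHOLD(curproc);		/* keep the vmspace resident */
		bp = uma_zalloc(pbuf_zone, M_WAITOK);
		bp->b_data = ubuf;
		bp->b_bufsize = len;
		/* ... map the user pages and perform the I/O ... */
		uma_zfree(pbuf_zone, bp);
		PRELE(curproc);
		return (0);
	}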
Index: sys/fs/fuse/fuse_vnops.c
===================================================================
--- sys/fs/fuse/fuse_vnops.c
+++ sys/fs/fuse/fuse_vnops.c
@@ -201,7 +201,7 @@
 SYSCTL_INT(_vfs_fuse, OID_AUTO, reclaim_revoked, CTLFLAG_RW,
     &fuse_reclaim_revoked, 0, "");

-int fuse_pbuf_freecnt = -1;
+uma_zone_t fuse_pbuf_zone;

 #define fuse_vm_page_lock(m)	vm_page_lock((m));
 #define fuse_vm_page_unlock(m)	vm_page_unlock((m));
@@ -1824,7 +1824,7 @@
 	 * We use only the kva address for the buffer, but this is extremely
 	 * convenient and fast.
 	 */
-	bp = getpbuf(&fuse_pbuf_freecnt);
+	bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK);

 	kva = (vm_offset_t)bp->b_data;
 	pmap_qenter(kva, pages, npages);
@@ -1845,7 +1845,7 @@
 	error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred);
 	pmap_qremove(kva, npages);

-	relpbuf(bp, &fuse_pbuf_freecnt);
+	uma_zfree(fuse_pbuf_zone, bp);

 	if (error && (uio.uio_resid == count)) {
 		FS_DEBUG("error %d\n", error);
@@ -1958,7 +1958,7 @@
 	 * We use only the kva address for the buffer, but this is extremely
 	 * convenient and fast.
 	 */
-	bp = getpbuf(&fuse_pbuf_freecnt);
+	bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK);

 	kva = (vm_offset_t)bp->b_data;
 	pmap_qenter(kva, pages, npages);
@@ -1978,7 +1978,7 @@
 	error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred);

 	pmap_qremove(kva, npages);
-	relpbuf(bp, &fuse_pbuf_freecnt);
+	uma_zfree(fuse_pbuf_zone, bp);

 	if (!error) {
 		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
Index: sys/fs/nfsclient/nfs_clbio.c
===================================================================
--- sys/fs/nfsclient/nfs_clbio.c
+++ sys/fs/nfsclient/nfs_clbio.c
@@ -70,7 +70,7 @@
 extern int newnfs_directio_enable;
 extern int nfs_keep_dirty_on_error;

-int ncl_pbuf_freecnt = -1;	/* start out unlimited */
+uma_zone_t ncl_pbuf_zone;

 static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
     struct thread *td);
@@ -182,7 +182,7 @@
 	 * We use only the kva address for the buffer, but this is extremely
 	 * convenient and fast.
 	 */
-	bp = getpbuf(&ncl_pbuf_freecnt);
+	bp = uma_zalloc(ncl_pbuf_zone, M_WAITOK);

 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
@@ -203,7 +203,7 @@
 	error = ncl_readrpc(vp, &uio, cred);
 	pmap_qremove(kva, npages);

-	relpbuf(bp, &ncl_pbuf_freecnt);
+	uma_zfree(ncl_pbuf_zone, bp);

 	if (error && (uio.uio_resid == count)) {
 		printf("ncl_getpages: error %d\n", error);
@@ -793,7 +793,7 @@
 	while (uiop->uio_resid > 0) {
 		size = MIN(uiop->uio_resid, wsize);
 		size = MIN(uiop->uio_iov->iov_len, size);
-		bp = getpbuf(&ncl_pbuf_freecnt);
+		bp = uma_zalloc(ncl_pbuf_zone, M_WAITOK);
 		t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
 		t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
 		t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
@@ -836,7 +836,7 @@
 			free(t_iov, M_NFSDIRECTIO);
 			free(t_uio, M_NFSDIRECTIO);
 			bp->b_vp = NULL;
-			relpbuf(bp, &ncl_pbuf_freecnt);
+			uma_zfree(ncl_pbuf_zone, bp);
 			if (error == EINTR)
 				return (error);
 			goto do_sync;
@@ -1571,7 +1571,7 @@
 		mtx_unlock(&np->n_mtx);
 	}
 	bp->b_vp = NULL;
-	relpbuf(bp, &ncl_pbuf_freecnt);
+	uma_zfree(ncl_pbuf_zone, bp);
 }

 /*
Index: sys/fs/nfsclient/nfs_clport.c
===================================================================
--- sys/fs/nfsclient/nfs_clport.c
+++ sys/fs/nfsclient/nfs_clport.c
@@ -79,7 +79,7 @@
 extern struct vop_vector newnfs_fifoops;
 extern uma_zone_t newnfsnode_zone;
 extern struct buf_ops buf_ops_newnfs;
-extern int ncl_pbuf_freecnt;
+extern uma_zone_t ncl_pbuf_zone;
 extern short nfsv4_cbport;
 extern int nfscl_enablecallb;
 extern int nfs_numnfscbd;
@@ -1023,7 +1023,9 @@
 		return;
 	inited = 1;
 	nfscl_inited = 1;
-	ncl_pbuf_freecnt = nswbuf / 2 + 1;
+	ncl_pbuf_zone = uma_zsecond_create("nfspbuf", pbuf_ctor, pbuf_dtor,
+	    pbuf_init, NULL, pbuf_zone);
+	uma_zone_set_max(ncl_pbuf_zone, nswbuf / 2 + 1);
 }

 /*
@@ -1357,6 +1359,7 @@
 #if 0
 	ncl_call_invalcaches = NULL;
 	nfsd_call_nfscl = NULL;
+	uma_zdestroy(ncl_pbuf_zone);
 	/* and get rid of the mutexes */
 	mtx_destroy(&ncl_iod_mutex);
 	loaded = 0;
Index: sys/fs/smbfs/smbfs_io.c
===================================================================
--- sys/fs/smbfs/smbfs_io.c
+++ sys/fs/smbfs/smbfs_io.c
@@ -63,7 +63,7 @@
 /*#define SMBFS_RWGENERIC*/

-extern int smbfs_pbuf_freecnt;
+extern uma_zone_t smbfs_pbuf_zone;

 static int smbfs_fastlookup = 1;
@@ -468,7 +468,7 @@
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, td, cred);

-	bp = getpbuf(&smbfs_pbuf_freecnt);
+	bp = uma_zalloc(smbfs_pbuf_zone, M_WAITOK);

 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
@@ -490,7 +490,7 @@
 	smbfs_free_scred(scred);
 	pmap_qremove(kva, npages);

-	relpbuf(bp, &smbfs_pbuf_freecnt);
+	uma_zfree(smbfs_pbuf_zone, bp);

 	if (error && (uio.uio_resid == count)) {
 		printf("smbfs_getpages: error %d\n",error);
@@ -593,7 +593,7 @@
 		rtvals[i] = VM_PAGER_ERROR;
 	}

-	bp = getpbuf(&smbfs_pbuf_freecnt);
+	bp = uma_zalloc(smbfs_pbuf_zone, M_WAITOK);

 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
@@ -621,7 +621,7 @@
 	pmap_qremove(kva, npages);

-	relpbuf(bp, &smbfs_pbuf_freecnt);
+	uma_zfree(smbfs_pbuf_zone, bp);

 	if (error == 0) {
 		vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid,
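The smbfs/nfs/fuse hunks all follow the same getpages/putpages shape: the pbuf is used purely as a preallocated KVA window into which the VM pages are temporarily mapped. A sketch of the pattern, with the actual I/O elided and a hypothetical zone argument:

	static int
	getpages_sketch(uma_zone_t pbufz, vm_page_t *pages, int npages)
	{
		struct buf *bp;
		vm_offset_t kva;

		bp = uma_zalloc(pbufz, M_WAITOK);  /* b_kvabase set up by pbuf_init */
		kva = (vm_offset_t)bp->b_data;
		pmap_qenter(kva, pages, npages);   /* map pages into the window */

		/* ... build a struct uio over kva and issue the RPC/read ... */

		pmap_qremove(kva, npages);
		uma_zfree(pbufz, bp);
		return (0);
	}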
Index: sys/fs/smbfs/smbfs_vfsops.c
===================================================================
--- sys/fs/smbfs/smbfs_vfsops.c
+++ sys/fs/smbfs/smbfs_vfsops.c
@@ -88,7 +88,7 @@
 MODULE_DEPEND(smbfs, libiconv, 1, 1, 2);
 MODULE_DEPEND(smbfs, libmchain, 1, 1, 1);

-int smbfs_pbuf_freecnt = -1;	/* start out unlimited */
+uma_zone_t smbfs_pbuf_zone;

 static int
 smbfs_cmount(struct mntarg *ma, void * data, uint64_t flags)
@@ -367,7 +367,8 @@
 int
 smbfs_init(struct vfsconf *vfsp)
 {

-	smbfs_pbuf_freecnt = nswbuf / 2 + 1;
+	smbfs_pbuf_zone = uma_zsecond_create("smbpbuf", pbuf_ctor, pbuf_dtor, pbuf_init, NULL, pbuf_zone);
+	uma_zone_set_max(smbfs_pbuf_zone, nswbuf / 2 + 1);
 	SMBVDEBUG("done.\n");
 	return 0;
 }
@@ -377,6 +378,7 @@
 smbfs_uninit(struct vfsconf *vfsp)
 {

+	uma_zdestroy(smbfs_pbuf_zone);
 	SMBVDEBUG("done.\n");
 	return 0;
 }
Index: sys/kern/kern_lock.c
===================================================================
--- sys/kern/kern_lock.c
+++ sys/kern/kern_lock.c
@@ -450,6 +450,8 @@
 		iflags |= LO_QUIET;
 	if (flags & LK_IS_VNODE)
 		iflags |= LO_IS_VNODE;
+	if (flags & LK_NEW)
+		iflags |= LO_NEW;
 	iflags |= flags & (LK_ADAPTIVE | LK_NOSHARE);

 	lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL, iflags);
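LK_NEW appears to exist because pbufs now come from UMA rather than the statically zeroed swbuf array: BUF_LOCKINIT() in pbuf_init() runs on memory whose bytes are not guaranteed to be zero, and LO_NEW tells lock_init() to treat the lock as brand new instead of tripping over stale bytes that look like an already-initialized lock (an assumption based on how LO_NEW is used elsewhere). A minimal sketch:

	struct foo {
		struct lock	f_lock;
	};

	static void
	foo_setup(struct foo *f)
	{
		/* f came from malloc(M_WAITOK) and was not zeroed. */
		lockinit(&f->f_lock, PRIBIO, "foolk", 0, LK_NEW);
	}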
Index: sys/kern/kern_physio.c
===================================================================
--- sys/kern/kern_physio.c
+++ sys/kern/kern_physio.c
@@ -104,7 +104,7 @@
 		maxpages = btoc(MIN(uio->uio_resid, MAXPHYS)) + 1;
 		pages = malloc(sizeof(*pages) * maxpages, M_DEVBUF, M_WAITOK);
 	} else {
-		pbuf = getpbuf(NULL);
+		pbuf = uma_zalloc(pbuf_zone, M_WAITOK);
 		sa = pbuf->b_data;
 		maxpages = btoc(MAXPHYS);
 		pages = pbuf->b_pages;
@@ -220,7 +220,7 @@
 	}
doerror:
 	if (pbuf)
-		relpbuf(pbuf, NULL);
+		uma_zfree(pbuf_zone, pbuf);
 	else if (pages)
 		free(pages, M_DEVBUF);
 	g_destroy_bio(bp);
Index: sys/kern/vfs_aio.c
===================================================================
--- sys/kern/vfs_aio.c
+++ sys/kern/vfs_aio.c
@@ -1267,7 +1267,7 @@
 		goto unref;
 	}

-	job->pbuf = pbuf = (struct buf *)getpbuf(NULL);
+	job->pbuf = pbuf = uma_zalloc(pbuf_zone, M_WAITOK);
 	BUF_KERNPROC(pbuf);
 	AIO_LOCK(ki);
 	ki->kaio_buffer_count++;
@@ -1318,7 +1318,7 @@
 		AIO_LOCK(ki);
 		ki->kaio_buffer_count--;
 		AIO_UNLOCK(ki);
-		relpbuf(pbuf, NULL);
+		uma_zfree(pbuf_zone, pbuf);
 		job->pbuf = NULL;
 	}
 	g_destroy_bio(bp);
@@ -2342,7 +2342,7 @@
 	ki = userp->p_aioinfo;
 	if (job->pbuf) {
 		pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages);
-		relpbuf(job->pbuf, NULL);
+		uma_zfree(pbuf_zone, job->pbuf);
 		job->pbuf = NULL;
 		atomic_subtract_int(&num_buf_aio, 1);
 		AIO_LOCK(ki);
Index: sys/kern/vfs_bio.c
===================================================================
--- sys/kern/vfs_bio.c
+++ sys/kern/vfs_bio.c
@@ -86,7 +86,6 @@
 #include
 #include
 #include
-#include "opt_swap.h"

 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
@@ -1017,10 +1016,6 @@
 	mtx_unlock(&bdlock);
 }

-#ifndef NSWBUF_MIN
-#define NSWBUF_MIN	16
-#endif
-
 #ifdef __i386__
 #define	TRANSIENT_DENOM	5
 #else
@@ -1129,20 +1124,9 @@
 		nbuf = buf_sz / BKVASIZE;
 	}

-	/*
-	 * swbufs are used as temporary holders for I/O, such as paging I/O.
-	 * We have no less then 16 and no more then 256.
-	 */
-	nswbuf = min(nbuf / 4, 256);
-	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
-	if (nswbuf < NSWBUF_MIN)
-		nswbuf = NSWBUF_MIN;
-
 	/*
 	 * Reserve space for the buffer cache buffers
 	 */
-	swbuf = (void *)v;
-	v = (caddr_t)(swbuf + nswbuf);
 	buf = (void *)v;
 	v = (caddr_t)(buf + nbuf);
Index: sys/kern/vfs_cluster.c
===================================================================
--- sys/kern/vfs_cluster.c
+++ sys/kern/vfs_cluster.c
@@ -63,7 +63,9 @@
 #endif

 static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
+static uma_zone_t cluster_pbuf_zone;

+static void cluster_init(void *);
 static struct cluster_save *cluster_collectbufs(struct vnode *vp,
 	    struct buf *last_bp, int gbflags);
 static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
@@ -83,6 +85,17 @@
 SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0,
     "Cluster read min block count");

+SYSINIT(cluster, SI_SUB_CPU, SI_ORDER_ANY, cluster_init, NULL);
+
+static void
+cluster_init(void *dummy)
+{
+
+	cluster_pbuf_zone = uma_zsecond_create("clpbuf", pbuf_ctor, pbuf_dtor,
+	    pbuf_init, NULL, pbuf_zone);
+	uma_zone_set_max(cluster_pbuf_zone, nswbuf / 2);
+}
+
 /*
  * Read data to a buf, including read-ahead if we find this to be beneficial.
  * cluster_read replaces bread.
@@ -372,7 +385,7 @@
 	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
 		return tbp;

-	bp = trypbuf(&cluster_pbuf_freecnt);
+	bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT);
 	if (bp == NULL)
 		return tbp;
@@ -603,7 +616,7 @@
 			bufdone(tbp);
 	}
 	pbrelvp(bp);
-	relpbuf(bp, &cluster_pbuf_freecnt);
+	uma_zfree(cluster_pbuf_zone, bp);
 }

 /*
@@ -856,9 +869,8 @@
 		  (tbp->b_bcount != tbp->b_bufsize) ||
 		  (tbp->b_bcount != size) ||
 		  (len == 1) ||
-		  ((bp = (vp->v_vflag & VV_MD) != 0 ?
-		   trypbuf(&cluster_pbuf_freecnt) :
-		   getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
+		  ((bp = uma_zalloc(cluster_pbuf_zone,
+		   (vp->v_vflag & VV_MD) != 0 ? M_NOWAIT : M_WAITOK)) == NULL)) {
 			totalwritten += tbp->b_bufsize;
 			bawrite(tbp);
 			++start_lbn;
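The old trypbuf()/getpbuf() split maps directly onto UMA allocation flags, which is what the cluster_wbuild() hunk encodes inline. A hypothetical helper making the mapping explicit; sleeping is avoided for md(4)-backed vnodes, presumably because waiting for a pbuf there could deadlock against md itself:

	static struct buf *
	cluster_pbuf_get(struct vnode *vp)
	{

		/* M_NOWAIT was trypbuf(); M_WAITOK was getpbuf(). */
		return (uma_zalloc(cluster_pbuf_zone,
		    (vp->v_vflag & VV_MD) != 0 ? M_NOWAIT : M_WAITOK));
	}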
Index: sys/sys/buf.h
===================================================================
--- sys/sys/buf.h
+++ sys/sys/buf.h
@@ -44,6 +44,7 @@
 #include
 #include
 #include
+#include <vm/uma.h>

 struct bio;
 struct buf;
@@ -287,7 +288,7 @@
  * Initialize a lock.
  */
 #define BUF_LOCKINIT(bp)						\
-	lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0)
+	lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, LK_NEW)
 /*
  *
  * Get a lock sleeping non-interruptably until it becomes available.
@@ -493,10 +494,6 @@
 extern int	dirtybufferflushes;
 extern int	altbufferflushes;
 extern int	nswbuf;			/* Number of swap I/O buffer headers. */
-extern int	cluster_pbuf_freecnt;	/* Number of pbufs for clusters */
-extern int	vnode_pbuf_freecnt;	/* Number of pbufs for vnode pager */
-extern int	vnode_async_pbuf_freecnt; /* Number of pbufs for vnode pager,
					     asynchronous reads */
 extern caddr_t	unmapped_buf;	/* Data address for unmapped buffers. */

 static inline int
@@ -537,7 +534,6 @@
 void	bqrelse(struct buf *);
 int	vfs_bio_awrite(struct buf *);
 void	vfs_drain_busy_pages(struct buf *bp);
-struct buf *getpbuf(int *);
 struct buf *incore(struct bufobj *, daddr_t);
 struct buf *gbincore(struct bufobj *, daddr_t);
 struct buf *getblk(struct vnode *, daddr_t, int, int, int, int);
@@ -549,6 +545,11 @@
 void	bufdone(struct buf *);
 void	bd_speedup(void);

+extern uma_zone_t pbuf_zone;
+int	pbuf_init(void *, int, int);
+int	pbuf_ctor(void *, int, void *, int);
+void	pbuf_dtor(void *, int, void *);
+
 int	cluster_read(struct vnode *, u_quad_t, daddr_t, long,
	    struct ucred *, long, int, int, struct buf **);
 int	cluster_wbuild(struct vnode *, long, daddr_t, int, int);
@@ -562,7 +563,6 @@
 void	vfs_unbusy_pages(struct buf *);
 int	vmapbuf(struct buf *, int);
 void	vunmapbuf(struct buf *);
-void	relpbuf(struct buf *, int *);
 void	brelvp(struct buf *);
 void	bgetvp(struct vnode *, struct buf *);
 void	pbgetbo(struct bufobj *bo, struct buf *bp);
@@ -571,7 +571,6 @@
 void	pbrelvp(struct buf *);
 int	allocbuf(struct buf *bp, int size);
 void	reassignbuf(struct buf *);
-struct buf *trypbuf(int *);
 void	bwait(struct buf *, u_char, const char *);
 void	bdone(struct buf *);
Index: sys/sys/lockmgr.h
===================================================================
--- sys/sys/lockmgr.h
+++ sys/sys/lockmgr.h
@@ -143,7 +143,7 @@
 /*
  * Flags for lockinit().
  */
-#define	LK_INIT_MASK	0x0000FF
+#define	LK_INIT_MASK	0x0001FF
 #define	LK_CANRECURSE	0x000001
 #define	LK_NODUP	0x000002
 #define	LK_NOPROFILE	0x000004
@@ -152,6 +152,7 @@
 #define	LK_QUIET	0x000020
 #define	LK_ADAPTIVE	0x000040
 #define	LK_IS_VNODE	0x000080	/* Tell WITNESS about a VNODE lock */
+#define	LK_NEW		0x000100

 /*
  * Additional attributes to be used in lockmgr().
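LK_INIT_MASK grows from 0x0000FF to 0x0001FF along with the new bit: lockinit() only honors flags inside the mask, so any new init-time flag has to be covered by it. A one-line compile-time check expressing that invariant (a sketch, not part of the patch):

	CTASSERT((LK_NEW & LK_INIT_MASK) == LK_NEW);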
Index: sys/ufs/ffs/ffs_rawread.c
===================================================================
--- sys/ufs/ffs/ffs_rawread.c
+++ sys/ufs/ffs/ffs_rawread.c
@@ -74,9 +74,7 @@
 SYSCTL_DECL(_vfs_ffs);

-static int ffsrawbufcnt = 4;
-SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0,
-	   "Buffers available for raw reads");
+static uma_zone_t ffsraw_pbuf_zone;

 static int allowrawread = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
@@ -90,7 +88,10 @@
 ffs_rawread_setup(void *arg __unused)
 {

-	ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8;
+	ffsraw_pbuf_zone = uma_zsecond_create("ffsrawpbuf", pbuf_ctor,
+	    pbuf_dtor, pbuf_init, NULL, pbuf_zone);
+	uma_zone_set_max(ffsraw_pbuf_zone, (nswbuf > 100 ) ?
+	    (nswbuf - (nswbuf >> 4)) : nswbuf - 8);
 }
 SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL);
@@ -296,8 +297,7 @@
 	while (resid > 0) {
 		if (bp == NULL) { /* Setup first read */
-			/* XXX: Leave some bufs for swap */
-			bp = getpbuf(&ffsrawbufcnt);
+			bp = uma_zalloc(ffsraw_pbuf_zone, M_WAITOK);
 			pbgetvp(vp, bp);
 			error = ffs_rawread_readahead(vp, udata, offset,
						      resid, td, bp);
@@ -305,9 +305,9 @@
 				break;

 			if (resid > bp->b_bufsize) { /* Setup fist readahead */
-				/* XXX: Leave bufs for swap */
 				if (rawreadahead != 0)
-					nbp = trypbuf(&ffsrawbufcnt);
+					nbp = uma_zalloc(ffsraw_pbuf_zone,
+					    M_NOWAIT);
 				else
 					nbp = NULL;
 				if (nbp != NULL) {
@@ -324,7 +324,8 @@
							   nbp);
 					if (nerror) {
 						pbrelvp(nbp);
-						relpbuf(nbp, &ffsrawbufcnt);
+						uma_zfree(ffsraw_pbuf_zone,
+						    nbp);
 						nbp = NULL;
 					}
 				}
@@ -365,7 +366,7 @@
 			if (resid <= bp->b_bufsize) { /* No more readaheads */
 				pbrelvp(nbp);
-				relpbuf(nbp, &ffsrawbufcnt);
+				uma_zfree(ffsraw_pbuf_zone, nbp);
 				nbp = NULL;
 			} else { /* Setup next readahead */
 				nerror = ffs_rawread_readahead(vp,
@@ -379,7 +380,7 @@
							       nbp);
 				if (nerror != 0) {
 					pbrelvp(nbp);
-					relpbuf(nbp, &ffsrawbufcnt);
+					uma_zfree(ffsraw_pbuf_zone, nbp);
 					nbp = NULL;
 				}
 			}
@@ -395,13 +396,13 @@
 	if (bp != NULL) {
 		pbrelvp(bp);
-		relpbuf(bp, &ffsrawbufcnt);
+		uma_zfree(ffsraw_pbuf_zone, bp);
 	}
 	if (nbp != NULL) {			/* Run down readahead buffer */
 		bwait(nbp, PRIBIO, "rawrd");
 		vunmapbuf(nbp);
 		pbrelvp(nbp);
-		relpbuf(nbp, &ffsrawbufcnt);
+		uma_zfree(ffsraw_pbuf_zone, nbp);
 	}

 	if (error == 0)
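The raw-read path keeps its two-tier allocation policy across the conversion: the pbuf for the current read is mandatory, the readahead pbuf is opportunistic and its absence only costs overlap. A distilled sketch of that policy:

	static void
	rawread_sketch(uma_zone_t zone)
	{
		struct buf *bp, *nbp;

		bp = uma_zalloc(zone, M_WAITOK);	/* current read: must succeed */
		nbp = uma_zalloc(zone, M_NOWAIT);	/* readahead: best effort */

		/* ... issue I/O on bp, and on nbp only when nbp != NULL ... */

		if (nbp != NULL)
			uma_zfree(zone, nbp);
		uma_zfree(zone, bp);
	}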
Index: sys/vm/swap_pager.c
===================================================================
--- sys/vm/swap_pager.c
+++ sys/vm/swap_pager.c
@@ -324,9 +324,8 @@
 static int swap_pager_full = 2;	/* swap space exhaustion (task killing) */
 static int swap_pager_almost_full = 1;	/* swap space exhaustion (w/hysteresis)*/
-static int nsw_rcount;		/* free read buffers */
-static int nsw_wcount_sync;	/* limit write buffers / synchronous */
-static int nsw_wcount_async;	/* limit write buffers / asynchronous */
+static struct mtx swbuf_mtx;	/* to sync nsw_wcount_async */
+static int nsw_wcount_async;	/* limit async write buffers */
 static int nsw_wcount_async_max;/* assigned maximum */
 static int nsw_cluster_max;	/* maximum VOP I/O allowed */
@@ -352,6 +351,8 @@
 	(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])

 static struct pagerlst	swap_pager_object_list[NOBJLISTS];
+static uma_zone_t swwbuf_zone;
+static uma_zone_t swrbuf_zone;
 static uma_zone_t swblk_zone;
 static uma_zone_t swpctrie_zone;
@@ -539,12 +540,16 @@
 	 */
 	nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);

-	mtx_lock(&pbuf_mtx);
-	nsw_rcount = (nswbuf + 1) / 2;
-	nsw_wcount_sync = (nswbuf + 3) / 4;
 	nsw_wcount_async = 4;
 	nsw_wcount_async_max = nsw_wcount_async;
-	mtx_unlock(&pbuf_mtx);
+	mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF);
+
+	swwbuf_zone = uma_zsecond_create("swwbuf", pbuf_ctor, pbuf_dtor,
+	    pbuf_init, NULL, pbuf_zone);
+	uma_zone_set_max(swwbuf_zone, (nswbuf + 3) / 4);
+	swrbuf_zone = uma_zsecond_create("swrbuf", pbuf_ctor, pbuf_dtor,
+	    pbuf_init, NULL, pbuf_zone);
+	uma_zone_set_max(swrbuf_zone, (nswbuf + 1) / 2);

 	/*
 	 * Initialize our zone, guessing on the number we need based
@@ -1205,7 +1210,7 @@
 	    ("no swap blocking containing %p(%jx)",
 	    object, (uintmax_t)pindex));
 	VM_OBJECT_WUNLOCK(object);
-	bp = getpbuf(&nsw_rcount);
+	bp = uma_zalloc(swrbuf_zone, M_WAITOK);

 	/* Pages cannot leave the object while busy. */
 	for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) {
 		MPASS(p->pindex == bm->pindex + i);
@@ -1406,12 +1411,17 @@
 	 * All I/O parameters have been satisfied, build the I/O
 	 * request and assign the swap space.
 	 */
-	if (sync == TRUE) {
-		bp = getpbuf(&nsw_wcount_sync);
-	} else {
-		bp = getpbuf(&nsw_wcount_async);
-		bp->b_flags = B_ASYNC;
+	if (sync != TRUE) {
+		mtx_lock(&swbuf_mtx);
+		while (nsw_wcount_async == 0)
+			msleep(&nsw_wcount_async, &swbuf_mtx, PVM,
+			    "swbufa", 0);
+		nsw_wcount_async--;
+		mtx_unlock(&swbuf_mtx);
 	}
+	bp = uma_zalloc(swwbuf_zone, M_WAITOK);
+	if (sync != TRUE)
+		bp->b_flags = B_ASYNC;

 	bp->b_flags |= B_PAGING;
 	bp->b_iocmd = BIO_WRITE;
@@ -1634,15 +1644,13 @@
 	/*
 	 * release the physical I/O buffer
 	 */
-	relpbuf(
-	    bp,
-	    ((bp->b_iocmd == BIO_READ) ? &nsw_rcount :
-		((bp->b_flags & B_ASYNC) ?
-		    &nsw_wcount_async :
-		    &nsw_wcount_sync
-		)
-	    )
-	);
+	if (bp->b_flags & B_ASYNC) {
+		mtx_lock(&swbuf_mtx);
+		if (++nsw_wcount_async == 1)
+			wakeup(&nsw_wcount_async);
+		mtx_unlock(&swbuf_mtx);
+	}
+	uma_zfree((bp->b_iocmd == BIO_READ) ? swrbuf_zone : swwbuf_zone, bp);
 }

 int
@@ -2627,6 +2635,7 @@
 	bp->b_ioflags |= BIO_ERROR;
 	bp->b_resid = bp->b_bcount - bp2->bio_completed;
 	bp->b_error = bp2->bio_error;
+	bp->b_caller1 = NULL;
 	bufdone(bp);
 	sp = bp2->bio_caller1;
 	mtx_lock(&sw_dev_mtx);
@@ -2666,6 +2675,7 @@
 		return;
 	}
+	bp->b_caller1 = bio;
 	bio->bio_caller1 = sp;
 	bio->bio_caller2 = bp;
 	bio->bio_cmd = bp->b_iocmd;
@@ -2880,7 +2890,7 @@
 	if (new > nswbuf / 2 || new < 1)
 		return (EINVAL);

-	mtx_lock(&pbuf_mtx);
+	mtx_lock(&swbuf_mtx);
 	while (nsw_wcount_async_max != new) {
 		/*
 		 * Adjust difference.  If the current async count is too low,
@@ -2895,11 +2905,11 @@
 		} else {
 			nsw_wcount_async_max -= nsw_wcount_async;
 			nsw_wcount_async = 0;
-			msleep(&nsw_wcount_async, &pbuf_mtx, PSWP,
+			msleep(&nsw_wcount_async, &swbuf_mtx, PSWP,
 			    "swpsysctl", 0);
 		}
 	}
-	mtx_unlock(&pbuf_mtx);
+	mtx_unlock(&swbuf_mtx);

 	return (0);
 }
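With getpbuf()'s counter semantics gone, the swap pager keeps its own throttle for asynchronous writes: a small count protected by swbuf_mtx, layered on top of the zone limit. The acquire/release halves above factor out to the classic counting pattern; hypothetical helper names:

	static void
	async_throttle_acquire(void)
	{

		mtx_lock(&swbuf_mtx);
		while (nsw_wcount_async == 0)
			msleep(&nsw_wcount_async, &swbuf_mtx, PVM, "swbufa", 0);
		nsw_wcount_async--;
		mtx_unlock(&swbuf_mtx);
	}

	static void
	async_throttle_release(void)
	{

		mtx_lock(&swbuf_mtx);
		/* Wake a waiter only on the 0 -> 1 transition. */
		if (++nsw_wcount_async == 1)
			wakeup(&nsw_wcount_async);
		mtx_unlock(&swbuf_mtx);
	}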
Index: sys/vm/uma.h
===================================================================
--- sys/vm/uma.h
+++ sys/vm/uma.h
@@ -217,17 +217,6 @@
 uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
     uma_init zinit, uma_fini zfini, uma_zone_t master);

-/*
- * Add a second master to a secondary zone.  This provides multiple data
- * backends for objects with the same size.  Both masters must have
- * compatible allocation flags.  Presently, UMA_ZONE_MALLOC type zones are
- * the only supported.
- *
- * Returns:
- *	Error on failure, 0 on success.
- */
-int uma_zsecond_add(uma_zone_t zone, uma_zone_t master);
-
 /*
  * Create cache-only zones.
  *
@@ -285,10 +274,6 @@
 					 * NUMA aware Zone.  Implements a best
					 * effort first-touch policy.
					 */
-#define	UMA_ZONE_NOBUCKETCACHE	0x20000	/*
-					 * Don't cache full buckets.  Limit
-					 * UMA to per-cpu state.
-					 */

 /*
  * These flags are shared between the keg and zone.  In zones wishing to add
@@ -511,6 +496,18 @@
  */
 int uma_zone_set_max(uma_zone_t zone, int nitems);

+/*
+ * Sets a high limit on the number of items allowed in zone's bucket cache
+ *
+ * Arguments:
+ *	zone  The zone to limit
+ *	nitems  The requested upper limit on the number of items allowed
+ *
+ * Returns:
+ *	int  The effective value of nitems set
+ */
+int uma_zone_set_maxcache(uma_zone_t zone, int nitems);
+
 /*
  * Obtains the effective limit on the number of items in a zone
 *
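A usage sketch for the new knob next to uma_zone_set_max(); the zone name and item type are made up. uma_zone_set_max() now bounds live items, while uma_zone_set_maxcache() separately bounds how many freed items may sit in the zone's full-bucket cache:

	struct frob {
		int	f_x;
	};

	static uma_zone_t frob_zone;

	static void
	frob_zone_init(void)
	{

		frob_zone = uma_zcreate("frobs", sizeof(struct frob),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		uma_zone_set_max(frob_zone, 10000);	/* hard cap on items */
		uma_zone_set_maxcache(frob_zone, 256);	/* cap on cached items */
	}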
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -261,11 +261,9 @@
 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
 static void bucket_zone_drain(void);
-static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
 static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
-static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int);
 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
-static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
+static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, uint32_t flags);
 static int zone_import(uma_zone_t, void **, int, int, int);
@@ -472,6 +470,7 @@
 		zdom->uzd_nitems -= bucket->ub_cnt;
 		if (ws && zdom->uzd_imin > zdom->uzd_nitems)
 			zdom->uzd_imin = zdom->uzd_nitems;
+		zone->uz_bktcount -= bucket->ub_cnt;
 	}
 	return (bucket);
 }
@@ -487,6 +486,9 @@
 	zdom->uzd_nitems += bucket->ub_cnt;
 	if (ws && zdom->uzd_imax < zdom->uzd_nitems)
 		zdom->uzd_imax = zdom->uzd_nitems;
+	KASSERT(zone->uz_bktcount < zone->uz_bktmax, ("%s: zone %p overflow",
+	    __func__, zone));
+	zone->uz_bktcount += bucket->ub_cnt;
 }

 static void
@@ -509,15 +511,6 @@
 	taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
 }

-static void
-zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
-{
-	uma_klink_t klink;
-
-	LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
-		kegfn(klink->kl_keg);
-}
-
 /*
  * Routine called by timeout which is used to fire off some time interval
  * based calculations.  (stats, hash size, etc.)
@@ -562,8 +555,9 @@
  * Returns nothing.
 */
 static void
-keg_timeout(uma_keg_t keg)
+zone_timeout(uma_zone_t zone)
 {
+	uma_keg_t keg = zone->uz_keg;

 	KEG_LOCK(keg);
 	/*
@@ -601,20 +595,11 @@
 			return;
 		}
 	}
-	KEG_UNLOCK(keg);
-}
-
-static void
-zone_timeout(uma_zone_t zone)
-{
-	int i;
-	zone_foreach_keg(zone, &keg_timeout);
-
-	ZONE_LOCK(zone);
-	for (i = 0; i < vm_ndomains; i++)
+	for (int i = 0; i < vm_ndomains; i++)
 		zone_domain_update_wss(&zone->uz_domain[i]);
-	ZONE_UNLOCK(zone);
+
+	KEG_UNLOCK(keg);
 }

 /*
@@ -744,6 +729,12 @@
 	for (i = 0; i < bucket->ub_cnt; i++)
 		zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
 	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
+	ZONE_LOCK(zone);
+	zone->uz_frees++;
+	zone->uz_items -= bucket->ub_cnt;
+	if (zone->uz_sleepers && zone->uz_items < zone->uz_maxitems)
+		wakeup_one(zone);
+	ZONE_UNLOCK(zone);
 	bucket->ub_cnt = 0;
 }
@@ -1029,7 +1020,7 @@
 	 * we're running.  Normally the uma_rwlock would protect us but we
 	 * must be able to release and acquire the right lock for each keg.
 	 */
-	zone_foreach_keg(zone, &keg_drain);
+	keg_drain(zone->uz_keg);
 	ZONE_LOCK(zone);
 	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
 	wakeup(zone);
@@ -1069,6 +1060,7 @@
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("keg_alloc_slab: domain %d out of range", domain));
 	mtx_assert(&keg->uk_lock, MA_OWNED);
+	MPASS(zone->uz_lockptr == &keg->uk_lock);

 	allocf = keg->uk_allocf;
 	KEG_UNLOCK(keg);
@@ -1164,8 +1156,7 @@
 	void *mem;
 	int pages;

-	keg = zone_first_keg(zone);
-
+	keg = zone->uz_keg;
 	/*
 	 * If we are in BOOT_BUCKETS or higher, than switch to real
 	 * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
@@ -1303,7 +1294,7 @@
 	uma_keg_t keg;

 	TAILQ_INIT(&alloctail);
-	keg = zone_first_keg(zone);
+	keg = zone->uz_keg;
 	npages = howmany(bytes, PAGE_SIZE);

 	while (npages > 0) {
@@ -1526,8 +1517,6 @@
 	u_int shsize;

 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
-	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
-	    ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
 	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
@@ -1766,14 +1755,13 @@
 	zone->uz_sleeps = 0;
 	zone->uz_count = 0;
 	zone->uz_count_min = 0;
+	zone->uz_count_max = BUCKET_MAX;
 	zone->uz_flags = 0;
 	zone->uz_warning = NULL;
 	/* The domain structures follow the cpu structures. */
 	zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus];
+	zone->uz_bktmax = ULONG_MAX;
 	timevalclear(&zone->uz_ratecheck);
-	keg = arg->keg;
-
-	ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));

 	/*
 	 * This is a pure cache zone, no kegs.
@@ -1787,6 +1775,7 @@
 		zone->uz_release = arg->release;
 		zone->uz_arg = arg->arg;
 		zone->uz_lockptr = &zone->uz_lock;
+		ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
 		rw_wlock(&uma_rwlock);
 		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
 		rw_wunlock(&uma_rwlock);
@@ -1799,6 +1788,7 @@
 	zone->uz_import = (uma_import)zone_import;
 	zone->uz_release = (uma_release)zone_release;
 	zone->uz_arg = zone;
+	keg = arg->keg;

 	if (arg->flags & UMA_ZONE_SECONDARY) {
 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
@@ -1837,12 +1827,7 @@
 		return (error);
 	}

-	/*
-	 * Link in the first keg.
-	 */
-	zone->uz_klink.kl_keg = keg;
-	LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
-	zone->uz_lockptr = &keg->uk_lock;
+	zone->uz_keg = keg;
 	zone->uz_size = keg->uk_size;
 	zone->uz_flags |= (keg->uk_flags &
 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
@@ -1908,12 +1893,10 @@
 static void
 zone_dtor(void *arg, int size, void *udata)
 {
-	uma_klink_t klink;
 	uma_zone_t zone;
 	uma_keg_t keg;

 	zone = (uma_zone_t)arg;
-	keg = zone_first_keg(zone);

 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
 		cache_drain(zone);
@@ -1928,26 +1911,18 @@
 	 * remove it... we dont care for now
 	 */
 	zone_drain_wait(zone, M_WAITOK);
-	/*
-	 * Unlink all of our kegs.
-	 */
-	while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
-		klink->kl_keg = NULL;
-		LIST_REMOVE(klink, kl_link);
-		if (klink == &zone->uz_klink)
-			continue;
-		free(klink, M_TEMP);
-	}
 	/*
 	 * We only destroy kegs from non secondary zones.
 	 */
-	if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0) {
+	if ((keg = zone->uz_keg) != NULL &&
+	    (zone->uz_flags & UMA_ZONE_SECONDARY) == 0) {
 		rw_wlock(&uma_rwlock);
 		LIST_REMOVE(keg, uk_link);
 		rw_wunlock(&uma_rwlock);
 		zone_free_item(kegs, keg, NULL, SKIP_NONE);
 	}
-	ZONE_LOCK_FINI(zone);
+	if (zone->uz_lockptr == &zone->uz_lock)
+		ZONE_LOCK_FINI(zone);
 }

 /*
@@ -2231,7 +2206,7 @@
 	uma_zone_t res;
 	bool locked;

-	keg = zone_first_keg(master);
+	keg = master->uz_keg;
 	memset(&args, 0, sizeof(args));
 	args.name = name;
 	args.size = keg->uk_size;
@@ -2280,85 +2255,6 @@
 	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
 }

-static void
-zone_lock_pair(uma_zone_t a, uma_zone_t b)
-{
-	if (a < b) {
-		ZONE_LOCK(a);
-		mtx_lock_flags(b->uz_lockptr, MTX_DUPOK);
-	} else {
-		ZONE_LOCK(b);
-		mtx_lock_flags(a->uz_lockptr, MTX_DUPOK);
-	}
-}
-
-static void
-zone_unlock_pair(uma_zone_t a, uma_zone_t b)
-{
-
-	ZONE_UNLOCK(a);
-	ZONE_UNLOCK(b);
-}
-
-int
-uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
-{
-	uma_klink_t klink;
-	uma_klink_t kl;
-	int error;
-
-	error = 0;
-	klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
-
-	zone_lock_pair(zone, master);
-	/*
-	 * zone must use vtoslab() to resolve objects and must already be
-	 * a secondary.
-	 */
-	if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
-	    != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
-		error = EINVAL;
-		goto out;
-	}
-	/*
-	 * The new master must also use vtoslab().
-	 */
-	if ((zone->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
-		error = EINVAL;
-		goto out;
-	}
-
-	/*
-	 * The underlying object must be the same size.  rsize
-	 * may be different.
-	 */
-	if (master->uz_size != zone->uz_size) {
-		error = E2BIG;
-		goto out;
-	}
-	/*
-	 * Put it at the end of the list.
-	 */
-	klink->kl_keg = zone_first_keg(master);
-	LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
-		if (LIST_NEXT(kl, kl_link) == NULL) {
-			LIST_INSERT_AFTER(kl, klink, kl_link);
-			break;
-		}
-	}
-	klink = NULL;
-	zone->uz_flags |= UMA_ZFLAG_MULTI;
-	zone->uz_slab = zone_fetch_slab_multi;
-
-out:
-	zone_unlock_pair(zone, master);
-	if (klink != NULL)
-		free(klink, M_TEMP);
-
-	return (error);
-}
-
-
 /* See uma.h */
 void
 uma_zdestroy(uma_zone_t zone)
@@ -2420,7 +2316,7 @@
 	uma_bucket_t bucket;
 	uma_cache_t cache;
 	void *item;
-	int cpu, domain, lockfail;
+	int cpu, domain, lockfail, max;
 #ifdef INVARIANTS
 	bool skipdbg;
 #endif
@@ -2590,8 +2486,40 @@
 	 * We bump the uz count when the cache size is insufficient to
 	 * handle the working set.
 	 */
-	if (lockfail && zone->uz_count < BUCKET_MAX)
+	if (lockfail && zone->uz_count < zone->uz_count_max)
 		zone->uz_count++;
+
+	/*
+	 * Short-circuit if we can't allocate more buckets.
+	 */
+	if (zone->uz_bktcount >= zone->uz_bktmax) {
+		ZONE_UNLOCK(zone);
+		goto zalloc_item;
+	}
+
+	if (zone->uz_maxitems) {
+		if (zone->uz_items >= zone->uz_maxitems) {
+			zone_log_warning(zone);
+			zone_maxaction(zone);
+			if (flags & M_NOWAIT) {
+				ZONE_UNLOCK(zone);
+				return (NULL);
+			}
+			zone->uz_sleeps++;
+			zone->uz_sleepers++;
+			msleep(zone, zone->uz_lockptr, PVM, "zonelimit", 0);
+			zone->uz_sleepers--;
+			if (zone->uz_items >= zone->uz_maxitems) {
+				ZONE_UNLOCK(zone);
+				goto zalloc_restart;
+			}
+		}
+		max = MIN(zone->uz_count, zone->uz_maxitems - zone->uz_items);
+	} else
+		max = zone->uz_count;
+	zone->uz_items += max;
+	if (zone->uz_sleepers && zone->uz_items < zone->uz_maxitems)
+		wakeup_one(zone);
 	ZONE_UNLOCK(zone);

 	/*
@@ -2599,7 +2527,61 @@
 	 * works we'll restart the allocation from the beginning and it
 	 * will use the just filled bucket.
 	 */
-	bucket = zone_alloc_bucket(zone, udata, domain, flags);
+	/* Don't wait for buckets, preserve caller's NOVM setting. */
+	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
+	if (bucket == NULL) {
+		ZONE_LOCK(zone);
+		zone->uz_items -= max;
+		ZONE_UNLOCK(zone);
+		goto zalloc_item;
+	}
+
+	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
+	    MIN(bucket->ub_entries, max), domain, flags);
+
+	/*
+	 * Initialize the memory if necessary.
+	 */
+	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
+		int i;
+
+		for (i = 0; i < bucket->ub_cnt; i++)
+			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
+			    flags) != 0)
+				break;
+		/*
+		 * If we couldn't initialize the whole bucket, put the
+		 * rest back onto the freelist.
+		 */
+		if (i != bucket->ub_cnt) {
+			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
+			    bucket->ub_cnt - i);
+#ifdef INVARIANTS
+			bzero(&bucket->ub_bucket[i],
+			    sizeof(void *) * (bucket->ub_cnt - i));
+#endif
+			bucket->ub_cnt = i;
+		}
+	}
+
+	/*
+	 * Check if bucket_alloc() returned a strange bucket or
+	 * we reduced ub_cnt due to failed uz_init.
+	 */
+	if (bucket->ub_cnt < max) {
+		ZONE_LOCK(zone);
+		zone->uz_items -= max - bucket->ub_cnt;
+		if (zone->uz_sleepers > 0 &&
+		    zone->uz_items < zone->uz_maxitems)
+			wakeup_one(zone);
+		ZONE_UNLOCK(zone);
+		if (bucket->ub_cnt == 0) {
+			bucket_free(zone, bucket, udata);
+			atomic_add_long(&zone->uz_fails, 1);
+			goto zalloc_item;
+		}
+	}
+
 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
 	    zone->uz_name, zone, bucket);

 	if (bucket != NULL) {
@@ -2618,7 +2600,7 @@
 		    domain == PCPU_GET(domain))) {
 			cache->uc_allocbucket = bucket;
 			zdom->uzd_imax += bucket->ub_cnt;
-		} else if ((zone->uz_flags & UMA_ZONE_NOBUCKETCACHE) != 0) {
+		} else if (zone->uz_bktcount >= zone->uz_bktmax) {
 			critical_exit();
 			ZONE_UNLOCK(zone);
 			bucket_drain(zone, bucket);
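The core of the new limit scheme is visible in the hunk above: the allocator reserves up to a bucket's worth of uz_items under the zone lock before importing, sleeps on the zone channel when the cap is reached, and backs the reservation out if the import falls short. A distilled sketch of the reserve/backout protocol (not the verbatim kernel code):

	static int
	zone_reserve_items(uma_zone_t zone, int want, int flags)
	{
		int got;

		ZONE_LOCK(zone);
		while (zone->uz_maxitems != 0 &&
		    zone->uz_items >= zone->uz_maxitems) {
			if (flags & M_NOWAIT) {
				ZONE_UNLOCK(zone);
				return (0);
			}
			zone->uz_sleepers++;
			msleep(zone, zone->uz_lockptr, PVM, "zonelimit", 0);
			zone->uz_sleepers--;
		}
		got = (zone->uz_maxitems != 0) ?
		    MIN(want, zone->uz_maxitems - zone->uz_items) : want;
		zone->uz_items += got;	/* backed out later if import fails */
		ZONE_UNLOCK(zone);
		return (got);
	}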
@@ -2677,6 +2659,7 @@
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("keg_first_slab: domain %d out of range", domain));
+	mtx_assert(&keg->uk_lock, MA_OWNED);

 	slab = NULL;
 	start = domain;
@@ -2753,23 +2736,10 @@
 		if (flags & M_NOVM)
 			break;

-		if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
-			keg->uk_flags |= UMA_ZFLAG_FULL;
-			/*
-			 * If this is not a multi-zone, set the FULL bit.
-			 * Otherwise slab_multi() takes care of it.
-			 */
-			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
-				zone->uz_flags |= UMA_ZFLAG_FULL;
-				zone_log_warning(zone);
-				zone_maxaction(zone);
-			}
-			if (flags & M_NOWAIT)
-				return (NULL);
-			zone->uz_sleeps++;
-			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
-			continue;
-		}
+		KASSERT(zone->uz_maxitems == 0 ||
+		    zone->uz_items <= zone->uz_maxitems,
+		    ("%s: zone %p overflow", __func__, zone));
+
 		slab = keg_alloc_slab(keg, zone, domain, aflags);
 		/*
 		 * If we got a slab here it's safe to mark it partially used
@@ -2812,7 +2782,7 @@
 	uma_slab_t slab;

 	if (keg == NULL) {
-		keg = zone_first_keg(zone);
+		keg = zone->uz_keg;
 		KEG_LOCK(keg);
 	}
@@ -2827,87 +2797,6 @@
 	return (NULL);
 }

-/*
- * uma_zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
- * with the keg locked.  On NULL no lock is held.
- *
- * The last pointer is used to seed the search.  It is not required.
- */
-static uma_slab_t
-zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int domain, int rflags)
-{
-	uma_klink_t klink;
-	uma_slab_t slab;
-	uma_keg_t keg;
-	int flags;
-	int empty;
-	int full;
-
-	/*
-	 * Don't wait on the first pass.  This will skip limit tests
-	 * as well.  We don't want to block if we can find a provider
-	 * without blocking.
-	 */
-	flags = (rflags & ~M_WAITOK) | M_NOWAIT;
-	/*
-	 * Use the last slab allocated as a hint for where to start
-	 * the search.
-	 */
-	if (last != NULL) {
-		slab = keg_fetch_slab(last, zone, domain, flags);
-		if (slab)
-			return (slab);
-		KEG_UNLOCK(last);
-	}
-	/*
-	 * Loop until we have a slab incase of transient failures
-	 * while M_WAITOK is specified.  I'm not sure this is 100%
-	 * required but we've done it for so long now.
-	 */
-	for (;;) {
-		empty = 0;
-		full = 0;
-		/*
-		 * Search the available kegs for slabs.  Be careful to hold the
-		 * correct lock while calling into the keg layer.
-		 */
-		LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
-			keg = klink->kl_keg;
-			KEG_LOCK(keg);
-			if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
-				slab = keg_fetch_slab(keg, zone, domain, flags);
-				if (slab)
-					return (slab);
-			}
-			if (keg->uk_flags & UMA_ZFLAG_FULL)
-				full++;
-			else
-				empty++;
-			KEG_UNLOCK(keg);
-		}
-		if (rflags & (M_NOWAIT | M_NOVM))
-			break;
-		flags = rflags;
-		/*
-		 * All kegs are full.  XXX We can't atomically check all kegs
-		 * and sleep so just sleep for a short period and retry.
-		 */
-		if (full && !empty) {
-			ZONE_LOCK(zone);
-			zone->uz_flags |= UMA_ZFLAG_FULL;
-			zone->uz_sleeps++;
-			zone_log_warning(zone);
-			zone_maxaction(zone);
-			msleep(zone, zone->uz_lockptr, PVM,
-			    "zonelimit", hz/100);
-			zone->uz_flags &= ~UMA_ZFLAG_FULL;
-			ZONE_UNLOCK(zone);
-			continue;
-		}
-	}
-	return (NULL);
-}
-
 static void *
 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
 {
@@ -2982,57 +2871,6 @@
 	return i;
 }

-static uma_bucket_t
-zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
-{
-	uma_bucket_t bucket;
-	int max;
-
-	CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain);
-
-	/* Don't wait for buckets, preserve caller's NOVM setting. */
-	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
-	if (bucket == NULL)
-		return (NULL);
-
-	max = MIN(bucket->ub_entries, zone->uz_count);
-	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
-	    max, domain, flags);
-
-	/*
-	 * Initialize the memory if necessary.
-	 */
-	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
-		int i;
-
-		for (i = 0; i < bucket->ub_cnt; i++)
-			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
-			    flags) != 0)
-				break;
-		/*
-		 * If we couldn't initialize the whole bucket, put the
-		 * rest back onto the freelist.
-		 */
-		if (i != bucket->ub_cnt) {
-			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
-			    bucket->ub_cnt - i);
-#ifdef INVARIANTS
-			bzero(&bucket->ub_bucket[i],
-			    sizeof(void *) * (bucket->ub_cnt - i));
-#endif
-			bucket->ub_cnt = i;
-		}
-	}
-
-	if (bucket->ub_cnt == 0) {
-		bucket_free(zone, bucket, udata);
-		atomic_add_long(&zone->uz_fails, 1);
-		return (NULL);
-	}
-
-	return (bucket);
-}
-
 /*
  * Allocates a single item from a zone.
  *
@@ -3055,16 +2893,35 @@
 	bool skipdbg;
 #endif

-	item = NULL;
+	ZONE_LOCK(zone);
+	if (zone->uz_maxitems && zone->uz_items >= zone->uz_maxitems) {
+		zone_log_warning(zone);
+		zone_maxaction(zone);
+		if (flags & M_NOWAIT) {
+			ZONE_UNLOCK(zone);
+			return (NULL);
+		}
+		zone->uz_sleeps++;
+		zone->uz_sleepers++;
+		msleep(zone, zone->uz_lockptr, PVM, "zonelimit", 0);
+		KASSERT(zone->uz_items < zone->uz_maxitems,
+		    ("%s: woke up with full zone %p", __func__, zone));
+		zone->uz_sleepers--;
+	}
+	zone->uz_items++;
+	zone->uz_allocs++;
+	if (zone->uz_sleepers && zone->uz_items < zone->uz_maxitems)
+		wakeup_one(zone);
+	ZONE_UNLOCK(zone);
+
 	if (domain != UMA_ANYDOMAIN) {
 		/* avoid allocs targeting empty domains */
 		if (VM_DOMAIN_EMPTY(domain))
 			domain = UMA_ANYDOMAIN;
 	}
+	item = NULL;
 	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
 		goto fail;
-	atomic_add_long(&zone->uz_allocs, 1);

 #ifdef INVARIANTS
 	skipdbg = uma_dbg_zskip(zone, item);
@@ -3105,6 +2962,10 @@
 fail:
 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
 	    zone->uz_name, zone);
+	ZONE_LOCK(zone);
+	zone->uz_items--;
+	zone->uz_allocs--;
+	ZONE_UNLOCK(zone);
 	atomic_add_long(&zone->uz_fails, 1);
 	return (NULL);
 }
@@ -3116,7 +2977,8 @@
 	uma_cache_t cache;
 	uma_bucket_t bucket;
 	uma_zone_domain_t zdom;
-	int cpu, domain, lockfail;
+	int cpu, domain;
+	bool lockfail, locked;
 #ifdef INVARIANTS
 	bool skipdbg;
 #endif
@@ -3158,13 +3020,6 @@
 #endif
 	zone->uz_dtor(item, zone->uz_size, udata);

-	/*
-	 * The race here is acceptable.  If we miss it we'll just have to wait
-	 * a little longer for the limits to be reset.
-	 */
-	if (zone->uz_flags & UMA_ZFLAG_FULL)
-		goto zfree_item;
-
 	/*
 	 * If possible, free to the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
@@ -3176,6 +3031,7 @@
 	 * current cache; when we re-acquire the critical section, we must
 	 * detect and handle migration if it has occurred.
 	 */
+	locked = false;
 zfree_restart:
 	critical_enter();
 	cpu = curcpu;
@@ -3197,6 +3053,8 @@
 			bucket->ub_cnt++;
 			cache->uc_frees++;
 			critical_exit();
+			if (locked)
+				ZONE_UNLOCK(zone);
 			return;
 		}
@@ -3212,21 +3070,30 @@
 	if (zone->uz_count == 0 || bucketdisable)
 		goto zfree_item;

-	lockfail = 0;
-	if (ZONE_TRYLOCK(zone) == 0) {
-		/* Record contention to size the buckets. */
-		ZONE_LOCK(zone);
-		lockfail = 1;
+	lockfail = false;
+	if (!locked) {
+		if (ZONE_TRYLOCK(zone) == 0) {
+			/* Record contention to size the buckets. */
+			ZONE_LOCK(zone);
+			lockfail = true;
+		}
+		locked = true;
 	}
+	/*
+	 * Now that we have the lock, check for sleepers and give the first
+	 * one a chance to allocate.  If the item ends up cached on a CPU,
+	 * a sleeper woken on the wrong CPU will simply go back to sleep.
+	 */
+	if (zone->uz_sleepers)
+		wakeup_one(zone);
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
 	bucket = cache->uc_freebucket;
-	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
-		ZONE_UNLOCK(zone);
+	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries)
 		goto zfree_start;
-	}
 	cache->uc_freebucket = NULL;
 	/* We are no longer associated with this CPU. */
 	critical_exit();
@@ -3245,10 +3112,11 @@
 	    "uma_zfree: zone %s(%p) putting bucket %p on free list",
 	    zone->uz_name, zone, bucket);
 	/* ub_cnt is pointing to the last free item */
-	KASSERT(bucket->ub_cnt != 0,
-	    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
-	if ((zone->uz_flags & UMA_ZONE_NOBUCKETCACHE) != 0) {
+	KASSERT(bucket->ub_cnt == bucket->ub_entries,
+	    ("uma_zfree: Attempting to insert not full bucket onto the full list.\n"));
+	if (zone->uz_bktcount >= zone->uz_bktmax) {
 		ZONE_UNLOCK(zone);
+		locked = false;
 		bucket_drain(zone, bucket);
 		bucket_free(zone, bucket, udata);
 		goto zfree_restart;
@@ -3260,9 +3128,10 @@
 	 * We bump the uz count when the cache size is insufficient to
 	 * handle the working set.
 	 */
-	if (lockfail && zone->uz_count < BUCKET_MAX)
+	if (lockfail && zone->uz_count < zone->uz_count_max)
 		zone->uz_count++;
 	ZONE_UNLOCK(zone);
+	locked = false;

 	bucket = bucket_alloc(zone, udata, M_NOWAIT);
 	CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
@@ -3290,9 +3159,9 @@
 	 * If nothing else caught this, we'll just do an internal free.
 	 */
 zfree_item:
+	if (locked)
+		ZONE_UNLOCK(zone);
 	zone_free_item(zone, item, udata, SKIP_DTOR);
-
-	return;
 }

 void
@@ -3315,11 +3184,14 @@
 }

 static void
-slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
+slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
 {
+	uma_keg_t keg;
 	uma_domain_t dom;
 	uint8_t freei;

+	keg = zone->uz_keg;
+	MPASS(zone->uz_lockptr == &keg->uk_lock);
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	MPASS(keg == slab->us_keg);
@@ -3350,11 +3222,9 @@
 	uma_slab_t slab;
 	uma_keg_t keg;
 	uint8_t *mem;
-	int clearfull;
 	int i;

-	clearfull = 0;
-	keg = zone_first_keg(zone);
+	keg = zone->uz_keg;
 	KEG_LOCK(keg);
 	for (i = 0; i < cnt; i++) {
 		item = bucket[i];
@@ -3368,37 +3238,13 @@
 			}
 		} else {
 			slab = vtoslab((vm_offset_t)item);
-			if (slab->us_keg != keg) {
-				KEG_UNLOCK(keg);
-				keg = slab->us_keg;
-				KEG_LOCK(keg);
-			}
-		}
-		slab_free_item(keg, slab, item);
-		if (keg->uk_flags & UMA_ZFLAG_FULL) {
-			if (keg->uk_pages < keg->uk_maxpages) {
-				keg->uk_flags &= ~UMA_ZFLAG_FULL;
-				clearfull = 1;
-			}
-
-			/*
-			 * We can handle one more allocation.  Since we're
-			 * clearing ZFLAG_FULL, wake up all procs blocked
-			 * on pages.  This should be uncommon, so keeping this
-			 * simple for now (rather than adding count of blocked
-			 * threads etc).
-			 */
-			wakeup(keg);
+			MPASS(slab->us_keg == keg);
 		}
+		slab_free_item(zone, slab, item);
+		if (zone->uz_sleepers && zone->uz_items < zone->uz_maxitems)
+			wakeup_one(zone);
 	}
 	KEG_UNLOCK(keg);
-	if (clearfull) {
-		ZONE_LOCK(zone);
-		zone->uz_flags &= ~UMA_ZFLAG_FULL;
-		wakeup(zone);
-		ZONE_UNLOCK(zone);
-	}
-
 }

 /*
@@ -3435,25 +3281,53 @@
 	if (skip < SKIP_FINI && zone->uz_fini)
 		zone->uz_fini(item, zone->uz_size);

-	atomic_add_long(&zone->uz_frees, 1);
 	zone->uz_release(zone->uz_arg, &item, 1);
+
+	ZONE_LOCK(zone);
+	zone->uz_frees++;
+	zone->uz_items--;
+	if (zone->uz_sleepers && zone->uz_items < zone->uz_maxitems)
+		wakeup_one(zone);
+	ZONE_UNLOCK(zone);
 }

 /* See uma.h */
 int
 uma_zone_set_max(uma_zone_t zone, int nitems)
 {
-	uma_keg_t keg;
+	struct uma_bucket_zone *ubz;

-	keg = zone_first_keg(zone);
-	if (keg == NULL)
-		return (0);
-	KEG_LOCK(keg);
-	keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
-	if (keg->uk_maxpages * keg->uk_ipers < nitems)
-		keg->uk_maxpages += keg->uk_ppera;
-	nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
-	KEG_UNLOCK(keg);
+	/*
+	 * If the limit is very low we may need to constrain how many items
+	 * are allowed in the CPU caches as well.
+	 */
+	ubz = &bucket_zones[0];
+	for (; ubz->ubz_entries != 0; ubz++)
+		if (ubz->ubz_entries * 2 * mp_ncpus > nitems)
+			break;
+	if (ubz == &bucket_zones[0])
+		nitems = ubz->ubz_entries * 2 * mp_ncpus;
+	else
+		ubz--;
+
+	ZONE_LOCK(zone);
+	zone->uz_count_max = zone->uz_count = ubz->ubz_entries;
+	if (zone->uz_count_min > zone->uz_count_max)
+		zone->uz_count_min = zone->uz_count_max;
+	zone->uz_maxitems = nitems;
+	ZONE_UNLOCK(zone);
+
+	return (nitems);
+}
+
+/* See uma.h */
+int
+uma_zone_set_maxcache(uma_zone_t zone, int nitems)
+{
+
+	ZONE_LOCK(zone);
+	zone->uz_bktmax = nitems;
+	ZONE_UNLOCK(zone);

 	return (nitems);
 }
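The bucket_zones[] walk above keeps per-CPU caches from swallowing a small limit: each CPU may hold one allocation and one free bucket, so the largest admissible bucket is the one whose entries * 2 * mp_ncpus still fits under nitems. A worked example, assuming the stock bucket sizes (2, 4, 8, 16, ...): with mp_ncpus = 8 and nitems = 200, 8-entry buckets can cache at most 8 * 2 * 8 = 128 items, which fits, while 16-entry buckets could cache 256 > 200, so uz_count_max becomes 8. If nitems is below even the smallest bucket's worst case, nitems itself is rounded up, which is why the effective value is returned to the caller.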
@@ -3463,14 +3337,10 @@
 uma_zone_get_max(uma_zone_t zone)
 {
 	int nitems;
-	uma_keg_t keg;

-	keg = zone_first_keg(zone);
-	if (keg == NULL)
-		return (0);
-	KEG_LOCK(keg);
-	nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
-	KEG_UNLOCK(keg);
+	ZONE_LOCK(zone);
+	nitems = zone->uz_maxitems;
+	ZONE_UNLOCK(zone);

 	return (nitems);
 }
@@ -3524,8 +3394,7 @@
 {
 	uma_keg_t keg;

-	keg = zone_first_keg(zone);
-	KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type"));
+	KEG_GET(zone, keg);
 	KEG_LOCK(keg);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_init on non-empty keg"));
@@ -3539,8 +3408,7 @@
 {
 	uma_keg_t keg;

-	keg = zone_first_keg(zone);
-	KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type"));
+	KEG_GET(zone, keg);
 	KEG_LOCK(keg);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_fini on non-empty keg"));
@@ -3554,7 +3422,7 @@
 {

 	ZONE_LOCK(zone);
-	KASSERT(zone_first_keg(zone)->uk_pages == 0,
+	KASSERT(zone->uz_keg->uk_pages == 0,
 	    ("uma_zone_set_zinit on non-empty keg"));
 	zone->uz_init = zinit;
 	ZONE_UNLOCK(zone);
@@ -3566,7 +3434,7 @@
 {

 	ZONE_LOCK(zone);
-	KASSERT(zone_first_keg(zone)->uk_pages == 0,
+	KASSERT(zone->uz_keg->uk_pages == 0,
 	    ("uma_zone_set_zfini on non-empty keg"));
 	zone->uz_fini = zfini;
 	ZONE_UNLOCK(zone);
@@ -3579,7 +3447,7 @@
 {
 	uma_keg_t keg;

-	keg = zone_first_keg(zone);
+	KEG_GET(zone, keg);
 	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
 	KEG_LOCK(keg);
 	keg->uk_freef = freef;
@@ -3593,7 +3461,7 @@
 {
 	uma_keg_t keg;

-	keg = zone_first_keg(zone);
+	KEG_GET(zone, keg);
 	KEG_LOCK(keg);
 	keg->uk_allocf = allocf;
 	KEG_UNLOCK(keg);
@@ -3605,9 +3473,7 @@
 {
 	uma_keg_t keg;

-	keg = zone_first_keg(zone);
-	if (keg == NULL)
-		return;
+	KEG_GET(zone, keg);
 	KEG_LOCK(keg);
 	keg->uk_reserve = items;
 	KEG_UNLOCK(keg);
@@ -3623,11 +3489,9 @@
 	vm_offset_t kva;
 	u_int pages;

-	keg = zone_first_keg(zone);
-	if (keg == NULL)
-		return (0);
-	pages = count / keg->uk_ipers;
+	KEG_GET(zone, keg);

+	pages = count / keg->uk_ipers;
 	if (pages * keg->uk_ipers < count)
 		pages++;
 	pages *= keg->uk_ppera;
@@ -3645,7 +3509,6 @@
 	KEG_LOCK(keg);
 	keg->uk_kva = kva;
 	keg->uk_offset = 0;
-	keg->uk_maxpages = pages;
 #ifdef UMA_MD_SMALL_ALLOC
 	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
 #else
@@ -3667,9 +3530,7 @@
 	uma_keg_t keg;
 	int domain, flags, slabs;

-	keg = zone_first_keg(zone);
-	if (keg == NULL)
-		return;
+	KEG_GET(zone, keg);
 	KEG_LOCK(keg);
 	slabs = items / keg->uk_ipers;
 	if (slabs * keg->uk_ipers < items)
@@ -3758,7 +3619,7 @@
 	int full;

 	ZONE_LOCK(zone);
-	full = (zone->uz_flags & UMA_ZFLAG_FULL);
+	full = (zone->uz_sleepers > 0);
 	ZONE_UNLOCK(zone);
 	return (full);
 }

@@ -3766,7 +3627,7 @@
 int
 uma_zone_exhausted_nolock(uma_zone_t zone)
 {
-	return (zone->uz_flags & UMA_ZFLAG_FULL);
+	return (zone->uz_sleepers > 0);
 }

 void *
@@ -3886,11 +3747,11 @@
 	int i;

 	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
-	    "out %d free %d limit %d\n",
+	    "out %d free %d\n",
 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
 	    keg->uk_ipers, keg->uk_ppera,
 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
-	    keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
+	    keg->uk_free);
 	for (i = 0; i < vm_ndomains; i++) {
 		dom = &keg->uk_domain[i];
 		printf("Part slabs:\n");
@@ -3909,13 +3770,13 @@
 uma_print_zone(uma_zone_t zone)
 {
 	uma_cache_t cache;
-	uma_klink_t kl;
 	int i;

-	printf("zone: %s(%p) size %d flags %#x\n",
-	    zone->uz_name, zone, zone->uz_size, zone->uz_flags);
-	LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
-		uma_print_keg(kl->kl_keg);
+	printf("zone: %s(%p) size %d maxitems %lu flags %#x\n",
+	    zone->uz_name, zone, zone->uz_size, zone->uz_maxitems,
+	    zone->uz_flags);
+	if (zone->uz_lockptr != &zone->uz_lock)
+		uma_print_keg(zone->uz_keg);
 	CPU_FOREACH(i) {
 		cache = &zone->uz_cpu[i];
 		printf("CPU %d Cache:\n", i);
@@ -3994,10 +3855,8 @@
 	uma_zone_domain_t zdom;
 	struct sbuf sbuf;
 	uma_cache_t cache;
-	uma_klink_t kl;
 	uma_keg_t kz;
 	uma_zone_t z;
-	uma_keg_t k;
 	int count, error, i;

 	error = sysctl_wire_old_buffer(req, 0);
@@ -4031,14 +3890,12 @@
 			uth.uth_align = kz->uk_align;
 			uth.uth_size = kz->uk_size;
 			uth.uth_rsize = kz->uk_rsize;
-			LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
-				k = kl->kl_keg;
-				uth.uth_maxpages += k->uk_maxpages;
-				uth.uth_pages += k->uk_pages;
-				uth.uth_keg_free += k->uk_free;
-				uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
-				    * k->uk_ipers;
-			}
+			uth.uth_pages += (z->uz_items / kz->uk_ipers) *
+			    kz->uk_ppera;
+			uth.uth_maxpages += (z->uz_maxitems / kz->uk_ipers) *
+			    kz->uk_ppera;
+			uth.uth_limit = z->uz_maxitems;
+			uth.uth_keg_free += z->uz_keg->uk_free;

 			/*
 			 * A zone is secondary is it is not the first entry
@@ -4135,8 +3992,10 @@
 	 * zone is unlocked because the item's allocation state
 	 * essentially holds a reference.
 	 */
+	if (zone->uz_lockptr == &zone->uz_lock)
+		return (NULL);
 	ZONE_LOCK(zone);
-	keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
+	keg = zone->uz_keg;
 	if (keg->uk_flags & UMA_ZONE_HASH)
 		slab = hash_sfind(&keg->uk_hash, mem);
 	else
@@ -4150,12 +4009,11 @@
 static bool
 uma_dbg_zskip(uma_zone_t zone, void *mem)
 {
-	uma_keg_t keg;

-	if ((keg = zone_first_keg(zone)) == NULL)
+	if (zone->uz_lockptr == &zone->uz_lock)
 		return (true);

-	return (uma_dbg_kskip(keg, mem));
+	return (uma_dbg_kskip(zone->uz_keg, mem));
 }

 static bool
Index: sys/vm/uma_int.h
===================================================================
--- sys/vm/uma_int.h
+++ sys/vm/uma_int.h
@@ -223,7 +223,7 @@
 *
 */
 struct uma_keg {
-	struct mtx	uk_lock;	/* Lock for the keg */
+	struct mtx	uk_lock;	/* Lock for the keg.  MUST be first! */
 	struct uma_hash	uk_hash;
 	LIST_HEAD(,uma_zone)	uk_zones;	/* Keg's zones */
@@ -234,7 +234,7 @@
 	uint32_t	uk_reserve;	/* Number of reserved items. */
 	uint32_t	uk_size;	/* Requested size of each item */
 	uint32_t	uk_rsize;	/* Real size of each item */
-	uint32_t	uk_maxpages;	/* Maximum number of pages to alloc */
+	/* 32 bit pad */

 	uma_init	uk_init;	/* Keg's init routine */
 	uma_fini	uk_fini;	/* Keg's fini routine */
@@ -296,12 +296,6 @@
 typedef struct uma_slab * uma_slab_t;
 typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int, int);

-struct uma_klink {
-	LIST_ENTRY(uma_klink) kl_link;
-	uma_keg_t	kl_keg;
-};
-typedef struct uma_klink *uma_klink_t;
-
 struct uma_zone_domain {
 	LIST_HEAD(,uma_bucket)	uzd_buckets;	/* full buckets */
 	long		uzd_nitems;	/* total item count */
@@ -320,26 +314,30 @@
 */
 struct uma_zone {
 	/* Offset 0, used in alloc/free fast/medium fast path and const. */
-	struct mtx	*uz_lockptr;
-	const char	*uz_name;	/* Text name of the zone */
+	union {
+		uma_keg_t	uz_keg;		/* This zone's keg */
+		struct mtx	*uz_lockptr;	/* To keg or to self */
+	};
 	struct uma_zone_domain	*uz_domain;	/* per-domain buckets */
 	uint32_t	uz_flags;	/* Flags inherited from kegs */
 	uint32_t	uz_size;	/* Size inherited from kegs */

 	uma_ctor	uz_ctor;	/* Constructor for each allocation */
 	uma_dtor	uz_dtor;	/* Destructor */
-	uma_init	uz_init;	/* Initializer for each item */
-	uma_fini	uz_fini;	/* Finalizer for each item. */
+	uint64_t	uz_items;	/* Total items count */
+	uint64_t	uz_maxitems;	/* Maximum number of items to alloc */
+	uint32_t	uz_sleepers;	/* Number of sleepers on memory */
+	uint16_t	uz_count;	/* Amount of items in full bucket */
+	uint16_t	uz_count_max;	/* Maximum amount of items there */

 	/* Offset 64, used in bucket replenish. */
 	uma_import	uz_import;	/* Import new memory to cache. */
 	uma_release	uz_release;	/* Release memory from cache. */
 	void		*uz_arg;	/* Import/release argument. */
+	uma_init	uz_init;	/* Initializer for each item */
+	uma_fini	uz_fini;	/* Finalizer for each item. */
 	uma_slaballoc	uz_slab;	/* Allocate a slab from the backend. */
-	uint16_t	uz_count;	/* Amount of items in full bucket */
-	uint16_t	uz_count_min;	/* Minimal amount of items there */
-	/* 32bit pad on 64bit. */
-	LIST_ENTRY(uma_zone)	uz_link;	/* List of all zones in keg */
-	LIST_HEAD(,uma_klink)	uz_kegs;	/* List of kegs. */
+	uint64_t	uz_bktcount;	/* Items in bucket cache */
+	uint64_t	uz_bktmax;	/* Maximum bucket cache size */

 	/* Offset 128 Rare. */
 	/*
@@ -348,19 +346,19 @@
 	 * members to reduce alignment overhead.
 	 */
 	struct mtx	uz_lock;	/* Lock for the zone */
-	struct uma_klink	uz_klink;	/* klink for first keg. */
+	LIST_ENTRY(uma_zone)	uz_link;	/* List of all zones in keg */
+	const char	*uz_name;	/* Text name of the zone */
 	/* The next two fields are used to print a rate-limited warnings. */
 	const char	*uz_warning;	/* Warning to print on failure */
 	struct timeval	uz_ratecheck;	/* Warnings rate-limiting */
 	struct task	uz_maxaction;	/* Task to run when at limit */
+	uint16_t	uz_count_min;	/* Minimal amount of items in bucket */

-	/* 16 bytes of pad. */
-
-	/* Offset 256, atomic stats. */
-	volatile u_long	uz_allocs UMA_ALIGN; /* Total number of allocations */
-	volatile u_long	uz_fails;	/* Total number of alloc failures */
-	volatile u_long	uz_frees;	/* Total number of frees */
+	/* Offset 256, stats. */
+	uint64_t	uz_allocs UMA_ALIGN; /* Total number of allocations */
 	uint64_t	uz_sleeps;	/* Total number of alloc sleeps */
+	uint64_t	uz_frees;	/* Total number of frees */
+	volatile u_long	uz_fails;	/* Total number of alloc failures */

 	/*
 	 * This HAS to be the last item because we adjust the zone size
@@ -378,21 +376,11 @@
 #define	UMA_ZFLAG_DRAINING	0x08000000	/* Running zone_drain. */
 #define	UMA_ZFLAG_BUCKET	0x10000000	/* Bucket zone. */
 #define	UMA_ZFLAG_INTERNAL	0x20000000	/* No offpage no PCPU. */
-#define	UMA_ZFLAG_FULL		0x40000000	/* Reached uz_maxpages */
 #define	UMA_ZFLAG_CACHEONLY	0x80000000	/* Don't ask VM for buckets. */

 #define	UMA_ZFLAG_INHERIT						\
     (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY | UMA_ZFLAG_BUCKET)

-static inline uma_keg_t
-zone_first_keg(uma_zone_t zone)
-{
-	uma_klink_t klink;
-
-	klink = LIST_FIRST(&zone->uz_kegs);
-	return (klink != NULL) ? klink->kl_keg : NULL;
-}
-
 #undef UMA_ALIGN

 #ifdef _KERNEL
@@ -418,6 +406,12 @@
 #define	KEG_LOCK(k)	mtx_lock(&(k)->uk_lock)
 #define	KEG_UNLOCK(k)	mtx_unlock(&(k)->uk_lock)

+#define	KEG_GET(zone, keg) do {					\
+	(keg) = (zone)->uz_keg;					\
+	KASSERT((void *)(keg) != (void *)&(zone)->uz_lock,	\
+	    ("%s: Invalid zone %p type", __func__, (zone)));	\
+	} while (0)
+
 #define	ZONE_LOCK_INIT(z, lc)					\
 	do {							\
 		if ((lc))					\
@@ -431,6 +425,7 @@
 #define	ZONE_LOCK(z)	mtx_lock((z)->uz_lockptr)
 #define	ZONE_TRYLOCK(z)	mtx_trylock((z)->uz_lockptr)
 #define	ZONE_UNLOCK(z)	mtx_unlock((z)->uz_lockptr)
+#define	ZONE_LOCK_ASSERT(z)	mtx_assert((z)->uz_lockptr, MA_OWNED)
 #define	ZONE_LOCK_FINI(z)	mtx_destroy(&(z)->uz_lock)
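uz_keg and uz_lockptr deliberately overlay each other: because uk_lock is the first member of struct uma_keg (the "MUST be first!" comment above), a pointer to the keg is simultaneously a pointer to the zone's lock, so one word serves both roles. Cache-only zones instead point uz_lockptr at their own embedded uz_lock, which is exactly what KEG_GET()'s KASSERT checks for. A distilled sketch of the discriminator:

	static inline bool
	zone_is_cache_only(uma_zone_t zone)
	{

		/* Cache zones alias uz_lockptr to their own embedded lock. */
		return (zone->uz_lockptr == &zone->uz_lock);
	}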
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -222,7 +222,8 @@
 		vmd->vmd_pgcache = uma_zcache_create("vm pgcache",
 		    sizeof(struct vm_page), NULL, NULL, NULL, NULL,
 		    vm_page_import, vm_page_release, vmd,
-		    UMA_ZONE_NOBUCKETCACHE | UMA_ZONE_MAXBUCKET | UMA_ZONE_VM);
+		    UMA_ZONE_MAXBUCKET | UMA_ZONE_VM);
+		(void)uma_zone_set_maxcache(vmd->vmd_pgcache, 0);
 	}
 }
 SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL);
Index: sys/vm/vm_pager.c
===================================================================
--- sys/vm/vm_pager.c
+++ sys/vm/vm_pager.c
@@ -85,10 +85,10 @@
 #include 
 #include 
 #include 
+#include 
+#include "opt_swap.h"
 
-int cluster_pbuf_freecnt = -1;	/* unlimited to begin with */
-
-struct buf *swbuf;
+uma_zone_t pbuf_zone;
 
 static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
 static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
@@ -167,9 +167,6 @@
  * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size
  * (MAXPHYS == 64k) if you want to get the most efficiency.
  */
-struct mtx_padalign __exclusive_cache_line pbuf_mtx;
-static TAILQ_HEAD(swqueue, buf) bswlist;
-static int bswneeded;
 vm_offset_t swapbkva;		/* swap buffers kva */
 
 void
@@ -177,7 +174,6 @@
 {
 	struct pagerops **pgops;
 
-	TAILQ_INIT(&bswlist);
 	/*
 	 * Initialize known pagers
 	 */
@@ -189,25 +185,24 @@
 void
 vm_pager_bufferinit(void)
 {
-	struct buf *bp;
-	int i;
 
-	mtx_init(&pbuf_mtx, "pbuf mutex", NULL, MTX_DEF);
-	bp = swbuf;
 	/*
-	 * Now set up swap and physical I/O buffer headers.
+	 * swbufs are used as temporary holders for I/O, such as paging I/O.
+	 * We have no fewer than 16 and no more than 256.
 	 */
-	for (i = 0; i < nswbuf; i++, bp++) {
-		TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist);
-		BUF_LOCKINIT(bp);
-		LIST_INIT(&bp->b_dep);
-		bp->b_rcred = bp->b_wcred = NOCRED;
-		bp->b_xflags = 0;
-	}
-
-	cluster_pbuf_freecnt = nswbuf / 2;
-	vnode_pbuf_freecnt = nswbuf / 2 + 1;
-	vnode_async_pbuf_freecnt = nswbuf / 2;
+#ifndef NSWBUF_MIN
+#define	NSWBUF_MIN	16
+#endif
+	nswbuf = min(nbuf / 4, 256);
+	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
+	if (nswbuf < NSWBUF_MIN)
+		nswbuf = NSWBUF_MIN;
+
+	/* Main zone for paging bufs. */
+	pbuf_zone = uma_zcreate("pbuf", sizeof(struct buf),
+	    pbuf_ctor, pbuf_dtor, pbuf_init, NULL, UMA_ALIGN_CACHE,
+	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
+	uma_zone_set_max(pbuf_zone, nswbuf);
 }
 
 /*
@@ -347,110 +342,33 @@
 	return (object);
 }
 
-/*
- * initialize a physical buffer
- */
-
-/*
- * XXX This probably belongs in vfs_bio.c
- */
-static void
-initpbuf(struct buf *bp)
+int
+pbuf_ctor(void *mem, int size, void *arg, int flags)
 {
+	struct buf *bp = mem;
 
-	KASSERT(bp->b_bufobj == NULL, ("initpbuf with bufobj"));
-	KASSERT(bp->b_vp == NULL, ("initpbuf with vp"));
+	bp->b_vp = NULL;
+	bp->b_bufobj = NULL;
+
+	/* copied from initpbuf() */
 	bp->b_rcred = NOCRED;
 	bp->b_wcred = NOCRED;
-	bp->b_qindex = 0;	/* On no queue (QUEUE_NONE) */
-	bp->b_kvabase = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva;
+	bp->b_qindex = 0;	/* On no queue (QUEUE_NONE) */
 	bp->b_data = bp->b_kvabase;
-	bp->b_kvasize = MAXPHYS;
-	bp->b_flags = 0;
 	bp->b_xflags = 0;
+	bp->b_flags = 0;
 	bp->b_ioflags = 0;
 	bp->b_iodone = NULL;
 	bp->b_error = 0;
 	BUF_LOCK(bp, LK_EXCLUSIVE, NULL);
-	buf_track(bp, __func__);
-}
-
-/*
- * allocate a physical buffer
- *
- * There are a limited number (nswbuf) of physical buffers.  We need
- * to make sure that no single subsystem is able to hog all of them,
- * so each subsystem implements a counter which is typically initialized
- * to 1/2 nswbuf.  getpbuf() decrements this counter in allocation and
- * increments it on release, and blocks if the counter hits zero.  A
- * subsystem may initialize the counter to -1 to disable the feature,
- * but it must still be sure to match up all uses of getpbuf() with
- * relpbuf() using the same variable.
- *
- * NOTE: pfreecnt can be NULL, but this 'feature' will be removed
- * relatively soon when the rest of the subsystems get smart about it. XXX
- */
-struct buf *
-getpbuf(int *pfreecnt)
-{
-	struct buf *bp;
-
-	mtx_lock(&pbuf_mtx);
-	for (;;) {
-		if (pfreecnt != NULL) {
-			while (*pfreecnt == 0) {
-				msleep(pfreecnt, &pbuf_mtx, PVM, "wswbuf0", 0);
-			}
-		}
-
-		/* get a bp from the swap buffer header pool */
-		if ((bp = TAILQ_FIRST(&bswlist)) != NULL)
-			break;
-
-		bswneeded = 1;
-		msleep(&bswneeded, &pbuf_mtx, PVM, "wswbuf1", 0);
-		/* loop in case someone else grabbed one */
-	}
-	TAILQ_REMOVE(&bswlist, bp, b_freelist);
-	if (pfreecnt)
-		--*pfreecnt;
-	mtx_unlock(&pbuf_mtx);
-	initpbuf(bp);
-	return (bp);
-}
-
-/*
- * allocate a physical buffer, if one is available.
- *
- * Note that there is no NULL hack here - all subsystems using this
- * call understand how to use pfreecnt.
- */
-struct buf *
-trypbuf(int *pfreecnt)
-{
-	struct buf *bp;
-
-	mtx_lock(&pbuf_mtx);
-	if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) {
-		mtx_unlock(&pbuf_mtx);
-		return NULL;
-	}
-	TAILQ_REMOVE(&bswlist, bp, b_freelist);
-	--*pfreecnt;
-	mtx_unlock(&pbuf_mtx);
-	initpbuf(bp);
-	return (bp);
+	return (0);
 }
 
-/*
- * release a physical buffer
- *
- * NOTE: pfreecnt can be NULL, but this 'feature' will be removed
- * relatively soon when the rest of the subsystems get smart about it. XXX
- */
 void
-relpbuf(struct buf *bp, int *pfreecnt)
+pbuf_dtor(void *mem, int size, void *arg)
 {
+	struct buf *bp = mem;
 
 	if (bp->b_rcred != NOCRED) {
 		crfree(bp->b_rcred);
@@ -461,24 +379,24 @@
 		bp->b_wcred = NOCRED;
 	}
 
-	KASSERT(bp->b_vp == NULL, ("relpbuf with vp"));
-	KASSERT(bp->b_bufobj == NULL, ("relpbuf with bufobj"));
-
-	buf_track(bp, __func__);
 	BUF_UNLOCK(bp);
+}
+
+int
+pbuf_init(void *mem, int size, int flags)
+{
+	struct buf *bp = mem;
 
-	mtx_lock(&pbuf_mtx);
-	TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist);
+	bp->b_kvabase = (void *)kva_alloc(MAXPHYS);
+	if (bp->b_kvabase == NULL)
+		return (ENOMEM);
+	bp->b_kvasize = MAXPHYS;
+	BUF_LOCKINIT(bp);
+	LIST_INIT(&bp->b_dep);
+	bp->b_rcred = bp->b_wcred = NOCRED;
+	bp->b_xflags = 0;
 
-	if (bswneeded) {
-		bswneeded = 0;
-		wakeup(&bswneeded);
-	}
-	if (pfreecnt) {
-		if (++*pfreecnt == 1)
-			wakeup(pfreecnt);
-	}
-	mtx_unlock(&pbuf_mtx);
+	return (0);
 }
 
 /*
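Note: with getpbuf()/trypbuf()/relpbuf() deleted above, consumers convert to the idiom below (a sketch, not a hunk from this diff). M_WAITOK sleeps inside the allocator when the zone sits at its uma_zone_set_max() limit, which is what getpbuf()'s counter sleep used to provide, while M_NOWAIT should return NULL at the limit, matching trypbuf():

	static void
	pbuf_roundtrip_sketch(void)	/* hypothetical helper */
	{
		struct buf *bp;

		bp = uma_zalloc(pbuf_zone, M_WAITOK);	/* was getpbuf() */
		/* ... set b_data/b_iocmd, issue and await the I/O ... */
		uma_zfree(pbuf_zone, bp);		/* was relpbuf() */

		bp = uma_zalloc(pbuf_zone, M_NOWAIT);	/* was trypbuf() */
		if (bp != NULL)
			uma_zfree(pbuf_zone, bp);
	}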
- *
- * Note that there is no NULL hack here - all subsystems using this
- * call understand how to use pfreecnt.
- */
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -58,6 +58,7 @@
 #include "opt_vm.h"
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -82,6 +83,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
     daddr_t *rtaddress, int *run);
@@ -107,15 +109,27 @@
 	.pgo_haspage =	vnode_pager_haspage,
 };
 
-int vnode_pbuf_freecnt;
-int vnode_async_pbuf_freecnt;
-
 static struct domainset *vnode_domainset = NULL;
 
 SYSCTL_PROC(_debug, OID_AUTO, vnode_domainset, CTLTYPE_STRING | CTLFLAG_RW,
     &vnode_domainset, 0, sysctl_handle_domainset, "A",
     "Default vnode NUMA policy");
 
+static uma_zone_t	vnode_pbuf_zone;
+
+static void
+vnode_pager_init(void *dummy)
+{
+
+	vnode_pbuf_zone = uma_zsecond_create("vnpbuf", pbuf_ctor, pbuf_dtor,
+	    pbuf_init, NULL, pbuf_zone);
+	uma_zone_set_max(vnode_pbuf_zone, nswbuf * 8);
+#if 0
+	uma_prealloc(vnode_pbuf_zone, nswbuf * 8);
+#endif
+}
+SYSINIT(vnode_pager, SI_SUB_CPU, SI_ORDER_ANY, vnode_pager_init, NULL);
+
 /* Create the VM system backing object for this vnode */
 int
 vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td)
@@ -563,7 +577,7 @@
 			break;
 		}
 		if (fileaddr != -1) {
-			bp = getpbuf(&vnode_pbuf_freecnt);
+			bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK);
 
 			/* build a minimal buffer header */
 			bp->b_iocmd = BIO_READ;
@@ -595,7 +609,7 @@
 			 */
 			bp->b_vp = NULL;
 			pbrelbo(bp);
-			relpbuf(bp, &vnode_pbuf_freecnt);
+			uma_zfree(vnode_pbuf_zone, bp);
 			if (error)
 				break;
 		} else
@@ -757,7 +771,7 @@
 #ifdef INVARIANTS
 	off_t blkno0;
 #endif
-	int bsize, pagesperblock, *freecnt;
+	int bsize, pagesperblock;
 	int error, before, after, rbehind, rahead, poff, i;
 	int bytecount, secmask;
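Note: the vnpbuf zone created above follows the same recipe as mdpbuf and fusepbuf earlier in this diff: uma_zsecond_create() stacks a secondary zone on pbuf_zone, sharing its items and the pbuf_ctor/pbuf_dtor/pbuf_init hooks, while uma_zone_set_max() gives each consumer a private cap so no single subsystem can hog the global pbuf space. Generic form of the pattern, with placeholder names; the next hunk then drops the old sync/async counter split, as a single capped zone serves both paths:

	static uma_zone_t xx_pbuf_zone;	/* "xx" stands in for a subsystem */

	static void
	xx_pbuf_zone_create(void)
	{
		xx_pbuf_zone = uma_zsecond_create("xxpbuf", pbuf_ctor,
		    pbuf_dtor, pbuf_init, NULL, pbuf_zone);
		uma_zone_set_max(xx_pbuf_zone, nswbuf / 2);	/* example cap */
	}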
@@ -788,17 +802,7 @@
 		return (VM_PAGER_OK);
 	}
 
-	/*
-	 * Synchronous and asynchronous paging operations use different
-	 * free pbuf counters.  This is done to avoid asynchronous requests
-	 * to consume all pbufs.
-	 * Allocate the pbuf at the very beginning of the function, so that
-	 * if we are low on certain kind of pbufs don't even proceed to BMAP,
-	 * but sleep.
-	 */
-	freecnt = iodone != NULL ?
-	    &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt;
-	bp = getpbuf(freecnt);
+	bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK);
 
 	/*
 	 * Get the underlying device blocks for the file with VOP_BMAP().
@@ -807,7 +811,7 @@
 	 */
 	error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before);
 	if (error == EOPNOTSUPP) {
-		relpbuf(bp, freecnt);
+		uma_zfree(vnode_pbuf_zone, bp);
 		VM_OBJECT_WLOCK(object);
 		for (i = 0; i < count; i++) {
 			VM_CNT_INC(v_vnodein);
@@ -819,7 +823,7 @@
 		VM_OBJECT_WUNLOCK(object);
 		return (error);
 	} else if (error != 0) {
-		relpbuf(bp, freecnt);
+		uma_zfree(vnode_pbuf_zone, bp);
 		return (VM_PAGER_ERROR);
 	}
 
@@ -828,7 +832,7 @@
 	 * than a page size, then use special small filesystem code.
 	 */
 	if (pagesperblock == 0) {
-		relpbuf(bp, freecnt);
+		uma_zfree(vnode_pbuf_zone, bp);
 		for (i = 0; i < count; i++) {
 			VM_CNT_INC(v_vnodein);
 			VM_CNT_INC(v_vnodepgsin);
@@ -847,7 +851,7 @@
 		KASSERT(count == 1,
 		    ("%s: array[%d] request to a sparse file %p", __func__,
 		    count, vp));
-		relpbuf(bp, freecnt);
+		uma_zfree(vnode_pbuf_zone, bp);
 		pmap_zero_page(m[0]);
 		KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty",
 		    __func__, m[0]));
@@ -1061,7 +1065,7 @@
 		bp->b_pages[i] = NULL;
 	bp->b_vp = NULL;
 	pbrelbo(bp);
-	relpbuf(bp, &vnode_pbuf_freecnt);
+	uma_zfree(vnode_pbuf_zone, bp);
 	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
 	}
 }
@@ -1079,7 +1083,7 @@
 		bp->b_pages[i] = NULL;
 	bp->b_vp = NULL;
 	pbrelbo(bp);
-	relpbuf(bp, &vnode_async_pbuf_freecnt);
+	uma_zfree(vnode_pbuf_zone, bp);
 }
 
 static int
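Note: for reference, the sizing policy this diff spreads across vm_pager_bufferinit() and the per-subsystem caps, worked through as a standalone calculation (the nbuf value is an arbitrary example; the kern.nswbuf tunable can override nswbuf):

	#include <stdio.h>

	int
	main(void)
	{
		int nbuf = 2048;	/* example boot-time value */
		int nswbuf;

		nswbuf = nbuf / 4 < 256 ? nbuf / 4 : 256; /* min(nbuf / 4, 256) */
		if (nswbuf < 16)			/* NSWBUF_MIN */
			nswbuf = 16;

		printf("pbuf_zone max:       %d\n", nswbuf);		/* 256 */
		printf("vnode_pbuf_zone max: %d\n", nswbuf * 8);	/* 2048 */
		printf("md_pbuf_zone max:    %d\n", nswbuf / 10);	/* 25 */
		printf("fuse_pbuf_zone max:  %d\n", nswbuf / 2 + 1);	/* 129 */
		return (0);
	}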