Index: sys/kern/subr_blist.c
===================================================================
--- sys/kern/subr_blist.c
+++ sys/kern/subr_blist.c
@@ -295,7 +295,7 @@
  * not be allocated.
  */
 daddr_t
-blist_alloc(blist_t bl, int *count, int maxcount)
+blist_alloc(blist_t bl, daddr_t *io_cursor, int *count, int maxcount)
 {
 	daddr_t blk, cursor;

@@ -310,14 +310,14 @@
 	 * non-zero.  When the cursor is zero, an allocation failure will
 	 * stop further iterations.
 	 */
-	for (cursor = bl->bl_cursor;; cursor = 0) {
+	for (cursor = *io_cursor;; cursor = 0) {
 		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
 		    bl->bl_radix);
 		if (blk != SWAPBLK_NONE) {
 			bl->bl_avail -= *count;
-			bl->bl_cursor = blk + *count;
-			if (bl->bl_cursor == bl->bl_blocks)
-				bl->bl_cursor = 0;
+			*io_cursor = blk + *count;
+			if (*io_cursor == bl->bl_blocks)
+				*io_cursor = 0;
 			return (blk);
 		}
 		if (cursor == 0)
@@ -404,8 +404,7 @@
 void
 blist_print(blist_t bl)
 {
-	printf("BLIST avail = %jd, cursor = %08jx {\n",
-	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
+	printf("BLIST avail = %jd {\n", (uintmax_t)bl->bl_avail);

 	if (bl->bl_root->bm_bitmap != 0)
 		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
Index: sys/sys/blist.h
===================================================================
--- sys/sys/blist.h
+++ sys/sys/blist.h
@@ -81,7 +81,6 @@
 	daddr_t		bl_blocks;	/* area of coverage */
 	daddr_t		bl_avail;	/* # available blocks */
 	u_daddr_t	bl_radix;	/* coverage radix */
-	daddr_t		bl_cursor;	/* next-fit search starts at */
 	blmeta_t	bl_root[1];	/* root of radix tree */
 } *blist_t;

@@ -92,7 +91,7 @@

 struct sbuf;

-daddr_t	blist_alloc(blist_t blist, int *count, int maxcount);
+daddr_t	blist_alloc(blist_t blist, daddr_t *cursor, int *count, int maxcount);
 daddr_t	blist_avail(blist_t blist);
 blist_t	blist_create(daddr_t blocks, int flags);
 void	blist_destroy(blist_t blist);
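
Hoisting the next-fit cursor out of struct blist lets several callers run
independent scans over the same tree, which the swap_pager changes below rely
on: one cursor for allocation and one for the trimmer.  A kernel-context
sketch of the new calling convention (the sizes are hypothetical and error
handling is elided):

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/blist.h>

static void
two_cursor_sketch(void)
{
	blist_t bl;
	daddr_t alloc_cursor, trim_cursor, blk;
	int count;

	bl = blist_create(1024, M_WAITOK);
	blist_free(bl, 0, 1024);	/* a new blist starts fully allocated */
	alloc_cursor = trim_cursor = 0;

	count = 4;			/* in: minimum run; out: run obtained */
	blk = blist_alloc(bl, &alloc_cursor, &count, 4);
	count = 1;			/* a trimmer-style 1..32 block request */
	blk = blist_alloc(bl, &trim_cursor, &count, 32);
	(void)blk;
	blist_destroy(bl);
}
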
Index: sys/vm/swap_pager.h
===================================================================
--- sys/vm/swap_pager.h
+++ sys/vm/swap_pager.h
@@ -57,6 +57,7 @@
  */
 struct swdevt {
 	int	sw_flags;
+	int	sw_priority;
 	int	sw_nblks;
 	int	sw_used;
 	dev_t	sw_dev;
@@ -65,12 +66,15 @@
 	swblk_t	sw_first;
 	swblk_t	sw_end;
 	struct blist *sw_blist;
+	daddr_t	sw_cursor;	/* next-fit search starts at */
+	daddr_t	sw_trimmer;	/* where to look for free space to trim */
 	TAILQ_ENTRY(swdevt)	sw_list;
 	sw_strategy_t		*sw_strategy;
 	sw_close_t		*sw_close;
 };

 #define	SW_UNMAPPED	0x01
+#define	SW_TRIM		0x02
 #define	SW_CLOSING	0x04

 #ifdef _KERNEL
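
sw_cursor and sw_trimmer are two positions on the same circular block range:
the allocator advances sw_cursor, while sw_trimmer stays ahead of it,
reclaiming stale free space before the allocator wraps back around.  The
"how far ahead" test used below (swapdev_in_trim_zone) is plain modular
arithmetic; a standalone demo with hypothetical numbers (a 1024-block
device, trim zone = 1/8 of it):

#include <assert.h>

typedef long blkno_t;			/* stand-in for the kernel's daddr_t */

static int numer = 1, denom = 8;

/* Distance from start forward to end on a circle of nblks blocks. */
static int
in_trim_zone(blkno_t start, blkno_t end, blkno_t nblks)
{
	return ((end + nblks - start) % nblks / numer < nblks / denom);
}

int
main(void)
{
	/* The zone is nblks/8 = 128 blocks ahead of the cursor. */
	assert(in_trim_zone(1000, 50, 1024));	/* (50+1024-1000)%1024 = 74 */
	assert(!in_trim_zone(0, 512, 1024));	/* 512 ahead: out of the zone */
	assert(in_trim_zone(0, 0, 1024));	/* coincident: distance 0 */
	return (0);
}
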
Index: sys/vm/swap_pager.c
===================================================================
--- sys/vm/swap_pager.c
+++ sys/vm/swap_pager.c
@@ -400,8 +400,9 @@
 static void	swp_sizecheck(void);
 static void	swp_pager_async_iodone(struct buf *bp);
 static bool	swp_pager_swblk_empty(struct swblk *sb, int start, int limit);
-static int	swapongeom(struct vnode *);
-static int	swaponvp(struct thread *, struct vnode *, u_long);
+static int	swapongeom(struct vnode *, int flags, int priority);
+static int	swaponvp(struct thread *, struct vnode *, u_long nblks,
+		    int flags, int priority);
 static int	swapoff_one(struct swdevt *sp, struct ucred *cred);

 /*
@@ -737,7 +738,8 @@
 		if (sp == NULL)
 			sp = TAILQ_FIRST(&swtailq);
 		if ((sp->sw_flags & SW_CLOSING) == 0)
-			blk = blist_alloc(sp->sw_blist, &npages, mpages);
+			blk = blist_alloc(sp->sw_blist, &sp->sw_cursor,
+			    &npages, mpages);
 		if (blk != SWAPBLK_NONE)
 			break;
 		sp = TAILQ_NEXT(sp, sw_list);
@@ -800,7 +802,149 @@
 }

+static void
+swp_pager_async_trimdone(struct buf *bp)
+{
+	struct swdevt *sp;
+	daddr_t blk;
+	int npages;
+
+	sp = (struct swdevt *)bp->b_fsprivate1;
+	blk = bp->b_blkno;
+	npages = bp->b_npages;
+	uma_zfree(swwbuf_zone, bp);
+
+	mtx_lock(&swbuf_mtx);
+	if (++nsw_wcount_async == 1)
+		wakeup(&nsw_wcount_async);
+	mtx_unlock(&swbuf_mtx);
+
+	mtx_lock(&sw_dev_mtx);
+	blk -= sp->sw_first;
+	blist_free(sp->sw_blist, blk, npages);
+	mtx_unlock(&sw_dev_mtx);
+}
+
+static int trimzone_numer = 1;
+static int trimzone_denom = 8;
+static int min_trim_alloc = 1;
+static int max_trim_alloc = 32768;
+
+/*
+ * The 'trim zone' is the small range of addresses just ahead of the cursor
+ * that is likely to be allocated soon.  When the trimmer falls into the
+ * trim zone, we allocate blocks at the trimmer and trim them.
+ */
+static bool
+swapdev_in_trim_zone(daddr_t start, daddr_t end, daddr_t nblks)
+{
+
+	return ((end + nblks - start) % nblks / trimzone_numer <
+	    nblks / trimzone_denom);
+}
+
+/*
+ * Called with sw_dev_mtx held.  The lock is still held whenever NULL is
+ * returned, and has been released whenever a trim buf is returned.
+ */
+static struct buf *
+swapdev_trim(struct swdevt *sp)
+{
+	struct buf *bp;
+	daddr_t blk;
+	u_long nblks;
+	int npages;
+
+	/* Quit if trimming is disabled. */
+	if ((sp->sw_flags & SW_TRIM) == 0)
+		return (NULL);
+
+	/* Quit if the cursor is too far behind the trimmer. */
+	nblks = sp->sw_nblks;
+	if (!swapdev_in_trim_zone(sp->sw_cursor, sp->sw_trimmer, nblks))
+		return (NULL);
+
+	/* Grab a (hopefully large) chunk of free space allocated long ago. */
+	npages = min_trim_alloc;
+	blk = blist_alloc(sp->sw_blist, &sp->sw_trimmer,
+	    &npages, max_trim_alloc);
+
+	/* Quit if there's nothing free. */
+	if (blk == SWAPBLK_NONE)
+		return (NULL);
+	if (swapdev_in_trim_zone(sp->sw_cursor, sp->sw_trimmer, nblks)) {
+		/*
+		 * Trim allocation is too close to the cursor.  Either we've
+		 * wrapped around and jumped the cursor, or we've found too
+		 * little free space close to the cursor to move the trimmer
+		 * out of the trim zone.  Bail out.
+		 */
+		CTR5(KTR_SPARE5,
+		    "%s: cursor %p blk %p size %5d trimmer %p, too close",
+		    __func__, (void *)sp->sw_cursor, (void *)blk, npages,
+		    (void *)sp->sw_trimmer);
+		blist_free(sp->sw_blist, blk, npages);
+		return (NULL);
+	}
+	mtx_unlock(&sw_dev_mtx);
+	mtx_lock(&swbuf_mtx);
+	if (nsw_wcount_async == 0) {
+		mtx_unlock(&swbuf_mtx);
+		CTR5(KTR_SPARE5,
+		    "%s: cursor %p blk %p size %5d trimmer %p, low count",
+		    __func__, (void *)sp->sw_cursor, (void *)blk, npages,
+		    (void *)sp->sw_trimmer);
+		mtx_lock(&sw_dev_mtx);
+		blist_free(sp->sw_blist, blk, npages);
+		sp->sw_trimmer = blk;
+		return (NULL);
+	}
+	nsw_wcount_async--;
+	mtx_unlock(&swbuf_mtx);
+	bp = uma_zalloc(swwbuf_zone, M_NOWAIT);
+	if (bp == NULL) {
+		CTR5(KTR_SPARE5,
+		    "%s: cursor %p blk %p size %5d trimmer %p, no buf",
+		    __func__, (void *)sp->sw_cursor, (void *)blk, npages,
+		    (void *)sp->sw_trimmer);
+		/* Give back the async-write slot reserved above. */
+		mtx_lock(&swbuf_mtx);
+		if (++nsw_wcount_async == 1)
+			wakeup(&nsw_wcount_async);
+		mtx_unlock(&swbuf_mtx);
+		mtx_lock(&sw_dev_mtx);
+		blist_free(sp->sw_blist, blk, npages);
+		sp->sw_trimmer = blk;
+		return (NULL);
+	}
+	CTR5(KTR_SPARE5,
+	    "%s: cursor %p blk %p size %5d trimmer %p, start trim",
+	    __func__, (void *)sp->sw_cursor, (void *)blk, npages,
+	    (void *)sp->sw_trimmer);
+	bp->b_flags = B_ASYNC;
+	bp->b_data = NULL;
+	bp->b_iocmd = BIO_DELETE;
+	bp->b_rcred = crhold(thread0.td_ucred);
+	bp->b_wcred = crhold(thread0.td_ucred);
+	bp->b_bcount = PAGE_SIZE * npages;
+	bp->b_bufsize = PAGE_SIZE * npages;
+	bp->b_blkno = blk + sp->sw_first;
+	bp->b_npages = npages;
+	bp->b_iodone = swp_pager_async_trimdone;
+	bp->b_fsprivate1 = sp;
+	BUF_KERNPROC(bp);
+	return (bp);
+}
+
+/*
+ * Find a swap device that needs trimming, and start trimming it.
+ */
+static void
+swp_pager_trimswapspace(void)
+{
+	struct swdevt *sp;
+	struct buf *bp;
+
+	mtx_lock(&sw_dev_mtx);
+	TAILQ_FOREACH(sp, &swtailq, sw_list) {
+		bp = swapdev_trim(sp);
+		if (bp == NULL)
+			continue;
+		/* sw_dev_mtx was released in swapdev_trim(). */
+		sp->sw_strategy(bp, sp);
+		return;
+	}
+	mtx_unlock(&sw_dev_mtx);
+}
+
 /*
  * SWP_PAGER_FREESWAPSPACE() -	free raw swap space
  *
  *	This routine returns the specified swap blocks back to the bitmap.
@@ -908,8 +1052,7 @@
 		blk = swp_pager_getswapspace(&n, 1);
 		if (blk == SWAPBLK_NONE) {
 			swp_pager_meta_free(object, start, i);
-			VM_OBJECT_WUNLOCK(object);
-			return (-1);
+			break;
 		}
 		for (j = 0; j < n; ++j) {
 			addr = swp_pager_meta_build(object,
@@ -919,9 +1062,10 @@
 			    addr);
 		}
 	}
+	VM_OBJECT_WUNLOCK(object);
 	swp_pager_freeswapspace(s_free, n_free);
-	VM_OBJECT_WUNLOCK(object);
-	return (0);
+	swp_pager_trimswapspace();
+	return (i == size ? 0 : -1);
 }

 /*
@@ -1471,6 +1615,7 @@
 		swp_pager_async_iodone(bp);
 	}
 	swp_pager_freeswapspace(s_free, n_free);
+	swp_pager_trimswapspace();
 	VM_OBJECT_WLOCK(object);
 }
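
swapdev_trim() uses a conditional lock handoff: every NULL return leaves
sw_dev_mtx held so the caller's TAILQ scan can continue, while returning a
buf means the lock has already been dropped and the strategy routine can be
entered without it.  A minimal standalone model of that protocol, with
hypothetical names, using a pthread mutex:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * NULL: nothing to do, dev_lock still held (caller keeps scanning).
 * non-NULL: a work item, dev_lock already released (caller must not
 * unlock it again).
 */
static char *
try_start_work(int have_work)
{
	if (!have_work)
		return (NULL);
	pthread_mutex_unlock(&dev_lock);
	return (strdup("trim"));
}

int
main(void)
{
	char *w;
	int i;

	pthread_mutex_lock(&dev_lock);
	for (i = 0; i < 3; i++) {
		if ((w = try_start_work(i == 2)) == NULL)
			continue;	/* still locked; try the next device */
		printf("dispatching %s without dev_lock\n", w);
		free(w);
		return (0);		/* lock was released by the helper */
	}
	pthread_mutex_unlock(&dev_lock);
	return (0);
}
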
@@ -2178,12 +2323,8 @@
 };
 #endif

-/*
- * MPSAFE
- */
-/* ARGSUSED */
-int
-sys_swapon(struct thread *td, struct swapon_args *uap)
+static int
+kern_swapon(struct thread *td, const char *name, int flags, int priority)
 {
 	struct vattr attr;
 	struct vnode *vp;
@@ -2206,7 +2347,7 @@
 	}

 	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
-	    uap->name, td);
+	    name, td);
 	error = namei(&nd);
 	if (error)
 		goto done;
@@ -2215,7 +2356,7 @@
 	vp = nd.ni_vp;

 	if (vn_isdisk(vp, &error)) {
-		error = swapongeom(vp);
+		error = swapongeom(vp, flags, priority);
 	} else if (vp->v_type == VREG &&
 	    (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
 		/*
 		 * Allow direct swapping to NFS regular files in the same
 		 * way that nfs_mountroot() sets up diskless swapping.
 		 */
-		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
+		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE,
+		    flags, priority);
 	}

 	if (error)
@@ -2233,6 +2375,14 @@
 	return (error);
 }

+/* ARGSUSED */
+int
+sys_swapon(struct thread *td, struct swapon_args *uap)
+{
+
+	return (kern_swapon(td, uap->name, SW_TRIM, 0));
+}
+
 /*
  * Check that the total amount of swap currently configured does not
  * exceed half the theoretical maximum.  If it does, print a warning
@@ -2259,7 +2409,8 @@

 static void
 swaponsomething(struct vnode *vp, void *id, u_long nblks,
-    sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags)
+    sw_strategy_t *strategy, sw_close_t *close, dev_t dev,
+    int flags, int priority)
 {
 	struct swdevt *sp, *tsp;
 	swblk_t dvbase;
@@ -2295,8 +2446,11 @@
 	sp->sw_strategy = strategy;
 	sp->sw_close = close;
 	sp->sw_flags = flags;
+	sp->sw_priority = priority;
 	sp->sw_blist = blist_create(nblks, M_WAITOK);
+	sp->sw_cursor = 0;
+	sp->sw_trimmer = 0;

 	/*
 	 * Do not free the first two blocks in order to avoid overwriting
 	 * any bsd label at the front of the partition
@@ -2736,7 +2890,7 @@
 	}
 	swapgeom_acquire(cp);
 	mtx_unlock(&sw_dev_mtx);
-	if (bp->b_iocmd == BIO_WRITE)
+	if (bp->b_iocmd == BIO_WRITE || bp->b_iocmd == BIO_DELETE)
 		bio = g_new_bio();
 	else
 		bio = g_alloc_bio();
@@ -2819,7 +2973,7 @@
 }

 static int
-swapongeom_locked(struct cdev *dev, struct vnode *vp)
+swapongeom_locked(struct cdev *dev, struct vnode *vp, int flags, int priority)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
@@ -2859,14 +3013,15 @@
 		return (error);
 	}
 	nblks = pp->mediasize / DEV_BSIZE;
+	if ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0)
+		flags |= SW_UNMAPPED;
 	swaponsomething(vp, cp, nblks, swapgeom_strategy,
-	    swapgeom_close, dev2udev(dev),
-	    (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
+	    swapgeom_close, dev2udev(dev), flags, priority);
 	return (0);
 }

 static int
-swapongeom(struct vnode *vp)
+swapongeom(struct vnode *vp, int flags, int priority)
 {
 	int error;

@@ -2875,7 +3030,7 @@
 		error = ENOENT;
 	} else {
 		g_topology_lock();
-		error = swapongeom_locked(vp->v_rdev, vp);
+		error = swapongeom_locked(vp->v_rdev, vp, flags, priority);
 		g_topology_unlock();
 	}
 	VOP_UNLOCK(vp, 0);
@@ -2922,7 +3077,8 @@

 static int
-swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
+swaponvp(struct thread *td, struct vnode *vp, u_long nblks, int flags,
+    int priority)
 {
 	struct swdevt *sp;
 	int error;
@@ -2949,7 +3105,7 @@
 		return (error);

 	swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
-	    NODEV, 0);
+	    NODEV, flags, priority);
 	return (0);
 }
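
Since sys_swapon() now passes SW_TRIM unconditionally, trimming is enabled
for every device configured through the existing swapon(2) interface, and
no userland change is required.  Minimal usage, with a hypothetical device
path:

#include <unistd.h>
#include <err.h>

int
main(void)
{
	/* /dev/ada0p3 is a hypothetical swap partition. */
	if (swapon("/dev/ada0p3") != 0)
		err(1, "swapon");
	return (0);
}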