Index: sys/kern/subr_blist.c
===================================================================
--- sys/kern/subr_blist.c
+++ sys/kern/subr_blist.c
@@ -295,7 +295,7 @@
  * not be allocated.
  */
 daddr_t
-blist_alloc(blist_t bl, int *count, int maxcount)
+blist_alloc(blist_t bl, daddr_t *io_cursor, int *count, int maxcount)
 {
 	daddr_t blk, cursor;

@@ -310,14 +310,14 @@
 	 * non-zero.  When the cursor is zero, an allocation failure will
 	 * stop further iterations.
 	 */
-	for (cursor = bl->bl_cursor;; cursor = 0) {
+	for (cursor = *io_cursor;; cursor = 0) {
 		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
 		    bl->bl_radix);
 		if (blk != SWAPBLK_NONE) {
 			bl->bl_avail -= *count;
-			bl->bl_cursor = blk + *count;
-			if (bl->bl_cursor == bl->bl_blocks)
-				bl->bl_cursor = 0;
+			*io_cursor = blk + *count;
+			if (*io_cursor == bl->bl_blocks)
+				*io_cursor = 0;
 			return (blk);
 		}
 		if (cursor == 0)
@@ -404,8 +404,7 @@
 void
 blist_print(blist_t bl)
 {
-	printf("BLIST avail = %jd, cursor = %08jx {\n",
-	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
+	printf("BLIST avail = %jd {\n", (uintmax_t)bl->bl_avail);

 	if (bl->bl_root->bm_bitmap != 0)
 		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
Index: sys/sys/blist.h
===================================================================
--- sys/sys/blist.h
+++ sys/sys/blist.h
@@ -81,7 +81,6 @@
 	daddr_t		bl_blocks;	/* area of coverage */
 	daddr_t		bl_avail;	/* # available blocks */
 	u_daddr_t	bl_radix;	/* coverage radix */
-	daddr_t		bl_cursor;	/* next-fit search starts at */
 	blmeta_t	bl_root[1];	/* root of radix tree */
 } *blist_t;

@@ -92,7 +91,7 @@

 struct sbuf;

-daddr_t	blist_alloc(blist_t blist, int *count, int maxcount);
+daddr_t	blist_alloc(blist_t blist, daddr_t *cursor, int *count, int maxcount);
 daddr_t	blist_avail(blist_t blist);
 blist_t	blist_create(daddr_t blocks, int flags);
 void	blist_destroy(blist_t blist);
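
Hoisting the next-fit cursor out of struct blist lets several callers run
independent scans over the same tree, which the swap_pager changes below rely
on: one cursor for allocation and one for the trimmer.  A kernel-context
sketch of the new calling convention (the sizes are hypothetical and error
handling is elided):

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/blist.h>

static void
two_cursor_sketch(void)
{
	blist_t bl;
	daddr_t alloc_cursor, trim_cursor, blk;
	int count;

	bl = blist_create(1024, M_WAITOK);
	blist_free(bl, 0, 1024);	/* a new blist starts fully allocated */
	alloc_cursor = trim_cursor = 0;

	count = 4;			/* in: minimum run; out: run obtained */
	blk = blist_alloc(bl, &alloc_cursor, &count, 4);
	count = 1;			/* a trimmer-style 1..32 block request */
	blk = blist_alloc(bl, &trim_cursor, &count, 32);
	(void)blk;
	blist_destroy(bl);
}
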
Index: sys/vm/swap_pager.h
===================================================================
--- sys/vm/swap_pager.h
+++ sys/vm/swap_pager.h
@@ -57,6 +57,7 @@
  */
 struct swdevt {
 	int	sw_flags;
+	int	sw_priority;
 	int	sw_nblks;
 	int	sw_used;
 	dev_t	sw_dev;
@@ -65,12 +66,15 @@
 	swblk_t	sw_first;
 	swblk_t	sw_end;
 	struct blist *sw_blist;
+	daddr_t	sw_cursor;	/* next-fit search starts at */
+	daddr_t	sw_trimmer;	/* where to look for free space to trim */
 	TAILQ_ENTRY(swdevt)	sw_list;
 	sw_strategy_t		*sw_strategy;
 	sw_close_t		*sw_close;
 };

 #define	SW_UNMAPPED	0x01
+#define	SW_TRIM		0x02
 #define	SW_CLOSING	0x04

 #ifdef _KERNEL
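
sw_cursor and sw_trimmer are two positions on the same circular block range:
the allocator advances sw_cursor, while sw_trimmer stays ahead of it,
reclaiming stale free space before the allocator wraps back around.  The
"how far ahead" test used below (swapdev_in_trim_zone) is plain modular
arithmetic; a standalone demo with hypothetical numbers (a 1024-block
device, trim zone = 1/8 of it):

#include <assert.h>

typedef long blkno_t;			/* stand-in for the kernel's daddr_t */

static int numer = 1, denom = 8;

/* Distance from start forward to end on a circle of nblks blocks. */
static int
in_trim_zone(blkno_t start, blkno_t end, blkno_t nblks)
{
	return ((end + nblks - start) % nblks / numer < nblks / denom);
}

int
main(void)
{
	/* The zone is nblks/8 = 128 blocks ahead of the cursor. */
	assert(in_trim_zone(1000, 50, 1024));	/* (50+1024-1000)%1024 = 74 */
	assert(!in_trim_zone(0, 512, 1024));	/* 512 ahead: out of the zone */
	assert(in_trim_zone(0, 0, 1024));	/* coincident: distance 0 */
	return (0);
}
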
Index: sys/vm/swap_pager.c
===================================================================
--- sys/vm/swap_pager.c
+++ sys/vm/swap_pager.c
@@ -400,8 +400,9 @@
 static void	swp_sizecheck(void);
 static void	swp_pager_async_iodone(struct buf *bp);
 static bool	swp_pager_swblk_empty(struct swblk *sb, int start, int limit);
-static int	swapongeom(struct vnode *);
-static int	swaponvp(struct thread *, struct vnode *, u_long);
+static int	swapongeom(struct vnode *, int flags, int priority);
+static int	swaponvp(struct thread *, struct vnode *, u_long nblks,
+		    int flags, int priority);
 static int	swapoff_one(struct swdevt *sp, struct ucred *cred);

 /*
@@ -737,7 +738,8 @@
 		if (sp == NULL)
 			sp = TAILQ_FIRST(&swtailq);
 		if ((sp->sw_flags & SW_CLOSING) == 0)
-			blk = blist_alloc(sp->sw_blist, &npages, mpages);
+			blk = blist_alloc(sp->sw_blist, &sp->sw_cursor,
+			    &npages, mpages);
 		if (blk != SWAPBLK_NONE)
 			break;
 		sp = TAILQ_NEXT(sp, sw_list);
@@ -800,7 +802,149 @@
 }

+static void
+swp_pager_async_trimdone(struct buf *bp)
+{
+	struct swdevt *sp;
+	daddr_t blk;
+	int npages;
+
+	sp = (struct swdevt *)bp->b_fsprivate1;
+	blk = bp->b_blkno;
+	npages = bp->b_npages;
+	uma_zfree(swwbuf_zone, bp);
+
+	mtx_lock(&swbuf_mtx);
+	if (++nsw_wcount_async == 1)
+		wakeup(&nsw_wcount_async);
+	mtx_unlock(&swbuf_mtx);
+
+	mtx_lock(&sw_dev_mtx);
+	blk -= sp->sw_first;
+	blist_free(sp->sw_blist, blk, npages);
+	mtx_unlock(&sw_dev_mtx);
+}
+
+static int trimzone_numer = 1;
+static int trimzone_denom = 8;
+static int min_trim_alloc = 1;
+static int max_trim_alloc = 32768;
+
+/*
+ * The 'trim zone' is the small range of addresses just ahead of the cursor
+ * that is likely to be allocated soon.  When the trimmer falls into the
+ * trim zone, we allocate blocks at the trimmer and trim them.
+ */
+static bool
+swapdev_in_trim_zone(daddr_t start, daddr_t end, daddr_t nblks)
+{
+
+	return ((end + nblks - start) % nblks / trimzone_numer <
+	    nblks / trimzone_denom);
+}
+
+/*
+ * Called with sw_dev_mtx held.  The lock is still held whenever NULL is
+ * returned, and has been released whenever a trim buf is returned.
+ */
+static struct buf *
+swapdev_trim(struct swdevt *sp)
+{
+	struct buf *bp;
+	daddr_t blk;
+	u_long nblks;
+	int npages;
+
+	/* Quit if trimming is disabled. */
+	if ((sp->sw_flags & SW_TRIM) == 0)
+		return (NULL);
+
+	/* Quit if the cursor is too far behind the trimmer. */
+	nblks = sp->sw_nblks;
+	if (!swapdev_in_trim_zone(sp->sw_cursor, sp->sw_trimmer, nblks))
+		return (NULL);
+
+	/* Grab a (hopefully large) chunk of free space allocated long ago. */
+	npages = min_trim_alloc;
+	blk = blist_alloc(sp->sw_blist, &sp->sw_trimmer,
+	    &npages, max_trim_alloc);
+
+	/* Quit if there's nothing free. */
+	if (blk == SWAPBLK_NONE)
+		return (NULL);
+	if (swapdev_in_trim_zone(sp->sw_cursor, sp->sw_trimmer, nblks)) {
+		/*
+		 * Trim allocation is too close to the cursor.  Either we've
+		 * wrapped around and jumped the cursor, or we've found too
+		 * little free space close to the cursor to move the trimmer
+		 * out of the trim zone.  Bail out.
+		 */
+		CTR5(KTR_SPARE5,
+		    "%s: cursor %p blk %p size %5d trimmer %p, too close",
+		    __func__, (void *)sp->sw_cursor, (void *)blk, npages,
+		    (void *)sp->sw_trimmer);
+		blist_free(sp->sw_blist, blk, npages);
+		return (NULL);
+	}
+	mtx_unlock(&sw_dev_mtx);
+	mtx_lock(&swbuf_mtx);
+	if (nsw_wcount_async == 0) {
+		mtx_unlock(&swbuf_mtx);
+		CTR5(KTR_SPARE5,
+		    "%s: cursor %p blk %p size %5d trimmer %p, low count",
+		    __func__, (void *)sp->sw_cursor, (void *)blk, npages,
+		    (void *)sp->sw_trimmer);
+		mtx_lock(&sw_dev_mtx);
+		blist_free(sp->sw_blist, blk, npages);
+		sp->sw_trimmer = blk;
+		return (NULL);
+	}
+	nsw_wcount_async--;
+	mtx_unlock(&swbuf_mtx);
+	bp = uma_zalloc(swwbuf_zone, M_NOWAIT);
+	if (bp == NULL) {
+		CTR5(KTR_SPARE5,
+		    "%s: cursor %p blk %p size %5d trimmer %p, no buf",
+		    __func__, (void *)sp->sw_cursor, (void *)blk, npages,
+		    (void *)sp->sw_trimmer);
+		/* Give back the async-write slot reserved above. */
+		mtx_lock(&swbuf_mtx);
+		if (++nsw_wcount_async == 1)
+			wakeup(&nsw_wcount_async);
+		mtx_unlock(&swbuf_mtx);
+		mtx_lock(&sw_dev_mtx);
+		blist_free(sp->sw_blist, blk, npages);
+		sp->sw_trimmer = blk;
+		return (NULL);
+	}
+	CTR5(KTR_SPARE5,
+	    "%s: cursor %p blk %p size %5d trimmer %p, start trim",
+	    __func__, (void *)sp->sw_cursor, (void *)blk, npages,
+	    (void *)sp->sw_trimmer);
+	bp->b_flags = B_ASYNC;
+	bp->b_data = NULL;
+	bp->b_iocmd = BIO_DELETE;
+	bp->b_rcred = crhold(thread0.td_ucred);
+	bp->b_wcred = crhold(thread0.td_ucred);
+	bp->b_bcount = PAGE_SIZE * npages;
+	bp->b_bufsize = PAGE_SIZE * npages;
+	bp->b_blkno = blk + sp->sw_first;
+	bp->b_npages = npages;
+	bp->b_iodone = swp_pager_async_trimdone;
+	bp->b_fsprivate1 = sp;
+	BUF_KERNPROC(bp);
+	return (bp);
+}
+
+/*
+ * Find a swap device that needs trimming, and start trimming it.
+ */
+static void
+swp_pager_trimswapspace(void)
+{
+	struct swdevt *sp;
+	struct buf *bp;
+
+	mtx_lock(&sw_dev_mtx);
+	TAILQ_FOREACH(sp, &swtailq, sw_list) {
+		bp = swapdev_trim(sp);
+		if (bp == NULL)
+			continue;
+		/* sw_dev_mtx was released in swapdev_trim(). */
+		sp->sw_strategy(bp, sp);
+		return;
+	}
+	mtx_unlock(&sw_dev_mtx);
+}
+
 /*
  * SWP_PAGER_FREESWAPSPACE() -	free raw swap space
  *
  *	This routine returns the specified swap blocks back to the bitmap.
@@ -908,8 +1052,7 @@
 		blk = swp_pager_getswapspace(&n, 1);
 		if (blk == SWAPBLK_NONE) {
 			swp_pager_meta_free(object, start, i);
-			VM_OBJECT_WUNLOCK(object);
-			return (-1);
+			break;
 		}
 		for (j = 0; j < n; ++j) {
 			addr = swp_pager_meta_build(object,
@@ -919,9 +1062,10 @@
 			    addr);
 		}
 	}
+	VM_OBJECT_WUNLOCK(object);
 	swp_pager_freeswapspace(s_free, n_free);
-	VM_OBJECT_WUNLOCK(object);
-	return (0);
+	swp_pager_trimswapspace();
+	return (i == size ? 0 : -1);
 }

 /*
@@ -1471,6 +1615,7 @@
 		swp_pager_async_iodone(bp);
 	}
 	swp_pager_freeswapspace(s_free, n_free);
+	swp_pager_trimswapspace();
 	VM_OBJECT_WLOCK(object);
 }
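
swapdev_trim() uses a conditional lock handoff: every NULL return leaves
sw_dev_mtx held so the caller's TAILQ scan can continue, while returning a
buf means the lock has already been dropped and the strategy routine can be
entered without it.  A minimal standalone model of that protocol, with
hypothetical names, using a pthread mutex:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * NULL: nothing to do, dev_lock still held (caller keeps scanning).
 * non-NULL: a work item, dev_lock already released (caller must not
 * unlock it again).
 */
static char *
try_start_work(int have_work)
{
	if (!have_work)
		return (NULL);
	pthread_mutex_unlock(&dev_lock);
	return (strdup("trim"));
}

int
main(void)
{
	char *w;
	int i;

	pthread_mutex_lock(&dev_lock);
	for (i = 0; i < 3; i++) {
		if ((w = try_start_work(i == 2)) == NULL)
			continue;	/* still locked; try the next device */
		printf("dispatching %s without dev_lock\n", w);
		free(w);
		return (0);		/* lock was released by the helper */
	}
	pthread_mutex_unlock(&dev_lock);
	return (0);
}
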
@@ -2178,12 +2323,8 @@
 };
 #endif

-/*
- * MPSAFE
- */
-/* ARGSUSED */
-int
-sys_swapon(struct thread *td, struct swapon_args *uap)
+static int
+kern_swapon(struct thread *td, const char *name, int flags, int priority)
 {
 	struct vattr attr;
 	struct vnode *vp;
@@ -2206,7 +2347,7 @@
 	}

 	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
-	    uap->name, td);
+	    name, td);
 	error = namei(&nd);
 	if (error)
 		goto done;
@@ -2215,7 +2356,7 @@
 	vp = nd.ni_vp;

 	if (vn_isdisk(vp, &error)) {
-		error = swapongeom(vp);
+		error = swapongeom(vp, flags, priority);
 	} else if (vp->v_type == VREG &&
 	    (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
 		/*
 		 * Allow direct swapping to NFS regular files in the same
 		 * way that nfs_mountroot() sets up diskless swapping.
 		 */
-		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
+		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE,
+		    flags, priority);
 	}

 	if (error)
@@ -2233,6 +2375,14 @@
 	return (error);
 }

+/* ARGSUSED */
+int
+sys_swapon(struct thread *td, struct swapon_args *uap)
+{
+
+	return (kern_swapon(td, uap->name, SW_TRIM, 0));
+}
+
 /*
  * Check that the total amount of swap currently configured does not
  * exceed half the theoretical maximum.  If it does, print a warning
@@ -2259,7 +2409,8 @@

 static void
 swaponsomething(struct vnode *vp, void *id, u_long nblks,
-    sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags)
+    sw_strategy_t *strategy, sw_close_t *close, dev_t dev,
+    int flags, int priority)
 {
 	struct swdevt *sp, *tsp;
 	swblk_t dvbase;
@@ -2295,8 +2446,11 @@
 	sp->sw_strategy = strategy;
 	sp->sw_close = close;
 	sp->sw_flags = flags;
+	sp->sw_priority = priority;
 	sp->sw_blist = blist_create(nblks, M_WAITOK);
+	sp->sw_cursor = 0;
+	sp->sw_trimmer = 0;

 	/*
 	 * Do not free the first two blocks in order to avoid overwriting
 	 * any bsd label at the front of the partition
@@ -2736,7 +2890,7 @@
 	}
 	swapgeom_acquire(cp);
 	mtx_unlock(&sw_dev_mtx);
-	if (bp->b_iocmd == BIO_WRITE)
+	if (bp->b_iocmd == BIO_WRITE || bp->b_iocmd == BIO_DELETE)
 		bio = g_new_bio();
 	else
 		bio = g_alloc_bio();
@@ -2819,7 +2973,7 @@
 }

 static int
-swapongeom_locked(struct cdev *dev, struct vnode *vp)
+swapongeom_locked(struct cdev *dev, struct vnode *vp, int flags, int priority)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
@@ -2859,14 +3013,15 @@
 		return (error);
 	}
 	nblks = pp->mediasize / DEV_BSIZE;
+	if ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0)
+		flags |= SW_UNMAPPED;
 	swaponsomething(vp, cp, nblks, swapgeom_strategy,
-	    swapgeom_close, dev2udev(dev),
-	    (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
+	    swapgeom_close, dev2udev(dev), flags, priority);
 	return (0);
 }

 static int
-swapongeom(struct vnode *vp)
+swapongeom(struct vnode *vp, int flags, int priority)
 {
 	int error;

@@ -2875,7 +3030,7 @@
 		error = ENOENT;
 	} else {
 		g_topology_lock();
-		error = swapongeom_locked(vp->v_rdev, vp);
+		error = swapongeom_locked(vp->v_rdev, vp, flags, priority);
 		g_topology_unlock();
 	}
 	VOP_UNLOCK(vp, 0);
@@ -2922,7 +3077,8 @@

 static int
-swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
+swaponvp(struct thread *td, struct vnode *vp, u_long nblks, int flags,
+    int priority)
 {
 	struct swdevt *sp;
 	int error;
@@ -2949,7 +3105,7 @@
 		return (error);

 	swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
-	    NODEV, 0);
+	    NODEV, flags, priority);
 	return (0);
 }
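
Since sys_swapon() now passes SW_TRIM unconditionally, trimming is enabled
for every device configured through the existing swapon(2) interface, and
no userland change is required.  Minimal usage, with a hypothetical device
path:

#include <unistd.h>
#include <err.h>

int
main(void)
{
	/* /dev/ada0p3 is a hypothetical swap partition. */
	if (swapon("/dev/ada0p3") != 0)
		err(1, "swapon");
	return (0);
}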