Index: sys/cam/cam_iosched.c
===================================================================
--- sys/cam/cam_iosched.c
+++ sys/cam/cam_iosched.c
@@ -1496,6 +1496,52 @@
 cam_iosched_queue_work(struct cam_iosched_softc *isc, struct bio *bp)
 {
 
+	/*
+	 * A BIO_SPEEDUP from the upper layers means that they have a block
+	 * shortage. At present, this is only sent when we're trying to
+	 * allocate blocks, but have a shortage before giving up. bio_length is
+	 * the size of the shortage. We will complete just enough BIO_DELETEs
+	 * in the queue to satisfy the need. If bio_length is 0, we'll complete
+	 * them all. This allows the scheduler to delay BIO_DELETEs to improve
+	 * read/write performance without worrying about the upper layers. When
+	 * it's possibly a problem, we respond by pretending the BIO_DELETEs
+	 * just worked. We can't do anything about the BIO_DELETEs in the
+	 * hardware, though. We have to wait for them to complete.
+	 */
+	if (bp->bio_cmd == BIO_SPEEDUP) {
+		off_t len;
+		struct bio *nbp;
+
+		/*
+		 * If it's not a speedup for trims, then do nothing and complete
+		 * it. We'll need to do something completely different for
+		 * speeding up writes.
+		 */
+		if ((bp->bio_flags & BIO_SPEEDUP_TRIM) == 0) {
+			bp->bio_error = 0;
+			biodone(bp);
+			return;
+		}
+
+		len = 0;
+		while (bioq_first(&isc->trim_queue) &&
+		    (bp->bio_length == 0 || len < bp->bio_length)) {
+			nbp = bioq_takefirst(&isc->trim_queue);
+			len += nbp->bio_length;
+			nbp->bio_error = 0;
+			biodone(nbp);
+		}
+		if (bp->bio_length > 0) {
+			if (bp->bio_length > len)
+				bp->bio_resid = bp->bio_length - len;
+			else
+				bp->bio_resid = 0;
+		}
+		bp->bio_error = 0;
+		biodone(bp);
+		return;
+	}
+
 	/*
 	 * If we get a BIO_FLUSH, and we're doing delayed BIO_DELETEs then we
 	 * set the last tick time to one less than the current ticks minus the
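As a worked example of the accounting above: if the trim queue holds two BIO_DELETEs of 64k and 128k and a BIO_SPEEDUP arrives with bio_length of 256k, both trims are completed (len ends at 192k) and the BIO_SPEEDUP itself completes with bio_resid set to 64k, telling the issuer how much of the shortage the queue could not cover.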
Index: sys/geom/geom.h
===================================================================
--- sys/geom/geom.h
+++ sys/geom/geom.h
@@ -336,6 +336,7 @@
 int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr);
 int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp);
 int g_io_flush(struct g_consumer *cp);
+int g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp);
 int g_register_classifier(struct g_classifier_hook *hook);
 void g_unregister_classifier(struct g_classifier_hook *hook);
 void g_io_request(struct bio *bp, struct g_consumer *cp);
Index: sys/geom/geom_io.c
===================================================================
--- sys/geom/geom_io.c
+++ sys/geom/geom_io.c
@@ -338,6 +338,42 @@
 	return (error);
 }
 
+/*
+ * Send a BIO_SPEEDUP down the stack. It tells the lower layers that the upper
+ * layers have encountered a resource shortage. The lower layers are advised to
+ * stop delaying bio transactions that they might be holding for performance
+ * reasons and to schedule them (read/write/flush) or complete them successfully
+ * (trims) as quickly as they can. bio_length is the amount of the shortage.
+ * bio_resid is used to communicate back if the lower layers couldn't find
+ * bio_length worth of I/O to schedule or discard. A length of 0 means to do as
+ * much as you can (schedule the h/w queues full, discard all trims). flags are
+ * a hint from the upper layers to the lower layers as to what operation should
+ * be done. The call should be non-blocking.
+ */
+int
+g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp)
+{
+	struct bio *bp;
+	int error;
+
+	KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0,
+	    ("Invalid flags passed to g_io_speedup: %#x", flags));
+	g_trace(G_T_BIO, "bio_speedup(%s, %zu, %#x)", cp->provider->name,
+	    shortage, flags);
+	bp = g_new_bio();
+	if (bp == NULL)
+		return (ENOMEM);
+	bp->bio_cmd = BIO_SPEEDUP;
+	bp->bio_length = shortage;
+	bp->bio_done = NULL;
+	bp->bio_flags |= flags;
+	g_io_request(bp, cp);
+	error = biowait(bp, "gflush");
+	*resid = bp->bio_resid;
+	g_destroy_bio(bp);
+	return (error);
+}
+
 int
 g_io_flush(struct g_consumer *cp)
 {
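To make the calling convention concrete, here is a minimal caller-side sketch. It is not part of the patch: example_softc, sc->consumer, and sc->blocksize are hypothetical names standing in for whatever consumer and block size the caller really has.

struct example_softc {
	struct g_consumer *consumer;	/* consumer attached below us */
	int64_t blocksize;		/* filesystem block size, in bytes */
};

/*
 * Hypothetical helper: convert a shortage of "blocks" filesystem blocks
 * into bytes and ask the layers below to complete delayed trims.
 */
static void
example_relieve_shortage(struct example_softc *sc, int64_t blocks)
{
	size_t resid;
	int error;

	error = g_io_speedup(blocks * sc->blocksize, BIO_SPEEDUP_TRIM,
	    &resid, sc->consumer);
	if (error != 0 || resid != 0) {
		/*
		 * The lower layers couldn't cover the whole shortage;
		 * "resid" bytes remain unsatisfied, so fall back to some
		 * other form of cleanup.
		 */
	}
}

A shortage of 0, as softdep_ast_cleanup_proc() uses below, asks the lower layers to complete every delayed trim they hold.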
Index: sys/sys/bio.h
===================================================================
--- sys/sys/bio.h
+++ sys/sys/bio.h
@@ -53,6 +53,7 @@
 #define BIO_CMD1	0x07	/* Available for local hacks */
 #define BIO_CMD2	0x08	/* Available for local hacks */
 #define BIO_ZONE	0x09	/* Zone command */
+#define BIO_SPEEDUP	0x0a	/* Upper layers face shortage */
 
 /* bio_flags */
 #define BIO_ERROR	0x01	/* An error occurred processing this bio. */
@@ -67,6 +68,9 @@
 #define BIO_TRANSIENT_MAPPING	0x20
 #define BIO_VLIST	0x40
 
+#define BIO_SPEEDUP_WRITE	0x4000	/* Resource shortage at upper layers */
+#define BIO_SPEEDUP_TRIM	0x8000	/* Resource shortage at upper layers */
+
 #ifdef _KERNEL
 struct disk;
 struct bio;
Index: sys/ufs/ffs/ffs_softdep.c
===================================================================
--- sys/ufs/ffs/ffs_softdep.c
+++ sys/ufs/ffs/ffs_softdep.c
@@ -903,7 +903,6 @@
 	    int, struct pagedep **);
 static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
 	    struct pagedep **);
-static	void pause_timer(void *);
 static	int request_cleanup(struct mount *, int);
 static	int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
 static	void schedule_cleanup(struct mount *);
@@ -1256,9 +1255,6 @@
  */
 static int max_softdeps;	/* maximum number of structs before slowdown */
 static int tickdelay = 2;	/* number of ticks to pause during slowdown */
-static int proc_waiting;	/* tracks whether we have a timeout posted */
-static int *stat_countp;	/* statistic to count in proc_waiting timeout */
-static struct callout softdep_callout;
 static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
 static int req_clear_remove;	/* syncer process flush some freeblks */
 static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
@@ -2429,9 +2425,6 @@
 	bioops.io_deallocate = softdep_deallocate_dependencies;
 	bioops.io_countdeps = softdep_count_dependencies;
 	softdep_ast_cleanup = softdep_ast_cleanup_proc;
-
-	/* Initialize the callout with an mtx. */
-	callout_init_mtx(&softdep_callout, &lk, 0);
 }
 
 /*
@@ -2448,8 +2441,6 @@
 	bioops.io_deallocate = NULL;
 	bioops.io_countdeps = NULL;
 	softdep_ast_cleanup = NULL;
-
-	callout_drain(&softdep_callout);
 }
 
 /*
@@ -13328,6 +13319,7 @@
 	struct ufsmount *ump;
 	struct mount *mp;
 	long starttime;
+	size_t resid;
 	ufs2_daddr_t needed;
 	int error, failed_vnode;
 
@@ -13402,6 +13394,10 @@
 	}
 	starttime = time_second;
 retry:
+	if (resource == FLUSH_BLOCKS_WAIT &&
+	    fs->fs_cstotal.cs_nbfree <= needed)
+		g_io_speedup(needed * fs->fs_bsize, BIO_SPEEDUP_TRIM, &resid,
+		    ump->um_cp);
 	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
 	    fs->fs_cstotal.cs_nbfree <= needed) ||
 	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
@@ -13548,6 +13544,7 @@
 {
 	struct mount *mp;
 	struct ufsmount *ump;
+	size_t resid;
 	int error;
 	bool req;
 
@@ -13559,6 +13556,7 @@
 		return;
 	if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
 		ump = VFSTOUFS(mp);
+		g_io_speedup(0, BIO_SPEEDUP_TRIM, &resid, ump->um_cp);
 		for (;;) {
 			req = false;
 			ACQUIRE_LOCK(ump);
@@ -13598,6 +13596,8 @@
  * If memory utilization has gotten too high, deliberately slow things
  * down and speed up the I/O processing.
  */
+static int trimclear = 10;	/* number of TRIM blocks to dump */
+
 static int
 request_cleanup(mp, resource)
 	struct mount *mp;
@@ -13605,6 +13605,7 @@
 {
 	struct thread *td = curthread;
 	struct ufsmount *ump;
+	size_t resid;
 
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
@@ -13640,11 +13641,7 @@
 	/*
 	 * If we are resource constrained on inode dependencies, try
 	 * flushing some dirty inodes. Otherwise, we are constrained
 	 * by file deletions, so try accelerating flushes of directories
-	 * with removal dependencies. We would like to do the cleanup
-	 * here, but we probably hold an inode locked at this point and
-	 * that might deadlock against one that we try to clean. So,
-	 * the best that we can do is request the syncer daemon to do
-	 * the cleanup for us.
+	 * with removal dependencies.
 	 */
 	switch (resource) {
@@ -13654,7 +13651,7 @@
 		stat_ino_limit_push += 1;
 		req_clear_inodedeps += 1;
 		FREE_GBLLOCK(&lk);
-		stat_countp = &stat_ino_limit_hit;
+		clear_inodedeps(mp);
 		break;
 
 	case FLUSH_BLOCKS:
@@ -13663,50 +13660,17 @@
 		stat_blk_limit_push += 1;
 		req_clear_remove += 1;
 		FREE_GBLLOCK(&lk);
-		stat_countp = &stat_blk_limit_hit;
+		g_io_speedup(trimclear * ump->um_fs->fs_bsize,
+		    BIO_SPEEDUP_TRIM, &resid, ump->um_cp);
+		clear_remove(mp);
 		break;
 
 	default:
 		panic("request_cleanup: unknown type");
 	}
-	/*
-	 * Hopefully the syncer daemon will catch up and awaken us.
-	 * We wait at most tickdelay before proceeding in any case.
-	 */
-	ACQUIRE_GBLLOCK(&lk);
-	FREE_LOCK(ump);
-	proc_waiting += 1;
-	if (callout_pending(&softdep_callout) == FALSE)
-		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
-		    pause_timer, 0);
-
-	if ((td->td_pflags & TDP_KTHREAD) == 0)
-		msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
-	proc_waiting -= 1;
-	FREE_GBLLOCK(&lk);
-	ACQUIRE_LOCK(ump);
 	return (1);
 }
 
-/*
- * Awaken processes pausing in request_cleanup and clear proc_waiting
- * to indicate that there is no longer a timer running. Pause_timer
- * will be called with the global softdep mutex (&lk) locked.
- */
-static void
-pause_timer(arg)
-	void *arg;
-{
-
-	GBLLOCK_OWNED(&lk);
-	/*
-	 * The callout_ API has acquired mtx and will hold it around this
-	 * function call.
-	 */
-	*stat_countp += proc_waiting;
-	wakeup(&proc_waiting);
-}
-
 /*
  * If requested, try removing inode or removal dependencies.
  */
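With the default trimclear of 10 and, for example, a 32k filesystem block size, the FLUSH_BLOCKS case above asks the lower layers to complete 10 * 32768 = 327680 bytes of delayed trims and then runs clear_remove() inline, where the deleted code instead parked the thread on proc_waiting until pause_timer or the syncer daemon woke it.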
@@ -13730,14 +13694,12 @@
 		FREE_GBLLOCK(&lk);
 		clear_inodedeps(mp);
 		ACQUIRE_GBLLOCK(&lk);
-		wakeup(&proc_waiting);
 	}
 	if (req_clear_remove) {
 		req_clear_remove -= 1;
 		FREE_GBLLOCK(&lk);
 		clear_remove(mp);
 		ACQUIRE_GBLLOCK(&lk);
-		wakeup(&proc_waiting);
 	}
 	FREE_GBLLOCK(&lk);
 }
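At the other end of the stack, a layer that never delays I/O still has to terminate BIO_SPEEDUP. The sketch below is illustrative only: g_example_start is a hypothetical geom start routine, and reporting the whole shortage back in bio_resid is one reasonable reading of the contract above, not something this patch mandates.

static void
g_example_start(struct bio *bp)
{

	switch (bp->bio_cmd) {
	case BIO_SPEEDUP:
		/*
		 * Nothing is delayed at this layer, so there is nothing to
		 * accelerate. Complete the bio successfully and report the
		 * entire shortage back as unsatisfied.
		 */
		bp->bio_resid = bp->bio_length;
		g_io_deliver(bp, 0);
		break;
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		break;
	}
}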