Index: sys/cam/cam_iosched.c =================================================================== --- sys/cam/cam_iosched.c +++ sys/cam/cam_iosched.c @@ -281,11 +281,15 @@ int trim_ticks; /* Max ticks to hold trims */ int last_trim_tick; /* Last 'tick' time ld a trim */ int queued_trims; /* Number of trims in the queue */ + int max_trims; /* Maximum number of trims pending at once */ + int pend_trims; /* Number of pending trims now */ #ifdef CAM_IOSCHED_DYNAMIC int read_bias; /* Read bias setting */ int current_read_bias; /* Current read bias state */ int total_ticks; int load; /* EMA of 'load average' of disk / 2^16 */ + int speedup_ticks; /* When != 0, don't delay I/O for performance */ +#define SPEEDUP_TICKS 11 struct bio_queue_head write_queue; struct iop_stats read_stats, write_stats, trim_stats; @@ -574,6 +578,11 @@ isc->this_frac = (uint32_t)delta >> 16; /* Note: discards seconds -- should be 0 harmless if not */ isc->last_time = now; + if (isc->speedup_ticks > 0) { + isc->current_read_bias = 1; + isc->speedup_ticks--; + } + cam_iosched_cl_maybe_steer(&isc->cl); cam_iosched_limiter_tick(&isc->read_stats); @@ -707,11 +716,6 @@ } #endif -/* - * Trim or similar currently pending completion. Should only be set for - * those drivers wishing only one Trim active at a time. - */ -#define CAM_IOSCHED_FLAG_TRIM_ACTIVE (1ul << 0) /* Callout active, and needs to be torn down */ #define CAM_IOSCHED_FLAG_CALLOUT_ACTIVE (1ul << 1) @@ -755,6 +759,19 @@ static inline bool cam_iosched_has_more_trim(struct cam_iosched_softc *isc) { + struct bio *bp; + + bp = bioq_first(&isc->trim_queue); +#ifdef CAM_IOSCHED_DYNAMIC + if (do_dynamic_iosched) { + /* + * If we're limiting trims, then defer action on trims + * for a bit. 
+ */ + if (bp == NULL || cam_iosched_limiter_caniop(&isc->trim_stats, bp) != 0) + return false; + } +#endif /* * If we've set a trim_goal, then if we exceed that allow trims @@ -771,8 +788,7 @@ return false; } - return !(isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) && - bioq_first(&isc->trim_queue); + return isc->pend_trims <= isc->max_trims && bp != NULL; } #define cam_iosched_sort_queue(isc) ((isc)->sort_io_queue >= 0 ? \ @@ -1096,6 +1112,7 @@ (*iscp)->sort_io_queue = -1; bioq_init(&(*iscp)->bio_queue); bioq_init(&(*iscp)->trim_queue); + (*iscp)->max_trims = 1; #ifdef CAM_IOSCHED_DYNAMIC if (do_dynamic_iosched) { bioq_init(&(*iscp)->write_queue); @@ -1311,7 +1328,7 @@ /* * See if our current limiter allows this I/O. */ - if (cam_iosched_limiter_iop(&isc->write_stats, bp) != 0) { + if (isc->speedup_ticks == 0 && cam_iosched_limiter_iop(&isc->write_stats, bp) != 0) { if (iosched_debug) printf("Can't write because limiter says no.\n"); isc->write_stats.state_flags |= IOP_RATE_LIMITED; @@ -1322,7 +1339,10 @@ * Let's do this: We've passed all the gates and we're a go * to schedule the I/O in the SIM. */ - isc->current_read_bias = isc->read_bias; + if (isc->speedup_ticks > 0) + isc->current_read_bias = 1; + else + isc->current_read_bias = isc->read_bias; bioq_remove(&isc->write_queue, bp); if (bp->bio_cmd == BIO_WRITE) { isc->write_stats.queued--; @@ -1389,31 +1409,59 @@ struct bio * cam_iosched_get_trim(struct cam_iosched_softc *isc) { +#ifdef CAM_IOSCHED_DYNAMIC + struct bio *bp; +#endif if (!cam_iosched_has_more_trim(isc)) return NULL; #ifdef CAM_IOSCHED_DYNAMIC + if (!do_dynamic_iosched) + return cam_iosched_next_trim(isc); + + bp = bioq_first(&isc->trim_queue); + if (bp == NULL) + return NULL; + /* * If pending read, prefer that based on current read bias setting. The * read bias is shared for both writes and TRIMs, but on TRIMs the bias - * is for a combined TRIM not a single TRIM request that's come in. 
+ * is for a combined TRIM not a single TRIM request that's come in. If + * we do return NULL, we're not rate-limiting TRIMs, so we don't change + * the limiter flag. */ - if (do_dynamic_iosched) { - if (bioq_first(&isc->bio_queue) && isc->current_read_bias) { - if (iosched_debug) - printf("Reads present and current_read_bias is %d" - " queued trims %d queued reads %d\n", - isc->current_read_bias, isc->trim_stats.queued, - isc->read_stats.queued); - isc->current_read_bias--; - /* We're not limiting TRIMS, per se, just doing reads first */ - return NULL; - } - /* - * We're going to do a trim, so reset the bias. - */ - isc->current_read_bias = isc->read_bias; + if (bioq_first(&isc->bio_queue) && isc->current_read_bias) { + if (iosched_debug) + printf("Reads present and current_read_bias is %d" + " queued trims %d queued reads %d\n", + isc->current_read_bias, isc->trim_stats.queued, + isc->read_stats.queued); + isc->current_read_bias--; + return NULL; + } + + /* + * See if our current limiter allows this I/O. Because we only call this + * here, and not in next_trim, the 'bandwidth' limits for trims won't + * work, while the iops or max queued limits will work. It's tricky + * because we want the limits to be from the perspective of the + * "commands sent to the device." To make iops work, we need to check + * only here (since we want all the ops we combine to count as one). To + * make bw limits work, we'd need to check in next_trim, but that would + * have the effect of limiting the iops as seen from the upper layers. 
+ */ + if (isc->speedup_ticks == 0 && cam_iosched_limiter_iop(&isc->trim_stats, bp) != 0) { + if (iosched_debug) + printf("Can't trim because limiter says no.\n"); + isc->trim_stats.state_flags |= IOP_RATE_LIMITED; + return NULL; } + if (isc->speedup_ticks > 0) + isc->current_read_bias = 1; + else + isc->current_read_bias = isc->read_bias; + isc->trim_stats.state_flags &= ~IOP_RATE_LIMITED; + /* cam_iosched_next_trim below keeps proper book */ #endif return cam_iosched_next_trim(isc); } @@ -1496,6 +1544,60 @@ cam_iosched_queue_work(struct cam_iosched_softc *isc, struct bio *bp) { + /* + * A BIO_SPEEDUP from the upper layers means that they have a block + * shortage. At the present, this is only sent when we're trying to + * allocate blocks, but have a shortage before giving up. bio_length is + * the size of their shortage. We will complete just enough BIO_DELETEs + * in the queue to satisfy the need. If bio_length is 0, we'll complete + * them all. This allows the scheduler to delay BIO_DELETEs to improve + * read/write performance without worrying about the upper layers. When + * it's possibly a problem, we respond by pretending the BIO_DELETEs + * just worked. We can't do anything about the BIO_DELETEs in the + * hardware, though. We have to wait for them to complete. + */ + if (bp->bio_cmd == BIO_SPEEDUP) { + off_t len; + struct bio *nbp; + + /* + * Either request of 0 length puts us into a special mode. + */ + if (bp->bio_length == 0) { + isc->speedup_ticks = SPEEDUP_TICKS; + bp->bio_error = 0; + biodone(bp); + return; + } + + /* + * Ignore non-trim speedup requests. 
+ */ + if ((bp->bio_flags & BIO_SPEEDUP_TRIM) == 0) { + bp->bio_error = 0; + biodone(bp); + return; + } + + len = 0; + while (bioq_first(&isc->trim_queue) && + (bp->bio_length == 0 || len < bp->bio_length)) { + nbp = bioq_takefirst(&isc->trim_queue); + len += nbp->bio_length; + nbp->bio_error = 0; + biodone(nbp); + } + if (bp->bio_length > 0) { + if (bp->bio_length > len) + bp->bio_resid = bp->bio_length - len; + else + bp->bio_resid = 0; + } + bp->bio_error = 0; + biodone(bp); + return; + } + /* * If we get a BIO_FLUSH, and we're doing delayed BIO_DELETEs then we * set the last tick time to one less than the current ticks minus the @@ -1569,7 +1671,7 @@ cam_iosched_trim_done(struct cam_iosched_softc *isc) { - isc->flags &= ~CAM_IOSCHED_FLAG_TRIM_ACTIVE; + isc->pend_trims--; } /* @@ -1637,7 +1739,7 @@ cam_iosched_submit_trim(struct cam_iosched_softc *isc) { - isc->flags |= CAM_IOSCHED_FLAG_TRIM_ACTIVE; + isc->pend_trims++; } /* @@ -1863,7 +1965,7 @@ db_printf("in_reads: %d\n", isc->read_stats.in); db_printf("out_reads: %d\n", isc->read_stats.out); db_printf("queued_reads: %d\n", isc->read_stats.queued); - db_printf("Current Q len %d\n", biolen(&isc->bio_queue)); + db_printf("Read Q len %d\n", biolen(&isc->bio_queue)); db_printf("pending_writes: %d\n", isc->write_stats.pending); db_printf("min_writes: %d\n", isc->write_stats.min); db_printf("max_writes: %d\n", isc->write_stats.max); @@ -1871,7 +1973,7 @@ db_printf("in_writes: %d\n", isc->write_stats.in); db_printf("out_writes: %d\n", isc->write_stats.out); db_printf("queued_writes: %d\n", isc->write_stats.queued); - db_printf("Current Q len %d\n", biolen(&isc->write_queue)); + db_printf("Write Q len %d\n", biolen(&isc->write_queue)); db_printf("pending_trims: %d\n", isc->trim_stats.pending); db_printf("min_trims: %d\n", isc->trim_stats.min); db_printf("max_trims: %d\n", isc->trim_stats.max); @@ -1879,11 +1981,11 @@ db_printf("in_trims: %d\n", isc->trim_stats.in); db_printf("out_trims: %d\n", isc->trim_stats.out); 
db_printf("queued_trims: %d\n", isc->trim_stats.queued); - db_printf("Current Q len %d\n", biolen(&isc->trim_queue)); + db_printf("Trim Q len %d\n", biolen(&isc->trim_queue)); db_printf("read_bias: %d\n", isc->read_bias); db_printf("current_read_bias: %d\n", isc->current_read_bias); - db_printf("Trim active? %s\n", - (isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) ? "yes" : "no"); + db_printf("Trims active %d\n", isc->pend_trims); + db_printf("Max trims active %d\n", isc->max_trims); } #endif #endif Index: sys/geom/geom.h =================================================================== --- sys/geom/geom.h +++ sys/geom/geom.h @@ -336,6 +336,7 @@ int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr); int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp); int g_io_flush(struct g_consumer *cp); +int g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp); int g_register_classifier(struct g_classifier_hook *hook); void g_unregister_classifier(struct g_classifier_hook *hook); void g_io_request(struct bio *bp, struct g_consumer *cp); Index: sys/geom/geom_io.c =================================================================== --- sys/geom/geom_io.c +++ sys/geom/geom_io.c @@ -338,6 +338,42 @@ return (error); } +/* + * Send a BIO_SPEEDUP down the stack. It tells the lower layers that the upper + * layers have encountered a resource shortage. The lower layers are advised to + * stop delaying bio transactions that they might be holding for performance + * reasons and to schedule them (read/write/flush) or complete them successfully + * (trims) as quickly as it can. bio_length is the amount of the shortage. + * bio_resid is used to communicate back if the lower layers couldn't find + * bio_length worth of I/O to schedule or discard. A length of 0 means to do as + * much as you can (schedule the h/w queues full, discard all trims). 
flags are + * a hint from the upper layers to the lower layers what operation should be + * done. The call should be non-blocking. + */ +int +g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp) +{ + struct bio *bp; + int error; + + KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0, + ("Invalid flags passed to g_io_speedup: %#x", flags)); + g_trace(G_T_BIO, "bio_speedup(%s, %zu, %#x)", cp->provider->name, + shortage, flags); + bp = g_new_bio(); + if (bp == NULL) + return (ENOMEM); + bp->bio_cmd = BIO_SPEEDUP; + bp->bio_length = shortage; + bp->bio_done = NULL; + bp->bio_flags |= flags; + g_io_request(bp, cp); + error = biowait(bp, "gflush"); + *resid = bp->bio_resid; + g_destroy_bio(bp); + return (error); +} + int g_io_flush(struct g_consumer *cp) { Index: sys/sys/bio.h =================================================================== --- sys/sys/bio.h +++ sys/sys/bio.h @@ -53,6 +53,7 @@ #define BIO_CMD1 0x07 /* Available for local hacks */ #define BIO_CMD2 0x08 /* Available for local hacks */ #define BIO_ZONE 0x09 /* Zone command */ +#define BIO_SPEEDUP 0x0a /* Upper layers face shortage */ /* bio_flags */ #define BIO_ERROR 0x01 /* An error occurred processing this bio. 
*/ @@ -67,6 +68,9 @@ #define BIO_TRANSIENT_MAPPING 0x20 #define BIO_VLIST 0x40 +#define BIO_SPEEDUP_WRITE 0x4000 /* Resource shortage at upper layers */ +#define BIO_SPEEDUP_TRIM 0x8000 /* Resource shortage at upper layers */ + #ifdef _KERNEL struct disk; struct bio; Index: sys/ufs/ffs/ffs_softdep.c =================================================================== --- sys/ufs/ffs/ffs_softdep.c +++ sys/ufs/ffs/ffs_softdep.c @@ -903,7 +903,6 @@ int, struct pagedep **); static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct pagedep **); -static void pause_timer(void *); static int request_cleanup(struct mount *, int); static int softdep_request_cleanup_flush(struct mount *, struct ufsmount *); static void schedule_cleanup(struct mount *); @@ -1256,9 +1255,6 @@ */ static int max_softdeps; /* maximum number of structs before slowdown */ static int tickdelay = 2; /* number of ticks to pause during slowdown */ -static int proc_waiting; /* tracks whether we have a timeout posted */ -static int *stat_countp; /* statistic to count in proc_waiting timeout */ -static struct callout softdep_callout; static int req_clear_inodedeps; /* syncer process flush some inodedeps */ static int req_clear_remove; /* syncer process flush some freeblks */ static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */ @@ -1448,6 +1444,7 @@ LOCK_OWNED(ump); worklist_speedup(ump->um_mountp); + g_io_speedup(0, BIO_SPEEDUP_WRITE, NULL, ump->um_cp); bd_speedup(); /* * If we have global shortages, then we need other @@ -2429,9 +2426,6 @@ bioops.io_deallocate = softdep_deallocate_dependencies; bioops.io_countdeps = softdep_count_dependencies; softdep_ast_cleanup = softdep_ast_cleanup_proc; - - /* Initialize the callout with an mtx. 
*/ - callout_init_mtx(&softdep_callout, &lk, 0); } /* @@ -2448,8 +2442,6 @@ bioops.io_deallocate = NULL; bioops.io_countdeps = NULL; softdep_ast_cleanup = NULL; - - callout_drain(&softdep_callout); } /* @@ -13328,6 +13320,7 @@ struct ufsmount *ump; struct mount *mp; long starttime; + size_t resid; ufs2_daddr_t needed; int error, failed_vnode; @@ -13402,14 +13395,17 @@ } starttime = time_second; retry: - if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 && + if (resource == FLUSH_BLOCKS_WAIT && + fs->fs_cstotal.cs_nbfree <= needed) + g_io_speedup(needed * fs->fs_bsize, BIO_SPEEDUP_TRIM, &resid, + ump->um_cp); + while ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && fs->fs_cstotal.cs_nifree <= needed)) { ACQUIRE_LOCK(ump); if (ump->softdep_on_worklist > 0 && - process_worklist_item(UFSTOVFS(ump), - ump->softdep_on_worklist, LK_NOWAIT) != 0) + process_worklist_item(UFSTOVFS(ump), 1, LK_NOWAIT) != 0) stat_worklist_push += 1; FREE_LOCK(ump); } @@ -13640,11 +13636,7 @@ * If we are resource constrained on inode dependencies, try * flushing some dirty inodes. Otherwise, we are constrained * by file deletions, so try accelerating flushes of directories - * with removal dependencies. We would like to do the cleanup - * here, but we probably hold an inode locked at this point and - * that might deadlock against one that we try to clean. So, - * the best that we can do is request the syncer daemon to do - * the cleanup for us. + * with removal dependencies. 
*/ switch (resource) { @@ -13654,7 +13646,7 @@ stat_ino_limit_push += 1; req_clear_inodedeps += 1; FREE_GBLLOCK(&lk); - stat_countp = &stat_ino_limit_hit; + clear_inodedeps(mp); break; case FLUSH_BLOCKS: @@ -13663,50 +13655,16 @@ stat_blk_limit_push += 1; req_clear_remove += 1; FREE_GBLLOCK(&lk); - stat_countp = &stat_blk_limit_hit; + g_io_speedup(0, BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE, NULL, ump->um_cp); + clear_remove(mp); break; default: panic("request_cleanup: unknown type"); } - /* - * Hopefully the syncer daemon will catch up and awaken us. - * We wait at most tickdelay before proceeding in any case. - */ - ACQUIRE_GBLLOCK(&lk); - FREE_LOCK(ump); - proc_waiting += 1; - if (callout_pending(&softdep_callout) == FALSE) - callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, - pause_timer, 0); - - if ((td->td_pflags & TDP_KTHREAD) == 0) - msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); - proc_waiting -= 1; - FREE_GBLLOCK(&lk); - ACQUIRE_LOCK(ump); return (1); } -/* - * Awaken processes pausing in request_cleanup and clear proc_waiting - * to indicate that there is no longer a timer running. Pause_timer - * will be called with the global softdep mutex (&lk) locked. - */ -static void -pause_timer(arg) - void *arg; -{ - - GBLLOCK_OWNED(&lk); - /* - * The callout_ API has acquired mtx and will hold it around this - * function call. - */ - *stat_countp += proc_waiting; - wakeup(&proc_waiting); -} - /* * If requested, try removing inode or removal dependencies. */ @@ -13730,14 +13688,12 @@ FREE_GBLLOCK(&lk); clear_inodedeps(mp); ACQUIRE_GBLLOCK(&lk); - wakeup(&proc_waiting); } if (req_clear_remove) { req_clear_remove -= 1; FREE_GBLLOCK(&lk); clear_remove(mp); ACQUIRE_GBLLOCK(&lk); - wakeup(&proc_waiting); } FREE_GBLLOCK(&lk); }