Index: sys/cam/cam_iosched.h =================================================================== --- sys/cam/cam_iosched.h +++ sys/cam/cam_iosched.h @@ -98,6 +98,8 @@ void cam_iosched_clr_work_flags(struct cam_iosched_softc *isc, uint32_t flags); void cam_iosched_trim_done(struct cam_iosched_softc *isc); int cam_iosched_bio_complete(struct cam_iosched_softc *isc, struct bio *bp, union ccb *done_ccb); +void cam_iosched_set_trim_goal(struct cam_iosched_softc *isc, int goal); +void cam_iosched_set_trim_ticks(struct cam_iosched_softc *isc, int ticks); #endif #endif Index: sys/cam/cam_iosched.c =================================================================== --- sys/cam/cam_iosched.c +++ sys/cam/cam_iosched.c @@ -277,6 +277,12 @@ /* scheduler flags < 16, user flags >= 16 */ uint32_t flags; int sort_io_queue; + int trim_goal; /* # of trims to queue before sending */ + int trim_ticks; /* Max ticks to hold trims */ + int last_trim_tick; /* Last 'tick' time ld a trim */ + int queued_trims; /* Number of trims in the queue */ + int max_trims; /* Maximum number of trims pending at once */ + int pend_trims; /* Number of pending trims now */ #ifdef CAM_IOSCHED_DYNAMIC int read_bias; /* Read bias setting */ int current_read_bias; /* Current read bias state */ @@ -700,11 +706,6 @@ } #endif -/* - * Trim or similar currently pending completion. Should only be set for - * those drivers wishing only one Trim active at a time. - */ -#define CAM_IOSCHED_FLAG_TRIM_ACTIVE (1ul << 0) /* Callout active, and needs to be torn down */ #define CAM_IOSCHED_FLAG_CALLOUT_ACTIVE (1ul << 1) @@ -748,8 +749,36 @@ static inline bool cam_iosched_has_more_trim(struct cam_iosched_softc *isc) { - return !(isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) && - bioq_first(&isc->trim_queue); + struct bio *bp; + + bp = bioq_first(&isc->trim_queue); +#ifdef CAM_IOSCHED_DYNAMIC + if (do_dynamic_iosched) { + /* + * If we're limiting trims, then defer action on trims + * for a bit. 
+ */ + if (bp == NULL || cam_iosched_limiter_caniop(&isc->trim_stats, bp) != 0) + return false; + } +#endif + + /* + * If we've set a trim_goal, then if we exceed that allow trims + * to be passed back to the driver. If we've also set a tick timeout + * allow trims back to the driver. Otherwise, don't allow trims yet. + */ + if (isc->trim_goal > 0) { + if (isc->queued_trims >= isc->trim_goal) + return true; + if (isc->queued_trims > 0 && + isc->trim_ticks > 0 && + ticks - isc->last_trim_tick > isc->trim_ticks) + return true; + return false; + } + + return isc->pend_trims <= isc->max_trims && bp != NULL; } #define cam_iosched_sort_queue(isc) ((isc)->sort_io_queue >= 0 ? \ @@ -1073,6 +1102,7 @@ (*iscp)->sort_io_queue = -1; bioq_init(&(*iscp)->bio_queue); bioq_init(&(*iscp)->trim_queue); + (*iscp)->max_trims = 1; #ifdef CAM_IOSCHED_DYNAMIC if (do_dynamic_iosched) { bioq_init(&(*iscp)->write_queue); @@ -1174,6 +1204,41 @@ #endif } +/* + * Client drivers can set two parameters. "goal" is the number of BIO_DELETEs + * that will be queued up before iosched will "release" the trims to the client + * driver to wo with what they will (usually combine as many as possible). If we + * don't get this many, after trim_ticks we'll submit the I/O anyway with + * whatever we have. We do need an I/O of some kind of to clock the deferred + * trims out to disk. Since we will eventually get a write for the super block + * or something before we shutdown, the trims will complete. To be safe, when a + * BIO_FLUSH is presented to the iosched work queue, we set the ticks time far + * enough in the past so we'll present the BIO_DELETEs to the client driver. + * There might be a race if no BIO_DELETESs were queued, a BIO_FLUSH comes in + * and then a BIO_DELETE is sent down. No know client does this, and there's + * already a race between an ordered BIO_FLUSH and any BIO_DELETEs in flight, + * but no client depends on the ordering being honored. 
XXX I'm not sure what the interaction is between UFS direct BIOs and the BUF + * flushing on shutdown. I think there are bufs that would be dependent on the BIO + * finishing to write out at least metadata, so we'll be fine.
The read bias is shared for both writes and + * TRIMs, but on TRIMs the bias is for a combined TRIM + * not a single TRIM request that's come in. + */ + if (do_dynamic_iosched && bioq_first(&isc->bio_queue) && \ + isc->current_read_bias) { + if (iosched_debug) + printf( + "Reads present and current_read_bias is %d queued " + "trims %d queued reads %d\n", + isc->current_read_bias, isc->trim_stats.queued, + isc->read_stats.queued); + isc->current_read_bias--; + /* We're not limiting trims, per se, just doing reads first */ + return NULL; + } + + /* + * See if our current limiter allows this I/O. Because we only call this + * here, and not in next_trim, the 'bandwidth' limits for trims won't + * work, while the iops or max queued limits will work. It's tricky + * because we want the limits to be from the perspective of the + * "commands sent to the device." To make iops work, we need to check + * only here (since we want all the ops we combine to count as one). To + * make bw limits work, we'd need to check in next_trim, but that would + * have the effect of limiting the iops as seen from the upper layers. + */ + if (cam_iosched_limiter_iop(&isc->trim_stats, bp) != 0) { + if (iosched_debug) + printf("Can't trim because limiter says no.\n"); + isc->trim_stats.state_flags |= IOP_RATE_LIMITED; + return NULL; + } + isc->current_read_bias = isc->read_bias; + isc->trim_stats.state_flags &= ~IOP_RATE_LIMITED; + /* cam_iosched_next_trim below keeps proper book */ +#endif + return cam_iosched_next_trim(isc); } @@ -1390,12 +1508,57 @@ { /* - * Put all trims on the trim queue sorted, since we know - * that the collapsing code requires this. Otherwise put - * the work on the bio queue. + * A BIO_SPEEDUP from the uppper layers means that they have a block + * shortage. At the present, this is only sent when we're trying to + * allocate blocks, but have a shortage before giving up. bio_length is + * the size of their shortage. 
We will complete just enough BIO_DELETEs + * in the queue to satisfy the need. If bio_length is 0, we'll complete + * them all. This allows the scheduler to delay BIO_DELETEs to improve + * read/write performance without worrying about the upper layers. When + * it's possibly a problem, we respond by pretending the BIO_DELETEs + * just worked. We can't do anything about the BIO_DELETEs in the + * hardware, though. We have to wait for them to complete. + */ + if (bp->bio_cmd == BIO_SPEEDUP) { + off_t len; + struct bio *nbp; + + len = 0; + while (bioq_first(&isc->trim_queue) && + (bp->bio_length == 0 || len < bp->bio_length)) { + nbp = bioq_takefirst(&isc->trim_queue); + len += nbp->bio_length; + nbp->bio_error = 0; + biodone(nbp); + } + if (bp->bio_length > 0) { + if (bp->bio_length > len) + bp->bio_resid = bp->bio_length - len; + else + bp->bio_resid = 0; + } + bp->bio_error = 0; + biodone(bp); + return; + } + + /* + * If we get a BIO_FLUSH, and we're doing delayed BIO_DELETEs then we + * set the last tick time to one less than the current ticks minus the + * delay to force the BIO_DELETEs to be presented to the client driver. + */ + if (bp->bio_cmd == BIO_FLUSH && isc->trim_ticks > 0) + isc->last_trim_tick = ticks - isc->trim_ticks - 1; + + /* + * Put all trims on the trim queue. Otherwise put the work on the bio + * queue. 
*/ if (bp->bio_cmd == BIO_DELETE) { bioq_insert_tail(&isc->trim_queue, bp); + if (isc->queued_trims == 0) + isc->last_trim_tick = ticks; + isc->queued_trims++; #ifdef CAM_IOSCHED_DYNAMIC isc->trim_stats.in++; isc->trim_stats.queued++; @@ -1452,7 +1615,7 @@ cam_iosched_trim_done(struct cam_iosched_softc *isc) { - isc->flags &= ~CAM_IOSCHED_FLAG_TRIM_ACTIVE; + isc->pend_trims--; } /* @@ -1509,7 +1672,7 @@ cam_iosched_submit_trim(struct cam_iosched_softc *isc) { - isc->flags |= CAM_IOSCHED_FLAG_TRIM_ACTIVE; + isc->pend_trims++; } /* @@ -1735,7 +1898,7 @@ db_printf("in_reads: %d\n", isc->read_stats.in); db_printf("out_reads: %d\n", isc->read_stats.out); db_printf("queued_reads: %d\n", isc->read_stats.queued); - db_printf("Current Q len %d\n", biolen(&isc->bio_queue)); + db_printf("Read Q len %d\n", biolen(&isc->bio_queue)); db_printf("pending_writes: %d\n", isc->write_stats.pending); db_printf("min_writes: %d\n", isc->write_stats.min); db_printf("max_writes: %d\n", isc->write_stats.max); @@ -1743,7 +1906,7 @@ db_printf("in_writes: %d\n", isc->write_stats.in); db_printf("out_writes: %d\n", isc->write_stats.out); db_printf("queued_writes: %d\n", isc->write_stats.queued); - db_printf("Current Q len %d\n", biolen(&isc->write_queue)); + db_printf("Write Q len %d\n", biolen(&isc->write_queue)); db_printf("pending_trims: %d\n", isc->trim_stats.pending); db_printf("min_trims: %d\n", isc->trim_stats.min); db_printf("max_trims: %d\n", isc->trim_stats.max); @@ -1751,11 +1914,11 @@ db_printf("in_trims: %d\n", isc->trim_stats.in); db_printf("out_trims: %d\n", isc->trim_stats.out); db_printf("queued_trims: %d\n", isc->trim_stats.queued); - db_printf("Current Q len %d\n", biolen(&isc->trim_queue)); + db_printf("Trim Q len %d\n", biolen(&isc->trim_queue)); db_printf("read_bias: %d\n", isc->read_bias); db_printf("current_read_bias: %d\n", isc->current_read_bias); - db_printf("Trim active? %s\n", - (isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) ? 
"yes" : "no"); + db_printf("Trims active %d\n", isc->pend_trims); + db_printf("Max trims active %d\n", isc->max_trims); } #endif #endif Index: sys/cam/nvme/nvme_da.c =================================================================== --- sys/cam/nvme/nvme_da.c +++ sys/cam/nvme/nvme_da.c @@ -176,6 +176,14 @@ SYSCTL_INT(_kern_cam_nda, OID_AUTO, max_trim, CTLFLAG_RDTUN, &nda_max_trim_entries, NDA_MAX_TRIM_ENTRIES, "Maximum number of BIO_DELETE to send down as a DSM TRIM."); +static int nda_goal_trim_entries = NDA_MAX_TRIM_ENTRIES / 2; +SYSCTL_INT(_kern_cam_nda, OID_AUTO, goal_trim, CTLFLAG_RDTUN, + &nda_goal_trim_entries, NDA_MAX_TRIM_ENTRIES / 2, + "Number of BIO_DELETE to try to accumulate before sending a DSM TRIM."); +static int nda_trim_ticks = 50; /* 50ms ~ 1000 Hz */ +SYSCTL_INT(_kern_cam_nda, OID_AUTO, trim_ticks, CTLFLAG_RDTUN, + &nda_trim_ticks, 50, + "Number of ticks to hold BIO_DELETEs before sending down a trim"); /* * All NVMe media is non-rotational, so all nvme device instances @@ -731,6 +739,9 @@ free(softc, M_DEVBUF); return(CAM_REQ_CMP_ERR); } + /* Statically set these for the moment */ + cam_iosched_set_trim_goal(softc->cam_iosched, nda_goal_trim_entries); + cam_iosched_set_trim_ticks(softc->cam_iosched, nda_trim_ticks); /* ident_data parsing */ Index: sys/geom/geom.h =================================================================== --- sys/geom/geom.h +++ sys/geom/geom.h @@ -336,6 +336,7 @@ int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr); int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp); int g_io_flush(struct g_consumer *cp); +int g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp); int g_register_classifier(struct g_classifier_hook *hook); void g_unregister_classifier(struct g_classifier_hook *hook); void g_io_request(struct bio *bp, struct g_consumer *cp); Index: sys/geom/geom_io.c =================================================================== --- 
sys/geom/geom_io.c +++ sys/geom/geom_io.c @@ -338,6 +338,42 @@ return (error); } +/* + * Send a IBO_SPEEDUP down the stack. This is used to tell the lower layers that + * the upper layers has detected a resource shortage. The lower layers are + * advised to stop delaying I/O that they might be holding for performance + * reasons and to schedule it (non-trims) or complete it successfully (trims) as + * quickly as it can. bio_length is the amount of the shortage. This call + * should be non-blocking. bio_resid is used to communicate back if the lower + * layers couldn't find bio_length worth of I/O to schedule or discard. A length + * of 0 means to do as much as you can (schedule the h/w queues full, discard + * all trims). flags are a hint from the upper layers to the lower layers what + * operation should be done. + */ +int +g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp) +{ + struct bio *bp; + int error; + + KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0, + ("Invalid flags passed to g_io_speedup: %#x", flags)); + g_trace(G_T_BIO, "bio_speedup(%s, %zu, %#x)", cp->provider->name, + shortage, flags); + bp = g_new_bio(); + if (bp == NULL) + return (ENOMEM); + bp->bio_cmd = BIO_SPEEDUP; + bp->bio_length = shortage; + bp->bio_done = NULL; + bp->bio_flags |= flags; + g_io_request(bp, cp); + error = biowait(bp, "gflush"); + *resid = bp->bio_resid; + g_destroy_bio(bp); + return (error); +} + int g_io_flush(struct g_consumer *cp) { Index: sys/sys/bio.h =================================================================== --- sys/sys/bio.h +++ sys/sys/bio.h @@ -53,6 +53,7 @@ #define BIO_CMD1 0x07 /* Available for local hacks */ #define BIO_CMD2 0x08 /* Available for local hacks */ #define BIO_ZONE 0x09 /* Zone command */ +#define BIO_SPEEDUP 0x0a /* Upper layers face shortage */ /* bio_flags */ #define BIO_ERROR 0x01 /* An error occurred processing this bio. 
*/ @@ -67,6 +68,9 @@ #define BIO_TRANSIENT_MAPPING 0x20 #define BIO_VLIST 0x40 +#define BIO_SPEEDUP_WRITE 0x4000 /* Resource shortage at upper layers */ +#define BIO_SPEEDUP_TRIM 0x8000 /* Resource shortage at upper layers */ + #ifdef _KERNEL struct disk; struct bio; Index: sys/ufs/ffs/ffs_softdep.c =================================================================== --- sys/ufs/ffs/ffs_softdep.c +++ sys/ufs/ffs/ffs_softdep.c @@ -13282,7 +13282,9 @@ { struct ufsmount *ump; struct mount *mp; + struct g_consumer *cp; long starttime; + size_t resid; ufs2_daddr_t needed; int error, failed_vnode; @@ -13304,6 +13306,7 @@ mp = vp->v_mount; ump = VFSTOUFS(mp); + cp = (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private; mtx_assert(UFS_MTX(ump), MA_OWNED); UFS_UNLOCK(ump); error = ffs_update(vp, 1); @@ -13357,6 +13360,9 @@ } starttime = time_second; retry: + if (resource == FLUSH_BLOCKS_WAIT && + fs->fs_cstotal.cs_nbfree <= needed) + g_io_speedup(needed * fs->fs_bsize, BIO_SPEEDUP_TRIM, &resid, cp); if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && @@ -13503,6 +13509,8 @@ { struct mount *mp; struct ufsmount *ump; + struct g_consumer *cp; + size_t resid; int error; bool req; @@ -13514,6 +13522,8 @@ return; if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) { ump = VFSTOUFS(mp); + cp = (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private; + g_io_speedup(0, BIO_SPEEDUP_TRIM, &resid, cp); for (;;) { req = false; ACQUIRE_LOCK(ump);