Index: sys/cam/cam_iosched.c =================================================================== --- sys/cam/cam_iosched.c +++ sys/cam/cam_iosched.c @@ -281,6 +281,8 @@ int trim_ticks; /* Max ticks to hold trims */ int last_trim_tick; /* Last 'tick' time ld a trim */ int queued_trims; /* Number of trims in the queue */ + int max_trims; /* Maximum number of trims pending at once */ + int pend_trims; /* Number of pending trims now */ #ifdef CAM_IOSCHED_DYNAMIC int read_bias; /* Read bias setting */ int current_read_bias; /* Current read bias state */ @@ -707,11 +709,6 @@ } #endif -/* - * Trim or similar currently pending completion. Should only be set for - * those drivers wishing only one Trim active at a time. - */ -#define CAM_IOSCHED_FLAG_TRIM_ACTIVE (1ul << 0) /* Callout active, and needs to be torn down */ #define CAM_IOSCHED_FLAG_CALLOUT_ACTIVE (1ul << 1) @@ -755,6 +752,19 @@ static inline bool cam_iosched_has_more_trim(struct cam_iosched_softc *isc) { + struct bio *bp; + + bp = bioq_first(&isc->trim_queue); +#ifdef CAM_IOSCHED_DYNAMIC + if (do_dynamic_iosched) { + /* + * If we're limiting trims, then defer action on trims + * for a bit. + */ + if (bp == NULL || cam_iosched_limiter_caniop(&isc->trim_stats, bp) != 0) + return false; + } +#endif /* * If we've set a trim_goal, then if we exceed that allow trims @@ -771,8 +781,7 @@ return false; } - return !(isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) && - bioq_first(&isc->trim_queue); + return isc->pend_trims <= isc->max_trims && bp != NULL; } #define cam_iosched_sort_queue(isc) ((isc)->sort_io_queue >= 0 ? 
\ @@ -1096,6 +1105,7 @@ (*iscp)->sort_io_queue = -1; bioq_init(&(*iscp)->bio_queue); bioq_init(&(*iscp)->trim_queue); + (*iscp)->max_trims = 1; #ifdef CAM_IOSCHED_DYNAMIC if (do_dynamic_iosched) { bioq_init(&(*iscp)->write_queue); @@ -1389,10 +1399,17 @@ struct bio * cam_iosched_get_trim(struct cam_iosched_softc *isc) { +#ifdef CAM_IOSCHED_DYNAMIC + struct bio *bp; +#endif if (!cam_iosched_has_more_trim(isc)) return NULL; #ifdef CAM_IOSCHED_DYNAMIC + bp = bioq_first(&isc->trim_queue); + if (bp == NULL) + return NULL; + /* * If pending read, prefer that based on current read bias setting. The * read bias is shared for both writes and TRIMs, but on TRIMs the bias @@ -1414,6 +1431,26 @@ */ isc->current_read_bias = isc->read_bias; } + + /* + * See if our current limiter allows this I/O. Because we only call this + * here, and not in next_trim, the 'bandwidth' limits for trims won't + * work, while the iops or max queued limits will work. It's tricky + * because we want the limits to be from the perspective of the + * "commands sent to the device." To make iops work, we need to check + * only here (since we want all the ops we combine to count as one). To + * make bw limits work, we'd need to check in next_trim, but that would + * have the effect of limiting the iops as seen from the upper layers. + */ + if (cam_iosched_limiter_iop(&isc->trim_stats, bp) != 0) { + if (iosched_debug) + printf("Can't trim because limiter says no.\n"); + isc->trim_stats.state_flags |= IOP_RATE_LIMITED; + return NULL; + } + isc->current_read_bias = isc->read_bias; + isc->trim_stats.state_flags &= ~IOP_RATE_LIMITED; + /* cam_iosched_next_trim below keeps proper book */ #endif return cam_iosched_next_trim(isc); } @@ -1496,6 +1533,41 @@ cam_iosched_queue_work(struct cam_iosched_softc *isc, struct bio *bp) { + /* + * A BIO_SPEEDUP from the upper layers means that they have a block + * shortage. 
At the present, this is only sent when we're trying to + * allocate blocks, but have a shortage before giving up. bio_length is + * the size of their shortage. We will complete just enough BIO_DELETEs + * in the queue to satisfy the need. If bio_length is 0, we'll complete + * them all. This allows the scheduler to delay BIO_DELETEs to improve + * read/write performance without worrying about the upper layers. When + * it's possibly a problem, we respond by pretending the BIO_DELETEs + * just worked. We can't do anything about the BIO_DELETEs in the + * hardware, though. We have to wait for them to complete. + */ + if (bp->bio_cmd == BIO_SPEEDUP) { + off_t len; + struct bio *nbp; + + len = 0; + while (bioq_first(&isc->trim_queue) && + (bp->bio_length == 0 || len < bp->bio_length)) { + nbp = bioq_takefirst(&isc->trim_queue); + len += nbp->bio_length; + nbp->bio_error = 0; + biodone(nbp); + } + if (bp->bio_length > 0) { + if (bp->bio_length > len) + bp->bio_resid = bp->bio_length - len; + else + bp->bio_resid = 0; + } + bp->bio_error = 0; + biodone(bp); + return; + } + /* * If we get a BIO_FLUSH, and we're doing delayed BIO_DELETEs then we * set the last tick time to one less than the current ticks minus the @@ -1569,7 +1641,7 @@ cam_iosched_trim_done(struct cam_iosched_softc *isc) { - isc->flags &= ~CAM_IOSCHED_FLAG_TRIM_ACTIVE; + isc->pend_trims--; } /* @@ -1637,7 +1709,7 @@ cam_iosched_submit_trim(struct cam_iosched_softc *isc) { - isc->flags |= CAM_IOSCHED_FLAG_TRIM_ACTIVE; + isc->pend_trims++; } /* @@ -1863,7 +1935,7 @@ db_printf("in_reads: %d\n", isc->read_stats.in); db_printf("out_reads: %d\n", isc->read_stats.out); db_printf("queued_reads: %d\n", isc->read_stats.queued); - db_printf("Current Q len %d\n", biolen(&isc->bio_queue)); + db_printf("Read Q len %d\n", biolen(&isc->bio_queue)); db_printf("pending_writes: %d\n", isc->write_stats.pending); db_printf("min_writes: %d\n", isc->write_stats.min); db_printf("max_writes: %d\n", isc->write_stats.max); @@ 
-1871,7 +1943,7 @@ db_printf("in_writes: %d\n", isc->write_stats.in); db_printf("out_writes: %d\n", isc->write_stats.out); db_printf("queued_writes: %d\n", isc->write_stats.queued); - db_printf("Current Q len %d\n", biolen(&isc->write_queue)); + db_printf("Write Q len %d\n", biolen(&isc->write_queue)); db_printf("pending_trims: %d\n", isc->trim_stats.pending); db_printf("min_trims: %d\n", isc->trim_stats.min); db_printf("max_trims: %d\n", isc->trim_stats.max); @@ -1879,11 +1951,11 @@ db_printf("in_trims: %d\n", isc->trim_stats.in); db_printf("out_trims: %d\n", isc->trim_stats.out); db_printf("queued_trims: %d\n", isc->trim_stats.queued); - db_printf("Current Q len %d\n", biolen(&isc->trim_queue)); + db_printf("Trim Q len %d\n", biolen(&isc->trim_queue)); db_printf("read_bias: %d\n", isc->read_bias); db_printf("current_read_bias: %d\n", isc->current_read_bias); - db_printf("Trim active? %s\n", - (isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) ? "yes" : "no"); + db_printf("Trims active %d\n", isc->pend_trims); + db_printf("Max trims active %d\n", isc->max_trims); } #endif #endif Index: sys/cam/nvme/nvme_da.c =================================================================== --- sys/cam/nvme/nvme_da.c +++ sys/cam/nvme/nvme_da.c @@ -177,6 +177,14 @@ SYSCTL_INT(_kern_cam_nda, OID_AUTO, max_trim, CTLFLAG_RDTUN, &nda_max_trim_entries, NDA_MAX_TRIM_ENTRIES, "Maximum number of BIO_DELETE to send down as a DSM TRIM."); +static int nda_goal_trim_entries = NDA_MAX_TRIM_ENTRIES / 2; +SYSCTL_INT(_kern_cam_nda, OID_AUTO, goal_trim, CTLFLAG_RDTUN, + &nda_goal_trim_entries, NDA_MAX_TRIM_ENTRIES / 2, + "Number of BIO_DELETE to try to accumulate before sending a DSM TRIM."); +static int nda_trim_ticks = 50; /* 50ms ~ 1000 Hz */ +SYSCTL_INT(_kern_cam_nda, OID_AUTO, trim_ticks, CTLFLAG_RDTUN, + &nda_trim_ticks, 50, + "Number of ticks to hold BIO_DELETEs before sending down a trim"); /* * All NVMe media is non-rotational, so all nvme device instances @@ -741,6 +749,9 @@ free(softc, 
M_DEVBUF); return(CAM_REQ_CMP_ERR); } + /* Statically set these for the moment */ + cam_iosched_set_trim_goal(softc->cam_iosched, nda_goal_trim_entries); + cam_iosched_set_trim_ticks(softc->cam_iosched, nda_trim_ticks); /* ident_data parsing */ Index: sys/geom/geom.h =================================================================== --- sys/geom/geom.h +++ sys/geom/geom.h @@ -343,6 +343,7 @@ int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr); int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp); int g_io_flush(struct g_consumer *cp); +int g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp); int g_register_classifier(struct g_classifier_hook *hook); void g_unregister_classifier(struct g_classifier_hook *hook); void g_io_request(struct bio *bp, struct g_consumer *cp); Index: sys/geom/geom_io.c =================================================================== --- sys/geom/geom_io.c +++ sys/geom/geom_io.c @@ -340,6 +340,42 @@ return (error); } +/* + * Send a BIO_SPEEDUP down the stack. This is used to tell the lower layers that + * the upper layers have detected a resource shortage. The lower layers are + * advised to stop delaying I/O that they might be holding for performance + * reasons and to schedule it (non-trims) or complete it successfully (trims) as + * quickly as it can. bio_length is the amount of the shortage. This call + * should be non-blocking. bio_resid is used to communicate back if the lower + * layers couldn't find bio_length worth of I/O to schedule or discard. A length + * of 0 means to do as much as you can (schedule the h/w queues full, discard + * all trims). flags are a hint from the upper layers to the lower layers what + * operation should be done. 
+ */ +int +g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp) +{ + struct bio *bp; + int error; + + KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0, + ("Invalid flags passed to g_io_speedup: %#x", flags)); + g_trace(G_T_BIO, "bio_speedup(%s, %zu, %#x)", cp->provider->name, + shortage, flags); + bp = g_new_bio(); + if (bp == NULL) + return (ENOMEM); + bp->bio_cmd = BIO_SPEEDUP; + bp->bio_length = shortage; + bp->bio_done = NULL; + bp->bio_flags |= flags; + g_io_request(bp, cp); + error = biowait(bp, "gflush"); + *resid = bp->bio_resid; + g_destroy_bio(bp); + return (error); +} + int g_io_flush(struct g_consumer *cp) { Index: sys/sys/bio.h =================================================================== --- sys/sys/bio.h +++ sys/sys/bio.h @@ -53,6 +53,7 @@ #define BIO_CMD1 0x07 /* Available for local hacks */ #define BIO_CMD2 0x08 /* Available for local hacks */ #define BIO_ZONE 0x09 /* Zone command */ +#define BIO_SPEEDUP 0x0a /* Upper layers face shortage */ /* bio_flags */ #define BIO_ERROR 0x01 /* An error occurred processing this bio. 
*/ @@ -70,6 +71,9 @@ #define PRINT_BIO_FLAGS "\20\7vlist\6transient_mapping\5unmapped" \ "\4ordered\3onqueue\2done\1error" +#define BIO_SPEEDUP_WRITE 0x4000 /* Resource shortage at upper layers */ +#define BIO_SPEEDUP_TRIM 0x8000 /* Resource shortage at upper layers */ + #ifdef _KERNEL struct disk; struct bio; Index: sys/ufs/ffs/ffs_softdep.c =================================================================== --- sys/ufs/ffs/ffs_softdep.c +++ sys/ufs/ffs/ffs_softdep.c @@ -13352,7 +13352,9 @@ { struct ufsmount *ump; struct mount *mp; + struct g_consumer *cp; long starttime; + size_t resid; ufs2_daddr_t needed; int error, failed_vnode; @@ -13374,6 +13376,7 @@ mp = vp->v_mount; ump = VFSTOUFS(mp); + cp = (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private; mtx_assert(UFS_MTX(ump), MA_OWNED); UFS_UNLOCK(ump); error = ffs_update(vp, 1); @@ -13428,6 +13431,9 @@ } starttime = time_second; retry: + if (resource == FLUSH_BLOCKS_WAIT && + fs->fs_cstotal.cs_nbfree <= needed) + g_io_speedup(needed * fs->fs_bsize, BIO_SPEEDUP_TRIM, &resid, cp); if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && @@ -13574,6 +13580,8 @@ { struct mount *mp; struct ufsmount *ump; + struct g_consumer *cp; + size_t resid; int error; bool req; @@ -13585,6 +13593,8 @@ return; if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) { ump = VFSTOUFS(mp); + cp = (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private; + g_io_speedup(0, BIO_SPEEDUP_TRIM, &resid, cp); for (;;) { req = false; ACQUIRE_LOCK(ump);